diff --git a/evaluator-example/README.md b/evaluator-example/README.md
new file mode 100644
index 000000000..7897b4aa5
--- /dev/null
+++ b/evaluator-example/README.md
@@ -0,0 +1,38 @@
+## Evaluator Example
+
+This is an example of how to use the evaluator. The evaluator measures the
+performance of a model on a dataset and can also compare the performance of
+several models on the same dataset.
+
+This module contains an example that you can copy into your project and
+adapt to your needs.
+
+### Prerequisites
+
+You need to have the following installed:
+
+- [Poetry](https://python-poetry.org/docs/#installing-with-pipx)
+- **Python 3.10:** you can set it up with virtualenv:
+
+```bash
+virtualenv venv --python=python3.10
+source venv/bin/activate
+```
+
+Once Poetry is installed, install the Python dependencies. Move to the
+`evalTest` folder and run:
+
+```bash
+poetry install
+```
+
+### Usage
+
+To try this example, run the following command:
+
+```bash
+./gradlew evaluator
+```
+
+After the command finishes, the results are rendered as a web page that you
+can view by opening `evalTest/publish/index.html`.
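+
+### Generated `data.json` (illustrative)
+
+The `evaluator` task first runs `TestExample`, which serializes a `TestsSpec` to
+`evalTest/data.json`; the Python evaluator then reads that file. As a rough sketch
+(only the first item is shown, and the answer produced by the OpenAI call at runtime
+is replaced by a placeholder), the generated file has this shape:
+
+```json
+{
+  "description": "Check GPT3.5 and fake outputs",
+  "metric": "FactualConsistencyMetric",
+  "outputs_description": ["Using GPT3.5", "Fake outputs with errors"],
+  "minimum_score": 0.7,
+  "items": [
+    {
+      "input": "Please provide a movie title, genre and director",
+      "context": ["Contains information about a movie"],
+      "actual_outputs": ["<answer generated by GPT-3.5 at runtime>", "I don't know"]
+    }
+  ]
+}
+```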
diff --git a/evaluator-example/build.gradle.kts b/evaluator-example/build.gradle.kts
new file mode 100644
index 000000000..2694e6891
--- /dev/null
+++ b/evaluator-example/build.gradle.kts
@@ -0,0 +1,60 @@
+import java.io.OutputStream
+
+plugins {
+  id(libs.plugins.kotlin.jvm.get().pluginId)
+  id(libs.plugins.kotlinx.serialization.get().pluginId)
+  alias(libs.plugins.spotless)
+}
+
+repositories { mavenCentral() }
+
+java {
+  sourceCompatibility = JavaVersion.VERSION_11
+  targetCompatibility = JavaVersion.VERSION_11
+  toolchain { languageVersion = JavaLanguageVersion.of(11) }
+}
+
+dependencies {
+  implementation(projects.xefCore)
+  implementation(projects.xefOpenai)
+  implementation(projects.xefEvaluator)
+  implementation(libs.suspendApp.core)
+  implementation(libs.bundles.arrow)
+}
+
+spotless {
+  kotlin {
+    target("**/*.kt")
+    ktfmt().googleStyle().configure { it.setRemoveUnusedImport(true) }
+  }
+}
+
+// Runs TestExample, which writes the test specification to evalTest/data.json.
+tasks.create<JavaExec>("test-example") {
+  dependsOn("compileKotlin")
+
+  workingDir("./evalTest")
+
+  group = "Execution"
+  description = "Test example"
+  classpath = sourceSets.main.get().runtimeClasspath
+  mainClass = "com.xebia.funcional.xef.evaluator.examples.TestExample"
+
+  doLast { println(">> data.json created!") }
+}
+
+// Runs the DeepEval suite (via Poetry) against the generated data.json.
+tasks.create<Exec>("evaluator") {
+  dependsOn("test-example")
+
+  this.standardOutput = OutputStream.nullOutputStream()
+
+  workingDir("./evalTest")
+
+  commandLine("poetry", "run", "deepeval", "test", "run", "py-evaluator/test_evaluator.py")
+
+  doLast { println(">> Open evalTest/publish/index.html in your browser") }
+}
diff --git a/evaluator-example/evalTest/.gitignore b/evaluator-example/evalTest/.gitignore
new file mode 100644
index 000000000..8bbfc017c
--- /dev/null
+++ b/evaluator-example/evalTest/.gitignore
@@ -0,0 +1,6 @@
+__pycache__
+results.json
+data.json
+publish/content.js
+.pytest_cache
+poetry.lock
diff --git a/evaluator-example/evalTest/publish/index.html b/evaluator-example/evalTest/publish/index.html
new file mode 100644
index 000000000..eae13b286
--- /dev/null
+++ b/evaluator-example/evalTest/publish/index.html
@@ -0,0 +1,13 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Tests</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <div id="test-container"></div>
+    <script src="content.js"></script>
+    <script src="script.js"></script>
+</body>
+</html>
diff --git a/evaluator-example/evalTest/publish/script.js b/evaluator-example/evalTest/publish/script.js
new file mode 100644
index 000000000..000ec719f
--- /dev/null
+++ b/evaluator-example/evalTest/publish/script.js
@@ -0,0 +1,60 @@
+// Renders the evaluation results (testData is defined by the generated content.js).
+document.addEventListener('DOMContentLoaded', function() {
+
+    const container = document.getElementById('test-container');
+    const summaryDiv = document.createElement('div');
+    summaryDiv.classList.add('test-summary');
+
+    testData.results.forEach(block => {
+        const blockDiv = document.createElement('div');
+        blockDiv.classList.add('test-block');
+
+        const title = document.createElement('h2');
+        title.classList.add('test-title');
+        title.textContent = block.description;
+        blockDiv.appendChild(title);
+
+        block.tests.forEach(test => {
+            const inputDiv = document.createElement('div');
+            inputDiv.classList.add(test.assert ? 'input-passed' : 'input-failed');
+            inputDiv.textContent = 'Input: ' + test.input;
+            blockDiv.appendChild(inputDiv);
+
+            const outputDiv = document.createElement('div');
+            outputDiv.classList.add('output');
+            outputDiv.textContent = 'Output: ' + test.output;
+            outputDiv.addEventListener('click', function() {
+                this.classList.toggle('expanded');
+            });
+            blockDiv.appendChild(outputDiv);
+
+            const scoreDiv = document.createElement('div');
+            scoreDiv.classList.add('score', test.assert ? 'score-passed' : 'score-failed');
+            scoreDiv.textContent = 'Score: ' + test.score.toFixed(3);
+            blockDiv.appendChild(scoreDiv);
+        });
+
+        const avgScoreDiv = document.createElement('div');
+        avgScoreDiv.classList.add('avg-score');
+        avgScoreDiv.textContent = 'Average Score: ' + block.avg.toFixed(3);
+        blockDiv.appendChild(avgScoreDiv);
+
+        const testInfoDiv = document.createElement('div');
+        testInfoDiv.classList.add('test-info');
+        testInfoDiv.innerHTML = `
+            Tests Passed: ${block.tests_successful}<br>
+            Tests Failed: ${block.tests_failures}<br>
+            Success Rate: ${block.success_rate.toFixed(2)}%
+        `;
+        blockDiv.appendChild(testInfoDiv);
+
+        container.appendChild(blockDiv);
+
+        summaryDiv.innerHTML += `
+            <h3>${block.description}</h3>
+            Average Score: ${block.avg.toFixed(3)}<br>
+            Success Rate: ${block.success_rate.toFixed(2)}%<br>
+        `;
+    });
+
+    container.appendChild(summaryDiv);
+});
diff --git a/evaluator-example/evalTest/publish/styles.css b/evaluator-example/evalTest/publish/styles.css
new file mode 100644
index 000000000..a14683826
--- /dev/null
+++ b/evaluator-example/evalTest/publish/styles.css
@@ -0,0 +1,87 @@
+body {
+    font-family: Arial, sans-serif;
+    margin: 0;
+    padding: 0;
+    background-color: #f4f4f4;
+}
+
+#test-container {
+    width: 80%;
+    margin: 20px auto;
+    padding: 15px;
+    background-color: white;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+
+.test-block {
+    margin-bottom: 20px;
+    border-bottom: 1px solid #eee;
+    padding-bottom: 20px;
+}
+
+.test-title {
+    font-size: 1.2em;
+    color: #333;
+}
+
+.input, .output {
+    margin: 5px 0;
+}
+
+.input-passed {
+    margin-top: 25px;
+    color: green;
+    font-weight: bold;
+}
+
+.input-failed {
+    margin-top: 25px;
+    color: red;
+    font-weight: bold;
+}
+
+.output {
+    color: #666;
+    cursor: pointer;
+    white-space: nowrap;
+    overflow: hidden;
+    text-overflow: ellipsis;
+}
+
+.output.expanded {
+    white-space: normal;
+}
+
+.score {
+    font-weight: bold;
+}
+
+.score-passed {
+    margin-bottom: 25px;
+    color: #008000;
+}
+
+.score-failed {
+    margin-bottom: 25px;
+    color: red;
+}
+
+.avg-score, .test-info {
+    font-size: 1.2em;
+    color: #d35400;
+    margin-top: 10px;
+}
+
+.test-summary {
+    background-color: #e7e7e7;
+    padding: 15px;
+    margin-top: 20px;
+    border-radius: 8px;
+}
+
+.test-summary h3 {
+    font-size: 1.1em;
+    color: #555;
+    margin-top: 0;
+}
diff --git a/evaluator-example/evalTest/py-evaluator/test_evaluator.py b/evaluator-example/evalTest/py-evaluator/test_evaluator.py
new file mode 100644
index 000000000..c54cb82f8
--- /dev/null
+++ b/evaluator-example/evalTest/py-evaluator/test_evaluator.py
@@ -0,0 +1,96 @@
+from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+from deepeval.test_case import LLMTestCase
+from deepeval.evaluator import execute_test
+import json
+
+# Load the test spec generated by the Gradle `test-example` task.
+f = open('data.json')
+data = json.load(f)
+
+appDescription = data['description']
+
+outputs = data['outputs_description']
+
+numberOfOutputs = len(outputs)
+minimumScore = float(data['minimum_score'])
+metric = data['metric']
+
+print()
+print()
+print(appDescription)
+print("================")
+print()
+print(f"Using {metric} metric with {numberOfOutputs} different outputs ({minimumScore} minimum score)")
+
+currentOutput = 0
+
+# FactualConsistencyMetric is the default; the spec can select AnswerRelevancyMetric instead.
+metricObj = FactualConsistencyMetric(minimum_score=minimumScore)
+
+if metric == "AnswerRelevancyMetric":
+    metricObj = AnswerRelevancyMetric(minimum_score=minimumScore)
+
+jsonResponse = {
+    "description": appDescription,
+}
+
+jsonItemResultResponses = []
+
+# Evaluate every item once per declared output variant.
+for x in range(numberOfOutputs):
+    jsonItemResponse = {
+        "description": outputs[x],
+    }
+    cases = []
+    for item in data['items']:
+        context = []
+        if "context" in item:
+            context = item['context']
+        cases.append(LLMTestCase(input=item['input'], actual_output=item['actual_outputs'][x], context=context))
+
+    print()
+    results = execute_test(cases, [metricObj])
+    print(f"Results: {outputs[x]}:")
+    totalScore = 0
+
+    jsonResultResponses = []
+
+    numberTestSuccessful = 0
+    for r in results:
+        score = float(r.metrics[0].score)
+        testsSuccessful = score >= minimumScore
+        jsonResultResponse = {
+            "input": r.input,
+            "output": r.actual_output,
+            "score": score,
+            "assert": testsSuccessful
+        }
+        if testsSuccessful:
+            numberTestSuccessful += 1
+        jsonResultResponses.append(jsonResultResponse)
+        totalScore += r.metrics[0].score
+        print(f"- {r.input} -> {r.metrics[0].score}")
+    avg = totalScore / len(results)
+    successRate = numberTestSuccessful * 100 / len(results)
+    jsonItemResponse["tests"] = jsonResultResponses
+    jsonItemResponse["avg"] = avg
+    jsonItemResponse["tests_successful"] = numberTestSuccessful
+    jsonItemResponse["tests_failures"] = len(results) - numberTestSuccessful
+    jsonItemResponse["success_rate"] = successRate
+    jsonItemResultResponses.append(jsonItemResponse)
+    print()
+    print(f"Average: {avg}")
+    print(f"Success rate: {successRate}")
+    print()
+
+jsonResponse["results"] = jsonItemResultResponses
+
+# Persist the raw results and publish them as a JS constant for the report page.
+with open("results.json", "w") as outfile:
+    json.dump(jsonResponse, outfile)
+
+with open("publish/content.js", "w") as outfile:
+    jsonStr = json.dumps(jsonResponse)
+    outfile.write(f"const testData = {jsonStr};")
+
+print()
+
+f.close()
diff --git a/evaluator-example/evalTest/pyproject.toml b/evaluator-example/evalTest/pyproject.toml
new file mode 100644
index 000000000..8293197ca
--- /dev/null
+++ b/evaluator-example/evalTest/pyproject.toml
@@ -0,0 +1,13 @@
+[tool.poetry]
+name = "py-evaluator"
+version = "0.1.0"
+description = "Python evaluator for DeepEval"
+authors = ["Xef"]
+
+[tool.poetry.dependencies]
+python = "~3.10.0"
+deepeval = "0.20.19"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/evaluator-example/src/main/kotlin/com/xebia/funcional/xef/evaluator/examples/TestExample.kt b/evaluator-example/src/main/kotlin/com/xebia/funcional/xef/evaluator/examples/TestExample.kt
new file mode 100644
index 000000000..466088640
--- /dev/null
+++ b/evaluator-example/src/main/kotlin/com/xebia/funcional/xef/evaluator/examples/TestExample.kt
@@ -0,0 +1,47 @@
+package com.xebia.funcional.xef.evaluator.examples
+
+import arrow.continuations.SuspendApp
+import com.xebia.funcional.xef.evaluator.TestSpecItem
+import com.xebia.funcional.xef.evaluator.TestsSpec
+import com.xebia.funcional.xef.evaluator.models.ContextDescription
+import com.xebia.funcional.xef.evaluator.models.OutputDescription
+import com.xebia.funcional.xef.evaluator.models.OutputResponse
+import com.xebia.functional.xef.conversation.llm.openai.OpenAI
+import com.xebia.functional.xef.conversation.llm.openai.promptMessage
+import java.io.File
+
+object TestExample {
+
+  @JvmStatic
+  fun main(args: Array<String>) = SuspendApp {
+    val output: String = args.getOrNull(0) ?: "."
+
+    val file = File("$output/data.json")
+
+    val spec =
+      TestsSpec(description = "Check GPT3.5 and fake outputs") {
+        +OutputDescription("Using GPT3.5")
+        +OutputDescription("Fake outputs with errors")
+
+        +TestSpecItem("Please provide a movie title, genre and director") {
+          +ContextDescription("Contains information about a movie")
+
+          +OutputResponse { OpenAI.conversation { promptMessage(input) } }
+
+          +OutputResponse("I don't know")
+        }
+
+        // The second fake answer is deliberately wrong so the metric has a failure to report.
+        +TestSpecItem("Recipe for a chocolate cake") {
+          +ContextDescription("Contains instructions for making a cake")
+
+          +OutputResponse { OpenAI.conversation { promptMessage(input) } }
+
+          +OutputResponse("The movie is Jurassic Park")
+        }
+      }
+
+    file.writeText(spec.toJSON())
+
+    println("JSON created successfully")
+  }
+}
diff --git a/evaluator/build.gradle.kts b/evaluator/build.gradle.kts
new file mode 100644
index 000000000..d84265e83
--- /dev/null
+++ b/evaluator/build.gradle.kts
@@ -0,0 +1,24 @@
+plugins {
+  id(libs.plugins.kotlin.jvm.get().pluginId)
+  id(libs.plugins.kotlinx.serialization.get().pluginId)
+  alias(libs.plugins.spotless)
+}
+
+repositories { mavenCentral() }
+
+java {
+  sourceCompatibility = JavaVersion.VERSION_11
+  targetCompatibility = JavaVersion.VERSION_11
+  toolchain { languageVersion = JavaLanguageVersion.of(11) }
+}
+
+dependencies {
+  api(libs.kotlinx.serialization.json)
+}
+
+spotless {
+  kotlin {
+    target("**/*.kt")
+    ktfmt().googleStyle().configure { it.setRemoveUnusedImport(true) }
+  }
+}
diff --git a/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/SuiteBuilder.kt b/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/SuiteBuilder.kt
new file mode 100644
index 000000000..e1186c9b4
--- /dev/null
+++ b/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/SuiteBuilder.kt
@@ -0,0 +1,63 @@
+package com.xebia.funcional.xef.evaluator
+
+import com.xebia.funcional.xef.evaluator.models.OutputDescription
+import kotlin.jvm.JvmSynthetic
+import kotlinx.serialization.SerialName
+import kotlinx.serialization.Serializable
+import kotlinx.serialization.encodeToString
+import kotlinx.serialization.json.Json
+
+/** DSL builder that collects output descriptions and test items into a [TestsSpec]. */
+class SuiteBuilder(private val description: String, private val metric: String) {
+
+  private val outputsDescription: MutableList<String> = mutableListOf()
+
+  private var minimumScore: Double = 0.7
+
+  private val items = mutableListOf<TestSpecItem>()
+
+  operator fun TestSpecItem.unaryPlus() {
+    items.add(this)
+  }
+
+  operator fun OutputDescription.unaryPlus() {
+    outputsDescription.add(this.value)
+  }
+
+  fun build() = TestsSpec(description, metric, outputsDescription, minimumScore, items)
+}
+
+@Serializable
+data class TestsSpec(
+  val description: String,
+  val metric: String,
+  @SerialName("outputs_description") val outputsDescription: List<String>,
+  @SerialName("minimum_score") val minimumScore: Double,
+  val items: List<TestSpecItem>
+) {
+
+  fun toJSON(): String = Json.encodeToString(this)
+
+  companion object {
+    @JvmSynthetic
+    suspend operator fun invoke(
+      description: String,
+      metric: String = "FactualConsistencyMetric",
+      block: suspend SuiteBuilder.() -> Unit
+    ): TestsSpec = SuiteBuilder(description, metric).apply { block() }.build()
+  }
+}
+
+@Serializable
+data class TestSpecItem(
+  val input: String,
+  val context: List<String>,
+  @SerialName("actual_outputs") val outputs: List<String>
+) {
+  companion object {
+    @JvmSynthetic
+    suspend operator fun invoke(
+      input: String,
+      block: suspend TestItemBuilder.() -> Unit
+    ): TestSpecItem = TestItemBuilder(input).apply { block() }.build()
+  }
+}
diff --git a/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/TestItemBuilder.kt b/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/TestItemBuilder.kt
new file mode 100644
index 000000000..047386123
--- /dev/null
+++ b/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/TestItemBuilder.kt
@@ -0,0 +1,21 @@
+package com.xebia.funcional.xef.evaluator
+
+import com.xebia.funcional.xef.evaluator.models.ContextDescription
+import com.xebia.funcional.xef.evaluator.models.OutputResponse
+
+/** DSL builder for a single [TestSpecItem]: collects context lines and candidate outputs. */
+class TestItemBuilder(val input: String) {
+
+  private val context = mutableListOf<String>()
+
+  private val outputs = mutableListOf<String>()
+
+  operator fun ContextDescription.unaryPlus() {
+    context.add(value)
+  }
+
+  operator fun OutputResponse.unaryPlus() {
+    outputs.add(value)
+  }
+
+  fun build() = TestSpecItem(input, context, outputs)
+}
diff --git a/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/models/TestModels.kt b/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/models/TestModels.kt
new file mode 100644
index 000000000..e933d50ae
--- /dev/null
+++ b/evaluator/src/main/kotlin/com/xebia/funcional/xef/evaluator/models/TestModels.kt
@@ -0,0 +1,15 @@
+package com.xebia.funcional.xef.evaluator.models
+
+import kotlin.jvm.JvmSynthetic
+
+data class OutputDescription(val value: String)
+
+data class OutputResponse(val value: String) {
+  companion object {
+    @JvmSynthetic
+    suspend operator fun invoke(block: suspend () -> String): OutputResponse =
+      OutputResponse(block())
+  }
+}
+
+data class ContextDescription(val value: String)
diff --git a/settings.gradle.kts b/settings.gradle.kts
index d985c66d8..97bac7835 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -66,6 +66,12 @@ project(":xef-examples").projectDir = file("examples")
 include("xef-reasoning")
 project(":xef-reasoning").projectDir = file("reasoning")
 
+include("xef-evaluator")
+project(":xef-evaluator").projectDir = file("evaluator")
+
+include("xef-evaluator-example")
+project(":xef-evaluator-example").projectDir = file("evaluator-example")
+
 // include("xef-server")
 project(":xef-server").projectDir = file("server")