-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from arthur-ai/develop
release: full local UI
- Loading branch information
Showing
371 changed files
with
20,472 additions
and
2,944 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
recursive-include arthur_bench/server/html/* | ||
recursive-include arthur_bench/server/js/dist * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,19 +9,28 @@ Bench is built for evaluating LLMs for production use cases. Bench can be used f | |
## Getting started | ||
|
||
### Package installation and environment setup | ||
Install Bench with minimum dependencies: | ||
`pip install -e .` | ||
First, download the tar file from the GitHub releases. Next, install the package into your Python environment. | ||
|
||
Install Bench with optional dependencies for serving results locally (recommended): | ||
`pip install --find-links=./directory_with_tar_file 'arthur-bench[server]'` | ||
|
||
Install Bench with optional dependencies for serving results locally: | ||
`pip install -e '.[server]'` | ||
Install Bench with minimum dependencies: | ||
`pip install --find-links=./directory_with_tar_file 'arthur-bench'` | ||
|
||
Bench saves test suites and test runs to the directory specified by the `BENCH_FILE_DIR`, which defaults to `./bench` | ||
|
||
#### Viewing Examples | ||
To explore Bench suites and runs for an example dataset, run `bench --directory examples/bench`. This will spin up a server where you can view sample Test Suites and evaluate Runs across different model and prompt configurations. | ||
#### Exploring the UI with Examples | ||
The following commands will spin up a local UI serving two example test suites we've added: | ||
|
||
``` | ||
git clone git@github.com:arthur-ai/bench.git | ||
cd bench/examples # navigate to the examples directory inside the bench repository | ||
bench | ||
``` | ||
This will spin up a server where you can view sample Test Suites and evaluate Runs across different model and prompt configurations. | ||
|
||
In the `examples/` folder, you will find demo notebooks used to generate the Test Suites and Run results recorded in the directory. | ||
**Running these notebooks directly, without deleting the pre-existing results from the directory, will result in errors.** Please use these as a reference in creating your own Test Suites and Runs. | ||
**To run these notebooks directly, set `BENCH_FILE_DIR` to a new directory in the top cell of the notebook.** Please use these as a reference in creating your own Test Suites and Runs. | ||
|
||
## Key Concepts | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
from typing import Optional, TypeVar | ||
from abc import ABC, abstractmethod | ||
|
||
from arthur_bench.models.models import ( | ||
PaginatedTestSuites, | ||
CreateRunResponse, | ||
CreateRunRequest, | ||
PaginatedRuns, | ||
PaginatedRun, | ||
TestSuiteRequest, | ||
PaginatedTestSuite, | ||
TestSuiteSummary, | ||
) | ||
|
||
TBenchClient = TypeVar("TBenchClient", bound="BenchClient") | ||
|
||
|
||
class BenchClient(ABC):
    """Abstract interface that every Bench client backend must implement.

    All methods are declared with ``@abstractmethod``, so a subclass can only
    be instantiated once it overrides every one of them. The base
    implementations do nothing except raise ``NotImplementedError``.
    """

    @abstractmethod
    def get_test_suites(
        self,
        name: Optional[str] = None,
        sort: Optional[str] = None,
        scoring_method: Optional[str] = None,
        page: int = 1,
        page_size: int = 5,
    ) -> PaginatedTestSuites:
        """Return one page of test suites.

        :param name: optional value, presumably used to filter suites by name
            (confirm against concrete implementations).
        :param sort: optional sort specifier; format is implementation-defined.
        :param scoring_method: optional value, presumably used to filter suites
            by scoring method (confirm against concrete implementations).
        :param page: page number to fetch; defaults to 1.
        :param page_size: number of items per page; defaults to 5.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @abstractmethod
    def create_test_suite(self, json_body: TestSuiteRequest) -> PaginatedTestSuite:
        """Create a test suite from ``json_body``.

        :param json_body: the test suite creation request.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @abstractmethod
    def get_test_suite(
        self,
        test_suite_id: str,
        page: int = 1,
        page_size: int = 5,
    ) -> PaginatedTestSuite:
        """Return one page of the test suite identified by ``test_suite_id``.

        :param test_suite_id: identifier of the test suite.
        :param page: page number to fetch; defaults to 1.
        :param page_size: number of items per page; defaults to 5.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @abstractmethod
    def get_runs_for_test_suite(
        self,
        test_suite_id: str,
        sort: Optional[str] = None,
        page: int = 1,
        page_size: int = 5,
    ) -> PaginatedRuns:
        """Return one page of runs belonging to a test suite.

        :param test_suite_id: identifier of the test suite.
        :param sort: optional sort specifier; format is implementation-defined.
        :param page: page number to fetch; defaults to 1.
        :param page_size: number of items per page; defaults to 5.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @abstractmethod
    def create_new_test_run(
        self, test_suite_id: str, json_body: CreateRunRequest
    ) -> CreateRunResponse:
        """Create a new run for the test suite identified by ``test_suite_id``.

        :param test_suite_id: identifier of the test suite.
        :param json_body: the run creation request.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @abstractmethod
    def get_test_run(
        self,
        test_suite_id: str,
        test_run_id: str,
        page: int = 1,
        page_size: int = 5,
        sort: Optional[bool] = None,
    ) -> PaginatedRun:
        """Return one page of a single test run.

        :param test_suite_id: identifier of the test suite.
        :param test_run_id: identifier of the run.
        :param page: page number to fetch; defaults to 1.
        :param page_size: number of items per page; defaults to 5.
        :param sort: optional sort flag.
        :raises NotImplementedError: always, in this base class.
        """
        # NOTE(review): ``sort`` is Optional[bool] here but Optional[str] on the
        # other methods of this class -- confirm which type is intended.
        raise NotImplementedError

    @abstractmethod
    def get_summary_statistics(
        self,
        test_suite_id: str,
        run_id: Optional[str] = None,
        page: int = 1,
        page_size: int = 5,
    ) -> TestSuiteSummary:
        """Return summary statistics for a test suite.

        :param test_suite_id: identifier of the test suite.
        :param run_id: optional run identifier, presumably restricting the
            summary to that run (confirm against concrete implementations).
        :param page: page number to fetch; defaults to 1.
        :param page_size: number of items per page; defaults to 5.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @abstractmethod
    def delete_test_suite(self, test_suite_id: str):
        """Delete the test suite identified by ``test_suite_id``.

        :param test_suite_id: identifier of the test suite.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @abstractmethod
    def delete_test_run(self, test_suite_id: str, test_run_id: str):
        """Delete a run from a test suite.

        :param test_suite_id: identifier of the test suite.
        :param test_run_id: identifier of the run.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
Oops, something went wrong.