Commit: SearchEvaluationLogic
NickyHavoc committed Apr 8, 2024
1 parent f77239a commit cffad66
Showing 4 changed files with 146 additions and 62 deletions.
4 changes: 2 additions & 2 deletions src/intelligence_layer/connectors/__init__.py
@@ -27,10 +27,10 @@
from .limited_concurrency_client import (
    LimitedConcurrencyClient as LimitedConcurrencyClient,
)
-from .retrievers.base_retriever import BaseRetriever  # noqa :F401
-from .retrievers.base_retriever import SearchResult  # noqa :F401
+from .retrievers.base_retriever import BaseRetriever as BaseRetriever
from .retrievers.base_retriever import Document as Document
from .retrievers.base_retriever import DocumentChunk as DocumentChunk
+from .retrievers.base_retriever import SearchResult as SearchResult
from .retrievers.document_index_retriever import (
    DocumentIndexRetriever as DocumentIndexRetriever,
)
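Note on the change above: swapping the "# noqa :F401" comments for redundant "import X as X" aliases is the explicit re-export idiom. A bare import in an __init__.py trips the linter's unused-import rule and is treated as private by mypy when implicit re-exports are disabled (strict mode); the redundant alias marks the name as intentionally public, which mypy and recent pyflakes releases generally accept without suppression. A minimal sketch of the idiom (the mypack package below is hypothetical, not part of this repository):

# mypack/base.py (hypothetical)
class BaseRetriever:
    """Stand-in for the class being re-exported."""

# mypack/__init__.py (hypothetical)
# Bare form: needs "# noqa: F401" and stays private under
# mypy --no-implicit-reexport:
#     from .base import BaseRetriever  # noqa: F401
# Redundant-alias form: an explicit public re-export, so
# "from mypack import BaseRetriever" type-checks with no suppression:
from .base import BaseRetriever as BaseRetriever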
10 changes: 5 additions & 5 deletions src/intelligence_layer/use_cases/__init__.py
@@ -56,14 +56,14 @@
from .qa.single_chunk_qa import SingleChunkQa as SingleChunkQa
from .qa.single_chunk_qa import SingleChunkQaInput as SingleChunkQaInput
from .qa.single_chunk_qa import SingleChunkQaOutput as SingleChunkQaOutput
+from .search.search import AggregatedSearchEvaluation as AggregatedSearchEvaluation
+from .search.search import ExpectedSearchOutput as ExpectedSearchOutput
from .search.search import Search as Search
+from .search.search import SearchAggregationLogic as SearchAggregationLogic
+from .search.search import SearchEvaluation as SearchEvaluation
+from .search.search import SearchEvaluationLogic as SearchEvaluationLogic
from .search.search import SearchInput as SearchInput
from .search.search import SearchOutput as SearchOutput
-from .search.search import ExpectedSearchOutput as ExpectedSearchOutput
-from .search.search import SearchEvaluationLogic as SearchEvaluationLogic
-from .search.search import SearchEvaluation as SearchEvaluation
-from .search.search import SearchAggregationLogic as SearchAggregationLogic
-from .search.search import AggregatedSearchEvaluation as AggregatedSearchEvaluation
from .summarize.recursive_summarize import RecursiveSummarize as RecursiveSummarize
from .summarize.recursive_summarize import (
    RecursiveSummarizeInput as RecursiveSummarizeInput,
63 changes: 29 additions & 34 deletions src/intelligence_layer/use_cases/search/search.py
@@ -8,7 +8,11 @@
    SearchResult,
)
from intelligence_layer.core import Task, TaskSpan
-from intelligence_layer.evaluation import EvaluationLogic, Example, SuccessfulExampleOutput
+from intelligence_layer.evaluation import (
+    EvaluationLogic,
+    Example,
+    SuccessfulExampleOutput,
+)
from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic


@@ -81,53 +85,44 @@ class ExpectedSearchOutput(BaseModel):


class SearchEvaluation(BaseModel):
    """"""

    rank: Optional[int]
    similarity_score: Optional[float]


class SearchEvaluationLogic(
    EvaluationLogic[
-        SearchInput, SearchOutput, ExpectedSearchOutput, SearchEvaluation
+        SearchInput, SearchOutput[ID], ExpectedSearchOutput, SearchEvaluation
    ]
):

    def do_evaluate(
        self,
        example: Example[SearchInput, ExpectedSearchOutput],
-        *output: SuccessfulExampleOutput[SearchOutput]
+        *output: SuccessfulExampleOutput[SearchOutput[ID]],
    ) -> SearchEvaluation:
        assert len(output) == 1
        results = output[0].output.results

-        def overlaps(range_1: tuple[int, int], range_2: tuple[int, int]) -> bool:
-            0, 5 - 5, 6
-            "hallo hi"
-            if range_1[0] <= range_2[0]:
-                return range_1[1] < range_2[0]
-
-
-        next(index for index, result in enumerate(results) if overlaps(
-            (result.document_chunk.start, result.document_chunk.end),
-            (example.expected_output.start_idx, example.expected_output.end_idx)
-        ))
-
-
-        # for any example in the source dataset, this function receives:
-        # the input used to generate the result
-        # the expected output given the input
-        # the generated result
-
-        # doc chunks overlap?
-        # calculate MRR
-
-        found_start = 1000
-        found_end = 1500
-
-        expected_start = 800
-        expected_end = 1100
-
-        return super().do_evaluate(example, *output)
+        def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
+            a_start, a_end = a
+            b_start, b_end = b
+            return a_start < b_end and b_start < a_end
+
+        index, score = next(
+            (
+                (index, result.score)
+                for index, result in enumerate(results)
+                if overlaps(
+                    (result.document_chunk.start, result.document_chunk.end),
+                    (
+                        example.expected_output.start_idx,
+                        example.expected_output.end_idx,
+                    ),
+                )
+            ),
+            (None, None),
+        )
+
+        return SearchEvaluation(rank=index, similarity_score=score)


class MeanTopK(BaseModel):
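For reference, the rewritten do_evaluate treats chunk boundaries as half-open [start, end) intervals: two ranges overlap exactly when each starts before the other ends, so ranges that merely touch do not count, and the recorded rank is the 0-based position of the first overlapping result. That rank is what a mean-reciprocal-rank aggregation (the "calculate MRR" note in the deleted scratch code) would consume. A standalone sketch of both pieces, independent of the Intelligence Layer types:

from typing import Optional


def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
    # Half-open [start, end) semantics: each range starts before the other ends.
    a_start, a_end = a
    b_start, b_end = b
    return a_start < b_end and b_start < a_end


def mean_reciprocal_rank(ranks: list[Optional[int]]) -> float:
    # 0-based ranks; a miss (None) contributes 0 to the mean.
    if not ranks:
        return 0.0
    return sum(1 / (rank + 1) for rank in ranks if rank is not None) / len(ranks)


assert overlaps((0, 5), (2, 7))       # partial overlap
assert overlaps((0, 5), (2, 3))       # full containment
assert not overlaps((0, 5), (5, 10))  # touching at index 5 is not an overlap
assert mean_reciprocal_rank([0, 1, None]) == 0.5  # (1 + 1/2 + 0) / 3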
131 changes: 110 additions & 21 deletions tests/use_cases/search/test_search.py
@@ -2,10 +2,22 @@

from pytest import fixture

-from intelligence_layer.connectors import Document, SearchResult, QdrantInMemoryRetriever, DocumentChunk
+from intelligence_layer.connectors import (
+    Document,
+    DocumentChunk,
+    QdrantInMemoryRetriever,
+    SearchResult,
+)
from intelligence_layer.core import NoOpTracer
from intelligence_layer.evaluation import Example
-from intelligence_layer.use_cases import Search, SearchEvaluationLogic, SearchInput, ExpectedSearchOutput, SearchOutput
+from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
+from intelligence_layer.use_cases import (
+    ExpectedSearchOutput,
+    Search,
+    SearchEvaluationLogic,
+    SearchInput,
+    SearchOutput,
+)
from tests.conftest import to_document


@@ -23,23 +35,25 @@ def search(asymmetric_in_memory_retriever: QdrantInMemoryRetriever) -> Search[int]:
    return Search(asymmetric_in_memory_retriever)


-@fixture
-def example() -> Example:
-    return Example(input=SearchInput(query=""))


@fixture
def expected_output() -> ExpectedSearchOutput:
    return ExpectedSearchOutput(
        document_id="1",
        start_idx=0,
        end_idx=5,
-        origin_chunk="hallo ",
+        origin_chunk="hallo",
        answer="",
-        task_label=""
+        task_label="",
    )


+@fixture
+def example(
+    expected_output: ExpectedSearchOutput,
+) -> Example[SearchInput, ExpectedSearchOutput]:
+    return Example(input=SearchInput(query=""), expected_output=expected_output)


def test_search(
    search: Search[int],
    no_op_tracer: NoOpTracer,
@@ -57,18 +71,93 @@ def test_search(
    )


-def test_search_evaluation_logic_works_for_non_overlapping_output(example: Example, expected_output: ExpectedSearchOutput) -> None:
+def test_search_evaluation_logic_works_for_overlapping_output(
+    example: Example[SearchInput, ExpectedSearchOutput],
+) -> None:
+    logic = SearchEvaluationLogic[SearchResult[str]]()
+    output = SuccessfulExampleOutput(
+        run_id="1",
+        example_id="1",
+        output=SearchOutput(
+            results=[
+                SearchResult[str](
+                    id="1",
+                    score=0.5,
+                    document_chunk=DocumentChunk(text="llo", start=2, end=5),
+                )
+            ]
+        ),
+    )
+    eval = logic.do_evaluate(example, output)
+
+    assert eval.rank == 0
+    assert eval.similarity_score == output.output.results[0].score
+
+
+def test_search_evaluation_logic_works_for_wholly_included_output(
+    example: Example[SearchInput, ExpectedSearchOutput],
+) -> None:
+    logic = SearchEvaluationLogic()
+    output = SuccessfulExampleOutput(
+        run_id="1",
+        example_id="1",
+        output=SearchOutput(
+            results=[
+                SearchResult(
+                    id="1",
+                    score=0.5,
+                    document_chunk=DocumentChunk(text="l", start=2, end=3),
+                )
+            ]
+        ),
+    )
+    eval = logic.do_evaluate(example, *[output])
+
+    assert eval.rank == 0
+    assert eval.similarity_score == output.output.results[0].score
+
+
+def test_search_evaluation_logic_works_for_identical_ranges(
+    example: Example[SearchInput, ExpectedSearchOutput],
+) -> None:
+    logic = SearchEvaluationLogic()
+    output = SuccessfulExampleOutput(
+        run_id="1",
+        example_id="1",
+        output=SearchOutput(
+            results=[
+                SearchResult(
+                    id="1",
+                    score=0.5,
+                    document_chunk=DocumentChunk(text="hallo", start=0, end=5),
+                )
+            ]
+        ),
+    )
+    eval = logic.do_evaluate(example, *[output])
+
+    assert eval.rank == 0
+    assert eval.similarity_score == output.output.results[0].score
+
+
+def test_search_evaluation_logic_works_for_non_overlapping_output(
+    example: Example[SearchInput, ExpectedSearchOutput],
+) -> None:
    logic = SearchEvaluationLogic()
-    output = SearchOutput(
-        results=[
-            SearchResult(
-                id="1",
-                score=0.5,
-                document_chunk=DocumentChunk(
-                    text="test ",
-                    start=5,
-                    end=10
-                )
-            )
-        ]
-    )
+    output = SuccessfulExampleOutput(
+        run_id="1",
+        example_id="1",
+        output=SearchOutput(
+            results=[
+                SearchResult(
+                    id="1",
+                    score=0.5,
+                    document_chunk=DocumentChunk(text=" test.", start=5, end=10),
+                )
+            ]
+        ),
+    )
+    eval = logic.do_evaluate(example, *[output])
+
+    assert not eval.rank
+    assert not eval.similarity_score
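One subtlety in this last test: the expected span is [0, 5) and the returned chunk is [5, 10), so the two ranges only touch at index 5 and the half-open overlap check correctly reports a miss. The closing "assert not eval.rank" works here, but "not 0" is also true, so that form alone cannot distinguish a miss (None) from a top-ranked hit (rank 0); the "eval.rank == 0" assertions in the other tests are the unambiguous spelling. A minimal illustration (plain Python, not part of the commit):

from typing import Optional

miss: Optional[int] = None  # no overlapping result was found
top_hit: Optional[int] = 0  # best possible 0-based rank

# Both values are falsy, so "not rank" conflates the two outcomes:
assert not miss and not top_hit

# An explicit None check keeps them distinct:
assert miss is None
assert top_hit is not None and top_hit == 0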
