From cffad663ee3d15ed8ad3e804d8ccf84655c0f637 Mon Sep 17 00:00:00 2001 From: "niklas.finken" Date: Mon, 8 Apr 2024 13:47:58 +0200 Subject: [PATCH] `SearchEvaluationLogic` --- src/intelligence_layer/connectors/__init__.py | 4 +- src/intelligence_layer/use_cases/__init__.py | 10 +- .../use_cases/search/search.py | 63 ++++----- tests/use_cases/search/test_search.py | 131 +++++++++++++++--- 4 files changed, 146 insertions(+), 62 deletions(-) diff --git a/src/intelligence_layer/connectors/__init__.py b/src/intelligence_layer/connectors/__init__.py index 9134dd347..294b91ff6 100644 --- a/src/intelligence_layer/connectors/__init__.py +++ b/src/intelligence_layer/connectors/__init__.py @@ -27,10 +27,10 @@ from .limited_concurrency_client import ( LimitedConcurrencyClient as LimitedConcurrencyClient, ) -from .retrievers.base_retriever import BaseRetriever # noqa :F401 -from .retrievers.base_retriever import SearchResult # noqa :F401 +from .retrievers.base_retriever import BaseRetriever as BaseRetriever from .retrievers.base_retriever import Document as Document from .retrievers.base_retriever import DocumentChunk as DocumentChunk +from .retrievers.base_retriever import SearchResult as SearchResult from .retrievers.document_index_retriever import ( DocumentIndexRetriever as DocumentIndexRetriever, ) diff --git a/src/intelligence_layer/use_cases/__init__.py b/src/intelligence_layer/use_cases/__init__.py index 1c99a4b29..63428283f 100644 --- a/src/intelligence_layer/use_cases/__init__.py +++ b/src/intelligence_layer/use_cases/__init__.py @@ -56,14 +56,14 @@ from .qa.single_chunk_qa import SingleChunkQa as SingleChunkQa from .qa.single_chunk_qa import SingleChunkQaInput as SingleChunkQaInput from .qa.single_chunk_qa import SingleChunkQaOutput as SingleChunkQaOutput +from .search.search import AggregatedSearchEvaluation as AggregatedSearchEvaluation +from .search.search import ExpectedSearchOutput as ExpectedSearchOutput from .search.search import Search as Search +from .search.search import SearchAggregationLogic as SearchAggregationLogic +from .search.search import SearchEvaluation as SearchEvaluation +from .search.search import SearchEvaluationLogic as SearchEvaluationLogic from .search.search import SearchInput as SearchInput from .search.search import SearchOutput as SearchOutput -from .search.search import ExpectedSearchOutput as ExpectedSearchOutput -from .search.search import SearchEvaluationLogic as SearchEvaluationLogic -from .search.search import SearchEvaluation as SearchEvaluation -from .search.search import SearchAggregationLogic as SearchAggregationLogic -from .search.search import AggregatedSearchEvaluation as AggregatedSearchEvaluation from .summarize.recursive_summarize import RecursiveSummarize as RecursiveSummarize from .summarize.recursive_summarize import ( RecursiveSummarizeInput as RecursiveSummarizeInput, diff --git a/src/intelligence_layer/use_cases/search/search.py b/src/intelligence_layer/use_cases/search/search.py index 9d0bc555e..1f1262096 100644 --- a/src/intelligence_layer/use_cases/search/search.py +++ b/src/intelligence_layer/use_cases/search/search.py @@ -8,7 +8,11 @@ SearchResult, ) from intelligence_layer.core import Task, TaskSpan -from intelligence_layer.evaluation import EvaluationLogic, Example, SuccessfulExampleOutput +from intelligence_layer.evaluation import ( + EvaluationLogic, + Example, + SuccessfulExampleOutput, +) from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic @@ -81,53 +85,44 @@ class ExpectedSearchOutput(BaseModel): class SearchEvaluation(BaseModel): - """""" - rank: Optional[int] + similarity_score: Optional[float] class SearchEvaluationLogic( EvaluationLogic[ - SearchInput, SearchOutput, ExpectedSearchOutput, SearchEvaluation + SearchInput, SearchOutput[ID], ExpectedSearchOutput, SearchEvaluation ] ): - def do_evaluate( self, example: Example[SearchInput, ExpectedSearchOutput], - *output: SuccessfulExampleOutput[SearchOutput] + *output: SuccessfulExampleOutput[SearchOutput[ID]], ) -> SearchEvaluation: assert len(output) == 1 results = output[0].output.results - def overlaps(range_1: tuple[int, int], range_2: tuple[int, int]) -> bool: - 0, 5 - 5, 6 - "hallo hi" - if range_1[0] <= range_2[0]: - return range_1[1] < range_2[0] - - - next(index for index, result in enumerate(results) if overlaps( - (result.document_chunk.start, result.document_chunk.end), - (example.expected_output.start_idx, example.expected_output.end_idx) - )) - - - # for any example in the source dataset, this function receives: - # the input used to generate the result - # the expected output given the input - # the generated result - - # doc chunks overlap? - # calculate MRR - - found_start = 1000 - found_end = 1500 - - expected_start = 800 - expected_end = 1100 - - return super().do_evaluate(example, *output) + def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool: + a_start, a_end = a + b_start, b_end = b + return a_start < b_end and b_start < a_end + + index, score = next( + ( + (index, result.score) + for index, result in enumerate(results) + if overlaps( + (result.document_chunk.start, result.document_chunk.end), + ( + example.expected_output.start_idx, + example.expected_output.end_idx, + ), + ) + ), + (None, None), + ) + + return SearchEvaluation(rank=index, similarity_score=score) class MeanTopK(BaseModel): diff --git a/tests/use_cases/search/test_search.py b/tests/use_cases/search/test_search.py index d63b0e901..6f5d58917 100644 --- a/tests/use_cases/search/test_search.py +++ b/tests/use_cases/search/test_search.py @@ -2,10 +2,22 @@ from pytest import fixture -from intelligence_layer.connectors import Document, SearchResult, QdrantInMemoryRetriever, DocumentChunk +from intelligence_layer.connectors import ( + Document, + DocumentChunk, + QdrantInMemoryRetriever, + SearchResult, +) from intelligence_layer.core import NoOpTracer from intelligence_layer.evaluation import Example -from intelligence_layer.use_cases import Search, SearchEvaluationLogic, SearchInput, ExpectedSearchOutput, SearchOutput +from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput +from intelligence_layer.use_cases import ( + ExpectedSearchOutput, + Search, + SearchEvaluationLogic, + SearchInput, + SearchOutput, +) from tests.conftest import to_document @@ -23,23 +35,25 @@ def search(asymmetric_in_memory_retriever: QdrantInMemoryRetriever) -> Search[in return Search(asymmetric_in_memory_retriever) -@fixture -def example() -> Example: - return Example(input=SearchInput(query="")) - - @fixture def expected_output() -> ExpectedSearchOutput: return ExpectedSearchOutput( document_id="1", start_idx=0, end_idx=5, - origin_chunk="hallo ", + origin_chunk="hallo", answer="", - task_label="" + task_label="", ) +@fixture +def example( + expected_output: ExpectedSearchOutput, +) -> Example[SearchInput, ExpectedSearchOutput]: + return Example(input=SearchInput(query=""), expected_output=expected_output) + + def test_search( search: Search[int], no_op_tracer: NoOpTracer, @@ -57,18 +71,93 @@ def test_search( ) -def test_search_evaluation_logic_works_for_non_overlapping_output(example: Example, expected_output: ExpectedSearchOutput) -> None: +def test_search_evaluation_logic_works_for_overlapping_output( + example: Example[SearchInput, ExpectedSearchOutput], +) -> None: + logic = SearchEvaluationLogic[SearchResult[str]]() + output = SuccessfulExampleOutput( + run_id="1", + example_id="1", + output=SearchOutput( + results=[ + SearchResult[str]( + id="1", + score=0.5, + document_chunk=DocumentChunk(text="llo", start=2, end=5), + ) + ] + ), + ) + eval = logic.do_evaluate(example, output) + + assert eval.rank == 0 + assert eval.similarity_score == output.output.results[0].score + + +def test_search_evaluation_logic_works_for_wholly_included_output( + example: Example[SearchInput, ExpectedSearchOutput], +) -> None: + logic = SearchEvaluationLogic() + output = SuccessfulExampleOutput( + run_id="1", + example_id="1", + output=SearchOutput( + results=[ + SearchResult( + id="1", + score=0.5, + document_chunk=DocumentChunk(text="l", start=2, end=3), + ) + ] + ), + ) + eval = logic.do_evaluate(example, *[output]) + + assert eval.rank == 0 + assert eval.similarity_score == output.output.results[0].score + + +def test_search_evaluation_logic_works_for_identical_ranges( + example: Example[SearchInput, ExpectedSearchOutput], +) -> None: + logic = SearchEvaluationLogic() + output = SuccessfulExampleOutput( + run_id="1", + example_id="1", + output=SearchOutput( + results=[ + SearchResult( + id="1", + score=0.5, + document_chunk=DocumentChunk(text="hallo", start=0, end=5), + ) + ] + ), + ) + eval = logic.do_evaluate(example, *[output]) + + assert eval.rank == 0 + assert eval.similarity_score == output.output.results[0].score + + +def test_search_evaluation_logic_works_for_non_overlapping_output( + example: Example[SearchInput, ExpectedSearchOutput], +) -> None: logic = SearchEvaluationLogic() - output = SearchOutput( - results=[ - SearchResult( - id="1", - score=0.5, - document_chunk=DocumentChunk( - text="test ", - start=5, - end=10 + output = SuccessfulExampleOutput( + run_id="1", + example_id="1", + output=SearchOutput( + results=[ + SearchResult( + id="1", + score=0.5, + document_chunk=DocumentChunk(text=" test.", start=5, end=10), ) - ) - ] + ] + ), ) + eval = logic.do_evaluate(example, *[output]) + + assert not eval.rank + assert not eval.similarity_score