diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b7e29ae5..fc55f82c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,9 @@ - feature: `ExpandChunks` task takes a retriever and some search results to expand the chunks to the desired length ### Fixes -... +- fix: `ExpectedSearchOutput` has only relevant fields and supports generic document-`ID` rather than just str +- fix: `SearchEvaluationLogic` explicitly compares documents by ids + ## 0.9.0 diff --git a/src/intelligence_layer/use_cases/search/search.py b/src/intelligence_layer/use_cases/search/search.py index 6d5348b0d..1472cf06c 100644 --- a/src/intelligence_layer/use_cases/search/search.py +++ b/src/intelligence_layer/use_cases/search/search.py @@ -75,13 +75,10 @@ def do_run(self, input: SearchInput, task_span: TaskSpan) -> SearchOutput[ID]: return SearchOutput(results=results) -class ExpectedSearchOutput(BaseModel): - document_id: str +class ExpectedSearchOutput(BaseModel, Generic[ID]): + document_id: ID start_idx: int end_idx: int - origin_chunk: str - answer: str - task_label: str class SearchEvaluation(BaseModel): @@ -92,26 +89,30 @@ class SearchEvaluation(BaseModel): class SearchEvaluationLogic( Generic[ID], SingleOutputEvaluationLogic[ - SearchInput, SearchOutput[ID], ExpectedSearchOutput, SearchEvaluation + SearchInput, SearchOutput[ID], ExpectedSearchOutput[ID], SearchEvaluation ], ): def do_evaluate_single_output( self, - example: Example[SearchInput, ExpectedSearchOutput], + example: Example[SearchInput, ExpectedSearchOutput[ID]], output: SearchOutput[ID], ) -> SearchEvaluation: results = output.results - def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool: - a_start, a_end = a - b_start, b_end = b + def same_document(id_a: ID, id_b: ID) -> bool: + return id_a == id_b + + def chunks_overlap(range_a: tuple[int, int], range_b: tuple[int, int]) -> bool: + a_start, a_end = range_a + b_start, b_end = range_b return a_start < b_end and b_start < a_end rank, score = next( ( (index + 1, result.score) for index, result in enumerate(results) - if overlaps( + if same_document(result.id, example.expected_output.document_id) + and chunks_overlap( (result.document_chunk.start, result.document_chunk.end), ( example.expected_output.start_idx, diff --git a/tests/use_cases/search/test_search.py b/tests/use_cases/search/test_search.py index 470534c87..9e36637e8 100644 --- a/tests/use_cases/search/test_search.py +++ b/tests/use_cases/search/test_search.py @@ -38,21 +38,18 @@ def search(asymmetric_in_memory_retriever: QdrantInMemoryRetriever) -> Search[in @fixture -def expected_output() -> ExpectedSearchOutput: +def expected_output() -> ExpectedSearchOutput[str]: return ExpectedSearchOutput( document_id="1", start_idx=0, end_idx=5, - origin_chunk="hallo", - answer="", - task_label="", ) @fixture def example( - expected_output: ExpectedSearchOutput, -) -> Example[SearchInput, ExpectedSearchOutput]: + expected_output: ExpectedSearchOutput[str], +) -> Example[SearchInput, ExpectedSearchOutput[str]]: return Example(input=SearchInput(query=""), expected_output=expected_output) @@ -95,7 +92,7 @@ def test_search( def test_search_evaluation_logic_works_for_overlapping_output( - example: Example[SearchInput, ExpectedSearchOutput], + example: Example[SearchInput, ExpectedSearchOutput[str]], search_eval_logic: SearchEvaluationLogic[str], ) -> None: output = SearchOutput( @@ -114,7 +111,7 @@ def test_search_evaluation_logic_works_for_overlapping_output( def test_search_evaluation_logic_works_for_wholly_included_output( - example: Example[SearchInput, ExpectedSearchOutput], + example: Example[SearchInput, ExpectedSearchOutput[str]], search_eval_logic: SearchEvaluationLogic[str], ) -> None: output = SearchOutput( @@ -133,7 +130,7 @@ def test_search_evaluation_logic_works_for_wholly_included_output( def test_search_evaluation_logic_works_for_identical_ranges( - example: Example[SearchInput, ExpectedSearchOutput], + example: Example[SearchInput, ExpectedSearchOutput[str]], search_eval_logic: SearchEvaluationLogic[str], ) -> None: output = SearchOutput( @@ -152,7 +149,7 @@ def test_search_evaluation_logic_works_for_identical_ranges( def test_search_evaluation_logic_works_for_non_overlapping_output( - example: Example[SearchInput, ExpectedSearchOutput], + example: Example[SearchInput, ExpectedSearchOutput[str]], search_eval_logic: SearchEvaluationLogic[str], ) -> None: output = SearchOutput(