From d1570e75e4b861660e8b9734dbd976d0d6e0a9b9 Mon Sep 17 00:00:00 2001 From: "niklas.finken" Date: Tue, 16 Apr 2024 11:50:45 +0200 Subject: [PATCH 1/6] `ChunkWithIndices`-task that returns the start_index of each chunk within the origin text --- src/intelligence_layer/core/__init__.py | 3 ++ src/intelligence_layer/core/chunk.py | 41 ++++++++++++++++++ tests/core/test_chunk.py | 56 +++++++++++++++++++++++++ 3 files changed, 100 insertions(+) create mode 100644 tests/core/test_chunk.py diff --git a/src/intelligence_layer/core/__init__.py b/src/intelligence_layer/core/__init__.py index 350c75fa0..3966e8828 100644 --- a/src/intelligence_layer/core/__init__.py +++ b/src/intelligence_layer/core/__init__.py @@ -1,6 +1,9 @@ from .chunk import Chunk as Chunk from .chunk import ChunkInput as ChunkInput from .chunk import ChunkOutput as ChunkOutput +from .chunk import ChunkWithIndices as ChunkWithIndices +from .chunk import ChunkWithIndicesOutput as ChunkWithIndicesOutput +from .chunk import ChunkWithStartIndex as ChunkWithStartIndex from .chunk import TextChunk as TextChunk from .detect_language import DetectLanguage as DetectLanguage from .detect_language import DetectLanguageInput as DetectLanguageInput diff --git a/src/intelligence_layer/core/chunk.py b/src/intelligence_layer/core/chunk.py index 7aadb94f6..5fae5636e 100644 --- a/src/intelligence_layer/core/chunk.py +++ b/src/intelligence_layer/core/chunk.py @@ -60,3 +60,44 @@ def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkOutput: for t in self._splitter.chunks(input.text, self._max_tokens_per_chunk) ] return ChunkOutput(chunks=chunks) + + +class ChunkWithStartIndex(BaseModel): + chunk: TextChunk + start_index: int + + +class ChunkWithIndicesOutput(BaseModel): + """The output of a `ChunkTask`. + + Attributes: + chunks_with_indices: A list of smaller sections of the input text. + """ + + chunks_with_indices: Sequence[ChunkWithStartIndex] + + +class ChunkWithIndices(Task[ChunkInput, ChunkWithIndicesOutput]): + """Splits a longer text into smaller text chunks. + + Provide a text of any length and chunk it into smaller pieces using a + tokenizer that is available within the Aleph Alpha client. + + Args: + model: A valid Aleph Alpha model. + max_tokens_per_chunk: The maximum number of tokens to fit into one chunk. + """ + + def __init__(self, model: AlephAlphaModel, max_tokens_per_chunk: int = 512): + super().__init__() + self._splitter = TextSplitter.from_huggingface_tokenizer(model.get_tokenizer()) + self._max_tokens_per_chunk = max_tokens_per_chunk + + def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkWithIndicesOutput: + chunks_with_indices = [ + ChunkWithStartIndex(chunk=TextChunk(t[1]), start_index=t[0]) + for t in self._splitter.chunk_indices( + input.text, self._max_tokens_per_chunk + ) + ] + return ChunkWithIndicesOutput(chunks_with_indices=chunks_with_indices) diff --git a/tests/core/test_chunk.py b/tests/core/test_chunk.py new file mode 100644 index 000000000..e3968abbb --- /dev/null +++ b/tests/core/test_chunk.py @@ -0,0 +1,56 @@ +from pytest import fixture + +from intelligence_layer.core import ( + ChunkInput, + ChunkWithIndices, + LuminousControlModel, + NoOpTracer, +) + + +@fixture +def chunk_input() -> ChunkInput: + return ChunkInput( + text="""In the rolling verdant hills of a realm untouched by the passage of modern times, a kingdom thrived under the rule of a benevolent monarch. The king, known for his wisdom and justice, held the loyalty of his people and the respect of his peers. 
However, beneath the surface of peace, a shadow loomed that would test the mettle of the kingdom's most valiant defenders: the knights. + +These knights, clad in gleaming armor and bearing the colors of their liege, were not mere soldiers but champions of the realm's ideals. They were sworn to protect the innocent, uphold justice, and maintain the peace, guided by a chivalric code that was as much a part of them as the swords they wielded. Among these noble warriors, Sir Aelwyn stood prominent, known across the land for his prowess in battle and his unyielding honor. + +Sir Aelwyn, the youngest knight ever to be granted the title of Master of the Horse, was a figure of legend. His tales were told in every corner of the kingdom, often embellished with each retelling. From his duel with the Giant of Gormouth to his silent vigil in the Haunted Wood, Aelwyn's life was a tapestry of bravery and adventure. Yet, his greatest challenge lay ahead, whispered in fearful murmurs throughout the castle—the rise of the Dragon of Black Hollow. + +The dragon had awoken from a centuries-long slumber, driven by hunger and wrath, laying waste to the villages on the kingdom's fringes. Smoke and despair rose from the once tranquil borders, drawing the attention of the king and his council. With the threat growing each day, the king summoned Sir Aelwyn and tasked him with a quest that could either save the kingdom or doom it forever—to defeat the dragon. + +As Sir Aelwyn prepared for his journey, the castle buzzed with activity. Blacksmiths forged new armor and weapons, alchemists concocted potent draughts, and scholars poured over ancient texts seeking any knowledge that might aid him. The knight spent his nights in the chapel, praying for strength and wisdom, and his days in the training yard, honing his skills against opponents both real and imagined. + +Accompanying Sir Aelwyn were his loyal companions: Sir Rowan, a strategist known for his cunning and intellect; Lady Elara, a knight whose skill with the bow was unmatched; and Dame Miriel, a warrior-poet whose songs could stir the soul as fiercely as her sword could cleave armor. Together, they represented the kingdom's finest, united under a single cause. + +Their journey was fraught with peril. They crossed through the Whispering Forest, where shadows moved with minds of their own, and over the Mountains of Echoes, where the wind carried voices from the past. Each step brought them closer to their quarry, and the signs of the dragon's passage grew ever more ominous—the charred earth, the ruins of once happy homes, and the air heavy with the scent of sulfur. + +As they approached Black Hollow, the landscape grew bleak, and the sky darkened. The dragon, coiled atop a pile of gold and bones, awaited them, its scales shimmering like molten rock. The air crackled with the heat of its breath, and its eyes, glowing like coals, fixed on Sir Aelwyn and his companions. + +The battle was fierce. Sir Rowan directed their movements with precision, while Lady Elara loosed arrows that found chinks in the dragon's armor. Dame Miriel's voice rose above the clamor, her words bolstering their courage and blinding the beast with bursts of radiant light. Sir Aelwyn faced the dragon head-on, his shield absorbing the flames that poured from its maw, his sword striking with the weight of his oath behind each blow. + +Hours seemed like days as the clash continued, the outcome uncertain. Finally, seeing an opening, Sir Aelwyn drove his sword deep into the dragon's heart. 
With a final roar that shook the heavens, the dragon fell, its reign of terror ended. + +The return to the kingdom was triumphant. The people lined the streets, showering the knights with flowers and cheers. The king welcomed them back as heroes, their deeds to be recorded in the annals of history for generations to come. Sir Aelwyn and his companions had not only saved the kingdom but had also reaffirmed the values it stood for: courage, honor, and a steadfast commitment to the protection of the realm. + +As the celebrations faded, Sir Aelwyn looked out over the kingdom from the castle's highest tower. The peace they had fought for lay stretched before him, a tapestry of green fields and bustling towns. Yet, he knew that this peace was not permanent but a precious moment to be cherished and protected. For as long as there were threats to the realm, there would be knights to face them, their swords ready and their hearts brave. + +In this timeless land, the cycle of challenge and triumph continued, each generation of knights rising to meet the dangers of their times with the same valor and resolve as those who had come before them. And so, the legends grew, each knight adding their thread to the ever-unfolding story of the kingdom and its defenders.""" + ) + + +def test_chunk_with_indices( + luminous_control_model: LuminousControlModel, + chunk_input: ChunkInput, + no_op_tracer: NoOpTracer, +) -> None: + chunk_with_indices = ChunkWithIndices( + luminous_control_model, max_tokens_per_chunk=128 + ) + + output = chunk_with_indices.do_run(chunk_input, no_op_tracer) + + assert all( + c.start_index < output.chunks_with_indices[idx + 1].start_index + for idx, c in enumerate(output.chunks_with_indices[:-1]) + ) From 65476eee59a471ac820921c459dbaa7c42185803 Mon Sep 17 00:00:00 2001 From: "niklas.finken" Date: Tue, 16 Apr 2024 17:36:13 +0200 Subject: [PATCH 2/6] Implement `ExpandChunks`-task --- src/intelligence_layer/core/chunk.py | 18 +- src/intelligence_layer/use_cases/__init__.py | 3 + .../qa/multiple_chunk_retriever_qa.py | 6 +- .../use_cases/search/expand_chunk.py | 78 ++++++++ tests/use_cases/search/test_expand_chunk.py | 170 ++++++++++++++++++ 5 files changed, 267 insertions(+), 8 deletions(-) create mode 100644 src/intelligence_layer/use_cases/search/expand_chunk.py create mode 100644 tests/use_cases/search/test_expand_chunk.py diff --git a/src/intelligence_layer/core/chunk.py b/src/intelligence_layer/core/chunk.py index 5fae5636e..b71194a57 100644 --- a/src/intelligence_layer/core/chunk.py +++ b/src/intelligence_layer/core/chunk.py @@ -19,7 +19,7 @@ class ChunkInput(BaseModel): - """The input for a `ChunkTask`. + """The input for a `Chunk`-task. Attributes: text: A text of arbitrary length. @@ -63,25 +63,33 @@ def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkOutput: class ChunkWithStartIndex(BaseModel): + """A `TextChunk` and its `start_index` relative to its parent document. + + Attributes: + chunk: The actual text. + start_index: The character start index of the chunk within the respective document. + """ + chunk: TextChunk start_index: int class ChunkWithIndicesOutput(BaseModel): - """The output of a `ChunkTask`. + """The output of a `ChunkWithIndices`-task. Attributes: - chunks_with_indices: A list of smaller sections of the input text. + chunks_with_indices: A list of smaller sections of the input text with the respective start_index. 
""" chunks_with_indices: Sequence[ChunkWithStartIndex] class ChunkWithIndices(Task[ChunkInput, ChunkWithIndicesOutput]): - """Splits a longer text into smaller text chunks. + """Splits a longer text into smaller text chunks and returns the chunks' start indices. Provide a text of any length and chunk it into smaller pieces using a - tokenizer that is available within the Aleph Alpha client. + tokenizer that is available within the Aleph Alpha client. For each chunk, the respective + start index relative to the document is also returned. Args: model: A valid Aleph Alpha model. diff --git a/src/intelligence_layer/use_cases/__init__.py b/src/intelligence_layer/use_cases/__init__.py index d1f86d7e2..a97a5d630 100644 --- a/src/intelligence_layer/use_cases/__init__.py +++ b/src/intelligence_layer/use_cases/__init__.py @@ -62,6 +62,9 @@ from .qa.single_chunk_qa import SingleChunkQa as SingleChunkQa from .qa.single_chunk_qa import SingleChunkQaInput as SingleChunkQaInput from .qa.single_chunk_qa import SingleChunkQaOutput as SingleChunkQaOutput +from .search.expand_chunk import ExpandChunkInput as ExpandChunkInput +from .search.expand_chunk import ExpandChunkOutput as ExpandChunkOutput +from .search.expand_chunk import ExpandChunks as ExpandChunks from .search.search import AggregatedSearchEvaluation as AggregatedSearchEvaluation from .search.search import ChunkFound as ChunkFound from .search.search import ExpectedSearchOutput as ExpectedSearchOutput diff --git a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py index 2552bbcf2..45bcfc183 100644 --- a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py +++ b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py @@ -101,9 +101,9 @@ def _get_highlights_per_chunk( if highlight.start < next_start and highlight.end > current_start: highlights_with_indices_fixed = ScoredTextHighlight( start=max(0, highlight.start - current_start), - end=min(highlight.end - current_start, next_start) - if isinstance(next_start, int) - else highlight.end, + end=highlight.end - current_start + if isinstance(next_start, float) + else min(next_start, highlight.end - current_start), score=highlight.score, ) current_overlaps.append(highlights_with_indices_fixed) diff --git a/src/intelligence_layer/use_cases/search/expand_chunk.py b/src/intelligence_layer/use_cases/search/expand_chunk.py new file mode 100644 index 000000000..4d24147ac --- /dev/null +++ b/src/intelligence_layer/use_cases/search/expand_chunk.py @@ -0,0 +1,78 @@ +from typing import Generic, Sequence + +from pydantic import BaseModel + +from intelligence_layer.connectors import BaseRetriever, DocumentChunk +from intelligence_layer.connectors.retrievers.base_retriever import ID +from intelligence_layer.core.chunk import ChunkInput, ChunkWithIndices, TextChunk +from intelligence_layer.core.model import AlephAlphaModel +from intelligence_layer.core.task import Task +from intelligence_layer.core.tracer.tracer import TaskSpan + + +class ExpandChunkInput(BaseModel, Generic[ID]): + document_id: ID + chunks_found: Sequence[DocumentChunk] + + +class ExpandChunkOutput(BaseModel): + chunks: Sequence[TextChunk] + + +class ExpandChunks(Generic[ID], Task[ExpandChunkInput[ID], ExpandChunkOutput]): + def __init__( + self, + retriever: BaseRetriever[ID], + model: AlephAlphaModel, + max_chunk_size: int = 512, + ): + super().__init__() + self._retriever = retriever + self._chunk_with_indices = ChunkWithIndices(model, 
max_chunk_size) + + def do_run( + self, input: ExpandChunkInput[ID], task_span: TaskSpan + ) -> ExpandChunkOutput: + full_doc = self._retriever.get_full_document(input.document_id) + if not full_doc: + raise RuntimeError(f"No document for id '{input.document_id}' found") + + chunk_with_indices = self._chunk_with_indices.run( + ChunkInput(text=full_doc.text), task_span + ).chunks_with_indices + + overlapping_chunk_indices = self._overlapping_chunk_indices( + [c.start_index for c in chunk_with_indices], + [(chunk.start, chunk.end) for chunk in input.chunks_found], + ) + + return ExpandChunkOutput( + chunks=[ + chunk_with_indices[index].chunk for index in overlapping_chunk_indices + ] + ) + + def _overlapping_chunk_indices( + self, + chunk_start_indices: Sequence[int], + target_ranges: Sequence[tuple[int, int]], + ) -> list[int]: + n = len(chunk_start_indices) + overlapping_indices: list[int] = [] + + for i in range(n): + if i < n - 1: + chunk_end: float = chunk_start_indices[i + 1] + else: + chunk_end = float("inf") + + if any( + ( + chunk_start_indices[i] <= target_range[1] + and chunk_end > target_range[0] + ) + for target_range in target_ranges + ): + overlapping_indices.append(i) + + return overlapping_indices diff --git a/tests/use_cases/search/test_expand_chunk.py b/tests/use_cases/search/test_expand_chunk.py new file mode 100644 index 000000000..474037a81 --- /dev/null +++ b/tests/use_cases/search/test_expand_chunk.py @@ -0,0 +1,170 @@ +from typing import Sequence + +from pytest import fixture + +from intelligence_layer.connectors import ( + Document, + DocumentChunk, + QdrantInMemoryRetriever, +) +from intelligence_layer.core import LuminousControlModel, NoOpTracer +from intelligence_layer.use_cases import ExpandChunkInput, ExpandChunks + + +@fixture +def in_memory_retriever_documents() -> Sequence[Document]: + return [ + Document( + text="""In the rolling verdant hills of a realm untouched by the passage of modern times, a kingdom thrived under the rule of a benevolent monarch. The king, known for his wisdom and justice, held the loyalty of his people and the respect of his peers. However, beneath the surface of peace, a shadow loomed that would test the mettle of the kingdom's most valiant defenders: the knights. + +These knights, clad in gleaming armor and bearing the colors of their liege, were not mere soldiers but champions of the realm's ideals. They were sworn to protect the innocent, uphold justice, and maintain the peace, guided by a chivalric code that was as much a part of them as the swords they wielded. Among these noble warriors, Sir Aelwyn stood prominent, known across the land for his prowess in battle and his unyielding honor. + +Sir Aelwyn, the youngest knight ever to be granted the title of Master of the Horse, was a figure of legend. His tales were told in every corner of the kingdom, often embellished with each retelling. From his duel with the Giant of Gormouth to his silent vigil in the Haunted Wood, Aelwyn's life was a tapestry of bravery and adventure. Yet, his greatest challenge lay ahead, whispered in fearful murmurs throughout the castle—the rise of the Dragon of Black Hollow. + +The dragon had awoken from a centuries-long slumber, driven by hunger and wrath, laying waste to the villages on the kingdom's fringes. Smoke and despair rose from the once tranquil borders, drawing the attention of the king and his council. 
With the threat growing each day, the king summoned Sir Aelwyn and tasked him with a quest that could either save the kingdom or doom it forever—to defeat the dragon. + +As Sir Aelwyn prepared for his journey, the castle buzzed with activity. Blacksmiths forged new armor and weapons, alchemists concocted potent draughts, and scholars poured over ancient texts seeking any knowledge that might aid him. The knight spent his nights in the chapel, praying for strength and wisdom, and his days in the training yard, honing his skills against opponents both real and imagined. + +Accompanying Sir Aelwyn were his loyal companions: Sir Rowan, a strategist known for his cunning and intellect; Lady Elara, a knight whose skill with the bow was unmatched; and Dame Miriel, a warrior-poet whose songs could stir the soul as fiercely as her sword could cleave armor. Together, they represented the kingdom's finest, united under a single cause. + +Their journey was fraught with peril. They crossed through the Whispering Forest, where shadows moved with minds of their own, and over the Mountains of Echoes, where the wind carried voices from the past. Each step brought them closer to their quarry, and the signs of the dragon's passage grew ever more ominous—the charred earth, the ruins of once happy homes, and the air heavy with the scent of sulfur. + +As they approached Black Hollow, the landscape grew bleak, and the sky darkened. The dragon, coiled atop a pile of gold and bones, awaited them, its scales shimmering like molten rock. The air crackled with the heat of its breath, and its eyes, glowing like coals, fixed on Sir Aelwyn and his companions. + +The battle was fierce. Sir Rowan directed their movements with precision, while Lady Elara loosed arrows that found chinks in the dragon's armor. Dame Miriel's voice rose above the clamor, her words bolstering their courage and blinding the beast with bursts of radiant light. Sir Aelwyn faced the dragon head-on, his shield absorbing the flames that poured from its maw, his sword striking with the weight of his oath behind each blow. + +Hours seemed like days as the clash continued, the outcome uncertain. Finally, seeing an opening, Sir Aelwyn drove his sword deep into the dragon's heart. With a final roar that shook the heavens, the dragon fell, its reign of terror ended. + +The return to the kingdom was triumphant. The people lined the streets, showering the knights with flowers and cheers. The king welcomed them back as heroes, their deeds to be recorded in the annals of history for generations to come. Sir Aelwyn and his companions had not only saved the kingdom but had also reaffirmed the values it stood for: courage, honor, and a steadfast commitment to the protection of the realm. + +As the celebrations faded, Sir Aelwyn looked out over the kingdom from the castle's highest tower. The peace they had fought for lay stretched before him, a tapestry of green fields and bustling towns. Yet, he knew that this peace was not permanent but a precious moment to be cherished and protected. For as long as there were threats to the realm, there would be knights to face them, their swords ready and their hearts brave. + +In this timeless land, the cycle of challenge and triumph continued, each generation of knights rising to meet the dangers of their times with the same valor and resolve as those who had come before them. 
And so, the legends grew, each knight adding their thread to the ever-unfolding story of the kingdom and its defenders.""" + ) + ] + + +def build_expand_chunk_input( + document: Document, index_ranges: Sequence[tuple[int, int]] +) -> ExpandChunkInput[int]: + return ExpandChunkInput( + document_id=0, + chunks_found=[ + DocumentChunk( + text=document.text[index_range[0] : index_range[1]], + start=index_range[0], + end=index_range[1], + ) + for index_range in index_ranges + ], + ) + + +@fixture +def wholly_included_expand_chunk_input( + in_memory_retriever_documents: Sequence[Document], +) -> ExpandChunkInput[int]: + assert len(in_memory_retriever_documents) == 1 + start_index, end_index = ( + int(len(in_memory_retriever_documents[0].text) * 0.5), + int(len(in_memory_retriever_documents[0].text) * 0.55), + ) + + return build_expand_chunk_input( + in_memory_retriever_documents[0], [(start_index, end_index)] + ) + + +@fixture +def overlapping_expand_chunk_input( + in_memory_retriever_documents: Sequence[Document], +) -> ExpandChunkInput[int]: + assert len(in_memory_retriever_documents) == 1 + start_index, end_index = ( + int(len(in_memory_retriever_documents[0].text) * 0.2), + int(len(in_memory_retriever_documents[0].text) * 0.8), + ) + + return build_expand_chunk_input( + in_memory_retriever_documents[0], [(start_index, end_index)] + ) + + +@fixture +def multiple_chunks_expand_chunk_input( + in_memory_retriever_documents: Sequence[Document], +) -> ExpandChunkInput[int]: + assert len(in_memory_retriever_documents) == 1 + start_index_1, end_index_1 = ( + int(len(in_memory_retriever_documents[0].text) * 0.3), + int(len(in_memory_retriever_documents[0].text) * 0.4), + ) + start_index_2, end_index_2 = ( + int(len(in_memory_retriever_documents[0].text) * 0.45), + int(len(in_memory_retriever_documents[0].text) * 0.6), + ) + + return build_expand_chunk_input( + in_memory_retriever_documents[0], + [(start_index_1, end_index_1), (start_index_2, end_index_2)], + ) + + +def test_expand_chunk_works_for_wholly_included_chunk( + asymmetric_in_memory_retriever: QdrantInMemoryRetriever, + luminous_control_model: LuminousControlModel, + wholly_included_expand_chunk_input: ExpandChunkInput[int], + no_op_tracer: NoOpTracer, +) -> None: + expand_chunk_task = ExpandChunks( + asymmetric_in_memory_retriever, luminous_control_model, 256 + ) + expand_chunk_output = expand_chunk_task.run( + wholly_included_expand_chunk_input, no_op_tracer + ) + + assert ( + len(expand_chunk_output.chunks) + == 1 + == len(wholly_included_expand_chunk_input.chunks_found) + ) + assert ( + wholly_included_expand_chunk_input.chunks_found[0].text + in expand_chunk_output.chunks[0] + ) + + +def test_expand_chunk_works_for_overlapping_chunk( + asymmetric_in_memory_retriever: QdrantInMemoryRetriever, + luminous_control_model: LuminousControlModel, + overlapping_expand_chunk_input: ExpandChunkInput[int], + no_op_tracer: NoOpTracer, +) -> None: + expand_chunk_task = ExpandChunks( + asymmetric_in_memory_retriever, luminous_control_model, 256 + ) + expand_chunk_output = expand_chunk_task.run( + overlapping_expand_chunk_input, no_op_tracer + ) + + assert len(expand_chunk_output.chunks) == 4 + + +def test_expand_chunk_works_for_multiple_chunks( + asymmetric_in_memory_retriever: QdrantInMemoryRetriever, + luminous_control_model: LuminousControlModel, + multiple_chunks_expand_chunk_input: ExpandChunkInput[int], + no_op_tracer: NoOpTracer, +) -> None: + expand_chunk_task = ExpandChunks( + asymmetric_in_memory_retriever, luminous_control_model, 256 + ) + 
expand_chunk_output = expand_chunk_task.run( + multiple_chunks_expand_chunk_input, no_op_tracer + ) + + assert len(expand_chunk_output.chunks) == 3 + + combined_chunks = "\n\n".join(expand_chunk_output.chunks) + for chunk_found in multiple_chunks_expand_chunk_input.chunks_found: + assert chunk_found.text in combined_chunks From f53aec329833edb2621fa4f52fd62ba77c6a30ad Mon Sep 17 00:00:00 2001 From: "niklas.finken" Date: Wed, 17 Apr 2024 12:14:57 +0200 Subject: [PATCH 3/6] `ExpandChunks` task and use in `MultipleChunkRetrieverQa` --- src/intelligence_layer/use_cases/__init__.py | 6 +- .../qa/multiple_chunk_retriever_qa.py | 65 +++++++++++++++---- .../{expand_chunk.py => expand_chunks.py} | 12 ++-- .../qa/test_multiple_chunk_retriever_qa.py | 4 +- tests/use_cases/search/test_expand_chunk.py | 18 ++--- 5 files changed, 71 insertions(+), 34 deletions(-) rename src/intelligence_layer/use_cases/search/{expand_chunk.py => expand_chunks.py} (88%) diff --git a/src/intelligence_layer/use_cases/__init__.py b/src/intelligence_layer/use_cases/__init__.py index a97a5d630..cc8c06709 100644 --- a/src/intelligence_layer/use_cases/__init__.py +++ b/src/intelligence_layer/use_cases/__init__.py @@ -62,9 +62,9 @@ from .qa.single_chunk_qa import SingleChunkQa as SingleChunkQa from .qa.single_chunk_qa import SingleChunkQaInput as SingleChunkQaInput from .qa.single_chunk_qa import SingleChunkQaOutput as SingleChunkQaOutput -from .search.expand_chunk import ExpandChunkInput as ExpandChunkInput -from .search.expand_chunk import ExpandChunkOutput as ExpandChunkOutput -from .search.expand_chunk import ExpandChunks as ExpandChunks +from .search.expand_chunks import ExpandChunks as ExpandChunks +from .search.expand_chunks import ExpandChunksInput as ExpandChunksInput +from .search.expand_chunks import ExpandChunksOutput as ExpandChunksOutput from .search.search import AggregatedSearchEvaluation as AggregatedSearchEvaluation from .search.search import ChunkFound as ChunkFound from .search.search import ExpectedSearchOutput as ExpectedSearchOutput diff --git a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py index 45bcfc183..6c591c7a8 100644 --- a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py +++ b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py @@ -1,3 +1,4 @@ +from collections import defaultdict from typing import Generic, Optional, Sequence from pydantic import BaseModel @@ -8,9 +9,14 @@ SearchResult, ) from intelligence_layer.core.chunk import TextChunk +from intelligence_layer.core.model import ControlModel, LuminousControlModel from intelligence_layer.core.task import Task from intelligence_layer.core.text_highlight import ScoredTextHighlight from intelligence_layer.core.tracer.tracer import TaskSpan +from intelligence_layer.use_cases.search.expand_chunks import ( + ExpandChunks, + ExpandChunksInput, +) from intelligence_layer.use_cases.search.search import Search, SearchInput from .retriever_based_qa import RetrieverBasedQaInput @@ -18,13 +24,15 @@ class AnswerSource(BaseModel, Generic[ID]): - search_result: SearchResult[ID] + document_id: ID + chunk: TextChunk highlights: Sequence[ScoredTextHighlight] class MultipleChunkRetrieverQaOutput(BaseModel, Generic[ID]): answer: Optional[str] sources: Sequence[AnswerSource[ID]] + search_results: Sequence[SearchResult[ID]] class MultipleChunkRetrieverQa( @@ -66,13 +74,17 @@ class MultipleChunkRetrieverQa( def __init__( self, retriever: 
BaseRetriever[ID], - k: int = 5, + model: ControlModel | None = None, + insert_chunk_number: int = 5, + insert_chunk_size: int = 256, single_chunk_qa: Task[SingleChunkQaInput, SingleChunkQaOutput] | None = None, ): super().__init__() + self._model = model or LuminousControlModel("luminous-supreme-control") self._search = Search(retriever) - self._k = k - self._single_chunk_qa = single_chunk_qa or SingleChunkQa() + self._expand_chunks = ExpandChunks(retriever, self._model, insert_chunk_size) + self._single_chunk_qa = single_chunk_qa or SingleChunkQa(self._model) + self._insert_chunk_number = insert_chunk_number @staticmethod def _combine_input_texts(chunks: Sequence[str]) -> tuple[TextChunk, Sequence[int]]: @@ -111,23 +123,46 @@ def _get_highlights_per_chunk( overlapping_ranges.append(current_overlaps) return overlapping_ranges + def _expand_search_result_chunks( + self, search_results: Sequence[SearchResult[ID]], task_span: TaskSpan + ) -> Sequence[tuple[ID, TextChunk]]: + grouped_results: dict[ID, list[SearchResult[ID]]] = defaultdict(list) + for result in search_results: + grouped_results[result.id].append(result) + + chunks_to_insert: list[tuple[ID, TextChunk]] = [] + for id, results in grouped_results.items(): + input = ExpandChunksInput( + document_id=id, chunks_found=[r.document_chunk for r in results] + ) + expand_chunks_output = self._expand_chunks.run(input, task_span) + for chunk in expand_chunks_output.chunks: + if len(chunks_to_insert) >= self._insert_chunk_number: + break + chunks_to_insert.append((id, chunk)) + + return chunks_to_insert + def do_run( self, input: RetrieverBasedQaInput, task_span: TaskSpan ) -> MultipleChunkRetrieverQaOutput[ID]: search_output = self._search.run( SearchInput(query=input.question), task_span ).results - sorted_search_output = sorted( - search_output, - key=lambda output: output.score, # not reversing on purpose because model performs better if relevant info is at the end - )[-self._k :] + sorted_search_results = sorted( + search_output, key=lambda output: output.score, reverse=True + ) + + chunks_to_insert = self._expand_search_result_chunks( + sorted_search_results, task_span + ) - chunk, chunk_start_indices = self._combine_input_texts( - [output.document_chunk.text for output in sorted_search_output] + chunk_for_prompt, chunk_start_indices = self._combine_input_texts( + [c[1] for c in chunks_to_insert] ) single_chunk_qa_input = SingleChunkQaInput( - chunk=chunk, + chunk=chunk_for_prompt, question=input.question, language=input.language, ) @@ -144,9 +179,13 @@ def do_run( answer=single_chunk_qa_output.answer, sources=[ AnswerSource( - search_result=chunk, + document_id=id_and_chunk[0], + chunk=id_and_chunk[1], highlights=highlights, ) - for chunk, highlights in zip(sorted_search_output, highlights_per_chunk) + for id_and_chunk, highlights in zip( + chunks_to_insert, highlights_per_chunk + ) ], + search_results=sorted_search_results, ) diff --git a/src/intelligence_layer/use_cases/search/expand_chunk.py b/src/intelligence_layer/use_cases/search/expand_chunks.py similarity index 88% rename from src/intelligence_layer/use_cases/search/expand_chunk.py rename to src/intelligence_layer/use_cases/search/expand_chunks.py index 4d24147ac..f13a2e784 100644 --- a/src/intelligence_layer/use_cases/search/expand_chunk.py +++ b/src/intelligence_layer/use_cases/search/expand_chunks.py @@ -10,16 +10,16 @@ from intelligence_layer.core.tracer.tracer import TaskSpan -class ExpandChunkInput(BaseModel, Generic[ID]): +class ExpandChunksInput(BaseModel, Generic[ID]): 
document_id: ID chunks_found: Sequence[DocumentChunk] -class ExpandChunkOutput(BaseModel): +class ExpandChunksOutput(BaseModel): chunks: Sequence[TextChunk] -class ExpandChunks(Generic[ID], Task[ExpandChunkInput[ID], ExpandChunkOutput]): +class ExpandChunks(Generic[ID], Task[ExpandChunksInput[ID], ExpandChunksOutput]): def __init__( self, retriever: BaseRetriever[ID], @@ -31,8 +31,8 @@ def __init__( self._chunk_with_indices = ChunkWithIndices(model, max_chunk_size) def do_run( - self, input: ExpandChunkInput[ID], task_span: TaskSpan - ) -> ExpandChunkOutput: + self, input: ExpandChunksInput[ID], task_span: TaskSpan + ) -> ExpandChunksOutput: full_doc = self._retriever.get_full_document(input.document_id) if not full_doc: raise RuntimeError(f"No document for id '{input.document_id}' found") @@ -46,7 +46,7 @@ def do_run( [(chunk.start, chunk.end) for chunk in input.chunks_found], ) - return ExpandChunkOutput( + return ExpandChunksOutput( chunks=[ chunk_with_indices[index].chunk for index in overlapping_chunk_indices ] diff --git a/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py b/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py index 45b285fbc..c905e33ff 100644 --- a/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py +++ b/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py @@ -1,8 +1,6 @@ from pytest import fixture -from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import ( - QdrantInMemoryRetriever, -) +from intelligence_layer.connectors import QdrantInMemoryRetriever from intelligence_layer.core import NoOpTracer from intelligence_layer.use_cases import MultipleChunkRetrieverQa, RetrieverBasedQaInput diff --git a/tests/use_cases/search/test_expand_chunk.py b/tests/use_cases/search/test_expand_chunk.py index 474037a81..a6a9c5fc9 100644 --- a/tests/use_cases/search/test_expand_chunk.py +++ b/tests/use_cases/search/test_expand_chunk.py @@ -8,7 +8,7 @@ QdrantInMemoryRetriever, ) from intelligence_layer.core import LuminousControlModel, NoOpTracer -from intelligence_layer.use_cases import ExpandChunkInput, ExpandChunks +from intelligence_layer.use_cases import ExpandChunks, ExpandChunksInput @fixture @@ -46,8 +46,8 @@ def in_memory_retriever_documents() -> Sequence[Document]: def build_expand_chunk_input( document: Document, index_ranges: Sequence[tuple[int, int]] -) -> ExpandChunkInput[int]: - return ExpandChunkInput( +) -> ExpandChunksInput[int]: + return ExpandChunksInput( document_id=0, chunks_found=[ DocumentChunk( @@ -63,7 +63,7 @@ def build_expand_chunk_input( @fixture def wholly_included_expand_chunk_input( in_memory_retriever_documents: Sequence[Document], -) -> ExpandChunkInput[int]: +) -> ExpandChunksInput[int]: assert len(in_memory_retriever_documents) == 1 start_index, end_index = ( int(len(in_memory_retriever_documents[0].text) * 0.5), @@ -78,7 +78,7 @@ def wholly_included_expand_chunk_input( @fixture def overlapping_expand_chunk_input( in_memory_retriever_documents: Sequence[Document], -) -> ExpandChunkInput[int]: +) -> ExpandChunksInput[int]: assert len(in_memory_retriever_documents) == 1 start_index, end_index = ( int(len(in_memory_retriever_documents[0].text) * 0.2), @@ -93,7 +93,7 @@ def overlapping_expand_chunk_input( @fixture def multiple_chunks_expand_chunk_input( in_memory_retriever_documents: Sequence[Document], -) -> ExpandChunkInput[int]: +) -> ExpandChunksInput[int]: assert len(in_memory_retriever_documents) == 1 start_index_1, end_index_1 = ( int(len(in_memory_retriever_documents[0].text) * 0.3), @@ -113,7 +113,7 @@ def 
multiple_chunks_expand_chunk_input(
 def test_expand_chunk_works_for_wholly_included_chunk(
     asymmetric_in_memory_retriever: QdrantInMemoryRetriever,
     luminous_control_model: LuminousControlModel,
-    wholly_included_expand_chunk_input: ExpandChunkInput[int],
+    wholly_included_expand_chunk_input: ExpandChunksInput[int],
     no_op_tracer: NoOpTracer,
 ) -> None:
     expand_chunk_task = ExpandChunks(
@@ -137,7 +137,7 @@ def test_expand_chunk_works_for_wholly_included_chunk(
 def test_expand_chunk_works_for_overlapping_chunk(
     asymmetric_in_memory_retriever: QdrantInMemoryRetriever,
     luminous_control_model: LuminousControlModel,
-    overlapping_expand_chunk_input: ExpandChunkInput[int],
+    overlapping_expand_chunk_input: ExpandChunksInput[int],
     no_op_tracer: NoOpTracer,
 ) -> None:
     expand_chunk_task = ExpandChunks(
@@ -153,7 +153,7 @@ def test_expand_chunk_works_for_overlapping_chunk(
 def test_expand_chunk_works_for_multiple_chunks(
     asymmetric_in_memory_retriever: QdrantInMemoryRetriever,
     luminous_control_model: LuminousControlModel,
-    multiple_chunks_expand_chunk_input: ExpandChunkInput[int],
+    multiple_chunks_expand_chunk_input: ExpandChunksInput[int],
     no_op_tracer: NoOpTracer,
 ) -> None:
     expand_chunk_task = ExpandChunks(

From ca7a9d93403346e6ff3461f8e1ddc8357942f616 Mon Sep 17 00:00:00 2001
From: "niklas.finken"
Date: Wed, 17 Apr 2024 12:19:42 +0200
Subject: [PATCH 4/6] Adjust CHANGELOG

---
 CHANGELOG.md                                              | 4 ++--
 src/intelligence_layer/use_cases/search/expand_chunks.py  | 8 ++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7cbeae410..1b7e29ae5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,10 +3,10 @@
 ## Unreleased
 
 ### Breaking Changes
-...
+- breaking change: `MultipleChunkRetrieverQaOutput` now returns `sources` and `search_results`
 
 ### New Features
-...
+- feature: `ExpandChunks` task takes a retriever and a list of search results and expands the chunks found to the desired length
 
 ### Fixes
 ...
diff --git a/src/intelligence_layer/use_cases/search/expand_chunks.py b/src/intelligence_layer/use_cases/search/expand_chunks.py
index f13a2e784..edbace1ef 100644
--- a/src/intelligence_layer/use_cases/search/expand_chunks.py
+++ b/src/intelligence_layer/use_cases/search/expand_chunks.py
@@ -20,6 +20,14 @@ class ExpandChunksOutput(BaseModel):
 
 
 class ExpandChunks(Generic[ID], Task[ExpandChunksInput[ID], ExpandChunksOutput]):
+    """Expand chunks found during search.
+
+    Args:
+        retriever: Used to fetch the full document behind each found chunk.
+        model: The model whose tokenizer is used to calculate the size of the returned chunks.
+        max_chunk_size: The maximum size (in tokens) of each returned chunk.
+ """ + def __init__( self, retriever: BaseRetriever[ID], From 492c391f6fcbec549b95926b1f72912d156355f1 Mon Sep 17 00:00:00 2001 From: "niklas.finken" Date: Wed, 17 Apr 2024 12:22:11 +0200 Subject: [PATCH 5/6] remove "k" from `MultipleChunkRetrieverQa` --- .../use_cases/qa/multiple_chunk_retriever_qa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py index 6c591c7a8..874ac8cdf 100644 --- a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py +++ b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py @@ -65,7 +65,7 @@ class MultipleChunkRetrieverQa( >>> token = os.getenv("AA_TOKEN") >>> document_index = DocumentIndexClient(token) >>> retriever = DocumentIndexRetriever(document_index, "aleph-alpha", "wikipedia-de", 3) - >>> task = MultipleChunkRetrieverQa(retriever, k=2) + >>> task = MultipleChunkRetrieverQa(retriever) >>> input_data = RetrieverBasedQaInput(question="When was Rome founded?") >>> tracer = InMemoryTracer() >>> output = task.run(input_data, tracer) From 95904bdd9fd8328e2ab771df3e13705f2ddccd1e Mon Sep 17 00:00:00 2001 From: "niklas.finken" Date: Wed, 17 Apr 2024 12:27:33 +0200 Subject: [PATCH 6/6] fix pipeline --- .../use_cases/qa/multiple_chunk_retriever_qa.py | 16 ---------------- .../qa/test_multiple_chunk_retriever_qa.py | 2 +- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py index 874ac8cdf..229af5f3a 100644 --- a/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py +++ b/src/intelligence_layer/use_cases/qa/multiple_chunk_retriever_qa.py @@ -53,22 +53,6 @@ class MultipleChunkRetrieverQa( k: number of top chunk search results to inject into :class:`SingleChunkQa`-task. qa_task: The task that is used to generate an answer for a single chunk (retrieved through the retriever). Defaults to :class:`SingleChunkQa`. - - Example: - >>> import os - >>> from intelligence_layer.connectors import DocumentIndexClient - >>> from intelligence_layer.connectors import DocumentIndexRetriever - >>> from intelligence_layer.core import InMemoryTracer - >>> from intelligence_layer.use_cases import MultipleChunkRetrieverQa, RetrieverBasedQaInput - - - >>> token = os.getenv("AA_TOKEN") - >>> document_index = DocumentIndexClient(token) - >>> retriever = DocumentIndexRetriever(document_index, "aleph-alpha", "wikipedia-de", 3) - >>> task = MultipleChunkRetrieverQa(retriever) - >>> input_data = RetrieverBasedQaInput(question="When was Rome founded?") - >>> tracer = InMemoryTracer() - >>> output = task.run(input_data, tracer) """ def __init__( diff --git a/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py b/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py index c905e33ff..ed8003744 100644 --- a/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py +++ b/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py @@ -21,4 +21,4 @@ def test_retriever_based_qa_using_in_memory_retriever( output = multiple_chunk_retriever_qa.run(input, no_op_tracer) assert output.answer assert "1888" in output.answer - assert len(output.sources) == 2 + assert len(output.sources) == 5
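
A minimal usage sketch of the two tasks this series introduces, mirroring the test
fixtures above. It assumes `retriever` is an already-constructed `BaseRetriever`
that stores one document under id `0` (for example, the `asymmetric_in_memory_retriever`
fixture used in the tests) and that `document_text` is a stand-in for real content;
all other names are taken from the diffs in this series.

from intelligence_layer.connectors import DocumentChunk
from intelligence_layer.core import (
    ChunkInput,
    ChunkWithIndices,
    LuminousControlModel,
    NoOpTracer,
)
from intelligence_layer.use_cases import ExpandChunks, ExpandChunksInput

document_text = "Knights, dragons and one long campaign. " * 40  # stand-in document
model = LuminousControlModel("luminous-supreme-control")
tracer = NoOpTracer()

# Chunk the text while keeping each chunk's character offset into the source.
chunking = ChunkWithIndices(model, max_tokens_per_chunk=128)
chunked = chunking.run(ChunkInput(text=document_text), tracer)
for c in chunked.chunks_with_indices:
    print(c.start_index, c.chunk)

# Expand a chunk found during search back to windows of up to 256 tokens.
# Assumes `retriever` holds `document_text` under document id 0.
expand = ExpandChunks(retriever, model, max_chunk_size=256)
expanded = expand.run(
    ExpandChunksInput(
        document_id=0,  # the ID type is generic over the retriever
        chunks_found=[DocumentChunk(text=document_text[100:200], start=100, end=200)],
    ),
    tracer,
)
print(expanded.chunks)  # the chunks that overlap the character range [100, 200)

Because `ExpandChunks` re-chunks the full document on the fly via
`BaseRetriever.get_full_document`, no chunk offsets have to be stored at indexing
time; any retriever that can return the whole document works unchanged.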