From 7d80685d44d2c42d22ee2ee097fdf07571fff1c6 Mon Sep 17 00:00:00 2001
From: Felix Fehse <155464791+FelixFehse@users.noreply.github.com>
Date: Thu, 27 Jun 2024 17:51:15 +0200
Subject: [PATCH] fix: correct max highlight range (#936)

---
 CHANGELOG.md                                     |  1 +
 .../examples/qa/multiple_chunk_retriever_qa.py   | 16 ++++++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0c52c5b5c..6d634f373 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@
 - `PromptTemplate.to_rich_prompt` now always returns an empty list for prompt ranges that are empty.
 - `SingleChunkQa` no longer crashes if given an empty input and a specific prompt template. This did not affect users who used models provided in `core`.
 - Added default values for `labels` and `metadata` for `EvaluationOverview` and `RunOverview`
+- In `MultipleChunkRetrieverQa`, text-highlight start and end points are now restricted to the text length of their respective chunk.
 
 ### Deprecations 
 ...
diff --git a/src/intelligence_layer/examples/qa/multiple_chunk_retriever_qa.py b/src/intelligence_layer/examples/qa/multiple_chunk_retriever_qa.py
index 4d0ef25de..9b4ada941 100644
--- a/src/intelligence_layer/examples/qa/multiple_chunk_retriever_qa.py
+++ b/src/intelligence_layer/examples/qa/multiple_chunk_retriever_qa.py
@@ -156,10 +156,15 @@ def _combine_input_texts(
         start_indices: list[int] = []
         combined_text = ""
         for i, chunk in enumerate(chunks):
-            combined_text += source_appendix.format(i=i + 1)
             start_indices.append(len(combined_text))
-            combined_text += chunk + "\n\n"
-        return (TextChunk(combined_text.strip()), start_indices)
+
+            c = source_appendix.format(i=i + 1)
+            c += chunk + "\n\n"
+            c = c.strip()
+            if i != 0:
+                c = " " + c
+            combined_text += c
+        return (TextChunk(combined_text), start_indices)
 
     @staticmethod
     def _get_highlights_per_chunk(
@@ -182,7 +187,10 @@ def _get_highlights_per_chunk(
                         end=(
                             highlight.end - current_start
                             if isinstance(next_start, float)
-                            else min(next_start, highlight.end - current_start)
+                            else min(
+                                next_start - current_start,
+                                highlight.end - current_start,
+                            )
                         ),
                         score=highlight.score,
                     )