Skip to content

Commit

Permalink
Repair json when LLM returns faulty responses on non json mode (#801)
Browse files Browse the repository at this point in the history
* fixed json issue

* change to use try_parse_json_object only

* pyproject add json-repair

* add check extra description before and after json object

* json.loads() before repair_json, based on jbradley1's suggestion.

* Fix json parsing and formatting

* semver

* Nicer tuple parsing

---------

Co-authored-by: paulg <[email protected]>
  • Loading branch information
AlonsoGuevara and s106916 authored Aug 2, 2024
1 parent 9020df1 commit 487cb96
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 135 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240802002107383210.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Fix json parsing when LLM returns faulty responses"
}
2 changes: 0 additions & 2 deletions graphrag/index/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from .dicts import dict_has_keys_with_types
from .hashing import gen_md5_hash
from .is_null import is_null
from .json import clean_up_json
from .load_graph import load_graph
from .string import clean_str
from .tokens import num_tokens_from_string, string_from_tokens
Expand All @@ -15,7 +14,6 @@

__all__ = [
"clean_str",
"clean_up_json",
"dict_has_keys_with_types",
"gen_md5_hash",
"gen_uuid",
Expand Down
27 changes: 0 additions & 27 deletions graphrag/index/utils/json.py

This file was deleted.

25 changes: 0 additions & 25 deletions graphrag/llm/openai/_json.py

This file was deleted.

3 changes: 2 additions & 1 deletion graphrag/llm/openai/json_parsing_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,6 @@ async def __call__(
"""Call the LLM with the input and kwargs."""
result = await self._delegate(input, **kwargs)
if kwargs.get("json") and result.json is None and result.output is not None:
result.json = try_parse_json_object(result.output)
_, parsed_json = try_parse_json_object(result.output)
result.json = parsed_json
return result
36 changes: 16 additions & 20 deletions graphrag/llm/openai/openai_chat_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"""The Chat-based language model."""

import logging
from json import JSONDecodeError

from typing_extensions import Unpack

Expand All @@ -16,7 +15,6 @@
LLMOutput,
)

from ._json import clean_up_json
from ._prompts import JSON_CHECK_PROMPT
from .openai_configuration import OpenAIConfiguration
from .types import OpenAIClientTypes
Expand Down Expand Up @@ -104,11 +102,10 @@ async def _native_json(
},
)

raw_output = result.output or ""
json_output = try_parse_json_object(raw_output)
output, json_output = try_parse_json_object(result.output or "")

return LLMOutput[CompletionOutput](
output=raw_output,
output=output,
json=json_output,
history=result.history,
)
Expand All @@ -119,24 +116,23 @@ async def _manual_json(
# Otherwise, clean up the output and try to parse it as json
result = await self._invoke(input, **kwargs)
history = result.history or []
output = clean_up_json(result.output or "")
try:
json_output = try_parse_json_object(output)
output, json_output = try_parse_json_object(result.output or "")
if json_output:
return LLMOutput[CompletionOutput](
output=output, json=json_output, history=history
output=result.output, json=json_output, history=history
)
except (TypeError, JSONDecodeError):
log.warning("error parsing llm json, retrying")
# If cleaned up json is unparsable, use the LLM to reformat it (may throw)
result = await self._try_clean_json_with_llm(output, **kwargs)
output = clean_up_json(result.output or "")
json = try_parse_json_object(output)
# if not return correct formatted json, retry
log.warning("error parsing llm json, retrying")

return LLMOutput[CompletionOutput](
output=output,
json=json,
history=history,
)
# If cleaned up json is unparsable, use the LLM to reformat it (may throw)
result = await self._try_clean_json_with_llm(output, **kwargs)
output, json_output = try_parse_json_object(result.output or "")

return LLMOutput[CompletionOutput](
output=output,
json=json_output,
history=history,
)

async def _try_clean_json_with_llm(
self, output: str, **kwargs: Unpack[LLMInput]
Expand Down
50 changes: 43 additions & 7 deletions graphrag/llm/openai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

import json
import logging
import re
from collections.abc import Callable
from typing import Any

import tiktoken
from json_repair import repair_json
from openai import (
APIConnectionError,
InternalServerError,
Expand Down Expand Up @@ -87,17 +89,51 @@ def get_completion_llm_args(
}


def try_parse_json_object(input: str) -> dict:
"""Generate JSON-string output using best-attempt prompting & parsing techniques."""
def try_parse_json_object(input: str) -> tuple[str, dict]:
"""JSON cleaning and formatting utilities."""
"""sometime, the llm return a json string with some extra description, this function will clean it up."""
_pattern = r"\{(.*)\}"
_match = re.search(_pattern, input)
input = "{" + _match.group(1) + "}" if _match else input

"""Clean up json string."""
input = (
input.replace("{{", "{")
.replace("}}", "}")
.replace('"[{', "[{")
.replace('}]"', "}]")
.replace("\\", " ")
.replace("\\n", " ")
.replace("\n", " ")
.replace("\r", "")
.strip()
)

# Remove JSON Markdown Frame
if input.startswith("```json"):
input = input[len("```json") :]
if input.endswith("```"):
input = input[: len(input) - len("```")]

try:
result = json.loads(input)
except json.JSONDecodeError:
log.exception("error loading json, json=%s", input)
raise
"""Fixup potentially malformed json string using json_repair."""
input = str(repair_json(json_str=input, return_objects=False))

"""Generate JSON-string output using best-attempt prompting & parsing techniques."""
try:
result = json.loads(input)
except json.JSONDecodeError:
log.exception("error loading json, json=%s", input)
return input, {}
else:
if not isinstance(result, dict):
log.exception("not expected dict type. type=%s:", type(result))
return input, {}
return input, result
else:
if not isinstance(result, dict):
raise TypeError
return result
return input, result


def get_sleep_time_from_error(e: Any) -> float:
Expand Down
7 changes: 5 additions & 2 deletions graphrag/query/structured_search/global_search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pandas as pd
import tiktoken

from graphrag.index.utils.json import clean_up_json
from graphrag.llm.openai.utils import try_parse_json_object
from graphrag.query.context_builder.builders import GlobalContextBuilder
from graphrag.query.context_builder.conversation_history import (
ConversationHistory,
Expand Down Expand Up @@ -188,7 +188,6 @@ async def _map_response_single_batch(
processed_response = self.parse_search_response(search_response)
except ValueError:
# Clean up and retry parse
search_response = clean_up_json(search_response)
try:
# parse search response json
processed_response = self.parse_search_response(search_response)
Expand Down Expand Up @@ -229,6 +228,10 @@ def parse_search_response(self, search_response: str) -> list[dict[str, Any]]:
list[dict[str, Any]]
A list of key points, each key point is a dictionary with "answer" and "score" keys
"""
search_response, _j = try_parse_json_object(search_response)
if _j == {}:
return [{"answer": "not avaliable", "score": 0}]

parsed_elements = json.loads(search_response)["points"]
return [
{
Expand Down
Loading

0 comments on commit 487cb96

Please sign in to comment.