Skip to content

Commit

Permalink
Repair json when LLM returns faulty responses on non json mode (#801)
Browse files Browse the repository at this point in the history
* fixed json issue

* change to use try_parse_json_object only

* pyproject add json-repair

* add check extra description before and after json object

* json.loads() before repair_json, based on jbradley1's suggestion.

* Fix json parsing and formatting

* semver

* Nicer tuple parsing

---------

Co-authored-by: paulg <[email protected]>
  • Loading branch information
AlonsoGuevara and s106916 authored Aug 2, 2024
1 parent 9020df1 commit 487cb96
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 135 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240802002107383210.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Fix json parsing when LLM returns faulty responses"
}
2 changes: 0 additions & 2 deletions graphrag/index/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from .dicts import dict_has_keys_with_types
from .hashing import gen_md5_hash
from .is_null import is_null
from .json import clean_up_json
from .load_graph import load_graph
from .string import clean_str
from .tokens import num_tokens_from_string, string_from_tokens
Expand All @@ -15,7 +14,6 @@

__all__ = [
"clean_str",
"clean_up_json",
"dict_has_keys_with_types",
"gen_md5_hash",
"gen_uuid",
Expand Down
27 changes: 0 additions & 27 deletions graphrag/index/utils/json.py

This file was deleted.

25 changes: 0 additions & 25 deletions graphrag/llm/openai/_json.py

This file was deleted.

3 changes: 2 additions & 1 deletion graphrag/llm/openai/json_parsing_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,6 @@ async def __call__(
"""Call the LLM with the input and kwargs."""
result = await self._delegate(input, **kwargs)
if kwargs.get("json") and result.json is None and result.output is not None:
result.json = try_parse_json_object(result.output)
_, parsed_json = try_parse_json_object(result.output)
result.json = parsed_json
return result
36 changes: 16 additions & 20 deletions graphrag/llm/openai/openai_chat_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"""The Chat-based language model."""

import logging
from json import JSONDecodeError

from typing_extensions import Unpack

Expand All @@ -16,7 +15,6 @@
LLMOutput,
)

from ._json import clean_up_json
from ._prompts import JSON_CHECK_PROMPT
from .openai_configuration import OpenAIConfiguration
from .types import OpenAIClientTypes
Expand Down Expand Up @@ -104,11 +102,10 @@ async def _native_json(
},
)

raw_output = result.output or ""
json_output = try_parse_json_object(raw_output)
output, json_output = try_parse_json_object(result.output or "")

return LLMOutput[CompletionOutput](
output=raw_output,
output=output,
json=json_output,
history=result.history,
)
Expand All @@ -119,24 +116,23 @@ async def _manual_json(
# Otherwise, clean up the output and try to parse it as json
result = await self._invoke(input, **kwargs)
history = result.history or []
output = clean_up_json(result.output or "")
try:
json_output = try_parse_json_object(output)
output, json_output = try_parse_json_object(result.output or "")
if json_output:
return LLMOutput[CompletionOutput](
output=output, json=json_output, history=history
output=result.output, json=json_output, history=history
)
except (TypeError, JSONDecodeError):
log.warning("error parsing llm json, retrying")
# If cleaned up json is unparsable, use the LLM to reformat it (may throw)
result = await self._try_clean_json_with_llm(output, **kwargs)
output = clean_up_json(result.output or "")
json = try_parse_json_object(output)
# if not return correct formatted json, retry
log.warning("error parsing llm json, retrying")

return LLMOutput[CompletionOutput](
output=output,
json=json,
history=history,
)
# If cleaned up json is unparsable, use the LLM to reformat it (may throw)
result = await self._try_clean_json_with_llm(output, **kwargs)
output, json_output = try_parse_json_object(result.output or "")

return LLMOutput[CompletionOutput](
output=output,
json=json_output,
history=history,
)

async def _try_clean_json_with_llm(
self, output: str, **kwargs: Unpack[LLMInput]
Expand Down
50 changes: 43 additions & 7 deletions graphrag/llm/openai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

import json
import logging
import re
from collections.abc import Callable
from typing import Any

import tiktoken
from json_repair import repair_json
from openai import (
APIConnectionError,
InternalServerError,
Expand Down Expand Up @@ -87,17 +89,51 @@ def get_completion_llm_args(
}


def try_parse_json_object(input: str) -> dict:
"""Generate JSON-string output using best-attempt prompting & parsing techniques."""
def try_parse_json_object(input: str) -> tuple[str, dict]:
"""JSON cleaning and formatting utilities."""
"""sometime, the llm return a json string with some extra description, this function will clean it up."""
_pattern = r"\{(.*)\}"
_match = re.search(_pattern, input)
input = "{" + _match.group(1) + "}" if _match else input

"""Clean up json string."""
input = (
input.replace("{{", "{")
.replace("}}", "}")
.replace('"[{', "[{")
.replace('}]"', "}]")
.replace("\\", " ")
.replace("\\n", " ")
.replace("\n", " ")
.replace("\r", "")
.strip()
)

# Remove JSON Markdown Frame
if input.startswith("```json"):
input = input[len("```json") :]
if input.endswith("```"):
input = input[: len(input) - len("```")]

try:
result = json.loads(input)
except json.JSONDecodeError:
log.exception("error loading json, json=%s", input)
raise
"""Fixup potentially malformed json string using json_repair."""
input = str(repair_json(json_str=input, return_objects=False))

"""Generate JSON-string output using best-attempt prompting & parsing techniques."""
try:
result = json.loads(input)
except json.JSONDecodeError:
log.exception("error loading json, json=%s", input)
return input, {}
else:
if not isinstance(result, dict):
log.exception("not expected dict type. type=%s:", type(result))
return input, {}
return input, result
else:
if not isinstance(result, dict):
raise TypeError
return result
return input, result


def get_sleep_time_from_error(e: Any) -> float:
Expand Down
7 changes: 5 additions & 2 deletions graphrag/query/structured_search/global_search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pandas as pd
import tiktoken

from graphrag.index.utils.json import clean_up_json
from graphrag.llm.openai.utils import try_parse_json_object
from graphrag.query.context_builder.builders import GlobalContextBuilder
from graphrag.query.context_builder.conversation_history import (
ConversationHistory,
Expand Down Expand Up @@ -188,7 +188,6 @@ async def _map_response_single_batch(
processed_response = self.parse_search_response(search_response)
except ValueError:
# Clean up and retry parse
search_response = clean_up_json(search_response)
try:
# parse search response json
processed_response = self.parse_search_response(search_response)
Expand Down Expand Up @@ -229,6 +228,10 @@ def parse_search_response(self, search_response: str) -> list[dict[str, Any]]:
list[dict[str, Any]]
A list of key points, each key point is a dictionary with "answer" and "score" keys
"""
search_response, _j = try_parse_json_object(search_response)
if _j == {}:
return [{"answer": "not avaliable", "score": 0}]

parsed_elements = json.loads(search_response)["points"]
return [
{
Expand Down
Loading

0 comments on commit 487cb96

Please sign in to comment.