diff --git a/.semversioner/next-release/patch-20240710045321911472.json b/.semversioner/next-release/patch-20240710045321911472.json new file mode 100644 index 0000000000..d0b1295a8f --- /dev/null +++ b/.semversioner/next-release/patch-20240710045321911472.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "update map prompt of global query" +} diff --git a/graphrag/query/structured_search/global_search/map_system_prompt.py b/graphrag/query/structured_search/global_search/map_system_prompt.py index db1a649df3..0002b340f3 100644 --- a/graphrag/query/structured_search/global_search/map_system_prompt.py +++ b/graphrag/query/structured_search/global_search/map_system_prompt.py @@ -4,15 +4,12 @@ """System prompts for global search.""" MAP_SYSTEM_PROMPT = """ ----Role--- - You are a helpful assistant responding to questions about data in the tables provided. +""" - ----Goal--- - +MAP_USER_PROMPT = """ +=============== Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables. - You should use the data provided in the data tables below as the primary context for generating the response. If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up. @@ -20,7 +17,7 @@ - Description: A comprehensive description of the point. - Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0. -The response should be JSON formatted as follows: +The response MUST be JSON formatted as follows: {{ "points": [ {{"description": "Description of point 1 [Data: Reports (report ids)]", "score": score_value}}, @@ -28,35 +25,6 @@ ] }} -The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". 
- -Points supported by data should list the relevant reports as references as follows: -"This is an example sentence supported by data references [Data: Reports (report ids)]" - -**Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. - -For example: -"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 64, 46, 34, +more)]. He is also CEO of company X [Data: Reports (1, 3)]" - -where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data report in the provided tables. - -Do not include information where the supporting evidence for it is not provided. - - ----Data tables--- - -{context_data} - ----Goal--- - -Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables. - -You should use the data provided in the data tables below as the primary context for generating the response. -If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up. - -Each key point in the response should have the following element: -- Description: A comprehensive description of the point. -- Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0. The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will". @@ -65,18 +33,31 @@ **Do not list more than 5 record ids in a single reference**. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. +=============== For example: -"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (2, 7, 64, 46, 34, +more)]. 
He is also CEO of company X [Data: Reports (1, 3)]" - -where 1, 2, 3, 7, 34, 46, and 64 represent the id (not the index) of the relevant data report in the provided tables. - -Do not include information where the supporting evidence for it is not provided. - -The response should be JSON formatted as follows: +user question: Is Person X currently under investigation for alleged illegal activities or unethical behavior? +---Data tables--- +| id | title | occurrence weight | content | rank | +|----|--------------------------------------|-------------------|---------|------| +| 1 | Allegations against Person X | 1 | Allegations of financial misconduct | 4.0 | +| 2 | Allegations against Person X | 0.3 | Allegations of unethical business practices | 4.0 | +| 3 | Allegations against Person X | 0.8 | Allegations of workplace harassment | 3.0 | +| 4 | CEO of company X | 1 | Person X is CEO of Company X | 3.0 | +| 5 | owner of company Y | 1 | Person X is the owner of Company Y | 3.0 | + +answer: {{ "points": [ - {{"description": "Description of point 1 [Data: Reports (report ids)]", "score": score_value}}, - {{"description": "Description of point 2 [Data: Reports (report ids)]", "score": score_value}} + {{"description": "Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Reports (1, 2, 3, 5)].", "score": 85}}, + {{"description": "He is also CEO of company X [Data: Reports (4)]", "score": 75}} ] }} + +=============== +user question: {user_question} + +---Data tables--- +{context_data} + +answer: """ diff --git a/graphrag/query/structured_search/global_search/search.py b/graphrag/query/structured_search/global_search/search.py index 3b52ecbd8c..c65ee577a5 100644 --- a/graphrag/query/structured_search/global_search/search.py +++ b/graphrag/query/structured_search/global_search/search.py @@ -25,7 +25,7 @@ GlobalSearchLLMCallback, ) from graphrag.query.structured_search.global_search.map_system_prompt import ( - MAP_SYSTEM_PROMPT, + 
MAP_SYSTEM_PROMPT, MAP_USER_PROMPT, ) from graphrag.query.structured_search.global_search.reduce_system_prompt import ( GENERAL_KNOWLEDGE_INSTRUCTION, @@ -64,6 +64,7 @@ def __init__( context_builder: GlobalContextBuilder, token_encoder: tiktoken.Encoding | None = None, map_system_prompt: str = MAP_SYSTEM_PROMPT, + map_user_prompt: str = MAP_USER_PROMPT, reduce_system_prompt: str = REDUCE_SYSTEM_PROMPT, response_type: str = "multiple paragraphs", allow_general_knowledge: bool = False, @@ -83,6 +84,7 @@ def __init__( context_builder_params=context_builder_params, ) self.map_system_prompt = map_system_prompt + self.map_user_prompt = map_user_prompt self.reduce_system_prompt = reduce_system_prompt self.response_type = response_type self.allow_general_knowledge = allow_general_knowledge @@ -173,10 +175,10 @@ async def _map_response_single_batch( start_time = time.time() search_prompt = "" try: - search_prompt = self.map_system_prompt.format(context_data=context_data) + search_prompt = self.map_user_prompt.format(context_data=context_data, user_question=query) search_messages = [ - {"role": "system", "content": search_prompt}, - {"role": "user", "content": query}, + {"role": "system", "content": self.map_system_prompt}, + {"role": "user", "content": search_prompt}, ] async with self.semaphore: search_response = await self.llm.agenerate(