-
Notifications
You must be signed in to change notification settings - Fork 139
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5468 from pathwaycom/berke/pathway-sheets-app
Berke/pathway sheets app GitOrigin-RevId: b8933ade9adfc31b295b39795813a9d0de2e3c5e
- Loading branch information
1 parent
adb2d69
commit ccee4e7
Showing
3 changed files
with
145 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
import re | ||
|
||
import pathway as pw | ||
|
||
|
||
@pw.udf
def prompt_citing_qa(query: str, docs: list[pw.Json]):
    """Build a question-answering prompt that asks the model to cite sources.

    Every document is rendered under a ``# Source <n>`` header, numbered from
    1, so the model can refer to it by number in its answer.

    Args:
        query: The question, placed at the end of the prompt.
        docs: The documents to answer from; each one is indexed with
            ``"text"`` to obtain its content.

    Returns:
        The complete prompt string, ending with ``"Answer: "``.
    """
    context_pieces = []

    for i, doc in enumerate(docs, 1):
        # Real newlines (not the characters "/n") separate header and text.
        context_pieces.append(f"# Source {i}\n\n")
        context_pieces.append(doc["text"])  # type: ignore
        context_pieces.append("")
    context_str = "\n".join(context_pieces)

    # Adjacent literals are concatenated verbatim, so every sentence carries
    # its own trailing separator.
    prompt = (
        "Please provide an answer based solely on the provided sources. "
        "When referencing information from a source, "
        "cite the appropriate source(s) using their corresponding numbers. "
        "Every answer should include at least one source citation. "
        "Only cite a source when you are explicitly referencing it. "
        "If exists, mention specific article/section header you use at the beginning of answer, such as '4.a Client has rights to...' "  # noqa: E501
        "Article headers may or may not be in docs, dont mention it if there is none. "
        # "If none of the sources are helpful, you should indicate that. "
        # "For example:\n"
        # "# Source 1:\n"
        # "4.a The sky is red in the evening and blue in the morning.\n"
        # "# Source 2:\n"
        # "5.c Water is wet when the sky is red.\n"
        # "Query: When is water wet?\n"
        # "Answer: *5.c* Water will be wet when the sky is red [2], "
        # "which occurs in the evening [1].\n"
        # "If several citations are used, separate them with comma such as, '*5.c,4.a*'\n"
        "If question cannot be inferred from documents SAY `No information found`. "
        "Now it's your turn. Below are several numbered sources of information:"
        "\n------\n"
        f"{context_str}"
        "\n------\n"
        f"Query: {query}\n"
        "Answer: "
    )
    return prompt
|
||
|
||
@pw.udf
def prompt_short_qa(query: str, docs: list[pw.Json]):
    """Build a prompt that asks for a short, strictly formatted answer.

    Unlike a citing prompt, the documents are not numbered: their texts are
    simply listed one after another, separated by blank lines.

    Args:
        query: The question, placed at the end of the prompt.
        docs: The documents to answer from; each one is indexed with
            ``"text"`` to obtain its content.

    Returns:
        The complete prompt string, ending with ``"Answer: "``.
    """
    context_pieces = []

    for doc in docs:
        context_pieces.append(doc["text"])  # type: ignore
        context_pieces.append("")
    context_str = "\n".join(context_pieces)

    # Adjacent literals are concatenated verbatim, so every sentence carries
    # its own trailing separator.
    prompt = (
        "Please provide an answer based solely on the provided sources. "
        "Keep your answer concise and accurate. Make sure that it starts with an expression in standardized format. "  # noqa: E501
        "Only respond without any explanation, for example questions asking for date should be answered in strictly date format: `05 January 2011`. "  # noqa: E501
        "Yes or No questions should be responded with simple `Yes` or `No` and so on. "
        "If question cannot be inferred from documents SAY `No information found`. "
        "Now it's your turn. Below are several sources of information:"
        "\n------\n"
        f"{context_str}"
        "\n------\n"
        f"Query: {query}\n"
        "Answer: "
    )
    return prompt
|
||
|
||
@pw.udf
def prompt_summarize(text_list: list[str]):
    """Build a prompt asking for a few-sentence summary of several texts.

    Args:
        text_list: The texts to summarize; they are joined with newlines.

    Returns:
        The prompt string, ending with ``"Summary:"``.
    """
    joined_documents = "\n".join(text_list)
    prompt_lines = (
        "Given a list of documents, summarize them in few sentences "
        "while preserving important points and entities.",
        f"Documents: {joined_documents}",
        "Summary:",
    )
    return "\n".join(prompt_lines)
|
||
|
||
@pw.udf
def prompt_query_rewrite_hyde(query: str) -> str:
    """Build a prompt asking for four hypothetical answers to ``query``.

    Args:
        query: The question, embedded in the prompt between backticks.

    Returns:
        The prompt string, ending with ``"Responses:"``.
    """
    prompt_lines = (
        "Write 4 responses to answer the given question with hypothetical data.",
        "Try to include as many key details as possible.",
        f"Question: `{query}`.",
        "Responses:",
    )
    return "\n".join(prompt_lines)
|
||
|
||
@pw.udf
def prompt_query_rewrite(query: str, *additional_args: str) -> str:
    """Build a prompt asking for three retrieval-friendly rewrites of ``query``.

    Args:
        query: The question to rewrite, embedded between backticks.
        *additional_args: Optional extra strings. When any are given, the
            prompt gains a paragraph listing each of them between backticks,
            one per line.

    Returns:
        The prompt string, ending with ``"Rewritten query:"``.
    """
    pieces = [
        "Given a question that will be used to retrieve similar documents for RAG application,.\n"  # noqa: E501
        "Rewrite question to be better usable in retrieval search.\n"
        "Your response should be three queries based on the question provided, separated by comma.\n"  # noqa: E501
        f"Question: `{query}`\n"
    ]

    if additional_args:
        pieces.append(
            "If any of the provided sections are related to question, write section name in the query as well.\n"  # noqa: E501
            "Here is additional info that you can include in search: "
        )
        pieces.extend(f" `{extra}`\n" for extra in additional_args)

    pieces.append("Rewritten query:")
    return "".join(pieces)
|
||
|
||
@pw.udf
def parse_cited_response(response_text, docs):
    """Split a model answer into its text and the documents it cites.

    Citations are ``[n]`` markers (1-based) anywhere in ``response_text``.
    If the answer also contains a ``*...*`` span (the article/section
    header), that span is removed from the front of the answer and prepended
    in bold to the first cited document's ``"text"``.

    Args:
        response_text: The raw answer produced by the model.
        docs: The documents the answer was generated from, in the same order
            in which they were numbered in the prompt.

    Returns:
        A ``(response_text, cited_docs)`` pair. ``cited_docs`` is empty when
        the answer has no ``*...*`` span or cites no valid document.
        Otherwise it lists the cited documents in order of first appearance,
        and the first one has its ``"text"`` updated in place.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # which document counts as "first" does not depend on set ordering.
    cited_indices = list(
        dict.fromkeys(
            int(marker[1:-1]) - 1
            for marker in re.findall(r"\[\d+\]", response_text)
        )
    )
    # Ignore numbers that do not match a document: "[0]" would otherwise wrap
    # around to docs[-1] and a too-large number would raise IndexError.
    citations = [docs[i] for i in cited_indices if 0 <= i < len(docs)]
    cleaned_citations = []

    start_index = response_text.find("*") + 1
    end_index = response_text.find("*", start_index)
    if end_index == -1:
        # Fewer than two "*" characters: there is no header span to extract.
        return response_text, cleaned_citations

    # doing this for the GIF, we need a better way to do this, TODO: redo
    cited = response_text[start_index:end_index]
    # NOTE(review): the slice starts at the closing "*", so that character
    # stays at the front of the returned text — confirm this is intended.
    response_text = response_text[end_index:].strip()

    if not citations:
        # A header without any usable citation has nothing to attach to.
        return response_text, cleaned_citations

    cited = (
        cited.replace(" ", "")
        .replace(",,", ",")
        .replace(",", ",\n")
        .replace(" ", "\n")
    )

    first_doc = citations[0]
    text_body = first_doc["text"]
    first_doc["text"] = f"<b>{cited}</b>\n\n".replace("\n\n\n", "\n") + text_body
    cleaned_citations.append(first_doc)

    # TODO: unformat and clean the text of the remaining documents
    cleaned_citations.extend(citations[1:])

    return response_text, cleaned_citations