Skip to content

Commit

Permalink
Merge pull request #5468 from pathwaycom/berke/pathway-sheets-app
Browse files Browse the repository at this point in the history
Berke/pathway sheets app

GitOrigin-RevId: b8933ade9adfc31b295b39795813a9d0de2e3c5e
  • Loading branch information
berkecanrizai authored and Manul from Pathway committed Feb 5, 2024
1 parent adb2d69 commit ccee4e7
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 0 deletions.
1 change: 1 addition & 0 deletions python/pathway/xpacks/llm/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ async def __wrapped__(self, messages: list[dict] | pw.Json, **kwargs) -> str | N

kwargs = {**self.kwargs, **kwargs}
api_key = kwargs.pop("api_key", None)

client = openai_mod.AsyncOpenAI(api_key=api_key)
ret = await client.chat.completions.create(messages=messages_decoded, **kwargs)
return ret.choices[0].message.content
Expand Down
1 change: 1 addition & 0 deletions python/pathway/xpacks/llm/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __init__(
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)

self.kwargs = dict(
mode=mode,
post_processors=post_processors or [],
Expand Down
143 changes: 143 additions & 0 deletions python/pathway/xpacks/llm/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import re

import pathway as pw


@pw.udf
def prompt_citing_qa(query: str, docs: list[pw.Json]):
context_pieces = []

for i, doc in enumerate(docs, 1):
context_pieces.append(f"# Source {i}/n/n")
context_pieces.append(doc["text"]) # type: ignore
context_pieces.append("")
context_str = "\n".join(context_pieces)
prompt = (
"Please provide an answer based solely on the provided sources. "
"When referencing information from a source, "
"cite the appropriate source(s) using their corresponding numbers. "
"Every answer should include at least one source citation. "
"Only cite a source when you are explicitly referencing it. "
"If exists, mention specific article/section header you use at the beginning of answer, such as '4.a Client has rights to...'" # noqa: E501
"Article headers may or may not be in docs, dont mention it if there is none."
# "If none of the sources are helpful, you should indicate that. "
# "For example:\n"
# "# Source 1:\n"
# "4.a The sky is red in the evening and blue in the morning.\n"
# "# Source 2:\n"
# "5.c Water is wet when the sky is red.\n"
# "Query: When is water wet?\n"
# "Answer: *5.c* Water will be wet when the sky is red [2], "
# "which occurs in the evening [1].\n"
# "If several citations are used, separate them with comma such as, '*5.c,4.a*'\n"
"If question cannot be inferred from documents SAY `No information found`"
"Now it's your turn. Below are several numbered sources of information:"
"\n------\n"
f"{context_str}"
"\n------\n"
f"Query: {query}\n"
"Answer: "
)
return prompt


@pw.udf
def prompt_short_qa(query: str, docs: list[pw.Json]):
context_pieces = []

for i, doc in enumerate(docs, 1):
context_pieces.append(doc["text"])
context_pieces.append("") # type: ignore
context_str = "\n".join(context_pieces) # type: ignore
prompt = (
"Please provide an answer based solely on the provided sources. "
"Keep your answer concise and accurate. Make sure that it starts with an expression in standardized format."
"Only respond without any explanation, for example questions asking for date should be answered in strictly date format: `05 January 2011`" # noqa: E501
"Yes or No questions should be responded with simple `Yes` or `No` and so on."
"If question cannot be inferred from documents SAY `No information found`"
"Now it's your turn. Below are several sources of information:"
"\n------\n"
f"{context_str}"
"\n------\n"
f"Query: {query}\n"
"Answer: "
)
return prompt


@pw.udf
def prompt_summarize(text_list: list[str]):
text = "\n".join(text_list)
prompt = f"""Given a list of documents, summarize them in few sentences \
while preserving important points and entities.
Documents: {text}
Summary:"""

return prompt


@pw.udf
def prompt_query_rewrite_hyde(query: str) -> str:
prompt = f"""Write 4 responses to answer the given question with hypothetical data.
Try to include as many key details as possible.
Question: `{query}`.
Responses:"""
return prompt


@pw.udf
def prompt_query_rewrite(query: str, *additional_args: str) -> str:
prompt = f"""Given a question that will be used to retrieve similar documents for RAG application,.
Rewrite question to be better usable in retrieval search.
Your response should be three queries based on the question provided, separated by comma.
Question: `{query}`
"""

if additional_args:
prompt += """If any of the provided sections are related to question, write section name in the query as well.
Here is additional info that you can include in search: """
for arg in additional_args:
prompt += f" `{arg}`\n"

prompt += "Rewritten query:"
return prompt


@pw.udf
def parse_cited_response(response_text, docs):
cited_docs = [
int(cite[1:-1]) - 1
for cite in set(re.findall("\[\d+\]", response_text)) # noqa: W605
]
start_index = response_text.find("*") + 1
end_index = response_text.find("*", start_index)

citations = [docs[i] for i in cited_docs if i in cited_docs]
cleaned_citations = []

if (
start_index != -1 and end_index != -1
): # doing this for the GIF, we need a better way to do this, TODO: redo
cited = response_text[start_index:end_index]
response_text = response_text[end_index:].strip()
cited = (
cited.replace(" ", "")
.replace(",,", ",")
.replace(",", ",\n")
.replace(" ", "\n")
)

text_body = citations[0]["text"]
new_text = f"<b>{cited}</b>\n\n".replace("\n\n\n", "\n") + text_body

citations[0]["text"] = new_text

cleaned_citations.append(citations[0])

if len(citations) > 1:
for doc in citations[1:]:
text_body = doc["text"] # TODO: unformat and clean the text
doc["text"] = text_body
cleaned_citations.append(doc)

return response_text, cleaned_citations

0 comments on commit ccee4e7

Please sign in to comment.