From 7e646e8e50a7a1219ac1a5d33b9ae09242b3fbd2 Mon Sep 17 00:00:00 2001
From: Ayyub Ibrahim
Date: Thu, 11 Jul 2024 21:21:43 -0500
Subject: [PATCH 1/3] updated num of docs pulled from faiss db

---
 .../googlecloud/functions/getanswer/inquirer.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/packages/googlecloud/functions/getanswer/inquirer.py b/packages/googlecloud/functions/getanswer/inquirer.py
index 2f8e51a..5e82465 100644
--- a/packages/googlecloud/functions/getanswer/inquirer.py
+++ b/packages/googlecloud/functions/getanswer/inquirer.py
@@ -233,24 +233,29 @@ def process_and_concat_documents(retrieved_docs):
 
     for source, docs in retrieved_docs.items():
         sorted_docs = sort_retrieved_documents(docs)
-        for doc, score in sorted_docs:
+        print("Sorted Docs:", sorted_docs)
+
+        # Filter the top 5 docs with the highest similarity scores
+        top_5_docs = sorted(sorted_docs, key=lambda x: x[1], reverse=True)[:5]
+        print("Top 5 Docs:", top_5_docs)
+
+        for doc, score in top_5_docs:
             combined_docs_content.append(doc.page_content)
             original_documents.append(doc)
 
     combined_content = "\n\n".join(combined_docs_content)
     return combined_content, original_documents
 
-
 def get_indepth_response_from_query(df, db_fc, db_cj, db_pdf, db_pc, db_news, query, k):
     logger.info("Performing in-depth summary query...")
 
-    llm = ChatOpenAI(model_name="gpt-4-turbo")
+    llm = ChatOpenAI(model_name="gpt-4o")
 
     retrievers = [db_fc, db_cj, db_pdf, db_pc, db_news]
-    retriever_names = ["fc", "cj", "pdf",]
+    retriever_names = ["fc", "cj",]
 
     retrieval_chains = {
-        name: RunnableLambda(lambda q, db=db: db.similarity_search_with_score(q, k=10))
+        name: RunnableLambda(lambda q, db=db: db.similarity_search_with_score(q, k=25))
         for name, db in zip(retriever_names, retrievers)
     }
     retrievals = RunnableParallel(retrieval_chains)
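The dict comprehension in the hunk above binds each vector store through the db=db default argument; without it, every lambda would close over the loop variable and end up querying only the last store in the list. Below is a minimal, runnable sketch of the same fan-out pattern with stand-in stores (FakeStore and its made-up scores are hypothetical; the production code passes the FAISS-backed db_fc, db_cj, etc.):

# Sketch of the RunnableParallel fan-out used above, with stand-in stores.
# FakeStore is hypothetical; the real code passes FAISS stores whose
# similarity_search_with_score returns (Document, score) pairs.
from langchain_core.runnables import RunnableLambda, RunnableParallel


class FakeStore:
    def __init__(self, name):
        self.name = name

    def similarity_search_with_score(self, query, k=4):
        # Return (text, score) pairs; lower score = closer match, like FAISS.
        return [(f"{self.name} doc {i} for {query!r}", float(i)) for i in range(k)]


retrievers = [FakeStore("fc"), FakeStore("cj")]
retriever_names = ["fc", "cj"]

retrieval_chains = {
    # db=db freezes the current store for each lambda at definition time.
    name: RunnableLambda(lambda q, db=db: db.similarity_search_with_score(q, k=3))
    for name, db in zip(retriever_names, retrievers)
}
retrievals = RunnableParallel(retrieval_chains)

print(retrievals.invoke("police budget"))  # {'fc': [...], 'cj': [...]}

The parallel runnable returns a dict keyed by retriever name, which is the retrieved_docs mapping that process_and_concat_documents iterates over.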
From f4bdefcb694b384bc2c348933f476a7340c1d48c Mon Sep 17 00:00:00 2001
From: Ayyub Ibrahim
Date: Fri, 12 Jul 2024 08:27:42 -0500
Subject: [PATCH 2/3] reduced k from 10 to 5

---
 .../functions/getanswer/inquirer.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/packages/googlecloud/functions/getanswer/inquirer.py b/packages/googlecloud/functions/getanswer/inquirer.py
index 5e82465..354b751 100644
--- a/packages/googlecloud/functions/getanswer/inquirer.py
+++ b/packages/googlecloud/functions/getanswer/inquirer.py
@@ -221,7 +221,7 @@ def transform_query_for_date(query):
     )
 
 
-def process_and_concat_documents(retrieved_docs):
+def process_and_concat_documents(retrieved_docs, max_docs=5):
     """
     Process and combine documents from multiple sources.
 
@@ -230,18 +230,18 @@ def process_and_concat_documents(retrieved_docs):
     """
    combined_docs_content = []
     original_documents = []
+    all_docs = []
 
     for source, docs in retrieved_docs.items():
         sorted_docs = sort_retrieved_documents(docs)
-        print("Sorted Docs:", sorted_docs)
-
-        # Filter the top 5 docs with the highest similarity scores
-        top_5_docs = sorted(sorted_docs, key=lambda x: x[1], reverse=True)[:5]
-        print("Top 5 Docs:", top_5_docs)
-
-        for doc, score in top_5_docs:
-            combined_docs_content.append(doc.page_content)
-            original_documents.append(doc)
+        all_docs.extend(sorted_docs)
+
+    # Sort all documents by score and take the top max_docs
+    top_docs = sorted(all_docs, key=lambda x: x[1], reverse=True)[:max_docs]
+
+    for doc, score in top_docs:
+        combined_docs_content.append(doc.page_content)
+        original_documents.append(doc)
 
     combined_content = "\n\n".join(combined_docs_content)
     return combined_content, original_documents
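PATCH 2 still ranks the merged (doc, score) pairs with reverse=True. Assuming the LangChain FAISS default, similarity_search_with_score returns a distance-style score where lower means more similar, so a descending sort keeps the least similar documents; PATCH 3 below switches to reverse=False for that reason. A short sketch of the intended selection on made-up (doc, score) tuples, with heapq.nsmallest shown as an equivalent alternative:

# Cross-source top-k selection, assuming FAISS-style scores where a
# *lower* score means a closer match. The sample pairs are made up.
import heapq

retrieved = {
    "fc": [("fc doc A", 0.21), ("fc doc B", 0.87)],
    "cj": [("cj doc C", 0.05), ("cj doc D", 0.64)],
}

all_docs = []
for source, docs in retrieved.items():
    all_docs.extend(docs)

max_docs = 2

# Ascending sort keeps the closest matches (what PATCH 3 does with reverse=False)...
top_docs = sorted(all_docs, key=lambda x: x[1])[:max_docs]

# ...which is the same as taking the k smallest distances directly.
assert top_docs == heapq.nsmallest(max_docs, all_docs, key=lambda x: x[1])

print(top_docs)  # [('cj doc C', 0.05), ('fc doc A', 0.21)]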
From 489c48a9737ea7605c2a249ff8ccd1372bb2d0f6 Mon Sep 17 00:00:00 2001
From: Ayyub Ibrahim
Date: Sat, 13 Jul 2024 11:58:37 -0500
Subject: [PATCH 3/3] revised prompt; fixed timestamps

---
 .../functions/getanswer/inquirer.py | 68 +++++++++++--------
 packages/web/lib/utils.ts           |  8 ++-
 2 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/packages/googlecloud/functions/getanswer/inquirer.py b/packages/googlecloud/functions/getanswer/inquirer.py
index 354b751..e794fbc 100644
--- a/packages/googlecloud/functions/getanswer/inquirer.py
+++ b/packages/googlecloud/functions/getanswer/inquirer.py
@@ -221,7 +221,7 @@ def transform_query_for_date(query):
     )
 
 
-def process_and_concat_documents(retrieved_docs, max_docs=5):
+def process_and_concat_documents(retrieved_docs, max_docs=10):
     """
     Process and combine documents from multiple sources.
 
@@ -237,7 +237,7 @@ def process_and_concat_documents(retrieved_docs, max_docs=5):
         all_docs.extend(sorted_docs)
 
     # Sort all documents by score and take the top max_docs
-    top_docs = sorted(all_docs, key=lambda x: x[1], reverse=True)[:max_docs]
+    top_docs = sorted(all_docs, key=lambda x: x[1], reverse=False)[:max_docs]
 
     for doc, score in top_docs:
         combined_docs_content.append(doc.page_content)
@@ -245,17 +245,20 @@ def process_and_concat_documents(retrieved_docs, max_docs=5):
     combined_content = "\n\n".join(combined_docs_content)
     return combined_content, original_documents
 
-
 def get_indepth_response_from_query(df, db_fc, db_cj, db_pdf, db_pc, db_news, query, k):
     logger.info("Performing in-depth summary query...")
 
     llm = ChatOpenAI(model_name="gpt-4o")
 
+    # New function to edit the query for document retrieval
+    def edit_query_for_retrieval(query):
+        return query + " If relevant, include sources that support and sources that do not support the query. Additionally, include perspectives from city council, civil society, and the public."
+
     retrievers = [db_fc, db_cj, db_pdf, db_pc, db_news]
-    retriever_names = ["fc", "cj",]
+    retriever_names = ["fc", "cj"]
 
     retrieval_chains = {
-        name: RunnableLambda(lambda q, db=db: db.similarity_search_with_score(q, k=25))
+        name: RunnableLambda(lambda q, db=db: db.similarity_search_with_score(edit_query_for_retrieval(q), k=50))
         for name, db in zip(retriever_names, retrievers)
     }
     retrievals = RunnableParallel(retrieval_chains)
@@ -266,40 +269,48 @@ def get_indepth_response_from_query(df, db_fc, db_cj, db_pdf, db_pc, db_news, qu
     )
 
     template = """
-    ### Task
-    Focus exclusively on answering the specific question: '{question}'.
-    Extract and include information from the New Orleans city council documents provided that is directly relevant to this question. Refrain from including any additional analysis, context, or details that do not contribute to a concise and direct answer to the question.
+    You are an AI assistant tasked with analyzing New Orleans city council transcripts to answer specific questions. Your goal is to provide accurate, relevant, and concise responses based on the information contained in the provided documents. Follow these instructions carefully:

+    1. You will be given two inputs:
+    {question}
+    This is the specific question you need to answer based on the city council documents.

-    ### Relevance Guidelines
-    Directly relevant information must explicitly pertain to the question.
-    Information that is indirectly relevant should only be used to clarify the context necessary for understanding the direct answer.
-    Omit any information that is irrelevant or tangential to the question.
+    {docs}
+    These are the New Orleans city council documents you will analyze to answer the question.

-    ### Summary Guidelines
-    Follow the guidelines below if they assist in providing a more clear answer to {question}
-    If relevant, extract the key points, decisions, and actions discussed during the city council meetings relevant to {question};
-    highlight any immediate shortcomings, mistakes, or negative actions by the city council relevant to {question};
-    elaborate on the implications and broader societal or community impacts of the identified issues relevant to {question};
-    investigate any underlying biases or assumptions present in the city council's discourse or actions relevant to {question}.
-    If not relevant to the question, answer the question without expanding on these points.
+    2. Read and analyze the provided documents. Focus exclusively on finding information that is directly relevant to answering the question.

-    ### Bias Guidelines:
-    Be mindful of biases in the document corpus. These documents were produced by city council, therefore, you must be aware of the inherent biases toward its behavior.
+    3. When analyzing the documents, adhere to these guidelines:
+      a. Relevance criteria:
+      - Include only information that explicitly pertains to the question.
+      - Use indirectly relevant information only if it's necessary to clarify the context of the direct answer.
+      - Omit any information that is irrelevant or tangential to the question.

-    ### Formatting Instructions
-    Deliver the response in unformatted paragraph form.
-    Avoid any lists or bullet points.
-    Do not mention document analysis methods or publication dates.
+      b. Summary guidelines (apply these only if they help answer the specific question):
+      - Extract key points, decisions, and actions from the city council meetings.
+      - Highlight any immediate shortcomings, mistakes, or negative actions by the city council.
+      - Elaborate on the implications and broader societal or community impacts of the identified issues.
+      - Investigate any underlying biases or assumptions present in the city council's discourse or actions.

-    If your response includes technical or uncommon terms related to city council that may not be widely understood, provide a brief definition for those terms at the end of your response. Ensure each definition is on a new line, formatted as follows:
+      c. Bias awareness:
+      - Be mindful that the documents were produced by the city council and may contain inherent biases toward its own behavior.

-    Definitions:
+    4. If the documents do not contain any relevant information in response to the question, return 'The documents do not provide sufficient information to respond to this query'.
+
+    5. Format your response as follows:
+    - Deliver your answer in unformatted paragraph form.
+    - Do not use lists or bullet points.
+    - Do not mention document analysis methods or publication dates.
+
+    6. If your response includes technical or uncommon terms related to city council that may not be widely understood, provide definitions at the end of your response in this format:
+
+    Definitions:
     Word: Definition
     Word: Definition
     Word: Definition

-    ### Documents to Analyze
-    {docs}
+    Ensure each definition is on a new line.
+
+    7. Remember to focus solely on answering the specific question provided, using only the information from the given documents.
     """
 
     prompt_response = ChatPromptTemplate.from_template(template)
@@ -312,7 +323,6 @@ def get_indepth_response_from_query(df, db_fc, db_cj, db_pdf, db_pc, db_news, qu
 
     return process_streamed_responses_llm(responses_llm, original_documents)
 
-
 def get_general_summary_response_from_query(db, query, k):
     logger.info("Performing general summary query...")
     llm = ChatOpenAI(model_name="gpt-3.5-turbo-0613")
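The revised template above is consumed by ChatPromptTemplate.from_template, which fills the {question} and {docs} placeholders when the chain runs. A minimal illustration with a stand-in template (the short template text and sample values here are placeholders, not the production prompt):

# How a {question}/{docs} template is rendered; the template and values
# below are illustrative placeholders, not the real prompt.
from langchain_core.prompts import ChatPromptTemplate

template = """
Answer the question using only the documents.
Question: {question}
Documents: {docs}
"""

prompt_response = ChatPromptTemplate.from_template(template)
messages = prompt_response.format_messages(
    question="What did the council decide about the police budget?",
    docs="Doc 1 text\n\nDoc 2 text",
)
print(messages[0].content)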
diff --git a/packages/web/lib/utils.ts b/packages/web/lib/utils.ts
index f370151..aefca6c 100644
--- a/packages/web/lib/utils.ts
+++ b/packages/web/lib/utils.ts
@@ -36,9 +36,11 @@ export function getYouTubeEmbedUrl(
   let startTime = 0;
 
   if (timestamp) {
-    const [minutes, seconds] = timestamp.split(":").map(Number);
-    startTime = minutes * 60 + seconds;
+    // Split the timestamp if it's a range and take the start time
+    const startTimestamp = timestamp.split('-')[0];
+    const [hours, minutes, seconds] = startTimestamp.split(':').map(Number);
+    startTime = (hours * 3600) + (minutes * 60) + seconds;
   }
 
   return `https://www.youtube.com/embed/${videoId}?autoplay=0&start=${startTime}`;
-}
+}
\ No newline at end of file
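The new utils.ts parsing assumes an HH:MM:SS start value (optionally the first half of a start-end range); if a bare MM:SS timestamp ever reaches it, seconds is undefined after the destructuring and startTime becomes NaN. For illustration only, a Python sketch of a parser that accepts either form — the helper name and error handling are suggestions, not part of the patch:

# Illustrative Python version of the timestamp handling in utils.ts:
# accepts "MM:SS" or "HH:MM:SS", optionally as the start of a "start-end" range.
def timestamp_to_seconds(timestamp: str) -> int:
    start = timestamp.split("-")[0].strip()  # keep only the start of a range
    parts = [int(p) for p in start.split(":")]
    if len(parts) == 2:  # MM:SS
        minutes, seconds = parts
        return minutes * 60 + seconds
    if len(parts) == 3:  # HH:MM:SS
        hours, minutes, seconds = parts
        return hours * 3600 + minutes * 60 + seconds
    raise ValueError(f"Unrecognized timestamp: {timestamp!r}")


print(timestamp_to_seconds("1:02:30-1:05:00"))  # 3750
print(timestamp_to_seconds("12:34"))            # 754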