-
Notifications
You must be signed in to change notification settings - Fork 5
/
utils.py
128 lines (109 loc) · 4.32 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from llama_index.core import SimpleDirectoryReader,Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from pypdf import PdfReader
import numpy as np
from llama_index.core import Document
from llama_index.core import SummaryIndex,VectorStoreIndex
from llama_index.core import PromptTemplate
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
# Configure global LlamaIndex defaults at import time (module-level side effect):
# - LLM: local Ollama 'phi3' model; 1-hour request timeout for slow local inference.
# - Embeddings: Ollama 'nomic-embed-text'.
# NOTE(review): temperature=1 is unusually high for grounded RAG answering — confirm intended.
Settings.llm = Ollama(model='phi3',request_timeout=3600.0,temperature=1)
Settings.embed_model = OllamaEmbedding(model_name='nomic-embed-text')
def pre_processing(reader):
    """Extract and chunk a PDF's text into overlapping sentence groups.

    Parameters
    ----------
    reader : pypdf.PdfReader
        Open PDF reader; every page's text is concatenated.

    Returns
    -------
    list[str]
        Chunks of up to ``group_size`` sentences, consecutive chunks sharing
        ``overlap`` sentences. When both markers are present, only the span
        from the first "abstract" to the last "references" is kept.
    """
    # Join all page text at once instead of quadratic `+=` accumulation.
    # NOTE(review): the positional 0 restricts pypdf extraction to 0-degree
    # (horizontal) text — TODO confirm that is intended. Pages are glued with
    # no separator, so a word split across a page boundary may be fused.
    full_text = ''.join(page.extract_text(0) for page in reader.pages)

    # Normalize: flatten newlines and lowercase so the section markers match.
    cleaned = full_text.replace('\n', ' ').lower()

    # Trim to the body between "abstract" and the final "references";
    # if either marker is missing, keep the whole document.
    start = cleaned.find("abstract")
    end = cleaned.rfind("references")
    if start != -1 and end != -1:
        cleaned = cleaned[start:end]

    # Re-group sentences into overlapping windows.
    sentences = cleaned.split('. ')
    group_size = 20  # sentences per chunk
    overlap = 1      # sentences shared between consecutive chunks
    step = group_size - overlap

    chunks = []
    for i in range(0, len(sentences), step):
        text = '. '.join(sentences[i:i + group_size])
        if len(text) > 10:  # skip near-empty trailing fragments
            chunks.append(text)
    return chunks
# Prompt templates formatted for phi3's chat markup (<|user|> ... <|end|> <|assistant|>).
# The {placeholders} are filled in by LlamaIndex's response synthesizer.

# Question-answering template: answer strictly from retrieved context.
qa_prompt_tmpl_str = ("<|user|>\n"
"Context information is below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Given the context information and not prior knowledge, answer the query\n"
"Query: {query_str}"
" <|end|>\n"
"<|assistant|>"
)
# Refine template: revise an existing answer with additional context,
# keeping the original answer when the new context is not useful.
refine_prompt_tmpl_str = ("<|user|>\n"
"The original query is as follows: {query_str}\n"
"We have provided an existing answer: {existing_answer}\n"
"We have the opportunity to refine the existing answer (only if needed) with some more context below.\n"
"---------------------\n"
"{context_msg}\n"
"---------------------\n"
"Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer."
" <|end|>\n"
"<|assistant|>"
)
# Summarization template: answer from context aggregated across sources
# (used with the tree_summarize response mode).
summary_prompt_tmpl_str = ("<|user|>\n"
"Context information from multiple sources is below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Given the information from multiple sources and not prior knowledge, answer the query.\n"
"Query: {query_str}"
" <|end|>\n"
"<|assistant|>"
)
def get_agent(uploaded_pdf):
    """Build a router query engine over an uploaded PDF.

    Creates a vector index (for specific questions) and a summary index
    (for summarization questions) from the pre-processed PDF text, installs
    the phi3 prompt templates on each engine, and returns a RouterQueryEngine
    that routes each query to the right tool via an LLM selector.
    """
    # Chunk the PDF's text into overlapping sentence groups.
    chunks = pre_processing(PdfReader(uploaded_pdf))

    # Vector index covers every chunk; the summary index uses only the
    # first chunk, keeping tree-summarization cheap on local models.
    vector_index = VectorStoreIndex([Document(text=c) for c in chunks])
    summary_index = SummaryIndex([Document(text=c) for c in chunks[:1]])

    vector_engine = vector_index.as_query_engine(streaming=True)
    summary_engine = summary_index.as_query_engine(
        response_mode="tree_summarize", streaming=True
    )

    # Install the phi3-formatted prompts on each engine.
    vector_engine.update_prompts(
        {"response_synthesizer:text_qa_template": PromptTemplate(qa_prompt_tmpl_str)}
    )
    vector_engine.update_prompts(
        {"response_synthesizer:refine_template": PromptTemplate(refine_prompt_tmpl_str)}
    )
    summary_engine.update_prompts(
        {"response_synthesizer:summary_template": PromptTemplate(summary_prompt_tmpl_str)}
    )

    # Route each incoming query to the summarization or retrieval tool.
    return RouterQueryEngine(
        selector=LLMSingleSelector.from_defaults(),
        query_engine_tools=[
            QueryEngineTool.from_defaults(
                query_engine=summary_engine,
                description=('Useful for summarization or general description questions'),
            ),
            QueryEngineTool.from_defaults(
                query_engine=vector_engine,
                description=('Useful for specific questions or topic based questions which are not related to summarization or general description'),
            ),
        ],
        verbose=True,
    )