-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add qa bot * add code force * mypy * fix md load bug * fix md load bug * add summary * rm origin file * add des * add init info * add notes * reformat and rename * add readme * add readme * fix des * add note * add faiss vectorbase * revise * revise * rm faiss * Update README.md 修改ERNIE Bot Agent * rm db file * rm db file * update EB
- Loading branch information
Showing
3 changed files
with
263 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# ERNIE Bot Agent QA Bot | ||
|
||
ERNIE Bot Agent QA Bot是一个ERNIE Bot Agent使用教学机器人,基于 `FunctionAgentWithRetrieval`。该工具旨在辅助用户解决与EB-Agent相关的问题,帮助用户更快地使用 `erniebot_agent`库,搭建属于自己的Agent。 | ||
|
||
## 架构 | ||
|
||
此应用基于 `FunctionAgentWithRetrieval`(后续 `RetrievalAgent`上线后将同步更换),将此仓库中相关模块的markdown文件以及ipynb的示例代码文件向量化并通过自定义检索工具检索,实现EB-Agent教学机器人。 | ||
|
||
### 自定义检索工具 | ||
|
||
此应用中的检索工具基于 `langchain`的 `faiss`本地向量库,同时基于此应用特性,用户可能需要了解具体的代码实现。因此在实现时同时检索召回说明文档的内容(存储于db)以及相关的代码内容(存储于module_code_db)。 | ||
|
||
```python | ||
class FaissSearch:
    """Retrieval tool holding the two vector stores used by the QA bot."""

    def __init__(self, db, embeddings, module_code_db):
        # Vector store of documentation chunks.
        self.db = db
        # Vector store of notebook code examples.
        self.module_code_db = module_code_db
        self.embeddings = embeddings
``` | ||
|
||
## 如何开始 | ||
|
||
**注意:** 建库的过程比较缓慢,请耐心等待。 | ||
|
||
> 第一步:下载项目源代码,请确保您已经安装了erniebot_agent以及erniebot | ||
```bash | ||
git clone https://github.com/PaddlePaddle/ERNIE-Bot-SDK.git | ||
cd ERNIE-Bot-SDK | ||
pip install erniebot-agent erniebot | ||
``` | ||
|
||
> 第二步:如果是第一次运行,请先初始化向量库(也可以直接使用应用中已上传的向量库) | ||
```bash | ||
python question_bot.py --init=True --access-token <aistudio-access-token> | ||
``` | ||
|
||
> 如果已经初始化过向量库,直接运行即可 | ||
```bash | ||
python question_bot.py --access-token <aistudio-access-token> | ||
``` |
119 changes: 119 additions & 0 deletions
119
erniebot-agent/applications/eb-agent-qa-bot/init_vector_db.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from typing import List, Union | ||
|
||
import erniebot | ||
import nbformat | ||
from langchain.text_splitter import ( | ||
MarkdownHeaderTextSplitter, | ||
RecursiveCharacterTextSplitter, | ||
) | ||
from langchain.vectorstores import FAISS | ||
from langchain_core.documents import Document | ||
from tqdm import tqdm | ||
|
||
# Markdown heading markers paired with the metadata key used for the matching
# heading text; this list is handed to MarkdownHeaderTextSplitter.
# Only the two top heading levels are split on. Add ("###", "Header 3") and
# ("####", "Header 4") entries to split on deeper levels as well.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
|
||
|
||
def get_summary(content: str) -> Union[str, None]:
    """Ask an ERNIE chat model for a retrieval-oriented summary of markdown text.

    The (Chinese) prompt requests a summary of at most 400 characters that is
    meant to be matched against user questions about the documentation. The
    model name is hard-coded below; change it to use a different LLM.

    Args:
        content: Markdown text to summarise.

    Returns:
        The value of ``get_result()`` on the chat completion response.
    """
    chat_message = {"role": "user", "content": f"请帮我给以下markdown文件生成摘要用于用户问文档内容时的检索匹配,不要超过400个字:\n{content}"}
    response = erniebot.ChatCompletion.create(
        model="ernie-longtext",
        messages=[chat_message],
    )
    return response.get_result()
|
||
|
||
def open_and_concatenate_ipynb(ipynb_path: str, encoding: str) -> str:
    """Return the code cells of a notebook as consecutive fenced python blocks.

    Args:
        ipynb_path: Path of the ``.ipynb`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        One string in which every code cell, in notebook order, is wrapped in
        a ```` ```python ```` fence followed by a blank line. Non-code cells
        are ignored; a notebook without code cells yields ``""``.
    """
    with open(ipynb_path, "r", encoding=encoding) as f:
        notebook_content = nbformat.read(f, as_version=4)

    fenced_cells = []
    for cell in notebook_content["cells"]:
        if cell["cell_type"] != "code":
            continue
        source = cell["source"]
        # Without a trailing newline the closing fence would be glued onto the
        # last line of code ("print(x)```"), which breaks the markdown block.
        if source and not source.endswith("\n"):
            source += "\n"
        fenced_cells.append("```python\n" + source + "```\n\n")

    return "".join(fenced_cells)
|
||
|
||
def read_md_file(file_path: str) -> Union[str, None]:
    """Read a UTF-8 markdown file, returning ``None`` instead of raising.

    Failures are reported on stdout so that a batch load can skip the file and
    carry on with the remaining ones.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content, or ``None`` when the file is missing, cannot be
        opened or read, or is not valid UTF-8.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"文件 '{file_path}' 不存在。")
    except (OSError, ValueError) as e:
        # OSError: permission problems, directories, other I/O failures.
        # ValueError: UnicodeDecodeError (a subclass) and malformed paths.
        print(f"读取文件时出现错误: {e}")
    return None
|
||
|
||
def load_md_files_to_doc(
    file_paths: List[str],
    chunk_size: int = 1000,
    chunk_overlap: int = 30,
) -> List[Document]:
    """Split markdown files into chunks whose searchable text is an LLM summary.

    Each file is split on the headings in ``headers_to_split_on`` and then
    into character chunks. For every chunk the original text is kept in
    ``metadata["raw_text"]`` while ``page_content`` is replaced by the summary
    from ``get_summary``, so the summary is what gets embedded.

    Args:
        file_paths: Markdown files to load; unreadable files are skipped.
        chunk_size: Maximum chunk size passed to the character splitter.
        chunk_overlap: Overlap between neighbouring chunks.

    Returns:
        All chunks of all readable files, in input order.
    """
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    output_document: List[Document] = []
    for file in tqdm(file_paths):
        content = read_md_file(file)
        if content is None:
            continue
        md_header_splits = markdown_splitter.split_text(content)
        for split in text_splitter.split_documents(md_header_splits):
            raw_text = split.page_content
            split.metadata["raw_text"] = raw_text
            # get_summary is annotated as possibly returning None; keep the
            # raw text searchable rather than storing a missing/empty summary.
            split.page_content = get_summary(raw_text) or raw_text
            output_document.append(split)
    return output_document
|
||
|
||
def init_db(faiss_name, faiss_name_module, embeddings) -> None:
    """Build both FAISS indexes and save them to disk.

    The first index holds summarised chunks of the module and quickstart
    markdown docs. The second holds one entry per cookbook notebook: the
    notebook name is the searchable text and the concatenated code cells are
    stored in ``metadata["ipynb"]``.

    All input paths are relative (``./docs/...``), so the result depends on
    the current working directory.

    Args:
        faiss_name: Destination of the documentation index.
        faiss_name_module: Destination of the notebook-code index.
        embeddings: Embedding object handed to ``FAISS.from_documents``.

    Raises:
        ValueError: If none of the markdown files produced any chunk.
    """
    from pathlib import PurePosixPath

    md_file_path = [
        "./docs/modules/file.md",
        "./docs/modules/agents.md",
        "./docs/modules/memory.md",
        "./docs/modules/message.md",
        "./docs/modules/chat_models.md",
        "./docs/modules/tools.md",
        "./docs/quickstart/agent.md",
        "./docs/quickstart/use-tool.md",
    ]
    chunk_size = 1000
    chunk_overlap = 30
    content_doc = load_md_files_to_doc(md_file_path, chunk_size, chunk_overlap)
    if not content_doc:
        # Unreadable files are skipped by the loader, so an empty result means
        # nothing could be read at all; report that instead of indexing nothing.
        raise ValueError(
            "No markdown content could be loaded; check that the ./docs paths "
            "resolve from the current working directory."
        )

    db = FAISS.from_documents(content_doc, embeddings)
    db.save_local(faiss_name)

    ipynb_path = [
        "./docs/cookbooks/agent/function_agent.ipynb",
        "./docs/cookbooks/agent/chat_models.ipynb",
        "./docs/cookbooks/agent/memory.ipynb",
        "./docs/cookbooks/agent/message.ipynb",
        "./docs/cookbooks/agent/local_tool.ipynb",
        "./docs/cookbooks/agent/tools_intro.ipynb",
        "./docs/cookbooks/agent/remote-tool/remote_tool.ipynb",
    ]
    # The notebook file name without its extension (e.g. "memory") is the text
    # that gets embedded; the code itself travels along as metadata.
    module_doc = [
        Document(
            page_content=PurePosixPath(path).stem,
            metadata={"ipynb": open_and_concatenate_ipynb(path, "utf-8")},
        )
        for path in ipynb_path
    ]

    module_code_db = FAISS.from_documents(module_doc, embeddings)
    module_code_db.save_local(faiss_name_module)
101 changes: 101 additions & 0 deletions
101
erniebot-agent/applications/eb-agent-qa-bot/question_bot.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
import argparse | ||
import os | ||
|
||
# Deliberately placed ahead of the erniebot_agent imports: presumably the
# package reads its logging level from the environment at import time
# (TODO confirm).
os.environ["EB_AGENT_LOGGING_LEVEL"] = "INFO"
|
||
from init_vector_db import init_db | ||
from langchain.vectorstores import FAISS | ||
from sklearn.metrics.pairwise import cosine_similarity | ||
|
||
from erniebot_agent.agents.function_agent_with_retrieval import ( | ||
FunctionAgentWithRetrieval, | ||
) | ||
from erniebot_agent.chat_models import ERNIEBot | ||
from erniebot_agent.extensions.langchain.embeddings import ErnieEmbeddings | ||
from erniebot_agent.memory import SystemMessage | ||
|
||
def _str2bool(value):
    """Convert a command-line token such as ``True``/``false``/``1``/``no`` to bool."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser()
# ``type=bool`` would map every non-empty string, including "False", to True.
# ``nargs="?"`` additionally allows a bare ``--init`` while keeping
# ``--init=True`` working.
parser.add_argument(
    "--init",
    type=_str2bool,
    nargs="?",
    const=True,
    default=False,
    help="rebuild the FAISS vector stores before starting",
)
parser.add_argument("--access-token", type=str, help="access token for erniebot-agent")
args = parser.parse_args()

if args.access_token:
    os.environ["EB_AGENT_ACCESS_TOKEN"] = args.access_token

# Fail with a usage message rather than a bare KeyError further down.
if not os.environ.get("EB_AGENT_ACCESS_TOKEN"):
    parser.error("no access token: pass --access-token or set EB_AGENT_ACCESS_TOKEN")

embeddings = ErnieEmbeddings(aistudio_access_token=os.environ["EB_AGENT_ACCESS_TOKEN"], chunk_size=16)
|
||
|
||
class FaissSearch:
    """Knowledge base that retrieves documentation chunks plus one code example.

    ``db`` indexes documentation chunks (original text in
    ``metadata["raw_text"]``); ``module_code_db`` indexes notebook examples
    (code in ``metadata["ipynb"]``).
    """

    def __init__(self, db, embeddings, module_code_db):
        self.db = db
        self.module_code_db = module_code_db
        self.embeddings = embeddings

    def search(self, query: str, top_k: int = 2):
        """Return the best documentation chunks followed by the best code example.

        Args:
            query: User question.
            top_k: Number of documentation chunks to retrieve.

        Returns:
            A list of ``{"content", "score", "title"}`` dicts. Documentation
            entries are scored with the cosine similarity between the query
            and the chunk's indexed text; the code example, when one is
            found, comes last with a fixed score of 1.
        """
        retrieval_results = []

        docs = self.db.similarity_search(query, top_k)
        # Only embed when there is something to score: the similarity
        # computation is skipped entirely for an empty result.
        if docs:
            para_result = self.embeddings.embed_documents([doc.page_content for doc in docs])
            query_result = self.embeddings.embed_query(query)
            similarities = cosine_similarity([query_result], para_result).reshape((-1,))
            for doc, score in zip(docs, similarities):
                # 'raw_text' must be present in doc.metadata; the top-level
                # heading is optional and defaults to an empty title.
                retrieval_results.append(
                    {
                        "content": doc.metadata["raw_text"],
                        "score": score,
                        "title": doc.metadata.get("Header 1", ""),
                    }
                )

        # module_code_db supplies the single most relevant code example;
        # 'ipynb' must be present in its metadata.
        code_hits = self.module_code_db.similarity_search(query, 1)
        if code_hits:
            code = code_hits[0]
            retrieval_results.append({"content": code.metadata["ipynb"], "score": 1, "title": code.page_content})

        return retrieval_results
|
||
|
||
def load_agent():
    """Create the QA agent backed by the two FAISS indexes next to this script.

    When ``--init`` was given, the indexes are rebuilt first. They are then
    loaded from ``faiss_index`` and ``faiss_index_module`` in this file's
    directory.

    Returns:
        A ``FunctionAgentWithRetrieval`` with no tools, using ``FaissSearch``
        as its knowledge base.

    Raises:
        RuntimeError: If loading an index raises ``RuntimeError``; the
            original error is chained as the cause.
    """
    app_dir = os.path.dirname(os.path.abspath(__file__))
    faiss_name = os.path.join(app_dir, "faiss_index")
    faiss_name_module = os.path.join(app_dir, "faiss_index_module")
    if args.init:
        init_db(faiss_name, faiss_name_module, embeddings)
    try:
        db = FAISS.load_local(faiss_name, embeddings)
        module_code_db = FAISS.load_local(faiss_name_module, embeddings)
    except RuntimeError as e:
        # Chain the cause so the underlying traceback stays visible.
        raise RuntimeError(f"Make sure you have initialized the database first.\n {e}") from e

    llm = ERNIEBot(model="ernie-3.5")
    faiss_search = FaissSearch(db=db, embeddings=embeddings, module_code_db=module_code_db)
    agent = FunctionAgentWithRetrieval(
        llm=llm,
        tools=[],
        knowledge_base=faiss_search,
        threshold=0,
        system_message=SystemMessage(
            "你是ERNIEBot Agent的小助手,用于解决用户关于EB-Agent的问题,涉及File, Memory, Message, Agent, ChatModels等模块。"
            "请你严格按照搜索到的内容回答,不要自己生成相关代码。如果询问与ERNIEBot Agent无关的问题,请直接回答“我只能回答EB—Agent相关问题”"
        ),
        top_k=2,
        token_limit=5000,
    )
    return agent
|
||
|
||
async def main(agent):
    """Run one sample question through the agent and print the answer text.

    Other sample questions that can be swapped in:
    '怎么从aistudio创建远程tool?' and '如何创建一个LocalTool?'.

    Args:
        agent: Object with an awaitable ``run(prompt)`` whose result exposes
            a ``text`` attribute.
    """
    response = await agent.run("如何创建一个agent")
    print(response.text)
|
||
|
||
if __name__ == "__main__":
    agent = load_agent()
    # Console alternative to the web demo: asyncio.run(main(agent)).
    # It needs `import asyncio`, which this script does not import.
    agent.launch_gradio_demo()