Skip to content

Commit

Permalink
[APP]Add EB QA Bot (#263)
Browse files Browse the repository at this point in the history
* add qa bot

* add code force

* mypy

* fix md load bug

* fix md load bug

* add summary

* rm origin file

* add des

* add init info

* add notes

* reformat and rename

* add readme

* add readme

* fix des

* add note

* add faiss vectorbase

* revise

* revise

* rm faiss

* Update README.md

修改ERNIE Bot Agent

* rm db file

* rm db file

* update EB
  • Loading branch information
Southpika authored Jan 10, 2024
1 parent ed42aa0 commit a77a311
Show file tree
Hide file tree
Showing 3 changed files with 263 additions and 0 deletions.
43 changes: 43 additions & 0 deletions erniebot-agent/applications/eb-agent-qa-bot/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# ERNIE Bot Agent QA Bot

ERNIE Bot Agent QA Bot 是一个 ERNIE Bot Agent 使用教学机器人,基于 `FunctionAgentWithRetrieval` 实现。该工具旨在辅助用户解决与 EB-Agent 相关的问题,帮助用户更快地使用 `erniebot_agent` 库,搭建属于自己的 Agent。

## 架构

此应用基于 `FunctionAgentWithRetrieval`(后续 `RetrievalAgent`上线后将同步更换),将此仓库中相关模块的markdown文件以及ipynb的示例代码文件向量化并通过自定义检索工具检索,实现EB-Agent教学机器人。

### 自定义检索工具

此应用中的检索工具基于 `langchain` 的 `faiss` 本地向量库。同时,考虑到此应用的特性,用户可能需要了解具体的代码实现,因此检索时会同时召回说明文档的内容(存储于 db)以及相关的代码内容(存储于 module_code_db)。

```python
class FaissSearch:
    def __init__(self, db, embeddings, module_code_db):
        self.db = db
        self.module_code_db = module_code_db
        self.embeddings = embeddings
```

## 如何开始

**注意:** 建库的过程比较缓慢,请耐心等待。

> 第一步:下载项目源代码,请确保您已经安装了erniebot_agent以及erniebot
```bash
git clone https://github.com/PaddlePaddle/ERNIE-Bot-SDK.git
cd ERNIE-Bot-SDK
pip install erniebot-agent
```

> 第二步:如果是第一次运行,请先初始化向量库(应用中同时上传了向量库也可以)
```bash
python question_bot.py --init=True --access-token <aistudio-access-token>
```

> 如果已经初始化过向量库,直接运行即可
```bash
python question_bot.py --access-token <aistudio-access-token>
```
119 changes: 119 additions & 0 deletions erniebot-agent/applications/eb-agent-qa-bot/init_vector_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from typing import List, Union

import erniebot
import nbformat
from langchain.text_splitter import (
MarkdownHeaderTextSplitter,
RecursiveCharacterTextSplitter,
)
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from tqdm import tqdm

# Markdown heading markers paired with the metadata key stored for that level.
# Only the first two heading levels are used as split points; the deeper
# levels ("###" -> "Header 3", "####" -> "Header 4") are currently disabled.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]


def get_summary(content: str) -> Union[str, None]:
    """Request a short, retrieval-oriented summary of markdown text.

    The prompt (written in Chinese) asks the model to summarize the given
    markdown in no more than 400 characters so the summary can be matched
    against user questions. Another LLM could be substituted here.

    Args:
        content: Markdown text to summarize.

    Returns:
        The value produced by ``get_result()`` on the chat completion response.
    """
    prompt = f"请帮我给以下markdown文件生成摘要用于用户问文档内容时的检索匹配,不要超过400个字:\n{content}"
    response = erniebot.ChatCompletion.create(
        model="ernie-longtext",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.get_result()


def open_and_concatenate_ipynb(ipynb_path: str, encoding: str) -> str:
    """Return the code cells of a notebook as consecutive fenced code blocks.

    Args:
        ipynb_path: Path of the ``.ipynb`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The source of every ``code`` cell, in notebook order, each wrapped in
        a Markdown ``python`` code fence and followed by a blank line. An
        empty string is returned when the notebook contains no code cells.
    """
    with open(ipynb_path, "r", encoding=encoding) as f:
        notebook_content = nbformat.read(f, as_version=4)

    fenced_cells = []
    for cell in notebook_content["cells"]:
        if cell["cell_type"] != "code":
            continue
        source = cell["source"]
        # The closing fence has to start on its own line. A cell source that
        # does not end with a newline would otherwise yield "print(x)```",
        # which is not a valid fence terminator.
        if source and not source.endswith("\n"):
            source += "\n"
        fenced_cells.append("```python\n" + source + "```\n\n")

    return "".join(fenced_cells)


def read_md_file(file_path: str) -> Union[str, None]:
    """Read a UTF-8 text file and return its content.

    Failures are reported on stdout rather than raised.

    Args:
        file_path: Path of the file to read.

    Returns:
        The full file content, or ``None`` when the file does not exist or
        any other exception occurs while opening or reading it.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            text = handle.read()
    except FileNotFoundError:
        print(f"文件 '{file_path}' 不存在。")
        return None
    except Exception as e:
        print(f"读取文件时出现错误: {e}")
        return None
    return text


def load_md_files_to_doc(
    file_paths: List[str],
    chunk_size: int = 1000,
    chunk_overlap: int = 30,
) -> List[Document]:
    """Split markdown files into chunks and replace each chunk by its summary.

    Every file is split first on the headers in ``headers_to_split_on`` and
    then by ``RecursiveCharacterTextSplitter``. For each resulting chunk the
    original text is kept in ``metadata["raw_text"]`` and ``page_content`` is
    replaced by the summary from ``get_summary``, so that retrieval matches
    on the summary while the raw text remains available.

    Args:
        file_paths: Markdown files to load. Files that cannot be read are
            skipped.
        chunk_size: ``chunk_size`` passed to the character splitter.
        chunk_overlap: ``chunk_overlap`` passed to the character splitter.

    Returns:
        The chunks of all readable files, in input order.
    """
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    output_document: List[Document] = []
    for file in tqdm(file_paths):
        content = read_md_file(file)
        if content is None:
            # read_md_file has already printed the reason; skip this file.
            continue
        md_header_splits = markdown_splitter.split_text(content)
        splits = text_splitter.split_documents(md_header_splits)
        for split in splits:
            raw_text = split.page_content
            split.metadata["raw_text"] = raw_text
            # The summary is what gets embedded for retrieval. get_summary is
            # annotated as possibly returning None, so fall back to the raw
            # text rather than storing an unusable page_content.
            summary = get_summary(raw_text)
            split.page_content = summary if summary else raw_text
        output_document.extend(splits)
    return output_document


def init_db(faiss_name, faiss_name_module, embeddings):
    """Build and save the two FAISS indexes used by the QA bot.

    The first index holds the summarized markdown documentation chunks; the
    second holds one entry per cookbook notebook, keyed by the notebook's
    file name (without extension) with the concatenated code cells stored in
    ``metadata["ipynb"]``.

    NOTE: all input paths are relative to the current working directory.

    Args:
        faiss_name: Location passed to ``save_local`` for the documentation index.
        faiss_name_module: Location passed to ``save_local`` for the code index.
        embeddings: Embedding object handed to ``FAISS.from_documents``.

    Raises:
        ValueError: If none of the markdown files yielded any content.
    """
    md_file_path = [
        "./docs/modules/file.md",
        "./docs/modules/agents.md",
        "./docs/modules/memory.md",
        "./docs/modules/message.md",
        "./docs/modules/chat_models.md",
        "./docs/modules/tools.md",
        "./docs/quickstart/agent.md",
        "./docs/quickstart/use-tool.md",
    ]
    chunk_size = 1000
    chunk_overlap = 30
    content_doc = load_md_files_to_doc(md_file_path, chunk_size, chunk_overlap)
    if not content_doc:
        # Unreadable files are skipped silently upstream, so a wrong working
        # directory ends up here with nothing to index. Fail with an
        # actionable message instead of building from an empty list.
        raise ValueError(
            "No markdown content could be loaded; the documentation paths are "
            "relative to the current working directory: " + ", ".join(md_file_path)
        )

    db = FAISS.from_documents(content_doc, embeddings)
    db.save_local(faiss_name)

    ipynb_path = [
        "./docs/cookbooks/agent/function_agent.ipynb",
        "./docs/cookbooks/agent/chat_models.ipynb",
        "./docs/cookbooks/agent/memory.ipynb",
        "./docs/cookbooks/agent/message.ipynb",
        "./docs/cookbooks/agent/local_tool.ipynb",
        "./docs/cookbooks/agent/tools_intro.ipynb",
        "./docs/cookbooks/agent/remote-tool/remote_tool.ipynb",
    ]
    # page_content is the notebook file name without directory or ".ipynb"
    # suffix; that short module name is what gets embedded for matching.
    module_doc = [
        Document(
            page_content=path[path.rfind("/") + 1 : path.rfind(".ipynb")],
            metadata={"ipynb": open_and_concatenate_ipynb(path, "utf-8")},
        )
        for path in ipynb_path
    ]

    module_code_db = FAISS.from_documents(module_doc, embeddings)
    module_code_db.save_local(faiss_name_module)
101 changes: 101 additions & 0 deletions erniebot-agent/applications/eb-agent-qa-bot/question_bot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import argparse
import os

os.environ["EB_AGENT_LOGGING_LEVEL"] = "INFO"

from init_vector_db import init_db
from langchain.vectorstores import FAISS
from sklearn.metrics.pairwise import cosine_similarity

from erniebot_agent.agents.function_agent_with_retrieval import (
FunctionAgentWithRetrieval,
)
from erniebot_agent.chat_models import ERNIEBot
from erniebot_agent.extensions.langchain.embeddings import ErnieEmbeddings
from erniebot_agent.memory import SystemMessage

def _str2bool(value):
    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``no``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser()
# ``type=bool`` would turn every non-empty string, including "False", into
# True. ``--init=True`` keeps working; a bare ``--init`` is accepted as well.
parser.add_argument("--init", type=_str2bool, nargs="?", const=True, default=False)
parser.add_argument("--access-token", type=str, help="access token for erniebot-agent")
args = parser.parse_args()

# A token given on the command line takes precedence over the environment.
if args.access_token:
    os.environ["EB_AGENT_ACCESS_TOKEN"] = args.access_token

if "EB_AGENT_ACCESS_TOKEN" not in os.environ:
    # Exit with a usage message instead of a bare KeyError below.
    parser.error("an access token is required: pass --access-token or set EB_AGENT_ACCESS_TOKEN")

embeddings = ErnieEmbeddings(aistudio_access_token=os.environ["EB_AGENT_ACCESS_TOKEN"], chunk_size=16)


class FaissSearch:
    """Retrieval helper that queries a documentation index and a code index."""

    def __init__(self, db, embeddings, module_code_db):
        # Vector store of documentation chunks; hits are expected to carry
        # the original text in metadata["raw_text"].
        self.db = db
        # Vector store of code examples; hits are expected to carry the
        # notebook code in metadata["ipynb"].
        self.module_code_db = module_code_db
        # Embedding object used to re-score the documentation hits.
        self.embeddings = embeddings

    def search(self, query: str, top_k: int = 2):
        """Return the best documentation chunks plus one code example.

        Args:
            query: The user question.
            top_k: Number of documentation chunks to request from ``db``.

        Returns:
            A list of dicts with keys ``content``, ``score`` and ``title``.
            Documentation hits come first, scored by cosine similarity
            between the query embedding and the embedding of the hit's
            ``page_content``. The single best code example, if any, comes
            last with a fixed score of 1. The list is empty when neither
            index returns a hit.
        """
        retrieval_results = []

        docs = self.db.similarity_search(query, top_k)
        if docs:
            # Only embed when there is something to score: an empty hit list
            # cannot be turned into a similarity matrix.
            para_result = self.embeddings.embed_documents([doc.page_content for doc in docs])
            query_result = self.embeddings.embed_query(query)
            similarities = cosine_similarity([query_result], para_result).reshape((-1,))
            for doc, score in zip(docs, similarities):
                retrieval_results.append(
                    {
                        # 'raw_text' must be present in doc.metadata.
                        "content": doc.metadata["raw_text"],
                        "score": score,
                        # Chunks that precede any level-1 header have no title.
                        "title": doc.metadata.get("Header 1", ""),
                    }
                )

        # module_code_db recalls the code example related to the query.
        code_hits = self.module_code_db.similarity_search(query, 1)
        if code_hits:
            code = code_hits[0]
            # 'ipynb' must be present in code.metadata.
            retrieval_results.append({"content": code.metadata["ipynb"], "score": 1, "title": code.page_content})

        return retrieval_results


def load_agent():
    """Load the FAISS indexes and build the retrieval-backed QA agent.

    The indexes live next to this file in ``faiss_index`` and
    ``faiss_index_module``. When ``args.init`` is set they are (re)built
    with ``init_db`` before being loaded.

    Returns:
        A ``FunctionAgentWithRetrieval`` using ``FaissSearch`` as its
        knowledge base.

    Raises:
        RuntimeError: If loading an index raises ``RuntimeError``; the
            original error is attached as the cause.
    """
    app_dir = os.path.dirname(os.path.abspath(__file__))
    faiss_name = os.path.join(app_dir, "faiss_index")
    faiss_name_module = os.path.join(app_dir, "faiss_index_module")
    if args.init:
        init_db(faiss_name, faiss_name_module, embeddings)
    try:
        db = FAISS.load_local(faiss_name, embeddings)
        module_code_db = FAISS.load_local(faiss_name_module, embeddings)
    except RuntimeError as e:
        # Chain explicitly so the traceback shows the load failure as the cause.
        raise RuntimeError(f"Make sure you have initialized the database first.\n {e}") from e

    llm = ERNIEBot(model="ernie-3.5")
    faiss_search = FaissSearch(db=db, embeddings=embeddings, module_code_db=module_code_db)
    agent = FunctionAgentWithRetrieval(
        llm=llm,
        tools=[],
        knowledge_base=faiss_search,
        threshold=0,
        system_message=SystemMessage(
            "你是ERNIEBot Agent的小助手,用于解决用户关于EB-Agent的问题,涉及File, Memory, Message, Agent, ChatModels等模块。"
            "请你严格按照搜索到的内容回答,不要自己生成相关代码。如果询问与ERNIEBot Agent无关的问题,请直接回答“我只能回答EB—Agent相关问题”"
        ),
        top_k=2,
        token_limit=5000,
    )
    return agent


async def main(agent):
    """Send one hard-coded sample question to ``agent`` and print the reply text.

    Args:
        agent: Object whose awaitable ``run(prompt)`` returns a response
            exposing a ``text`` attribute.
    """
    # Other sample questions that can be swapped in (English gloss):
    #   "How do I create a remote tool from aistudio?"
    #   "How do I create a LocalTool?"
    question = "如何创建一个agent"
    answer = await agent.run(question)
    print(answer.text)


if __name__ == "__main__":
    # Build the agent and serve it through its Gradio demo. For a one-off
    # console query, the async ``main`` coroutine defined in this file can be
    # driven with an event loop instead.
    load_agent().launch_gradio_demo()

0 comments on commit a77a311

Please sign in to comment.