-
Notifications
You must be signed in to change notification settings - Fork 1
/
Jina_API.py
70 lines (59 loc) · 2.74 KB
/
Jina_API.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from jina import Deployment, Executor, requests
from typing import List, Optional
import numpy as np
import pandas as pd
from docarray import BaseDoc, DocList
from dotenv import load_dotenv
from Embedder import Embedder
from similarity_search_10k import find_kNN
from docarray.typing.tensor.embedding.embedding import AnyEmbedding
import subprocess
import re
# Load environment variables from the ".env" file one directory up. The path is
# relative, so it depends on the directory the process is started from.
# NOTE(review): presumably this provides the credentials used by Embedder — confirm.
load_dotenv("../.env")
# Port passed to the Deployment created in the __main__ block.
port = 1192
class TestDoc(BaseDoc):
    """Request/response document exchanged with the RAG_API endpoints.

    Clients send ``text``; the endpoints fill in ``embedding`` or
    ``contents``/``relatedness`` and return the same document.
    """

    # Text to embed, or the search query. Optional because the default is None.
    text: Optional[str] = None
    # Embedding vector written by the embedding endpoint. Defaults to None so
    # that documents sent to the search endpoints do not have to carry one.
    # NOTE(review): a previously commented-out default hinted at 1024
    # dimensions — confirm against the embedder before relying on a size.
    embedding: Optional[AnyEmbedding] = None
    # Texts of the search hits for ``text``.
    contents: List[str] = []
    # Relatedness scores reported for the search hits.
    relatedness: List[float] = []
class RAG_API(Executor):
    """Jina executor exposing one embedding endpoint and two search endpoints.

    Endpoints:
        /jina/embedding: store an embedding on every document that has text.
        /jina/_search:   kNN search over the embeddings stored in a CSV file.
        /jina/search:    search by running the ``faiss_search.py`` script.
    """

    @requests(on='/jina/embedding')
    def jina_embedding(self, docs: DocList[TestDoc], **kwargs) -> DocList[TestDoc]:
        """Embed the text of each document and store it in ``doc.embedding``.

        Documents whose ``text`` is empty or None are not sent to the embedder
        and keep their current ``embedding``.

        Args:
            docs: Documents to embed.
            **kwargs: Extra request arguments supplied by Jina; unused.

        Returns:
            The same ``docs``, modified in place.
        """
        # Pair each embedding with the document it was computed for. Zipping
        # against the unfiltered ``docs`` would shift every embedding after a
        # text-less document onto the wrong document.
        docs_with_text = [doc for doc in docs if doc.text]
        if not docs_with_text:
            return docs

        embedder = Embedder(use_api="jina")
        embeddings = embedder.get_embedding([doc.text for doc in docs_with_text])
        for doc, embedding_data in zip(docs_with_text, embeddings):
            doc.embedding = np.array(embedding_data, dtype="f")
        return docs

    # Brute-force variant: per the original note, this calculates the cosine
    # similarity between the query and all the embeddings, so it is slow.
    @requests(on='/jina/_search')
    def jina__search(self, docs: DocList[TestDoc], **kwargs):
        """Search the CSV-backed story embeddings for each document's text.

        Fills ``doc.contents`` and ``doc.relatedness`` from ``find_kNN`` with
        ``top_n=5``. Documents without text are skipped and left unchanged.

        Args:
            docs: Query documents; ``doc.text`` is the query.
            **kwargs: Extra request arguments supplied by Jina; unused.

        Returns:
            The same ``docs``, modified in place.
        """
        if not docs:
            return docs

        embedder = Embedder(use_api="jina")
        # Load and parse the embedding table once per request rather than once
        # per document; it does not depend on the query.
        # NOTE(review): assumes find_kNN does not mutate ``df`` — confirm.
        df = pd.read_csv("stories/stories_cn_oesz_ebd_Jina.csv")
        # SECURITY NOTE: ``eval`` executes whatever is stored in the CSV cells.
        # Only safe while the CSV is a trusted, locally generated file;
        # consider ``ast.literal_eval`` if the cells are plain list literals.
        df['embedding'] = df.embedding.apply(eval).apply(np.array, dtype="f")

        for doc in docs:
            query = doc.text
            if not query:
                continue
            strings, relatednesses = find_kNN(query, df, embedder, top_n=5)
            # NOTE(review): the two filters below run independently, so a falsy
            # score (e.g. 0.0) or an empty hit would leave ``contents`` and
            # ``relatedness`` with different lengths — confirm this cannot occur.
            doc.contents = [string[0] for string in strings if string]
            doc.relatedness = [relatedness for relatedness in relatednesses if relatedness]
        return docs

    @requests(on='/jina/search')
    def jina_search(self, docs: DocList[TestDoc], **kwargs):
        """Search by running ``faiss_search.py`` once per document.

        The script's stdout is split on the ``>>>>>Result`` marker; each piece
        becomes one entry of ``doc.contents``, and the value following
        ``>>>>>Relatedness`` and ``": "`` in that piece becomes the matching
        entry of ``doc.relatedness``. Documents without text are skipped and
        left unchanged.

        Args:
            docs: Query documents; ``doc.text`` is the query.
            **kwargs: Extra request arguments supplied by Jina; unused.

        Returns:
            The same ``docs``, modified in place.

        Raises:
            IndexError, ValueError: If a result piece does not contain a
                parsable relatedness value.
        """
        for doc in docs:
            query = doc.text
            if not query:
                # subprocess.run rejects None arguments with a TypeError.
                continue

            # Arguments are passed as a list (no shell), so the query is not
            # interpreted by a shell.
            # NOTE(review): "python3" is resolved via PATH and may differ from
            # the interpreter running this service — confirm this is intended.
            result = subprocess.run(
                [
                    "python3", "faiss_search.py",
                    "-q", query,
                    "--db", "stories_cn_Jina.db",
                    "--index", "stories_cn_Jina.index",
                    "--top", "5",
                    "--use_api", "jina",
                    "--env", ".env",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                # Surface script failures instead of silently returning no hits.
                self.logger.warning(
                    f"faiss_search.py exited with code {result.returncode}: "
                    f"{result.stderr.strip()}"
                )

            # Drop newlines, then discard whatever precedes the first marker.
            contents = result.stdout.replace("\n", "").split(">>>>>Result")[1:]
            relatedness = [
                float(content.split(">>>>>Relatedness")[1].split(": ")[1])
                for content in contents
            ]
            doc.contents = contents
            doc.relatedness = relatedness
        return docs
if __name__ == '__main__':
    # Start a local deployment of RAG_API on the module-level port and block
    # on it while it serves requests.
    with Deployment(
        uses=RAG_API,
        name='embedding_executor',
        host='localhost',
        port=port,
    ) as running_deployment:
        running_deployment.block()