-
Notifications
You must be signed in to change notification settings - Fork 0
/
index_urls.py
68 lines (54 loc) · 1.69 KB
/
index_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from db_models.mongo_setup import global_init
from db_models.models.web_model import Web
from bert_serving.client import BertClient
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import globals
import json
import os
# Module-level setup: runs once at import time, before any function below is called.

# Project helper from db_models.mongo_setup; presumably opens the MongoDB
# connection that the Web model relies on -- confirm against its definition.
global_init()
# Shared bert-as-service client used by bulk_predict(); output_fmt='list'
# requests embeddings as plain Python lists rather than numpy arrays.
bc = BertClient(output_fmt='list')
# Shared Elasticsearch client; the host comes from the project's `globals` module.
client = Elasticsearch(globals.ELASTIC_SEARCH_HOST)
def getVal(db_obj, key: str, error_res=""):
    """Return ``db_obj[key]``, falling back to ``error_res``.

    Args:
        db_obj: Any object supporting ``obj[key]`` lookup (a dict, or a
            database document that implements item access).
        key: Name of the field to read.
        error_res: Value returned when the key is missing or the stored
            value is ``None``. Defaults to the empty string.

    Returns:
        The stored value, or ``error_res`` if the lookup raises ``KeyError``
        or yields ``None``.
    """
    # Only the lookup itself sits inside the try, so an unrelated KeyError
    # can never be mistaken for a missing field.
    try:
        val = db_obj[key]
    except KeyError:
        return error_res
    return error_res if val is None else val
def create_document(doc, emb):
    """Build the body that gets indexed for a single record.

    Args:
        doc: Mapping that provides the ``'text'``, ``'doc_id'`` and ``'url'``
            entries; a missing entry raises ``KeyError``.
        emb: Embedding stored under ``'text_vector'`` as-is.

    Returns:
        A new dict with the keys ``text``, ``doc_id``, ``url``, ``file_name``
        (always the empty string here) and ``text_vector``.
    """
    print("IN CREATE DOCUMENT")
    # Copy the pass-through fields first so the key order stays
    # text, doc_id, url, file_name, text_vector.
    body = {field: doc[field] for field in ('text', 'doc_id', 'url')}
    body['file_name'] = ''
    body['text_vector'] = emb
    return body
def bulk_predict(docs, batch_size=256):
    """Lazily yield one embedding per document, encoding in batches.

    Args:
        docs: Sliceable sequence of mappings, each with a ``'text'`` entry.
        batch_size: Number of texts sent to the encoder per request.

    Yields:
        The embeddings returned by the module-level BertClient ``bc``,
        in the same order as ``docs``.
    """
    total = len(docs)
    for start in range(0, total, batch_size):
        # One encoder round-trip per slice of at most batch_size documents.
        texts = [entry['text'] for entry in docs[start:start + batch_size]]
        yield from bc.encode(texts)
def process_url_doc(id):
    """Embed one stored web record and index it into the 'semantic' index.

    Fetches the ``Web`` record with the given id, builds a document from its
    ``text`` and ``url`` fields, obtains the embedding through
    ``bulk_predict`` and sends the result to Elasticsearch.

    Args:
        id: Identifier passed to ``Web.objects.get`` and stored as ``doc_id``.
            The name shadows the builtin ``id`` but is kept so that existing
            keyword callers continue to work.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        Whatever ``Web.objects.get`` raises for an unknown id (presumably the
        model's DoesNotExist -- confirm), plus any encoder or Elasticsearch
        client errors.
    """
    # The module-level ``bc`` and ``client`` are used on purpose:
    #  * bulk_predict() always encodes through the module-level ``bc``, so a
    #    BertClient created here was never used and only leaked a connection
    #    on every call;
    #  * the module-level ``client`` honours globals.ELASTIC_SEARCH_HOST
    #    instead of a hard-coded 'localhost:9200'.
    db_obj = Web.objects.get(id=id)
    document = {}
    document["doc_id"] = id
    # NOTE(review): getVal falls back to "" when the record has no text;
    # BertClient.encode reportedly rejects empty strings -- confirm whether
    # such records should be skipped instead of failing in the encoder.
    document["text"] = getVal(db_obj, "text")
    print("THIS IS THE ID ", id)
    document["url"] = getVal(db_obj, "url")
    to_index = [document]
    for doc, emb in zip(to_index, bulk_predict(to_index)):
        d = create_document(doc, emb)
        print("SEND TO INDEX", d)
        res = client.index(index='semantic', body=d)
        print(res)
    print("DONE INDEXING SUCCESS")