-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_dump.py
39 lines (30 loc) · 1.19 KB
/
import_dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from tqdm import tqdm
import models
import postgres
import chunker
import pyarrow.parquet as pq
# Import the GoodWiki dump into Postgres: one `pages` row per article and one
# `chunks` row (raw chunk text plus its embedding) per chunk of that article.

# Initialise the postgres module with the embedding length reported by the model.
postgres.init(embeddingLength=models.embedding_length())

# Bind the connection once so the cursor, the commit and the close are
# guaranteed to act on the same connection object.
# NOTE(review): the original called postgres.get_connection() separately for
# each of these, which presumably returns a shared connection -- confirm.
conn = postgres.get_connection()
try:
    with conn.cursor() as cur:
        # https://huggingface.co/datasets/euirim/goodwiki
        df = pq.read_table('./goodwiki.parquet').to_pandas()

        for _, row in tqdm(df.iterrows(), total=len(df)):
            title = row['title']
            desc = row['description']
            text = row['markdown']

            cur.execute(
                "INSERT INTO pages (title, description, text) VALUES (%s, %s, %s) RETURNING id;",
                (title, desc, text),
            )
            # The INSERT returns exactly one row holding the new page id.
            page_id = cur.fetchone()[0]

            # Title and description (empty string when missing) are prepended
            # to every chunk before it is embedded.
            header = title + "\n" + (desc or "") + "\n"
            for c in chunker.chunk(text, chunkSize=256, overlap=32):
                # The embedding is computed over header + chunk, but only the
                # bare chunk text is stored alongside it.
                embedding = models.embedding_string(
                    header + c, models.EmbeddingPrefix.DOCUMENT
                )
                cur.execute(
                    "INSERT INTO chunks (text, embedding, page_id) VALUES (%s, %s, %s);",
                    (c, embedding, page_id),
                )

        # Commit the transaction once every page and chunk has been inserted.
        conn.commit()
finally:
    # Always release the connection, including when the import fails midway;
    # the original only closed it on the success path.
    conn.close()