-
Notifications
You must be signed in to change notification settings - Fork 20
/
share_gpt_preprocess.py
37 lines (31 loc) · 1.32 KB
/
share_gpt_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import json
from langdetect import detect
from bs4 import BeautifulSoup
def extract(html_string):
    """Return the text content of an HTML string.

    The string is parsed by BeautifulSoup using the ``lxml`` parser, and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(html_string, 'lxml').get_text()
# Turn the first two turns of every ShareGPT conversation into an
# instruction/input/output record, keeping only pairs detected as English.
revised_list = []  # records whose output does not end with '?'
check = []         # raw conversations whose output ends with '?'

for part_path in ("outputs/sg_90k_part1.json", "outputs/sg_90k_part2.json"):
    # Read inside a context manager so the input handle is closed promptly;
    # JSON is read as UTF-8 rather than the locale default.
    with open(part_path, encoding="utf-8") as part_file:
        llama_json = json.load(part_file)

    for record in llama_json:
        try:
            conversations = record["conversations"]
            task = {
                "instruction": extract(conversations[0]["value"]),
                "input": "",
                "output": extract(conversations[1]["value"]),
            }
            if detect(task["instruction"]) == 'en' and detect(task["output"]) == 'en':
                if task["output"].endswith('?'):
                    # Output that ends with a question is set aside.
                    check.append(conversations)
                else:
                    revised_list.append(task)
        except Exception:
            # Best effort: report the record that could not be processed
            # (missing turns or keys, language detection failure) and go on.
            # Catching Exception instead of using a bare except lets Ctrl-C
            # still interrupt the run. The fallback keeps the report itself
            # from raising when "conversations" is the missing key.
            if isinstance(record, dict):
                print(record.get("conversations", record))
            else:
                print(record)

# NOTE(review): only `check` is written, to a file named
# "sharegpt_90k_processed.json" under ../outputs/sharegpt/, while the inputs
# are read from outputs/ and `revised_list` is never saved. Behaviour is kept
# as found; confirm which list and which path are intended.
with open("../outputs/sharegpt/sharegpt_90k_processed.json", 'w+') as fd:
    json.dump(check, fd, indent=4)