predict.py
# -*- coding: utf-8 -*-
"""
@Version: 0.1
@Author: Charles
@Time: 2022/11/3 21:11
@File: predict.py
@Desc: Inference script for the sentence-pair similarity model; writes one prediction TSV per test dataset.
"""

import os
import time  # kept for the optional timestamped save_dir below

import pandas as pd
import torch
import transformers
from tqdm import tqdm
from transformers import BertTokenizer

from models import build_model

# Silence transformers' "weights not used when initializing" warnings.
transformers.logging.set_verbosity_error()


class Predict:
    """Wraps a trained sentence-pair similarity model and its tokenizer for inference."""

    def __init__(self, model_name, model_dir, best_name):
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.model = build_model(model_name, model_dir, best_name)
        self.model.to(self.device)
        self.model.eval()
        self.max_length = 100

    def __call__(self, texta, textb):
        # Encode each sentence on its own and the two sentences as a single pair;
        # the model consumes all three encodings.
        encoding1 = self.tokenizer(texta, add_special_tokens=True, max_length=self.max_length,
                                   padding='max_length', truncation=True, return_tensors='pt')
        encoding2 = self.tokenizer(textb, add_special_tokens=True, max_length=self.max_length,
                                   padding='max_length', truncation=True, return_tensors='pt')
        encoding = self.tokenizer(texta, textb, add_special_tokens=True, max_length=self.max_length * 2,
                                  padding='max_length', truncation=True, return_tensors='pt')
        encoding1 = {k: v.to(self.device) for k, v in encoding1.items()}
        encoding2 = {k: v.to(self.device) for k, v in encoding2.items()}
        encoding = {k: v.to(self.device) for k, v in encoding.items()}
        with torch.no_grad():
            outputs = self.model([encoding1, encoding2, encoding])
        # Predicted class index per example (batch size 1 here).
        predict = torch.max(outputs, 1)[1].cpu().numpy().tolist()
        return predict


def test(file_path, model, dataset, save_dir):
    """Predict every tab-separated sentence pair in file_path and save the results as a TSV."""
    os.makedirs(save_dir, exist_ok=True)
    pred = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in tqdm(lines):
        texta, textb = line.strip().split('\t')
        output = model(texta, textb)
        pred.append(output[0])
    df = pd.DataFrame(pred)
    df.to_csv(os.path.join(save_dir, f'{dataset}.tsv'), sep='\t',
              index_label='index', header=['prediction'])


if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    model_path = './output/SBert-20221105-1555/checkpoint'
    model_name = 'SBert'
    model = Predict(model_name, model_path, best_name='best.ckpt')
    test_file = [
        '/data/wuzhichao/homework/text_sim/data/bq_corpus/test.tsv',
        '/data/wuzhichao/homework/text_sim/data/lcqmc/test.tsv',
        '/data/wuzhichao/homework/text_sim/data/paws-x/test.tsv'
    ]
    # save_dir = f'./output/res_{time.strftime("%Y%m%d_%H")}'
    save_dir = './output/res_20221105_13'
    for file in test_file:
        # Use the parent directory name (e.g. 'lcqmc') as the dataset name.
        dataset = os.path.split(os.path.split(file)[0])[-1]
        print(dataset)
        test(file, model, dataset, save_dir)
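

# Usage sketch: besides the batch loop above, a Predict object can be called
# directly on a single ad-hoc pair. This is a minimal illustration, not part of
# the original pipeline; the checkpoint path mirrors the one above, and the
# label convention (1 = similar, 0 = dissimilar) is an assumption about the
# training labels rather than something this script defines.
#
#     predictor = Predict('SBert', './output/SBert-20221105-1555/checkpoint', best_name='best.ckpt')
#     label = predictor('今天天气怎么样', '今天天气如何')[0]  # -> 0 or 1
#     print(label)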