-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_utils.py
101 lines (81 loc) · 3.44 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json
def load_hits_from_rank_results_queries_corpus(
    rank_results_file,
    queries_file,
    corpus_file=None,
    qrels_filter_path=None,
    n_hits_per_query=None,
    qid_base=10,
):
    """Build per-query ranked hit lists from a tab-separated rank-results file.

    Args:
        rank_results_file: Path to a TSV file whose rows are
            ``qid<TAB>docid<TAB>score``. Rows starting with ``query-id``
            (a header) and blank rows are skipped.
        queries_file: Path passed to ``load_qids_to_queries`` to obtain the
            qid -> query text mapping.
        corpus_file: Optional path passed to ``load_pids_to_passages``. When
            ``None`` the ``'content'`` of every hit is ``None``.
        qrels_filter_path: Optional path passed to
            ``load_qid_to_pid_to_score``; only qids present in that mapping
            are kept. An empty mapping is treated as "no filter".
        n_hits_per_query: If not ``None``, keep only this many top-scoring
            hits per query.
        qid_base: Numeric base used when converting the prefix-stripped qid
            to an integer for ordering the queries.

    Returns:
        A list of ``{'query': <text>, 'hits': [<hit>, ...]}`` dicts, ordered
        by numeric qid. Hits are ordered by descending score and each hit is
        ``{'qid', 'docid', 'score', 'content'}``.

    Raises:
        KeyError: If a qid in the rank results is missing from the queries,
            or a docid is missing from the loaded corpus.
        ValueError: If a non-blank row does not have exactly three
            tab-separated fields, a score is not a float, or a stripped qid
            is not an integer in ``qid_base``.
    """
    print(f"Loading qids from '{queries_file}'")
    queries = load_qids_to_queries(queries_file)
    qid_filter = load_qid_to_pid_to_score(qrels_filter_path) if qrels_filter_path is not None else None

    # Only announce (and perform) the corpus load when a corpus was requested.
    corpus = None
    if corpus_file is not None:
        print(f"Loading corpus from '{corpus_file}'")
        corpus = load_pids_to_passages(corpus_file)

    # Group the ranked hits by query id.
    results = {}
    with open(rank_results_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip the header row if present.
            if line.startswith("query-id"):
                continue
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            qid, docid, score = line.split('\t')
            score = float(score)
            if qid_filter and qid not in qid_filter:
                continue
            # Initialize query entry if not already present.
            if qid not in results:
                results[qid] = {'query': queries[qid], 'hits': []}
            results[qid]['hits'].append({
                'qid': qid,
                'docid': docid,
                'score': score,
                'content': corpus[docid] if corpus is not None else None,
            })

    # Order the queries by numeric qid and each query's hits by score.
    rank_results = []
    for qid in sorted(results, key=lambda x: int(strip_prefixes(x), qid_base)):
        # reverse=True keeps the sort stable, so ties retain file order.
        sorted_hits = sorted(
            results[qid]['hits'],
            key=lambda hit: hit['score'],
            reverse=True,
        )
        if n_hits_per_query is not None:
            sorted_hits = sorted_hits[:n_hits_per_query]
        rank_results.append({
            'query': results[qid]['query'],
            'hits': sorted_hits,
        })
    return rank_results
def load_qids_to_queries(queries_file):
    """Load a JSON-lines queries file into a ``{qid: query_text}`` dict.

    Each non-blank line must be a JSON object with ``"_id"`` and ``"text"``
    keys. Blank lines are skipped. The file is read as UTF-8.

    Args:
        queries_file: Path to the JSON-lines queries file.

    Returns:
        Dict mapping each ``"_id"`` to its ``"text"``. If an id occurs more
        than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"_id"`` or ``"text"``.
    """
    queries = {}
    with open(queries_file, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. trailing newline) is not a JSON document.
            if not line.strip():
                continue
            record = json.loads(line)
            queries[record["_id"]] = record["text"]
    return queries
def load_pids_to_passages(corpus_file, append_title=True):
    """Load a JSON-lines corpus file into a ``{pid: passage}`` dict.

    Each non-blank line must be a JSON object with ``"_id"`` and ``"text"``
    keys and an optional ``"title"``. Blank lines are skipped. The file is
    read as UTF-8.

    Args:
        corpus_file: Path to the JSON-lines corpus file.
        append_title: When true and the record has a non-empty,
            non-whitespace title, the passage is ``title + "\\n" + text``;
            otherwise the passage is just the text.

    Returns:
        Dict mapping each ``"_id"`` to its passage string.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"_id"`` or ``"text"``.
    """
    corpus = {}
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. trailing newline) is not a JSON document.
            if not line.strip():
                continue
            data = json.loads(line)
            passage = data["text"]
            if append_title:
                title = data.get("title", "")
                # Only prepend titles that carry visible content.
                if title and title.strip():
                    passage = title + "\n" + passage
            corpus[data["_id"]] = passage
    return corpus
def load_qid_to_pid_to_score(rank_results_file, is_qrels=False):
    """Load a tab-separated score file into ``{qid: {pid: score}}``.

    Rows are ``qid<TAB>pid<TAB>score``. Rows starting with ``query-id``
    (a header) and blank rows are skipped. The file is read as UTF-8.

    Args:
        rank_results_file: Path to the TSV file.
        is_qrels: When true, scores are converted with ``int(float(score))``;
            otherwise they are kept as floats.

    Returns:
        Nested dict mapping qid -> pid -> score. If a (qid, pid) pair occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank row does not have exactly three
            tab-separated fields or its score is not numeric.
    """
    qid_to_pid_to_score = {}
    with open(rank_results_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith("query-id"):
                continue
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            qid, pid, score = line.split('\t')
            score = int(float(score)) if is_qrels else float(score)
            qid_to_pid_to_score.setdefault(qid, {})[pid] = score
    return qid_to_pid_to_score
def strip_prefixes(id):
    """Return *id* with the split/kind tokens removed.

    The substrings "query", "doc", "test", "train" and "dev" are deleted, in
    that order, wherever they occur in the string (not only at the start).
    """
    stripped = id
    for token in ("query", "doc", "test", "train", "dev"):
        stripped = stripped.replace(token, "")
    return stripped