-
Notifications
You must be signed in to change notification settings - Fork 0
/
index
executable file
·54 lines (49 loc) · 1.55 KB
/
index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
# Author: Ryan Clancy and Jimmy Lin
# Copied from https://github.com/osirrc/anserini-docker
import argparse
import json
import os
import subprocess
# Command-line interface: a single required --json option whose value is
# decoded from a JSON string by json.loads before being stored on `args`.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--json",
    required=True,
    type=json.loads,
    help="the args",
)

# parse_known_args tolerates unrecognized arguments; they are collected in
# `unknown` instead of causing an error.
args, unknown = parser.parse_known_args()
# Indexing settings for each supported collection, keyed by collection name.
# "collection" and "generator" are the values handed to the -collection and
# -generator options of IndexCollection further down in this script.
index_options = {
    key: {"collection": collection_class, "generator": generator_class}
    for key, collection_class, generator_class in (
        ("core17", "NewYorkTimesCollection", "JsoupGenerator"),
        ("core18", "WashingtonPostCollection", "WapoGenerator"),
        ("cw09b", "ClueWeb09Collection", "JsoupGenerator"),
        ("cw12b", "ClueWeb12Collection", "JsoupGenerator"),
        ("gov2", "TrecwebCollection", "JsoupGenerator"),
        ("robust04", "TrecCollection", "JsoupGenerator"),
    )
}
# Run Anserini's IndexCollection once per requested collection.
# Each entry of args.json["collections"] must provide a "name" (a key of
# index_options, also used as the -index output location) and a "path"
# (passed as the -input location).
for entry in args.json["collections"]:
    name, path = entry["name"], entry["path"]
    # Raises KeyError when the name has no entry in index_options.
    options = index_options[name]
    # The command is passed as an explicit argument list instead of a
    # formatted string that is split on whitespace, so a name or path that
    # contains spaces stays a single argument.
    # NOTE(review): the exit status of IndexCollection is not checked, so a
    # failed run does not stop the script -- confirm this is intended.
    subprocess.run([
        "Anserini/target/appassembler/bin/IndexCollection",
        "-collection", options["collection"],
        "-generator", options["generator"],
        "-threads", str(os.cpu_count()),
        "-index", name,
        "-input", path,
        "-storePositions",
        "-storeDocvectors",
        "-storeRawDocs",
    ])