#!/usr/bin/env python
# load default python packages
import sys, logging, optparse, os, glob, zipfile, types, re, tempfile, shutil, codecs
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)

# now load our own libraries; pubConvBing lives in lib/, so it can only be
# imported after lib/ has been added to sys.path above
import pubConvBing
import pubGeneric, pubStore, pubConf, maxCommon, pubXml, maxRun
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <in> <out> - convert data from Bing to pubTools format.

If <in> and <out> are directories:
  Index the files once, when they are received:
    pubConvBing -i /hive/data/outside/pubs/bing/
  Then convert them:
    pubConvBing /hive/data/outside/pubs/bing/ /hive/data/inside/pubs/text/bing/
""")
parser.add_option("-i", "--index", dest="index", action="store_true", help="create indices for tsv files in inDir, has to be run only once")
parser.add_option("", "--chunkSize", dest="chunkSize", action="store", type="int", help="number of articles per chunk, adapt this to your cluster, default %default", default=500)
parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %s", default=pubConf.identifierStart["bing"])
parser.add_option("", "--parse", dest="parse", action="store_true", help="for debugging, just parse one single xml file", default=None)
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()
if args==[]:
    parser.print_help()
    exit(1)
pubGeneric.setupLogging(__file__, options)
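# --parse is a debugging shortcut: convert a single input file and print the
# parsed article dictionary instead of running the full pipeline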
if options.parse:
    logging.debug("Reading file")
    data = open(args[0]).read()
    logging.debug("Reading done")
    data = pubConvBing.convertMicrosoft(data)
    artDict, fileDict = pubConvBing.convertHtmlToDicts("http://www.sgi.com", data)
    for key, val in artDict.items():
        print key, val
    #print fileDict
    exit(0)
# normal operation
if len(args)==2:
    inDir, outDir = args
else:
    inDir = args[0]

inDir = abspath(inDir)
maxCommon.mustExist(inDir)
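# article IDs written to the pubStore start at minId * one billion, so the
# numerical ID ranges of different publishers never overlap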
minId = options.minId
if not os.path.isdir(inDir):
    print "first parameter must be a directory"
    sys.exit(1)
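# create a runner that submits jobs to the cluster head node; maxJob caps the
# number of concurrent conversion jobs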
runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, headNode=options.cluster)
if options.index:
    pubConvBing.createIndexJobs(runner, inDir)
    pubConvBing.rewriteIndexesFindDuplicates(inDir)
    sys.exit(0)
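# split the input into chunks of --chunkSize articles and submit one
# conversion job per chunk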
pubConvBing.createChunksSubmitJobs(inDir, outDir, minId, runner, options.chunkSize)
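# load the converted article metadata (*.articles.gz) into the "articles"
# table of a sqlite database for fast lookups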
tsvFnames = glob.glob(join(outDir,"*.articles.gz"))
dbPath = join(outDir, "articles.db")
pubStore.loadNewTsvFilesSqlite(dbPath, "articles", tsvFnames)