#!/usr/bin/env python
# load default python packages
import sys, logging, optparse, os, glob, zipfile, types, re, tempfile, shutil, codecs
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)

# now load our own libraries; pubConvBing lives in lib/, so it can only be
# imported after lib/ has been added to sys.path above
import pubConvBing
import pubGeneric, pubStore, pubConf, maxCommon, pubXml, maxRun
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <in> <out> - convert data from Bing to pubTools format.

If <in> and <out> are directories:
  Index the files once, when they are received:
    pubConvBing -i /hive/data/outside/pubs/bing/
  Then convert them:
    pubConvBing /hive/data/outside/pubs/bing/ /hive/data/inside/pubs/text/bing/
""")
parser.add_option("-i", "--index", dest="index", action="store_true", help="create indices for tsv files in inDir, has to be run only once")
parser.add_option("", "--chunkSize", dest="chunkSize", action="store", type="int", help="number of articles per chunk, adapt this to your cluster, default %default", default=500)
parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %s", default=pubConf.identifierStart["bing"])
parser.add_option("", "--parse", dest="parse", action="store_true", help="for debugging, just parse one single xml file", default=None)
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()
if args==[]:
    parser.print_help()
    exit(1)
pubGeneric.setupLogging(__file__, options)
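# --parse is a debugging shortcut: convert a single input file and print the
# parsed article dictionary instead of running the full pipeline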
if options.parse:
    logging.debug("Reading file")
    data = open(args[0]).read()
    logging.debug("Reading done")
    data = pubConvBing.convertMicrosoft(data)
    artDict, fileDict = pubConvBing.convertHtmlToDicts("http://www.sgi.com", data)
    for key, val in artDict.items():
        print key, val
    #print fileDict
    exit(0)
# normal operation
if len(args)==2:
    inDir, outDir = args
else:
    inDir = args[0]

inDir = abspath(inDir)
maxCommon.mustExist(inDir)
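# article IDs written to the pubStore start at minId * one billion, so the
# numerical ID ranges of different publishers never overlap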
minId = options.minId
if not os.path.isdir(inDir):
    print "first parameter must be a directory"
    sys.exit(1)
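# create a runner that submits jobs to the cluster head node; maxJob caps the
# number of concurrent conversion jobs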
runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, headNode=options.cluster)
if options.index:
    pubConvBing.createIndexJobs(runner, inDir)
    pubConvBing.rewriteIndexesFindDuplicates(inDir)
    sys.exit(0)
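# split the input into chunks of --chunkSize articles and submit one
# conversion job per chunk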
pubConvBing.createChunksSubmitJobs(inDir, outDir, minId, runner, options.chunkSize)
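# load the converted article metadata (*.articles.gz) into the "articles"
# table of a sqlite database for fast lookups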
tsvFnames = glob.glob(join(outDir,"*.articles.gz"))
dbPath = join(outDir, "articles.db")
pubStore.loadNewTsvFilesSqlite(dbPath, "articles", tsvFnames)