#!/usr/bin/env python
# load default python packages
import logging, optparse, sys, os
from os.path import *
# add <scriptDir>/lib/ to package search path
# does not work on a frozen package
if not getattr(sys, 'frozen', False):
    sys.path.insert(0, join(dirname(abspath(__file__)), "lib"))
import pubGeneric, pubConf
import pubCrawlLib as scrapeLib

def main(args, options):
    pubGeneric.setupLogging("", options)
    outDir = args[0]
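
    # --report: only write an HTML status report for the crawl directory, then exit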
    if options.report:
        scrapeLib.writeReport(outDir, options.report)
        sys.exit(0)
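
    # a delay given on the command line overrides any other delay setting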
    if options.waitTime!=None:
        logging.info("Http delay is set to %d secs from command line." % options.waitTime)
        scrapeLib.globalForceDelay = options.waitTime
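
    # propagate simple command line switches to module-level flags in pubCrawlLib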
    if options.pause:
        scrapeLib.DO_PAUSE = True
    if options.outputHash:
        scrapeLib.TEST_OUTPUT = True
    if options.skipLocalMedline:
        scrapeLib.SKIPLOCALMEDLINE = True
    if options.tryHarder:
        scrapeLib.ERRWAIT = 0
        scrapeLib.MAXCONSECERR = 500
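
    # the default user agent comes from pubConf; --fakeUseragent switches to a Firefox string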
    scrapeLib.userAgent = pubConf.httpUserAgent
    if options.fakeUseragent:
        logging.debug("Setting useragent to Mozilla")
        scrapeLib.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0'

    if options.testDoc:
        # testing a single document, do not write to disk
        scrapeLib.globalForceDelay = 0
        docId = outDir
        artMeta = scrapeLib.getArticleMeta(docId)
        paperData = scrapeLib.crawlOneDoc(artMeta, None)
        scrapeLib.printPaperData(paperData)
        sys.exit(0)
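
    # normal crawl mode: append to outDir/crawler.log, lock the directory,
    # collect document IDs and skip lists, crawl, then release the locks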
    pubGeneric.setupLogging("", options, logFileName=join(outDir, "crawler.log"), fileMode="a")

    scrapeLib.checkCreateLock(outDir)
    docIds, skipDocs, skipIssns = scrapeLib.parseDirectories(outDir)
    scrapeLib.crawlDocuments(docIds, skipDocs, skipIssns)
    scrapeLib.removeLocks()

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] outDir - crawl articles with supplemental files from the websites of hosters. Reads a file docIds.txt in outDir. If not found, checks all subdirectories of outDir for files named docIds.txt and processes their content in random order.

- progress is written to outDir/crawler.log
- the sourceDir is the directory where a docIds.txt file was found
- if it contains a file crawler.txt, only the crawler with this name is used
- the status of each PMID is written to sourceDir/pmidStatus.log.
  It is used to avoid crawling the same article twice and to keep status codes.
- the status of whole journals is written to sourceDir/issnStatus.log.
  It is used to blacklist an ISSN if too many of its articles fail.
  Documents from blacklisted ISSNs are then skipped.
- crawling stops if more than 100 consecutive errors are encountered.
- output goes into a subdirectory "files" of each sourceDir

A crawler is a class that retrieves fulltext and supplements; it has the name
of the hoster (e.g. highwire or pmc).

Examples:
- a source can be three specific journals crawled with the highwire crawler
- a source can be 1000 random PMIDs
""")
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
help="show more debug messages")
parser.add_option("-t", "--waitTime", dest="waitTime", action="store",
type="int", help="number of seconds to wait between http requests, overrides"
"all other default delay settings")
parser.add_option("", "--test", dest="testDoc", action="store_true", \
help="test mode, outDir is not a directory but a PMID."
"Output is just summary information of files downloaded, not written to disk.")
parser.add_option("-p", "--pause", dest="pause", action="store_true", help="wait for keypress after each download")
parser.add_option("", "--hash", dest="outputHash", action="store_true", help="don't write files to disk, just output their SHA1 values. Used for automated testing.")
parser.add_option("", "--noLocalMedline", dest="skipLocalMedline", action="store_true", help="do not use the local copy of Medline, always query Eutils through the internet")
parser.add_option("-e", "--sendEmail", dest="sendEmail", action="store_true", help="send an error email to address specified in pubConf when program crashes")
parser.add_option("-u", "--fakeUseragent", dest="fakeUseragent", action="store_true", help="by default, the crawler accounces itself to the hoster's webserver as 'genomeBot/0.1'. This parameter changes the behaviour and the crawler will present itself as Firefox. Use this with caution and only if the publisher/hoster accepts it.")
#parser.add_option("", "--preferPmc", dest="preferPmc", action="store_true", help="prefer PMC, if fulltext is available from two sources")
parser.add_option("", "--tryHarder", dest="tryHarder", action="store_true", help="increase max error count to 500 and do not wait between errors")
parser.add_option("", "--report", dest="report", action="store", help="Do not crawl but given the base crawl directory, write a status report in html format to the specified first output filename and quit. ")
(options, args) = parser.parse_args()

if args==[]:
    parser.print_help()
    sys.exit(1)

main(args, options)