-
Notifications
You must be signed in to change notification settings - Fork 0
/
etext.py
140 lines (121 loc) · 3.93 KB
/
etext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
from google.appengine.ext.webapp import template
import cgi
import sys
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.ext import db
import re, codecs, urllib, logging
import Book
class BulkDisplay(webapp.RequestHandler):
    """Admin handler that writes a short summary of up to ten Book rows."""

    def get(self):
        """Run a GQL query limited to ten Book rows and write each one out.

        For every row the title and the author's first and last name are
        concatenated into one message and written to the response body.
        """
        # NOTE(review): BulkLoad in this file only assigns author_full_name;
        # confirm that author_first_name / author_last_name are populated.
        out = self.response.out
        for entry in db.GqlQuery("SELECT * FROM Book LIMIT 10"):
            title_part = "Book found. Title: " + entry.title
            author_part = entry.author_first_name + " " + entry.author_last_name
            out.write(title_part + ". Author: " + author_part)
class BulkLoad(webapp.RequestHandler):
    """Admin handler that imports books from the bundled Gutenberg RDF catalog.

    The file ``catalog-1.rdf`` (looked up next to this module) is scanned line
    by line with a small state machine: first a ``pgterms:etext`` element,
    then a title line, then a "friendly" title line. Each completed sequence
    is stored as one ``Book.Book`` object via its ``put()`` method.
    """

    # Loading stops once the record counter (which starts at 1) reaches this
    # value, so at most RECORD_LIMIT - 1 books are stored per request.
    RECORD_LIMIT = 24000

    # Scanner states: which element the next matching line has to supply.
    _WANT_ETEXT = 0
    _WANT_TITLE = 1
    _WANT_FRIENDLY = 2

    # Compiled once at class creation instead of on every request.
    _ETEXT_RE = re.compile(r"<pgterms:etext.*\"[a-zA-Z]+(?P<etext_number>\d+)\"")
    _TITLE_RE = re.compile(r"title.*>(?P<title>.*)<")
    _FRIENDLY_RE = re.compile(r"friendly.*>(?P<friendly>.*)<")

    def get(self):
        """Parse the catalog file and store one Book per catalog entry.

        Writes "Hello!" followed by a progress note for every 100th stored
        record to the plain-text response. The catalog file is always closed,
        even when parsing or storing fails.
        """
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.out.write("Hello!")
        logging.debug("Here")

        path = os.path.join(os.path.split(__file__)[0], 'catalog-1.rdf')
        # codecs.open() already yields decoded text, so the lines must not be
        # decoded a second time.
        catalog = codecs.open(path, "r", encoding="UTF-8")
        try:
            self._import_catalog(catalog)
        finally:
            catalog.close()

    def _import_catalog(self, catalog):
        """Scan the open catalog stream and store every complete entry."""
        state = self._WANT_ETEXT
        etext = ""
        title = ""
        record = 1
        while record < self.RECORD_LIMIT:
            try:
                line = catalog.readline()
            except (IOError, UnicodeError):
                # A read/decode failure ends the import, as it did before,
                # but it is now logged instead of being silently dropped.
                logging.exception(
                    "Stopped reading catalog while loading record %d", record)
                break
            if not line:
                # readline() returns an empty string (never None) at EOF.
                break

            if state == self._WANT_ETEXT:
                match = self._ETEXT_RE.search(line)
                if match:
                    etext = match.group('etext_number')
                    state = self._WANT_TITLE
            elif state == self._WANT_TITLE:
                match = self._TITLE_RE.search(line)
                if match:
                    title = match.group('title')
                    state = self._WANT_FRIENDLY
            else:
                match = self._FRIENDLY_RE.search(line)
                if match:
                    self._store_book(etext, title, match.group('friendly'))
                    if record % 100 == 0:
                        self.response.out.write(
                            "Wrote record number " + str(record))
                    record += 1
                    state = self._WANT_ETEXT

    @staticmethod
    def _author_from_friendly(friendly):
        """Return the text after the last standalone " by " in *friendly*.

        Splitting on the whole word keeps names that merely contain the
        letters "by" (e.g. "Kirby") intact. When there is no " by " at all,
        the whole stripped string is returned, matching the earlier fallback.
        """
        return friendly.rsplit(" by ", 1)[-1].strip()

    def _store_book(self, etext, title, friendly):
        """Create a Book from one parsed catalog entry and call put() on it."""
        new_book = Book.Book()
        new_book.title = title
        new_book.etext_number = etext
        new_book.etext_url = (
            "http://www.gutenberg.org/cache/epub/" + etext + "/pg" + etext + ".rdf")
        # The author is derived from the friendly title; the catalog's
        # creator element is not consulted.
        new_book.author_full_name = self._author_from_friendly(friendly)
        new_book.put()
# WSGI application object: maps the two admin URLs to their handlers.
application = webapp.WSGIApplication(
    [
        ('/admin/bulk-load', BulkLoad),
        ('/admin/bulk-display', BulkDisplay),
    ],
    debug=True
)
def main():
    """Hand the module-level WSGI application to run_wsgi_app."""
    run_wsgi_app(application)


# Only start serving when this file is executed as a script.
if __name__ == '__main__':
    main()