forked from cmchurch/nltk_french
-
Notifications
You must be signed in to change notification settings - Fork 0
/
newspaper-le-temps_nltk.py
30 lines (26 loc) · 1.65 KB
/
newspaper-le-temps_nltk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#This Python file goes through the XML (XHTML, saved as .html) documents and writes out the text data for a particular element name
#Christopher M. Church
#PhD Candidate, UC Berkeley, History
#Social Science D-Lab, UC Berkeley
#USE REGEX INSTEAD OF XML PARSE; lower overhead; also, the files themselves had some problems in the XML and were not properly formed, which resulted in a parse error exception; regex is agnostic to this problem
import os
import re
# --- configuration ---------------------------------------------------------
path = "C:\\Users\\Christopher\\Desktop\\1890\\edit"  # root directory that is searched for input files
elem_name = "span"  # name of the DOM element whose text is extracted
output_file = "\\all-text.txt"  # appended to `path` to form the output file name

# Three groups: (opening tag whose class starts with "PAG") (VALUE) (closing tag).
# The pattern is compiled once, as a *bytes* pattern: the files are read and
# written as raw bytes so their content passes through without any decoding
# step, whatever encoding the source documents use.
# Note that "." does not match a newline, so an element whose text spans
# several lines is not captured.
PAGE_ELEMENT_RE = re.compile(
    (
        '(<%s class="PAG.*?>)(.*?)(</%s>)'
        % (re.escape(elem_name), re.escape(elem_name))
    ).encode("ascii")
)


def find_html_files(root_dir):
    """Return the full path of every ``.html`` file at or below *root_dir*.

    Args:
        root_dir: Directory to walk recursively.

    Returns:
        A list of paths, in the order ``os.walk`` reports them. The list is
        empty when *root_dir* does not exist or holds no ``.html`` file.
    """
    found = []
    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            if name.endswith('.html'):
                # Keep the folder the file was found in: os.walk descends into
                # subdirectories, so the bare name alone cannot be reopened.
                found.append(os.path.join(folder, name))
    return found


def extract_page_text(raw):
    """Return the text enclosed by each matching element in *raw*.

    Args:
        raw: Content of one document, as bytes.

    Returns:
        A list of bytes objects, one per regex match (second group only),
        in document order. Empty when nothing matches.
    """
    return [value for _open_tag, value, _close_tag in PAGE_ELEMENT_RE.findall(raw)]


def main(root_dir=path, out_path=path + output_file):
    """Collect the matching element text of every input file into one file.

    Args:
        root_dir: Directory searched recursively for ``.html`` files.
        out_path: File that receives all extracted values; it is overwritten.

    Raises:
        IOError/OSError: If *out_path* or an input file cannot be opened.
    """
    # List the inputs before the output file is created.
    html_paths = find_html_files(root_dir)
    with open(out_path, "wb") as output:
        for html_path in html_paths:
            print(html_path)  # for debugging, show which file we are on
            with open(html_path, "rb") as source:
                raw = source.read()
            for value in extract_page_text(raw):
                # Delimit each value with a space so it can be tokenized later.
                output.write(value + b" ")


if __name__ == "__main__":
    main()