-
Notifications
You must be signed in to change notification settings - Fork 1
/
convert_xml_to_txt.py
57 lines (48 loc) · 1.83 KB
/
convert_xml_to_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from __future__ import print_function
from builtins import bytes
import os
import xml.etree.ElementTree as ET
import glob
import shutil
"""
This script converts the JOS1M XML dataset into the NLTK-preferred plain-text format, with each word and its tag separated by a slash.
It DOES NOT escape slashes, so be aware of that.
The input files are read from the xml/ directory and the converted files are written to the txt/ subdirectory.
"""
def convert_xml_file(filename):
    """Convert one XML file into a slash-tagged plain-text file under ``txt/``.

    The tree is walked four levels deep (root -> div -> paragraph ->
    sentence -> element).  For every element inside a sentence:

    * tag ending in ``w``: writes ``<text>/<msd attribute>``
    * tag ending in ``S``: writes a single space
    * tag ending in ``c``: writes the element text with no tag

    Each sentence is terminated by a blank line.  The output is UTF-8 encoded
    and named ``txt/<basename of filename>.txt``.

    :param filename: path of the XML file to convert.
    :raises KeyError: if a ``w`` element has no ``msd`` attribute.

    Errors from ``ET.parse`` and from opening the output file propagate
    unchanged.
    """
    root = ET.parse(filename).getroot()

    out_dir = "txt"
    # Create the output directory on first use instead of failing on open().
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    out_filename = os.path.join(out_dir, os.path.basename(filename)) + ".txt"

    # The context manager guarantees the handle is closed even when an
    # element raises part-way through the conversion.
    with open(out_filename, "wb") as out_f:
        print("Converting %s => %s" % (filename, out_filename))
        for div in root:
            for paragraph in div:
                for sentence in paragraph:
                    for element in sentence:
                        # Only the last character of the tag is compared
                        # (presumably so namespace-qualified tags such as
                        # "{ns}w" still match -- TODO confirm).
                        kind = element.tag[-1]
                        if kind == 'w':
                            # ElementTree reports the text of an empty
                            # element as None; treat it as an empty string.
                            out_f.write((element.text or u"").encode('utf-8'))
                            out_f.write(b'/')
                            out_f.write(element.attrib["msd"].encode('utf-8'))
                        elif kind == 'S':
                            out_f.write(b' ')
                        elif kind == 'c':
                            out_f.write((element.text or u"").encode('utf-8'))
                    # Blank line between sentences.
                    out_f.write(b'\n\n')
def concat_outputs():
    """Concatenate every ``txt/jos1M*.txt`` file into a single corpus file.

    The inputs are appended byte-for-byte, in sorted filename order, to
    ``data/tagged_corpus/slotag.txt``.  The output directory is created if
    it does not exist, and an existing output file is overwritten.
    """
    # glob returns matches in arbitrary order; sorting makes the resulting
    # corpus reproducible across runs and machines.
    files = sorted(glob.glob("txt/jos1M*.txt"))

    out_dir = os.path.join("data", "tagged_corpus")
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    output_file = os.path.join(out_dir, "slotag.txt")

    print("Concatenating to", output_file)
    with open(output_file, "wb") as out_f:
        for path in files:
            with open(path, "rb") as in_f:
                shutil.copyfileobj(in_f, out_f)
if __name__ == "__main__":
    # Stage 1: convert every matching XML file under xml/ individually.
    print("Converting XML files to TXT format...")
    for xml_path in glob.glob('xml/jos1M*.xml'):
        convert_xml_file(xml_path)

    # Stage 2: merge the per-file outputs into one file.
    print("Concatenating TXT files into a single output...")
    concat_outputs()

    print("Complete.")