-
Notifications
You must be signed in to change notification settings - Fork 0
/
modifyGermEval17.py
90 lines (81 loc) · 3.96 KB
/
modifyGermEval17.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import xml.etree.ElementTree as ET
import re
def _replace_empty_opinions(node, empty_ops):
    """Drop an empty <Opinions/> element and add a NULL opinion if relevant.

    The empty element is always removed from ``node``. When the document's
    <relevance> text is 'true', a new <Opinions> element holding a single
    NULL-target opinion is appended; its polarity is taken from the
    document-level <sentiment> text.
    """
    node.remove(empty_ops)
    if node.findtext('relevance') != 'true':
        return
    opinions = ET.SubElement(node, 'Opinions')
    null_op = ET.SubElement(opinions, 'Opinion')
    null_op.set('category', 'Allgemein#Haupt')
    null_op.set('from', '0')
    null_op.set('to', '0')
    null_op.set('target', 'NULL')
    null_op.set('polarity', node.findtext('sentiment'))


def _find_target_end(text, start, target):
    """Return the end offset of ``target`` in ``text``, or None if not found.

    The search begins at ``start`` and walks left to right over as many
    occurrences of the target's last word as the target itself contains, so
    a target that repeats its last word ends on the correct repetition.
    None is returned when the target has no words or when the text runs out
    of occurrences, so the caller can leave the opinion untouched.
    """
    words = target.split()
    if not words:
        return None
    last_word = words[-1]
    end = start
    for _ in range(target.count(last_word)):
        hit = text.find(last_word, end)
        if hit == -1:
            return None
        end = hit + len(last_word)
    return end


def rewrite_xml(path, filename):
    """Repair one GermEval 2017 XML file and save it as 'fixed_' + filename.

    Three repairs are applied to every document (child of the root element):

    1. An empty <Opinions/> element is removed; if the document is relevant
       it is replaced by an <Opinions> element with one NULL opinion.
    2. The misspelled polarities 'positve' and ' negative' are corrected.
    3. For every opinion whose target is not 'NULL' and whose text span
       ``text[from:to]`` differs from the target, the end offset is
       recomputed from the text and 'target' / 'to' are rewritten.

    Each opinion is examined individually, so a NULL opinion is never
    touched by the offset repair regardless of its position in the list.
    Opinions whose target cannot be located in the text are counted as
    inconsistent but left unchanged.

    Args:
        path: Directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator.
        filename: Name of the XML file inside ``path``.

    Side effects:
        Prints progress and two counters, and writes the repaired tree to
        ``path + 'fixed_' + filename``.
    """
    print('processing', filename)
    count_inconsistent_offset = 0
    count_correct_to = 0
    # The dia test set carries the reference end offset in 'to2' instead of 'to'.
    uses_to2 = filename == 'test_dia-2017-09-15.xml'
    raw = ET.parse(path + filename)
    for node in raw.getroot():
        ops_xml = node.find('Opinions')
        if ops_xml is None:
            continue
        # There are ill-formatted elements like <Opinions/> in the train and
        # dev datasets, so a present element may still have no children.
        if len(ops_xml) == 0:
            _replace_empty_opinions(node, ops_xml)
            continue
        # A missing <text> element is treated as empty text so slicing works.
        text = node.findtext('text') or ''
        for op in ops_xml:
            # Fix polarity spelling.
            polarity = op.attrib['polarity']
            if polarity == 'positve':
                op.set('polarity', 'positive')
            elif polarity == ' negative':
                op.set('polarity', 'negative')
            target = op.attrib['target']
            if target == 'NULL':
                # NULL opinions have no text span; never rewrite their offsets.
                continue
            offset_from = int(op.attrib['from'])
            offset_to = int(op.attrib['to'])
            if text[offset_from:offset_to] == target:
                continue
            # Fix inconsistent offset.
            count_inconsistent_offset += 1
            new_to = _find_target_end(text, offset_from, target)
            if new_to is None:
                # Target not locatable in the text: keep the opinion as is
                # rather than writing a bogus span.
                continue
            reference_to = int(op.attrib['to2']) if uses_to2 else offset_to
            if new_to == reference_to:
                count_correct_to += 1
            op.set('target', text[offset_from:new_to])
            op.set('to', str(new_to))
    print('inconsistent offset:' + str(count_inconsistent_offset))
    print('correct offset to:' + str(count_correct_to))
    raw.write(path + 'fixed_' + filename)
def main():
    """
    Read the XML files of the GermEval 2017 dataset and fix the following problems:
    1. Some tags are incomplete. They are removed completely and replaced by a
       null tag if needed.
    2. Misspelled polarity values are changed to the correct spelling.
    3. Some target terms are inconsistent with their offsets; the offset is
       corrected by finding the first match. Repetitions of the same word are
       matched in left-to-right order. The dia test set has a 'to2' attribute,
       which is handled differently.
    A modified version of each file is saved as 'fixed_' + original filename.
    """
    data_dir = "./data/"
    dataset_files = (
        'train-2017-09-15.xml',
        'dev-2017-09-15.xml',
        'test_syn-2017-09-15.xml',
        'test_dia-2017-09-15.xml',
    )
    for dataset_file in dataset_files:
        rewrite_xml(data_dir, dataset_file)
    print('modified xml saved in directory ' + data_dir)
# Run the dataset clean-up only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()