-
Notifications
You must be signed in to change notification settings - Fork 3
/
tsv2xml.py
375 lines (312 loc) · 15.4 KB
/
tsv2xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
import argparse
import os
from html import escape
from pathlib import Path
from bs4 import BeautifulSoup
def _parse_relationship_references(relationship_references, annotationId):
    """Decode the relation-reference column of one token row.

    The column holds one or more references separated by ``|``. Each reference
    is either a bare row id (``2-176``) or a row id followed by a bracketed
    ``source_destination`` pair (``2-176[0_1]``).

    :param relationship_references: stripped content of the reference column
    :param annotationId: id (first column) of the row being processed
    :return: list of ``(source, destination)`` string tuples
    """
    relationships = []
    for reference in relationship_references.split("|"):
        reference_split = reference.split('[')
        if len(reference_split) == 1:
            # No bracketed ids: the referenced row is the source and the
            # current row is the destination.
            source = reference
            destination = annotationId
        else:
            reference_source_tsv = reference_split[0]
            ids = reference_split[1].split('_')
            source = ids[0]
            destination = ids[1][:-1]  # drop the closing ']'
            # An id of '0' is replaced by the id of the corresponding row.
            if source == '0':
                source = reference_source_tsv
            elif destination == '0':
                destination = annotationId
        relationships.append((source, destination))
    return relationships


def processFile(file):
    """Parse a WebAnno TSV export into paragraphs, tokens, spans and relations.

    Header lines (everything before the first ``#Text`` line and between
    paragraphs) are scanned for ``#T_SP`` / ``#T_RL`` layer declarations to
    locate the entity, section and relationship columns. Each ``#Text`` line
    opens a paragraph, the following tab-separated rows are its tokens, and a
    blank line closes it.

    Differences from the previous implementation:

    * a file without the ``webanno.custom.Supercon`` layer is read as plain
      text instead of failing on an unbound variable;
    * the last paragraph keeps its pending span when the file does not end
      with a blank line, and no empty paragraph is emitted when it does;
    * the section is read from the column of the declared Section layer
      (column 3 when that layer is declared first, as before);
    * additional ``#Text`` lines inside a paragraph are appended to its text;
    * the file is always read as UTF-8.

    :param file: path of the TSV file to read
    :return: dict with ``paragraphs`` (list of dicts with ``text``, ``spans``,
        ``tokens`` and ``section``), ``rel_source_dest`` and
        ``rel_dest_source`` (dicts mapping an id to the list of related ids)
    """
    output = {'paragraphs': [], 'rel_source_dest': [], 'rel_dest_source': []}
    currentParagraph = {'text': "", 'spans': [], 'tokens': [], 'section': 'body'}
    spans = currentParagraph['spans']
    tokens = currentParagraph['tokens']
    inside = False

    tokenPreviousPositionEnd = '-1'
    previousTagIndex = None
    previousTagValue = None
    tagIndex = None
    tagValue = None
    currentSpan = None
    tokenId = 0

    # -1 means "layer not declared in the header".
    entitiesLayerLabelIndex = -1
    sectionLayerFirstIndex = -1

    # If there are no relationships, the TSV has two column less.
    with_relationships = False
    relation_source_dest = {}
    relation_dest_source = {}
    spans_layers = 3  # columns 0-2 are row id, offsets and token text
    relationship_layer_index = 5  # The usual value

    with open(file, encoding='utf-8') as fp:
        for line in fp:
            if line.startswith("#Text"):
                if inside:
                    # Extra text line of the same paragraph: keep it as text
                    # rather than trying to parse it as a token row.
                    currentParagraph['text'] += line.replace("#Text=", "")
                else:  # Start paragraph
                    currentParagraph['text'] = line.replace("#Text=", "")
                    inside = True
                    tokenId = 0
                continue

            if not line.strip() and inside:  # End paragraph
                if currentSpan:
                    spans.append(currentSpan)
                output['paragraphs'].append(currentParagraph)
                currentParagraph = {'text': "", 'spans': [], 'tokens': [], 'section': "body"}
                spans = currentParagraph['spans']
                tokens = currentParagraph['tokens']

                tokenPreviousPositionEnd = '-1'
                previousTagIndex = None
                previousTagValue = None
                tagIndex = None
                tagValue = None
                currentSpan = None
                inside = False
                continue

            if not inside:
                if line.startswith("#T_SP"):
                    layerFields = line.split('|')
                    layerName = layerFields[0].split('=')[1]
                    if layerName == 'webanno.custom.Supercon':
                        # The label is the second feature of the layer.
                        entitiesLayerLabelIndex = spans_layers + 1
                    elif layerName == 'webanno.custom.Section':
                        sectionLayerFirstIndex = spans_layers
                    spans_layers += len(layerFields) - 1

                if line.startswith("#T_RL"):
                    with_relationships = True
                    if entitiesLayerLabelIndex > -1:
                        relationship_layer_index = entitiesLayerLabelIndex + 1

                print("Ignoring " + line)
                continue

            split = line.split('\t')
            annotationId = split[0]
            position = split[1]
            tokenPositionStart = position.split("-")[0]
            tokenPositionEnd = position.split("-")[1]

            # A gap between consecutive offsets is rendered as one space token.
            if tokenPreviousPositionEnd != tokenPositionStart and tokenPreviousPositionEnd != '-1':
                tokens.append(
                    {'start': tokenPreviousPositionEnd, 'end': tokenPositionStart, 'text': " ", 'id': tokenId})
                tokenId = tokenId + 1

            tokens.append({'start': tokenPositionStart, 'end': tokenPositionEnd, 'text': split[2], 'id': tokenId})

            if sectionLayerFirstIndex > -1:
                # Every row overwrites the value, so the last token decides.
                currentParagraph['section'] = split[sectionLayerFirstIndex].split('[')[0].strip()

            if entitiesLayerLabelIndex > -1:
                tag = split[entitiesLayerLabelIndex].strip().replace('\\', '')
            else:
                tag = '_'

            relationships = []  # list of tuple(source, destination)
            if with_relationships:
                relationship_name = split[relationship_layer_index].strip()
                relationship_references = split[relationship_layer_index + 1].strip()
                # We ignore the name of the relationship for the moment
                if relationship_name != '_' and relationship_references != '_':
                    relationships = _parse_relationship_references(relationship_references, annotationId)
                    for source, destination in relationships:
                        relation_source_dest.setdefault(source, []).append(destination)
                        relation_dest_source.setdefault(destination, []).append(source)

            if tag != '_' and not tag.startswith('*'):
                if tag.endswith("]"):
                    tagParts = tag.split('[')
                    tagValue = tagParts[0]
                    tagIndex = tagParts[1][:-1]
                else:
                    tagValue = tag
                    tagIndex = -1

                # A token continues the open span only when it carries the
                # same bracketed index and label as the previous tagged token.
                continuesSpan = (tagIndex != -1
                                 and currentSpan is not None
                                 and tagIndex == previousTagIndex
                                 and tagValue == previousTagValue)
                if continuesSpan:
                    currentSpan['end'] = tokenPositionEnd
                    currentSpan['token_end'] = tokenId
                else:
                    if currentSpan:
                        spans.append(currentSpan)
                    # Tags without a bracketed index are identified by row id.
                    currentSpan = {'start': tokenPositionStart, 'end': tokenPositionEnd,
                                   'token_start': tokenId, 'token_end': tokenId,
                                   'label': tagValue,
                                   'tagIndex': tagIndex if tagIndex != -1 else annotationId,
                                   'relationships': relationships}
            elif currentSpan:
                spans.append(currentSpan)
                currentSpan = None

            tokenId = tokenId + 1
            tokenPreviousPositionEnd = tokenPositionEnd  # copy the position end
            previousTagIndex = tagIndex  # index of the tag in the tsv
            previousTagValue = tagValue

    # The file may end without the blank line that normally closes a paragraph.
    if inside:
        if currentSpan:
            spans.append(currentSpan)
        output['paragraphs'].append(currentParagraph)

    output['rel_dest_source'] = relation_dest_source
    output['rel_source_dest'] = relation_source_dest
    return output
# TEI skeleton parsed by writeOutput for every output file. writeOutput appends
# titles to <titleStmt>, abstract paragraphs to <abstract>, keywords to
# <profileDesc>, and body paragraphs and figure/table captions to <body>.
xmlTemplate = """<tei xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc xml:id="_0">
<titleStmt/>
<publicationStmt>
<publisher>National Institute for Materials Science (NIMS), Tsukuba, Japan</publisher>
<availability>
<licence target="http://creativecommons.org/licenses/by/3.0/">
<p>The Creative Commons Attribution 3.0 Unported (CC BY 3.0) Licence applies to this document.</p>
</licence>
</availability>
</publicationStmt>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="project.version" ident="grobid-superconductors">
<ref target="https://github.com/lfoppiano/grobid-superconductors">A machine learning software for extracting materials and their properties from scientific literature.</ref>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en">
<body/>
</text>
</tei>"""
def get_text_under_body(soup):
    """Return the first result of ``soup.findChildren('text')``.

    :param soup: object exposing ``findChildren`` (a parsed document)
    :return: the first match, or ``None`` when nothing was found
    """
    matches = soup.findChildren('text')
    if matches is None or len(matches) == 0:
        return None
    return matches[0]
def _render_paragraph(tokens, spans, rel_source_dest, rel_dest_source):
    """Return a paragraph as escaped text with each span wrapped in ``<rs>``.

    :param tokens: token dicts in order; the list position is the token index
        that ``token_start`` / ``token_end`` of the spans refer to
    :param spans: span dicts in order of appearance
    :param rel_source_dest: maps a span ``tagIndex`` to the ids it points to
    :param rel_dest_source: maps a span ``tagIndex`` to the ids pointing to it
    :return: XML fragment as a string (without an enclosing element)
    """
    pieces = []
    spanIdx = 0
    for i, token in enumerate(tokens):
        span = spans[spanIdx] if spanIdx < len(spans) else None
        token_text = escape(token['text'])

        if span is None or not span['token_start'] <= i <= span['token_end']:
            pieces.append(token_text)
            continue

        if i == span['token_start']:
            tag_index = span['tagIndex']
            identifier = ''
            pointers = ''
            # Only spans that are the target of a relation need an id.
            if tag_index in rel_dest_source:
                identifier = ' xml:id="x' + escape(tag_index) + '"'
            if tag_index in rel_source_dest and rel_source_dest[tag_index]:
                targets = ','.join('#x' + escape(dest) for dest in rel_source_dest[tag_index])
                pointers = ' corresp="' + targets + '"'
            pieces.append('<rs type="' + escape(span['label']) + '"' + identifier + pointers + '>')

        pieces.append(token_text)

        if i == span['token_end']:
            pieces.append('</rs>')
            spanIdx += 1

    return ''.join(pieces)


def writeOutput(datas, output):
    """Serialise parsed paragraphs into a TEI XML file.

    Each paragraph is rendered with its spans as ``<rs>`` elements and placed
    in the template according to its section: ``title``, ``abstract``,
    ``keywords``, ``body``, ``figureCaption`` or ``tableCaption``. Paragraphs
    with any other section are not written; a message is printed for each.

    The document is built completely before the file is opened, and the file
    is written as UTF-8 regardless of the platform default encoding.

    :param datas: dict as returned by ``processFile``
    :param output: path of the XML file to write
    """
    rel_dest_source = datas['rel_dest_source']
    rel_source_dest = datas['rel_source_dest']

    soup = BeautifulSoup(xmlTemplate, 'xml')
    # ``soup.text`` is not the <text> element, hence the explicit lookup.
    text_tag = get_text_under_body(soup)

    for data in datas['paragraphs']:
        section = data['section']
        paragraphObj = _render_paragraph(data['tokens'], data['spans'], rel_source_dest, rel_dest_source)

        if section == 'title':
            tag = BeautifulSoup('<title>' + paragraphObj + '</title>', 'xml')
            soup.teiHeader.titleStmt.append(tag)
        elif section == 'abstract':
            tag = BeautifulSoup('<p>' + paragraphObj + '</p>', 'xml')
            soup.teiHeader.profileDesc.abstract.append(tag)
        elif section == 'keywords':
            tag = BeautifulSoup('<ab type="keywords">' + paragraphObj + '</ab>', 'xml')
            soup.teiHeader.profileDesc.append(tag)
        elif section == 'body':
            tag = BeautifulSoup('<p>' + paragraphObj + '</p>', 'xml')
            text_tag.body.append(tag)
        elif section == 'figureCaption' or section == 'tableCaption':
            tag = BeautifulSoup('<ab type="' + section + '">' + paragraphObj + '</ab>', 'xml')
            text_tag.body.append(tag)
        else:
            print("Skipping paragraph with unsupported section: " + str(section))

    with open(output, 'w', encoding='utf-8') as fo:
        fo.write(str(soup))
# Command-line entry point: convert one TSV file, or every ``.tsv`` file of a
# directory, into ``<name>.tei.xml``.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Converter TSV to XML (Grobid training data based on TEI)")

    parser.add_argument("--input", help="Input file or directory", required=True)
    parser.add_argument("--output",
                        help="Output directory (if omitted, the output will be the same directory/file with different extension)",
                        required=False)
    parser.add_argument("--recursive", action="store_true", default=False,
                        help="Process input directory recursively. If input is a file, this parameter is ignored. ")

    args = parser.parse_args()

    input_arg = args.input  # not named ``input`` to avoid shadowing the builtin
    output = args.output
    recursive = args.recursive

    if os.path.isdir(input_arg):
        if recursive:
            # os.walk keeps the extension match case-insensitive.
            path_list = [
                os.path.join(root, file_)
                for root, dirs, files in os.walk(input_arg)
                for file_ in files
                if file_.lower().endswith(".tsv")
            ]
        else:
            path_list = Path(input_arg).glob('*.tsv')

        # Without a usable output directory each result is written next to
        # its input file.
        use_output_dir = output is not None and os.path.isdir(output)

        for path in path_list:
            print("Processing: ", path)
            data = processFile(path)
            target_dir = output if use_output_dir else Path(path).parent
            writeOutput(data, os.path.join(target_dir, Path(path).stem + ".tei.xml"))

    elif os.path.isfile(input_arg):
        input_path = Path(input_arg)
        data = processFile(input_path)
        # --output is optional: fall back to the directory of the input file.
        target_dir = output if output is not None else input_path.parent
        writeOutput(data, os.path.join(target_dir, input_path.stem + ".tei.xml"))

    else:
        parser.error("Input path does not exist: " + str(input_arg))