forked from axa-group/Parsr
-
Notifications
You must be signed in to change notification settings - Fork 1
/
parsr_output_interpreter.py
105 lines (90 loc) · 3.7 KB
/
parsr_output_interpreter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#
# Copyright 2019 AXA Group Operations S.A.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
class ParsrOutputInterpreter(object):
"""Functions to interpret Parsr's resultant JSON file, enabling
access to the underlying document content
"""
def __init__(self, object=None):
"""Constructor for the class
- object: the Parsr JSON file to be loaded
"""
logging.basicConfig(level=logging.DEBUG,
format='%(name)s - %(levelname)s - %(message)s')
self.object = None
if object is not None:
self.load_object(object)
def __get_text_types(self):
"""Internal function returning the types of text structures
"""
return ['word', 'line', 'character', 'paragraph', 'heading']
def __text_objects_none_page(self, txts, page_number_none):
for page in self.object['pages']:
for element in page['elements']:
if element['type'] in self.__get_text_types():
txts.append(element)
def __get_text_objects(self, page_number=None):
texts = []
if page_number is not None:
page = self.get_page(page_number)
if page is None:
logging.error(
"Cannot get text elements for the requested page; Page {} not found".format(page_number))
return None
else:
for element in page['elements']:
if element['type'] in self.__get_text_types():
texts.append(element)
else:
texts = self.__text_object_none_page(texts, page_number)
return texts
def __text_from_text_object(self, text_object: dict) -> str:
result = ""
if (text_object['type'] in ['paragraph', 'heading']) or (
text_object['type'] in ['line']):
for i in text_object['content']:
result += self.__text_from_text_object(i)
elif text_object['type'] in ['word']:
if isinstance(text_object['content'], list):
for i in text_object['content']:
result += self.__text_from_text_object(i)
else:
result += text_object['content']
result += ' '
elif text_object['type'] in ['character']:
result += text_object['content']
return result
def load_object(self, object):
self.object = object
def get_page(self, page_number: int):
"""Get a particular page in a document
- page_number: The number of the page to be searched
"""
for p in self.object['pages']:
if p['pageNumber'] == page_number:
return p
logging.error("Page {} not found".format(page_number))
return None
def get_text(self, page_number: int = None) -> str:
"""Get the entire text from a particular page
- page_number: The page number from which all the text is to be
extracted
"""
final_text = ""
for text_obj in self.__get_text_objects(page_number):
final_text += self.__text_from_text_object(text_obj)
final_text += "\n\n"
return final_text