Skip to content

Commit

Permalink
Detect headings using cleaned text
Browse files Browse the repository at this point in the history
This addresses a cross-platform bug where newlines were detected
differently depending on a complex set of factors relating to versions
of Ghostscript, etc.

To mitigate this fragility, clean the text before processing it.
  • Loading branch information
symroe committed Mar 11, 2020
1 parent dde184b commit 367e2c7
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 32 deletions.
12 changes: 7 additions & 5 deletions ynr/apps/sopn_parsing/helpers/pdf_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text

# Used by SOPNPageText.get_page_heading
HEADING_SIZE = 0.3
HEADING_SIZE = 0.5

# Used by SOPNPageText.detect_top_page
CONTINUATION_THRESHOLD = 0.4
CONTINUATION_THRESHOLD = 0.5


class SOPNDocument:
Expand Down Expand Up @@ -76,7 +76,8 @@ class SOPNPageText:

def __init__(self, page_number, text):
self.page_number = page_number
self.text = text
self.raw_text = text
self.text = clean_text(text)
self.is_top_page = True

def get_page_heading_set(self):
Expand All @@ -94,8 +95,9 @@ def get_page_heading(self):
Do some basic cleaning of the heading.
"""
threshold = int(len(self.text) * HEADING_SIZE)
search_text = self.text[0:threshold]
words = self.text.split(" ")
threshold = int(len(words) * HEADING_SIZE)
search_text = " ".join(words[0:threshold])
search_text = search_text.replace("\n", " ")
return search_text.lower()

Expand Down
38 changes: 11 additions & 27 deletions ynr/apps/sopn_parsing/tests/test_sopn_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,47 +24,31 @@ def test_sopn_document(self):
self.assertSetEqual(
doc.document_heading,
{
"",
"william",
"following",
"february",
"council",
"lindsey",
"mike",
"of",
"is",
"name",
"statement",
"candidate",
"district",
"the",
"little",
"vale",
"moscow,",
"2019",
"jane",
"as",
"willetts",
"stayte",
"on",
"for",
"berkeley",
"stroud",
"ashton",
"green",
"28",
"thursday",
"edward",
"berr",
"persons",
"liz",
"nominated",
"a",
"thomas",
"home",
"address",
"election",
"councillor",
"vale",
"berkeley",
"on",
"as",
"name",
"thursday",
"candidate",
"february",
"28",
"ashton",
"2019",
},
)

Expand Down

0 comments on commit 367e2c7

Please sign in to comment.