Skip to content

Commit

Permalink
Order documents by post label length
Browse files Browse the repository at this point in the history
In combination with 0dc349d this should
catch a load of false positives in the case where one ward's name is a subset
of another ("Heath" and "Broadheath").

Refs #911
  • Loading branch information
symroe committed Mar 11, 2020
1 parent 0dc349d commit 1d87596
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 3 deletions.
10 changes: 7 additions & 3 deletions ynr/apps/sopn_parsing/helpers/extract_pages.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from django.db.models.functions import Length

from official_documents.models import OfficialDocument
from sopn_parsing.helpers.pdf_helpers import SOPNDocument
from sopn_parsing.helpers.text_helpers import NoTextInDocumentError
Expand All @@ -20,9 +22,11 @@ def extract_pages_for_ballot(ballot):


def extract_pages_for_single_document(document):
other_doc_models = OfficialDocument.objects.filter(
source_url=document.source_url
).select_related("ballot", "ballot__post")
other_doc_models = (
OfficialDocument.objects.filter(source_url=document.source_url)
.select_related("ballot", "ballot__post")
.order_by(-Length("ballot__post__label"))
)

doc_file = document.uploaded_file.file
if not doc_file:
Expand Down
52 changes: 52 additions & 0 deletions ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from django.core.management import call_command
from django.test import TestCase

from candidates.models import Ballot
from candidates.tests.uk_examples import UK2015ExamplesMixin
from official_documents.models import OfficialDocument
from popolo.models import Post
from sopn_parsing.tests import should_skip_pdf_tests


Expand All @@ -32,3 +34,53 @@ def test_extract_pages_management_command(self):
call_command("sopn_parsing_extract_page_numbers")
doc.refresh_from_db()
self.assertEqual(doc.relevant_pages, "all")

def test_multi_page_sopn_correct_ward_assigning(self):
"""
In the case where:
1. More than one ward is covered in a SOPN
2. The wards covered contains two wards, where one name is a subset of
the other
We could end up with a false positive. This is known as the
"Heath/Broadheath" problem, and was reported here:
https://github.com/DemocracyClub/yournextrepresentative/issues/911
This test ensures the correct pages are matched.
"""
example_doc_path = abspath(
join(
dirname(__file__),
"data/halton-2019-statement-of-persons-nominated.pdf",
)
)

# Create ballots
posts = {"Heath": "13", "Broadheath": "4"}
for post_name in posts:
post = Post.objects.create(
label=post_name,
organization=self.local_council,
identifier=post_name,
)
ballot = Ballot.objects.create(
ballot_paper_id="local.{}.2019-05-02".format(post_name.lower()),
post=post,
election=self.local_election,
)
OfficialDocument.objects.create(
ballot=ballot,
document_type=OfficialDocument.NOMINATION_PAPER,
uploaded_file=SimpleUploadedFile(
"sopn.pdf", open(example_doc_path, "rb").read()
),
source_url="example.com",
)

call_command("sopn_parsing_extract_page_numbers")
for post_name, expected_page in posts.items():
ballot = Ballot.objects.get(post__label=post_name)
self.assertEqual(ballot.sopn.relevant_pages, expected_page)

0 comments on commit 1d87596

Please sign in to comment.