From 1d875967734187cd3268a9209e466e9c69c9f4fe Mon Sep 17 00:00:00 2001 From: Sym Roe Date: Wed, 11 Mar 2020 17:33:47 +0000 Subject: [PATCH] Order documents by post label length In combination with 0dc349df7fc71a6acf01c5f2ba910ad1324d4598 this should catch a load of false positives in the case where one ward's name is a subset of another ("Heath" and "Broadheath"). Refs #911 --- .../sopn_parsing/helpers/extract_pages.py | 10 ++-- .../tests/test_extract_page_numbers.py | 52 +++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/ynr/apps/sopn_parsing/helpers/extract_pages.py b/ynr/apps/sopn_parsing/helpers/extract_pages.py index dfe15281a1..c978f66ebe 100644 --- a/ynr/apps/sopn_parsing/helpers/extract_pages.py +++ b/ynr/apps/sopn_parsing/helpers/extract_pages.py @@ -1,3 +1,5 @@ +from django.db.models.functions import Length + from official_documents.models import OfficialDocument from sopn_parsing.helpers.pdf_helpers import SOPNDocument from sopn_parsing.helpers.text_helpers import NoTextInDocumentError @@ -20,9 +22,11 @@ def extract_pages_for_ballot(ballot): def extract_pages_for_single_document(document): - other_doc_models = OfficialDocument.objects.filter( - source_url=document.source_url - ).select_related("ballot", "ballot__post") + other_doc_models = ( + OfficialDocument.objects.filter(source_url=document.source_url) + .select_related("ballot", "ballot__post") + .order_by(-Length("ballot__post__label")) + ) doc_file = document.uploaded_file.file if not doc_file: diff --git a/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py b/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py index c224ba39f5..a4b8afed8c 100644 --- a/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py +++ b/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py @@ -5,8 +5,10 @@ from django.core.management import call_command from django.test import TestCase +from candidates.models import Ballot from candidates.tests.uk_examples import UK2015ExamplesMixin from official_documents.models import OfficialDocument +from popolo.models import Post from sopn_parsing.tests import should_skip_pdf_tests @@ -32,3 +34,53 @@ def test_extract_pages_management_command(self): call_command("sopn_parsing_extract_page_numbers") doc.refresh_from_db() self.assertEqual(doc.relevant_pages, "all") + + def test_multi_page_sopn_correct_ward_assigning(self): + """ + In the case where: + + 1. More than one ward is covered in a SOPN + 2. The wards covered contains two wards, where one name is a subset of + the other + + We could end up with a false positive. This is known as the + "Heath/Broadheath" problem, and was reported here: + + https://github.com/DemocracyClub/yournextrepresentative/issues/911 + + This test ensures the correct pages are matched. + + """ + example_doc_path = abspath( + join( + dirname(__file__), + "data/halton-2019-statement-of-persons-nominated.pdf", + ) + ) + + # Create ballots + posts = {"Heath": "13", "Broadheath": "4"} + for post_name in posts: + post = Post.objects.create( + label=post_name, + organization=self.local_council, + identifier=post_name, + ) + ballot = Ballot.objects.create( + ballot_paper_id="local.{}.2019-05-02".format(post_name.lower()), + post=post, + election=self.local_election, + ) + OfficialDocument.objects.create( + ballot=ballot, + document_type=OfficialDocument.NOMINATION_PAPER, + uploaded_file=SimpleUploadedFile( + "sopn.pdf", open(example_doc_path, "rb").read() + ), + source_url="example.com", + ) + + call_command("sopn_parsing_extract_page_numbers") + for post_name, expected_page in posts.items(): + ballot = Ballot.objects.get(post__label=post_name) + self.assertEqual(ballot.sopn.relevant_pages, expected_page)