From 1d875967734187cd3268a9209e466e9c69c9f4fe Mon Sep 17 00:00:00 2001
From: Sym Roe <sym.roe@talusdesign.co.uk>
Date: Wed, 11 Mar 2020 17:33:47 +0000
Subject: [PATCH] Order documents by post label length

In combination with 0dc349df7fc71a6acf01c5f2ba910ad1324d4598 this should
catch a load of false positives in the case where one ward's name is a subset
of another ("Heath" and "Broadheath").

Refs #911
---
 .../sopn_parsing/helpers/extract_pages.py     | 10 ++--
 .../tests/test_extract_page_numbers.py        | 52 +++++++++++++++++++
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/ynr/apps/sopn_parsing/helpers/extract_pages.py b/ynr/apps/sopn_parsing/helpers/extract_pages.py
index dfe15281a1..c978f66ebe 100644
--- a/ynr/apps/sopn_parsing/helpers/extract_pages.py
+++ b/ynr/apps/sopn_parsing/helpers/extract_pages.py
@@ -1,3 +1,5 @@
+from django.db.models.functions import Length
+
 from official_documents.models import OfficialDocument
 from sopn_parsing.helpers.pdf_helpers import SOPNDocument
 from sopn_parsing.helpers.text_helpers import NoTextInDocumentError
@@ -20,9 +22,11 @@ def extract_pages_for_ballot(ballot):
 
 
 def extract_pages_for_single_document(document):
-    other_doc_models = OfficialDocument.objects.filter(
-        source_url=document.source_url
-    ).select_related("ballot", "ballot__post")
+    other_doc_models = (
+        OfficialDocument.objects.filter(source_url=document.source_url)
+        .select_related("ballot", "ballot__post")
+        .order_by(-Length("ballot__post__label"))
+    )
 
     doc_file = document.uploaded_file.file
     if not doc_file:
diff --git a/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py b/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py
index c224ba39f5..a4b8afed8c 100644
--- a/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py
+++ b/ynr/apps/sopn_parsing/tests/test_extract_page_numbers.py
@@ -5,8 +5,10 @@
 from django.core.management import call_command
 from django.test import TestCase
 
+from candidates.models import Ballot
 from candidates.tests.uk_examples import UK2015ExamplesMixin
 from official_documents.models import OfficialDocument
+from popolo.models import Post
 from sopn_parsing.tests import should_skip_pdf_tests
 
 
@@ -32,3 +34,53 @@ def test_extract_pages_management_command(self):
         call_command("sopn_parsing_extract_page_numbers")
         doc.refresh_from_db()
         self.assertEqual(doc.relevant_pages, "all")
+
+    def test_multi_page_sopn_correct_ward_assigning(self):
+        """
+        In the case where:
+
+        1. More than one ward is covered in a SOPN
+        2. The wards covered contains two wards, where one name is a subset of
+           the other
+
+        We could end up with a false positive. This is known as the
+        "Heath/Broadheath" problem, and was reported here:
+
+        https://github.com/DemocracyClub/yournextrepresentative/issues/911
+
+        This test ensures the correct pages are matched.
+
+        """
+        example_doc_path = abspath(
+            join(
+                dirname(__file__),
+                "data/halton-2019-statement-of-persons-nominated.pdf",
+            )
+        )
+
+        # Create ballots
+        posts = {"Heath": "13", "Broadheath": "4"}
+        for post_name in posts:
+            post = Post.objects.create(
+                label=post_name,
+                organization=self.local_council,
+                identifier=post_name,
+            )
+            ballot = Ballot.objects.create(
+                ballot_paper_id="local.{}.2019-05-02".format(post_name.lower()),
+                post=post,
+                election=self.local_election,
+            )
+            OfficialDocument.objects.create(
+                ballot=ballot,
+                document_type=OfficialDocument.NOMINATION_PAPER,
+                uploaded_file=SimpleUploadedFile(
+                    "sopn.pdf", open(example_doc_path, "rb").read()
+                ),
+                source_url="example.com",
+            )
+
+        call_command("sopn_parsing_extract_page_numbers")
+        for post_name, expected_page in posts.items():
+            ballot = Ballot.objects.get(post__label=post_name)
+            self.assertEqual(ballot.sopn.relevant_pages, expected_page)