diff --git a/ynr/apps/sopn_parsing/helpers/extract_tables.py b/ynr/apps/sopn_parsing/helpers/extract_tables.py index 5c6db1fc4..b8a00d5e6 100644 --- a/ynr/apps/sopn_parsing/helpers/extract_tables.py +++ b/ynr/apps/sopn_parsing/helpers/extract_tables.py @@ -25,6 +25,7 @@ def extract_ballot_table(ballot, parse_flavor="lattice"): document.uploaded_file.path, pages=document.relevant_pages, flavor=parse_flavor, + strip_text="\n", ) except (NotImplementedError, AttributeError): # * NotImplementedError is thrown if the PDF is an image or generally diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py index 616ddde7c..157dcc7e6 100644 --- a/ynr/apps/sopn_parsing/helpers/parse_tables.py +++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py @@ -153,7 +153,9 @@ def clean_name(name): - Build a string to represent the other names by looking for all words not in all caps - Strip whitespace in case last_names is empty and return string titleized """ + name = name.replace("\n", " ") + name = name.replace("`", "'") name = name.replace("\u2013", "\u002d") # remove multiple whitespaces diff --git a/ynr/apps/sopn_parsing/tests/test_extract_tables.py b/ynr/apps/sopn_parsing/tests/test_extract_tables.py index 20155ddfd..0ea9ba2d0 100644 --- a/ynr/apps/sopn_parsing/tests/test_extract_tables.py +++ b/ynr/apps/sopn_parsing/tests/test_extract_tables.py @@ -39,123 +39,51 @@ def test_extract_tables(self): ParsedSOPN.objects.get().as_pandas.to_dict(), { "0": { - "0": "Name of \nCandidate", - "1": "ALAGARATNAM \nRathy", - "2": "BARBER \nJames", - "3": "HAYES \nHelen Elizabeth", - "4": "KANUMANSA \nAmadu", - "5": "KOTECHA \nResham", - "6": "LAMBERT \nRobin Andrew \nDavid", - "7": "NALLY \nSteve", - "8": "NIX \nRashid", + "0": "Name of Candidate", + "1": "ALAGARATNAM Rathy", + "2": "BARBER James", + "3": "HAYES Helen Elizabeth", + "4": "KANUMANSA Amadu", + "5": "KOTECHA Resham", + "6": "LAMBERT Robin Andrew David", + "7": "NALLY Steve", + "8": "NIX Rashid", }, "1": { - "0": "Home \nAddress", - "1": "(address in the \nMitcham and Morden \nConstituency)", - "2": "33 Champion Hill, \nLondon, SE5 8BS", - "3": "11 Woodsyre, \nSydenham Hill, \nLondon, SE26 6SS", - "4": "11 Coleridge House, \nBrowning Street, \nLondon, SE17 1DG", - "5": "(address in the \nRuislip, Northwood \nand Pinner \nConstituency)", - "6": "(address in the \nDuwlich and West \nNorwood \nConstituency)", - "7": "(address in the \nVauxhall \nConstituency)", - "8": "66 Guinness Court, \nLondon, SW3 2PQ", + "0": "Home Address", + "1": "(address in the Mitcham and Morden Constituency)", + "2": "33 Champion Hill, London, SE5 8BS", + "3": "11 Woodsyre, Sydenham Hill, London, SE26 6SS", + "4": "11 Coleridge House, Browning Street, London, SE17 1DG", + "5": "(address in the Ruislip, Northwood and Pinner Constituency)", + "6": "(address in the Duwlich and West Norwood Constituency)", + "7": "(address in the Vauxhall Constituency)", + "8": "66 Guinness Court, London, SW3 2PQ", }, "2": { - "0": "Description \n(if any)", - "1": "UK Independence \nParty (UKIP)", + "0": "Description (if any)", + "1": "UK Independence Party (UKIP)", "2": "Liberal Democrat", "3": "Labour Party", "4": "All People`s Party", - "5": "The Conservative \nParty Candidate", + "5": "The Conservative Party Candidate", "6": "Independent", - "7": "Trade Unionist \nand Socialist \nCoalition", + "7": "Trade Unionist and Socialist Coalition", "8": "The Green Party", }, "3": { - "0": "Name of Assentors \nProposer(+), Seconder(++)", - "1": "Coleman Alice M + \n" - "Potter Keith S ++ \n" - "Potter Stephanie \n" - "Smith Bryan L \n" - "Anderson Beth \n" - "Lumba Avita \n" - "Andersen Robert \n" - "Patel Sajal \n" - "Stanbury Linda \n" - "Stanbury James", - "2": "Fitchett Keith + \n" - "Price Jonathan ++ \n" - "Gardner Brigid \n" - "Waddington Simon \n" - "Morland Laura \n" - "Lester Rachel \n" - "Pidgeon Caroline \n" - "Hare David \n" - "Hanton Alastair \n" - "Haylett Alexander", - "3": "Samuel Gaynelle + \n" - "Whaley Stephen P ++ \n" - "Brazell Shadi M \n" - "De Souza Johnny \n" - "Alcock Heather \n" - "Natzler Robert S \n" - "Pearce Michelle E \n" - "Pickering Robert \n" - "Richardson Katherine G \n" - "Pickard Jane", - "4": "King James + \n" - "King Rosemary ++ \n" - "King David \n" - "Davies Yadalieu \n" - "Sesay Mary \n" - "Rahman Layla K \n" - "Rahman Syed A \n" - "Ahmed Jalaluddin \n" - "Rahman Tajwar S \n" - "Rahman Taamid S", - "5": "Davis James G + \n" - "Bradbury David S ++ \n" - "Badman Susan E \n" - "Hill-Archer Roderick C \n" - "Langley Anne C \n" - "Mitchell Andrew M \n" - "Virgo Marjorie J \n" - "Virgo Philip A \n" - "Chathli Lindsay \n" - "Broomhead Robert A", - "6": "Smith Caitlin + \n" - "Parks Jesse ++ \n" - "Connage Kyesha \n" - "Hendry Perihan \n" - "Mounty E J \n" - "Sharif B \n" - "Scott Wellesley \n" - "Harriott S A \n" - "Harriott Clive \n" - "Ojumu Ibi", - "7": "Tullis Andrew C + \n" - "Mason Joshua H ++ \n" - "Parkinson Francine M \n" - "Gait Elizabeth \n" - "Doolan Samantha \n" - "Ubiaro Elizabeth \n" - "Garner Stuart \n" - "Akinjogbin Dolapo \n" - "Walker Donna \n" - "Lang Geoffrey P", - "8": "Atwell E G + \n" - "Rose Lloyd ++ \n" - "O`Shea C \n" - "Gomes Jacqueline \n" - "Wood Thomas \n" - "Rosenfeld David \n" - "Conroy Martin \n" - "Skiadopoulou I \n" - "Rosenfeld Lawrence \n" - "Rosenfeld Emily", + "0": "Name of Assentors Proposer(+), Seconder(++)", + "1": "Coleman Alice M + Potter Keith S ++ Potter Stephanie Smith Bryan L Anderson Beth Lumba Avita Andersen Robert Patel Sajal Stanbury Linda Stanbury James", + "2": "Fitchett Keith + Price Jonathan ++ Gardner Brigid Waddington Simon Morland Laura Lester Rachel Pidgeon Caroline Hare David Hanton Alastair Haylett Alexander", + "3": "Samuel Gaynelle + Whaley Stephen P ++ Brazell Shadi M De Souza Johnny Alcock Heather Natzler Robert S Pearce Michelle E Pickering Robert Richardson Katherine G Pickard Jane", + "4": "King James + King Rosemary ++ King David Davies Yadalieu Sesay Mary Rahman Layla K Rahman Syed A Ahmed Jalaluddin Rahman Tajwar S Rahman Taamid S", + "5": "Davis James G + Bradbury David S ++ Badman Susan E Hill-Archer Roderick C Langley Anne C Mitchell Andrew M Virgo Marjorie J Virgo Philip A Chathli Lindsay Broomhead Robert A", + "6": "Smith Caitlin + Parks Jesse ++ Connage Kyesha Hendry Perihan Mounty E J Sharif B Scott Wellesley Harriott S A Harriott Clive Ojumu Ibi", + "7": "Tullis Andrew C + Mason Joshua H ++ Parkinson Francine M Gait Elizabeth Doolan Samantha Ubiaro Elizabeth Garner Stuart Akinjogbin Dolapo Walker Donna Lang Geoffrey P", + "8": "Atwell E G + Rose Lloyd ++ O`Shea C Gomes Jacqueline Wood Thomas Rosenfeld David Conroy Martin Skiadopoulou I Rosenfeld Lawrence Rosenfeld Emily", }, "4": { - "0": "Reason why \nno longer \nnominated*", + "0": "Reason why no longer nominated*", "1": "", "2": "", "3": "", diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py index 851adac70..477a6152c 100644 --- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py +++ b/ynr/apps/sopn_parsing/tests/test_parse_tables.py @@ -318,12 +318,6 @@ def test_clean_name_replaces_backticks(self): assert "`" not in name assert "'" in name - def test_clean_name_replaces_newlines(self): - name = parse_tables.clean_name( - "A Very Long Name That Splits \nOver Lines" - ) - assert "\n" not in name - def test_clean_name_capitalized_last_and_titalized(self): name = parse_tables.clean_name("SMITH John") assert name == "John Smith"