Skip to content

Commit

Permalink
Remove newlines at the extract table step
Browse files Browse the repository at this point in the history
  • Loading branch information
VirginiaDooley committed Jun 26, 2023
1 parent bcdb0d0 commit 16ab6fa
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 110 deletions.
1 change: 1 addition & 0 deletions ynr/apps/sopn_parsing/helpers/extract_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def extract_ballot_table(ballot, parse_flavor="lattice"):
document.uploaded_file.path,
pages=document.relevant_pages,
flavor=parse_flavor,
strip_text="\n",
)
except (NotImplementedError, AttributeError):
# * NotImplementedError is thrown if the PDF is an image or generally
Expand Down
2 changes: 2 additions & 0 deletions ynr/apps/sopn_parsing/helpers/parse_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,9 @@ def clean_name(name):
- Build a string to represent the other names by looking for all words not in all caps
- Strip whitespace in case last_names is empty and return string titleized
"""

name = name.replace("\n", " ")

name = name.replace("`", "'")
name = name.replace("\u2013", "\u002d")
# remove multiple whitespaces
Expand Down
136 changes: 32 additions & 104 deletions ynr/apps/sopn_parsing/tests/test_extract_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,123 +39,51 @@ def test_extract_tables(self):
ParsedSOPN.objects.get().as_pandas.to_dict(),
{
"0": {
"0": "Name of \nCandidate",
"1": "ALAGARATNAM \nRathy",
"2": "BARBER \nJames",
"3": "HAYES \nHelen Elizabeth",
"4": "KANUMANSA \nAmadu",
"5": "KOTECHA \nResham",
"6": "LAMBERT \nRobin Andrew \nDavid",
"7": "NALLY \nSteve",
"8": "NIX \nRashid",
"0": "Name of Candidate",
"1": "ALAGARATNAM Rathy",
"2": "BARBER James",
"3": "HAYES Helen Elizabeth",
"4": "KANUMANSA Amadu",
"5": "KOTECHA Resham",
"6": "LAMBERT Robin Andrew David",
"7": "NALLY Steve",
"8": "NIX Rashid",
},
"1": {
"0": "Home \nAddress",
"1": "(address in the \nMitcham and Morden \nConstituency)",
"2": "33 Champion Hill, \nLondon, SE5 8BS",
"3": "11 Woodsyre, \nSydenham Hill, \nLondon, SE26 6SS",
"4": "11 Coleridge House, \nBrowning Street, \nLondon, SE17 1DG",
"5": "(address in the \nRuislip, Northwood \nand Pinner \nConstituency)",
"6": "(address in the \nDuwlich and West \nNorwood \nConstituency)",
"7": "(address in the \nVauxhall \nConstituency)",
"8": "66 Guinness Court, \nLondon, SW3 2PQ",
"0": "Home Address",
"1": "(address in the Mitcham and Morden Constituency)",
"2": "33 Champion Hill, London, SE5 8BS",
"3": "11 Woodsyre, Sydenham Hill, London, SE26 6SS",
"4": "11 Coleridge House, Browning Street, London, SE17 1DG",
"5": "(address in the Ruislip, Northwood and Pinner Constituency)",
"6": "(address in the Duwlich and West Norwood Constituency)",
"7": "(address in the Vauxhall Constituency)",
"8": "66 Guinness Court, London, SW3 2PQ",
},
"2": {
"0": "Description \n(if any)",
"1": "UK Independence \nParty (UKIP)",
"0": "Description (if any)",
"1": "UK Independence Party (UKIP)",
"2": "Liberal Democrat",
"3": "Labour Party",
"4": "All People`s Party",
"5": "The Conservative \nParty Candidate",
"5": "The Conservative Party Candidate",
"6": "Independent",
"7": "Trade Unionist \nand Socialist \nCoalition",
"7": "Trade Unionist and Socialist Coalition",
"8": "The Green Party",
},
"3": {
"0": "Name of Assentors \nProposer(+), Seconder(++)",
"1": "Coleman Alice M + \n"
"Potter Keith S ++ \n"
"Potter Stephanie \n"
"Smith Bryan L \n"
"Anderson Beth \n"
"Lumba Avita \n"
"Andersen Robert \n"
"Patel Sajal \n"
"Stanbury Linda \n"
"Stanbury James",
"2": "Fitchett Keith + \n"
"Price Jonathan ++ \n"
"Gardner Brigid \n"
"Waddington Simon \n"
"Morland Laura \n"
"Lester Rachel \n"
"Pidgeon Caroline \n"
"Hare David \n"
"Hanton Alastair \n"
"Haylett Alexander",
"3": "Samuel Gaynelle + \n"
"Whaley Stephen P ++ \n"
"Brazell Shadi M \n"
"De Souza Johnny \n"
"Alcock Heather \n"
"Natzler Robert S \n"
"Pearce Michelle E \n"
"Pickering Robert \n"
"Richardson Katherine G \n"
"Pickard Jane",
"4": "King James + \n"
"King Rosemary ++ \n"
"King David \n"
"Davies Yadalieu \n"
"Sesay Mary \n"
"Rahman Layla K \n"
"Rahman Syed A \n"
"Ahmed Jalaluddin \n"
"Rahman Tajwar S \n"
"Rahman Taamid S",
"5": "Davis James G + \n"
"Bradbury David S ++ \n"
"Badman Susan E \n"
"Hill-Archer Roderick C \n"
"Langley Anne C \n"
"Mitchell Andrew M \n"
"Virgo Marjorie J \n"
"Virgo Philip A \n"
"Chathli Lindsay \n"
"Broomhead Robert A",
"6": "Smith Caitlin + \n"
"Parks Jesse ++ \n"
"Connage Kyesha \n"
"Hendry Perihan \n"
"Mounty E J \n"
"Sharif B \n"
"Scott Wellesley \n"
"Harriott S A \n"
"Harriott Clive \n"
"Ojumu Ibi",
"7": "Tullis Andrew C + \n"
"Mason Joshua H ++ \n"
"Parkinson Francine M \n"
"Gait Elizabeth \n"
"Doolan Samantha \n"
"Ubiaro Elizabeth \n"
"Garner Stuart \n"
"Akinjogbin Dolapo \n"
"Walker Donna \n"
"Lang Geoffrey P",
"8": "Atwell E G + \n"
"Rose Lloyd ++ \n"
"O`Shea C \n"
"Gomes Jacqueline \n"
"Wood Thomas \n"
"Rosenfeld David \n"
"Conroy Martin \n"
"Skiadopoulou I \n"
"Rosenfeld Lawrence \n"
"Rosenfeld Emily",
"0": "Name of Assentors Proposer(+), Seconder(++)",
"1": "Coleman Alice M + Potter Keith S ++ Potter Stephanie Smith Bryan L Anderson Beth Lumba Avita Andersen Robert Patel Sajal Stanbury Linda Stanbury James",
"2": "Fitchett Keith + Price Jonathan ++ Gardner Brigid Waddington Simon Morland Laura Lester Rachel Pidgeon Caroline Hare David Hanton Alastair Haylett Alexander",
"3": "Samuel Gaynelle + Whaley Stephen P ++ Brazell Shadi M De Souza Johnny Alcock Heather Natzler Robert S Pearce Michelle E Pickering Robert Richardson Katherine G Pickard Jane",
"4": "King James + King Rosemary ++ King David Davies Yadalieu Sesay Mary Rahman Layla K Rahman Syed A Ahmed Jalaluddin Rahman Tajwar S Rahman Taamid S",
"5": "Davis James G + Bradbury David S ++ Badman Susan E Hill-Archer Roderick C Langley Anne C Mitchell Andrew M Virgo Marjorie J Virgo Philip A Chathli Lindsay Broomhead Robert A",
"6": "Smith Caitlin + Parks Jesse ++ Connage Kyesha Hendry Perihan Mounty E J Sharif B Scott Wellesley Harriott S A Harriott Clive Ojumu Ibi",
"7": "Tullis Andrew C + Mason Joshua H ++ Parkinson Francine M Gait Elizabeth Doolan Samantha Ubiaro Elizabeth Garner Stuart Akinjogbin Dolapo Walker Donna Lang Geoffrey P",
"8": "Atwell E G + Rose Lloyd ++ O`Shea C Gomes Jacqueline Wood Thomas Rosenfeld David Conroy Martin Skiadopoulou I Rosenfeld Lawrence Rosenfeld Emily",
},
"4": {
"0": "Reason why \nno longer \nnominated*",
"0": "Reason why no longer nominated*",
"1": "",
"2": "",
"3": "",
Expand Down
6 changes: 0 additions & 6 deletions ynr/apps/sopn_parsing/tests/test_parse_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,12 +318,6 @@ def test_clean_name_replaces_backticks(self):
assert "`" not in name
assert "'" in name

# Passes clean_name a name containing an embedded "\n" (as when a long name
# wraps onto a second line) and checks that no newline character is left in
# the returned value.
def test_clean_name_replaces_newlines(self):
name = parse_tables.clean_name(
"A Very Long Name That Splits \nOver Lines"
)
assert "\n" not in name

# Checks that clean_name turns an all-caps surname followed by a given name
# ("SMITH John") into given-name-first, title-cased output ("John Smith").
def test_clean_name_capitalized_last_and_titalized(self):
name = parse_tables.clean_name("SMITH John")
assert name == "John Smith"
Expand Down

0 comments on commit 16ab6fa

Please sign in to comment.