Skip to content

Commit

Permalink
Merge pull request #11 from Mkranj/include_more_characters
Browse files Browse the repository at this point in the history
Recognise all Unicode letter characters in author names
  • Loading branch information
Mkranj authored Jan 4, 2023
2 parents 6703946 + ee10d34 commit a0e902b
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 27 deletions.
35 changes: 17 additions & 18 deletions PapersCited.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import xlsxwriter

# regex in Python
import re
import regex

# GUI elements - for the file dialog. Explicitly import the filedialog submodule.
import tkinter
Expand All @@ -21,9 +21,9 @@

class RegexPatterns:
# Phrases that make up regex patterns for detecting citations
letter_character = "[a-zšđčćžäöüñáéíóúç'’\\-]"
letter_uppercase = letter_character.upper()
rest_of_word = letter_character[:-1] + letter_uppercase[1:] + "+"
letter_lowercase = "\\p{Ll}"
letter_uppercase = "\\p{Lu}"
rest_of_word = "(?:\\p{L}|['’\\-])+"
# For years - must be exactly four digits, not followed by another digit.
years = "(?:\\(?\\d{4}(?!\\d)[abcd]?,?\\s?;?\\s?)+"
phrase_and = " +(?:and+|[i&]+) +"
Expand Down Expand Up @@ -70,7 +70,6 @@ class PhrasesToChange:
"^za[ ,]"
]
english_excluded_phrases = [
"^-",
"^a[ ,]",
"^an[ ,]",
"^at[ ,]",
Expand Down Expand Up @@ -110,12 +109,12 @@ def drop_excluded_phrases(self):
PhrasesToChange.english_excluded_phrases
for index_no, citation in enumerate(self.citations):
for phrase in excluded_phrases:
match = re.search(
match = regex.search(
phrase,
citation,
re.IGNORECASE
regex.IGNORECASE
)
# If a match is not found, the result of re.match is None
# If a match is not found, regex.search returns None
if match:
filtered_citations[index_no] = "__DELETE__"
# Retain only citations that haven't been flagged
Expand All @@ -135,14 +134,14 @@ def _remove_extra_characters(self, allow_commas = False):
# Remove leading and trailing spaces
clean_citations[index_no] = clean_citations[index_no].strip()
# Condense multiple spaces to a single one.
clean_citations[index_no] = re.sub(" +", " ", clean_citations[index_no])
clean_citations[index_no] = regex.sub(" +", " ", clean_citations[index_no])
return(clean_citations)

def _separate_name_year(self):
citations = self.citations
rx = RegexPatterns()
# If letters and digits are "adjacent", put a space in between
separated_citations = [re.sub("(" + rx.rest_of_word + ")(\\d\\d)", "\\g<1> \\g<2>", citation)\
separated_citations = [regex.sub("(" + rx.rest_of_word + ")(\\d\\d)", "\\g<1> \\g<2>", citation)\
for citation in citations]
return(separated_citations)

Expand All @@ -167,10 +166,10 @@ def _separate_multiple_years(self):
extracted_citations = []

for index_no, citation in enumerate(all_citations):
years = re.findall(pattern = single_year_pattern, string = citation)
years = regex.findall(pattern = single_year_pattern, string = citation)
# findall always returns a list
if len(years) > 1:
citation_start = re.sub(all_years_pattern, "", citation)
citation_start = regex.sub(all_years_pattern, "", citation)
all_citations[index_no] = "__DELETE__"
new_citations = [citation_start + year for year in years]
extracted_citations.extend(new_citations)
Expand Down Expand Up @@ -279,7 +278,7 @@ def read_document(filename):

def get_matches_solo_author(text, drop_excluded_phrases = False):
rx = RegexPatterns()
matches = re.findall(
matches = regex.findall(
rx.letter_uppercase + rx.rest_of_word + "[\\s,(]+" + rx.years,
text)
matches = CitationType(matches)
Expand All @@ -289,7 +288,7 @@ def get_matches_solo_author(text, drop_excluded_phrases = False):
def get_matches_two_authors(text, drop_excluded_phrases = False):
# The second word doesn't have to be uppercase, to catch "suradnici".
rx = RegexPatterns()
matches = re.findall(
matches = regex.findall(
rx.letter_uppercase + rx.rest_of_word + rx.phrase_and +
rx.rest_of_word + "[\\s,(]+" + rx.years,
text)
Expand All @@ -299,7 +298,7 @@ def get_matches_two_authors(text, drop_excluded_phrases = False):

def get_matches_author_et_al(text, drop_excluded_phrases = False):
rx = RegexPatterns()
matches = re.findall(
matches = regex.findall(
rx.letter_uppercase + rx.rest_of_word + "(?:" + rx.phrase_et_al + "|" + rx.phrase_i_sur + ")" + rx.years,
text)
matches = CitationType(matches)
Expand All @@ -311,7 +310,7 @@ def get_matches_three_authors(text, drop_excluded_phrases = False):
# To remedy some, the first letter of the first two words must be capitalised.
# The last word doesn't have to be, so that it catches the term "suradnici", common for multiple authors.
rx = RegexPatterns()
matches = re.findall(
matches = regex.findall(
rx.letter_uppercase + rx.rest_of_word + "[\\s,]+" +
rx.letter_uppercase + rx.rest_of_word + rx.phrase_and +
rx.rest_of_word + "[\\s,(]+" + rx.years,
Expand All @@ -323,7 +322,7 @@ def get_matches_three_authors(text, drop_excluded_phrases = False):
def get_matches_two_surnames(text, drop_excluded_phrases = False):
# Both names must be capitalised for it to be a valid citation.
rx = RegexPatterns()
matches = re.findall(
matches = regex.findall(
rx.letter_uppercase + rx.rest_of_word + "[\\s]+" +
rx.letter_uppercase + rx.rest_of_word + "[\\s,(]+" + rx.years,
text)
Expand All @@ -334,7 +333,7 @@ def get_matches_two_surnames(text, drop_excluded_phrases = False):
def get_matches_two_surnames_et_al(text, drop_excluded_phrases = False):
# Both names must be capitalised for it to be a valid citation.
rx = RegexPatterns()
matches = re.findall(
matches = regex.findall(
rx.letter_uppercase + rx.rest_of_word + "[\\s]+" +
rx.letter_uppercase + rx.rest_of_word + "(?:" + rx.phrase_et_al + "|" + rx.phrase_i_sur + ")" + rx.years,
text)
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ Write an Excel file containing all citations found in a document, so they can be

## Dependencies:
This program was written using Python 3.9.12. It requires the following modules:
textract, xlsxwriter
textract, xlsxwriter, regex

To install them in Powershell, type "*pip install textract*", followed by "*pip install xlsxwriter*".
To install them in PowerShell, type "*pip install textract*" and press Enter. Then do the same with "*pip install xlsxwriter*" and "*pip install regex*".

# About:
***PapersCited*** is a small Python program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. At the end, it writes all those citations in an Excel file in alphabetical order, omitting duplicate entries.
Expand Down
22 changes: 15 additions & 7 deletions tests/test_detecting_authors.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ def test_detecting_two_surnames_i_suradnika():
["Anić Babić i suradnika (2000"]

def test_detecting_two_surnames_et_al():
test_string_semicolon = "This is the facts (Naked Truth et al., 1980). Also see Hard, 1980; Facts, 1989."
assert PapersCited.get_matches_two_surnames_et_al(test_string_semicolon).citations ==\
["Naked Truth et al., 1980"]
test_croatian_i_sur = "Branjek i suradnici (1999) spominju rad Cvanjek i suradnika (1990) više puta (Dvanjek Erin i sur., 2010)"
assert PapersCited.get_matches_two_surnames_et_al(test_croatian_i_sur).citations ==\
["Dvanjek Erin i sur., 2010"]
test_string_semicolon = "This is the facts (Naked Truth et al., 1980). Also see Hard, 1980; Facts, 1989."
assert PapersCited.get_matches_two_surnames_et_al(test_string_semicolon).citations ==\
["Naked Truth et al., 1980"]
test_croatian_i_sur = "Branjek i suradnici (1999) spominju rad Cvanjek i suradnika (1990) više puta (Dvanjek Erin i sur., 2010)"
assert PapersCited.get_matches_two_surnames_et_al(test_croatian_i_sur).citations ==\
["Dvanjek Erin i sur., 2010"]

def test_program_works_with_no_citations_found():
document = ""
Expand Down Expand Up @@ -146,6 +146,14 @@ def test_authors_should_be_capitalised_to_match():
assert PapersCited.get_matches_two_authors(two_authors).citations == ["Bogus and Dogus, 3000"]
# The second name can be lowercase to catch "suradnici".

def test_new_foreign_characters():
    """Solo-author citations are found when surnames contain ø or å."""
    # Plain surnames made up of an uppercase letter plus a non-ASCII letter.
    plain_names = "Bø (1999) and Yø (2000) and Så (2001)"
    found_plain = PapersCited.get_matches_solo_author(plain_names).citations
    assert found_plain == ["Bø (1999", "Yø (2000", "Så (2001"]

    # The same letters next to an apostrophe or a hyphen inside the name.
    punctuated_names = "Bø's (1999) research cited Yø-yoma (2000)"
    found_punctuated = PapersCited.get_matches_solo_author(punctuated_names).citations
    assert found_punctuated == ["Bø's (1999", "Yø-yoma (2000"]

def test_possesive_recognised_and_adjusted():
possesive_text = "Listen to Cohen's (1999) talk."
matches = PapersCited.get_matches_solo_author(possesive_text)
Expand All @@ -163,4 +171,4 @@ def test_surnames_apostrophe_s_recognised():
matches = PapersCited.get_matches_solo_author(text)
matches.cleanup()
assert matches.citations == ["O'Samuel 1999", "O'Sullivan 2000"]

0 comments on commit a0e902b

Please sign in to comment.