diff --git a/PapersCited.py b/PapersCited.py index f7e1e64..f5dcad2 100644 --- a/PapersCited.py +++ b/PapersCited.py @@ -10,7 +10,7 @@ import xlsxwriter # regex in Python -import re +import regex # GUI elements - for the file dialog. Explicitly import the filedialog submodule. import tkinter @@ -21,9 +21,9 @@ class RegexPatterns: # Phrases that make up regex patterns for detecting citations - letter_character = "[a-zšđčćžäöüñáéíóúç'’\\-]" - letter_uppercase = letter_character.upper() - rest_of_word = letter_character[:-1] + letter_uppercase[1:] + "+" + letter_lowercase = "\\p{Ll}" + letter_uppercase = "\\p{Lu}" + rest_of_word = "(?:\\p{L}|['’\\-])+" # For years - must be exactly four digits, not followed by another digit. years = "(?:\\(?\\d{4}(?!\\d)[abcd]?,?\\s?;?\\s?)+" phrase_and = " +(?:and+|[i&]+) +" @@ -70,7 +70,6 @@ class PhrasesToChange: "^za[ ,]" ] english_excluded_phrases = [ - "^-", "^a[ ,]", "^an[ ,]", "^at[ ,]", @@ -110,12 +109,12 @@ def drop_excluded_phrases(self): PhrasesToChange.english_excluded_phrases for index_no, citation in enumerate(self.citations): for phrase in excluded_phrases: - match = re.search( + match = regex.search( phrase, citation, - re.IGNORECASE + regex.IGNORECASE ) - # If a match is not found, the result of re.match is None + # If a match is not found, the result of regex.search is None if match: filtered_citations[index_no] = "__DELETE__" # Retain only citations that haven't been flagged @@ -135,14 +134,14 @@ def _remove_extra_characters(self, allow_commas = False): # Remove leading and trailing spaces clean_citations[index_no] = clean_citations[index_no].strip() # Condense multiple spaces to a single one. 
- clean_citations[index_no] = re.sub(" +", " ", clean_citations[index_no]) + clean_citations[index_no] = regex.sub(" +", " ", clean_citations[index_no]) return(clean_citations) def _separate_name_year(self): citations = self.citations rx = RegexPatterns() # If letters and digits are "adjacent", put a space in between - separated_citations = [re.sub("(" + rx.rest_of_word + ")(\\d\\d)", "\\g<1> \\g<2>", citation)\ + separated_citations = [regex.sub("(" + rx.rest_of_word + ")(\\d\\d)", "\\g<1> \\g<2>", citation)\ for citation in citations] return(separated_citations) @@ -167,10 +166,10 @@ def _separate_multiple_years(self): extracted_citations = [] for index_no, citation in enumerate(all_citations): - years = re.findall(pattern = single_year_pattern, string = citation) + years = regex.findall(pattern = single_year_pattern, string = citation) # findall always returns a list if len(years) > 1: - citation_start = re.sub(all_years_pattern, "", citation) + citation_start = regex.sub(all_years_pattern, "", citation) all_citations[index_no] = "__DELETE__" new_citations = [citation_start + year for year in years] extracted_citations.extend(new_citations) @@ -279,7 +278,7 @@ def read_document(filename): def get_matches_solo_author(text, drop_excluded_phrases = False): rx = RegexPatterns() - matches = re.findall( + matches = regex.findall( rx.letter_uppercase + rx.rest_of_word + "[\\s,(]+" + rx.years, text) matches = CitationType(matches) @@ -289,7 +288,7 @@ def get_matches_solo_author(text, drop_excluded_phrases = False): def get_matches_two_authors(text, drop_excluded_phrases = False): # The second word doesn't have to be uppercase, to catch "suradnici". 
rx = RegexPatterns() - matches = re.findall( + matches = regex.findall( rx.letter_uppercase + rx.rest_of_word + rx.phrase_and + rx.rest_of_word + "[\\s,(]+" + rx.years, text) @@ -299,7 +298,7 @@ def get_matches_two_authors(text, drop_excluded_phrases = False): def get_matches_author_et_al(text, drop_excluded_phrases = False): rx = RegexPatterns() - matches = re.findall( + matches = regex.findall( rx.letter_uppercase + rx.rest_of_word + "(?:" + rx.phrase_et_al + "|" + rx.phrase_i_sur + ")" + rx.years, text) matches = CitationType(matches) @@ -311,7 +310,7 @@ def get_matches_three_authors(text, drop_excluded_phrases = False): # To remedy some, the first letter of the first two words must be capitalised. # The last doesn't, so it catches the term "suradnici" common for multiple authors. rx = RegexPatterns() - matches = re.findall( + matches = regex.findall( rx.letter_uppercase + rx.rest_of_word + "[\\s,]+" + rx.letter_uppercase + rx.rest_of_word + rx.phrase_and + rx.rest_of_word + "[\\s,(]+" + rx.years, @@ -323,7 +322,7 @@ def get_matches_three_authors(text, drop_excluded_phrases = False): def get_matches_two_surnames(text, drop_excluded_phrases = False): # Both names must me capitalised for it to be a valid citation. rx = RegexPatterns() - matches = re.findall( + matches = regex.findall( rx.letter_uppercase + rx.rest_of_word + "[\\s]+" + rx.letter_uppercase + rx.rest_of_word + "[\\s,(]+" + rx.years, text) @@ -334,7 +333,7 @@ def get_matches_two_surnames(text, drop_excluded_phrases = False): def get_matches_two_surnames_et_al(text, drop_excluded_phrases = False): # Both names must me capitalised for it to be a valid citation. 
rx = RegexPatterns() - matches = re.findall( + matches = regex.findall( rx.letter_uppercase + rx.rest_of_word + "[\\s]+" + rx.letter_uppercase + rx.rest_of_word + "(?:" + rx.phrase_et_al + "|" + rx.phrase_i_sur + ")" + rx.years, text) diff --git a/README.md b/README.md index 081120e..ff7fb24 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ Write an Excel file containing all citations found in a document, so they can be ## Dependencies: This program was written using Python 3.9.12. It requires the following modules: -textract, xlsxwriter +textract, xlsxwriter, regex -To install them in Powershell, type "*pip install textract*", followed by "*pip install xlsxwriter*". +To install them in PowerShell, type "*pip install textract*" and press Enter. After that, run "*pip install xlsxwriter*" and "*pip install regex*". # About: ***PapersCited*** is a small Python program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. At the end, it writes all those citations in an Excel file in alphabetical order, omitting duplicate entries. diff --git a/tests/test_detecting_authors.py b/tests/test_detecting_authors.py index 87e9b03..068aa57 100644 --- a/tests/test_detecting_authors.py +++ b/tests/test_detecting_authors.py @@ -70,12 +70,12 @@ def test_detecting_two_surnames_i_suradnika(): ["Anić Babić i suradnika (2000"] def test_detecting_two_surnames_et_al(): - test_string_semicolon = "This is the facts (Naked Truth et al., 1980). Also see Hard, 1980; Facts, 1989." 
- assert PapersCited.get_matches_two_surnames_et_al(test_string_semicolon).citations ==\ - ["Naked Truth et al., 1980"] - test_croatian_i_sur = "Branjek i suradnici (1999) spominju rad Cvanjek i suradnika (1990) više puta (Dvanjek Erin i sur., 2010)" - assert PapersCited.get_matches_two_surnames_et_al(test_croatian_i_sur).citations ==\ - ["Dvanjek Erin i sur., 2010"] + test_string_semicolon = "This is the facts (Naked Truth et al., 1980). Also see Hard, 1980; Facts, 1989." + assert PapersCited.get_matches_two_surnames_et_al(test_string_semicolon).citations ==\ + ["Naked Truth et al., 1980"] + test_croatian_i_sur = "Branjek i suradnici (1999) spominju rad Cvanjek i suradnika (1990) više puta (Dvanjek Erin i sur., 2010)" + assert PapersCited.get_matches_two_surnames_et_al(test_croatian_i_sur).citations ==\ + ["Dvanjek Erin i sur., 2010"] def test_program_works_with_no_citations_found(): document = "" @@ -146,6 +146,14 @@ def test_authors_should_be_capitalised_to_match(): assert PapersCited.get_matches_two_authors(two_authors).citations == ["Bogus and Dogus, 3000"] # The second name can be lowercase to catch "suradnici". +def test_new_foreign_characters(): + text = "Bø (1999) and Yø (2000) and Så (2001)" + assert PapersCited.get_matches_solo_author(text).citations ==\ + ["Bø (1999", "Yø (2000", "Så (2001"] + text_non_alphanumeric = "Bø's (1999) research cited Yø-yoma (2000)" + assert PapersCited.get_matches_solo_author(text_non_alphanumeric).citations ==\ + ["Bø's (1999", "Yø-yoma (2000"] + def test_possesive_recognised_and_adjusted(): possesive_text = "Listen to Cohen's (1999) talk." matches = PapersCited.get_matches_solo_author(possesive_text) @@ -163,4 +171,4 @@ def test_surnames_apostrophe_s_recognised(): matches = PapersCited.get_matches_solo_author(text) matches.cleanup() assert matches.citations == ["O'Samuel 1999", "O'Sullivan 2000"] - \ No newline at end of file + \ No newline at end of file