From 1e8cd6e884c586ff86f6bf4f93c78253c41ffe8e Mon Sep 17 00:00:00 2001 From: Mkranj Date: Sat, 19 Aug 2023 11:56:07 +0200 Subject: [PATCH 1/8] Change version location, update to 1.3 --- PapersCited/PapersCited.py | 2 +- PapersCited/UI/messages.py | 2 ++ PapersCited/UI/windowUI.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/PapersCited/PapersCited.py b/PapersCited/PapersCited.py index 22cb883..d5e441e 100644 --- a/PapersCited/PapersCited.py +++ b/PapersCited/PapersCited.py @@ -1,5 +1,5 @@ from UI.windowUI import main_window -from UI.windowUI import version +from UI.messages import version # -*- coding: utf-8 -*- diff --git a/PapersCited/UI/messages.py b/PapersCited/UI/messages.py index a756c7c..1dab84d 100644 --- a/PapersCited/UI/messages.py +++ b/PapersCited/UI/messages.py @@ -1,3 +1,5 @@ +version = "1.3" + break_with_lines = "--------------------" def filename_cant_be_read_message(filename): diff --git a/PapersCited/UI/windowUI.py b/PapersCited/UI/windowUI.py index 0031356..d912802 100644 --- a/PapersCited/UI/windowUI.py +++ b/PapersCited/UI/windowUI.py @@ -4,10 +4,10 @@ import UI.fileManipulation as fm import UI.messages as ms +from UI.messages import version from UI.appData import AppData # Variables ---- -version = "v.1.2.3" light_yellow = "#ffe08f" From 4cb678fd44e6d7800a4d336c7d76d16ae4a528a0 Mon Sep 17 00:00:00 2001 From: Mkranj Date: Sat, 19 Aug 2023 12:01:33 +0200 Subject: [PATCH 2/8] Simplify warning for POTENTIAL txt problems --- PapersCited/UI/messages.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/PapersCited/UI/messages.py b/PapersCited/UI/messages.py index 1dab84d..375bc16 100644 --- a/PapersCited/UI/messages.py +++ b/PapersCited/UI/messages.py @@ -41,7 +41,9 @@ def cant_write_file(filename): "\nPossible permissions issue, can you create files at that folder?" 
return(message) -reading_pdf_warning = "Warning!\nReading PDF files is not recommended and might result in inaccurate transcription.\n" -reading_txt_warning = "Warning! Reading .txt files might lead to problems with special characters. \ \nTo ensure the best format is used, backup the .txt file, then try saving it in UTF-8 or ANSI encoding. \ +reading_pdf_warning = "Warning!\ + \nReading PDF files is not recommended and might result in inaccurate transcription.\n" + +reading_txt_warning = "Warning! " + \ + "If you encounter problems reading this .txt file, backup the original file, then try saving it in UTF-8 or ANSI encoding.\ \n(\"Save as...\" dialog, \"Encoding:\" at the bottom.)\n" \ No newline at end of file From f0cb89f8f0bef9ec514e6d88fd162ea2eb435862 Mon Sep 17 00:00:00 2001 From: Mkranj Date: Sat, 19 Aug 2023 12:05:21 +0200 Subject: [PATCH 3/8] Remove warning for PDF inaccuracy read_document changes the carriage return symbols that used to cause errors --- PapersCited/UI/fileManipulation.py | 2 -- PapersCited/UI/messages.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/PapersCited/UI/fileManipulation.py b/PapersCited/UI/fileManipulation.py index 4a05402..ab9e5dc 100644 --- a/PapersCited/UI/fileManipulation.py +++ b/PapersCited/UI/fileManipulation.py @@ -20,8 +20,6 @@ def check_file(filename): warning = None file_extension = os.path.splitext(filename)[1] - if file_extension.casefold() == ".pdf": - warning = ms.reading_pdf_warning if file_extension.casefold() == ".txt": warning = ms.reading_txt_warning diff --git a/PapersCited/UI/messages.py b/PapersCited/UI/messages.py index 375bc16..7d56416 100644 --- a/PapersCited/UI/messages.py +++ b/PapersCited/UI/messages.py @@ -41,9 +41,6 @@ def cant_write_file(filename): "\nPossible permissions issue, can you create files at that folder?" 
return(message) -reading_pdf_warning = "Warning!\ - \nReading PDF files is not recommended and might result in inaccurate transcription.\n" - reading_txt_warning = "Warning! " + \ "If you encounter problems reading this .txt file, backup the original file, then try saving it in UTF-8 or ANSI encoding.\ \n(\"Save as...\" dialog, \"Encoding:\" at the bottom.)\n" \ No newline at end of file From ee3198cf39ad7e3ee884df6ea4f8ec8a18d3e32d Mon Sep 17 00:00:00 2001 From: Mkranj Date: Sat, 19 Aug 2023 12:51:18 +0200 Subject: [PATCH 4/8] Specific warning messages for doc and pdf --- PapersCited/UI/fileManipulation.py | 9 +++++---- PapersCited/UI/messages.py | 28 +++++++++++++++++++++------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/PapersCited/UI/fileManipulation.py b/PapersCited/UI/fileManipulation.py index ab9e5dc..d374ab4 100644 --- a/PapersCited/UI/fileManipulation.py +++ b/PapersCited/UI/fileManipulation.py @@ -58,26 +58,27 @@ def read_document(filename): except: raise Exception("No file selected") + file_extension = os.path.splitext(filename)[1].casefold() + try: target_document = textract.process(filename, output_encoding="utf-8-sig") except Exception as e: # get only the text of the exception error = str(e) # If the file exists, but cannot be read, an error will be raised. 
- error_message = ms.filename_cant_be_read_message(filename) + \ + error_message = ms.filename_cant_be_read_message(filename, file_extension) + \ "\nThe error message:\n" + error raise Exception(error_message) # UTF-8 encoding so it recognises foreign characters target_document = target_document.decode("utf-8-sig") - file_extension = os.path.splitext(filename)[1] - if file_extension.casefold() == ".pdf": + if file_extension == ".pdf": target_document = target_document.replace("\r\n", " ") target_document = target_document.replace("\r", "") target_document = target_document.replace("\n", " ") - if file_extension.casefold() == ".docx": + if file_extension == ".docx": footnote_text = read_docx_footnotes(filename) target_document = target_document + " \n " + footnote_text diff --git a/PapersCited/UI/messages.py b/PapersCited/UI/messages.py index 7d56416..2f507c4 100644 --- a/PapersCited/UI/messages.py +++ b/PapersCited/UI/messages.py @@ -2,13 +2,27 @@ break_with_lines = "--------------------" -def filename_cant_be_read_message(filename): - message = f"The file {filename} couldn't be read. Make sure the file is a valid textual file." + \ - "If you can regularly open it, you may be missing certain libraries:" + \ - "\nantiword for .doc (not .docx)" + \ - "\npoppler for .pdf" + \ - "\n\nPlease check 'help_with_libraries.txt' at PapersCited Github:" + \ - "https://github.com/Mkranj/PapersCited/blob/main/help_with_libraries.txt" +cant_read_doc_msg = "NOTE: An additional library is required to read .doc files." + \ + "\nThe simplest solution is to convert the file to a .docx file, then try analysing it again." + \ + "\n\nAnother solution is to setup Antiword." + \ + "\nFor more information, please check 'help_with_libraries.txt' at PapersCited Github: " + \ + "https://github.com/Mkranj/PapersCited/blob/main/help_with_libraries.txt" + +cant_read_pdf_msg = "NOTE: An additional library, poppler, is required to read .pdf files." 
+ \ "\nFor more information, please check 'help_with_libraries.txt' at PapersCited Github: " + \ "https://github.com/Mkranj/PapersCited/blob/main/help_with_libraries.txt" + \ "\nAlternatively, you can manually copy the text from the .pdf and paste it into a " + \ "supported file format, such as .docx or .txt." +def filename_cant_be_read_message(filename, extension): + message = f"The file {filename} couldn't be read. Make sure the file is a valid textual file." + + if extension == ".doc": + message = message + "\n" + cant_read_doc_msg + "\n" + + if extension == ".pdf": + message = message + "\n" + cant_read_pdf_msg + "\n" + return(message) def report_found_citations(filename, citations, wider_citations): From 68f02c4245c38fe20e431c90c9e1b48f7f31a770 Mon Sep 17 00:00:00 2001 From: Mkranj Date: Sat, 19 Aug 2023 13:02:09 +0200 Subject: [PATCH 5/8] Updated README for new version and GUI --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index c19a3e9..8175594 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,29 @@ -# PapersCited -## v1.2.3 -Create an Excel file containing all citations found in a document, so they can be used to check or build a reference list. +# PapersCited v1.3 +Find all citations mentioned in a document. Build and check your reference lists quickliy and easily. ## About: -***PapersCited*** is a Python program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. At the end, it writes all those citations in an Excel file in alphabetical order, omitting duplicate entries. -With that file, you can easily go through your reference list and note if you cited something but didn't include it in the reference list or, conversely, you have a reference that isn't cited anywhere in the article. 
It is also handy for writing a reference list from stratch. You no longer need to manually go over the whole article and note all the times you cite another source. +***PapersCited*** is a Python program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. At the end, it writes all of those citations in alphabetical order, omitting duplicate entries. You can also save them in an Excel or textual file. +Now you can easily go through your reference list and note if you cited something but didn't include it or, conversely, you wrote a reference that isn't cited anywhere in the article. It is also handy for writing a reference list from stratch. You no longer need to manually go over the whole article and note all the times you cite another source. -The first column in the Excel file is empty so you can easily mark certain citations as "OK" or "needs double-checking" when reviewing a reference list. - -**Longer citations** appear in a separate column. These encompass citations listing three authors, or authors with two surnames (e.g. *Van Selm and Jankowsky (2006)*). However, the potential for superfluous words being recognised as proper surnames is somewhat higher here, so they are displayed separately. - -Tested on Windows 10 using .doc, .docx, .txt and .pdf files. This program is appropriate for texts written in **English** and **Croatian**. Some sources may be detected incorrectly in other languages. The software is written with **APA style** citations in mind, but **Chicago style** and similar would work as well. -# Instructions: +Tested on Windows 10 using .docx, .txt, .doc and .pdf files. + +## Instructions: - Download the latest version of **PapersCited.zip** from the *Releases* tab, to the right. - Extract the archive in a folder of your choosing. 
- The newly extracted folder contains data files, an example document, and a shorcut to the Paperscited program. **Run the PapersCited shortcut**. -- When prompted, select the document you want to search for citations. - -The program creates an Excel file in the same directory as the document. The name of the file is the same as the document, with *"_citations.xlsx"* appended. **If a file with the same name as the _citations.xlsx already exists, it will be overwritten!** +- Click the button and select a file you want to analyse. +- After inspecting the results, you can save them as .xlsx or .txt via appropriate buttons. The program comes with a file called "*example.docx*" if you want to experiment with the program and preview what the output looks like. -## Solutions for potential issues: +### Excel output layout +The first column in the Excel file is empty so you can easily mark certain citations as "OK" or "needs double-checking" when reviewing a reference list. + +**Longer citations** appear in a separate column. These encompass citations listing three authors, or authors with two surnames (e.g. *Van Selm and Jankowsky (2006)*). However, the potential for superfluous words being recognised as proper surnames is somewhat higher here, so they are displayed separately. + +## Reading .doc and .pdf files: - If you get an error reading .doc or .pdf files on Windows, you might need to download additional libraries for working with these files. See [help_with_libraries.txt](https://github.com/Mkranj/PapersCited/blob/main/help_with_libraries.txt) for detailed instructions on how to do so. ## Known limitations: @@ -36,7 +36,7 @@ The program comes with a file called "*example.docx*" if you want to experiment # Running the latest version of the Python script ## Dependencies: This program was written using Python 3.9.12. 
It requires the following modules: -textract, xlsxwriter, regex, docx2python +textract, xlsxwriter, regex, docx2python, tkinter To install them (on Windows), open Powershell, type "*pip install textract*" and press Enter. After that, follow with "*pip install xlsxwriter*" and so on. From bd7ad3857c3b476a7015bcee0a12ea972b427972 Mon Sep 17 00:00:00 2001 From: Mkranj Date: Sat, 19 Aug 2023 13:18:37 +0200 Subject: [PATCH 6/8] Update About and add sponsor link --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8175594..35211be 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,16 @@ # PapersCited v1.3 -Find all citations mentioned in a document. Build and check your reference lists quickliy and easily. +Find all citations mentioned in a document. Build and check your reference lists quickly and easily. ## About: -***PapersCited*** is a Python program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. At the end, it writes all of those citations in alphabetical order, omitting duplicate entries. You can also save them in an Excel or textual file. +***PapersCited*** is a program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. At the end, it lists all of those citations in alphabetical order, omitting duplicate entries. You can also save them in an Excel or textual file. Now you can easily go through your reference list and note if you cited something but didn't include it or, conversely, you wrote a reference that isn't cited anywhere in the article. It is also handy for writing a reference list from stratch. You no longer need to manually go over the whole article and note all the times you cite another source. 
-This program is appropriate for texts written in **English** and **Croatian**. Some sources may be detected incorrectly in other languages. The software is written with **APA style** citations in mind, but **Chicago style** and similar would work as well. +The program was built with texts written in **English** and **Croatian** in mind. Some sources may be detected incorrectly in other languages. **APA style** citations are the primary target, but **Chicago style** and similar should work as well. Tested on Windows 10 using .docx, .txt, .doc and .pdf files. +Found PapersCited useful? How about [**buying me a coffee**](https://www.buymeacoffee.com/mkranj61) and supporting development? After all, coffee makes the world go round :star_struck: + ## Instructions: - Download the latest version of **PapersCited.zip** from the *Releases* tab, to the right. - Extract the archive in a folder of your choosing. From 331166f5e4aa65f1bb0190a053963acb85c586b5 Mon Sep 17 00:00:00 2001 From: Mkranj Date: Thu, 5 Oct 2023 20:03:24 +0200 Subject: [PATCH 7/8] Add Vancouver style and Windows remarks --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cca0c08..f3b7979 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,14 @@ Find all citations mentioned in a document. Build and check your reference lists ***PapersCited*** is a program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. At the end, it lists all of those citations in alphabetical order, omitting duplicate entries. You can also save them in an Excel or textual file. Now you can easily go through your reference list and note if you cited something but didn't include it or, conversely, you wrote a reference that isn't cited anywhere in the article. It is also handy for writing a reference list from stratch. 
You no longer need to manually go over the whole article and note all the times you cite another source. -The program was built with texts written in **English** and **Croatian** in mind. Some sources may be detected incorrectly in other languages. **APA style** citations are the primary target, but **Chicago style** and similar should work as well. + PapersCited covers **APA style** and **Vancouver style** citations excellently, but **Chicago style** and similar should work as well. The program was built with texts written in **English** and **Croatian** in mind. Some sources may be detected incorrectly in other languages. Tested on Windows 10 using .docx, .txt, .doc and .pdf files. Found PapersCited useful? How about [**buying me a coffee**](https://www.buymeacoffee.com/mkranj61) and supporting development? After all, coffee makes the world go round :star_struck: ## Instructions: -- Download the latest version of **PapersCited.zip** from the *Releases* tab, to the right. +- Download the latest version of **PapersCited.zip** from the *Releases* tab, to the right. *(Windows only)* - Extract the archive in a folder of your choosing. - The newly extracted folder contains data files, an example document, and a shorcut to the Paperscited program. **Run the PapersCited shortcut**. - Click the button and select a file you want to analyse. @@ -29,11 +29,11 @@ The first column in the Excel file is empty so you can easily mark certain citat - If you get an error reading .doc or .pdf files on Windows, you might need to download additional libraries for working with these files. See [help_with_libraries.txt](https://github.com/Mkranj/PapersCited/blob/main/help_with_libraries.txt) for detailed instructions on how to do so. ## Known limitations: -- Copying PDF text to clipboard might yield unexpected characters, such as *´c* instead of *č*. This depends on the individual PDF file's encoding, however, it might lead to incorrect text scanning. 
- Secondary citations ("*XX 2010, as cited in YY 2012*"). *YY 2012* is detected correctly. However, *XX 2010* also gets recorded as a primary source. *XX 2010* should not be included in the reference list. - In Croatian, different declinations of the author's surnames get detected as different authors. - Surnames with three or more words, such as van der Flier, will get recorded only as the last word or last two words - *Kappe and van der Flier (2010)* will be recorded as *Flier 2010*, **skipping the first author**! These require special attention when writing or reviewing a reference list. Multiple surnames with an "*-*", however, will be recorded correctly. - Similarly, when citing organizations, laws and other documents, it possible that only the end of the full name gets recorded. *World Health Organization (2000)* will be recorded as *Organization 2000*, with *Health Organization 2000* as a separate suggestion. (*WHO 2000* would be fine, though.) So if you are working with these kinds of sources, extra attention is needed. +- Copying text from a PDF to clipboard might yield unexpected characters, such as *´c* instead of *č*. This is a matter of the individual PDF file's encoding, however, it might lead to incorrect text scanning. --- # Running the latest version of the Python script From d0aaa27817a6d8d1b2394ebc8a9d395589ee0a26 Mon Sep 17 00:00:00 2001 From: Mkranj Date: Sat, 7 Oct 2023 11:20:00 +0200 Subject: [PATCH 8/8] Note running .py for non-Windows systems --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f3b7979..5739c3f 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Find all citations mentioned in a document. Build and check your reference lists ***PapersCited*** is a program designed to help you with **writing and reviewing reference lists** in your scientific articles. It reads through a document of your choice and takes a note every time something is cited. 
At the end, it lists all of those citations in alphabetical order, omitting duplicate entries. You can also save them in an Excel or textual file. Now you can easily go through your reference list and note if you cited something but didn't include it or, conversely, you wrote a reference that isn't cited anywhere in the article. It is also handy for writing a reference list from stratch. You no longer need to manually go over the whole article and note all the times you cite another source. - PapersCited covers **APA style** and **Vancouver style** citations excellently, but **Chicago style** and similar should work as well. The program was built with texts written in **English** and **Croatian** in mind. Some sources may be detected incorrectly in other languages. +PapersCited covers **APA style** and **Vancouver style** citations excellently, but **Chicago style** and similar should work as well. The program was built with texts written in **English** and **Croatian** in mind. Some sources may be detected incorrectly in other languages. Tested on Windows 10 using .docx, .txt, .doc and .pdf files. @@ -14,8 +14,8 @@ Found PapersCited useful? How about [**buying me a coffee**](https://www.buymeac ## Instructions: - Download the latest version of **PapersCited.zip** from the *Releases* tab, to the right. *(Windows only)* - Extract the archive in a folder of your choosing. -- The newly extracted folder contains data files, an example document, and a shorcut to the Paperscited program. **Run the PapersCited shortcut**. -- Click the button and select a file you want to analyse. +- The newly extracted folder contains an example document and the PapersCited executable. **Run PapersCited.exe**. +- Click the *Choose document* button and select a file you want to analyse. - After inspecting the results, you can save them as .xlsx or .txt via appropriate buttons. 
The program comes with a file called "*example.docx*" if you want to experiment with the program and preview what the output looks like. @@ -36,7 +36,9 @@ The first column in the Excel file is empty so you can easily mark certain citat - Copying text from a PDF to clipboard might yield unexpected characters, such as *´c* instead of *č*. This is a matter of the individual PDF file's encoding, however, it might lead to incorrect text scanning. --- -# Running the latest version of the Python script +# Running the latest version of the Python script +If you use an operating system other than Windows, or want the very latest changes in this GitHub repository, you'll need to install Python and the dependencies listed below. + ## Dependencies: This program was written using Python 3.9.12. It requires the following modules: textract, xlsxwriter, regex, docx2python, tkinter @@ -50,7 +52,7 @@ Keep the default values for all the settings **except** the checkbox asking to p After installing Miniconda, open the start menu and search for Windows Powershell, then open it. Install the two modules required by the program, as described in **Dependencies**. -With all that done, the script should run when double-clicked. If it asks you which program to open it with, choose to *look for another app on this PC* and navigate to C:\Users\\*(your username)*\miniconda3 and select *python.exe*. Now you can just double-click on the script to run it. +With all that done, the *PapersCited.py* script should run when double-clicked. If it asks you which program to open it with, choose to *look for another app on this PC* and navigate to C:\Users\\*(your username)*\miniconda3 and select *python.exe*. Now you can just double-click on the script to run it. # Lastly... Any comments, bug reports and suggestions are welcome. Happy writing!