-
Notifications
You must be signed in to change notification settings - Fork 1
/
CleanPDFCitations.py
75 lines (51 loc) · 2.18 KB
/
CleanPDFCitations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import re
import os
import argparse
from roman import toRoman
# NOTE(review): this counter is not read or updated anywhere in the visible
# part of this file — confirm nothing else relies on it before removing.
citation_counter = 1
# TODO: look for carriage returns in links and remove
def _tidy_citation(text):
    """Remove the stray whitespace that PDF line-wrapping leaves inside links.

    Whitespace next to dashes and slashes, between "https" and "://", and
    before ".com" is deleted.
    """
    text = re.sub(r"-\s", "-", text)
    text = re.sub(r"\s-", "-", text)
    text = re.sub(r"\s/", "/", text)
    text = re.sub(r"/\s", "/", text)
    text = re.sub(r"https\s://", "https://", text)
    # The dot must be escaped; an unescaped "." would also rewrite e.g. " xcom".
    return re.sub(r"\s\.com", ".com", text)


def _collect_citations(lines, report_title):
    """Group raw text lines into one cleaned string per citation.

    A line that starts with "<digits>. " opens a new citation. Lines that
    start with "<digits> <report_title>" (running page header) or with
    "Columbia Journalism School" are dropped. Every other line is glued onto
    the citation currently being collected.
    """
    cite_pattern = re.compile(r"\d+\.\s")
    # The title is literal text, so escape it before embedding it in a regex.
    page_pattern = re.compile(r"\d+\s" + re.escape(report_title))
    title_pattern = re.compile(r"Columbia Journalism School")

    citations = []
    pending = []
    for line_num, raw_line in enumerate(lines):
        # Strip only the line terminator; slicing off the last character would
        # eat real text on a final line that has no trailing newline.
        line = raw_line.rstrip("\r\n")
        if line_num == 0:
            # The first line always opens the first citation. Skip the checks
            # below so it is not appended (and emitted) a second time.
            pending.append(line)
            continue
        if cite_pattern.match(raw_line):
            if pending:
                citations.append(_tidy_citation("".join(pending)))
            pending = [line]
        elif not page_pattern.match(raw_line) and not title_pattern.match(raw_line):
            # Not the start of a citation: a continuation of the current one.
            pending.append(line)
    # Flush the citation still being collected when the input ends; without
    # this the last citation in the file would be lost.
    if pending:
        citations.append(_tidy_citation("".join(pending)))
    return citations


def main(argv=None):
    """Clean a citations list that was copied out of a PDF.

    Reads the text file named on the command line, joins the wrapped lines of
    each citation back into a single line, drops page-header lines, repairs
    whitespace inside links, and writes the result (under a "Citations"
    heading, one blank line between citations) to a sibling file.

    Command-line arguments:
        filename: path of the text file to clean.
        report_title: report title as it appears in the running page header;
            lines of the form "<page number> <report_title>" are discarded.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv.
    """
    arg_parser = argparse.ArgumentParser(description="Remove carriage returns and excess text in citations list from PDF.")
    arg_parser.add_argument("filename")
    arg_parser.add_argument("report_title")
    args = arg_parser.parse_args(argv)

    target_file = args.filename
    # NOTE: the output name is built by dropping the last three characters of
    # the input path (so "cites.txt" becomes "cites._clean.txt"). Kept as-is
    # so existing workflows that expect this path continue to work.
    clean_path = target_file[:-3] + "_clean.txt"

    # Read the input before creating the output so that a missing or
    # unreadable input does not leave a stray output file behind. Plain "r" is
    # used because the "U" mode flag was removed in Python 3.11; text mode
    # already translates all newline styles to "\n" in Python 3.
    with open(target_file, "r") as source_stream:
        source_lines = source_stream.readlines()

    citations = _collect_citations(source_lines, args.report_title)

    with open(clean_path, "w") as clean_file:
        clean_file.write("Citations\n======\n\n")
        for citation in citations:
            clean_file.write(citation + "\n\n")
# Run the cleaner only when this file is executed as a script, so that
# importing the module does not parse command-line arguments or touch files.
if __name__ == "__main__":
    main()