#
import os, subprocess, sys, argparse, pikepdf, shutil, glob, time, signal, urllib.request, codecs
from pybtex.database import parse_file, parse_string, BibliographyData
from pathlib import Path
import webbrowser
import translitcodec  # registers the 'translit/*' codecs used by normalize()
import latexcodec  # registers the 'latex' codec used to decode author names below
from main import remove_special_chars
#
# python3 fetch.py --bib_path ~/paper_journal.bib --root ~/journal_root --watch_dir ~/Downloads
#
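# For each bibtex entry, the script opens a Google search for the paper title
# in the default browser, waits for a newly downloaded PDF to appear in
# --watch_dir, compresses it with Ghostscript, and files it under --root as
# main.pdf plus a single-entry main.bib in a per-paper <lastname><year> directory.
#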
def sigint_handler(sig, frame):
    print('')
    print('Interrupted')
    sys.exit(0)
signal.signal(signal.SIGINT, sigint_handler)
#
def check_valid_pdf(path):
    # A PDF counts as valid if pikepdf can open it and read its metadata.
    try:
        with pikepdf.open(path) as pdf:
            pdf.open_metadata()
    except Exception:
        return False
    return True
#
def safe_remove( path ):
    try:
        os.remove(path)
    except OSError:
        pass
#
def normalize( text ):
    return codecs.encode(text,'translit/short')
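# Illustrative example (exact output depends on translitcodec's tables):
#   normalize('müller') -> 'muller'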
#
def download( root, entry, watch_dir ):
    #
    fields = entry.fields
    persons = entry.persons
    authors = persons['author']
    lastname = remove_special_chars(authors[0].last_names[0].lower().encode("ascii","ignore").decode('latex'))
    lastname = normalize(lastname)
    dirname = lastname+str(fields['year'])
    counter = 0
    while True:
        base = os.path.join(root,dirname)
        if counter == 0:
            dirpath = base
        else:
            dirpath = f'{base}-{counter}'
        counter += 1
        bib_files = glob.glob(f'{dirpath}/*.bib')
        pdf_files = glob.glob(f'{dirpath}/*.pdf')
        if bib_files and pdf_files:
            # The candidate directory already holds a paper: compare DOI and
            # title to decide whether this entry is a duplicate.
            ref_fields = list(parse_file(bib_files[0]).entries.values())[0].fields
            if 'doi' in fields and 'doi' in ref_fields and ref_fields['doi'] == fields['doi']:
                print( f'Found duplicate DOI "{os.path.basename(dirpath)}"' )
                return
            if 'title' in fields and 'title' in ref_fields and ref_fields['title'].lower() == fields['title'].lower():
                print( f'Found duplicate title "{os.path.basename(dirpath)}"' )
                return
            else:
                print( 'Duplicate directory. Increasing counter..' )
                continue
        else:
            # Remember the current contents of the watch directory, then open a
            # Google search for the title and poll until a new PDF shows up.
            save_file_list = os.listdir(watch_dir)
            url = 'http://www.google.com/search?q={}'.format('+'.join(fields['title'].split()))
            webbrowser.open(url)
            tmp_path = None
            while True:
                for file in os.listdir(watch_dir):
                    if file.endswith('.pdf') and file not in save_file_list:
                        if ' ' in file:
                            new_file = file.replace(' ', '_')
                            shutil.move(os.path.join(watch_dir,file),os.path.join(watch_dir,new_file))
                            file = new_file
                        tmp_path = os.path.join(watch_dir,file)
                        break
                if tmp_path:
                    break
                time.sleep(1)
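            # tmp_path now points at the freshly downloaded PDF (spaces in the
            # filename were replaced with underscores so the shell command
            # below is not split on them).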
            #
            print( f'Saving to {dirpath}...' )
            os.makedirs(dirpath,exist_ok=True)
            new_path = os.path.join(dirpath,'main.pdf')
            # Recompress with Ghostscript; fall back to the original download on failure.
            cmd = f'gs -o {new_path} -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress -dNOPAUSE -dBATCH {tmp_path}'
            copy_original = False
            try:
                print(cmd)
                subprocess.check_output(cmd,stderr=subprocess.STDOUT,timeout=30,shell=True)
            except Exception as e:
                print(e)
                copy_original = True
            #
            # Keep the Ghostscript output only if the conversion succeeded, the
            # result is a valid PDF, and it is actually smaller than the download.
            if not copy_original and check_valid_pdf(new_path):
                tmp_size = Path(tmp_path).stat().st_size
                new_size = Path(new_path).stat().st_size
                if new_size < tmp_size:
                    safe_remove(tmp_path)
                else:
                    copy_original = True
            else:
                copy_original = True
            #
            if copy_original:
                safe_remove(new_path)
                shutil.move(tmp_path,new_path)
            #
            # Write a single-entry bibtex file alongside the PDF
            new_bibtex = BibliographyData({
                os.path.basename(dirpath) : entry
            })
            new_bibtex.to_file(os.path.join(dirpath,'main.bib'))
            break
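#
# Resulting layout per paper (illustrative; assumes first author "Smith", year 2020):
#   <root>/smith2020/main.pdf   compressed (or original) download
#   <root>/smith2020/main.bib   single-entry bibtex record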
#
if __name__ == '__main__':
    #
    parser = argparse.ArgumentParser()
    parser.add_argument('--bib_path', required=True, help='bibtex file path or URL')
    parser.add_argument('--root', required=True, help='root output path')
    parser.add_argument('--watch_dir', required=True, help='downloads watch directory')
    args = parser.parse_args()
    #
    # Load bibtex (from a URL or a local file)
    if args.bib_path.startswith('http://') or args.bib_path.startswith('https://'):
        raw = urllib.request.urlopen(args.bib_path).read().decode('ascii','ignore')
        bibtex = parse_string(raw, 'bibtex')
    else:
        bibtex = parse_file(args.bib_path)
    #
    # For each paper
    NUM_PAPERS = len(list(bibtex.entries.keys()))
    print( f'==== {NUM_PAPERS} papers ====')
    print( f'Download PDFs to "{args.watch_dir}"' )
    #
    for i,key in enumerate(bibtex.entries):
        #
        print('')
        print( f'{NUM_PAPERS-i} papers remaining...' )
        fields = bibtex.entries[key].fields
        print( f'title: "{fields["title"]}"' )
        #
        # Group papers by volume/number when available, otherwise by year.
        if 'volume' in fields and 'number' in fields:
            root = os.path.join(args.root,'volume',fields['volume'],fields['number'])
        elif 'year' in fields:
            root = os.path.join(args.root,'year',fields['year'])
        else:
            root = args.root  # neither available: file directly under the root
        #
        # Create directory path and download paper
        download(root,bibtex.entries[key],args.watch_dir)
    #
    print( 'Done!' )