#!/usr/bin/env python3
# The scenario:
# Someone sends you a collection of files for an upcoming meeting,
# in a variety of formats but predominantly .doc and .docx,
# one of which is an agenda for the meeting.
# The goal:
# Create a place on the website where meeting attendees can
# view the agenda and click on links to see documents,
# converted to HTML if possible, plus a zip file of everything
# that they can download to their local machine.
# Make a directory for the meeting (e.g. 2021-03-06), with a
# subdirectory 2021-03-06/orig where all the original documents go.
# If there are any HTML files, put them in 2021-03-06/html.
# Then run this script passing it one argument, the agenda file,
# e.g. fix_agenda orig/1.0_Agenda_03.06.2021.docx.
# This will convert all the .doc and .docx files using unoconv
# into a directory called html, then convert the agenda file to
# html using mammoth, with links to the right converted files.
# It will also make a zip file of everything.
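#
# Example layout before running (illustrative, names taken from the comments above):
#   2021-03-06/
#       orig/
#           1.0_Agenda_03.06.2021.docx
#           9.3_LA_Report_11.2024.docx
#           ...
# Afterward, 2021-03-06/ also contains index.html, an html/ subdirectory
# of converted files, and 2021-03-06.zip.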
from bs4 import BeautifulSoup, NavigableString
from datetime import datetime
import os, sys
from word2html import docx2html, docx2htmlfile
from difflib import SequenceMatcher
import subprocess
# uno_env = os.environ.copy()
# uno_env['HOME'] = '/tmp'
# uno_env["UNOPATH"] ="/usr/bin/libreoffice"
VERBOSE = True
# A list of extensions we might expect to find in original files.
# This is because the agenda might omit extensions, and we also can't
# assume that a dot indicates an extension.
# For example, the agenda might have '9.3_LA_Report_11.2024'
# which is expected to match '9.3_LA_Report_11.2024.docx', and you
# can't just strip off everything after the last dot because then
# you lose the '2024'. Sigh!
# '.pdf' and '.html' are included because originals may be PDFs or
# standalone HTML files, and the agenda itself may be .html.
ORIG_EXTENSIONS = [ '.doc', '.docx', '.odt',
                    '.xls', '.xlsx', '.csv',
                    '.pdf', '.html'
                  ]
# Remove extension: can't just use os.path.splitext
# because there might be dots in the filename,
# e.g. 10.1_CNM_Report_03.2021
def smart_splitext(aname):
    """Strip off any extension matching something in ORIG_EXTENSIONS.
    """
    name, ext = os.path.splitext(aname)
    if ext.lower() in ORIG_EXTENSIONS:
        return name, ext
    return aname, ''
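
# For example (illustrative):
#   smart_splitext("10.1_CNM_Report_03.2021.docx") -> ("10.1_CNM_Report_03.2021", ".docx")
#   smart_splitext("9.3_LA_Report_11.2024")        -> ("9.3_LA_Report_11.2024", "")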
def legalchars(fname):
    """Remove illegal characters from filenames"""
    return ''.join([x for x in fname if x.isalpha() or x.isdigit()
                    or x in '-_.'])
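
# e.g. legalchars("Budget proposal FY 21-23.pdf") -> "BudgetproposalFY21-23.pdf"
# (spaces are dropped; letters, digits, '-', '_' and '.' are kept).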
def fix_agenda(agenda_infile):
    # Convert agenda_infile to absolute path
    agenda_infile = os.path.abspath(agenda_infile)

    # First convert the agenda, if needed
    agenda_base, agenda_ext = smart_splitext(agenda_infile)
    if agenda_ext == '.html':
        with open(agenda_infile) as fp:
            soup = BeautifulSoup(fp, "lxml")
    elif agenda_ext == '.docx':
        soup = BeautifulSoup(docx2html(agenda_infile), "lxml")
    else:
        print("Agenda must be .html or .docx")
        sys.exit(1)
    origdir, agendafilename = os.path.split(agenda_infile)
    basedir, origdirname = os.path.split(origdir)
    if origdirname == "orig":
        htmldir = os.path.join(basedir, "html")
    elif origdirname == 'html':
        # The agenda was passed in from the html dir:
        # the originals should be in the sibling orig dir.
        htmldir = origdir
        origdir = os.path.join(basedir, "orig")
    else:
        print("Infile should be in a directory named 'orig' or 'html'")
        sys.exit(1)
    # The title comes out as /full/path/to/filename.html
    # How to change a title is completely undocumented,
    # but it turns out to be title.string.
    title = soup.title.string
    if '/' in title:
        title = os.path.basename(title)
        soup.title.string = title
    # Convert any existing links to be target="_blank"
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and not href.startswith("#"):
            link.attrs["target"] = "_blank"
    # Read the list of original filenames in origdir,
    # renaming any that have bad filename characters,
    # so that origbases stays parallel to origfiles.
    origfiles = []
    for fname in os.listdir(origdir):
        newfname = legalchars(fname)
        if newfname != fname:
            print("** Renaming '%s' to '%s'" % (fname, newfname))
            os.rename(os.path.join(origdir, fname),
                      os.path.join(origdir, newfname))
        origfiles.append(newfname)
    origbases = [ smart_splitext(f)[0] for f in origfiles ]

    # List of already existing html filenames in htmldir;
    # it will be re-read after conversions, before building the zip.
    if not os.path.exists(htmldir):
        os.mkdir(htmldir)
    htmlfiles = os.listdir(htmldir)
    htmlbases = [ smart_splitext(f)[0] for f in htmlfiles ]
    nosuchfiles = []
    cantconvert = []
    converted = []
    already_converted = []
    guesses = {}
    for em in soup.find_all('em'):
        # em_text = smart_splitext(em.text.strip())[0]
        em_text = em.text.strip()

        # Remove empty <em>s
        if not em_text:
            em.extract()
            continue

        em_text = legalchars(em_text)

        if VERBOSE:
            print("\n===", em_text)

        # Find the original file.
        # Can't just search for em_text in origbases,
        # because non-technical users don't seem to grok
        # "put the filename in the agenda", and will instead put
        # something that vaguely resembles the filename, e.g.
        # the agenda might have "Budget proposal FY 21-23" where
        # the actual filename is "45. Budget proposal FY 21-23.pdf".
        # elif em_text in origbases:
        def fuzzy_search(agendaname, filenames):
            """If agendaname is a filename referenced in the agenda,
               try to match it in the list filenames,
               returning the index into the list, or -1 if no match.
               If there's no exact match, use fuzzy search and also
               be tolerant of missing extensions.
            """
            agendaname, fileext = smart_splitext(agendaname)
            filenames_no_ext = [ os.path.splitext(f)[0] for f in filenames ]

            # If this name was already fuzzy-matched, reuse that guess,
            # but only if the guess actually appears in this filename list.
            if agendaname in guesses and guesses[agendaname] in filenames:
                return filenames.index(guesses[agendaname])

            # First try non-fuzzy, with fingers tightly crossed
            if agendaname in filenames:
                return filenames.index(agendaname)

            # No luck there, try fuzzy matches
            best_ratio = -1
            best_match = None
            for i, filename in enumerate(filenames):
                if agendaname in filename:
                    return i
                r = SequenceMatcher(None, agendaname, filename).ratio()
                if r > best_ratio:
                    best_match = i
                    best_ratio = r
            if best_ratio > .88:
                return best_match

            return -1
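
        # Illustrative example (hypothetical filenames):
        #   fuzzy_search("9.3_LA_Report_11.2024",
        #                ["9.3_LA_Report_11.2024.docx", "9.7_SNM_Report_03.2024.pdf"])
        #   returns 0 via the substring check: the agenda name has no
        #   recognized extension but is contained in the first filename.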
        def replace_em(href):
            nonlocal em, em_text
            a = soup.new_tag("a", href=href, target="_blank")
            a.string = em_text
            em.contents[0].replace_with(a)
            em_text = None
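
        # e.g. replace_em("html/9.1_CNM_Report_03.2024.html") (hypothetical name)
        # turns <em>9.1_CNM_Report_03.2024</em> into
        # <em><a href="html/9.1_CNM_Report_03.2024.html" target="_blank">9.1_CNM_Report_03.2024</a></em>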
        def append_parenthesized_orig(em, origfile):
            # Add a parenthesized link to the original document.
            # This doesn't end up getting called for every .doc* file, though.
            base, ext = smart_splitext(origfile)
            em.append(NavigableString(" (original "))
            linkfile = "orig/" + origfile
            a = soup.new_tag("a", href=linkfile, target="_blank")
            a.string = ext
            em.append(a)
            em.append(NavigableString(")"))
        index = fuzzy_search(em_text, origbases)
        htmlindex = fuzzy_search(em_text, htmlbases)

        if index < 0 and htmlindex < 0:    # No exact or fuzzy match
            print("Couldn't find a match for", em_text)
            nosuchfiles.append(em_text)
            continue

        # Remember fuzzy matches, both as a cache and for the final report
        if index >= 0 and origbases[index] != em_text:
            guesses[em_text] = origbases[index]
        # Is the html match fuzzy but the original match not?
        # e.g. if origfiles include 9.1_CNM_Report_03.2024.docx and
        # 9.7_SNM_Report_03.2024.pdf, then there won't be an HTML file
        # for SNM but fuzzy_match for SNM will match the CNM HTML,
        # which it shouldn't.
        # print("index", index, "htmlindex", htmlindex)
        # print("origbases", origbases)
        # print("htmlbases", htmlbases)
        if index >= 0 and htmlindex >= 0 and \
           origbases[index] != htmlbases[htmlindex]:
            print("%s and %s don't match: probably not the right HTML file"
                  % (origbases[index], htmlbases[htmlindex]))
            htmlindex = -1
        # Found a match by searching origbases, returning index.
        # So the actual original file is origfiles[index].
        if index >= 0:
            origfile = origfiles[index]
            origbase, ext = os.path.splitext(origfile)
        # Is there already an HTML file matching this em?
        if htmlindex >= 0:
            orightml = htmlfiles[htmlindex]

            # Is this an original HTML file, or one already converted from Word?
            if index >= 0:
                if VERBOSE:
                    print("  There's both Word and HTML for", origbase)
                try:
                    orig_mtime = os.stat(os.path.join(origdir, origfile)).st_mtime
                    html_mtime = os.stat(os.path.join(htmldir, orightml)).st_mtime
                    if orig_mtime <= html_mtime:
                        replace_em("html/" + orightml)
                        already_converted.append(origfile)
                        print("    ... The HTML was newer")
                        # Still want a link to the orig doc file, though
                        append_parenthesized_orig(em, origfile)
                        continue
                    else:
                        print("    Replacement Word file, re-converting")
                except Exception as e:
                    print("Couldn't get mtime:", e)

            # No Word file: it's just an HTML file that stands by itself
            else:
                print("standalone html file", orightml)
                replace_em("html/" + orightml)
                already_converted.append(orightml)
                continue
        # Okay, the original file isn't HTML. Is it Word?
        if ext == ".docx" or ext == ".doc":
            infile = os.path.join(origdir, origfile)
            htmlfile = origbase + ".html"
            outfile = os.path.join(htmldir, htmlfile)
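
            # e.g. orig/9.1_CNM_Report_03.2024.docx is converted to
            # html/9.1_CNM_Report_03.2024.html (illustrative name).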
            if origbase.startswith('1') and "Agenda" in origbase:
                # When converting the agenda, use mammoth because
                # there won't be colors (that we care about, anyway)
                # and it's sometimes handy to be able to edit the
                # agenda -- unoconv output is almost impossible to edit
                # without making a mistake.
                docx2htmlfile(infile, outfile)
                converted.append(origfile)
                # print("Converted the agenda %s separately" % outfile)
            else:
                # Use unoconv because it preserves colors,
                # which mammoth/word2html does not.
                #
                # For some reason, unoconv flakes out when you try to
                # call it from inside a python script, even if it
                # works just fine from the shell. The error message is:
                #   unoconv: Cannot find a suitable pyuno library
                #   and python binary combination in /usr/lib/libreoffice
                # The cure: specify the full path to both python and unoconv.
                # No one seems to understand why.
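                #
                # The call below is equivalent to running, from the shell:
                #   /usr/bin/python3 /usr/bin/unoconv -f html -T 10 -o OUTFILE INFILE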
rv = subprocess.call(["/usr/bin/python3", "/usr/bin/unoconv",
"-f", "html", "-T", "10",
"-o", outfile, infile])
if not rv:
converted.append(origfile)
else:
cantconvert.append(origfile)
# print("unoconv failed on", filename, "exit code", rv)
if os.path.exists(outfile):
htmlfile = "html/" + htmlfile
replace_em(htmlfile)
# Don't add to htmlfiles, should be already there
append_parenthesized_orig(em, origfile)
        else:
            if VERBOSE:
                print("Not word or html:", origfile)
            cantconvert.append(origfile)
            linkfile = "orig/" + origfile
            replace_em(linkfile)
    # Make sure all the originals are readable -- mutt insists on
    # saving all attachments as mode 600.
    for f in os.listdir(origdir):
        os.chmod(os.path.join(origdir, f), 0o644)
    zipname = os.path.basename(basedir)    # the base directory's name

    # Remove any zip file that was there before
    os.chdir(basedir)
    zipfile = "%s.zip" % zipname
    try:
        os.unlink(zipfile)
    except OSError:
        pass

    # Now make a zip file, starting from one level above the basedir
    os.chdir("..")
    # Re-read the html dir so that newly converted files go into the zip
    htmlfiles = os.listdir(htmldir)
    allfiles = [ os.path.join(zipname, "index.html") ] \
               + [ os.path.join(zipname, "html", f) for f in htmlfiles ] \
               + [ os.path.join(zipname, "orig", f) for f in origfiles ]
    # print("allfiles:")
    # for f in allfiles:
    #     print("  ", f)
rv = subprocess.call(["zip", os.path.join(zipname, zipfile), *allfiles])
if not rv:
print("zip file is:", zipfile)
else:
print("Couldn't save zip file")
    # Footer material
    tag = soup.new_tag("hr")
    soup.body.append(tag)

    # Last-modified date
    now = datetime.now()
    nowtext = NavigableString("Updated: " + now.strftime("%a %m/%d/%Y %H:%M"))
    soup.body.append(nowtext)
    tag = soup.new_tag("br")
    soup.body.append(tag)

    # link to the zip file:
    tag = soup.new_tag("a", href=zipfile)
    tag.string = "Zip file of everything"
    soup.body.append(tag)
    # The agenda is now ready to write out.
    agenda_out = os.path.join(basedir, "index.html")
    with open(agenda_out, "w") as outfp:
        outfp.write(soup.prettify())
    print("Wrote agenda", agenda_out)
    # Finally, print stats.
    if already_converted:
        print("\nAlready converted:")
        for f in already_converted:
            print("   ", f)
    if converted:
        print("\nConverted:")
        for f in converted:
            print("   ", f)
    if cantconvert:
        print("\nLeft untouched:")
        for f in cantconvert:
            print("   ", f)
    if guesses:
        print("\nFuzzy matches (in agenda -> actual filename):")
        for a in guesses:
            print(f"    {a} -> {guesses[a]}")
        print("BE SURE TO LOOK OVER THIS LIST!")
    if nosuchfiles:
        print("\nMissing files referenced in the agenda:")
        for f in nosuchfiles:
            print("   ", f)
    orphans = []
    for o in origfiles:
        if o not in already_converted and o not in converted \
           and o not in cantconvert:
            orphans.append(o)
    if orphans:
        orphans.sort()
        print("\nOriginal files not referenced in the agenda:")
        for o in orphans:
            print("   ", o)
if __name__ == '__main__':
    fix_agenda(sys.argv[1])