-
Notifications
You must be signed in to change notification settings - Fork 1
/
ifmap.py
1371 lines (1164 loc) · 47.6 KB
/
ifmap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
import datetime
import hashlib
import json
import optparse
import os
import os.path
import re
import sys
import time
import urllib.parse
import xml.etree
import xml.etree.ElementTree
from collections import ChainMap, OrderedDict

import markdown
import markdown.inlinepatterns
import markdown.extensions

from jinja2 import Environment, FileSystemLoader, select_autoescape
from jinja2.ext import Extension
# Name of the root directory of the Archive tree.
ROOTNAME = 'if-archive'
# Destination directory for generated index files.
# NOTE(review): assigned at runtime outside this excerpt (presumably
# from the --dest option).
DESTDIR = None
# "Now" timestamp; presumably set from --curdate or the current time,
# outside this excerpt -- confirm against the main entry point.
curdate = None
# Cutoff for incremental builds; presumably derived from --since,
# outside this excerpt.
dirsince = None
# Command-line option definitions. (Parsing happens outside this
# excerpt; the parsed result is referenced throughout as "opts".)
popt = optparse.OptionParser(usage='ifmap.py')
popt.add_option('--index',
    action='store', dest='indexpath',
    help='pathname of Master-Index')
popt.add_option('--src',
    action='store', dest='libdir',
    help='pathname of directory containing template files')
popt.add_option('--tree',
    action='store', dest='treedir',
    help='pathname of directory containing archive files')
popt.add_option('--dest',
    action='store', dest='destdir', default='indexes',
    help='directory to write index files (relative to --tree; default "indexes")')
popt.add_option('--meta',
    action='store', dest='metadir', default='metadata',
    help='directory to write metadata files (relative to --tree; default "metadata")')
popt.add_option('-v', '--verbose',
    action='count', dest='verbose', default=0,
    help='print verbose output (repeat for more)')
popt.add_option('--curdate',
    action='store', dest='curdate', metavar='ISODATE',
    help='timestamp to use as "now" (for testing)')
popt.add_option('--since',
    action='store', dest='sincefile',
    help='only build index/metadata for directories changed since this file')
class DirList:
    """DirList: A list of directories, loaded from a source file.

    The file (found in opts.libdir) contains one entry per line;
    blank lines are skipped. Entries are kept both as an ordered list
    (self.ls) and as a set (self.set) for fast membership tests.
    If the file is missing or unreadable, the list is silently left
    empty -- that is deliberate best-effort behavior.
    """
    def __init__(self, filename):
        self.ls = []
        self.set = set()
        try:
            filename = os.path.join(opts.libdir, filename)
            fl = open(filename, encoding='utf-8')
        except Exception:
            # Narrowed from a bare "except:" so KeyboardInterrupt and
            # SystemExit are no longer swallowed. A missing/unreadable
            # file still just yields an empty list.
            return
        # Context manager guarantees the handle is closed even if a
        # read fails partway through.
        with fl:
            for ln in fl:
                ln = ln.strip()
                if not ln:
                    continue
                self.ls.append(ln)
                self.set.add(ln)
class NoIndexEntry(DirList):
    """NoIndexEntry: the list of directories where missing index
    entries are acceptable.

    The logic here is a bit twisty. Normally, finding a file that no
    Index file mentions produces a warning. The "no-index-entry" file
    (in libdir) lists files and directories exempt from that check
    (no warning is printed and files are never excluded). This covers
    directories full of boring files (like info/ifdb) and directories
    whose contents churn constantly (like unprocessed).
    """
    def __init__(self):
        DirList.__init__(self, 'no-index-entry')

    def check(self, path):
        """Return True if path -- a file found in the treedir but never
        mentioned in any Index file -- or any prefix of it appears in
        our list.
        """
        return any(path.startswith(entry) for entry in self.ls)
class FileHasher:
    """FileHasher: A module which can extract hashes of files.

    Since hashing is this script's slowest task, we keep a cache of
    checksums. The cache file has a very simple tab-separated format:
       size mtime md5 sha512 filename
    We only use a cache entry if the size and mtime both match. (So if a
    file is updated, we'll recalculate.)
    We only ever append to the cache file. So if a file is updated, we
    wind up with redundant lines in the cache. That's fine; the latest
    line is the one that counts. But it might be a good idea to delete
    the cache file every couple of years to tidy up.
    """
    def __init__(self):
        # Maps filenames to (size, timestamp, md5, sha512).
        self.cache = {}
        # Create the cache file if it doesn't exist.
        self.cachefile = os.path.join(opts.treedir, 'checksum-cache.txt')
        if not os.path.exists(self.cachefile):
            with open(self.cachefile, 'w', encoding='utf-8'):
                pass
        pattern = re.compile(r'^([0-9]+)\s([0-9]+)\s([0-9a-f]+)\s([0-9a-f]+)\s(.*)$')
        # Load every cached entry. Later (redundant) lines overwrite
        # earlier ones, so the newest entry for a file wins.
        # (Files are now opened with "with" so handles are closed even
        # if an exception interrupts the read -- the original leaked
        # them in that case.)
        with open(self.cachefile, encoding='utf-8') as fl:
            for ln in fl:
                match = pattern.match(ln.rstrip())
                if match:
                    size = int(match.group(1))
                    timestamp = int(match.group(2))
                    md5 = match.group(3)
                    sha512 = match.group(4)
                    filename = match.group(5)
                    self.cache[filename] = (size, timestamp, md5, sha512)

    def get_hashes(self, filename, size, timestamp):
        """Return (md5, sha512) hex digests for filename.
        Uses the cache when both size and timestamp match; otherwise
        computes the hashes, caches them in memory, and appends them
        to the cache file.
        """
        if filename in self.cache:
            (cachesize, cachetimestamp, md5, sha512) = self.cache[filename]
            if size == cachesize and timestamp == cachetimestamp:
                return (md5, sha512)
        if opts.verbose:
            print('Computing hashes for %s' % (filename,))
        (md5, sha512) = self.calculate_hashes(filename)
        self.cache[filename] = (size, timestamp, md5, sha512)
        with open(self.cachefile, 'a', encoding='utf-8') as fl:
            fl.write('%d\t%d\t%s\t%s\t%s\n' % (size, timestamp, md5, sha512, filename))
        return (md5, sha512)

    def calculate_hashes(self, filename):
        """Read filename and return its (md5, sha512) hex digests.
        Reads in chunks so huge archive files don't occupy memory.
        """
        accum_md5 = hashlib.md5()
        accum_sha512 = hashlib.sha512()
        # 64 KB chunks instead of the original 1 KB: identical output,
        # far fewer Python-level loop iterations on big files.
        with open(filename, 'rb') as fl:
            while True:
                dat = fl.read(65536)
                if not dat:
                    break
                accum_md5.update(dat)
                accum_sha512.update(dat)
        return (accum_md5.hexdigest(), accum_sha512.hexdigest())
class SafeWriter:
    """SafeWriter: a class which writes a file atomically.

    The pattern: open a temporary file, stream all output into it,
    then (on resolve) close it and move it over the final pathname in
    one atomic step. Readers of finalname never see a half-written
    file.
    """
    def __init__(self, tempname, finalname):
        # Open first: if the open fails, no partial state is recorded.
        self.fl = open(tempname, 'w', encoding='utf-8')
        self.tempname = tempname
        self.finalname = finalname

    def stream(self):
        """Return the writable file object for the temporary file."""
        return self.fl

    def resolve(self):
        """Close the temp file and atomically rename it into place."""
        self.fl.close()
        self.fl = None
        os.replace(self.tempname, self.finalname)
def read_lib_file(filename, default=''):
    """Read a simple text file from the lib directory. Return it as a
    string.
    If filename is None (or empty), return the default string instead.

    The file is opened via a context manager so the handle is closed
    even if the read raises (the original leaked it in that case).
    """
    if not filename:
        return default
    with open(os.path.join(opts.libdir, filename), encoding='utf-8') as fl:
        return fl.read()
def is_string_nonwhite(val):
    """Return True if val contains any non-whitespace character,
    False if it is empty or all whitespace.
    """
    return val.strip() != ''
def relroot_for_dirname(val):
    """For a directory, return the relative URL which climbs back to
    the root. "if-archive/games" maps to "../../..".
    """
    # One ".." per slash in the path, plus two more (matching the
    # original's constant '../..' prefix).
    depth = val.count('/')
    return '/'.join(['..'] * (depth + 2))
def isodate(val):
    """Convert a timestamp (int or numeric string) to an RFC 822
    date string in GMT.
    """
    return time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime(int(val)))
def pluralize(val, singular='', plural='s'):
    """Return singular if val is 1 (or the string '1'), else plural.
    Handy for building messages like "%d file%s".
    """
    return singular if val in (1, '1') else plural
# Matches any single character NOT in the safe set [a-zA-Z0-9_.,;:()@/-].
filehash_pattern = re.compile('([^a-zA-Z0-9_.,;:()@/-])')

def filehash_escaper(match):
    # Encode the character as "=XX=" where XX is its uppercase hex code.
    return '=%02X=' % (ord(match.group(1)),)

def filehash(val):
    """Escape a filename so it can appear in a DOM id or URL fragment
    (dir#filename). This also works on full pathnames.
    (Nothing in the system needs to reverse this mapping, but it
    should be unique.)
    The character set is a bit arbitrary. Almost any non-whitespace
    character is legal in those domains if HTML- or URL-escaped; but
    dir#file URLs get passed around a lot, so it's worth encoding
    Unicode and the fussier punctuation up front.
    """
    return filehash_pattern.sub(filehash_escaper, val)
# All ASCII characters except <&> (the ranges deliberately skip those three).
htmlable_pattern = re.compile("[ -%'-;=?-~]+")
html_entities = {
    # Newlines and tabs pass through unencoded.
    '\n': '\n', '\t': '\t',
    # The classic three HTML characters that must be escaped.
    # (Bug fix: this table previously mapped each character to itself,
    # so escape_html_string left '&', '<', and '>' unescaped -- the
    # entity strings had evidently been HTML-decoded at some point.)
    '&': '&amp;', '<': '&lt;', '>': '&gt;',
    # We could add more classic accented characters, but not really important.
    # Actually, if we do, we'd have to distinguish HTML escaping from
    # XML escaping. So let's not.
}
def escape_html_string(val):
    """Apply the basic HTML/XML &-escapes to a string. Also &#x...;
    escapes for characters outside printable ASCII.
    """
    res = []
    pos = 0
    while pos < len(val):
        # Grab the longest run of characters needing no escape at all.
        match = htmlable_pattern.match(val, pos=pos)
        if match:
            res.append(match.group())
            pos = match.end()
        else:
            ch = val[pos]
            ent = html_entities.get(ch)
            if ent:
                res.append(ent)
            else:
                # Anything else becomes a hex character reference.
                res.append('&#x%X;' % (ord(ch),))
            pos += 1
    return ''.join(res)
class InternalLinkProc(markdown.inlinepatterns.InlineProcessor):
    """Inline processor that turns "</if-archive...>" regex matches
    into <a> elements pointing at Archive index pages or raw files.
    Registered by InternalLinkExt; see that class for the accepted
    link syntax.
    NOTE(review): uses xml.etree.ElementTree, but the file only does
    "import xml.etree", which does not by itself load the submodule
    (in practice the markdown package imports it) -- confirm imports.
    """
    def handleMatch(self, m, data):
        # m.group(1) is the path after "/if-archive", e.g. "/games/"
        # or "/games#foo.z5".
        val = m.group(1)
        if '#' in val:
            # The hash case. We presume the pre-hash part is a directory.
            val, _, dfrag = val.rpartition('#')
            if not val.endswith('/'):
                val += '/'
        else:
            dfrag = None
        if val == '' or val == '/':
            # Bare "</if-archive>" or "</if-archive/>": the root index.
            val = 'if-archive'
            link = '/indexes/if-archive'
        elif val.endswith('/'):
            # Directory reference: link to its index page.
            link = '/indexes/if-archive'+val
            val = val[1:] # remove slash
        else:
            # Plain file reference: link to the file itself.
            link = '/if-archive'+val
            val = val[1:] # remove slash
        link = urllib.parse.quote(link)
        if dfrag:
            # Note that the fragment is *not* urlquoted
            val = '%s%s' % (val, dfrag,)
            link += '#%s' % (filehash(dfrag),)
        # Build the <a> element; val is the display text.
        el = xml.etree.ElementTree.Element('a')
        el.text = val
        el.set('href', link)
        # Return the element plus the span of input it consumed.
        return el, m.start(0), m.end(0)
class InternalLinkExt(markdown.extensions.Extension):
    """Markdown extension: convert "</if-archive/foo/>",
    "</if-archive/foo/bar.txt>", "</if-archive/foo#bar.txt>"
    into Archive internal links. (Server-relative but path-absolute.)

    "<dir/file>" links to the file itself; "<dir#file>" (or
    "<dir/#file>") links to the file's entry on its index page;
    "<dir/>" links to the index page of the named directory. ("<dir>"
    links to the raw directory, if you want that for some reason.)

    Ordinary URL links like "<http://foo.com>" are unaffected; they
    are handled as regular autolinks.

    Minor bug: for regex reasons, this mishandles filenames that
    contain ">" or "#".
    """
    def extendMarkdown(self, md):
        # Register with priority 175 so this pattern is tried ahead of
        # lower-priority inline patterns.
        pattern = r'</if-archive([^>]*)>'
        md.inlinePatterns.register(InternalLinkProc(pattern, md), 'intlink', 175)
def findfile(path):
    """Locate the File object for a given pathname.
    This is a debugging function; call it after the global dirmap
    has been created.
    """
    (dirname, filename) = os.path.split(path)
    if not dirname.startswith('if-archive'):
        # Accept paths given relative to the Archive root.
        dirname = os.path.join('if-archive', dirname) if dirname else 'if-archive'
    return dirmap[dirname].files[filename]
class ArchiveTree:
    """ArchiveTree: The big directory map.

    Wraps a dict mapping full directory names ("if-archive/games")
    to Directory objects.
    """
    def __init__(self):
        self.dirmap = {}

    def get_directory(self, dirname, oradd=False):
        """Get a Directory by name (e.g. "if-archive/games").
        If oradd is true, a missing Directory is created on demand.
        Otherwise a missing Directory just returns None.
        """
        found = self.dirmap.get(dirname)
        if found:
            return found
        if not oradd:
            return None
        newdir = Directory(dirname)
        self.dirmap[dirname] = newdir
        return newdir

    def get_file_by_path(self, path, ordir=False):
        """Get a File by its full path (e.g. "if-archive/games/foo.z5").
        If not found, return None.
        If the File turns out to be a directory and ordir is true,
        we return the Directory instead. Otherwise we return the
        File entry in the parent dir.
        """
        dirname, _, filename = path.rpartition('/')
        if not dirname or not filename:
            return None
        parent = self.dirmap.get(dirname)
        if parent is None:
            return None
        found = parent.files.get(filename)
        if found is None:
            return None
        if ordir and found.isdir:
            return self.dirmap.get(found.path)
        return found
class Directory:
    """Directory: one directory in the big directory map.

    Holds the directory's files, subdirectories, accumulated metadata,
    and a submap of template values (see putkey/getkey).
    """
    def __init__(self, dirname):
        self.dir = dirname
        self.submap = {}
        self.putkey('dir', dirname)
        pos = dirname.rfind('/')
        if pos < 0:
            # Root-level directory: no parent.
            self.parentdirname = None
            self.barename = dirname
        else:
            parentdirname = dirname[ 0 : pos ]
            self.parentdirname = parentdirname
            self.barename = dirname[ pos+1 : ]
            self.putkey('parentdir', parentdirname)
        # To be filled in later
        self.lastchange = 0
        self.doit = True
        self.files = {}
        self.subdirs = {}
        self.parentdir = None
        self.parentdescs = OrderedDict() # xmldescs really
        self.metadata = OrderedDict()
    def __repr__(self):
        return '<Directory %s>' % (self.dir,)
    def getkey(self, key, default=None):
        """Look up a template value.
        (Bug fix: the default argument was previously ignored; a miss
        always returned None regardless of the caller's default.)
        """
        return self.submap.get(key, default)
    def putkey(self, key, val):
        self.submap[key] = val
    def getitems(self, isdir=False, display=True):
        """Return this directory's entries, filtered to files or
        subdirectories according to isdir.
        """
        ls = list(self.files.values())
        if display:
            # When displaying, symlinks and deep refs to directories all
            # count as directories.
            ls = [ file for file in ls if file.isdir == isdir ]
        else:
            # For XML cataloging, symlinks are always files. Deep refs
            # are skipped entirely, sorry.
            ls = [ file for file in ls if not file.isdeep and (file.isdir and not file.islink) == isdir ]
        return ls
# A metadata line looks like "key:" (optionally indented); indented
# lines continue the previous metadata entry.
metadata_pattern = re.compile('^[ ]*[a-zA-Z0-9_-]+:')
# File suffixes that get an Unbox link by default.
unbox_suffix_pattern = re.compile(r'\.(tar\.gz|tgz|tar\.z|zip)$', re.IGNORECASE)

def stripmetadata(lines):
    """Given a list of lines, drop the leading metadata lines (the
    initial run of "key:value" lines and their indented continuations).
    Return the remaining lines joined into one string, ending with
    exactly one newline.
    """
    pos = 0
    for ln in lines:
        if not ln.strip():
            break
        if not metadata_pattern.match(ln) and not ln.startswith(' '):
            break
        pos += 1
    body = '\n'.join(lines[pos:])
    return body.rstrip() + '\n'
def deepsplit(ls):
    """Split a list of File objects into (non-deep, deep) lists.
    (A "deep" entry is one whose name contains a slash; see File.)
    """
    shallow = [ file for file in ls if not file.isdeep ]
    deep = [ file for file in ls if file.isdeep ]
    return shallow, deep
def merge_in_metadata(dest, src):
    """Copy metadata entries from src into dest, skipping values
    already present under the same key.
    Both arguments should be OrderedDicts of key -> list of values.
    """
    for key, srcls in src.items():
        destls = dest.setdefault(key, [])
        for val in srcls:
            if val not in destls:
                destls.append(val)
class File:
    """File: one entry in a directory.
    The name is a bit of a misnomer; this could represent a file,
    symlink, or subdirectory.
    (There is no global file list. You have to look at dir.files for each
    directory in dirmap.)
    """
    def __init__(self, filename, parentdir, isdir=False, islink=False):
        self.submap = {}
        self.parentdir = parentdir
        # Place into the parent directory.
        parentdir.files[filename] = self
        self.name = filename
        self.path = parentdir.dir+'/'+filename
        self.parentdescs = OrderedDict() # xmldescs really
        self.metadata = OrderedDict()
        self.isdir = isdir
        self.islink = islink
        # A "deep" entry is a compound reference like "Comp/Games".
        self.isdeep = ('/' in filename)
        # Symlink File entries elsewhere that point at this file.
        self.backsymlinks = []
        # Set later: seen in the tree scan / in Master-Index.
        self.intree = False
        self.inmaster = False
        self.putkey('name', filename)
        self.putkey('dir', parentdir.dir)
        self.putkey('path', self.path)
        if not islink:
            if isdir:
                self.putkey('isdir', True)
        else:
            self.putkey('islink', True)
            if isdir:
                self.putkey('islinkdir', True)
            else:
                self.putkey('islinkfile', True)
    def __repr__(self):
        linkstr = ' (link)' if self.islink else ''
        dirstr = ' (dir)' if self.isdir else ''
        return '<File %s%s%s>' % (self.name, linkstr, dirstr,)
    def complete(self, desclines):
        """Take the accumulated description text and stick it into this
        File object. Metadata lines at the top of the description are
        split off into self.metadata.
        (convertermeta is a module-level Markdown converter with the
        Meta extension, defined outside this excerpt.)
        """
        if desclines:
            val = '\n'.join(desclines)
            filestr = convertermeta.convert(val)
            for (mkey, mls) in convertermeta.Meta.items():
                self.metadata[mkey] = list(mls)
            convertermeta.Meta.clear()
            ### sort metadata?
            self.putkey('desc', filestr)
            self.putkey('hasdesc', is_string_nonwhite(filestr))
            # Remove metadata lines before generating XML.
            descstr = stripmetadata(desclines)
            self.putkey('xmldesc', descstr)
            self.putkey('hasxmldesc', is_string_nonwhite(descstr))
    def getkey(self, key, default=None):
        """Look up a template value.
        (Bug fix: the default argument was previously ignored; a miss
        always returned None regardless of the caller's default.)
        """
        return self.submap.get(key, default)
    def putkey(self, key, val):
        self.submap[key] = val
    def getmetadata_string(self, key):
        """Return a metadata value as a string. If there are multiple
        values, just use the first. Returns None if the key is absent
        or has no values.
        """
        if key not in self.metadata or not self.metadata[key]:
            return None
        return self.metadata[key][0]
def parse_master_index(indexpath, archtree):
    """Read through the Master-Index file and create directories and files.

    Format, as handled below: a line "# if-archive/path:" starts a
    directory block; following lines form the directory header until a
    line starting with "##" begins a file entry; subsequent lines
    accumulate as that file's description. Markdown conversion happens
    when each block is finished, via the module-level convertermeta
    converter (defined outside this excerpt).
    """
    if opts.verbose:
        print('Reading Master-Index...')
    # "# if-archive/...:" opens a directory block.
    dirname_pattern = re.compile('^#[ ]*(%s.*):$' % (re.escape(ROOTNAME),))
    # "##" (but not "###") opens a file entry.
    filename_pattern = re.compile('^##[^#]')
    # A line made entirely of dash-like separator characters.
    dashline_pattern = re.compile('^[ ]*[-+=#*]+[ -+=#*]*$')
    dir = None
    direntryset = None
    file = None
    filedesclines = None
    inheader = True
    headerlines = None
    infl = open(indexpath, encoding='utf-8')
    done = False
    while not done:
        ln = infl.readline()
        if not ln:
            # EOF: run the block-finishing code one last time.
            done = True
            ln = None
            match = None
        else:
            ln = ln.rstrip()
            match = dirname_pattern.match(ln)
        if done or match:
            # End of a directory block or end of file.
            # Finish constructing the dir entry.
            if dir:
                if file:
                    # Also have to finish constructing the file entry.
                    file.complete(filedesclines)
                    file = None
                dirname = dir.dir
                if opts.verbose > 1:
                    print('Finishing %s...' % (dirname,))
                headerstr = '\n'.join(headerlines)
                headerstr = headerstr.rstrip() + '\n'
                # Now headerstr starts with zero newlines and ends
                # with one newline.
                anyheader = bool(headerstr.strip())
                dir.putkey('hasdesc', anyheader)
                dir.putkey('hasxmldesc', anyheader)
                if anyheader:
                    # Convert Markdown to HTML.
                    val = convertermeta.convert(headerstr)
                    # Metadata parsed by the Meta extension goes into
                    # the directory's metadata map.
                    for (mkey, mls) in convertermeta.Meta.items():
                        dir.metadata[mkey] = list(mls)
                    convertermeta.Meta.clear()
                    ### sort metadata?
                    dir.putkey('header', val)
                    # For XML, we just escape.
                    val = stripmetadata(headerstr.split('\n'))
                    dir.putkey('xmlheader', val)
                dir = None
            if not done:
                # Beginning of a directory block.
                assert(match is not None)
                dirname = match.group(1)
                if opts.verbose > 1:
                    print('Starting %s...' % (dirname,))
                dir = archtree.get_directory(dirname, oradd=True)
                direntryset = set()
                filedesclines = None
                inheader = True
                headerlines = []
            continue
        # We can't do any work outside of a directory block.
        if dir is None:
            continue
        # Skip any line which is entirely dashes (or dash-like
        # characters). But we don't skip blank lines this way.
        if dashline_pattern.match(ln):
            continue
        bx = ln
        if inheader:
            if not filename_pattern.match(bx):
                # Further header lines become part of headerlines.
                headerlines.append(bx)
                continue
            # The header ends when we find a line starting with "##".
            inheader = False
        if filename_pattern.match(bx):
            # Start of a new file block.
            if file:
                # Finish constructing the file in progress.
                file.complete(filedesclines)
                file = None
            # Set up the new file, including a fresh filedesclines
            # accumulator.
            # (If the file already exists, then it was found in the tree;
            # we'll add to its entry. If not, we create a new entry,
            # presumed to be a regular file.)
            filename = bx[2:].strip()
            bx = ''
            filedesclines = []
            if filename in direntryset:
                sys.stderr.write('Duplicate Index entry: "%s" in %s\n' % (filename, dir.dir,))
            direntryset.add(filename)
            if '/' not in filename:
                file = dir.files.get(filename)
                if file is None:
                    file = File(filename, dir)
                file.inmaster = True
            else:
                # Distant file reference, like "Comp/Games"
                reldir, _, relfile = filename.rpartition('/')
                reldir = dir.dir+'/'+reldir
                rel = archtree.get_directory(reldir, oradd=False)
                if not rel:
                    sys.stderr.write('Compound file entry refers to a bad directory: "%s" in %s\n' % (filename, dir.dir,))
                    continue
                relfile = rel.files.get(relfile)
                if not relfile:
                    sys.stderr.write('Compound file entry refers to a bad file: "%s" in %s\n' % (filename, dir.dir,))
                    continue
                relfile.inmaster = True
                file = dir.files.get(filename)
                if file is not None:
                    sys.stderr.write('Compound file entry appears twice: "%s" in %s\n' % (filename, dir.dir,))
                    continue
                # Create a deep File entry mirroring the real target's
                # dir/link flags.
                file = File(filename, dir, isdir=relfile.isdir, islink=relfile.islink)
                if file.isdir:
                    file.putkey('linkdir', dir.dir+'/'+filename)
            continue
        # Continuing a file block.
        filedesclines.append(bx)
    # Finished reading Master-Index.
    infl.close()
def parse_directory_tree(treedir, archtree):
    """Do a scan of the actual file tree and create directories and
    files. We also take the opportunity to scan file dates and sizes.
    (Relies on the module-level hasher object for checksums, defined
    outside this excerpt.)
    """
    if opts.verbose:
        print('Walking directory tree...')
    def scan_directory(dirname, parentlist=None, parentdir=None):
        """Internal recursive function.
        (The parentlist/parentdir arguments are supplied by the
        recursive call but never read in this body.)
        """
        if opts.verbose > 1:
            print('Scanning %s...' % (dirname,))
        dir = archtree.get_directory(dirname, oradd=True)
        pathname = os.path.join(treedir, dirname)
        sta = os.stat(pathname)
        if sta.st_mtime > dir.lastchange:
            # Get the directory mod time.
            dir.lastchange = sta.st_mtime
        for ent in os.scandir(pathname):
            if ent.name.startswith('.'):
                # Skip dot files entirely.
                continue
            sta = ent.stat(follow_symlinks=False)
            dirname2 = os.path.join(dirname, ent.name)
            pathname = os.path.join(treedir, dirname, ent.name)
            if ent.is_symlink():
                linkname = os.readlink(ent.path)
                # Symlink destinations should always be relative.
                if linkname.endswith('/'):
                    linkname = linkname[0:-1]
                # Stat the link target for dates.
                sta2 = ent.stat(follow_symlinks=True)
                if ent.is_file(follow_symlinks=True):
                    # Symlink to a file.
                    file = dir.files.get(ent.name)
                    if file is None:
                        file = File(ent.name, dir, islink=True, isdir=False)
                    file.intree = True
                    file.putkey('linkpath', linkname)
                    nlinkpath = os.path.normpath(os.path.join(dir.dir, linkname))
                    nlinkdir, nlinkfile = os.path.split(nlinkpath)
                    # nlinkfile is normally the same as ent.name, but we don't rely on that.
                    file.putkey('nlinkpath', nlinkpath)
                    file.putkey('nlinkdir', nlinkdir)
                    file.putkey('nlinkfile', nlinkfile)
                    # Dates come from the link target, not the link.
                    file.putkey('date', str(int(sta2.st_mtime)))
                    tmdat = time.gmtime(sta2.st_mtime)
                    file.putkey('datestr', time.strftime('%d-%b-%Y', tmdat))
                elif ent.is_dir(follow_symlinks=True):
                    # Symlink to a directory.
                    targetname = os.path.normpath(os.path.join(dirname, linkname))
                    file = dir.files.get(ent.name)
                    if file is None:
                        file = File(ent.name, dir, islink=True, isdir=True)
                    #file.complete(['Symlink to '+targetname])
                    file.intree = True
                    file.putkey('linkdir', targetname)
                continue
            if ent.is_file():
                if sta.st_mtime > dir.lastchange:
                    # All files, including Index, count towards lastchange
                    dir.lastchange = sta.st_mtime
                if ent.name == 'Index':
                    # But we don't create a File entry for Index
                    continue
                if ent.name == 'Index~':
                    # We also skip Index~, because I edit files in Emacs
                    # and it's a nuisance deleting the leftovers.
                    continue
                file = dir.files.get(ent.name)
                if file is None:
                    file = File(ent.name, dir)
                file.intree = True
                file.putkey('filesize', str(sta.st_size))
                file.putkey('date', str(int(sta.st_mtime)))
                tmdat = time.gmtime(sta.st_mtime)
                file.putkey('datestr', time.strftime('%d-%b-%Y', tmdat))
                # Checksums come from the (cached) global hasher.
                hash_md5, hash_sha512 = hasher.get_hashes(pathname, sta.st_size, int(sta.st_mtime))
                file.putkey('md5', hash_md5)
                file.putkey('sha512', hash_sha512)
                continue
            if ent.is_dir():
                if ent.name == 'lost+found':
                    # This occurs in the new storage volume but we skip it.
                    continue
                dir2 = archtree.get_directory(dirname2, oradd=True)
                file = dir.files.get(ent.name)
                if file is None:
                    file = File(ent.name, dir, isdir=True)
                file.putkey('linkdir', dirname2)
                file.intree = True
                # Recurse into the subdirectory.
                scan_directory(dirname2, dir.files, ent.name)
                continue
        # End of internal scan_directory function.
    # Call the above function recursively.
    scan_directory(ROOTNAME)
def construct_archtree(indexpath, treedir):
    """Parse the Master-Index file, and then go through the directory
    tree to find more files. Return all the known directories as an
    ArchiveTree.
    Either or both arguments may be None. At a bare minimum, this always
    returns the root directory.
    """
    archtree = ArchiveTree()
    # Ensure the root directory always exists.
    archtree.get_directory(ROOTNAME, oradd=True)
    if treedir:
        parse_directory_tree(treedir, archtree)
    if indexpath:
        parse_master_index(indexpath, archtree)
    if opts.verbose:
        print('Creating subdirectory lists and counts...')
    # Connect up each directory to its parent. When we're done, every
    # dir will have its subdirs list.
    for dir in archtree.dirmap.values():
        if dir.parentdirname:
            dir2 = archtree.get_directory(dir.parentdirname, oradd=False)
            if not dir2:
                sys.stderr.write('Directory\'s parent is not listed: %s\n' % (dir.dir))
                continue
            dir.parentdir = dir2
            dir2.subdirs[dir.dir] = dir
            fdir = dir2.files.get(dir.barename)
            if fdir is None:
                # Guard added: the parent may have no File entry for
                # this subdir (e.g. a directory known only from
                # Master-Index with no tree scan). This previously
                # crashed with AttributeError on fdir.submap.
                sys.stderr.write('Directory has no entry in its parent: %s\n' % (dir.dir,))
                continue
            if fdir.submap.get('hasdesc'):
                dir.putkey('hasparentdesc', True)
                dir.putkey('parentdesc', fdir.submap.get('desc'))
            if fdir.submap.get('hasxmldesc'):
                dir.parentdescs[dir2.dir] = fdir.submap.get('xmldesc')
            merge_in_metadata(dir.metadata, fdir.metadata)
    # Connect up deep references to the actual files/dirs they refer to.
    # Also create backlinks for file symlinks.
    for dir in archtree.dirmap.values():
        for file in dir.files.values():
            if file.isdeep:
                realpath = dir.dir+'/'+file.name
                realfile = archtree.get_file_by_path(realpath, ordir=True)
                if not realfile:
                    sys.stderr.write('Deep file reference to nonexistent target: %s in %s\n' % (file.name, dir.dir,))
                    continue
                if file.submap.get('hasxmldesc'):
                    realfile.parentdescs[file.parentdir.dir] = file.submap.get('xmldesc')
                merge_in_metadata(realfile.metadata, file.metadata)
            if file.islink and not file.isdir:
                nlinkpath = file.getkey('nlinkpath')
                # Guard added: a link-file entry with no recorded
                # 'nlinkpath' previously crashed get_file_by_path
                # (None.rpartition); treat it as an unresolved target.
                dfile = archtree.get_file_by_path(nlinkpath) if nlinkpath else None
                if not dfile:
                    sys.stderr.write('Symlink file reference to nonexistent target: %s in %s\n' % (file.name, dir.dir,))
                else:
                    dfile.backsymlinks.append(file)
    return archtree
def check_missing_files(dirmap):
    """Go through dirmap and look for entries which were not found in
    the scan-directory phase.
    Also look for files that *were* in the scan-directory phase, but
    had no Index entry (unless the noindexlist exempts them).
    """
    for dir in dirmap.values():
        for entry in dir.files.values():
            haslinkdir = (entry.getkey('linkdir') is not None)
            if (entry.inmaster and not entry.intree
                    and not haslinkdir and entry.getkey('islink') is None):
                # Deep references are shown in parens.
                label = '(%s)' % (entry.name,) if entry.isdeep else entry.name
                sys.stderr.write('Index entry without file: %s/%s\n' % (dir.dir, label,))
            if (entry.intree and not entry.inmaster and not haslinkdir
                    and not noindexlist.check(entry.path)):
                sys.stderr.write('File without index entry: %s\n' % (entry.path,))
def parity_flip(map):
    """Toggle the "parity" entry in a dict between "Even" and "Odd".
    Call this at the top of a loop. The dict should start with no
    "parity" entry, so the first call sets "Even".
    (The parameter name shadows the builtin; kept for compatibility.)
    """
    map['parity'] = 'Odd' if map.get('parity') == 'Even' else 'Even'
def file_detail_map(file):
    """Create a map which has the file info plus some extra details.
    Returns a ChainMap layering the extras over file.submap.
    """
    itermap = {}
    # We show the unbox link based on the "unbox-link"
    # metadata key ("true" or otherwise). If that's not
    # present, we check whether the parent dir is listed in
    # no-unbox-link. Failing that, we default to showing it
    # for zip/tar.gz/tgz files.
    val = file.getmetadata_string('unbox-link')
    if val:
        flag = (val.lower() == 'true')
    elif file.parentdir.dir in nounboxlinklist.set:
        flag = False
    else:
        flag = bool(unbox_suffix_pattern.search(file.name))
    # But if "unbox-block" is set, definitely no link.
    # (Unbox pays attention to "unbox-block" and refuses to
    # unbox the file. "unbox-link:false" only affects the
    # index page.)
    if file.getmetadata_string('unbox-block') == 'true':
        # Bug fix: this was "flag = false" (lowercase), which raised
        # NameError whenever unbox-block was set on a file.
        flag = False
    if flag:
        itermap['hasunboxlink'] = True
    if file.metadata:
        itermap['_metadata'] = file.metadata
    if file.backsymlinks:
        # Stable, path-sorted list of symlinks pointing at this file.
        ls = sorted(file.backsymlinks, key=lambda dfile: dfile.path)
        itermap['_backsymlinks'] = ls
    return ChainMap(itermap, file.submap)
def subdir_detail_map(subdir):
    """Create a map which has the directory info plus some extra details.
    Returns a ChainMap layering the extras over subdir.submap.
    """
    extras = {}
    if subdir.metadata:
        extras['_metadata'] = subdir.metadata
    return ChainMap(extras, subdir.submap)
def generate_output_dirlist(dirmap, jenv):
    """Write out the dirlist.html index: the complete list of all
    directories, sorted case-insensitively, written atomically via
    SafeWriter.
    """
    template = jenv.get_template('dirlist.html')
    alldirs = sorted(dirmap.values(), key=lambda dir: dir.dir.lower())
    context = {
        'title': 'Complete Index of Directories',
        'pageid': 'dirpage',
        '_dirs': [ dir.submap for dir in alldirs ],
        'rootdir': ROOTNAME,
    }
    writer = SafeWriter(os.path.join(DESTDIR, '__temp'),
                        os.path.join(DESTDIR, 'dirlist.html'))
    template.stream(context).dump(writer.stream())
    writer.resolve()
def generate_output_dirmap(dirmap, jenv):
"""Write out the dirlist.html index.
"""
skiplist = [ re.compile(val) for val in mapskippatternlist.ls ]
template = jenv.get_template('dirmap.html')
finaldirlist = []
dirlist = list(dirmap.values())
dirlist.sort(key=lambda dir:dir.dir.lower())
itermap = {}
for dir in dirlist:
skip = False
for pat in skiplist:
if pat.match(dir.dir):
skip = True
break
if not skip:
finaldirlist.append(dir.submap)
itermap = {
'title': 'Index of Directories',
'pageid': 'dirpage',
'_dirs': finaldirlist,
'rootdir': ROOTNAME,
}
filename = os.path.join(DESTDIR, 'dirmap.html')