#!/usr/bin/perl
# Author: Tom Laudeman
# The Institute for Advanced Technology in the Humanities
# Copyright 2014 University of Virginia. Licensed under the Educational Community License, Version 2.0
# (the "License"); you may not use this file except in compliance with the License. You may obtain a
# copy of the License at
# http://www.osedu.org/licenses/ECL-2.0
# http://opensource.org/licenses/ECL-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing permissions and limitations under the
# License.
# find_empty_elements.pl dir=nysa_cpf > nysa_empty.log 2>&1 &
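# The url and empty checks can also be requested explicitly in name=value form, which CGI.pm's command line
# parsing accepts (illustrative invocation, not from the original notes):
# find_empty_elements.pl dir=nysa_cpf url=1 empty=1 > nysa_empty.log 2>&1 &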
# todo: if redirected, consider capturing the final redirect URL
# Find empty elements in all the XML files in a directory tree. This is a QA tool to get a quick survey of
# elements that are empty, but maybe shouldn't be empty.
# Check for missing /eac-cpf/cpfDescription/relations/resourceRelation
# If resourceRelation/@xlink:href exists, check the URL with wget. Run from a special, empty subdir because
# sometimes wget downloads a real file (maybe after redirecting).
# > ./find_empty_elements.pl > empty.log
# grep empty: empty.log| sort -u | less
# > grep empty: empty.log| sort -u
# empty: <abstract />
# empty: <description> </description>
# empty: <empty />
# empty: <eventDescription />
# empty: <fromDate />
# empty: <origination />
# empty: <relations />
# empty: <toDate />
# # Look for <description> </description> in empty.log
# less qa_cpf/Names0/047-001771080.xml
# grep 047-001771080 ./british_library/file_target.xml
# less ./british_library/Names0/047-001771080.xml
# less empty.log
# xlf ./british_library/Names0/047-001771080.xml
# xlf qa_cpf/Names4/047-002259994.xml
use strict;
use session_lib qw(:all);
use XML::XPath;
use CGI; # Handles command line name=value pairs.
use Time::HiRes qw(usleep nanosleep);
my $delay_microsecs = 2000000; # 2000000 microseconds is 2 seconds; 100000 is 1/10 second, 250000 is 1/4 second
# %re_hash serves two purposes. First, it has the regex that matches error pages for a given repo. Second, it is
# used to determine which repos need a regex test vs. just looking at the http header from wget
# --spider.
# WorldCat note: the normal regex approach will not work because the initial file "exists" even
# for a missing URL. We must check for a Location: redirect in the http header for the
# file to be ok.
# Use (?:pattern) for clustering alternatives because it is non-capturing and therefore
# faster.
# Some repositories don't have consistent URLs, or have a mixture, and some have errors in the host name. As a
# result of all that, we need a list of host names that identify a given repository. Since there is no
# "correct" text to test against, we are left checking for known bad text, so we must be certain that we're
# looking for the bad text at a host name that will return the expected page. In other words, we won't find
# the bad text at the wrong host name, and having not found the bad text we would (incorrectly) return ok.
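# Illustrative sketch of how an error-page regex is applied (commented out, not part of the script's flow;
# $some_url is a hypothetical variable): fetch the whole page, then treat a regex match as "URL is bad".
# my $page = `wget -O - "$some_url" 2>&1`;
# my $url_is_bad = ($page =~ m/$re_hash{'nysa'}/sm);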
# 'rave.ohiolink.edu' => 'ohlink',
# 'archives.library.illinois.edu', => 'uil',
# 'www.amphilsoc.org' => 'aps',
# 'amphilsoc.org' => 'aps',
# 'hdl.loc.gov' => 'lc',
# 'archives.nypl.org' => 'nypl',
# 'www.lib.ncsu.edu' => 'ncsu',
# 'lib.ncsu.edu' => 'ncsu',
# Except for worldcat, these were generated by running a keyboard macro on the output of:
# find url_xml/ -type f -exec head -n 3 {} \; | grep "<file" > tmp.txt
# Manually delete nwda_missing. Or try this:
# find url_xml/ -type f -exec head -n 3 {} \; | grep "<file" | grep -v nwda_missing > tmp.txt
# hostname alone is not enough. Both yale and uks are using hdl.handle.net. uks is hdl.handle.net/10407 and
# yale is hdl.handle.net/10079. There may be other duplicate hosts as well. Note that where a key repeats in
# the hash below (hdl.handle.net, ead.lib.virginia.edu, nwda.orbiscascade.org), Perl silently keeps only the
# last value listed.
my %repo_id = ('www.worldcat.org' => 'worldcat',
'ead.lib.virginia.edu' => 'vah',
'www.lib.utexas.edu' => 'taro',
'oasis.lib.harvard.edu' => 'harvard',
'iarchives.nysed.gov' => 'nysa',
'hdl.handle.net' => 'yale',
'archives.nypl.org' => 'nypl',
'www.aaa.si.edu' => 'aar',
'ead.lib.virginia.edu' => 'vah',
'hdl.handle.net' => 'uks',
'findingaids.library.northwestern.edu' => 'nwu',
'archivespec.unl.edu' => 'unl',
'hdl.loc.gov' => 'lc',
'www.mnhs.org' => 'mhs',
'digital.lib.umd.edu' => 'umd',
'rmoa.unm.edu' => 'rmoa',
'library.duke.edu' => 'duke',
'rmc.library.cornell.edu' => 'crnlu',
'quod.lib.umich.edu' => 'umi',
'www.lib.ncsu.edu' => 'ncsu',
'uda-db.orbiscascade.org' => 'utsu',
'digitool.fcla.edu' => 'afl',
'findingaids.cjh.org' => 'cjh',
'oculus.nlm.nih.gov' => 'nlm',
'special.lib.umn.edu' => 'umn',
'library.syr.edu' => 'syru',
'webapp1.dlib.indiana.edu' => 'inu',
'lib.udel.edu' => 'ude',
'archivesetmanuscrits.bnf.fr' => 'bnf',
'asteria.fivecolleges.edu' => 'fivecol',
'acumen.lib.ua.edu' => 'ual',
'libraries.mit.edu' => 'mit',
'doddcenter.uconn.edu' => 'uct',
'arks.princeton.edu' => 'pu',
'www.lib.uchicago.edu' => 'uchic',
'dlib.nyu.edu' => 'html',
'dla.library.upenn.edu' => 'pacscl',
'nmai.si.edu' => 'nmaia',
'nwda.orbiscascade.org' => 'nwda',
'ccfr.bnf.fr' => 'ccfr',
'rave.ohiolink.edu' => 'ohlink',
'www.azarchivesonline.org' => 'aao',
'archives.library.illinois.edu' => 'uil',
'www.library.ufl.edu' => 'afl-ufl',
'nwda.orbiscascade.org' => 'uut',
'findingaids.cul.columbia.edu' => 'colu',
'archiveshub.ac.uk' => 'ahub',
'findingaid.lib.byu.edu' => 'byu',
'www.oac.cdlib.org' => 'oac',
'archives.utah.gov' => 'utsa',
'library.brown.edu' => 'riamco',
'www2.scc.rutgers.edu' => 'rutu');
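# Illustrative lookup (commented out; the URL is one of the good-URL examples from url_check() below):
# my ($h) = ('http://iarchives.nysed.gov/xtf/view?docId=14610-88F.xml' =~ m/https?:\/\/(.*?)\//);
# print "$repo_id{$h}\n"; # prints "nysa"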
# regex hash
# Yale and UKS use handle.net which properly returns a 404.
# NYPL now returns 200 on good requests, and 302 on bad requests. See multi_ok(). There is improved behavior
# for sites that don't use a regex to determine success. Old NYPL: 'nypl' => '<title>.*?servlet
# error.*?<\/title>',
my %re_hash = (
'aps' => '<title>.*?Invalid Document.*?<\/title>',
'lc' => '<title>.*?Handle Problem Report.*?<\/title>',
'ncsu' => '<div.*?>Collection not found\.</div>', # <div class="alert-box alert">Collection not found.</div>
'ohlink' => '<title>dynaXML Error: Servlet Error</title>',
'uil' => 'Could not load Collection',
'worldcat' => '^Location:',
'rmoa' => '<description>The document does not exist.<\/description>',
'aar' => 'Location: /collections//more',
'aao' => '<title>dynaXML Error: Servlet Error</title>',
'afl' => '<title>DigiTool Stream Gateway Error</title>',
'afl-ufl' => '<title>Page Not Found</title>',
'fivecol' => 'Length: 0 \[text\/html\]',
'mhs' => '404 Not Found',
'inu' => '<title>dynaXML Error: Invalid Document</title>',
'nlm' => '404 Not Found',
'nmaia' => '<title>Page not Found',
'nysa' => '<title>dynaXML Error: Servlet Error</title>',
'crnlu' => 'Location: http://rmc.library.cornell.edu/404.html',
'crnlu_missing' => 'Location: http://rmc.library.cornell.edu/404.html',
'pu' => '404 Not Found',
'pacscl' => '<title>,\s+</title>',
'uchic' => 'No document with the given EADID.', # '<title></title>'
'riamco' => 'Length: 2328 \(2.3K\) \[text\/html\]',
'uks' => 'HTTP request sent, awaiting response... 404 Not Found',
'umd' => 'Length: 0 \[text\/html\]',
'utsu' => 'HTTP request sent, awaiting response... 404 Not Found',
'uut' => 'HTTP request sent, awaiting response... 404 Not Found',
'uct' => 'HTTP request sent, awaiting response... 404 Not Found',
'cjh' => 'Undefined variable',
'colu' => '\-\->\s+<table',
'ude' => 'Location: http://www.lib.udel.edu \[following\]'
);
# For any given run it is fairly easy to forget to add an exception to %re_hash, so we need to munge
# the first URL and confirm that the munged version fails to resolve. This is the global flag for that once-per-run test.
my $missing_confirmed = 0;
my $check_missing = 1;
my $use_cookie = 0;
# Assign $usage before calling main() so the die messages below can interpolate it.
my $usage = "Usage $0 dir=somedirectory {url|empty}\nWhere somedirectory is repo_whatever.\n";
main();
exit();
sub main
{
my $do_empty = 0;
my $do_url = 0;
$| = 1; # unbuffer stdout.
# Cache each URL that we have already checked. No reason to re-check them.
my %check_url;
# Cache each file name with a missing host
my %missing_host;
# Yes, I know we are a commandline app, but Perl's CGI allows us
# to use named args which is kind of nice, and as simple as it gets.
my $qq = CGI->new();
my %ch = $qq->Vars();
if (! exists($ch{dir}))
{
die "No directory specified.\n$usage\n";
}
if (! -e $ch{dir} || ! -d $ch{dir})
{
die "Specified does not exist or is not a directory.\n$usage\n";
}
# Nov 19 2014 Slightly exciting (?:) non-capturing expression: the * allows zero or more occurrences of _missing
# so that foo and foo_missing are both repositories.
# Mar 30 2015 Do not make a trailing / required on $ch{dir}. The convention is to not have a trailing slash.
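# Illustrative parses (hypothetical dir values): dir=nysa_cpf yields repo "nysa", dir=nwda_missing_cpf
# yields repo "nwda_missing", and dir=/data/extract/nysa/ yields repo "nysa".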
my $repo;
if ($ch{dir} =~ m/^(.*?(?:_missing)*)_/)
{
$repo = $1;
print "Repository: $repo\n";
}
elsif ($ch{dir} =~ m/^\/data\/extract\/([^\/]+)/) # capture the directory name following /data/extract/
{
$repo = $1;
print "Repository: $repo\n";
}
else
{
die "Can't determine repository name from dir: $ch{dir}\n$usage\n";
}
if (exists($ch{use_cookie}))
{
$use_cookie = 1;
print "Using cookie\n";
# Always write the fe_cookies.txt file so we know it exists in the local directory. wget needs this.
open(my $out, ">", "fe_cookies.txt");
print $out "www.worldcat.org FALSE / FALSE 0 owcLocRedirectPersistent _nr.selected\n";
close($out);
}
else
{
print "Not using cookie\n";
}
if (exists($ch{empty}))
{
$do_empty = 1;
print "Checking empty\n";
}
else
{
print "Not checking empty\n";
}
if (exists($ch{url}))
{
print "Checking url\n";
$do_url = 1;
printf("Delay between URL fetch: %d microseconds (%f seconds)\n", $delay_microsecs, ($delay_microsecs / 1000000));
}
else
{
print "Not checking url\n";
}
if (! $do_empty && ! $do_url)
{
die "No check specified. Must check empty or url\n$usage\n"
}
# Some of the URLs required special algorithms to create the 'bad' URL. We don't care enough to waste time on
# that, so when we have crafted a regex to check for bad URLs, we might not want to bother with the
# test, partly because we did the test manually. Of course, this is disabling QA, but we've wasted too
# much time on these bad URLs already. An example URL where we would have to munge the pid param is
# http://example.com/foo?pid=11150&custom_att_2=direct. I'm sure we could write a custom URL munger, but we
# are not going down that rabbit hole.
if (exists($ch{noc}))
{
$check_missing = 0;
$missing_confirmed = 1;
print "Disabled testing missing URLs\n";
}
# The linux find command will not work on a symlinked dir that doesn't have a trailing / so check and add one.
if ($ch{dir} !~ m/\/$/)
{
$ch{dir} = "$ch{dir}/";
}
print "Scanning: $ch{dir}\n";
$XML::XPath::Namespaces = 0;
my @files = `find $ch{dir} -type f`;
chomp(@files);
print "Find done. File count: ". scalar(@files) . "\n";
my $xx = 0;
my $first_host = "";
foreach my $file (@files)
{
print "file: $file\n";
my $xpath = XML::XPath->new(filename => $file);
# Checking CPF for missing resourceRelation is not redundant because <relations> won't be empty if
# there is a <cpfRelation> which is the case with .c and .r files. When both <cpfRelation> and
# <resourceRelation> are empty, we will have an empty <relations> below.
my $rrel = $xpath->find('/eac-cpf/cpfDescription/relations/resourceRelation');
if ($do_url)
{
if (! $rrel)
{
print "missing rrel:\n";
} else
{
# Apparently something about using the xpath function causes the &amp; entity to be translated
# to an ASCII & (decimal 38, \046). If the entity were not translated, the wget call would fail.
my $url = $xpath->find('/eac-cpf/cpfDescription/relations/resourceRelation/@xlink:href');
if ($url && $do_url)
{
# Find the repository identity, and deal with changing and unknown host names. Anything
# unexpected is an error. All the expected values should be in %repo_id, and %re_hash.
my $host = "";
if ($url =~ m/https?:\/\/(.*?)\//)
{
$host = $1;
}
if (! $first_host)
{
print "Host name: $host\n";
$first_host = $host;
}
elsif ($first_host ne $host)
{
my $basef = '';
if ($file =~ m/$repo\_cpf_final\/(.+)\.[cr]\d+\.xml/)
{
$basef = $1;
if (exists($missing_host{$basef}))
{
print "error: (probably missing-dup:) bad host: $host\n";
}
else
{
print "error: (probably missing:) bad host: $host\n";
$missing_host{$basef} = 1;
}
}
else
{
print "error: (probably missing:) bad host: $host\n";
}
next();
}
if (exists($check_url{$url}) and $host)
{
# If we already checked the URL, just print the result, marked with dup: for easy grep'ing.
print "dup: $check_url{$url} $url\n";
} else
{
# Sleep $delay_microsecs microseconds (currently 2 seconds) in an attempt to be nice to their web server.
usleep($delay_microsecs);
my $url_ok = 0;
if (! $missing_confirmed && $check_missing)
{
# Use digits to munge the URL. At least one server script likes digits in the id
# CGI param and treats non-digits specially. We just want to make a non-resolving
# URL, or bad CGI parameter. ("Bad" being broadly defined.) Prior to this test,
# confirming that the code would properly classify missing URLs was a time
# consuming process.
# Nov 5 2014 Some servers that use a url with file.xml will serve the correct
# page if you send a url file.xml1234, so to test the bad url we have to create file1234.xml.
# This is bad: http://www.aaa.si.edu/collections/findingaids/bishisab1234.xml
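# Illustrative munges produced by the three branches below (the ?eadid URL is hypothetical; the others
# come from comments elsewhere in this file):
# http://www.aaa.si.edu/collections/findingaids/bishisab.xml -> .../bishisab1234.xml
# http://library.example.edu/render.php?eadid=US-RPB-ms2007.017 -> ...render.php?eadid=x1234US-RPB-ms2007.017
# http://arks.princeton.edu/ark:/88435/0k225b05z -> .../0k225b05z1234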
my $new_url = "";
if ($url =~ m/\.[A-Za-z]+$/)
{
$new_url = $url;
$new_url =~ s/(\.[A-Za-z]+)$/1234$1/;
}
elsif ($url =~ m/(?:\?eadid=)|(?:\?source=)/)
{
# riamco has params ?eadid, umd has params ?source
$new_url = $url;
$new_url =~ s/((?:\?eadid=)|(?:\?source=))/$1x1234/;
}
else
{
$new_url = $url . '1234';
}
my $check_url_ok = url_check($repo, $new_url);
if ($check_url_ok)
{
die "Error: Bad URL returns ok: $new_url\nOriginal URL: $url\n";
}
else
{
print("Missing test confirmed (tested: $new_url)\n");
$missing_confirmed = 1;
}
}
$url_ok = url_check($repo, $url);
if ($url_ok)
{
# We have a hit. The "ok" value in the check_url hash must be unique for grep'ing
# the results. Dups have the text "ok-prev:" and are detected way back at the top
# of the URL checking code, so they'll never reach this point.
print "ok: $url\n";
$check_url{$url} = "ok-prev:";
}
else
{
# The "missing" value in the check_url hash must be unique for grep'ing the results.
print "missing: $url\n";
$check_url{$url} = "missing-prev:";
}
}
}
}
}
if ($do_empty)
{
# Sep 23 2014 Add test for not ancestor::objectXMLWrap since we only care about empty elements in
# the CPF itself.
# xpath cpf_qa/howard/Coll._113-ead.c01.xml '//*[not(ancestor::objectXMLWrap)]/attribute::*[.=""]' 2> /dev/null | less
# xpath cpf_qa/howard/Coll._113-ead.c01.xml '//*[@*="" and not(ancestor::objectXMLWrap)]' 2> /dev/null | less
# xpath cpf_qa/howard/Coll._113-ead.c01.xml 'local-name(//*[@*="" and not(ancestor::objectXMLWrap)])' 2> /dev/null | less
# my $empty_nodes = $xpath->find('//*[((not(*) and not(normalize-space())) or not(@*)) and not(ancestor::objectXMLWrap)]');
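# Illustrative result (hypothetical input): an element such as <placeEntry latitude="" longitude=""/>
# would be reported below as: empty: placeEntry attr: latitude attr: longitude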
my $empty_nodes = $xpath->find('//*[(not(normalize-space()) or @*="") and not(ancestor::objectXMLWrap)]');
if ($empty_nodes)
{
foreach my $node ($empty_nodes->get_nodelist())
{
# my $val = $node->toString();
my $val = $node->getName();
print "empty: $val";
my $empty_attr = $xpath->find('attribute::*[.=""]', $node);
if ($empty_attr)
{
foreach my $node ($empty_attr->get_nodelist())
{
my $val = $node->getName();
print " attr: $val";
}
}
print "\n";
}
}
}
$xx++;
}
}
sub url_check
{
my $host = $_[0]; # despite the name, this is the repo id (e.g. 'nysa'), used as the %re_hash key
my $url = $_[1];
# Rather than returning a 404, most sites return a 200 http header for missing
# pages. In other words, even a bad URL returns a 200 web page. Instead of using the
# http header result, we have to actually retrieve the entire web page and check it
# with a regex.
# Good URL examples:
# http://iarchives.nysed.gov/xtf/view?docId=14610-88F.xml
# http://www.amphilsoc.org/mole/view?docId=ead/Mss.B.W76a-ead.xml
# http://archives.library.illinois.edu/archon/?p=collections/controlcard&id=995
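# Illustrative command built below for a repo that has a %re_hash entry (URL from the examples above;
# the cookie suffix appears only when use_cookie is set):
# wget -O - "http://iarchives.nysed.gov/xtf/view?docId=14610-88F.xml" --load-cookies fe_cookies.txt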
my $cookie_suffix = "";
if ($use_cookie)
{
my $cookie_suffix = " --load-cookies fe_cookies.txt";
}
my $cmd = "wget -O - \"$url\" $cookie_suffix";
if ($host eq 'worldcat' || ! exists($re_hash{$host}))
{
# Why did url checking on pu create files? Must use -O - even with --spider because some combination
# of server responses causes wget to save the file after all. --delete-after without the -O - works
# too. It could be the 302 followed by a 500 followed by a 200. Note the "(try: 2)"
# wget --delete-after --spider "http://arks.princeton.edu/ark:/88435/0k225b05z" 2>&1
# wget --spider -O - "http://arks.princeton.edu/ark:/88435/0k225b05z" 2>&1 | less
# Remember, every command has 2>&1 when executed below. Don't add it twice.
$cmd = "wget --spider -O - \"$url\" $cookie_suffix"
}
my $res = `$cmd 2>&1`;
my $url_ok = 0;
if (multi_ok('host' => $host, 'text' => "$res"))
{
$url_ok = 1;
}
return $url_ok;
}
sub multi_ok
{
my %args = @_;
my $host = $args{host};
my $text = $args{text};
# Add new repo/regex pairs to this hash.
# Worldcat at least uses ^ for start of line in the regex, and to support that we need the /sm switches. They
# should be fine for all the others, which are mid-line matches.
# The two cases here with inverse logic are somewhat disturbing.
# print "host: $host test: " . substr($text, 0, 40) . "...\n";
# If we have a test regex, then don't worry about doing the normal http 200 test. It will always be true.
if (! exists($re_hash{$host}))
{
# print "No regex: $host\n";
# Nov 1 2014. Which server returned "Remote file exists" but not with a 200 http response? That seems odd/wrong
# becase "Remote file exists" is also returned for 302 and other redirects.
# if (($text =~ m/^(?:Remote file exists)|(?:awaiting response\.\.\. 200 OK$)/sm))
# Lacking the previous examples, what seems to make sense is to only accept a 200 response as good,
# and anything else is bad. I seem to recollect that WorldCat returns a redirect, but the new logic
# here is that WorldCat will test the response with a regex, and look for "Location:". Having a regex,
# WorldCat won't come into this code branch.
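# Illustrative wget status lines and how the loop below classifies them (the traces are hypothetical):
# HTTP request sent, awaiting response... 200 OK -> only 200s seen, so the sub returns 1
# HTTP request sent, awaiting response... 302 Found -> any non-200 returns 0 immediately, even if a 200 follows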
my $ret_val = 0;
while ($text =~ m/^HTTP request sent, awaiting response\.\.\. (\d+) ([A-Za-z ]+)$/smg)
{
if ($1 eq 200)
{
# If this is the only http response code, then the request was successful, and we keep looking
# for response codes.
$ret_val = 1;
}
if ($1 ne 200)
{
# However, any other response code means there was something fishy, at least for the non-regex
# hosts. Yes, I know we're returning from inside a loop.
# Well behaved servers will return 404, although in at least one case only after trying a redirect.
# It seems like a good idea to return a zero (url not ok) if any http response is a 404.
return 0;
}
}
return $ret_val;
}
elsif (exists($re_hash{$host}))
{
# print "matching host: $host matching: $re_hash{$host}\n";
# print "\n$text\n";
if ($text =~ m/$re_hash{$host}/sm)
{
return 0;
}
# This is dangerous. Basically, we assume that since we didn't find the error text, the URL must be
# ok. That assumption is weak. Much better would be to search for text that only occurs in the good
# URLs. We try to make up for this by having an explicit list of host names that will behave as
# expected.
return 1;
}
else
{
die "Error: No matching regex for: $host\n";
}
return 0;
}