-
Notifications
You must be signed in to change notification settings - Fork 26
/
cpdfxmlm.ml
1215 lines (1099 loc) · 42.3 KB
/
cpdfxmlm.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(*---------------------------------------------------------------------------
Copyright (c) 2007 Daniel C. Bünzli. All rights reserved.
Distributed under the ISC license, see terms at the end of the file.
%%NAME%% %%VERSION%%
---------------------------------------------------------------------------*)
(* Aliases for the standard String and Buffer modules and their types.
   The functor Make below takes parameters that are themselves named
   String and Buffer, and its body redefines the type name string; these
   aliases keep the standard definitions reachable from inside it. *)
module Std_string = String
module Std_buffer = Buffer
type std_string = string
type std_buffer = Buffer.t
(* Signature of the string implementation Make is parameterised over.
   Inside Make, values of type t are used for names, attribute values,
   character data and error payloads. *)
module type String = sig
(* The string type. *)
type t
(* Value used wherever an empty string is needed (e.g. the empty prefix). *)
val empty : t
(* Length of a string. NOTE(review): the unit (characters or bytes) is
   decided by the implementation; confirm against the instantiation. *)
val length : t -> int
(* Concatenation; Make binds it as cat. *)
val append : t -> t -> t
(* Lowercased copy; Make uses it to compare the xml PI target and encoding
   names case-insensitively. *)
val lowercase : t -> t
(* iter f s calls f on the characters of s. Make passes functions that
   take an integer character code and append it to a Buffer. *)
val iter : (int -> unit) -> t -> unit
(* Conversion from a standard string; Make applies it to its literals.
   NOTE(review): presumably the argument is expected to be UTF-8. *)
val of_string : std_string -> t
(* Fold-style conversion to standard strings: the function receives the
   accumulator and a standard string and returns the new accumulator.
   NOTE(review): presumably the pieces are the UTF-8 encoding of the
   argument, as the name suggests. *)
val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a
(* Total order on strings. *)
val compare : t -> t -> int
end
(* Signature of the character buffer Make is parameterised over. Make
   keeps one buffer for names and entity references and another for
   character and attribute data. *)
module type Buffer = sig
(* Type of the strings produced by contents. *)
type string
(* The buffer type. *)
type t
(* Make catches this exception and reports it as `Max_buffer_size. *)
exception Full
(* create n makes a new buffer; n is a size argument (Make passes 4, 64
   and 1024). NOTE(review): presumably an initial capacity hint. *)
val create : int -> t
(* add_uchar b u appends the character with integer code u. *)
val add_uchar : t -> int -> unit
(* Empties the buffer. *)
val clear : t -> unit
(* Current contents as a string. *)
val contents : t -> string
(* Current length; Make only tests it against 0. *)
val length : t -> int
end
(* Signature of the streaming XML input/output interface. The types and
   values declared here are the ones defined in the body of Make. *)
module type S = sig
(* String type used for names, attribute values and character data. *)
type string
(* Character encodings that can be requested when creating an input. *)
type encoding = [
| `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1 | `US_ASCII ]
(* Text of the document type declaration, or None when there is none. *)
type dtd = string option
(* A name is a pair of strings; on input, element and prefixed attribute
   names are returned as (namespace uri, local name). *)
type name = string * string
(* An attribute is a name and its value. *)
type attribute = name * string
(* A start tag is an element name and its attributes. *)
type tag = name * attribute list
(* Signals exchanged with the parser and the printer. *)
type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
(* Namespace name bound to the xml prefix. *)
val ns_xml : string
(* Namespace name bound to the xmlns prefix. *)
val ns_xmlns : string
(* Input position as (line, column). *)
type pos = int * int
(* Input errors; error_message turns them into text. *)
type error = [
| `Max_buffer_size
| `Unexpected_eoi
| `Malformed_char_stream
| `Unknown_encoding of string
| `Unknown_entity_ref of string
| `Unknown_ns_prefix of string
| `Illegal_char_ref of string
| `Illegal_char_seq of string
| `Expected_char_seqs of string list * string
| `Expected_root_element ]
(* Raised by input functions with the position at which the error was
   detected. *)
exception Error of pos * error
(* Human-readable (English) description of an error. *)
val error_message : error -> string
(* Byte sources: a channel, a standard string read from the given start
   index, or a function returning the next byte and raising End_of_file
   at the end. *)
type source = [
| `Channel of in_channel
| `String of int * std_string
| `Fun of (unit -> int) ]
(* Abstract input state. *)
type input
(* make_input ?enc ?strip ?ns ?entity src creates an input reading src.
   enc is the expected encoding (None: detect it), strip the default
   whitespace stripping behaviour, ns a fallback for prefixes that have no
   binding in the document, entity a fallback for entity references that
   are not predefined. *)
val make_input : ?enc:encoding option -> ?strip:bool ->
?ns:(string -> string option) ->
?entity: (string -> string option) -> source -> input
(* Returns the next signal and advances. Raises Error. *)
val input : input -> signal
(* Reads a `Data signal or a whole element and folds it with el and data.
   Raises Invalid_argument if the next signal is neither `El_start nor
   `Data. *)
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> 'a
(* Reads a `Dtd signal followed by a tree. Raises Invalid_argument if the
   next signal is not `Dtd. *)
val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> (dtd * 'a)
(* Returns the next signal without consuming it. Raises Error. *)
val peek : input -> signal
(* True when the end of input has been reached. Raises Error. *)
val eoi : input -> bool
(* Current (line, column) of the input. *)
val pos : input -> pos
(* Shape into which a user tree node is projected for output. *)
type 'a frag = [ `El of tag * 'a list | `Data of string ]
(* Byte destinations: a channel, a standard buffer, or a function called
   with each output byte. *)
type dest = [
| `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
(* Abstract output state. *)
type output
(* make_output ?decl ?nl ?indent ?ns_prefix d creates an output writing to
   d. decl: whether the XML declaration is output; nl: whether a newline
   is output at the end; indent: optional indentation; ns_prefix: fallback
   giving a prefix for a namespace name that has no binding. *)
val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
?ns_prefix:(string -> string option) -> dest -> output
(* Current scope depth of the output. *)
val output_depth : output -> int
(* Outputs a signal. *)
val output : output -> signal -> unit
(* Outputs a tree, using the function to project each node to a frag. *)
val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
(* Outputs a dtd followed by a tree. *)
val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
end
(* Unicode character lexers *)
exception Malformed (* for character stream, internal only. *)

(* Length in bytes of a UTF-8 sequence, indexed by its first byte.
   An entry of 0 marks a byte that cannot start a sequence: continuation
   bytes (0x80-0xBF), the overlong lead bytes 0xC0 and 0xC1, and
   0xF5-0xFF. *)
let utf8_len =
  Array.init 256 (fun b ->
    if b < 0x80 then 1
    else if b < 0xC2 then 0
    else if b < 0xE0 then 2
    else if b < 0xF0 then 3
    else if b < 0xF5 then 4
    else 0)
(* [uchar_utf8 i] reads one UTF-8 encoded character from the byte source
   [i] and returns its code point. All bytes of the sequence announced by
   the first byte are read before any of them is validated.
   Raises [Malformed] on an invalid first byte, a byte that is not a
   continuation byte where one is required, an overlong 3- or 4-byte form,
   a surrogate (lead 0xED) or a value above 0x10FFFF (lead 0xF4).
   Exceptions raised by [i] propagate. *)
let uchar_utf8 i =
  let is_cont (b : int) = b lsr 6 = 0b10 in
  let within (lo : int) (hi : int) (b : int) = lo <= b && b <= hi in
  let require ok = if not ok then raise Malformed in
  let lead = i () in
  match utf8_len.(lead) with
  | 0 -> raise Malformed
  | 1 -> lead
  | 2 ->
      let b1 = i () in
      require (is_cont b1);
      ((lead land 0x1F) lsl 6) lor (b1 land 0x3F)
  | 3 ->
      let b1 = i () in
      let b2 = i () in
      require (is_cont b2);
      (* The range allowed for the second byte depends on the lead byte. *)
      require
        (match lead with
         | 0xE0 -> within 0xA0 0xBF b1
         | 0xED -> within 0x80 0x9F b1
         | _ -> is_cont b1);
      ((lead land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
  | 4 ->
      let b1 = i () in
      let b2 = i () in
      let b3 = i () in
      require (is_cont b3 && is_cont b2);
      require
        (match lead with
         | 0xF0 -> within 0x90 0xBF b1
         | 0xF4 -> within 0x80 0x8F b1
         | _ -> is_cont b1);
      ((lead land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
      ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F)
  | _ -> assert false
(* [int16_be i] reads two bytes from [i] and combines them as a big-endian
   16-bit value (first byte is the most significant). *)
let int16_be i =
  let hi = i () in
  let lo = i () in
  lo lor (hi lsl 8)
(* [int16_le i] reads two bytes from [i] and combines them as a
   little-endian 16-bit value (first byte is the least significant). *)
let int16_le i =
  let lo = i () in
  let hi = i () in
  lo lor (hi lsl 8)
(* [uchar_utf16 int16 i] reads one UTF-16 encoded character from the byte
   source [i], using [int16] to read each 16-bit code unit, and returns
   its code point.
   A code unit outside 0xD800-0xDFFF is returned as is. A high surrogate
   (0xD800-0xDBFF) must be followed by a low surrogate (0xDC00-0xDFFF);
   the pair is combined into a code point of 0x10000 or above.
   Raises [Malformed] on a lone low surrogate, or on a high surrogate
   followed by anything but a low surrogate. Exceptions raised by [i]
   propagate. *)
let uchar_utf16 int16 i =
  let c0 = int16 i in
  if c0 < 0xD800 || c0 > 0xDFFF then c0 else
  if c0 > 0xDBFF then raise Malformed else
  let c1 = int16 i in
  (* Without this check any second unit was silently masked into a code
     point, so malformed streams decoded to arbitrary characters. *)
  if c1 < 0xDC00 || c1 > 0xDFFF then raise Malformed else
  (((c0 land 0x3FF) lsl 10) lor (c1 land 0x3FF)) + 0x10000
(* UTF-16 lexers for each byte order. *)
let uchar_utf16be i = uchar_utf16 int16_be i
let uchar_utf16le i = uchar_utf16 int16_le i

(* Raw byte lexer: the byte value is returned unchanged. *)
let uchar_byte i = i ()

(* ISO-8859-1 lexer: the byte value is returned unchanged. *)
let uchar_iso_8859_1 i = i ()

(* US-ASCII lexer: raises [Malformed] on a byte above 127. *)
let uchar_ascii i =
  match i () with
  | b when b > 127 -> raise Malformed
  | b -> b
(* Functorized streaming XML IO *)
module Make (String : String) (Buffer : Buffer with type string = String.t) =
struct
(* From here on, string denotes the functor argument's string type. *)
type string = String.t
(* Converts a standard string literal to a string. *)
let str = String.of_string
(* String equality and emptiness tests. Both use the polymorphic compare,
   not String.compare.
   NOTE(review): this is consistent with the polymorphic Hashtbl.hash used
   by Ht below; switching only one of them could break that. *)
let str_eq s s' = (compare s s') = 0
let str_empty s = (compare s String.empty) = 0
(* Concatenation. *)
let cat = String.append
(* String made of the single character with code u. *)
let str_of_char u =
let b = Buffer.create 4 in
Buffer.add_uchar b u;
Buffer.contents b
(* Hash tables keyed by strings, using str_eq and the polymorphic hash. *)
module Ht = Hashtbl.Make (struct type t = string
let equal = str_eq
let hash = Hashtbl.hash end)
(* Character codes used by the parser. *)
let u_nl = 0x000A (* newline *)
let u_cr = 0x000D (* carriage return *)
let u_space = 0x0020 (* space *)
let u_quot = 0x0022 (* quote *)
let u_sharp = 0x0023 (* # *)
let u_amp = 0x0026 (* & *)
let u_apos = 0x0027 (* ' *)
let u_minus = 0x002D (* - *)
let u_slash = 0x002F (* / *)
let u_colon = 0x003A (* : *)
let u_scolon = 0x003B (* ; *)
let u_lt = 0x003C (* < *)
let u_eq = 0x003D (* = *)
let u_gt = 0x003E (* > *)
let u_qmark = 0x003F (* ? *)
let u_emark = 0x0021 (* ! *)
let u_lbrack = 0x005B (* [ *)
let u_rbrack = 0x005D (* ] *)
let u_x = 0x0078 (* x *)
let u_bom = 0xFEFF (* BOM *)
let u_9 = 0x0039 (* 9 *)
let u_F = 0x0046 (* F *)
let u_D = 0X0044 (* D *)
(* Text expected after '<![' for a CDATA section. *)
let s_cdata = str "CDATA["
(* Namespace names bound to the xml and xmlns prefixes. *)
let ns_xml = str "http://www.w3.org/XML/1998/namespace"
let ns_xmlns = str "http://www.w3.org/2000/xmlns/"
(* Names (n_) and values (v_) the parser compares against. *)
let n_xml = str "xml"
let n_xmlns = str "xmlns"
let n_space = str "space"
let n_version = str "version"
let n_encoding = str "encoding"
let n_standalone = str "standalone"
let v_yes = str "yes"
let v_no = str "no"
let v_preserve = str "preserve"
let v_default = str "default"
let v_version_1_0 = str "1.0"
let v_version_1_1 = str "1.1"
(* Encoding names, lowercase: the declared name is lowercased before it
   is compared with these. *)
let v_utf_8 = str "utf-8"
let v_utf_16 = str "utf-16"
let v_utf_16be = str "utf-16be"
let v_utf_16le = str "utf-16le"
let v_iso_8859_1 = str "iso-8859-1"
let v_us_ascii = str "us-ascii"
let v_ascii = str "ascii"
(* [name_str (prefix, local)] is the textual form of a qualified name:
   [local] alone when the prefix is empty, otherwise prefix, a colon and
   [local]. *)
let name_str (prefix, local) =
  if not (str_empty prefix) then cat prefix (cat (str ":") local)
  else local
(* Basic types and values *)
(* Encodings that can be requested for an input. *)
type encoding = [
| `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1 | `US_ASCII ]
(* Text of the document type declaration, None when there is none. *)
type dtd = string option
(* Pair of strings: (prefix, local) for names as written in the document,
   (namespace uri, local) once expanded by expand_name. *)
type name = string * string
(* Attribute name and value. *)
type attribute = name * string
(* Element name and attributes. *)
type tag = name * attribute list
(* Signals produced by the parser. *)
type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
(* Input *)
(* Input position as (line, column). *)
type pos = int * int
(* Input errors. The string payloads carry the offending text; for
   `Expected_char_seqs the list holds the expected sequences and the
   string the one that was found. *)
type error = [
| `Max_buffer_size
| `Unexpected_eoi
| `Malformed_char_stream
| `Unknown_encoding of string
| `Unknown_entity_ref of string
| `Unknown_ns_prefix of string
| `Illegal_char_ref of string
| `Illegal_char_seq of string
| `Expected_char_seqs of string list * string
| `Expected_root_element ]
(* Raised by the input functions with the position of the error. *)
exception Error of pos * error
(* [error_message e] is an English description of the input error [e].
   String payloads are inserted verbatim between fixed pieces of text. *)
let error_message e =
  (* [surround before v after] is [v] between two literal pieces. *)
  let surround before v after = cat (str before) (cat v (str after)) in
  match e with
  | `Max_buffer_size -> str "maximal buffer size exceeded"
  | `Unexpected_eoi -> str "unexpected end of input"
  | `Malformed_char_stream -> str "malformed character stream"
  | `Unknown_encoding enc -> surround "unknown encoding (" enc ")"
  | `Unknown_entity_ref ent -> surround "unknown entity reference (" ent ")"
  | `Unknown_ns_prefix pre -> surround "unknown namespace prefix (" pre ")"
  | `Illegal_char_ref s -> surround "illegal character reference (#" s ")"
  | `Illegal_char_seq s ->
      surround "character sequence illegal here (\"" s "\")"
  | `Expected_char_seqs (exps, fnd) ->
      (* Each expected sequence is quoted and followed by a comma. *)
      let quoted =
        List.fold_left
          (fun acc v -> cat acc (surround "\"" v "\", "))
          String.empty exps
      in
      let found = surround "found \"" fnd "\"" in
      cat (str "expected one of these character sequence: ")
        (cat quoted found)
  | `Expected_root_element -> str "expected root element"
(* Kind of construct found at the current point of the input, as computed
   by p_limit. For the markup cases the comment shows the text that has
   been consumed when the limit is set. *)
type limit = (* XML is odd to parse. *)
| Stag of name (* '<' qname *)
| Etag of name (* '</' qname whitespace* *)
| Pi of name (* '<?' qname *)
| Comment (* '<!--' *)
| Cdata (* '<![CDATA[' *)
| Dtd (* '<!' *)
| Text (* other character *)
| Eoi (* End of input *)
(* Byte sources accepted by make_input: an input channel, a standard
   string together with the index at which reading starts, or a function
   returning the next byte and raising End_of_file at the end. *)
type source = [
| `Channel of in_channel
| `String of int * std_string
| `Fun of (unit -> int) ]
(* Parser state. The first five fields are fixed by make_input; the
   mutable ones are updated as the input is consumed. *)
type input =
{ enc : encoding option; (* Expected encoding. *)
strip : bool; (* Whitespace stripping default behaviour. *)
fun_ns : string -> string option; (* Namespace callback. *)
fun_entity : string -> string option; (* Entity reference callback. *)
i : unit -> int; (* Byte level input. *)
mutable uchar : (unit -> int) -> int; (* Unicode character lexer. *)
mutable c : int; (* Character lookahead, or one of the u_eoi,
                    u_start_doc, u_end_doc sentinels. *)
mutable cr : bool; (* True if last u was '\r'. *)
mutable line : int; (* Current line number. *)
mutable col : int; (* Current column number. *)
mutable limit : limit; (* Last parsed limit. *)
mutable peek : signal; (* Signal lookahead. *)
mutable stripping : bool; (* True if stripping whitespace. *)
mutable last_white : bool; (* True if last char was white. *)
mutable scopes : (name * string list * bool) list;
(* Stack of qualified el. name, bound prefixes and strip behaviour. *)
ns : string Ht.t; (* prefix -> uri bindings. *)
ident : Buffer.t; (* Buffer for names and entity refs. *)
data : Buffer.t; } (* Buffer for character and attribute data. *)
(* Messages for the Invalid_argument raised by input_tree and
   input_doc_tree. *)
let err_input_tree = "input signal not `El_start or `Data"
let err_input_doc_tree = "input signal not `Dtd"
(* err i e raises Error with the current line and column of i. *)
let err i e = raise (Error ((i.line, i.col), e))
(* Error helpers: illegal character u, expected sequences exps but found s,
   expected characters exps but found the current lookahead. *)
let err_illegal_char i u = err i (`Illegal_char_seq (str_of_char u))
let err_expected_seqs i exps s = err i (`Expected_char_seqs (exps, s))
let err_expected_chars i exps =
err i (`Expected_char_seqs (List.map str_of_char exps, str_of_char i.c))
(* Sentinel values stored in the lookahead field c; they are the three
   largest native integers and so never collide with a character code. *)
let u_eoi = max_int
let u_start_doc = u_eoi - 1
let u_end_doc = u_start_doc - 1
(* Initial value of the peek field, before any signal has been parsed. *)
let signal_start_stream = `Data String.empty
(* [make_input ?enc ?strip ?ns ?entity src] creates a parser state reading
   bytes from [src].
   - [enc]: expected encoding; [None] (default) lets find_encoding detect
     it.
   - [strip]: default whitespace stripping behaviour (default [false]).
   - [ns]: fallback consulted for namespace prefixes without a usable
     binding (default: always [None]).
   - [entity]: fallback consulted for entity references that are not
     predefined (default: always [None]).
   - [src]: a function, an input channel, or [`String (pos, s)] which
     reads [s] starting at index [pos].
   The byte function built for a string source raises [End_of_file] once
   the index reaches or passes the end of the string, on every call. *)
let make_input ?(enc = None) ?(strip = false) ?(ns = fun _ -> None)
    ?(entity = fun _ -> None) src =
  let i = match src with
  | `Fun f -> f
  | `Channel ic -> fun () -> input_byte ic
  | `String (pos, s) ->
      let len = Std_string.length s in
      let pos = ref pos in
      fun () ->
        (* Use >= and advance only after a successful read: testing for
           equality made a start index beyond the string, or a read after
           the end was reached, raise Invalid_argument. *)
        if !pos >= len then raise End_of_file else
        let b = Char.code (Std_string.get s !pos) in
        incr pos;
        b
  in
  (* Bindings that hold before any declaration is seen: the empty prefix
     maps to the empty namespace, xml and xmlns to their fixed names. *)
  let bindings =
    let h = Ht.create 15 in
    Ht.add h String.empty String.empty;
    Ht.add h n_xml ns_xml;
    Ht.add h n_xmlns ns_xmlns;
    h
  in
  { enc = enc; strip = strip; fun_ns = ns; fun_entity = entity;
    i = i; uchar = uchar_byte; c = u_start_doc; cr = false;
    line = 1; col = 0; limit = Text; peek = signal_start_stream;
    stripping = strip; last_white = true; scopes = []; ns = bindings;
    ident = Buffer.create 64; data = Buffer.create 1024; }
(* Bracketed non-terminals in comments refer to XML 1.0 non terminals *)
(* [r u a b] is true iff [u] lies in the inclusive range [a]..[b]. *)
let r (u : int) (a : int) (b : int) : bool = not (u < a || b < u)

(* Space, tab, carriage return or line feed. *)
let is_white u = u = 0x0020 || u = 0x0009 || u = 0x000D || u = 0x000A

(* {Char} *)
let is_char u =
  r u 0x0020 0xD7FF
  || u = 0x0009 || u = 0x000A || u = 0x000D
  || r u 0xE000 0xFFFD
  || r u 0x10000 0x10FFFF

(* ASCII decimal digit. *)
let is_digit u = r u 0x0030 0x0039

(* ASCII hexadecimal digit, either case. *)
let is_hex_digit u =
  is_digit u || r u 0x0041 0x0046 || r u 0x0061 0x0066

(* Ranges shared by the two name predicates below. *)
let comm_range u =
  List.exists (fun (lo, hi) -> r u lo hi)
    [ 0x00C0, 0x00D6; 0x00D8, 0x00F6; 0x00F8, 0x02FF;
      0x0370, 0x037D; 0x037F, 0x1FFF; 0x200C, 0x200D;
      0x2070, 0x218F; 0x2C00, 0x2FEF; 0x3001, 0xD7FF;
      0xF900, 0xFDCF; 0xFDF0, 0xFFFD; 0x10000, 0xEFFFF ]

(* ASCII letter, tested first because it is the common case. *)
let is_ascii_letter u = r u 0x0061 0x007A || r u 0x0041 0x005A

(* {NameStartChar} - ':' (XML 1.1) *)
let is_name_start_char u =
  is_ascii_letter u
  || (not (is_white u) && (u = 0x005F (* '_' *) || comm_range u))

(* {NameChar} - ':' (XML 1.1) *)
let is_name_char u =
  is_ascii_letter u
  || (not (is_white u)
      && (is_digit u
          || u = 0x005F || u = 0x002D || u = 0x002E || u = 0x00B7
          || comm_range u || r u 0x0300 0x036F || r u 0x203F 0x2040))
(* [nextc i] advances the lookahead [i.c] by one character.
   - Raises Error with `Unexpected_eoi if the lookahead is already the end
     of input sentinel.
   - Updates line and column from the character being left: a newline
     starts a new line at column 1, anything else moves one column right.
   - Normalises line ends: CR is reported as a newline, and a LF that
     directly follows a CR is skipped, so CR LF counts as one newline.
   - Raises [Malformed] if a character read is not an XML {Char}.
   Exceptions raised by the lexer or the byte source (End_of_file,
   Malformed) propagate. *)
let nextc i =
  if i.c = u_eoi then err i `Unexpected_eoi;
  if i.c = u_nl then (i.line <- i.line + 1; i.col <- 1)
  else i.col <- i.col + 1;
  i.c <- i.uchar i.i;
  if not (is_char i.c) then raise Malformed;
  if i.cr && i.c = u_nl then begin (* cr nl business *)
    i.c <- i.uchar i.i;
    (* The character replacing the skipped LF must be validated too;
       previously it escaped the {Char} check. *)
    if not (is_char i.c) then raise Malformed
  end;
  if i.c = u_cr then (i.cr <- true; i.c <- u_nl) else i.cr <- false
(* Like nextc, but reaching the end of the byte source sets the lookahead
   to the end of input sentinel instead of raising End_of_file. *)
let nextc_eof i =
  match nextc i with
  | () -> ()
  | exception End_of_file -> i.c <- u_eoi

(* Advance past whitespace; the end of input is an error here. *)
let rec skip_white i = if is_white i.c then (nextc i; skip_white i)

(* Advance past whitespace; the end of input is allowed here. *)
let rec skip_white_eof i =
  if is_white i.c then (nextc_eof i; skip_white_eof i)

(* Consume the character [c] or fail with an expected-character error. *)
let accept i c = if i.c <> c then err_expected_chars i [ c ] else nextc i

(* Reset, and append a character to, the identifier and data buffers. *)
let clear_ident i = Buffer.clear i.ident
let clear_data i = Buffer.clear i.data
let addc_ident i c = Buffer.add_uchar i.ident c
let addc_data i c = Buffer.add_uchar i.data c
(* [addc_data_strip i c] appends [c] to the data buffer with whitespace
   normalisation: whitespace is only remembered, and a single space is
   emitted before the next non-white character unless the buffer is still
   empty. Leading and trailing whitespace is therefore dropped and inner
   runs collapse to one space. *)
let addc_data_strip i c =
  if is_white c then i.last_white <- true
  else begin
    let pending_space = i.last_white && Buffer.length i.data <> 0 in
    if pending_space then addc_data i u_space;
    i.last_white <- false;
    addc_data i c
  end
(* expand_name i (prefix, local) replaces the prefix of a qualified name
   by the namespace name it is bound to and returns (uri, local).
   Lookup order: the bindings table i.ns first; when the prefix is bound
   to a non-empty uri that uri is used. A binding to the empty string
   means no namespace for the empty prefix, and for any other prefix the
   i.fun_ns callback is asked. A prefix with no binding at all is also
   passed to the callback. Raises Error with `Unknown_ns_prefix when the
   callback returns None. *)
let expand_name i (prefix, local) =
(* Ask the user callback; failing that the prefix is unknown. *)
let external_ prefix = match i.fun_ns prefix with
| None -> err i (`Unknown_ns_prefix prefix)
| Some uri -> uri
in
(* Note that the handler below covers the whole body, so a Not_found
   escaping from the callback in the third branch is caught as well and
   the callback is then invoked a second time. *)
try
let uri = Ht.find i.ns prefix in
if not (str_empty uri) then (uri, local) else
if str_empty prefix then String.empty, local else
(external_ prefix), local (* unbound with xmlns:prefix="" *)
with Not_found -> external_ prefix, local
(* find_encoding i selects the character lexer for the input and reads the
   first character into the lookahead. It returns true when the encoding
   is settled (given by the user or identified by a byte order mark), in
   which case the caller ignores the encoding named in the XML
   declaration, and false when the declaration should still be consulted.
   Bytes are read with the byte lexer installed by make_input until a
   lexer is chosen. Raises Error with `Malformed_char_stream on a
   truncated or missing byte order mark where one is required. *)
let find_encoding i = (* Encoding mess. *)
(* Install a lexer, reset the column and read the first real character. *)
let reset uchar i = i.uchar <- uchar; i.col <- 0; nextc i in
match i.enc with
| None -> (* User doesn't know encoding. *)
begin match nextc i; i.c with
| 0xFE -> (* UTF-16BE BOM. *)
nextc i; if i.c <> 0xFF then err i `Malformed_char_stream;
reset uchar_utf16be i;
true
| 0xFF -> (* UTF-16LE BOM. *)
nextc i; if i.c <> 0xFE then err i `Malformed_char_stream;
reset uchar_utf16le i;
true
| 0xEF -> (* UTF-8 BOM. *)
nextc i; if i.c <> 0xBB then err i `Malformed_char_stream;
nextc i; if i.c <> 0xBF then err i `Malformed_char_stream;
reset uchar_utf8 i;
true
| 0x3C | _ -> (* UTF-8 or other, try declaration. *)
(* The byte already read stays in the lookahead; only later characters
   go through the UTF-8 lexer. *)
i.uchar <- uchar_utf8;
false
end
| Some e -> (* User knows encoding. *)
begin match e with
| `US_ASCII -> reset uchar_ascii i
| `ISO_8859_1 -> reset uchar_iso_8859_1 i
| `UTF_8 -> (* Skip BOM if present. *)
reset uchar_utf8 i; if i.c = u_bom then (i.col <- 0; nextc i)
| `UTF_16 -> (* Which UTF-16 ? look BOM. *)
let b0 = nextc i; i.c in
let b1 = nextc i; i.c in
begin match b0, b1 with
| 0xFE, 0xFF -> reset uchar_utf16be i
| 0xFF, 0xFE -> reset uchar_utf16le i
| _ -> err i `Malformed_char_stream;
end
| `UTF_16BE -> (* Skip BOM if present. *)
reset uchar_utf16be i; if i.c = u_bom then (i.col <- 0; nextc i)
| `UTF_16LE -> (* Skip BOM if present. *)
reset uchar_utf16le i; if i.c = u_bom then (i.col <- 0; nextc i)
end;
true (* Ignore xml declaration. *)
(* {NCName} (Namespace 1.1)
   Reads a colon-free name starting at the lookahead into the identifier
   buffer and returns it. Raises an illegal-character error if the
   lookahead cannot start a name. *)
let p_ncname i =
  clear_ident i;
  if not (is_name_start_char i.c) then err_illegal_char i i.c
  else begin
    (* Store the current character, then keep going while the next one
       may continue a name. *)
    let rec collect () =
      addc_ident i i.c;
      nextc i;
      if is_name_char i.c then collect ()
    in
    collect ();
    Buffer.contents i.ident
  end
(* {QName} (Namespace 1.1)
   Reads a possibly prefixed name and returns (prefix, local); the prefix
   is empty when the name contains no colon. *)
let p_qname i =
  let first = p_ncname i in
  if i.c = u_colon then begin
    nextc i;
    let local = p_ncname i in
    (first, local)
  end
  else (String.empty, first)
(* {CharRef}. On entry the lookahead is the '#' that follows '&'.
   Parses a decimal (&#N;) or hexadecimal (&#xN;) character reference up
   to and including the closing ';' and returns the referenced character
   as a one-character string.
   Raises Error with `Illegal_char_ref, carrying the text found after the
   '#', when the reference is empty, contains a character that is not a
   digit of the right base, or does not denote an XML {Char}. *)
let p_charref i =
  let max_code = 0x10FFFF in
  let c = ref 0 in
  (* Add one digit to the value. Once the value is beyond the Unicode
     range it is left alone: it is invalid whatever follows, and growing
     it further could overflow the native int and wrap around to a value
     that looks like a legal character. *)
  let push base digit = if !c <= max_code then c := !c * base + digit in
  clear_ident i;
  nextc i;
  if i.c = u_scolon then err i (`Illegal_char_ref String.empty) else
  begin
    try
      if i.c = u_x then
        begin
          addc_ident i i.c;
          nextc i;
          while (i.c <> u_scolon) do
            addc_ident i i.c;
            if not (is_hex_digit i.c) then raise Exit;
            push 16 (if i.c <= u_9 then i.c - 48 else
                     if i.c <= u_F then i.c - 55 else
                     i.c - 87);
            nextc i
          done
        end
      else
        while (i.c <> u_scolon) do
          addc_ident i i.c;
          if not (is_digit i.c) then raise Exit;
          push 10 (i.c - 48);
          nextc i
        done
    with Exit ->
      (* Not a digit: mark the value invalid and collect the rest of the
         reference for the error message. The offending character is
         already in the buffer, so step past it first; otherwise it would
         be reported twice. *)
      c := -1;
      nextc i;
      while i.c <> u_scolon do addc_ident i i.c; nextc i done
  end;
  nextc i;
  if is_char !c then (clear_ident i; addc_ident i !c; Buffer.contents i.ident)
  else err i (`Illegal_char_ref (Buffer.contents i.ident))
(* Table of the five predefined entities: name -> replacement text. *)
let predefined_entities =
  let table = Ht.create 5 in
  List.iter
    (fun (name, text) -> Ht.add table (str name) (str text))
    [ "lt", "<"; "gt", ">"; "amp", "&"; "apos", "'"; "quot", "\"" ];
  table
(* {EntityRef}, '&' was eaten.
   Reads the entity name and the closing ';' and returns the replacement
   text: from the predefined table when the name is there, otherwise from
   the user callback. Raises Error with `Unknown_entity_ref when neither
   knows the name. *)
let p_entity_ref i =
  let ent = p_ncname i in
  accept i u_scolon;
  match Ht.find predefined_entities ent with
  | text -> text
  | exception Not_found ->
      begin match i.fun_entity ent with
      | Some text -> text
      | None -> err i (`Unknown_entity_ref ent)
      end
(* {Reference}
   On entry the lookahead is '&'. Steps past it and parses a character
   reference when '#' follows, an entity reference otherwise; returns the
   replacement text. *)
let p_reference i =
  nextc i;
  (if i.c = u_sharp then p_charref else p_entity_ref) i
(* {S}? {AttValue}
   Skips whitespace, reads a value delimited by single or double quotes
   and returns it. The value always goes through addc_data_strip, so
   leading and trailing whitespace is dropped and inner runs of whitespace
   become one space. References are expanded and their replacement text is
   normalised the same way. Raises Error if no quote is found or if the
   value contains '<'. *)
let p_attr_value i = (* {S}? {AttValue} *)
skip_white i;
(* The closing delimiter must be the same quote as the opening one. *)
let delim =
if i.c = u_quot || i.c = u_apos then i.c else
err_expected_chars i [ u_quot; u_apos]
in
nextc i;
skip_white i;
clear_data i;
(* Start as if whitespace had just been seen so that no leading space is
   produced. *)
i.last_white <- true;
while (i.c <> delim) do
if i.c = u_lt then err_illegal_char i u_lt else
if i.c = u_amp then String.iter (addc_data_strip i) (p_reference i)
else (addc_data_strip i i.c; nextc i)
done;
nextc i;
Buffer.contents i.data
(* ({S} {Attribute})* {S}?
   Parses the attributes of a start tag up to, but not including, the
   closing '/' or '>'. Returns the list of prefixes bound by namespace
   declarations in this tag and the attributes with unexpanded names, in
   reverse document order.
   Side effects: namespace declarations are added to the bindings table
   i.ns, and an xml:space attribute updates i.stripping. *)
let p_attributes i = (* ({S} {Attribute})* {S}? *)
let rec aux i pre_acc acc =
(* An attribute must be preceded by whitespace. *)
if not (is_white i.c) then pre_acc, acc else
begin
skip_white i;
if i.c = u_slash || i.c = u_gt then pre_acc, acc else
begin
let (prefix, local) as n = p_qname i in
let v = skip_white i; accept i u_eq; p_attr_value i in
let att = n, v in
if str_empty prefix && str_eq local n_xmlns then
begin (* xmlns *)
(* Default namespace declaration: binds the empty prefix. *)
Ht.add i.ns String.empty v;
aux i (String.empty :: pre_acc) (att :: acc)
end
else if str_eq prefix n_xmlns then
begin (* xmlns:local *)
Ht.add i.ns local v;
aux i (local :: pre_acc) (att :: acc)
end
else if str_eq prefix n_xml && str_eq local n_space then
begin (* xml:space *)
(* preserve turns stripping off, default restores the input's default;
   any other value leaves the current behaviour unchanged. *)
if str_eq v v_preserve then i.stripping <- false else
if str_eq v v_default then i.stripping <- i.strip else ();
aux i pre_acc (att :: acc)
end
else
aux i pre_acc (att :: acc)
end
end
in
aux i [] [] (* Returns a list of bound prefixes and attributes *)
(* Parses a markup limit: looks at the lookahead, consumes the opening of
   the construct found there and records its kind in i.limit. Nothing is
   consumed for Text and Eoi. For Dtd only the leading '<!' is consumed;
   the 'D' stays in the lookahead. Raises Error on '<!' followed by
   something else than '-', 'D' or '[', and on '<![' not followed by
   CDATA[. *)
let p_limit i = (* Parses a markup limit *)
i.limit <-
if i.c = u_eoi then Eoi else
if i.c <> u_lt then Text else
begin
nextc i;
if i.c = u_qmark then (nextc i; Pi (p_qname i)) else
if i.c = u_slash then
begin
nextc i;
let n = p_qname i in
skip_white i;
Etag n
end
else if i.c = u_emark then
begin
nextc i;
if i.c = u_minus then (nextc i; accept i u_minus; Comment) else
if i.c = u_D then Dtd else
if i.c = u_lbrack then
begin
nextc i;
clear_ident i;
(* Read the six characters that should spell CDATA[. *)
for k = 1 to 6 do (addc_ident i i.c; nextc i) done;
let cdata = Buffer.contents i.ident in
if str_eq cdata s_cdata then Cdata else
err_expected_seqs i [ s_cdata ] cdata
end
else
err i (`Illegal_char_seq (cat (str "<!") (str_of_char i.c)))
end
else
Stag (p_qname i)
end
(* {Comment}, '<!--' was eaten.
   Skips to the end of the comment. A double '-' inside the comment must
   be followed by '>', otherwise an expected-character error is raised.
   The character after the comment is read with nextc_eof, so a comment
   may be the last thing in the input. *)
let rec skip_comment i =
  if i.c <> u_minus then (nextc i; skip_comment i)
  else begin
    nextc i;
    if i.c = u_minus then begin
      nextc i;
      if i.c <> u_gt then err_expected_chars i [ u_gt ];
      nextc_eof i
    end
    else skip_comment i
  end
(* {PI}, '<?' qname was eaten.
   Skips to the closing '?>' of the processing instruction. The character
   after it is read with nextc_eof, so a processing instruction may be the
   last thing in the input. *)
let rec skip_pi i =
  if i.c <> u_qmark then (nextc i; skip_pi i)
  else begin
    nextc i;
    if i.c = u_gt then nextc_eof i else skip_pi i
  end
(* {Misc}*
   Skips processing instructions, comments and whitespace, re-reading the
   limit after each, until something else is found. A processing
   instruction whose unprefixed target is xml (in any letter case) stops
   the skipping when allow_xmlpi is true and is an illegal-sequence error
   otherwise. *)
let rec skip_misc i ~allow_xmlpi = match i.limit with (* {Misc}* *)
| Pi (p,l) when (str_empty p && str_eq n_xml (String.lowercase l)) ->
if allow_xmlpi then () else err i (`Illegal_char_seq l)
| Pi _ -> skip_pi i; p_limit i; skip_misc i ~allow_xmlpi
| Comment -> skip_comment i; p_limit i; skip_misc i ~allow_xmlpi
| Text when is_white i.c ->
skip_white_eof i; p_limit i; skip_misc i ~allow_xmlpi
| _ -> ()
(* {CharData}* ({Reference}{Chardata})*
   Reads character data up to the next '<', handing every character to
   addc i. References are expanded and their replacement text is handed to
   addc as well. Raises Error with `Illegal_char_seq if the sequence ]]>
   occurs in the data. *)
let p_chardata addc i = (* {CharData}* ({Reference}{Chardata})* *)
while (i.c <> u_lt) do
if i.c = u_amp then String.iter (addc i) (p_reference i)
else if i.c = u_rbrack then
begin
addc i i.c;
nextc i;
if i.c = u_rbrack then begin
addc i i.c;
nextc i; (* detects ']'*']]>' *)
(* Any further ']' are ordinary data; only a '>' right after at least two
   ']' is illegal. *)
while (i.c = u_rbrack) do addc i i.c; nextc i done;
if i.c = u_gt then err i (`Illegal_char_seq (str "]]>"));
end
end
else
(addc i i.c; nextc i)
done
(* {CData} {CDEnd}
   Reads the contents of a CDATA section, handing every character to
   addc i, and consumes the closing ]]>. A ']' is only emitted once it is
   known not to be part of the terminator. The loop is left through the
   local Exit exception.
   NOTE(review): the function does not call itself, so the rec keyword is
   not needed. *)
let rec p_cdata addc i = (* {CData} {CDEnd} *)
try while (true) do
if i.c = u_rbrack then begin
nextc i;
(* One ']' is pending here. Each further ']' either starts the terminator
   (when '>' follows) or releases one pending ']' as data. *)
while i.c = u_rbrack do
nextc i;
if i.c = u_gt then (nextc i; raise Exit);
addc i u_rbrack
done;
addc i u_rbrack;
end;
addc i i.c;
nextc i;
done with Exit -> ()
(* {XMLDecl}?
   If the current limit is a processing instruction with the unprefixed
   target xml, parses it as an XML declaration: a mandatory version (1.0
   or 1.1), then an optional encoding, then an optional standalone (yes or
   no), then '?>', and finally reads the next limit. Does nothing for any
   other limit.
   Unless ignore_enc is true, the declared encoding (compared in
   lowercase) selects the character lexer used from then on. A declared
   utf-16 is accepted silently when ignore_utf16 is true and is a
   `Malformed_char_stream error otherwise. An encoding name that is not
   recognised raises `Unknown_encoding. *)
let p_xml_decl i ~ignore_enc ~ignore_utf16 = (* {XMLDecl}? *)
let yes_no = [v_yes; v_no] in
(* Parses '=' and a quoted value. *)
let p_val i = skip_white i; accept i u_eq; skip_white i; p_attr_value i in
(* Same, and checks that the value is one of exp. *)
let p_val_exp i exp =
let v = p_val i in
if not (List.exists (str_eq v) exp) then err_expected_seqs i exp v
in
match i.limit with
| Pi (p, l) when (str_empty p && str_eq l n_xml) ->
let v = skip_white i; p_ncname i in
if not (str_eq v n_version) then err_expected_seqs i [ n_version ] v;
p_val_exp i [v_version_1_0; v_version_1_1];
skip_white i;
if i.c <> u_qmark then begin
let n = p_ncname i in
if str_eq n n_encoding then begin
let enc = String.lowercase (p_val i) in
if not ignore_enc then begin
if str_eq enc v_utf_8 then i.uchar <- uchar_utf8 else
if str_eq enc v_utf_16be then i.uchar <- uchar_utf16be else
if str_eq enc v_utf_16le then i.uchar <- uchar_utf16le else
if str_eq enc v_iso_8859_1 then i.uchar <- uchar_iso_8859_1 else
if str_eq enc v_us_ascii then i.uchar <- uchar_ascii else
if str_eq enc v_ascii then i.uchar <- uchar_ascii else
if str_eq enc v_utf_16 then
if ignore_utf16 then () else (err i `Malformed_char_stream)
(* A BOM should have been found. *)
else
err i (`Unknown_encoding enc)
end;
skip_white i;
if i.c <> u_qmark then begin
let n = p_ncname i in
if str_eq n n_standalone then p_val_exp i yes_no else
err_expected_seqs i [ n_standalone; str "?>" ] n
end
end
else if str_eq n n_standalone then
p_val_exp i yes_no
else
err_expected_seqs i [ n_encoding; n_standalone; str "?>" ] n
end;
skip_white i;
accept i u_qmark;
accept i u_gt;
p_limit i
| _ -> ()
(* {Misc}* {doctypedecl} {Misc}*
   Skips the prolog and returns the `Dtd signal: `Dtd None when no
   document type declaration is present, otherwise `Dtd (Some text) where
   text is the declaration from its opening '<!' to its closing '>'.
   The declaration is not interpreted. Its end is found by counting '<'
   and '>', ignoring those inside quoted literals; comments inside it are
   skipped and do not appear in the text. *)
let p_dtd_signal i =(* {Misc}* {doctypedecl} {Misc}* *)
skip_misc i ~allow_xmlpi:false;
if i.limit <> Dtd then `Dtd None else
begin
let buf = addc_data i in
(* Number of '<' still waiting for their '>'; the opening '<!' counts. *)
let nest = ref 1 in
clear_data i;
buf u_lt; buf u_emark; (* add eaten "<!" *)
while (!nest > 0) do
if i.c = u_lt then
begin
nextc i;
if i.c <> u_emark then
(buf u_lt; incr nest)
else
begin
nextc i;
if i.c <> u_minus then (* Careful with comments! *)
(buf u_lt; buf u_emark; incr nest)
else
begin
nextc i;
if i.c <> u_minus then
(buf u_lt; buf u_emark; buf u_minus; incr nest)
else
(nextc i; skip_comment i)
end
end
end
else if i.c = u_quot || i.c = u_apos then
begin
(* Quoted literal: copied verbatim up to the matching quote. *)
let c = i.c in
buf c; nextc i;
while (i.c <> c) do (buf i.c; nextc i) done;
buf c; nextc i
end
else if i.c = u_gt then (buf u_gt; nextc i; decr nest)
else (buf i.c; nextc i)
done;
let dtd = Buffer.contents i.data in
p_limit i;
skip_misc i ~allow_xmlpi:false;
`Dtd (Some dtd);
end
(* [p_data i] gathers the character data found before the next start or
   end tag and returns it. Text and CDATA sections are concatenated;
   comments and processing instructions in between are skipped. The data
   is whitespace-normalised when stripping is in effect. Raises Error on a
   document type declaration or on the end of input. *)
let p_data i =
  let addc = if i.stripping then addc_data_strip else addc_data in
  let rec gather () =
    (* Consume one construct, read the next limit and carry on. *)
    let after consume = consume i; p_limit i; gather () in
    match i.limit with
    | Stag _ | Etag _ -> ()
    | Text -> after (p_chardata addc)
    | Cdata -> after (p_cdata addc)
    | Pi _ -> after skip_pi
    | Comment -> after skip_comment
    | Dtd -> err i (`Illegal_char_seq (str "<!D"))
    | Eoi -> err i `Unexpected_eoi
  in
  clear_data i;
  i.last_white <- true;
  gather ();
  Buffer.contents i.data
(* p_el_start_signal i n builds the `El_start signal for the element whose
   qualified name n has just been read. It parses the attributes, which
   registers the namespace declarations of the tag, pushes a scope holding
   the qualified name, the prefixes bound by the tag and the stripping
   behaviour in effect before the tag, and expands the element and
   attribute names. The closing '>' or '/>' is not consumed here. *)
let p_el_start_signal i n =
(* Prefixed attributes are expanded like element names; the bare xmlns
   attribute is given the xmlns namespace; other unprefixed attributes are
   left as they are. *)
let expand_att (((prefix, local) as n, v) as att) =
if not (str_eq prefix String.empty) then expand_name i n, v else
if str_eq local n_xmlns then (ns_xmlns, n_xmlns), v else
att (* default namespaces do not influence attributes. *)
in
let strip = i.stripping in (* save it here, p_attributes may change it. *)
let prefixes, atts = p_attributes i in
i.scopes <- (n, prefixes, strip) :: i.scopes;
(* atts is in reverse document order; rev_map restores the order. *)
`El_start ((expand_name i n), List.rev_map expand_att atts)
(* p_el_end_signal i n closes the innermost open element and returns
   `El_end. It checks that the lookahead is '>' and that n is the
   qualified name the element was opened with, pops the scope, restores
   the stripping behaviour saved when the element was opened and removes
   the prefix bindings the element introduced. When the root element is
   closed the lookahead is set to the end-of-document sentinel instead of
   reading further. Must only be called while an element is open. *)
let p_el_end_signal i n = match i.scopes with
| (n', prefixes, strip) :: scopes ->
if i.c <> u_gt then err_expected_chars i [ u_gt ];
if not (str_eq n n') then err_expected_seqs i [name_str n'] (name_str n);
i.scopes <- scopes;
i.stripping <- strip;
List.iter (Ht.remove i.ns) prefixes;
if scopes = [] then i.c <- u_end_doc else (nextc i; p_limit i);
`El_end
| _ -> assert false
(* p_signal i parses and returns the signal that follows the one held in
   i.peek.
   Outside of any element only a start tag is accepted; anything else
   raises Error with `Expected_root_element.
   Inside an element, if the pending signal is an `El_start its tag is
   finished first: '>' is consumed, while for '/>' the limit is turned
   into the end tag of the element just opened so that an `El_end is
   produced next. Then the next start tag, end tag or non-empty data is
   returned; comments, processing instructions and empty data are
   skipped. *)
let p_signal i =
if i.scopes = [] then
match i.limit with
| Stag n -> p_el_start_signal i n
| _ -> err i `Expected_root_element
else
let rec find i = match i.limit with
| Stag n -> p_el_start_signal i n
| Etag n -> p_el_end_signal i n
| Text | Cdata ->
let d = p_data i in
if str_empty d then find i else `Data d
| Pi _ -> skip_pi i; p_limit i; find i
| Comment -> skip_comment i; p_limit i; find i
| Dtd -> err i (`Illegal_char_seq (str "<!D"))
| Eoi -> err i `Unexpected_eoi
in
begin match i.peek with
| `El_start (n, _) -> (* finish to input start el. *)
skip_white i;
if i.c = u_gt then (accept i u_gt; p_limit i) else
if i.c = u_slash then
begin
(* Empty element: the name to close is the one on top of the scopes. *)
let tag = match i.scopes with
| (tag, _, _) :: _ -> tag | _ -> assert false
in
(nextc i; i.limit <- Etag tag)
end
else
err_expected_chars i [ u_slash; u_gt ]
| _ -> ()
end;
find i
(* eoi i is true when the end of input has been reached.
   When the parser is at the start of a document this also parses the
   prolog and stores the resulting `Dtd signal in i.peek. For the first
   document the encoding is determined first; for a following document
   (recognised by the pending `El_end of the previous root) the encoding
   declaration is honoured and a declared utf-16 is tolerated.
   Buffer.Full, Malformed and End_of_file are reported as Error with
   `Max_buffer_size, `Malformed_char_stream and `Unexpected_eoi. *)
let eoi i =
try
if i.c = u_eoi then true else
if i.c <> u_start_doc then false else (* In a document. *)
if i.peek <> `El_end then (* Start of document sequence. *)
begin
let ignore_enc = find_encoding i in
p_limit i;
p_xml_decl i ~ignore_enc ~ignore_utf16:false;
i.peek <- p_dtd_signal i;
false
end
else (* Subsequent documents. *)
begin
nextc_eof i;
p_limit i;
if i.c = u_eoi then true else
begin
(* Trailing {Misc} of the previous document; an XML declaration is
   allowed here because a new document may start. *)
skip_misc i ~allow_xmlpi:true;
if i.c = u_eoi then true else
begin
p_xml_decl i ~ignore_enc:false ~ignore_utf16:true;
i.peek <- p_dtd_signal i;
false
end
end
end
with
| Buffer.Full -> err i `Max_buffer_size
| Malformed -> err i `Malformed_char_stream
| End_of_file -> err i `Unexpected_eoi
(* peek i returns the next signal without consuming it. Raises Error with
   `Unexpected_eoi at the end of input. *)
let peek i = if eoi i then err i `Unexpected_eoi else i.peek
(* input i returns the next signal and parses the one after it into
   i.peek. When the root element has just been closed, the pending signal
   (its `El_end) is returned and the parser is put back in the
   start-of-document state without reading further.
   Buffer.Full, Malformed and End_of_file are reported as Error with
   `Max_buffer_size, `Malformed_char_stream and `Unexpected_eoi. *)
let input i =
try
if i.c = u_end_doc then (i.c <- u_start_doc; i.peek) else
let s = peek i in
i.peek <- p_signal i;
s
with
| Buffer.Full -> err i `Max_buffer_size
| Malformed -> err i `Malformed_char_stream
| End_of_file -> err i `Unexpected_eoi
(* input_tree ~el ~data i reads one `Data signal or one complete element
   and builds a value from it: data is applied to character data, el to
   the tag of an element and the list of its already built children in
   document order. Raises Invalid_argument if the next signal is neither
   `El_start nor `Data.
   The traversal is an explicit loop over signals: tags holds the tags of
   the open elements and context the reversed child lists collected so far
   for each of them, innermost first. *)
let input_tree ~el ~data i = match input i with
| `Data d -> data d
| `El_start tag ->
let rec aux i tags context = match input i with
| `El_start tag -> aux i (tag :: tags) ([] :: context)
| `El_end ->
begin match tags, context with
| tag :: tags', childs :: context' ->
let el = el tag (List.rev childs) in
(* Attach the finished element to its parent, or return it if it was the
   outermost one. *)
begin match context' with
| parent :: context'' -> aux i tags' ((el :: parent) :: context'')
| [] -> el
end
| _ -> assert false
end
| `Data d ->
begin match context with
| childs :: context' -> aux i tags (((data d) :: childs) :: context')
| [] -> assert false
end
| `Dtd _ -> assert false
in
aux i (tag :: []) ([] :: [])
| _ -> invalid_arg err_input_tree
(* [input_doc_tree ~el ~data i] reads a `Dtd signal followed by a tree
   (see input_tree) and returns both. Raises Invalid_argument if the next
   signal is not `Dtd. *)
let input_doc_tree ~el ~data i =
  match input i with
  | `Dtd d ->
      let tree = input_tree ~el ~data i in
      (d, tree)
  | _ -> invalid_arg err_input_doc_tree

(* Current (line, column) of the input. *)
let pos { line; col; _ } = (line, col)
(* Output *)
(* Shape into which user tree nodes are projected for output: an element
   with its children, or character data. *)
type 'a frag = [ `El of tag * 'a list | `Data of string ]
(* Byte destinations: an output channel, a standard buffer, or a function
   called with each output byte. *)
type dest = [
| `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
(* Printer state. *)
type output =
{ decl : bool; (* True if the XML declaration should be output. *)
nl : bool; (* True if a newline is output at the end. *)
indent : int option; (* Optional indentation. *)
fun_prefix : string -> string option; (* Prefix callback. *)
prefixes : string Ht.t; (* uri -> prefix bindings. *)
outs : std_string -> int -> int -> unit; (* String output: string, start
                                            index, length. *)
outc : char -> unit; (* character output. *)
mutable last_el_start : bool; (* True if last signal was `El_start *)
mutable scopes : (name * (string list)) list;
(* Qualified el. name and bound uris. *)
mutable depth : int; } (* Scope depth. *)
(* Messages for the Invalid_argument exceptions of the output functions.
   err_prefix takes the namespace name as a standard string. *)
let err_prefix uri = "unbound namespace (" ^ uri ^ ")"
let err_dtd = "dtd signal not allowed here"
let err_el_start = "start signal not allowed here"
let err_el_end = "end signal without matching start signal"
let err_data = "data signal not allowed here"
(* [make_output ?decl ?nl ?indent ?ns_prefix d] creates a printer state
   writing to [d].
   - [decl]: whether the XML declaration should be output (default true).
   - [nl]: whether a newline is output at the end (default false).
   - [indent]: optional indentation (default None).
   - [ns_prefix]: fallback giving a prefix for a namespace name that has no
     binding (default: always None).
   The state starts at depth -1 with the empty, xml and xmlns namespaces
   bound to their fixed prefixes. *)
let make_output ?(decl = true) ?(nl = false) ?(indent = None)
    ?(ns_prefix = fun _ ->None) d =
  (* Feed the code of a character to a byte function. *)
  let emit_code f c = f (Char.code c) in
  let outs, outc = match d with
  | `Channel oc -> output_substring oc, output_char oc
  | `Buffer b -> Std_buffer.add_substring b, Std_buffer.add_char b
  | `Fun f ->
      let write_sub s first len =
        let last = first + len - 1 in
        for k = first to last do emit_code f (Std_string.get s k) done
      in
      write_sub, emit_code f
  in
  let prefixes =
    let table = Ht.create 10 in
    List.iter (fun (uri, prefix) -> Ht.add table uri prefix)
      [ String.empty, String.empty; ns_xml, n_xml; ns_xmlns, n_xmlns ];
    table
  in
  { decl; nl; indent; fun_prefix = ns_prefix; prefixes; outs; outc;
    last_el_start = false; scopes = []; depth = -1; }
(* Current scope depth of the output. *)
let output_depth o = o.depth
(* Writes a whole standard string to the output. *)
let outs o s = o.outs s 0 (Std_string.length s)
(* Converts a string to a standard string through String.to_utf_8, keeping
   the last piece handed to the fold function.
   NOTE(review): this is the full conversion only if to_utf_8 calls its
   function once with the whole result; confirm against the String
   implementation. *)
let str_utf_8 s = String.to_utf_8 (fun _ s -> s) "" s
(* Writes a string to the output, piece by piece as produced by
   String.to_utf_8. *)
let out_utf_8 o s = ignore (String.to_utf_8 (fun o s -> outs o s; o) o s)
(* prefix_name o (ns, local) turns an expanded name into a qualified name
   (prefix, local) for output. The name xmlns in the xmlns namespace is
   written without prefix. Otherwise the prefix bound to ns in o.prefixes
   is used, and if there is none the o.fun_prefix callback is asked.
   Raises Invalid_argument when the callback returns None. *)
let prefix_name o (ns, local) =
try
if str_eq ns ns_xmlns && str_eq local n_xmlns then (String.empty, n_xmlns)
else (Ht.find o.prefixes ns, local)
with Not_found ->
match o.fun_prefix ns with
| None -> invalid_arg (err_prefix (str_utf_8 ns))
| Some prefix -> prefix, local
let bind_prefixes o atts =
let add acc ((ns, local), uri) =
if not (str_eq ns ns_xmlns) then acc else