-
Notifications
You must be signed in to change notification settings - Fork 28
/
pdfgenlex.ml
193 lines (172 loc) · 6.3 KB
/
pdfgenlex.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
(* A simple lexer, which distinguishes integers, floats and single character
delimiters. Quoted strings are also distinguished, and allow escaped quotes.
Any other non-whitespace-including string is returned as an [Ident]. *)
open Pdfutil
(* To avoid too much storage allocation (and hence garbage collection), we use
the same data type for this very basic lexing module as for the main lexing in
Pdfread. Eventually, we may unify this with the parsing type too. *)
type t =
| LexNull
| LexBool of bool
| LexInt of int
| LexReal of float
| LexString of string
| LexName of string
| LexLeftSquare
| LexRightSquare
| LexLeftDict
| LexRightDict
| LexStream of Pdf.stream
| LexEndStream
| LexObj
| LexEndObj
| LexR
| LexComment of string
| StopLexing
| LexNone
let string_of_token = function
| LexInt i -> "Int " ^ string_of_int i
| LexReal f -> "Float " ^ string_of_float f
| LexString s -> "String " ^ s
| LexName s -> "Ident " ^ s
| LexNull -> "Nothing"
| _ -> "GenLexNone"
let string_of_tokens ts =
fold_left (fun a b -> a ^ "\n " ^ b) "" (map string_of_token ts)
let is_delimiter = function
| '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '%' | '/' -> true
| _ -> false
let is_not_whitespace = function
| '\000' | '\009' | '\010' | '\012' | ' ' | '\013' -> false
| _ -> true
let is_whitespace_or_delimiter = function
| '\000' | '\009' | '\010' | '\012' | ' ' | '\013'
| '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '%' | '/' -> true
| _ -> false
(* Because String.copy has been removed from OCaml. *)
let string_copy s =
Bytes.unsafe_to_string (Bytes.copy (Bytes.unsafe_of_string s))
let lex_item s =
let len = String.length s in
if len = 0 then LexNull else
try
match String.unsafe_get s 0 with
| 'a'..'z' | 'A'..'Z' ->
LexName (string_copy s)
| '\"' when len >= 2 ->
LexString (String.sub s 1 (len - 2))
| _ ->
let rec isint s pos =
pos = ~-1 ||
match String.unsafe_get s pos with
| '.' -> false
| _ -> isint s (pos - 1)
in
if isint s (len - 1)
then
begin try LexInt (int_of_string s) with
_ ->
begin try
(* Detect malformed numbers "--2" etc. which can appear in some PDFs. *)
if len > 1 && String.unsafe_get s 0 = '-' && String.unsafe_get s 1 = '-' then
LexInt (int_of_string (String.sub s 1 (len - 1)))
else
raise Exit (* nothing we can salvage *)
with
_ -> LexReal (float_of_string s) (* Integer > 2^30 on 32 bit system, int_of_string would fail. *)
end
end
else
begin try LexReal (float_of_string s) with
_ ->
(* Detect malformed numbers "--2.5" etc. which can appear in some PDFs. *)
if len > 1 && String.unsafe_get s 0 = '-' && String.unsafe_get s 1 = '-' then
LexReal (float_of_string (String.sub s 1 (len - 1)))
else
raise Exit (* nothing we can salvage *)
end
with
_ -> LexName (string_copy s)
(* Return the string between and including the current position and before the
next character satisfying a given predicate, leaving the position at the
character following the last one returned. End of input is considered a
delimiter, and the characters up to it are returned if it is reached. *)
let rec lengthuntil i n =
match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more -> n
| x ->
if is_whitespace_or_delimiter (Char.unsafe_chr x)
then n
else lengthuntil i (n + 1)
(* Pre-built strings to prevent allocation just to do int_of_string,
float_of_string etc. What we actually need is int_of_substring etc, but this
will require patching OCaml. *)
let strings =
Array.init 17 (fun i -> Bytes.make i ' ')
let getuntil i =
let p = i.Pdfio.pos_in () in
let l = lengthuntil i 0 in
i.Pdfio.seek_in p;
let s = if l <= 16 then Array.unsafe_get strings l else Bytes.create l in
Pdfio.setinit_string i s 0 l;
Bytes.unsafe_to_string s (* Will never be altered, but copied or discarded by get_string_inner. *)
(* The same, but don't return anything. *)
let rec ignoreuntil f i =
match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more -> ()
| x -> if f (Char.unsafe_chr x) then Pdfio.rewind i else ignoreuntil f i
(* Position on the next non-whitespace character. *)
let dropwhite i =
ignoreuntil is_not_whitespace i
(* Get a quoted string, including the quotes. Any quotes inside must be
escaped. *)
let rec get_string_inner b i =
match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more -> raise End_of_file
| x when x = int_of_char '\"' ->
Buffer.add_char b '\"'
| x when x = int_of_char '\\' ->
begin match i.Pdfio.input_byte () with
| x when x = Pdfio.no_more-> raise End_of_file
| x when x = int_of_char '\"' ->
Buffer.add_char b '\"';
get_string_inner b i
| x ->
Buffer.add_char b '\\';
Buffer.add_char b (Char.unsafe_chr x);
get_string_inner b i
end
| x ->
Buffer.add_char b (Char.unsafe_chr x);
get_string_inner b i
let b = Buffer.create 30
let get_string i =
Pdfio.nudge i;
Buffer.clear b;
Buffer.add_char b '\"';
get_string_inner b i;
Buffer.contents b
(* Repeatedly take a whitespace-or-delimiter-delimited section from the input,
and scan it *)
let get_section i =
match Pdfio.peek_byte i with
| x when x = Pdfio.no_more -> ""
| _ ->
dropwhite i;
match Pdfio.peek_byte i with
| x when x = Pdfio.no_more -> ""
| x when Char.unsafe_chr x = '\"' -> get_string i
| x ->
let x = Char.unsafe_chr x in
if is_delimiter x
then (Pdfio.nudge i; string_of_char x)
else getuntil i
let lex_single i =
lex_item (get_section i)
let rec lex_inner prev i =
match lex_item (get_section i) with
| LexNull -> rev prev
| x -> lex_inner (x::prev) i
let lex = lex_inner []
let lex_string s =
lex (Pdfio.input_of_string s)