-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokenizer.c
397 lines (315 loc) · 11.4 KB
/
tokenizer.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
/* ************************************************************************
*
* tokenizer
*
* (c) 1994-2012 by Markus Reschke
*
* ************************************************************************ */
/*
* local constants
*/
#define TOKENIZER_C 1
/*
* include header files
*/
/* local header files */
#include "common.h" /* common stuff */
#include "variables.h" /* global variables */
#include "functions.h" /* external functions */
/* ************************************************************************
* memory management
* ************************************************************************ */
/*
 *  release a complete token list
 *  - frees each element's string and then the element itself
 *
 *  requires:
 *  - List: pointer to first token of the list (NULL is accepted)
 */

void FreeTokenlist(Token_Type *List)
{
  Token_Type        *Follower;          /* element after the current one */

  for ( ; List != NULL; List = Follower)
  {
    /* fetch the successor first, the current element is gone after free() */
    Follower = List->Next;

    free(List->String);                 /* free(NULL) is a no-op */
    free(List);
  }
}
/* ************************************************************************
* tokenizers
* ************************************************************************ */
/*
* tokenize line into linked list of tokens
* - doesn't modify input string
* - separator is whitespace (one or multiple spaces and/or tabs)
* - supports quotations ("")
* - quoted special chars (\ quotes " and \)
* - supports comma separated strings outside quotations (comma will be kept as own token)
*/
Token_Type *Tokenize(char *Line)
{
Token_Type *List = NULL; /* list of tokens */
Token_Type *Last = NULL; /* last token in list */
Token_Type *Token; /* new token */
char *Data; /* start of token inside line */
char *TempData = NULL; /* start of token inside line and new token flag */
char *Temp; /* support pointer */
_Bool RunFlag = TRUE; /* loop control */
_Bool InQuotes = FALSE; /* token in quotation */
_Bool ErrFlag = FALSE; /* signals problem */
_Bool Quoted = FALSE; /* quoted special char */
unsigned int VirtCounter = 0; /* virtual char counter (inside a token) */
int RealCounter = 0; /* virtual char counter (inside a token) */
if (Line == NULL) return List; /* sanity check */
Data = Line; /* start of line is also first token's data */
/*
* process char by char
*/
while (RunFlag)
{
/* move chars inside token if required */
if (VirtCounter != RealCounter)
{
/* get rid of special chars by moving the real chars */
Data[VirtCounter] = Data[RealCounter];
}
/* if line's end is reached */
if (Line[0] == 0)
{
if (Quoted || InQuotes) /* got open quote */
{
/* if a quotation flag is set at the line's end
then there is an unclosed quotation */
ErrFlag = TRUE;
}
else if (RealCounter != 0) /* end after token */
{
/* the line's end marks the end of the token */
TempData = Data; /* flag to add last token */
}
/* (RealCounter == 0) is end after whitespace */
RunFlag = FALSE; /* end loop anyway */
}
/* if char is space or tab */
else if ((Line[0] == ' ') || (Line[0] == '\t'))
{
/* a whitespace marks the end of the last token and the start of a new one
but only outside quotations */
if (InQuotes) /* inside a quotation */
{
VirtCounter++; /* it's just a standard char */
}
else if (Quoted) /* quoted special char expected */
{
ErrFlag = TRUE; /* this is a syntax error */
RunFlag = FALSE; /* end loop */
}
else /* we got a true whitespace */
{
/* if it's not the first char we got a new token */
if (Data != Line)
{
TempData = Data; /* flag to create new token */
}
/* find the next non-whitespace char of the next token */
Line++; /* move to next char */
while ((Line[0] == ' ') || (Line[0] == '\t')) Line++;
Data = Line; /* set start of next token */
Line--; /* because we increase Line at end of loop */
RealCounter = -1; /* reset char counter for new token (-1 to fix increase at loop end) */
}
}
/* if char is a comma */
else if (Line[0] == ',')
{
if (InQuotes) /* in quotation mode */
{
VirtCounter++; /* it's just a standard char */
}
else if (Quoted) /* quoted special char expected */
{
ErrFlag = TRUE; /* this is a syntax error */
RunFlag = FALSE; /* end loop */
}
else /* otherwise it's a separator */
{
if (VirtCounter > 0) /* trailing comma */
{
/* create token for heading string and goto this comma again in the next loop run */
TempData = Data; /* flag to create new token */
Data = Line; /* set start of next token (this comma) */
Line--; /* because we increase Line at end of loop */
}
else /* heading comma */
{
VirtCounter++; /* this comma will be a token */
TempData = Data; /* flag to create new token */
Data = Line; /* set start of next token */
Data++; /* char after comma will be next token */
}
}
}
/* if char is a hash */
else if ((RealCounter == 0) && (Line[0] == '#'))
{
/* hash char after whitespace means the rest of the line is a remark */
RunFlag = FALSE; /* end loop */
}
/* if char is a backslash */
else if (Line[0] == 92)
{
if (Quoted) /* quoted backslash */
{
/* a quoted backslash is a normal char */
Quoted = FALSE; /* stop quote */
VirtCounter++;
}
else /* quoted special char will follow */
{
Quoted = TRUE;
}
}
/* if char is a quotation mark */
else if (Line[0] == 34)
{
if (Quoted) /* quoted quotation mark */
{
/* a quoted quotation is a normal char */
Quoted = FALSE; /* stop quote */
VirtCounter++;
}
else /* quotation mark */
{
if (InQuotes) /* in quotation mode */
{
/* quotation ends with a second quotation mark */
InQuotes = FALSE; /* stop quotation */
}
else /* start quotation */
{
InQuotes = TRUE;
}
}
}
/* otherwise char is a standard char */
else
{
if (Quoted) /* quoted special char expected */
{
ErrFlag = TRUE; /* this is a syntax error */
RunFlag = FALSE; /* end loop */
}
else /* standard char */
{
VirtCounter++;
}
}
/* if we have to save a token */
if (TempData != NULL)
{
/* create new token element and link it to token list */
Token = calloc(1, sizeof(Token_Type)); /* create new token element */
Temp = malloc(VirtCounter + 1); /* create buffer for token string */
if ((Token == NULL) || (Temp == NULL))
Log(L_ERR, "Could not allocate memory!");
strncpy(Temp, TempData, VirtCounter); /* copy token data */
Temp[VirtCounter] = 0; /* add missing 0 */
Token->String = Temp; /* move data pointer */
if (List == NULL) List = Token; /* very first token */
Token->Next = NULL; /* default end */
if (Last != NULL) Last->Next = Token; /* link new token */
Last = Token; /* update pointer to last token */
TempData = NULL; /* reset pointer/flag */
VirtCounter = 0; /* reset char counter for new token */
RealCounter = -1; /* reset char counter for new token (-1 to fix increase at loop end) */
}
Line++; /* move to line's next char */
RealCounter++; /* increment char counter for current token */
}
/*
* error handling
*/
if (ErrFlag)
{
if (Quoted) /* quoted special char */
{
Log(L_WARN, "Quote error: %s", Line);
}
else if (InQuotes) /* unclosed quotation */
{
Log(L_WARN, "Quotation error: %s", Line);
}
FreeTokenlist(List); /* free tokens */
List = NULL;
}
return List;
}
/* ************************************************************************
* untokenize
* ************************************************************************ */
/*
 *  untokenize token list
 *  - joins all token strings into one new string, separated by a single space
 *  - tokens without a string are skipped
 *
 *  requires:
 *  - List: pointer to first token of the list
 *
 *  returns:
 *  - pointer to a newly allocated string on success (caller has to free() it);
 *    an empty string if no token carries a string
 *  - NULL on any problem (no list, no memory)
 */

char *UnTokenize(Token_Type *List)
{
  char              *String = NULL;     /* return value */
  char              *Buffer;            /* write position inside String */
  Token_Type        *Token;
  size_t            Size = 0;           /* required buffer size */
  size_t            Length;             /* length of current token string */
  int               NeedSpace = 0;      /* set after first copied token */

  /* sanity checks */
  if (List == NULL) return String;

  /* add all token string sizes */
  for (Token = List; Token != NULL; Token = Token->Next)
  {
    if (Token->String)                  /* sanity check */
    {
      Size += strlen(Token->String) + 1;  /* 1 for space/0 */
    }
  }

  /* no token string at all: still need room for the terminating 0,
     malloc(0) would give us either NULL or an unusable buffer */
  if (Size == 0) Size = 1;

  /* allocate string */
  String = malloc(Size);
  if (String == NULL)                   /* error */
  {
    Log(L_ERR, "No memory!\n");
    return String;
  }

  /* copy tokens into string */
  String[0] = 0;                        /* valid empty string for a start */
  Buffer = String;

  for (Token = List; Token != NULL; Token = Token->Next)
  {
    if (Token->String == NULL) continue;  /* sanity check */

    if (NeedSpace)                      /* separate from former token */
    {
      Buffer[0] = ' ';                  /* set space */
      Buffer++;                         /* move pointer by one char */
    }

    Length = strlen(Token->String);     /* get length */
    strcpy(Buffer, Token->String);      /* copy token string including 0 */
    Buffer += Length;                   /* move pointer to new end */
    NeedSpace = 1;
  }

  return String;
}
/* ************************************************************************
* clean up of local definitions
* ************************************************************************ */
/*
* undo local constants
*/
#undef TOKENIZER_C
/* ************************************************************************
* EOF
* ************************************************************************ */