1 /*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net>
13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14 Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15 Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com>
17 Copyright (c) 2016 Don Lewis <truckman@apache.org>
18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net>
20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com>
21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com>
22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
23 Copyright (c) 2021 Donghee Na <donghee.na@python.org>
24 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
25 Copyright (c) 2022 Sean McBride <sean@rogue-research.com>
26 Copyright (c) 2023 Hanno Böck <hanno@gentoo.org>
27 Licensed under the MIT license:
28
29 Permission is hereby granted, free of charge, to any person obtaining
30 a copy of this software and associated documentation files (the
31 "Software"), to deal in the Software without restriction, including
32 without limitation the rights to use, copy, modify, merge, publish,
33 distribute, sublicense, and/or sell copies of the Software, and to permit
34 persons to whom the Software is furnished to do so, subject to the
35 following conditions:
36
37 The above copyright notice and this permission notice shall be included
38 in all copies or substantial portions of the Software.
39
40 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
41 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
45 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46 USE OR OTHER DEALINGS IN THE SOFTWARE.
47 */
48
49 #include "expat_config.h"
50
51 #include <stddef.h>
52 #include <string.h> /* memcpy */
53 #include <stdbool.h>
54
55 #ifdef _WIN32
56 # include "winconfig.h"
57 #endif
58
59 #include "expat_external.h"
60 #include "internal.h"
61 #include "xmltok.h"
62 #include "nametab.h"
63
/* With XML_DTD defined, one extra entry (the ignore-section tokenizer) is
   appended to the first brace group of VTABLE1; otherwise the macro
   expands to nothing. */
#ifdef XML_DTD
# define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
# define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Comma-separated initializer fragment naming the PREFIX()-mangled
   tokenizer functions.  It opens the `enc` member initializer of the
   struct normal_encoding tables defined further down in this file.
   PREFIX must be defined at the point where the fragment is expanded. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-mangled UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79
/* Test the naming bit of the UCS-2 character whose high byte is `hi` and
   whose low byte is `lo`.  pages[hi] selects a group of eight namingBitmap
   entries, the top 3 bits of `lo` select the entry within the group and
   the bottom 5 bits of `lo` select the bit inside that entry.
   Evaluates to non-zero when the bit is set.
   Every parameter is parenthesized so that expression arguments (for
   example `table + 1`) expand with the intended precedence. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
82
/* A 2-byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 bits of the first byte and the bottom 6 bits of the second.
   The top 3 of those bits index into pages, the next 3 bits are added to
   the (shifted) page value and the low 5 bits generate the bit mask.
   `byte` is subscripted directly; callers in this file pass a pointer to
   unsigned char.  Evaluates to non-zero when the naming bit is set.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))
91
/* A 3-byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the three bytes.  The top 8 bits index
   into pages, the next 3 bits are added to the (shifted) page value and
   the low 5 bits generate the bit mask.
   `byte` is subscripted directly; callers in this file pass a pointer to
   unsigned char.  Evaluates to non-zero when the naming bit is set.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
103
104 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
105 of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
106 with the additional restriction of not allowing the Unicode
107 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
108 Implementation details:
109 (A & 0x80) == 0 means A < 0x80
110 and
111 (A & 0xC0) == 0xC0 means A > 0xBF
112 */
113
/* UTF8_INVALIDn(p) evaluates to non-zero when the n-byte sequence that
   starts at p is not acceptable UTF-8.  p must point to unsigned char
   (the comparisons against 0xC2, 0xEF, ... rely on unsigned byte values).
   The parameter is parenthesized at every use, including the dereference
   of the lead byte, so that pointer expressions such as `buf + 2` are
   handled correctly. */

/* 2 bytes: lead byte below 0xC2 is rejected; the second byte must be a
   continuation byte (0x80..0xBF). */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3 bytes: the third byte must be a continuation byte, with EF,BF,BE and
   EF,BF,BF additionally rejected; the second byte is restricted to
   0xA0..0xBF after E0, to 0x80..0x9F after ED, and to 0x80..0xBF
   otherwise. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4 bytes: the third and fourth bytes must be continuation bytes; the
   second byte is restricted to 0x90..0xBF after F0, to 0x80..0x8F after
   F4, and to 0x80..0xBF otherwise. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
133
134 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)135 isNever(const ENCODING *enc, const char *p) {
136 UNUSED_P(enc);
137 UNUSED_P(p);
138 return 0;
139 }
140
141 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)142 utf8_isName2(const ENCODING *enc, const char *p) {
143 UNUSED_P(enc);
144 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
145 }
146
147 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)148 utf8_isName3(const ENCODING *enc, const char *p) {
149 UNUSED_P(enc);
150 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
151 }
152
153 #define utf8_isName4 isNever
154
155 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)156 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
157 UNUSED_P(enc);
158 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
159 }
160
161 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)162 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
163 UNUSED_P(enc);
164 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
165 }
166
167 #define utf8_isNmstrt4 isNever
168
169 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)170 utf8_isInvalid2(const ENCODING *enc, const char *p) {
171 UNUSED_P(enc);
172 return UTF8_INVALID2((const unsigned char *)p);
173 }
174
175 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)176 utf8_isInvalid3(const ENCODING *enc, const char *p) {
177 UNUSED_P(enc);
178 return UTF8_INVALID3((const unsigned char *)p);
179 }
180
181 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)182 utf8_isInvalid4(const ENCODING *enc, const char *p) {
183 UNUSED_P(enc);
184 return UTF8_INVALID4((const unsigned char *)p);
185 }
186
/* An ENCODING extended with a per-byte classification table and a set of
   per-encoding predicate hooks.  `enc` is the first member, which is what
   allows a `const ENCODING *` to be cast to this struct (see
   AS_NORMAL_ENCODING). */
struct normal_encoding {
  ENCODING enc;
  /* Byte type for each possible byte value; read via SB_BYTE_TYPE. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Indirection hooks used only in the size-optimized build, where the
     BYTE_TYPE, IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII
     and CHAR_MATCHES macros dispatch through these pointers. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character tests for 2-, 3- and 4-byte sequences
     (called through IS_NAME_CHAR). */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name-start-character tests (called through IS_NMSTRT_CHAR). */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Malformed-sequence tests (called through IS_INVALID_CHAR).  Tables
     initialized with NULL_VTABLE leave all nine hooks NULL. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
207
/* View an ENCODING pointer as the struct normal_encoding that embeds it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* STANDARD_VTABLE(E): initializers for the XML_MIN_SIZE-only hooks of
   struct normal_encoding, built from the function-name prefix E.  Note
   the trailing comma: it is meant to be followed directly by
   NORMAL_VTABLE(...) or NULL_VTABLE.  Expands to nothing when
   XML_MIN_SIZE is not defined. */
#ifdef XML_MIN_SIZE

# define STANDARD_VTABLE(E)                                                    \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

# define STANDARD_VTABLE(E) /* as nothing */

#endif

/* NORMAL_VTABLE(E): initializers for the nine isName/isNmstrt/isInvalid
   hooks, built from the function-name prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* NULL_VTABLE: the same nine hooks, all left NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
229
230 static int FASTCALL checkCharRefNumber(int result);
231
232 #include "xmltok_impl.h"
233 #include "ascii.h"
234
/* In the size-optimized build the single-byte tables use isNever (which
   always returns 0) for the isNameMin / isNmstrtMin hooks. */
#ifdef XML_MIN_SIZE
# define sb_isNameMin isNever
# define sb_isNmstrtMin isNever
#endif

/* MINBPC: read from the encoding in the size-optimized build, otherwise
   the constant 1 for this instantiation. */
#ifdef XML_MIN_SIZE
# define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
# define MINBPC(enc) 1
#endif

/* Look up the byte type of the byte at p in the encoding's type[] table. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
249
/* BYTE_TYPE: in the size-optimized build the lookup is dispatched through
   the encoding's byteType hook (sb_byteType performs the plain table
   lookup); otherwise it expands directly to SB_BYTE_TYPE. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
# define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
# define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

/* BYTE_TO_ASCII: in the size-optimized build dispatched through the
   byteToAscii hook (sb_byteToAscii returns the byte at p unchanged);
   otherwise simply *p. */
#ifdef XML_MIN_SIZE
# define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
# define BYTE_TO_ASCII(enc, p) (*(p))
#endif
270
/* Multi-byte character-class tests: dispatch to the isNameN / isNmstrtN
   hook of the encoding, where n is the sequence length (2, 3 or 4). */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* IS_INVALID_CHAR: the size-optimized variant first checks that the hook
   is non-NULL (tables built with NULL_VTABLE have none); the other
   variant calls the hook unconditionally. */
#ifdef XML_MIN_SIZE
# define IS_INVALID_CHAR(enc, p, n)                                            \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
# define IS_INVALID_CHAR(enc, p, n)                                            \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Tests on a minimum-width character: dispatched through the hooks in the
   size-optimized build, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
# define IS_NAME_CHAR_MINBPC(enc, p)                                           \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
# define IS_NMSTRT_CHAR_MINBPC(enc, p)                                         \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
# define IS_NAME_CHAR_MINBPC(enc, p) (0)
# define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

/* CHAR_MATCHES: compare the character at p with the ASCII character c.
   sb_charMatches is the single-byte hook used in the size-optimized
   build; it compares the byte at p with c directly. */
#ifdef XML_MIN_SIZE
# define CHAR_MATCHES(enc, p, c)                                               \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
# define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
304
/* Instantiate xmltok_impl.c with the macro set defined above and with
   PREFIX() producing normal_-prefixed names (e.g. normal_updatePosition,
   used later in this file).  The per-instantiation macros are undefined
   afterwards so they can be redefined for the next instantiation; PREFIX
   itself stays defined. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
319
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
326
/* Move *fromLimRef backwards (never before `from`) so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The buffer is scanned from the end.  Continuation (or otherwise
   unclassified) bytes are stepped over and counted.  When a lead byte is
   met, the number of bytes seen after it decides the outcome: if the
   sequence it announces is complete, the limit is placed right after that
   sequence and the scan stops; if not, the lead byte is dropped as well
   and the scan continues.  A plain 7-bit byte stops the scan at once. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t trailing = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen; /* length announced by a lead byte, 0 if not a lead */

    if ((last & 0x80u) == 0x00u) /* 1-byte character, matching 0b0xxxxxxx */
      break;

    if ((last & 0xf8u) == 0xf0u) /* lead by 0b11110xxx byte */
      seqLen = 4;
    else if ((last & 0xf0u) == 0xe0u) /* lead by 0b1110xxxx byte */
      seqLen = 3;
    else if ((last & 0xe0u) == 0xc0u) /* lead by 0b110xxxxx byte */
      seqLen = 2;
    else
      seqLen = 0;

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* Complete character: keep exactly seqLen bytes from the lead. */
        end += seqLen - 1;
        break;
      }
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
365
366 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)367 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
368 char **toP, const char *toLim) {
369 bool input_incomplete = false;
370 bool output_exhausted = false;
371
372 /* Avoid copying partial characters (due to limited space). */
373 const ptrdiff_t bytesAvailable = fromLim - *fromP;
374 const ptrdiff_t bytesStorable = toLim - *toP;
375 UNUSED_P(enc);
376 if (bytesAvailable > bytesStorable) {
377 fromLim = *fromP + bytesStorable;
378 output_exhausted = true;
379 }
380
381 /* Avoid copying partial characters (from incomplete input). */
382 {
383 const char *const fromLimBefore = fromLim;
384 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
385 if (fromLim < fromLimBefore) {
386 input_incomplete = true;
387 }
388 }
389
390 {
391 const ptrdiff_t bytesToCopy = fromLim - *fromP;
392 memcpy(*toP, *fromP, bytesToCopy);
393 *fromP += bytesToCopy;
394 *toP += bytesToCopy;
395 }
396
397 if (output_exhausted) /* needs to go first */
398 return XML_CONVERT_OUTPUT_EXHAUSTED;
399 else if (input_incomplete)
400 return XML_CONVERT_INPUT_INCOMPLETE;
401 else
402 return XML_CONVERT_COMPLETED;
403 }
404
405 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)406 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
407 unsigned short **toP, const unsigned short *toLim) {
408 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
409 unsigned short *to = *toP;
410 const char *from = *fromP;
411 while (from < fromLim && to < toLim) {
412 switch (SB_BYTE_TYPE(enc, from)) {
413 case BT_LEAD2:
414 if (fromLim - from < 2) {
415 res = XML_CONVERT_INPUT_INCOMPLETE;
416 goto after;
417 }
418 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
419 from += 2;
420 break;
421 case BT_LEAD3:
422 if (fromLim - from < 3) {
423 res = XML_CONVERT_INPUT_INCOMPLETE;
424 goto after;
425 }
426 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
427 | (from[2] & 0x3f));
428 from += 3;
429 break;
430 case BT_LEAD4: {
431 unsigned long n;
432 if (toLim - to < 2) {
433 res = XML_CONVERT_OUTPUT_EXHAUSTED;
434 goto after;
435 }
436 if (fromLim - from < 4) {
437 res = XML_CONVERT_INPUT_INCOMPLETE;
438 goto after;
439 }
440 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
441 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
442 n -= 0x10000;
443 to[0] = (unsigned short)((n >> 10) | 0xD800);
444 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
445 to += 2;
446 from += 4;
447 } break;
448 default:
449 *to++ = *from++;
450 break;
451 }
452 }
453 if (from < fromLim)
454 res = XML_CONVERT_OUTPUT_EXHAUSTED;
455 after:
456 *fromP = from;
457 *toP = to;
458 return res;
459 }
460
/* UTF-8 encoding tables.  Each one pairs the normal_ tokenizer functions
   and the UTF-8 converters with a 256-entry byte type table assembled
   from included fragments, followed by the sb_ and utf8_ hooks.

   The *_ns variants (XML_NS only) include the ASCII fragment as is; the
   others define BT_COLON as BT_NMSTRT while including it, so whatever the
   fragment classifies as BT_COLON becomes BT_NMSTRT.  The internal_*
   variants use iasciitab.h instead of asciitab.h.
   NOTE(review): the three integers after the converters are presumably
   minBytesPerChar, isUtf8 and isUtf16 of ENCODING -- confirm in xmltok.h. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
# include "asciitab.h"
# include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
# include "iasciitab.h"
# include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
502
503 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)504 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
505 char **toP, const char *toLim) {
506 UNUSED_P(enc);
507 for (;;) {
508 unsigned char c;
509 if (*fromP == fromLim)
510 return XML_CONVERT_COMPLETED;
511 c = (unsigned char)**fromP;
512 if (c & 0x80) {
513 if (toLim - *toP < 2)
514 return XML_CONVERT_OUTPUT_EXHAUSTED;
515 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
516 *(*toP)++ = (char)((c & 0x3f) | 0x80);
517 (*fromP)++;
518 } else {
519 if (*toP == toLim)
520 return XML_CONVERT_OUTPUT_EXHAUSTED;
521 *(*toP)++ = *(*fromP)++;
522 }
523 }
524 }
525
526 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)527 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
528 unsigned short **toP, const unsigned short *toLim) {
529 UNUSED_P(enc);
530 while (*fromP < fromLim && *toP < toLim)
531 *(*toP)++ = (unsigned char)*(*fromP)++;
532
533 if ((*toP == toLim) && (*fromP < fromLim))
534 return XML_CONVERT_OUTPUT_EXHAUSTED;
535 else
536 return XML_CONVERT_COMPLETED;
537 }
538
/* Latin-1 encoding tables: normal_ tokenizer functions, the Latin-1
   converters, a byte type table built from asciitab.h + latin1tab.h, and
   no isName/isNmstrt/isInvalid hooks (NULL_VTABLE).  The _ns variant
   (XML_NS only) includes asciitab.h as is; the other one defines BT_COLON
   as BT_NMSTRT while including it. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
# include "asciitab.h"
# include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
560
561 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)562 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
563 char **toP, const char *toLim) {
564 UNUSED_P(enc);
565 while (*fromP < fromLim && *toP < toLim)
566 *(*toP)++ = *(*fromP)++;
567
568 if ((*toP == toLim) && (*fromP < fromLim))
569 return XML_CONVERT_OUTPUT_EXHAUSTED;
570 else
571 return XML_CONVERT_COMPLETED;
572 }
573
/* ASCII encoding tables: normal_ tokenizer functions, ascii_toUtf8 and
   latin1_toUtf16 as converters, and a byte type table built from
   asciitab.h only.  The remaining table entries are left to
   zero-initialization, which equals BT_NONXML as noted below.  The _ns
   variant (XML_NS only) includes asciitab.h as is; the other one defines
   BT_COLON as BT_NMSTRT while including it. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
# include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
595
596 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)597 unicode_byte_type(char hi, char lo) {
598 switch ((unsigned char)hi) {
599 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
600 case 0xD8:
601 case 0xD9:
602 case 0xDA:
603 case 0xDB:
604 return BT_LEAD4;
605 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
606 case 0xDC:
607 case 0xDD:
608 case 0xDE:
609 case 0xDF:
610 return BT_TRAIL;
611 case 0xFF:
612 switch ((unsigned char)lo) {
613 case 0xFF: /* noncharacter-FFFF */
614 case 0xFE: /* noncharacter-FFFE */
615 return BT_NONXML;
616 }
617 break;
618 }
619 return BT_NONASCII;
620 }
621
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   UTF-16 input in [*fromP, fromLim) to UTF-8 written to [*toP, toLim).
   The byte order is supplied by the GET_LO / GET_HI macros in effect
   where this macro is expanded.

   - The input length is first rounded down to an even number of bytes.
   - Units with high byte 0 and low byte < 0x80 produce 1 byte; other units
     with high byte 0..7 produce 2 bytes; units with high byte 0xD8..0xDB
     are combined with the following unit into 4 bytes; everything else
     produces 3 bytes.
   - Before each character is written the free output space is checked; if
     it is too small, *fromP is set to the unconverted unit and
     XML_CONVERT_OUTPUT_EXHAUSTED is returned.
   - If a unit with high byte 0xD8..0xDB is not followed by another unit
     within fromLim, *fromP is set to it and XML_CONVERT_INPUT_INCOMPLETE
     is returned.
   - *toP is advanced as bytes are written.
   NOTE(review): after the loop `from < fromLim` looks unreachable because
   fromLim was already rounded down -- confirm. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
698
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   16-bit units from the byte stream [*fromP, fromLim) to [*toP, toLim),
   assembling each unit as (GET_HI << 8) | GET_LO with the GET_LO / GET_HI
   macros in effect where this macro is expanded.

   - The input length is first rounded down to an even number of bytes.
   - If the input is longer than the output can hold and the high byte of
     the last input unit lies in 0xD8..0xDF, that unit is excluded and the
     result becomes XML_CONVERT_INPUT_INCOMPLETE.
   - *fromP and *toP are advanced as units are copied.
   - XML_CONVERT_OUTPUT_EXHAUSTED is returned when the output is full while
     input (up to the possibly reduced limit) remains. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
719
/* Little-endian instantiation: the low byte of each unit comes first.
   Defines little2_toUtf8 and little2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef GET_LO
#undef GET_HI

/* Big-endian instantiation: the high byte of each unit comes first.
   Defines big2_toUtf8 and big2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef GET_LO
#undef GET_HI
737
/* Character helpers for little-endian 2-byte units, where p[0] is the low
   byte and p[1] is the high byte.  The parameter p is parenthesized at
   every use so that pointer expressions can be passed safely.

   LITTLE2_BYTE_TYPE            table lookup on the low byte when the high
                                byte is 0, otherwise unicode_byte_type()
   LITTLE2_BYTE_TO_ASCII        the low byte when the high byte is 0,
                                otherwise -1
   LITTLE2_CHAR_MATCHES         true when the unit equals the character c
   LITTLE2_IS_NAME_CHAR_MINBPC  naming-bitmap test against namePages
   LITTLE2_IS_NMSTRT_CHAR_MINBPC naming-bitmap test against nmstrtPages */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
746
/* Little-endian 2-byte tokenizer.
   With XML_MIN_SIZE no second copy of xmltok_impl.c is compiled; instead
   the LITTLE2_* helpers are wrapped in functions that fill the
   XML_MIN_SIZE hooks of struct normal_encoding, and VTABLE is redefined to
   use the little2_ converters.
   Without XML_MIN_SIZE, xmltok_impl.c is included again with PREFIX()
   producing little2_ names and the character macros mapped to the
   LITTLE2_* helpers. */
#ifdef XML_MIN_SIZE

/* byteType hook: see LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* byteToAscii hook: see LITTLE2_BYTE_TO_ASCII. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* charMatches hook: see LITTLE2_CHAR_MATCHES. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* isNameMin hook: see LITTLE2_IS_NAME_CHAR_MINBPC. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* isNmstrtMin hook: see LITTLE2_IS_NMSTRT_CHAR_MINBPC. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

# undef VTABLE
# define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

# undef PREFIX
# define PREFIX(ident) little2_##ident
# define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
# define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
# define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
# define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
# define IS_NAME_CHAR(enc, p, n) 0
# define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
# define IS_NMSTRT_CHAR(enc, p, n) (0)
# define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

# define XML_TOK_IMPL_C
# include "xmltok_impl.c"
# undef XML_TOK_IMPL_C

# undef MINBPC
# undef BYTE_TYPE
# undef BYTE_TO_ASCII
# undef CHAR_MATCHES
# undef IS_NAME_CHAR
# undef IS_NAME_CHAR_MINBPC
# undef IS_NMSTRT_CHAR
# undef IS_NMSTRT_CHAR_MINBPC
# undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
810
/* Little-endian UTF-16 encoding tables.  The byte type table
   (asciitab.h / iasciitab.h + latin1tab.h) is consulted for units whose
   high byte is 0 (see LITTLE2_BYTE_TYPE).  The last integer of the `enc`
   initializer is 1 when BYTEORDER is 1234 and 0 otherwise; the internal_*
   tables, compiled unless BYTEORDER is 4321, always use 1.
   NOTE(review): that integer is presumably the isUtf16 flag of ENCODING
   -- confirm in xmltok.h.
   The *_ns variants (XML_NS only) include the ASCII fragment as is; the
   others define BT_COLON as BT_NMSTRT while including it. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
# if BYTEORDER == 1234
        1
# else
        0
# endif
       },
       {
# include "asciitab.h"
# include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#if BYTEORDER != 4321

# ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
# include "iasciitab.h"
# include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

# endif

static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
# define BT_COLON BT_NMSTRT
# include "iasciitab.h"
# undef BT_COLON
# include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
870
/* Character helpers for big-endian 2-byte units, where p[0] is the high
   byte and p[1] is the low byte.  The parameter p is parenthesized at
   every use so that pointer expressions can be passed safely.

   BIG2_BYTE_TYPE            table lookup on the low byte when the high
                             byte is 0, otherwise unicode_byte_type()
   BIG2_BYTE_TO_ASCII        the low byte when the high byte is 0,
                             otherwise -1
   BIG2_CHAR_MATCHES         true when the unit equals the character c
   BIG2_IS_NAME_CHAR_MINBPC  naming-bitmap test against namePages
   BIG2_IS_NMSTRT_CHAR_MINBPC naming-bitmap test against nmstrtPages */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
879
/* Big-endian 2-byte tokenizer.
   With XML_MIN_SIZE no further copy of xmltok_impl.c is compiled; instead
   the BIG2_* helpers are wrapped in functions that fill the XML_MIN_SIZE
   hooks of struct normal_encoding, and VTABLE is redefined to use the
   big2_ converters.
   Without XML_MIN_SIZE, xmltok_impl.c is included again with PREFIX()
   producing big2_ names and the character macros mapped to the BIG2_*
   helpers. */
#ifdef XML_MIN_SIZE

/* byteType hook: see BIG2_BYTE_TYPE. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* byteToAscii hook: see BIG2_BYTE_TO_ASCII. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* charMatches hook: see BIG2_CHAR_MATCHES. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* isNameMin hook: see BIG2_IS_NAME_CHAR_MINBPC. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* isNmstrtMin hook: see BIG2_IS_NMSTRT_CHAR_MINBPC. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

# undef VTABLE
# define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

# undef PREFIX
# define PREFIX(ident) big2_##ident
# define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
# define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
# define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
# define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
# define IS_NAME_CHAR(enc, p, n) 0
# define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
# define IS_NMSTRT_CHAR(enc, p, n) (0)
# define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

# define XML_TOK_IMPL_C
# include "xmltok_impl.c"
# undef XML_TOK_IMPL_C

# undef MINBPC
# undef BYTE_TYPE
# undef BYTE_TO_ASCII
# undef CHAR_MATCHES
# undef IS_NAME_CHAR
# undef IS_NAME_CHAR_MINBPC
# undef IS_NMSTRT_CHAR
# undef IS_NMSTRT_CHAR_MINBPC
# undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
943
/* Big-endian UTF-16 encoding tables.  The byte type table
   (asciitab.h / iasciitab.h + latin1tab.h) is consulted for units whose
   high byte is 0 (see BIG2_BYTE_TYPE).  The last integer of the `enc`
   initializer is 1 when BYTEORDER is 4321 and 0 otherwise; the internal_*
   tables, compiled unless BYTEORDER is 1234, always use 1.
   NOTE(review): that integer is presumably the isUtf16 flag of ENCODING
   -- confirm in xmltok.h.
   The *_ns variants (XML_NS only) include the ASCII fragment as is; the
   others define BT_COLON as BT_NMSTRT while including it. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
# if BYTEORDER == 4321
        1
# else
        0
# endif
       },
       {
# include "asciitab.h"
# include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#if BYTEORDER != 1234

# ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
# include "iasciitab.h"
# include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

# endif

static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
# define BT_COLON BT_NMSTRT
# include "iasciitab.h"
# undef BT_COLON
# include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
1003
1004 #undef PREFIX
1005
1006 static int FASTCALL
streqci(const char * s1,const char * s2)1007 streqci(const char *s1, const char *s2) {
1008 for (;;) {
1009 char c1 = *s1++;
1010 char c2 = *s2++;
1011 if (ASCII_a <= c1 && c1 <= ASCII_z)
1012 c1 += ASCII_A - ASCII_a;
1013 if (ASCII_a <= c2 && c2 <= ASCII_z)
1014 /* The following line will never get executed. streqci() is
1015 * only called from two places, both of which guarantee to put
1016 * upper-case strings into s2.
1017 */
1018 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019 if (c1 != c2)
1020 return 0;
1021 if (! c1)
1022 break;
1023 }
1024 return 1;
1025 }
1026
1027 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)1028 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1029 POSITION *pos) {
1030 UNUSED_P(enc);
1031 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1032 }
1033
1034 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1035 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036 char buf[1];
1037 char *p = buf;
1038 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039 if (p == buf)
1040 return -1;
1041 else
1042 return buf[0];
1043 }
1044
1045 static int FASTCALL
isSpace(int c)1046 isSpace(int c) {
1047 switch (c) {
1048 case 0x20:
1049 case 0xD:
1050 case 0xA:
1051 case 0x9:
1052 return 1;
1053 }
1054 return 0;
1055 }
1056
1057 /* Return 1 if there's just optional white space or there's an S
1058 followed by name=val.
1059 */
1060 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1061 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062 const char **namePtr, const char **nameEndPtr,
1063 const char **valPtr, const char **nextTokPtr) {
1064 int c;
1065 char open;
1066 if (ptr == end) {
1067 *namePtr = NULL;
1068 return 1;
1069 }
1070 if (! isSpace(toAscii(enc, ptr, end))) {
1071 *nextTokPtr = ptr;
1072 return 0;
1073 }
1074 do {
1075 ptr += enc->minBytesPerChar;
1076 } while (isSpace(toAscii(enc, ptr, end)));
1077 if (ptr == end) {
1078 *namePtr = NULL;
1079 return 1;
1080 }
1081 *namePtr = ptr;
1082 for (;;) {
1083 c = toAscii(enc, ptr, end);
1084 if (c == -1) {
1085 *nextTokPtr = ptr;
1086 return 0;
1087 }
1088 if (c == ASCII_EQUALS) {
1089 *nameEndPtr = ptr;
1090 break;
1091 }
1092 if (isSpace(c)) {
1093 *nameEndPtr = ptr;
1094 do {
1095 ptr += enc->minBytesPerChar;
1096 } while (isSpace(c = toAscii(enc, ptr, end)));
1097 if (c != ASCII_EQUALS) {
1098 *nextTokPtr = ptr;
1099 return 0;
1100 }
1101 break;
1102 }
1103 ptr += enc->minBytesPerChar;
1104 }
1105 if (ptr == *namePtr) {
1106 *nextTokPtr = ptr;
1107 return 0;
1108 }
1109 ptr += enc->minBytesPerChar;
1110 c = toAscii(enc, ptr, end);
1111 while (isSpace(c)) {
1112 ptr += enc->minBytesPerChar;
1113 c = toAscii(enc, ptr, end);
1114 }
1115 if (c != ASCII_QUOT && c != ASCII_APOS) {
1116 *nextTokPtr = ptr;
1117 return 0;
1118 }
1119 open = (char)c;
1120 ptr += enc->minBytesPerChar;
1121 *valPtr = ptr;
1122 for (;; ptr += enc->minBytesPerChar) {
1123 c = toAscii(enc, ptr, end);
1124 if (c == open)
1125 break;
1126 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129 *nextTokPtr = ptr;
1130 return 0;
1131 }
1132 }
1133 *nextTokPtr = ptr + enc->minBytesPerChar;
1134 return 1;
1135 }
1136
/* Keyword strings compared against pseudo-attribute names and values in
   doParseXmlDecl().  They are spelled with the ASCII_* constants rather
   than string literals.
   NOTE(review): presumably so the byte values are ASCII regardless of the
   compiler's execution character set -- confirm in ascii.h. */
1137 static const char KW_version[]
1138 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1139
1140 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1141 ASCII_i, ASCII_n, ASCII_g, '\0'};
1142
1143 static const char KW_standalone[]
1144 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1145 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1146
/* The only two values accepted for the "standalone" pseudo-attribute. */
1147 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1148
1149 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150
1151 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1152 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1153 const char *),
1154 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1155 const char *end, const char **badPtr, const char **versionPtr,
1156 const char **versionEndPtr, const char **encodingName,
1157 const ENCODING **encoding, int *standalone) {
1158 const char *val = NULL;
1159 const char *name = NULL;
1160 const char *nameEnd = NULL;
1161 ptr += 5 * enc->minBytesPerChar;
1162 end -= 2 * enc->minBytesPerChar;
1163 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1164 || ! name) {
1165 *badPtr = ptr;
1166 return 0;
1167 }
1168 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1169 if (! isGeneralTextEntity) {
1170 *badPtr = name;
1171 return 0;
1172 }
1173 } else {
1174 if (versionPtr)
1175 *versionPtr = val;
1176 if (versionEndPtr)
1177 *versionEndPtr = ptr;
1178 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1179 *badPtr = ptr;
1180 return 0;
1181 }
1182 if (! name) {
1183 if (isGeneralTextEntity) {
1184 /* a TextDecl must have an EncodingDecl */
1185 *badPtr = ptr;
1186 return 0;
1187 }
1188 return 1;
1189 }
1190 }
1191 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1192 int c = toAscii(enc, val, end);
1193 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1194 *badPtr = val;
1195 return 0;
1196 }
1197 if (encodingName)
1198 *encodingName = val;
1199 if (encoding)
1200 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1201 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1202 *badPtr = ptr;
1203 return 0;
1204 }
1205 if (! name)
1206 return 1;
1207 }
1208 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1209 || isGeneralTextEntity) {
1210 *badPtr = name;
1211 return 0;
1212 }
1213 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1214 if (standalone)
1215 *standalone = 1;
1216 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1217 if (standalone)
1218 *standalone = 0;
1219 } else {
1220 *badPtr = val;
1221 return 0;
1222 }
1223 while (isSpace(toAscii(enc, ptr, end)))
1224 ptr += enc->minBytesPerChar;
1225 if (ptr != end) {
1226 *badPtr = ptr;
1227 return 0;
1228 }
1229 return 1;
1230 }
1231
1232 static int FASTCALL
checkCharRefNumber(int result)1233 checkCharRefNumber(int result) {
1234 switch (result >> 8) {
1235 case 0xD8:
1236 case 0xD9:
1237 case 0xDA:
1238 case 0xDB:
1239 case 0xDC:
1240 case 0xDD:
1241 case 0xDE:
1242 case 0xDF:
1243 return -1;
1244 case 0:
1245 if (latin1_encoding.type[result] == BT_NONXML)
1246 return -1;
1247 break;
1248 case 0xFF:
1249 if (result == 0xFFFE || result == 0xFFFF)
1250 return -1;
1251 break;
1252 }
1253 return result;
1254 }
1255
1256 int FASTCALL
XmlUtf8Encode(int c,char * buf)1257 XmlUtf8Encode(int c, char *buf) {
1258 enum {
1259 /* minN is minimum legal resulting value for N byte sequence */
1260 min2 = 0x80,
1261 min3 = 0x800,
1262 min4 = 0x10000
1263 };
1264
1265 if (c < 0)
1266 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267 if (c < min2) {
1268 buf[0] = (char)(c | UTF8_cval1);
1269 return 1;
1270 }
1271 if (c < min3) {
1272 buf[0] = (char)((c >> 6) | UTF8_cval2);
1273 buf[1] = (char)((c & 0x3f) | 0x80);
1274 return 2;
1275 }
1276 if (c < min4) {
1277 buf[0] = (char)((c >> 12) | UTF8_cval3);
1278 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279 buf[2] = (char)((c & 0x3f) | 0x80);
1280 return 3;
1281 }
1282 if (c < 0x110000) {
1283 buf[0] = (char)((c >> 18) | UTF8_cval4);
1284 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286 buf[3] = (char)((c & 0x3f) | 0x80);
1287 return 4;
1288 }
1289 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290 }
1291
1292 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1293 XmlUtf16Encode(int charNum, unsigned short *buf) {
1294 if (charNum < 0)
1295 return 0;
1296 if (charNum < 0x10000) {
1297 buf[0] = (unsigned short)charNum;
1298 return 1;
1299 }
1300 if (charNum < 0x110000) {
1301 charNum -= 0x10000;
1302 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304 return 2;
1305 }
1306 return 0;
1307 }
1308
/* State for an encoding described by a caller-supplied 256-entry table
   (see XmlInitUnknownEncoding()). */
1309 struct unknown_encoding {
/* Starts out as a copy of latin1_encoding; must stay the first member
   because ENCODING pointers are cast to this struct. */
1310 struct normal_encoding normal;
/* Callback that decodes a multi-byte sequence to a code point. */
1311 CONVERTER convert;
/* Opaque pointer handed to convert as its first argument. */
1312 void *userData;
/* Per first byte: the UTF-16 code unit to emit, or 0 to call convert. */
1313 unsigned short utf16[256];
/* Per first byte: [0] is the UTF-8 length (0 means call convert), the
   bytes themselves follow from [1]. */
1314 char utf8[256][4];
1315 };
1316
/* View an ENCODING pointer as the unknown_encoding it is embedded in. */
1317 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318
1319 int
XmlSizeOfUnknownEncoding(void)1320 XmlSizeOfUnknownEncoding(void) {
1321 return sizeof(struct unknown_encoding);
1322 }
1323
1324 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1325 unknown_isName(const ENCODING *enc, const char *p) {
1326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327 int c = uenc->convert(uenc->userData, p);
1328 if (c & ~0xFFFF)
1329 return 0;
1330 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331 }
1332
1333 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1334 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336 int c = uenc->convert(uenc->userData, p);
1337 if (c & ~0xFFFF)
1338 return 0;
1339 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340 }
1341
1342 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1343 unknown_isInvalid(const ENCODING *enc, const char *p) {
1344 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345 int c = uenc->convert(uenc->userData, p);
1346 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347 }
1348
1349 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1350 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351 char **toP, const char *toLim) {
1352 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353 char buf[XML_UTF8_ENCODE_MAX];
1354 for (;;) {
1355 const char *utf8;
1356 int n;
1357 if (*fromP == fromLim)
1358 return XML_CONVERT_COMPLETED;
1359 utf8 = uenc->utf8[(unsigned char)**fromP];
1360 n = *utf8++;
1361 if (n == 0) {
1362 int c = uenc->convert(uenc->userData, *fromP);
1363 n = XmlUtf8Encode(c, buf);
1364 if (n > toLim - *toP)
1365 return XML_CONVERT_OUTPUT_EXHAUSTED;
1366 utf8 = buf;
1367 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368 - (BT_LEAD2 - 2));
1369 } else {
1370 if (n > toLim - *toP)
1371 return XML_CONVERT_OUTPUT_EXHAUSTED;
1372 (*fromP)++;
1373 }
1374 memcpy(*toP, utf8, n);
1375 *toP += n;
1376 }
1377 }
1378
1379 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1380 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381 unsigned short **toP, const unsigned short *toLim) {
1382 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383 while (*fromP < fromLim && *toP < toLim) {
1384 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385 if (c == 0) {
1386 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388 - (BT_LEAD2 - 2));
1389 } else
1390 (*fromP)++;
1391 *(*toP)++ = c;
1392 }
1393
1394 if ((*toP == toLim) && (*fromP < fromLim))
1395 return XML_CONVERT_OUTPUT_EXHAUSTED;
1396 else
1397 return XML_CONVERT_COMPLETED;
1398 }
1399
1400 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1401 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402 void *userData) {
1403 int i;
1404 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1405 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1406 for (i = 0; i < 128; i++)
1407 if (latin1_encoding.type[i] != BT_OTHER
1408 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1409 return 0;
1410 for (i = 0; i < 256; i++) {
1411 int c = table[i];
1412 if (c == -1) {
1413 e->normal.type[i] = BT_MALFORM;
1414 /* This shouldn't really get used. */
1415 e->utf16[i] = 0xFFFF;
1416 e->utf8[i][0] = 1;
1417 e->utf8[i][1] = 0;
1418 } else if (c < 0) {
1419 if (c < -4)
1420 return 0;
1421 /* Multi-byte sequences need a converter function */
1422 if (! convert)
1423 return 0;
1424 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1425 e->utf8[i][0] = 0;
1426 e->utf16[i] = 0;
1427 } else if (c < 0x80) {
1428 if (latin1_encoding.type[c] != BT_OTHER
1429 && latin1_encoding.type[c] != BT_NONXML && c != i)
1430 return 0;
1431 e->normal.type[i] = latin1_encoding.type[c];
1432 e->utf8[i][0] = 1;
1433 e->utf8[i][1] = (char)c;
1434 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1435 } else if (checkCharRefNumber(c) < 0) {
1436 e->normal.type[i] = BT_NONXML;
1437 /* This shouldn't really get used. */
1438 e->utf16[i] = 0xFFFF;
1439 e->utf8[i][0] = 1;
1440 e->utf8[i][1] = 0;
1441 } else {
1442 if (c > 0xFFFF)
1443 return 0;
1444 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1445 e->normal.type[i] = BT_NMSTRT;
1446 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1447 e->normal.type[i] = BT_NAME;
1448 else
1449 e->normal.type[i] = BT_OTHER;
1450 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1451 e->utf16[i] = (unsigned short)c;
1452 }
1453 }
1454 e->userData = userData;
1455 e->convert = convert;
1456 if (convert) {
1457 e->normal.isName2 = unknown_isName;
1458 e->normal.isName3 = unknown_isName;
1459 e->normal.isName4 = unknown_isName;
1460 e->normal.isNmstrt2 = unknown_isNmstrt;
1461 e->normal.isNmstrt3 = unknown_isNmstrt;
1462 e->normal.isNmstrt4 = unknown_isNmstrt;
1463 e->normal.isInvalid2 = unknown_isInvalid;
1464 e->normal.isInvalid3 = unknown_isInvalid;
1465 e->normal.isInvalid4 = unknown_isInvalid;
1466 }
1467 e->normal.enc.utf8Convert = unknown_toUtf8;
1468 e->normal.enc.utf16Convert = unknown_toUtf16;
1469 return &(e->normal.enc);
1470 }
1471
1472 /* If this enumeration is changed, getEncodingIndex and encodings
1473 must also be changed. */
/* Indices of the built-in encodings.  Values from ISO_8859_1_ENC up to
   UTF_16LE_ENC double as positions in the encodingNames array of
   getEncodingIndex(), so the two lists must stay in the same order.
   getEncodingIndex() returns UNKNOWN_ENC for a name it does not recognise
   and NO_ENC when no name was given (NULL). */
1474 enum {
1475 UNKNOWN_ENC = -1,
1476 ISO_8859_1_ENC = 0,
1477 US_ASCII_ENC,
1478 UTF_8_ENC,
1479 UTF_16_ENC,
1480 UTF_16BE_ENC,
1481 UTF_16LE_ENC,
1482 /* must match encodingNames up to here */
1483 NO_ENC
1484 };
1485
/* Upper-case names of the built-in encodings, spelled with ASCII_*
   constants.  getEncodingIndex() matches against them case-insensitively
   via streqci(), which expects its second argument in upper case. */
1486 static const char KW_ISO_8859_1[]
1487 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
1488 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
1489 static const char KW_US_ASCII[]
1490 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1491 ASCII_C, ASCII_I, ASCII_I, '\0'};
1492 static const char KW_UTF_8[]
1493 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1494 static const char KW_UTF_16[]
1495 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1496 static const char KW_UTF_16BE[]
1497 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1498 ASCII_6, ASCII_B, ASCII_E, '\0'};
1499 static const char KW_UTF_16LE[]
1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1501 ASCII_6, ASCII_L, ASCII_E, '\0'};
1502
1503 static int FASTCALL
getEncodingIndex(const char * name)1504 getEncodingIndex(const char *name) {
1505 static const char *const encodingNames[] = {
1506 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507 };
1508 int i;
1509 if (name == NULL)
1510 return NO_ENC;
1511 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512 if (streqci(name, encodingNames[i]))
1513 return i;
1514 return UNKNOWN_ENC;
1515 }
1516
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

/* Read the stored encoding index back as an int. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store encoding index i, narrowed to char.  The argument is parenthesized
   so that the cast applies to the whole expression passed in, not just to
   its first operand (e.g. a conditional expression). */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523
1524 /* This is what detects the encoding. encodingTable maps from
1525 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1526 the external (protocol) specified encoding; state is
1527 XML_CONTENT_STATE if we're parsing an external text entity, and
1528 XML_PROLOG_STATE otherwise.
1529 */
1530
1531 static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1532 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1533 int state, const char *ptr, const char *end, const char **nextTokPtr) {
1534 const ENCODING **encPtr;
1535
1536 if (ptr >= end)
1537 return XML_TOK_NONE;
1538 encPtr = enc->encPtr;
1539 if (ptr + 1 == end) {
1540 /* only a single byte available for auto-detection */
1541 #ifndef XML_DTD /* FIXME */
1542 /* a well-formed document entity must have more than one byte */
1543 if (state != XML_CONTENT_STATE)
1544 return XML_TOK_PARTIAL;
1545 #endif
1546 /* so we're parsing an external text entity... */
1547 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1548 switch (INIT_ENC_INDEX(enc)) {
1549 case UTF_16_ENC:
1550 case UTF_16LE_ENC:
1551 case UTF_16BE_ENC:
1552 return XML_TOK_PARTIAL;
1553 }
1554 switch ((unsigned char)*ptr) {
1555 case 0xFE:
1556 case 0xFF:
1557 case 0xEF: /* possibly first byte of UTF-8 BOM */
1558 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1559 break;
1560 /* fall through */
1561 case 0x00:
1562 case 0x3C:
1563 return XML_TOK_PARTIAL;
1564 }
1565 } else {
1566 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1567 case 0xFEFF:
1568 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1569 break;
1570 *nextTokPtr = ptr + 2;
1571 *encPtr = encodingTable[UTF_16BE_ENC];
1572 return XML_TOK_BOM;
1573 /* 00 3C is handled in the default case */
1574 case 0x3C00:
1575 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1576 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1577 && state == XML_CONTENT_STATE)
1578 break;
1579 *encPtr = encodingTable[UTF_16LE_ENC];
1580 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1581 case 0xFFFE:
1582 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1583 break;
1584 *nextTokPtr = ptr + 2;
1585 *encPtr = encodingTable[UTF_16LE_ENC];
1586 return XML_TOK_BOM;
1587 case 0xEFBB:
1588 /* Maybe a UTF-8 BOM (EF BB BF) */
1589 /* If there's an explicitly specified (external) encoding
1590 of ISO-8859-1 or some flavour of UTF-16
1591 and this is an external text entity,
1592 don't look for the BOM,
1593 because it might be a legal data.
1594 */
1595 if (state == XML_CONTENT_STATE) {
1596 int e = INIT_ENC_INDEX(enc);
1597 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1598 || e == UTF_16_ENC)
1599 break;
1600 }
1601 if (ptr + 2 == end)
1602 return XML_TOK_PARTIAL;
1603 if ((unsigned char)ptr[2] == 0xBF) {
1604 *nextTokPtr = ptr + 3;
1605 *encPtr = encodingTable[UTF_8_ENC];
1606 return XML_TOK_BOM;
1607 }
1608 break;
1609 default:
1610 if (ptr[0] == '\0') {
1611 /* 0 isn't a legal data character. Furthermore a document
1612 entity can only start with ASCII characters. So the only
1613 way this can fail to be big-endian UTF-16 if it it's an
1614 external parsed general entity that's labelled as
1615 UTF-16LE.
1616 */
1617 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1618 break;
1619 *encPtr = encodingTable[UTF_16BE_ENC];
1620 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621 } else if (ptr[1] == '\0') {
1622 /* We could recover here in the case:
1623 - parsing an external entity
1624 - second byte is 0
1625 - no externally specified encoding
1626 - no encoding declaration
1627 by assuming UTF-16LE. But we don't, because this would mean when
1628 presented just with a single byte, we couldn't reliably determine
1629 whether we needed further bytes.
1630 */
1631 if (state == XML_CONTENT_STATE)
1632 break;
1633 *encPtr = encodingTable[UTF_16LE_ENC];
1634 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1635 }
1636 break;
1637 }
1638 }
1639 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1640 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1641 }
1642
/* First instantiation of xmltok_ns.c: NS(x) and ns(x) both expand to x
   unchanged, so every name the template builds through them is emitted
   without a suffix.  XML_TOK_NS_C is defined only for the duration of the
   include.
   NOTE(review): presumably xmltok_ns.c checks XML_TOK_NS_C to refuse
   standalone compilation -- confirm in that file. */
1643 #define NS(x) x
1644 #define ns(x) x
1645 #define XML_TOK_NS_C
1646 #include "xmltok_ns.c"
1647 #undef XML_TOK_NS_C
1648 #undef NS
1649 #undef ns
1650
1651 #ifdef XML_NS
1652
/* Second instantiation of xmltok_ns.c, only when XML_NS is defined: this
   time NS(x) appends "NS" and ns(x) appends "_ns" to the name, producing
   the namespace-aware counterparts (compare big2_encoding_ns above). */
1653 # define NS(x) x##NS
1654 # define ns(x) x##_ns
1655
1656 # define XML_TOK_NS_C
1657 # include "xmltok_ns.c"
1658 # undef XML_TOK_NS_C
1659
1660 # undef NS
1661 # undef ns
1662
1663 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1664 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665 void *userData) {
1666 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667 if (enc)
1668 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669 return enc;
1670 }
1671
1672 #endif /* XML_NS */
1673