1 /*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net>
13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14 Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15 Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com>
17 Copyright (c) 2016 Don Lewis <truckman@apache.org>
18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net>
20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com>
21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com>
22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
23 Copyright (c) 2021 Donghee Na <donghee.na@python.org>
24 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
25 Copyright (c) 2022 Sean McBride <sean@rogue-research.com>
26 Copyright (c) 2023 Hanno Böck <hanno@gentoo.org>
27 Copyright (c) 2025 Alfonso Gregory <gfunni234@gmail.com>
28 Copyright (c) 2026 Nick Begg <nick@stunttruck.net>
29 Licensed under the MIT license:
30
31 Permission is hereby granted, free of charge, to any person obtaining
32 a copy of this software and associated documentation files (the
33 "Software"), to deal in the Software without restriction, including
34 without limitation the rights to use, copy, modify, merge, publish,
35 distribute, sublicense, and/or sell copies of the Software, and to permit
36 persons to whom the Software is furnished to do so, subject to the
37 following conditions:
38
39 The above copyright notice and this permission notice shall be included
40 in all copies or substantial portions of the Software.
41
42 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
43 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
45 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
46 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
47 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
48 USE OR OTHER DEALINGS IN THE SOFTWARE.
49 */
50
51 #include "expat_config.h"
52
53 #include <stddef.h>
54 #include <string.h> /* memcpy */
55 #include <stdbool.h>
56
57 #ifdef _WIN32
58 # include "winconfig.h"
59 #endif
60
61 #include "internal.h"
62 #include "fallthrough.h"
63 #include "xmltok.h"
64 #include "nametab.h"
65
66 #ifdef XML_DTD
67 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
68 #else
69 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */
70 #endif
71
/* Initializer fragment naming the PREFIX()-selected tokenizer and helper
   functions of one encoding family.  It is spliced into the leading
   ENCODING member of the struct normal_encoding tables in this file.
   NOTE(review): the entry order presumably mirrors the function-pointer
   members of ENCODING in xmltok.h -- confirm before reordering. */
#define VTABLE1 \
  {PREFIX(prologTok), PREFIX(contentTok), \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-selected UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
81
/* Looks up the 16-bit character with high byte `hi` and low byte `lo` in
   namingBitmap.  pages[hi] selects a group of 8 bitmap words, (lo >> 5)
   picks the word within that group and (lo & 0x1F) picks the bit.
   Evaluates to nonzero when the bit is set, 0 otherwise.
   Every parameter is parenthesized so that expression arguments (for
   example `table + 1`) expand correctly; `lo` is evaluated twice, so it
   must not have side effects. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
84
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.

   As implemented: bits 4..2 of byte[0] index `pages` (masked with 7, so
   only the first 8 pages are reachable), bits 1..0 of byte[0] together
   with bit 5 of byte[1] form the 3-bit word offset, and the low 5 bits of
   byte[1] select the bit.  `byte` should point to unsigned char; the
   callers in this file cast accordingly.  Evaluates to nonzero when the
   naming bit is set.
*/
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))
93
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.

   As implemented: the low 4 bits of byte[0] and bits 5..2 of byte[1] form
   the 8-bit `pages` index, bits 1..0 of byte[1] together with bit 5 of
   byte[2] form the 3-bit word offset, and the low 5 bits of byte[2]
   select the bit.  `byte` should point to unsigned char; the callers in
   this file cast accordingly.  Evaluates to nonzero when the naming bit
   is set.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \
         << 3) \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
105
106 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
107 of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
108 with the additional restriction of not allowing the Unicode
109 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
110 Implementation details:
111 (A & 0x80) == 0 means A < 0x80
112 and
113 (A & 0xC0) == 0xC0 means A > 0xBF
114 */
115
/* Nonzero when the 2-byte UTF-8 sequence at p is malformed: the lead byte
   is below 0xC2 (which rejects overlong encodings) or the second byte is
   not a continuation byte in 0x80..0xBF.
   p must point to unsigned char.  The parameter is parenthesized at every
   use so that pointer expressions such as `buf + 1` expand correctly. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
118
/* Nonzero when the 3-byte UTF-8 sequence at p is malformed:
     - byte 2 is not a continuation byte, or the sequence is EF BF BE /
       EF BF BF (U+FFFE / U+FFFF);
     - lead 0xE0 with byte 1 outside 0xA0..0xBF (overlong);
     - lead 0xED with byte 1 above 0x9F (surrogate range);
     - otherwise byte 1 is not a continuation byte.
   p must point to unsigned char.  The parameter is parenthesized at every
   use so that pointer expressions such as `buf + 1` expand correctly. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \
                                        : ((p)[2] & 0xC0) == 0xC0) \
   || ((*(p)) == 0xE0 \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F \
                                    : ((p)[1] & 0xC0) == 0xC0)))
127
/* Nonzero when the 4-byte UTF-8 sequence at p is malformed:
     - byte 2 or byte 3 is not a continuation byte;
     - lead 0xF0 with byte 1 outside 0x90..0xBF (overlong);
     - lead 0xF4 with byte 1 above 0x8F (beyond U+10FFFF);
     - otherwise byte 1 is not a continuation byte.
   p must point to unsigned char.  The parameter is parenthesized at every
   use so that pointer expressions such as `buf + 1` expand correctly. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \
   || ((p)[2] & 0xC0) == 0xC0 \
   || ((*(p)) == 0xF0 \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F \
                                    : ((p)[1] & 0xC0) == 0xC0)))
135
136 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)137 isNever(const ENCODING *enc, const char *p) {
138 UNUSED_P(enc);
139 UNUSED_P(p);
140 return 0;
141 }
142
143 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)144 utf8_isName2(const ENCODING *enc, const char *p) {
145 UNUSED_P(enc);
146 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
147 }
148
149 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)150 utf8_isName3(const ENCODING *enc, const char *p) {
151 UNUSED_P(enc);
152 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
153 }
154
155 #define utf8_isName4 isNever
156
157 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)158 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
159 UNUSED_P(enc);
160 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
161 }
162
163 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)164 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
165 UNUSED_P(enc);
166 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
167 }
168
169 #define utf8_isNmstrt4 isNever
170
171 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)172 utf8_isInvalid2(const ENCODING *enc, const char *p) {
173 UNUSED_P(enc);
174 return UTF8_INVALID2((const unsigned char *)p);
175 }
176
177 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)178 utf8_isInvalid3(const ENCODING *enc, const char *p) {
179 UNUSED_P(enc);
180 return UTF8_INVALID3((const unsigned char *)p);
181 }
182
183 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)184 utf8_isInvalid4(const ENCODING *enc, const char *p) {
185 UNUSED_P(enc);
186 return UTF8_INVALID4((const unsigned char *)p);
187 }
188
/* An ENCODING extended with a per-byte classification table and
   per-encoding character-class callbacks.  AS_NORMAL_ENCODING() casts an
   ENCODING pointer to this type, so `enc` has to remain the first
   member. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type (BT_* value) for each possible byte, indexed by the byte
     converted to unsigned char (see SB_BYTE_TYPE). */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Indirections used instead of inline macros when XML_MIN_SIZE is
     defined; the tables fill them through STANDARD_VTABLE(). */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name / name-start / validity predicates for 2-, 3- and 4-byte
     sequences, reached through IS_NAME_CHAR, IS_NMSTRT_CHAR and
     IS_INVALID_CHAR.  Tables built with NULL_VTABLE leave all nine
     entries NULL. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
209
210 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
211
212 #ifdef XML_MIN_SIZE
213
214 # define STANDARD_VTABLE(E) \
215 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
216
217 #else
218
219 # define STANDARD_VTABLE(E) /* as nothing */
220
221 #endif
222
/* Initializers for the nine isName / isNmstrt / isInvalid members of
   struct normal_encoding, taken from the functions (or macros) whose
   names start with the prefix E.  The order must match the member order
   in the struct. */
#define NORMAL_VTABLE(E) \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all NULL, for encoding tables that supply no
   multi-byte predicates. */
#define NULL_VTABLE \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
231
232 static int FASTCALL checkCharRefNumber(int result);
233
234 #include "xmltok_impl.h"
235 #include "ascii.h"
236
237 #ifdef XML_MIN_SIZE
238 # define sb_isNameMin isNever
239 # define sb_isNmstrtMin isNever
240 #endif
241
242 #ifdef XML_MIN_SIZE
243 # define MINBPC(enc) ((enc)->minBytesPerChar)
244 #else
245 /* minimum bytes per character */
246 # define MINBPC(enc) 1
247 #endif
248
249 #define SB_BYTE_TYPE(enc, p) \
250 (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
251
252 #ifdef XML_MIN_SIZE
253 static int PTRFASTCALL
sb_byteType(const ENCODING * enc,const char * p)254 sb_byteType(const ENCODING *enc, const char *p) {
255 return SB_BYTE_TYPE(enc, p);
256 }
257 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
258 #else
259 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
260 #endif
261
262 #ifdef XML_MIN_SIZE
263 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
264 static int PTRFASTCALL
sb_byteToAscii(const ENCODING * enc,const char * p)265 sb_byteToAscii(const ENCODING *enc, const char *p) {
266 UNUSED_P(enc);
267 return *p;
268 }
269 #else
270 # define BYTE_TO_ASCII(enc, p) (*(p))
271 #endif
272
273 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
274 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
275 #ifdef XML_MIN_SIZE
276 # define IS_INVALID_CHAR(enc, p, n) \
277 (AS_NORMAL_ENCODING(enc)->isInvalid##n \
278 && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
279 #else
280 # define IS_INVALID_CHAR(enc, p, n) \
281 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
282 #endif
283
284 #ifdef XML_MIN_SIZE
285 # define IS_NAME_CHAR_MINBPC(enc, p) \
286 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
287 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \
288 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
289 #else
290 # define IS_NAME_CHAR_MINBPC(enc, p) (0)
291 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
292 #endif
293
294 #ifdef XML_MIN_SIZE
295 # define CHAR_MATCHES(enc, p, c) \
296 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
297 static int PTRCALL
sb_charMatches(const ENCODING * enc,const char * p,int c)298 sb_charMatches(const ENCODING *enc, const char *p, int c) {
299 UNUSED_P(enc);
300 return *p == c;
301 }
302 #else
303 /* c is an ASCII character */
304 # define CHAR_MATCHES(enc, p, c) (*(p) == (c))
305 #endif
306
307 #define PREFIX(ident) normal_##ident
308 #define XML_TOK_IMPL_C
309 #include "xmltok_impl.c"
310 #undef XML_TOK_IMPL_C
311
312 #undef MINBPC
313 #undef BYTE_TYPE
314 #undef BYTE_TO_ASCII
315 #undef CHAR_MATCHES
316 #undef IS_NAME_CHAR
317 #undef IS_NAME_CHAR_MINBPC
318 #undef IS_NMSTRT_CHAR
319 #undef IS_NMSTRT_CHAR_MINBPC
320 #undef IS_INVALID_CHAR
321
/* UTF8_cvalN is the value of the masked first byte of an N byte sequence,
   i.e. the marker bits that are OR-ed into the lead byte when the
   converters in this file emit an N-byte UTF-8 sequence. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xc0, /* 110xxxxx */
  UTF8_cval3 = 0xe0, /* 1110xxxx */
  UTF8_cval4 = 0xf0  /* 11110xxx */
};
328
/* Moves *fromLimRef backwards, if needed, so that [from, *fromLimRef)
   does not end in the middle of a multi-byte UTF-8 character.

   The buffer is scanned backwards from the current limit:
     - an ASCII byte (0xxxxxxx) ends the scan with the limit just after it;
     - a lead byte of an N-byte character ends the scan when at least N
       bytes (the lead byte plus the bytes already stepped over) are
       available, placing the limit N bytes after the lead byte; otherwise
       the incomplete character is dropped and scanning continues;
     - any other byte is stepped over.
   If `from` is reached, the limit ends up equal to `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *limit = *fromLimRef;
  size_t stepped = 0; /* bytes passed since the last reset */

  while (limit > from) {
    const unsigned char last = (unsigned char)limit[-1];
    size_t seqLen = 0; /* stays 0 for bytes that are not lead bytes */

    if ((last & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((last & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead by 0b11110xxx byte */
    } else if ((last & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead by 0b1110xxxx byte */
    } else if ((last & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead by 0b110xxxxx byte */
    }

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* Character is complete: keep exactly its seqLen bytes. */
        limit += seqLen - 1;
        break;
      }
      /* Character is cut short: drop it and restart the count. */
      stepped = 0;
    }

    limit--;
    stepped++;
  }

  *fromLimRef = limit;
}
367
368 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)369 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
370 char **toP, const char *toLim) {
371 bool input_incomplete = false;
372 bool output_exhausted = false;
373
374 /* Avoid copying partial characters (due to limited space). */
375 const ptrdiff_t bytesAvailable = fromLim - *fromP;
376 const ptrdiff_t bytesStorable = toLim - *toP;
377 UNUSED_P(enc);
378 if (bytesAvailable > bytesStorable) {
379 fromLim = *fromP + bytesStorable;
380 output_exhausted = true;
381 }
382
383 /* Avoid copying partial characters (from incomplete input). */
384 {
385 const char *const fromLimBefore = fromLim;
386 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
387 if (fromLim < fromLimBefore) {
388 input_incomplete = true;
389 }
390 }
391
392 {
393 const ptrdiff_t bytesToCopy = fromLim - *fromP;
394 memcpy(*toP, *fromP, bytesToCopy);
395 *fromP += bytesToCopy;
396 *toP += bytesToCopy;
397 }
398
399 if (output_exhausted) /* needs to go first */
400 return XML_CONVERT_OUTPUT_EXHAUSTED;
401 else if (input_incomplete)
402 return XML_CONVERT_INPUT_INCOMPLETE;
403 else
404 return XML_CONVERT_COMPLETED;
405 }
406
407 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)408 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
409 unsigned short **toP, const unsigned short *toLim) {
410 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
411 unsigned short *to = *toP;
412 const char *from = *fromP;
413 while (from < fromLim && to < toLim) {
414 switch (SB_BYTE_TYPE(enc, from)) {
415 case BT_LEAD2:
416 if (fromLim - from < 2) {
417 res = XML_CONVERT_INPUT_INCOMPLETE;
418 goto after;
419 }
420 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
421 from += 2;
422 break;
423 case BT_LEAD3:
424 if (fromLim - from < 3) {
425 res = XML_CONVERT_INPUT_INCOMPLETE;
426 goto after;
427 }
428 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
429 | (from[2] & 0x3f));
430 from += 3;
431 break;
432 case BT_LEAD4: {
433 unsigned long n;
434 if (toLim - to < 2) {
435 res = XML_CONVERT_OUTPUT_EXHAUSTED;
436 goto after;
437 }
438 if (fromLim - from < 4) {
439 res = XML_CONVERT_INPUT_INCOMPLETE;
440 goto after;
441 }
442 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
443 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
444 n -= 0x10000;
445 to[0] = (unsigned short)((n >> 10) | 0xD800);
446 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
447 to += 2;
448 from += 4;
449 } break;
450 default:
451 *to++ = *from++;
452 break;
453 }
454 }
455 if (from < fromLim)
456 res = XML_CONVERT_OUTPUT_EXHAUSTED;
457 after:
458 *fromP = from;
459 *toP = to;
460 return res;
461 }
462
463 #ifdef XML_NS
/* UTF-8 encoding table, compiled only when XML_NS is defined.  Unlike
   utf8_encoding, asciitab.h is included here without first mapping
   BT_COLON to BT_NMSTRT.
   NOTE(review): the trailing 1, 1, 0 presumably fill minBytesPerChar,
   isUtf8 and isUtf16 of ENCODING (xmltok.h) -- confirm. */
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
471 #endif
472
/* UTF-8 encoding table.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so the colon entry receives the name-start
   byte type in this table.
   NOTE(review): the trailing 1, 1, 0 presumably fill minBytesPerChar,
   isUtf8 and isUtf16 of ENCODING (xmltok.h) -- confirm. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
482
483 #ifdef XML_NS
484
/* Variant of utf8_encoding_ns whose first 128 byte types come from
   iasciitab.h instead of asciitab.h; everything else is identical.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   in this file -- see that header. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
492
493 #endif
494
/* Variant of utf8_encoding whose first 128 byte types come from
   iasciitab.h instead of asciitab.h, again with BT_COLON mapped to
   BT_NMSTRT during the include.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   in this file -- see that header. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
504
505 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)506 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
507 char **toP, const char *toLim) {
508 UNUSED_P(enc);
509 for (;;) {
510 unsigned char c;
511 if (*fromP == fromLim)
512 return XML_CONVERT_COMPLETED;
513 c = (unsigned char)**fromP;
514 if (c & 0x80) {
515 if (toLim - *toP < 2)
516 return XML_CONVERT_OUTPUT_EXHAUSTED;
517 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
518 *(*toP)++ = (char)((c & 0x3f) | 0x80);
519 (*fromP)++;
520 } else {
521 if (*toP == toLim)
522 return XML_CONVERT_OUTPUT_EXHAUSTED;
523 *(*toP)++ = *(*fromP)++;
524 }
525 }
526 }
527
528 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)529 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
530 unsigned short **toP, const unsigned short *toLim) {
531 UNUSED_P(enc);
532 while (*fromP < fromLim && *toP < toLim)
533 *(*toP)++ = (unsigned char)*(*fromP)++;
534
535 if ((*toP == toLim) && (*fromP < fromLim))
536 return XML_CONVERT_OUTPUT_EXHAUSTED;
537 else
538 return XML_CONVERT_COMPLETED;
539 }
540
541 #ifdef XML_NS
542
/* Latin-1 encoding table, compiled only when XML_NS is defined: byte
   types from asciitab.h (BT_COLON not remapped) followed by latin1tab.h;
   no multi-byte predicates (NULL_VTABLE).
   NOTE(review): the trailing 1, 0, 0 presumably fill minBytesPerChar,
   isUtf8 and isUtf16 of ENCODING (xmltok.h) -- confirm. */
static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
550
551 #endif
552
/* Latin-1 encoding table: byte types from asciitab.h (with BT_COLON
   mapped to BT_NMSTRT during the include) followed by latin1tab.h; no
   multi-byte predicates (NULL_VTABLE).
   NOTE(review): the trailing 1, 0, 0 presumably fill minBytesPerChar,
   isUtf8 and isUtf16 of ENCODING (xmltok.h) -- confirm. */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
562
563 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)564 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
565 char **toP, const char *toLim) {
566 UNUSED_P(enc);
567 while (*fromP < fromLim && *toP < toLim)
568 *(*toP)++ = *(*fromP)++;
569
570 if ((*toP == toLim) && (*fromP < fromLim))
571 return XML_CONVERT_OUTPUT_EXHAUSTED;
572 else
573 return XML_CONVERT_COMPLETED;
574 }
575
576 #ifdef XML_NS
577
/* US-ASCII encoding table, compiled only when XML_NS is defined.  Only
   the entries supplied by asciitab.h are initialized explicitly; the
   rest of type[] is zero-initialized, which is BT_NONXML per the note
   below.  UTF-16 conversion reuses latin1_toUtf16.
   NOTE(review): the trailing 1, 1, 0 presumably fill minBytesPerChar,
   isUtf8 and isUtf16 of ENCODING (xmltok.h) -- confirm. */
static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
585
586 #endif
587
/* US-ASCII encoding table, with BT_COLON mapped to BT_NMSTRT while
   asciitab.h is included.  Only the entries supplied by asciitab.h are
   initialized explicitly; the rest of type[] is zero-initialized, which
   is BT_NONXML per the note below.  UTF-16 conversion reuses
   latin1_toUtf16.
   NOTE(review): the trailing 1, 1, 0 presumably fill minBytesPerChar,
   isUtf8 and isUtf16 of ENCODING (xmltok.h) -- confirm. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
597
598 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)599 unicode_byte_type(char hi, char lo) {
600 switch ((unsigned char)hi) {
601 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
602 case 0xD8:
603 case 0xD9:
604 case 0xDA:
605 case 0xDB:
606 return BT_LEAD4;
607 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
608 case 0xDC:
609 case 0xDD:
610 case 0xDE:
611 case 0xDF:
612 return BT_TRAIL;
613 case 0xFF:
614 switch ((unsigned char)lo) {
615 case 0xFF: /* noncharacter-FFFF */
616 case 0xFE: /* noncharacter-FFFE */
617 return BT_NONXML;
618 }
619 break;
620 }
621 return BT_NONASCII;
622 }
623
/* Defines the function E##toUtf8, which converts UTF-16 code units in
   [*fromP, fromLim) to UTF-8 in [*toP, toLim).  GET_LO and GET_HI must be
   defined at the point of expansion; they select the low and high byte of
   the 2-byte unit at a pointer and so fix the byte order.

   - fromLim is first rounded down to a whole number of 2-byte units.
   - Units below 0x80 produce 1 byte, units up to 0x7FF produce 2 bytes,
     units with a high byte of 0xD8-0xDB are combined with the following
     unit into a 4-byte sequence, and every other unit (including high
     bytes 0xDC-0xDF, which are not checked here) produces 3 bytes.
   - Before each character is written the remaining output space is
     checked; if it is too small, *fromP is set to that character and
     XML_CONVERT_OUTPUT_EXHAUSTED is returned.
   - If a unit with high byte 0xD8-0xDB is not followed by a second unit,
     *fromP is set to it and XML_CONVERT_INPUT_INCOMPLETE is returned.
   *toP always ends up just past the last byte written. */
#define DEFINE_UTF16_TO_UTF8(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf8( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      char **toP, const char *toLim) { \
    const char *from = *fromP; \
    UNUSED_P(enc); \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \
    for (; from < fromLim; from += 2) { \
      int plane; \
      unsigned char lo2; \
      unsigned char lo = GET_LO(from); \
      unsigned char hi = GET_HI(from); \
      switch (hi) { \
      case 0: \
        if (lo < 0x80) { \
          if (*toP == toLim) { \
            *fromP = from; \
            return XML_CONVERT_OUTPUT_EXHAUSTED; \
          } \
          *(*toP)++ = lo; \
          break; \
        } \
        EXPAT_FALLTHROUGH; \
      case 0x1: \
      case 0x2: \
      case 0x3: \
      case 0x4: \
      case 0x5: \
      case 0x6: \
      case 0x7: \
        if (toLim - *toP < 2) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      default: \
        if (toLim - *toP < 3) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      case 0xD8: \
      case 0xD9: \
      case 0xDA: \
      case 0xDB: \
        if (toLim - *toP < 4) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        if (fromLim - from < 4) { \
          *fromP = from; \
          return XML_CONVERT_INPUT_INCOMPLETE; \
        } \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
        from += 2; \
        lo2 = GET_LO(from); \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \
                     | (lo2 >> 6) | 0x80); \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
        break; \
      } \
    } \
    *fromP = from; \
    if (from < fromLim) \
      return XML_CONVERT_INPUT_INCOMPLETE; \
    else \
      return XML_CONVERT_COMPLETED; \
  }
700
/* Defines the function E##toUtf16, which copies UTF-16 code units from
   [*fromP, fromLim) into the unsigned short array [*toP, toLim), building
   each unit as (GET_HI << 8) | GET_LO.  GET_LO and GET_HI must be defined
   at the point of expansion.

   - fromLim is first rounded down to a whole number of 2-byte units.
   - If the input holds more units than the output has room for and the
     last input unit is a high surrogate ((high byte & 0xF8) == 0xD8),
     that unit is excluded and XML_CONVERT_INPUT_INCOMPLETE becomes the
     result to report, so that a surrogate pair is not split.
   - Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while
     input remains; otherwise the result chosen above
     (XML_CONVERT_COMPLETED by default).
   *fromP and *toP are advanced past what was copied. */
#define DEFINE_UTF16_TO_UTF16(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf16( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      unsigned short **toP, const unsigned short *toLim) { \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
    UNUSED_P(enc); \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
      fromLim -= 2; \
      res = XML_CONVERT_INPUT_INCOMPLETE; \
    } \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    if ((*toP == toLim) && (*fromP < fromLim)) \
      return XML_CONVERT_OUTPUT_EXHAUSTED; \
    else \
      return res; \
  }
721
/* Little-endian instantiation: the low byte of each unit comes first.
   Generates little2_toUtf8 and little2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef GET_LO
#undef GET_HI

/* Big-endian instantiation: the high byte of each unit comes first.
   Generates big2_toUtf8 and big2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef GET_LO
#undef GET_HI
739
/* Character-level helpers for little-endian 2-byte units: (p)[0] is the
   low byte and (p)[1] the high byte.  The parameter p is parenthesized at
   every use so that pointer expressions expand correctly.

   LITTLE2_BYTE_TYPE: units with a zero high byte use enc's single-byte
   type table on the low byte; all others go through unicode_byte_type().
   LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is zero,
   otherwise -1.
   LITTLE2_CHAR_MATCHES: nonzero when the unit equals the character c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: naming
   bitmap lookup of the unit in namePages / nmstrtPages. */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
748
749 #ifdef XML_MIN_SIZE
750
751 static int PTRFASTCALL
752 little2_byteType(const ENCODING *enc, const char *p) {
753 return LITTLE2_BYTE_TYPE(enc, p);
754 }
755
756 static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)757 little2_byteToAscii(const ENCODING *enc, const char *p) {
758 UNUSED_P(enc);
759 return LITTLE2_BYTE_TO_ASCII(p);
760 }
761
762 static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)763 little2_charMatches(const ENCODING *enc, const char *p, int c) {
764 UNUSED_P(enc);
765 return LITTLE2_CHAR_MATCHES(p, c);
766 }
767
768 static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)769 little2_isNameMin(const ENCODING *enc, const char *p) {
770 UNUSED_P(enc);
771 return LITTLE2_IS_NAME_CHAR_MINBPC(p);
772 }
773
774 static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)775 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
776 UNUSED_P(enc);
777 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
778 }
779
780 # undef VTABLE
781 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
782
783 #else /* not XML_MIN_SIZE */
784
785 # undef PREFIX
786 # define PREFIX(ident) little2_##ident
787 # define MINBPC(enc) 2
788 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
789 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
790 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
791 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
792 # define IS_NAME_CHAR(enc, p, n) 0
793 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
794 # define IS_NMSTRT_CHAR(enc, p, n) (0)
795 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
796
797 # define XML_TOK_IMPL_C
798 # include "xmltok_impl.c"
799 # undef XML_TOK_IMPL_C
800
801 # undef MINBPC
802 # undef BYTE_TYPE
803 # undef BYTE_TO_ASCII
804 # undef CHAR_MATCHES
805 # undef IS_NAME_CHAR
806 # undef IS_NAME_CHAR_MINBPC
807 # undef IS_NMSTRT_CHAR
808 # undef IS_NMSTRT_CHAR_MINBPC
809 # undef IS_INVALID_CHAR
810
811 #endif /* not XML_MIN_SIZE */
812
813 #ifdef XML_NS
814
/* Little-endian 2-byte encoding table, compiled only when XML_NS is
   defined.  The last ENCODING value is 1 only when BYTEORDER is 1234.
   Byte types for units with a zero high byte come from asciitab.h
   (BT_COLON not remapped) and latin1tab.h.
   NOTE(review): the values 2, 0 and the BYTEORDER-dependent flag
   presumably fill minBytesPerChar, isUtf8 and isUtf16 of ENCODING
   (xmltok.h) -- confirm. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
828
829 #endif
830
/* Little-endian 2-byte encoding table.  The last ENCODING value is 1
   only when BYTEORDER is 1234.  Byte types for units with a zero high
   byte come from asciitab.h (with BT_COLON mapped to BT_NMSTRT during
   the include) and latin1tab.h.
   NOTE(review): the values 2, 0 and the BYTEORDER-dependent flag
   presumably fill minBytesPerChar, isUtf8 and isUtf16 of ENCODING
   (xmltok.h) -- confirm. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
846
847 #if BYTEORDER != 4321
848
849 # ifdef XML_NS
850
/* Variant of little2_encoding_ns built from iasciitab.h instead of
   asciitab.h, with the last ENCODING value fixed at 1.  Compiled only
   when BYTEORDER is not 4321 and XML_NS is defined.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   in this file -- see that header. */
static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
858
859 # endif
860
/* Variant of little2_encoding built from iasciitab.h instead of
   asciitab.h (BT_COLON mapped to BT_NMSTRT during the include), with the
   last ENCODING value fixed at 1.  Compiled only when BYTEORDER is not
   4321.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   in this file -- see that header. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
870
871 #endif
872
/* Character-level helpers for big-endian 2-byte units: (p)[0] is the high
   byte and (p)[1] the low byte.  The parameter p is parenthesized at
   every use so that pointer expressions expand correctly.

   BIG2_BYTE_TYPE: units with a zero high byte use enc's single-byte type
   table on the low byte; all others go through unicode_byte_type().
   BIG2_BYTE_TO_ASCII: the low byte when the high byte is zero, otherwise
   -1.
   BIG2_CHAR_MATCHES: nonzero when the unit equals the character c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC: naming bitmap
   lookup of the unit in namePages / nmstrtPages. */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1) \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
881
882 #ifdef XML_MIN_SIZE
883
884 static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)885 big2_byteType(const ENCODING *enc, const char *p) {
886 return BIG2_BYTE_TYPE(enc, p);
887 }
888
889 static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)890 big2_byteToAscii(const ENCODING *enc, const char *p) {
891 UNUSED_P(enc);
892 return BIG2_BYTE_TO_ASCII(p);
893 }
894
895 static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)896 big2_charMatches(const ENCODING *enc, const char *p, int c) {
897 UNUSED_P(enc);
898 return BIG2_CHAR_MATCHES(p, c);
899 }
900
901 static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)902 big2_isNameMin(const ENCODING *enc, const char *p) {
903 UNUSED_P(enc);
904 return BIG2_IS_NAME_CHAR_MINBPC(p);
905 }
906
907 static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)908 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
909 UNUSED_P(enc);
910 return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
911 }
912
913 # undef VTABLE
914 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
915
916 #else /* not XML_MIN_SIZE */
917
918 # undef PREFIX
919 # define PREFIX(ident) big2_##ident
920 # define MINBPC(enc) 2
921 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
922 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
923 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
924 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
925 # define IS_NAME_CHAR(enc, p, n) 0
926 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
927 # define IS_NMSTRT_CHAR(enc, p, n) (0)
928 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
929
930 # define XML_TOK_IMPL_C
931 # include "xmltok_impl.c"
932 # undef XML_TOK_IMPL_C
933
934 # undef MINBPC
935 # undef BYTE_TYPE
936 # undef BYTE_TO_ASCII
937 # undef CHAR_MATCHES
938 # undef IS_NAME_CHAR
939 # undef IS_NAME_CHAR_MINBPC
940 # undef IS_NMSTRT_CHAR
941 # undef IS_NMSTRT_CHAR_MINBPC
942 # undef IS_INVALID_CHAR
943
944 #endif /* not XML_MIN_SIZE */
945
946 #ifdef XML_NS
947
/* Big-endian 2-byte encoding table, compiled only when XML_NS is
   defined.  The last ENCODING value is 1 only when BYTEORDER is 4321.
   Byte types for units with a zero high byte come from asciitab.h
   (BT_COLON not remapped) and latin1tab.h.
   NOTE(review): the values 2, 0 and the BYTEORDER-dependent flag
   presumably fill minBytesPerChar, isUtf8 and isUtf16 of ENCODING
   (xmltok.h) -- confirm. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
961
962 #endif
963
/* Big-endian 2-byte encoding table.  The last ENCODING value is 1 only
   when BYTEORDER is 4321.  Byte types for units with a zero high byte
   come from asciitab.h (with BT_COLON mapped to BT_NMSTRT during the
   include) and latin1tab.h.
   NOTE(review): the values 2, 0 and the BYTEORDER-dependent flag
   presumably fill minBytesPerChar, isUtf8 and isUtf16 of ENCODING
   (xmltok.h) -- confirm. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
979
980 #if BYTEORDER != 1234
981
982 # ifdef XML_NS
983
/* Variant of big2_encoding_ns built from iasciitab.h instead of
   asciitab.h, with the last ENCODING value fixed at 1.  Compiled only
   when BYTEORDER is not 1234 and XML_NS is defined.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   in this file -- see that header. */
static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
991
992 # endif
993
/* Variant of big2_encoding built from iasciitab.h instead of asciitab.h
   (BT_COLON mapped to BT_NMSTRT during the include), with the last
   ENCODING value fixed at 1.  Compiled only when BYTEORDER is not 1234.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   in this file -- see that header. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1003
1004 #endif
1005
1006 #undef PREFIX
1007
1008 static int FASTCALL
streqci(const char * s1,const char * s2)1009 streqci(const char *s1, const char *s2) {
1010 for (;;) {
1011 char c1 = *s1++;
1012 char c2 = *s2++;
1013 if (ASCII_a <= c1 && c1 <= ASCII_z)
1014 c1 += ASCII_A - ASCII_a;
1015 if (ASCII_a <= c2 && c2 <= ASCII_z)
1016 /* The following line will never get executed. streqci() is
1017 * only called from two places, both of which guarantee to put
1018 * upper-case strings into s2.
1019 */
1020 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1021 if (c1 != c2)
1022 return 0;
1023 if (! c1)
1024 break;
1025 }
1026 return 1;
1027 }
1028
1029 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)1030 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1031 POSITION *pos) {
1032 UNUSED_P(enc);
1033 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1034 }
1035
1036 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1037 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1038 char buf[1];
1039 char *p = buf;
1040 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1041 if (p == buf)
1042 return -1;
1043 else
1044 return buf[0];
1045 }
1046
1047 static int FASTCALL
isSpace(int c)1048 isSpace(int c) {
1049 switch (c) {
1050 case 0x20:
1051 case 0xD:
1052 case 0xA:
1053 case 0x9:
1054 return 1;
1055 }
1056 return 0;
1057 }
1058
1059 /* Return 1 if there's just optional white space or there's an S
1060 followed by name=val.
1061 */
1062 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1063 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1064 const char **namePtr, const char **nameEndPtr,
1065 const char **valPtr, const char **nextTokPtr) {
1066 int c;
1067 char open;
1068 if (ptr == end) {
1069 *namePtr = NULL;
1070 return 1;
1071 }
1072 if (! isSpace(toAscii(enc, ptr, end))) {
1073 *nextTokPtr = ptr;
1074 return 0;
1075 }
1076 do {
1077 ptr += enc->minBytesPerChar;
1078 } while (isSpace(toAscii(enc, ptr, end)));
1079 if (ptr == end) {
1080 *namePtr = NULL;
1081 return 1;
1082 }
1083 *namePtr = ptr;
1084 for (;;) {
1085 c = toAscii(enc, ptr, end);
1086 if (c == -1) {
1087 *nextTokPtr = ptr;
1088 return 0;
1089 }
1090 if (c == ASCII_EQUALS) {
1091 *nameEndPtr = ptr;
1092 break;
1093 }
1094 if (isSpace(c)) {
1095 *nameEndPtr = ptr;
1096 do {
1097 ptr += enc->minBytesPerChar;
1098 } while (isSpace(c = toAscii(enc, ptr, end)));
1099 if (c != ASCII_EQUALS) {
1100 *nextTokPtr = ptr;
1101 return 0;
1102 }
1103 break;
1104 }
1105 ptr += enc->minBytesPerChar;
1106 }
1107 if (ptr == *namePtr) {
1108 *nextTokPtr = ptr;
1109 return 0;
1110 }
1111 ptr += enc->minBytesPerChar;
1112 c = toAscii(enc, ptr, end);
1113 while (isSpace(c)) {
1114 ptr += enc->minBytesPerChar;
1115 c = toAscii(enc, ptr, end);
1116 }
1117 if (c != ASCII_QUOT && c != ASCII_APOS) {
1118 *nextTokPtr = ptr;
1119 return 0;
1120 }
1121 open = (char)c;
1122 ptr += enc->minBytesPerChar;
1123 *valPtr = ptr;
1124 for (;; ptr += enc->minBytesPerChar) {
1125 c = toAscii(enc, ptr, end);
1126 if (c == open)
1127 break;
1128 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1129 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1130 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1131 *nextTokPtr = ptr;
1132 return 0;
1133 }
1134 }
1135 *nextTokPtr = ptr + enc->minBytesPerChar;
1136 return 1;
1137 }
1138
/* Pseudo-attribute names ("version", "encoding", "standalone") and the two
   accepted standalone values ("yes", "no") that doParseXmlDecl() matches
   with XmlNameMatchesAscii().  Each is spelled with ASCII_* constants and
   terminated with '\0'.
*/
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1152
1153 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1154 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1155 const char *),
1156 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1157 const char *end, const char **badPtr, const char **versionPtr,
1158 const char **versionEndPtr, const char **encodingName,
1159 const ENCODING **encoding, int *standalone) {
1160 const char *val = NULL;
1161 const char *name = NULL;
1162 const char *nameEnd = NULL;
1163 ptr += 5 * enc->minBytesPerChar;
1164 end -= 2 * enc->minBytesPerChar;
1165 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1166 || ! name) {
1167 *badPtr = ptr;
1168 return 0;
1169 }
1170 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1171 if (! isGeneralTextEntity) {
1172 *badPtr = name;
1173 return 0;
1174 }
1175 } else {
1176 if (versionPtr)
1177 *versionPtr = val;
1178 if (versionEndPtr)
1179 *versionEndPtr = ptr;
1180 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1181 *badPtr = ptr;
1182 return 0;
1183 }
1184 if (! name) {
1185 if (isGeneralTextEntity) {
1186 /* a TextDecl must have an EncodingDecl */
1187 *badPtr = ptr;
1188 return 0;
1189 }
1190 return 1;
1191 }
1192 }
1193 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1194 int c = toAscii(enc, val, end);
1195 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1196 *badPtr = val;
1197 return 0;
1198 }
1199 if (encodingName)
1200 *encodingName = val;
1201 if (encoding)
1202 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1203 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1204 *badPtr = ptr;
1205 return 0;
1206 }
1207 if (! name)
1208 return 1;
1209 }
1210 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1211 || isGeneralTextEntity) {
1212 *badPtr = name;
1213 return 0;
1214 }
1215 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1216 if (standalone)
1217 *standalone = 1;
1218 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1219 if (standalone)
1220 *standalone = 0;
1221 } else {
1222 *badPtr = val;
1223 return 0;
1224 }
1225 while (isSpace(toAscii(enc, ptr, end)))
1226 ptr += enc->minBytesPerChar;
1227 if (ptr != end) {
1228 *badPtr = ptr;
1229 return 0;
1230 }
1231 return 1;
1232 }
1233
1234 static int FASTCALL
checkCharRefNumber(int result)1235 checkCharRefNumber(int result) {
1236 switch (result >> 8) {
1237 case 0xD8:
1238 case 0xD9:
1239 case 0xDA:
1240 case 0xDB:
1241 case 0xDC:
1242 case 0xDD:
1243 case 0xDE:
1244 case 0xDF:
1245 return -1;
1246 case 0:
1247 if (latin1_encoding.type[result] == BT_NONXML)
1248 return -1;
1249 break;
1250 case 0xFF:
1251 if (result == 0xFFFE || result == 0xFFFF)
1252 return -1;
1253 break;
1254 }
1255 return result;
1256 }
1257
1258 int FASTCALL
XmlUtf8Encode(int c,char * buf)1259 XmlUtf8Encode(int c, char *buf) {
1260 enum {
1261 /* minN is minimum legal resulting value for N byte sequence */
1262 min2 = 0x80,
1263 min3 = 0x800,
1264 min4 = 0x10000
1265 };
1266
1267 if (c < 0)
1268 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1269 if (c < min2) {
1270 buf[0] = (char)(c | UTF8_cval1);
1271 return 1;
1272 }
1273 if (c < min3) {
1274 buf[0] = (char)((c >> 6) | UTF8_cval2);
1275 buf[1] = (char)((c & 0x3f) | 0x80);
1276 return 2;
1277 }
1278 if (c < min4) {
1279 buf[0] = (char)((c >> 12) | UTF8_cval3);
1280 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1281 buf[2] = (char)((c & 0x3f) | 0x80);
1282 return 3;
1283 }
1284 if (c < 0x110000) {
1285 buf[0] = (char)((c >> 18) | UTF8_cval4);
1286 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1287 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1288 buf[3] = (char)((c & 0x3f) | 0x80);
1289 return 4;
1290 }
1291 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1292 }
1293
1294 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1295 XmlUtf16Encode(int charNum, unsigned short *buf) {
1296 if (charNum < 0)
1297 return 0;
1298 if (charNum < 0x10000) {
1299 buf[0] = (unsigned short)charNum;
1300 return 1;
1301 }
1302 if (charNum < 0x110000) {
1303 charNum -= 0x10000;
1304 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1305 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1306 return 2;
1307 }
1308 return 0;
1309 }
1310
/* Encoding object filled in by XmlInitUnknownEncoding().  The ENCODING
   handed back to the caller is &normal.enc, which is why an ENCODING
   pointer can be cast back with AS_UNKNOWN_ENCODING().
*/
struct unknown_encoding {
  /* Starts as a copy of latin1_encoding; type[] and some function
     pointers are then overridden. */
  struct normal_encoding normal;
  /* Caller-supplied converter, invoked as convert(userData, p). */
  CONVERTER convert;
  /* Opaque pointer passed as the first argument of convert. */
  void *userData;
  /* Per input byte: the UTF-16 code unit to emit.  0 makes
     unknown_toUtf16() call convert instead. */
  unsigned short utf16[256];
  /* Per input byte: [0] is the UTF-8 byte count and the bytes follow.
     A count of 0 makes unknown_toUtf8() call convert instead. */
  char utf8[256][4];
};

/* View an ENCODING pointer as a pointer to unknown_encoding. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1320
1321 int
XmlSizeOfUnknownEncoding(void)1322 XmlSizeOfUnknownEncoding(void) {
1323 return sizeof(struct unknown_encoding);
1324 }
1325
1326 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1327 unknown_isName(const ENCODING *enc, const char *p) {
1328 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1329 int c = uenc->convert(uenc->userData, p);
1330 if (c & ~0xFFFF)
1331 return 0;
1332 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1333 }
1334
1335 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1336 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1337 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1338 int c = uenc->convert(uenc->userData, p);
1339 if (c & ~0xFFFF)
1340 return 0;
1341 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1342 }
1343
1344 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1345 unknown_isInvalid(const ENCODING *enc, const char *p) {
1346 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1347 int c = uenc->convert(uenc->userData, p);
1348 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1349 }
1350
1351 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1352 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1353 char **toP, const char *toLim) {
1354 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1355 char buf[XML_UTF8_ENCODE_MAX];
1356 for (;;) {
1357 const char *utf8;
1358 int n;
1359 if (*fromP == fromLim)
1360 return XML_CONVERT_COMPLETED;
1361 utf8 = uenc->utf8[(unsigned char)**fromP];
1362 n = *utf8++;
1363 if (n == 0) {
1364 int c = uenc->convert(uenc->userData, *fromP);
1365 n = XmlUtf8Encode(c, buf);
1366 if (n > toLim - *toP)
1367 return XML_CONVERT_OUTPUT_EXHAUSTED;
1368 utf8 = buf;
1369 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1370 - (BT_LEAD2 - 2));
1371 } else {
1372 if (n > toLim - *toP)
1373 return XML_CONVERT_OUTPUT_EXHAUSTED;
1374 (*fromP)++;
1375 }
1376 memcpy(*toP, utf8, n);
1377 *toP += n;
1378 }
1379 }
1380
1381 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1382 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1383 unsigned short **toP, const unsigned short *toLim) {
1384 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1385 while (*fromP < fromLim && *toP < toLim) {
1386 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1387 if (c == 0) {
1388 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1389 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1390 - (BT_LEAD2 - 2));
1391 } else
1392 (*fromP)++;
1393 *(*toP)++ = c;
1394 }
1395
1396 if ((*toP == toLim) && (*fromP < fromLim))
1397 return XML_CONVERT_OUTPUT_EXHAUSTED;
1398 else
1399 return XML_CONVERT_COMPLETED;
1400 }
1401
1402 ENCODING *
XmlInitUnknownEncoding(void * mem,const int * table,CONVERTER convert,void * userData)1403 XmlInitUnknownEncoding(void *mem, const int *table, CONVERTER convert,
1404 void *userData) {
1405 int i;
1406 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1407 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1408 for (i = 0; i < 128; i++)
1409 if (latin1_encoding.type[i] != BT_OTHER
1410 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1411 return 0;
1412 for (i = 0; i < 256; i++) {
1413 int c = table[i];
1414 if (c == -1) {
1415 e->normal.type[i] = BT_MALFORM;
1416 /* This shouldn't really get used. */
1417 e->utf16[i] = 0xFFFF;
1418 e->utf8[i][0] = 1;
1419 e->utf8[i][1] = 0;
1420 } else if (c < 0) {
1421 if (c < -4)
1422 return 0;
1423 /* Multi-byte sequences need a converter function */
1424 if (! convert)
1425 return 0;
1426 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1427 e->utf8[i][0] = 0;
1428 e->utf16[i] = 0;
1429 } else if (c < 0x80) {
1430 if (latin1_encoding.type[c] != BT_OTHER
1431 && latin1_encoding.type[c] != BT_NONXML && c != i)
1432 return 0;
1433 e->normal.type[i] = latin1_encoding.type[c];
1434 e->utf8[i][0] = 1;
1435 e->utf8[i][1] = (char)c;
1436 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1437 } else if (checkCharRefNumber(c) < 0) {
1438 e->normal.type[i] = BT_NONXML;
1439 /* This shouldn't really get used. */
1440 e->utf16[i] = 0xFFFF;
1441 e->utf8[i][0] = 1;
1442 e->utf8[i][1] = 0;
1443 } else {
1444 if (c > 0xFFFF)
1445 return 0;
1446 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1447 e->normal.type[i] = BT_NMSTRT;
1448 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1449 e->normal.type[i] = BT_NAME;
1450 else
1451 e->normal.type[i] = BT_OTHER;
1452 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1453 e->utf16[i] = (unsigned short)c;
1454 }
1455 }
1456 e->userData = userData;
1457 e->convert = convert;
1458 if (convert) {
1459 e->normal.isName2 = unknown_isName;
1460 e->normal.isName3 = unknown_isName;
1461 e->normal.isName4 = unknown_isName;
1462 e->normal.isNmstrt2 = unknown_isNmstrt;
1463 e->normal.isNmstrt3 = unknown_isNmstrt;
1464 e->normal.isNmstrt4 = unknown_isNmstrt;
1465 e->normal.isInvalid2 = unknown_isInvalid;
1466 e->normal.isInvalid3 = unknown_isInvalid;
1467 e->normal.isInvalid4 = unknown_isInvalid;
1468 }
1469 e->normal.enc.utf8Convert = unknown_toUtf8;
1470 e->normal.enc.utf16Convert = unknown_toUtf16;
1471 return &(e->normal.enc);
1472 }
1473
/* Encoding indices.  If this enumeration is changed, getEncodingIndex and
   encodings must also be changed.  The values 0..5 double as indices into
   the encodingNames array inside getEncodingIndex.
*/
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1487
/* Upper-case names of the built-in encodings, spelled with ASCII_*
   constants and terminated with '\0'.  getEncodingIndex() compares
   against them with streqci().
*/
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1504
1505 static int FASTCALL
getEncodingIndex(const char * name)1506 getEncodingIndex(const char *name) {
1507 static const char *const encodingNames[] = {
1508 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1509 };
1510 int i;
1511 if (name == NULL)
1512 return NO_ENC;
1513 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1514 if (streqci(name, encodingNames[i]))
1515 return i;
1516 return UNKNOWN_ENC;
1517 }
1518
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  The argument
   is parenthesized so that the cast applies to the whole expression
   rather than only to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1525
1526 /* This is what detects the encoding. encodingTable maps from
1527 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1528 the external (protocol) specified encoding; state is
1529 XML_CONTENT_STATE if we're parsing an external text entity, and
1530 XML_PROLOG_STATE otherwise.
1531 */
1532
1533 static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1534 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1535 int state, const char *ptr, const char *end, const char **nextTokPtr) {
1536 const ENCODING **encPtr;
1537
1538 if (ptr >= end)
1539 return XML_TOK_NONE;
1540 encPtr = enc->encPtr;
1541 if (ptr + 1 == end) {
1542 /* only a single byte available for auto-detection */
1543 #ifndef XML_DTD /* FIXME */
1544 /* a well-formed document entity must have more than one byte */
1545 if (state != XML_CONTENT_STATE)
1546 return XML_TOK_PARTIAL;
1547 #endif
1548 /* so we're parsing an external text entity... */
1549 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1550 switch (INIT_ENC_INDEX(enc)) {
1551 case UTF_16_ENC:
1552 case UTF_16LE_ENC:
1553 case UTF_16BE_ENC:
1554 return XML_TOK_PARTIAL;
1555 }
1556 switch ((unsigned char)*ptr) {
1557 case 0xFE:
1558 case 0xFF:
1559 case 0xEF: /* possibly first byte of UTF-8 BOM */
1560 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1561 break;
1562 EXPAT_FALLTHROUGH;
1563 case 0x00:
1564 case 0x3C:
1565 return XML_TOK_PARTIAL;
1566 }
1567 } else {
1568 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1569 case 0xFEFF:
1570 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1571 break;
1572 *nextTokPtr = ptr + 2;
1573 *encPtr = encodingTable[UTF_16BE_ENC];
1574 return XML_TOK_BOM;
1575 /* 00 3C is handled in the default case */
1576 case 0x3C00:
1577 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1578 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1579 && state == XML_CONTENT_STATE)
1580 break;
1581 *encPtr = encodingTable[UTF_16LE_ENC];
1582 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1583 case 0xFFFE:
1584 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1585 break;
1586 *nextTokPtr = ptr + 2;
1587 *encPtr = encodingTable[UTF_16LE_ENC];
1588 return XML_TOK_BOM;
1589 case 0xEFBB:
1590 /* Maybe a UTF-8 BOM (EF BB BF) */
1591 /* If there's an explicitly specified (external) encoding
1592 of ISO-8859-1 or some flavour of UTF-16
1593 and this is an external text entity,
1594 don't look for the BOM,
1595 because it might be a legal data.
1596 */
1597 if (state == XML_CONTENT_STATE) {
1598 int e = INIT_ENC_INDEX(enc);
1599 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1600 || e == UTF_16_ENC)
1601 break;
1602 }
1603 if (ptr + 2 == end)
1604 return XML_TOK_PARTIAL;
1605 if ((unsigned char)ptr[2] == 0xBF) {
1606 *nextTokPtr = ptr + 3;
1607 *encPtr = encodingTable[UTF_8_ENC];
1608 return XML_TOK_BOM;
1609 }
1610 break;
1611 default:
1612 if (ptr[0] == '\0') {
1613 /* 0 isn't a legal data character. Furthermore a document
1614 entity can only start with ASCII characters. So the only
1615 way this can fail to be big-endian UTF-16 if it it's an
1616 external parsed general entity that's labelled as
1617 UTF-16LE.
1618 */
1619 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1620 break;
1621 *encPtr = encodingTable[UTF_16BE_ENC];
1622 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1623 } else if (ptr[1] == '\0') {
1624 /* We could recover here in the case:
1625 - parsing an external entity
1626 - second byte is 0
1627 - no externally specified encoding
1628 - no encoding declaration
1629 by assuming UTF-16LE. But we don't, because this would mean when
1630 presented just with a single byte, we couldn't reliably determine
1631 whether we needed further bytes.
1632 */
1633 if (state == XML_CONTENT_STATE)
1634 break;
1635 *encPtr = encodingTable[UTF_16LE_ENC];
1636 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1637 }
1638 break;
1639 }
1640 }
1641 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1642 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1643 }
1644
/* First inclusion of xmltok_ns.c: NS() and ns() expand to their argument
   unchanged, so identifiers built with them keep their plain names.
   XML_TOK_NS_C is defined only for the duration of the include.
*/
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1652
1653 #ifdef XML_NS
1654
/* Second inclusion of xmltok_ns.c (XML_NS builds only): NS() appends "NS"
   and ns() appends "_ns" to their argument, so the same source yields a
   second, differently named set of definitions.
*/
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1664
1665 ENCODING *
XmlInitUnknownEncodingNS(void * mem,const int * table,CONVERTER convert,void * userData)1666 XmlInitUnknownEncodingNS(void *mem, const int *table, CONVERTER convert,
1667 void *userData) {
1668 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1669 if (enc)
1670 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1671 return enc;
1672 }
1673
1674 #endif /* XML_NS */
1675