xref: /freebsd/contrib/expat/lib/xmltok.c (revision 55141f2c8991b2a6adbf30bb0fe3e6cbc303f06d)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14    Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15    Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
16    Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17    Copyright (c) 2016      Don Lewis <truckman@apache.org>
18    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19    Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20    Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21    Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23    Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
24    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25    Licensed under the MIT license:
26 
27    Permission is  hereby granted,  free of charge,  to any  person obtaining
28    a  copy  of  this  software   and  associated  documentation  files  (the
29    "Software"),  to  deal in  the  Software  without restriction,  including
30    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
31    distribute, sublicense, and/or sell copies of the Software, and to permit
32    persons  to whom  the Software  is  furnished to  do so,  subject to  the
33    following conditions:
34 
35    The above copyright  notice and this permission notice  shall be included
36    in all copies or substantial portions of the Software.
37 
38    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
39    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
40    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
41    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
42    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
43    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
44    USE OR OTHER DEALINGS IN THE SOFTWARE.
45 */
46 
47 #include <expat_config.h>
48 
49 #include <stddef.h>
50 #include <string.h> /* memcpy */
51 #include <stdbool.h>
52 
53 #ifdef _WIN32
54 #  include "winconfig.h"
55 #endif
56 
57 #include "expat_external.h"
58 #include "internal.h"
59 #include "xmltok.h"
60 #include "nametab.h"
61 
/* In DTD-enabled builds the ignore-section tokenizer is appended (with its
   own leading comma) to the first brace group of VTABLE1; otherwise the
   macro expands to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the per-encoding functions.  PREFIX(ident)
   must be defined at the point of use; it supplies the encoding-specific
   name prefix (e.g. normal_##ident).  The fragment is positional, so the
   order of entries must not be changed. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
77 
/* Look up a UCS-2 code unit, given as its high byte `hi` and low byte `lo`,
   in the naming bitmap.  `pages` maps the high byte to a page number; each
   page owns 8 consecutive words of namingBitmap, selected by the top 3 bits
   of `lo`, and the bottom 5 bits of `lo` select the bit within the word.
   Evaluates to non-zero when the bit is set.  Every parameter is
   parenthesized so that arbitrary expressions (e.g. `table + 1`) can be
   passed safely; `lo` is evaluated twice. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
80 
/* Naming-bitmap lookup for a 2 byte UTF-8 sequence.  The character's 11
   bits are spread over the low 5 bits of byte[0] and the low 6 bits of
   byte[1].  Of those, 3 bits (via pages, an 8 bit page number) pick the
   page, 3 bits pick the word inside the page and the last 5 bits pick the
   bit inside the word.  `byte` must point to unsigned char.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  ((1u << ((byte)[1] & 0x1F))                                                  \
   & namingBitmap[((pages)[((byte)[0] >> 2) & 7] << 3)                         \
                  + (((byte)[0] & 3) << 1) + (((byte)[1] >> 5) & 1)])
89 
/* Naming-bitmap lookup for a 3 byte UTF-8 sequence.  The character's 16
   bits are spread over the low 4 bits of byte[0] and the low 6 bits of
   byte[1] and byte[2].  The top 8 bits index pages, the next 3 bits pick
   the word inside the page and the last 5 bits pick the bit inside the
   word.  `byte` must point to unsigned char.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  ((1u << ((byte)[2] & 0x1F))                                                  \
   & namingBitmap[((pages)[(((byte)[0] & 0xF) << 4)                            \
                           + (((byte)[1] >> 2) & 0xF)]                         \
                   << 3)                                                       \
                  + (((byte)[1] & 3) << 1) + (((byte)[2] >> 5) & 1)])
101 
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned char addressing the lead byte
   and evaluates to non-zero when the 2, 3 or 4 byte sequence is invalid.
   The argument is fully parenthesized, so pointer expressions such as
   `buf + 1` are handled correctly; it is evaluated more than once.
*/

#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
131 
/* Predicate that ignores both arguments and always answers "no" (0).
   It is aliased below for the 4-byte UTF-8 name checks and, under
   XML_MIN_SIZE, for the single-byte isNameMin/isNmstrtMin slots. */
static int PTRFASTCALL
isNever(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  UNUSED_P(p);
  return 0;
}
138 
/* Non-zero when the 2 byte UTF-8 sequence at p has its bit set in the
   name-character pages (namePages / namingBitmap).  enc is unused. */
static int PTRFASTCALL
utf8_isName2(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}
144 
/* Non-zero when the 3 byte UTF-8 sequence at p has its bit set in the
   name-character pages (namePages / namingBitmap).  enc is unused. */
static int PTRFASTCALL
utf8_isName3(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
}

/* 4 byte sequences are never reported as name characters. */
#define utf8_isName4 isNever
152 
/* Non-zero when the 2 byte UTF-8 sequence at p has its bit set in the
   name-start pages (nmstrtPages / namingBitmap).  enc is unused. */
static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
}
158 
/* Non-zero when the 3 byte UTF-8 sequence at p has its bit set in the
   name-start pages (nmstrtPages / namingBitmap).  enc is unused. */
static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
}

/* 4 byte sequences are never reported as name-start characters. */
#define utf8_isNmstrt4 isNever
166 
/* Non-zero when the 2 bytes at p do not form a valid 2 byte UTF-8
   sequence according to UTF8_INVALID2.  enc is unused. */
static int PTRFASTCALL
utf8_isInvalid2(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_INVALID2((const unsigned char *)p);
}
172 
/* Non-zero when the 3 bytes at p do not form a valid 3 byte UTF-8
   sequence according to UTF8_INVALID3.  enc is unused. */
static int PTRFASTCALL
utf8_isInvalid3(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_INVALID3((const unsigned char *)p);
}
178 
/* Non-zero when the 4 bytes at p do not form a valid 4 byte UTF-8
   sequence according to UTF8_INVALID4.  enc is unused. */
static int PTRFASTCALL
utf8_isInvalid4(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_INVALID4((const unsigned char *)p);
}
184 
/* An ENCODING extended with a per-byte classification table and the
   callbacks used for multi-byte sequences.  The ENCODING must stay the
   first member: AS_NORMAL_ENCODING() casts an ENCODING pointer to this
   type.  Instances are built with positional initializers, so the member
   order must match STANDARD_VTABLE / NORMAL_VTABLE / NULL_VTABLE. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type for every possible byte value, indexed by the byte. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* In XML_MIN_SIZE builds these helpers are reached through function
     pointers instead of being expanded as macros. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name / name-start / validity checks for 2, 3 and 4 byte sequences.
     Encodings initialized with NULL_VTABLE leave all nine NULL. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
205 
/* View an ENCODING pointer as the (const) normal_encoding containing it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

#ifdef XML_MIN_SIZE

/* Initializers for the XML_MIN_SIZE-only members of normal_encoding, named
   with prefix E; note the trailing comma, which lets NORMAL_VTABLE or
   NULL_VTABLE follow directly. */
#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte callbacks, named with prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all left NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL

/* Forward declaration; the definition is not in this part of the file. */
static int FASTCALL checkCharRefNumber(int);
229 
230 #include "xmltok_impl.h"
231 #include "ascii.h"
232 
/* Single-byte ("sb_") encodings: the MINBPC name checks always answer 0. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* MINBPC is read from the encoding at run time in XML_MIN_SIZE builds and
   is the constant 1 for this instantiation otherwise. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
244 
/* Byte type of the single byte at p, read from the encoding's type table.
   The lookup goes through AS_NORMAL_ENCODING so the const qualifier of
   enc is preserved instead of being cast away. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, stored in the byteType slot. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
/* XML_MIN_SIZE: dispatch through the encoding's byteType pointer. */
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
257 
/* BYTE_TO_ASCII yields the value of the character at p.  For single-byte
   encodings that is the byte itself; XML_MIN_SIZE builds dispatch through
   the encoding's byteToAscii pointer instead. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* Single-byte implementation: returns *p; enc is unused. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
268 
/* Multi-byte checks: n is 2, 3 or 4 and selects the matching callback
   (isName##n, isNmstrt##n, isInvalid##n) of the normal_encoding. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
/* XML_MIN_SIZE: the isInvalid pointer is tested for NULL before the call,
   so a NULL callback yields 0. */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Checks on a character of MINBPC bytes: dispatched through the encoding
   in XML_MIN_SIZE builds, constant 0 for this instantiation otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
289 
/* CHAR_MATCHES tests whether the character at p equals the ASCII
   character c.  XML_MIN_SIZE builds dispatch through the encoding's
   charMatches pointer; otherwise the byte is compared directly. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* Single-byte implementation: compares *p with c; enc is unused. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
302 
/* Instantiate the tokenizer template for single-byte/UTF-8 encodings:
   every PREFIX(name) in xmltok_impl.c becomes normal_name, built on the
   helper macros defined above. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the helper macros so that the next instantiation can define its
   own versions (PREFIX itself stays defined). */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
317 
/* Marker bits of a UTF-8 lead byte; the encoders in this file OR them
   into the first byte of a 2, 3 or 4 byte sequence. */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
324 
/* Move *fromLimRef backwards (never before from) so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.  The scan walks back from the end: an ASCII byte stops it
   immediately; a lead byte stops it when enough bytes follow to complete
   its sequence, in which case the limit is placed right after that
   sequence; an incomplete lead byte is dropped and the scan continues. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen = 0; /* 0: not a recognized lead byte */

    if ((last & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((last & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead by 0b11110xxx byte */
    } else if ((last & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead by 0b1110xxxx byte */
    } else if ((last & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead by 0b110xxxxx byte */
    }

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* Sequence is complete: keep all of it. */
        end += seqLen - 1;
        break;
      }
      /* Incomplete sequence: restart the count before stepping over
         the lead byte. */
      stepped = 0;
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
363 
364 static enum XML_Convert_Result PTRCALL
365 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
366             char **toP, const char *toLim) {
367   bool input_incomplete = false;
368   bool output_exhausted = false;
369 
370   /* Avoid copying partial characters (due to limited space). */
371   const ptrdiff_t bytesAvailable = fromLim - *fromP;
372   const ptrdiff_t bytesStorable = toLim - *toP;
373   UNUSED_P(enc);
374   if (bytesAvailable > bytesStorable) {
375     fromLim = *fromP + bytesStorable;
376     output_exhausted = true;
377   }
378 
379   /* Avoid copying partial characters (from incomplete input). */
380   {
381     const char *const fromLimBefore = fromLim;
382     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
383     if (fromLim < fromLimBefore) {
384       input_incomplete = true;
385     }
386   }
387 
388   {
389     const ptrdiff_t bytesToCopy = fromLim - *fromP;
390     memcpy(*toP, *fromP, bytesToCopy);
391     *fromP += bytesToCopy;
392     *toP += bytesToCopy;
393   }
394 
395   if (output_exhausted) /* needs to go first */
396     return XML_CONVERT_OUTPUT_EXHAUSTED;
397   else if (input_incomplete)
398     return XML_CONVERT_INPUT_INCOMPLETE;
399   else
400     return XML_CONVERT_COMPLETED;
401 }
402 
403 static enum XML_Convert_Result PTRCALL
404 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
405              unsigned short **toP, const unsigned short *toLim) {
406   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
407   unsigned short *to = *toP;
408   const char *from = *fromP;
409   while (from < fromLim && to < toLim) {
410     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
411     case BT_LEAD2:
412       if (fromLim - from < 2) {
413         res = XML_CONVERT_INPUT_INCOMPLETE;
414         goto after;
415       }
416       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
417       from += 2;
418       break;
419     case BT_LEAD3:
420       if (fromLim - from < 3) {
421         res = XML_CONVERT_INPUT_INCOMPLETE;
422         goto after;
423       }
424       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
425                                | (from[2] & 0x3f));
426       from += 3;
427       break;
428     case BT_LEAD4: {
429       unsigned long n;
430       if (toLim - to < 2) {
431         res = XML_CONVERT_OUTPUT_EXHAUSTED;
432         goto after;
433       }
434       if (fromLim - from < 4) {
435         res = XML_CONVERT_INPUT_INCOMPLETE;
436         goto after;
437       }
438       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
439           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
440       n -= 0x10000;
441       to[0] = (unsigned short)((n >> 10) | 0xD800);
442       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
443       to += 2;
444       from += 4;
445     } break;
446     default:
447       *to++ = *from++;
448       break;
449     }
450   }
451   if (from < fromLim)
452     res = XML_CONVERT_OUTPUT_EXHAUSTED;
453 after:
454   *fromP = from;
455   *toP = to;
456   return res;
457 }
458 
/* UTF-8 encoding objects.  Each one combines the normal_ tokenizer vtable
   and the UTF-8 converters with a 256-entry byte-type table assembled
   from two included table fragments.
   NOTE(review): the three trailing integers of the ENCODING initializer
   fill ENCODING members declared in xmltok.h (not visible here) - confirm
   their meaning there. */
#ifdef XML_NS
/* Namespace-aware variant: asciitab.h is included with BT_COLON left as
   is. */
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while the ASCII
   table is included. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

/* As utf8_encoding_ns, but built from iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

/* As utf8_encoding, but built from iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
500 
501 static enum XML_Convert_Result PTRCALL
502 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
503               char **toP, const char *toLim) {
504   UNUSED_P(enc);
505   for (;;) {
506     unsigned char c;
507     if (*fromP == fromLim)
508       return XML_CONVERT_COMPLETED;
509     c = (unsigned char)**fromP;
510     if (c & 0x80) {
511       if (toLim - *toP < 2)
512         return XML_CONVERT_OUTPUT_EXHAUSTED;
513       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
514       *(*toP)++ = (char)((c & 0x3f) | 0x80);
515       (*fromP)++;
516     } else {
517       if (*toP == toLim)
518         return XML_CONVERT_OUTPUT_EXHAUSTED;
519       *(*toP)++ = *(*fromP)++;
520     }
521   }
522 }
523 
524 static enum XML_Convert_Result PTRCALL
525 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
526                unsigned short **toP, const unsigned short *toLim) {
527   UNUSED_P(enc);
528   while (*fromP < fromLim && *toP < toLim)
529     *(*toP)++ = (unsigned char)*(*fromP)++;
530 
531   if ((*toP == toLim) && (*fromP < fromLim))
532     return XML_CONVERT_OUTPUT_EXHAUSTED;
533   else
534     return XML_CONVERT_COMPLETED;
535 }
536 
/* Latin-1 encoding objects: normal_ tokenizer vtable, Latin-1 converters
   and a byte-type table assembled from asciitab.h and latin1tab.h.  The
   multi-byte callbacks are all NULL (NULL_VTABLE). */
#ifdef XML_NS

/* Namespace-aware variant: BT_COLON is left as is. */
static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while the ASCII
   table is included. */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
558 
559 static enum XML_Convert_Result PTRCALL
560 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
561              char **toP, const char *toLim) {
562   UNUSED_P(enc);
563   while (*fromP < fromLim && *toP < toLim)
564     *(*toP)++ = *(*fromP)++;
565 
566   if ((*toP == toLim) && (*fromP < fromLim))
567     return XML_CONVERT_OUTPUT_EXHAUSTED;
568   else
569     return XML_CONVERT_COMPLETED;
570 }
571 
/* US-ASCII encoding objects: normal_ tokenizer vtable, ascii_toUtf8 plus
   the Latin-1 UTF-16 converter, and a byte-type table that only lists
   the entries from asciitab.h.  The remaining table entries are
   zero-initialized, i.e. BT_NONXML.  The multi-byte callbacks are all
   NULL (NULL_VTABLE). */
#ifdef XML_NS

/* Namespace-aware variant: BT_COLON is left as is. */
static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while the ASCII
   table is included. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
593 
594 static int PTRFASTCALL
595 unicode_byte_type(char hi, char lo) {
596   switch ((unsigned char)hi) {
597   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
598   case 0xD8:
599   case 0xD9:
600   case 0xDA:
601   case 0xDB:
602     return BT_LEAD4;
603   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
604   case 0xDC:
605   case 0xDD:
606   case 0xDE:
607   case 0xDF:
608     return BT_TRAIL;
609   case 0xFF:
610     switch ((unsigned char)lo) {
611     case 0xFF: /* noncharacter-FFFF */
612     case 0xFE: /* noncharacter-FFFE */
613       return BT_NONXML;
614     }
615     break;
616   }
617   return BT_NONASCII;
618 }
619 
/* Define E##toUtf8, a UTF-16 to UTF-8 converter.  GET_LO and GET_HI must
   be defined at the point of expansion; they pick the low and high byte
   of a code unit and thereby fix the byte order.

   The input length is first rounded down to a whole number of code
   units.  Each unit is then encoded according to its high byte:
     - 0x00 with low byte < 0x80: one output byte
     - 0x00..0x07 otherwise: two output bytes
     - 0xD8..0xDB (first half of a surrogate pair): combined with the
       following unit into four output bytes; needs 4 input bytes
     - anything else (the default label, placed before the surrogate
       cases): three output bytes
   Before anything is written for a unit, the required output space is
   checked.  On XML_CONVERT_OUTPUT_EXHAUSTED, and on
   XML_CONVERT_INPUT_INCOMPLETE for a surrogate pair cut off by fromLim,
   *fromP is left at the unit that could not be converted.  The final
   from < fromLim test compares against the already rounded-down
   fromLim. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
696 
/* Define E##toUtf16, which copies UTF-16 input bytes into native
   unsigned short code units.  GET_LO and GET_HI must be defined at the
   point of expansion and fix the input byte order.

   The input length is rounded down to a whole number of code units.  If
   the input holds more units than the output can take and the last input
   unit has a high byte in 0xD8..0xDF, that unit is held back and the
   result becomes XML_CONVERT_INPUT_INCOMPLETE.  The function returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input
   remains; this takes precedence over the previous case. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
717 
/* Little-endian byte access: byte 0 is the low byte, byte 1 the high
   byte.  SET2 stores a 16 bit value, GET_LO / GET_HI read one byte. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte access: byte 0 is the high byte, byte 1 the low byte. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
739 
/* Helpers for little-endian UTF-16: p points at a 2 byte code unit with
   the low byte first.  Every use of p is parenthesized so that pointer
   expressions such as `buf + 2` can be passed; p is evaluated more than
   once. */

/* Byte type of the unit at p: taken from the encoding's type table when
   the high byte is 0, from unicode_byte_type() otherwise.  The lookup
   goes through AS_NORMAL_ENCODING so the const qualifier of enc is kept. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)]            \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the unit at p is the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups for the unit at p (name / name-start). */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
749 
/* Two build flavours for the little-endian UTF-16 tokenizer.  With
   XML_MIN_SIZE the LITTLE2_* classification macros are wrapped in functions
   and VTABLE is redefined around the little2_ converters; otherwise
   xmltok_impl.c is included with PREFIX set to little2_ and the generic
   classification macros bound directly to the LITTLE2_* ones. */
#ifdef XML_MIN_SIZE

/* Function form of LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII; enc is not consulted. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Function form of LITTLE2_CHAR_MATCHES; enc is not consulted. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* From here on VTABLE expands to VTABLE1 followed by the little2_
   converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Bind the generic tokenizer macros to their little-endian versions and
   instantiate xmltok_impl.c with the little2_ prefix. */
#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant 0 here; only the MINBPC variants
   consult the naming tables. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the bindings so the next instantiation can define its own.
   NOTE(review): IS_INVALID_CHAR is not defined in this section; presumably
   xmltok_impl.c supplies it -- confirm. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
813 
/* Encoding objects for little-endian UTF-16.
   Each initializer has the shape {{VTABLE, 2, 0, flag}, {type table},
   STANDARD_VTABLE(little2_) NULL_VTABLE}.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration.  In the non-internal objects the last scalar is 1 only when
   BYTEORDER == 1234. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is included without overriding
   BT_COLON. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is defined as BT_NMSTRT while asciitab.h
   is included, so entries written as BT_COLON there become BT_NMSTRT. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

/* The "internal" little-endian objects are compiled unless BYTEORDER is
   4321.  They use iasciitab.h instead of asciitab.h and always carry 1 as
   the last scalar. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
873 
/* Classification helpers for big-endian UTF-16, where p[0] is the high byte
   and p[1] the low byte of the 16-bit unit at p.  Every use of a macro
   argument is parenthesised so that pointer expressions such as `ptr + 2`
   expand correctly. */

/* Byte type of the unit at p: when the high byte is zero the type is looked
   up in the encoding's per-byte type table (indexed by the low byte),
   otherwise it is computed by unicode_byte_type(high, low).  The encoding is
   only read, so it is accessed through a pointer-to-const. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]  \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and low byte equal to c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Look the unit at p up in the namePages table (high byte, then low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  (UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1]))
/* Look the unit at p up in the nmstrtPages table (high byte, then low
   byte). */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  (UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1]))
884 
/* Two build flavours for the big-endian UTF-16 tokenizer.  With
   XML_MIN_SIZE the BIG2_* classification macros are wrapped in functions and
   VTABLE is redefined around the big2_ converters; otherwise xmltok_impl.c
   is included with PREFIX set to big2_ and the generic classification
   macros bound directly to the BIG2_* ones. */
#ifdef XML_MIN_SIZE

/* Function form of BIG2_BYTE_TYPE. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* Function form of BIG2_BYTE_TO_ASCII; enc is not consulted. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* Function form of BIG2_CHAR_MATCHES; enc is not consulted. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* Function form of BIG2_IS_NAME_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* From here on VTABLE expands to VTABLE1 followed by the big2_
   converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Bind the generic tokenizer macros to their big-endian versions and
   instantiate xmltok_impl.c with the big2_ prefix. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant 0 here; only the MINBPC variants
   consult the naming tables. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the bindings again.
   NOTE(review): IS_INVALID_CHAR is not defined in this section; presumably
   xmltok_impl.c supplies it -- confirm. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
948 
/* Encoding objects for big-endian UTF-16.
   Each initializer has the shape {{VTABLE, 2, 0, flag}, {type table},
   STANDARD_VTABLE(big2_) NULL_VTABLE}.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration.  In the non-internal objects the last scalar is 1 only when
   BYTEORDER == 4321. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is included without overriding
   BT_COLON. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is defined as BT_NMSTRT while asciitab.h
   is included, so entries written as BT_COLON there become BT_NMSTRT. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

/* The "internal" big-endian objects are compiled unless BYTEORDER is 1234.
   They use iasciitab.h instead of asciitab.h and always carry 1 as the last
   scalar. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

/* No further template instantiations follow, so PREFIX is retired. */
#undef PREFIX
1010 
1011 static int FASTCALL
1012 streqci(const char *s1, const char *s2) {
1013   for (;;) {
1014     char c1 = *s1++;
1015     char c2 = *s2++;
1016     if (ASCII_a <= c1 && c1 <= ASCII_z)
1017       c1 += ASCII_A - ASCII_a;
1018     if (ASCII_a <= c2 && c2 <= ASCII_z)
1019       /* The following line will never get executed.  streqci() is
1020        * only called from two places, both of which guarantee to put
1021        * upper-case strings into s2.
1022        */
1023       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1024     if (c1 != c2)
1025       return 0;
1026     if (! c1)
1027       break;
1028   }
1029   return 1;
1030 }
1031 
1032 static void PTRCALL
1033 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1034                    POSITION *pos) {
1035   UNUSED_P(enc);
1036   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1037 }
1038 
1039 static int
1040 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1041   char buf[1];
1042   char *p = buf;
1043   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1044   if (p == buf)
1045     return -1;
1046   else
1047     return buf[0];
1048 }
1049 
1050 static int FASTCALL
1051 isSpace(int c) {
1052   switch (c) {
1053   case 0x20:
1054   case 0xD:
1055   case 0xA:
1056   case 0x9:
1057     return 1;
1058   }
1059   return 0;
1060 }
1061 
1062 /* Return 1 if there's just optional white space or there's an S
1063    followed by name=val.
1064 */
1065 static int
1066 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1067                      const char **namePtr, const char **nameEndPtr,
1068                      const char **valPtr, const char **nextTokPtr) {
1069   int c;
1070   char open;
1071   if (ptr == end) {
1072     *namePtr = NULL;
1073     return 1;
1074   }
1075   if (! isSpace(toAscii(enc, ptr, end))) {
1076     *nextTokPtr = ptr;
1077     return 0;
1078   }
1079   do {
1080     ptr += enc->minBytesPerChar;
1081   } while (isSpace(toAscii(enc, ptr, end)));
1082   if (ptr == end) {
1083     *namePtr = NULL;
1084     return 1;
1085   }
1086   *namePtr = ptr;
1087   for (;;) {
1088     c = toAscii(enc, ptr, end);
1089     if (c == -1) {
1090       *nextTokPtr = ptr;
1091       return 0;
1092     }
1093     if (c == ASCII_EQUALS) {
1094       *nameEndPtr = ptr;
1095       break;
1096     }
1097     if (isSpace(c)) {
1098       *nameEndPtr = ptr;
1099       do {
1100         ptr += enc->minBytesPerChar;
1101       } while (isSpace(c = toAscii(enc, ptr, end)));
1102       if (c != ASCII_EQUALS) {
1103         *nextTokPtr = ptr;
1104         return 0;
1105       }
1106       break;
1107     }
1108     ptr += enc->minBytesPerChar;
1109   }
1110   if (ptr == *namePtr) {
1111     *nextTokPtr = ptr;
1112     return 0;
1113   }
1114   ptr += enc->minBytesPerChar;
1115   c = toAscii(enc, ptr, end);
1116   while (isSpace(c)) {
1117     ptr += enc->minBytesPerChar;
1118     c = toAscii(enc, ptr, end);
1119   }
1120   if (c != ASCII_QUOT && c != ASCII_APOS) {
1121     *nextTokPtr = ptr;
1122     return 0;
1123   }
1124   open = (char)c;
1125   ptr += enc->minBytesPerChar;
1126   *valPtr = ptr;
1127   for (;; ptr += enc->minBytesPerChar) {
1128     c = toAscii(enc, ptr, end);
1129     if (c == open)
1130       break;
1131     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1132         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1133         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1134       *nextTokPtr = ptr;
1135       return 0;
1136     }
1137   }
1138   *nextTokPtr = ptr + enc->minBytesPerChar;
1139   return 1;
1140 }
1141 
/* NUL-terminated keyword strings, spelled out with ASCII_* constants, that
   doParseXmlDecl() compares against via XmlNameMatchesAscii(). */

/* "version" pseudo-attribute name. */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* "encoding" pseudo-attribute name. */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* "standalone" pseudo-attribute name. */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* The two accepted values of "standalone". */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1155 
1156 static int
1157 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1158                                                  const char *),
1159                int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1160                const char *end, const char **badPtr, const char **versionPtr,
1161                const char **versionEndPtr, const char **encodingName,
1162                const ENCODING **encoding, int *standalone) {
1163   const char *val = NULL;
1164   const char *name = NULL;
1165   const char *nameEnd = NULL;
1166   ptr += 5 * enc->minBytesPerChar;
1167   end -= 2 * enc->minBytesPerChar;
1168   if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1169       || ! name) {
1170     *badPtr = ptr;
1171     return 0;
1172   }
1173   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1174     if (! isGeneralTextEntity) {
1175       *badPtr = name;
1176       return 0;
1177     }
1178   } else {
1179     if (versionPtr)
1180       *versionPtr = val;
1181     if (versionEndPtr)
1182       *versionEndPtr = ptr;
1183     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1184       *badPtr = ptr;
1185       return 0;
1186     }
1187     if (! name) {
1188       if (isGeneralTextEntity) {
1189         /* a TextDecl must have an EncodingDecl */
1190         *badPtr = ptr;
1191         return 0;
1192       }
1193       return 1;
1194     }
1195   }
1196   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1197     int c = toAscii(enc, val, end);
1198     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1199       *badPtr = val;
1200       return 0;
1201     }
1202     if (encodingName)
1203       *encodingName = val;
1204     if (encoding)
1205       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1206     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1207       *badPtr = ptr;
1208       return 0;
1209     }
1210     if (! name)
1211       return 1;
1212   }
1213   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1214       || isGeneralTextEntity) {
1215     *badPtr = name;
1216     return 0;
1217   }
1218   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1219     if (standalone)
1220       *standalone = 1;
1221   } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1222     if (standalone)
1223       *standalone = 0;
1224   } else {
1225     *badPtr = val;
1226     return 0;
1227   }
1228   while (isSpace(toAscii(enc, ptr, end)))
1229     ptr += enc->minBytesPerChar;
1230   if (ptr != end) {
1231     *badPtr = ptr;
1232     return 0;
1233   }
1234   return 1;
1235 }
1236 
1237 static int FASTCALL
1238 checkCharRefNumber(int result) {
1239   switch (result >> 8) {
1240   case 0xD8:
1241   case 0xD9:
1242   case 0xDA:
1243   case 0xDB:
1244   case 0xDC:
1245   case 0xDD:
1246   case 0xDE:
1247   case 0xDF:
1248     return -1;
1249   case 0:
1250     if (latin1_encoding.type[result] == BT_NONXML)
1251       return -1;
1252     break;
1253   case 0xFF:
1254     if (result == 0xFFFE || result == 0xFFFF)
1255       return -1;
1256     break;
1257   }
1258   return result;
1259 }
1260 
1261 int FASTCALL
1262 XmlUtf8Encode(int c, char *buf) {
1263   enum {
1264     /* minN is minimum legal resulting value for N byte sequence */
1265     min2 = 0x80,
1266     min3 = 0x800,
1267     min4 = 0x10000
1268   };
1269 
1270   if (c < 0)
1271     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1272   if (c < min2) {
1273     buf[0] = (char)(c | UTF8_cval1);
1274     return 1;
1275   }
1276   if (c < min3) {
1277     buf[0] = (char)((c >> 6) | UTF8_cval2);
1278     buf[1] = (char)((c & 0x3f) | 0x80);
1279     return 2;
1280   }
1281   if (c < min4) {
1282     buf[0] = (char)((c >> 12) | UTF8_cval3);
1283     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1284     buf[2] = (char)((c & 0x3f) | 0x80);
1285     return 3;
1286   }
1287   if (c < 0x110000) {
1288     buf[0] = (char)((c >> 18) | UTF8_cval4);
1289     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1290     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1291     buf[3] = (char)((c & 0x3f) | 0x80);
1292     return 4;
1293   }
1294   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1295 }
1296 
1297 int FASTCALL
1298 XmlUtf16Encode(int charNum, unsigned short *buf) {
1299   if (charNum < 0)
1300     return 0;
1301   if (charNum < 0x10000) {
1302     buf[0] = (unsigned short)charNum;
1303     return 1;
1304   }
1305   if (charNum < 0x110000) {
1306     charNum -= 0x10000;
1307     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1308     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1309     return 2;
1310   }
1311   return 0;
1312 }
1313 
/* State of a user-supplied single/multi-byte encoding, filled in by
   XmlInitUnknownEncoding(). */
struct unknown_encoding {
  /* Starts as a copy of latin1_encoding.  XmlInitUnknownEncoding() hands out
     &normal.enc and AS_UNKNOWN_ENCODING() casts such a pointer back, so this
     member has to stay first. */
  struct normal_encoding normal;
  /* Callback used for bytes that start a multi-byte sequence. */
  CONVERTER convert;
  /* Opaque pointer passed as first argument to convert. */
  void *userData;
  /* Per-byte UTF-16 value; 0 marks a byte that must go through convert. */
  unsigned short utf16[256];
  /* Per-byte UTF-8 form: element [0] is the byte count (0 marks a byte that
     must go through convert), the encoded bytes follow. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1323 
1324 int
1325 XmlSizeOfUnknownEncoding(void) {
1326   return sizeof(struct unknown_encoding);
1327 }
1328 
1329 static int PTRFASTCALL
1330 unknown_isName(const ENCODING *enc, const char *p) {
1331   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1332   int c = uenc->convert(uenc->userData, p);
1333   if (c & ~0xFFFF)
1334     return 0;
1335   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1336 }
1337 
1338 static int PTRFASTCALL
1339 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1340   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1341   int c = uenc->convert(uenc->userData, p);
1342   if (c & ~0xFFFF)
1343     return 0;
1344   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1345 }
1346 
1347 static int PTRFASTCALL
1348 unknown_isInvalid(const ENCODING *enc, const char *p) {
1349   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1350   int c = uenc->convert(uenc->userData, p);
1351   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1352 }
1353 
1354 static enum XML_Convert_Result PTRCALL
1355 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1356                char **toP, const char *toLim) {
1357   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1358   char buf[XML_UTF8_ENCODE_MAX];
1359   for (;;) {
1360     const char *utf8;
1361     int n;
1362     if (*fromP == fromLim)
1363       return XML_CONVERT_COMPLETED;
1364     utf8 = uenc->utf8[(unsigned char)**fromP];
1365     n = *utf8++;
1366     if (n == 0) {
1367       int c = uenc->convert(uenc->userData, *fromP);
1368       n = XmlUtf8Encode(c, buf);
1369       if (n > toLim - *toP)
1370         return XML_CONVERT_OUTPUT_EXHAUSTED;
1371       utf8 = buf;
1372       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1373                  - (BT_LEAD2 - 2));
1374     } else {
1375       if (n > toLim - *toP)
1376         return XML_CONVERT_OUTPUT_EXHAUSTED;
1377       (*fromP)++;
1378     }
1379     memcpy(*toP, utf8, n);
1380     *toP += n;
1381   }
1382 }
1383 
1384 static enum XML_Convert_Result PTRCALL
1385 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1386                 unsigned short **toP, const unsigned short *toLim) {
1387   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1388   while (*fromP < fromLim && *toP < toLim) {
1389     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1390     if (c == 0) {
1391       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1392       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1393                  - (BT_LEAD2 - 2));
1394     } else
1395       (*fromP)++;
1396     *(*toP)++ = c;
1397   }
1398 
1399   if ((*toP == toLim) && (*fromP < fromLim))
1400     return XML_CONVERT_OUTPUT_EXHAUSTED;
1401   else
1402     return XML_CONVERT_COMPLETED;
1403 }
1404 
1405 ENCODING *
1406 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1407                        void *userData) {
1408   int i;
1409   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1410   memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1411   for (i = 0; i < 128; i++)
1412     if (latin1_encoding.type[i] != BT_OTHER
1413         && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1414       return 0;
1415   for (i = 0; i < 256; i++) {
1416     int c = table[i];
1417     if (c == -1) {
1418       e->normal.type[i] = BT_MALFORM;
1419       /* This shouldn't really get used. */
1420       e->utf16[i] = 0xFFFF;
1421       e->utf8[i][0] = 1;
1422       e->utf8[i][1] = 0;
1423     } else if (c < 0) {
1424       if (c < -4)
1425         return 0;
1426       /* Multi-byte sequences need a converter function */
1427       if (! convert)
1428         return 0;
1429       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1430       e->utf8[i][0] = 0;
1431       e->utf16[i] = 0;
1432     } else if (c < 0x80) {
1433       if (latin1_encoding.type[c] != BT_OTHER
1434           && latin1_encoding.type[c] != BT_NONXML && c != i)
1435         return 0;
1436       e->normal.type[i] = latin1_encoding.type[c];
1437       e->utf8[i][0] = 1;
1438       e->utf8[i][1] = (char)c;
1439       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1440     } else if (checkCharRefNumber(c) < 0) {
1441       e->normal.type[i] = BT_NONXML;
1442       /* This shouldn't really get used. */
1443       e->utf16[i] = 0xFFFF;
1444       e->utf8[i][0] = 1;
1445       e->utf8[i][1] = 0;
1446     } else {
1447       if (c > 0xFFFF)
1448         return 0;
1449       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1450         e->normal.type[i] = BT_NMSTRT;
1451       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1452         e->normal.type[i] = BT_NAME;
1453       else
1454         e->normal.type[i] = BT_OTHER;
1455       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1456       e->utf16[i] = (unsigned short)c;
1457     }
1458   }
1459   e->userData = userData;
1460   e->convert = convert;
1461   if (convert) {
1462     e->normal.isName2 = unknown_isName;
1463     e->normal.isName3 = unknown_isName;
1464     e->normal.isName4 = unknown_isName;
1465     e->normal.isNmstrt2 = unknown_isNmstrt;
1466     e->normal.isNmstrt3 = unknown_isNmstrt;
1467     e->normal.isNmstrt4 = unknown_isNmstrt;
1468     e->normal.isInvalid2 = unknown_isInvalid;
1469     e->normal.isInvalid3 = unknown_isInvalid;
1470     e->normal.isInvalid4 = unknown_isInvalid;
1471   }
1472   e->normal.enc.utf8Convert = unknown_toUtf8;
1473   e->normal.enc.utf16Convert = unknown_toUtf16;
1474   return &(e->normal.enc);
1475 }
1476 
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  /* returned by getEncodingIndex() when no built-in name matches */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex() when no name was given (NULL) */
  NO_ENC
};
1490 
/* NUL-terminated, upper-case names of the built-in encodings, spelled out
   with ASCII_* constants.  getEncodingIndex() compares against them with
   streqci(). */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
/* "US-ASCII" */
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
/* "UTF-8" */
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1507 
1508 static int FASTCALL
1509 getEncodingIndex(const char *name) {
1510   static const char *const encodingNames[] = {
1511       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1512   };
1513   int i;
1514   if (name == NULL)
1515     return NO_ENC;
1516   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1517     if (streqci(name, encodingNames[i]))
1518       return i;
1519   return UNKNOWN_ENC;
1520 }
1521 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int.  SET_INIT_ENC_INDEX
   stores index i, narrowed to char; i is parenthesised so that the cast
   applies to the whole argument expression rather than to its first
   operand only.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1528 
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The input is [ptr, end).  Once an encoding has been chosen it is stored
   through enc->encPtr.

   Returns XML_TOK_NONE for empty input, XML_TOK_PARTIAL when more bytes are
   needed to decide, XML_TOK_BOM when a byte order mark was recognised
   (*nextTokPtr then points just past it), and otherwise the result of
   XmlTok() run with the chosen encoding.
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    /* bytes that could start a BOM or a UTF-16 character need a second byte
       before anything can be decided */
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* at least two bytes: inspect them as one big-endian 16-bit value */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* big-endian UTF-16 BOM, unless ISO-8859-1 was given externally for an
         external text entity */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* '<' followed by a zero byte: little-endian UTF-16 without BOM,
         unless big-endian or plain UTF-16 was given externally for an
         external text entity */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* little-endian UTF-16 BOM, with the same ISO-8859-1 exception */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      /* the third BOM byte has not arrived yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* nothing detected: use the encoding selected at initialization */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1647 
/* Instantiate xmltok_ns.c with identity name-mangling macros: NS(x) and
   ns(x) both expand to x unchanged. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns

#ifdef XML_NS

/* Instantiate xmltok_ns.c a second time for the namespace-aware build:
   NS(x) appends "NS" and ns(x) appends "_ns" to the identifier. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1667 
1668 ENCODING *
1669 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1670                          void *userData) {
1671   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1672   if (enc)
1673     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1674   return enc;
1675 }
1676 
1677 #endif /* XML_NS */
1678