xref: /freebsd/contrib/expat/lib/xmltok.c (revision 7be9a3b45356747f9fcb6d69a722c1c95f8060bf)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14    Copyright (c) 2005-2009 Steven Solie <ssolie@users.sourceforge.net>
15    Copyright (c) 2016-2021 Sebastian Pipping <sebastian@pipping.org>
16    Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17    Copyright (c) 2016      Don Lewis <truckman@apache.org>
18    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19    Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20    Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21    Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23    Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
24    Licensed under the MIT license:
25 
26    Permission is  hereby granted,  free of charge,  to any  person obtaining
27    a  copy  of  this  software   and  associated  documentation  files  (the
28    "Software"),  to  deal in  the  Software  without restriction,  including
29    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
30    distribute, sublicense, and/or sell copies of the Software, and to permit
31    persons  to whom  the Software  is  furnished to  do so,  subject to  the
32    following conditions:
33 
34    The above copyright  notice and this permission notice  shall be included
35    in all copies or substantial portions of the Software.
36 
37    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
38    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
39    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
42    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43    USE OR OTHER DEALINGS IN THE SOFTWARE.
44 */
45 
46 #include <expat_config.h>
47 
48 #include <stddef.h>
49 #include <string.h> /* memcpy */
50 #include <stdbool.h>
51 
52 #ifdef _WIN32
53 #  include "winconfig.h"
54 #endif
55 
56 #include "expat_external.h"
57 #include "internal.h"
58 #include "xmltok.h"
59 #include "nametab.h"
60 
/* Optional extra scanner entry.  When XML_DTD is defined this expands to
   ", PREFIX(ignoreSectionTok)", so the first brace group of VTABLE1 gains
   one more element; otherwise it expands to nothing. */
61 #ifdef XML_DTD
62 #  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
63 #else
64 #  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
65 #endif
66 
/* Initializer fragment listing, in order: a first brace group (prologTok,
   contentTok, cdataSectionTok and the optional ignoreSectionTok), a second
   brace group (attributeValueTok, entityValueTok), and then the remaining
   per-encoding helper functions.  PREFIX is expanded at the point of use,
   so the same list serves every encoding family.  The converters are not
   part of VTABLE1; the encoding tables in this file append them
   explicitly after it. */
67 #define VTABLE1                                                                \
68   {PREFIX(prologTok), PREFIX(contentTok),                                      \
69    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
70       {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
71       PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
72       PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
73       PREFIX(updatePosition), PREFIX(isPublicId)
74 
/* VTABLE1 followed by the PREFIX-selected toUtf8 and toUtf16 converters. */
75 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76 
/* Look up the naming flag of the 16-bit character whose high byte is `hi`
   and whose low byte is `lo`.  pages[hi] selects a group of eight bitmap
   words in namingBitmap, the top 3 bits of `lo` pick the word inside that
   group and the low 5 bits of `lo` pick the bit.  Evaluates to non-zero
   when the bit is set.  Every argument is parenthesized so that pointer
   or arithmetic expressions can be passed safely; `lo` is evaluated
   twice and must therefore be free of side effects. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
79 
80 /* A 2 byte UTF-8 representation splits the character's 11 bits between
81    the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
82    pages, 3 bits to add to that index and 5 bits to generate the mask.
   For a 2 byte sequence only 3 bits of the first byte feed the page
   index (the "& 7" below); the remaining page-index bits are zero.
   `byte` must point to unsigned char; it is evaluated several times.
83 */
84 #define UTF8_GET_NAMING2(pages, byte)                                          \
85   (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
86                 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
87    & (1u << (((byte)[1]) & 0x1F)))
88 
89 /* A 3 byte UTF-8 representation splits the character's 16 bits between
90    the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
91    into pages, 3 bits to add to that index and 5 bits to generate the
92    mask.
   `byte` must point to unsigned char; it is evaluated several times.
93 */
94 #define UTF8_GET_NAMING3(pages, byte)                                          \
95   (namingBitmap                                                                \
96        [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
97          << 3)                                                                 \
98         + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
99    & (1u << (((byte)[2]) & 0x1F)))
100 
/* Length-dispatching lookup: n == 2 and n == 3 use the matching macro
   above (after casting p to const unsigned char *); any other n
   evaluates to 0. */
101 #define UTF8_GET_NAMING(pages, p, n)                                           \
102   ((n) == 2                                                                    \
103        ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
104        : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
105 
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to the lead byte of an N-byte sequence and
   evaluates to non-zero when the sequence is invalid.  The pointer must
   have type (const) unsigned char *, otherwise the range comparisons
   misbehave for bytes >= 0x80.  The argument is evaluated several times,
   so it must not have side effects.  Every use of the argument is
   parenthesized (the lead byte is read as (p)[0]) so that pointer
   expressions such as buf + 1 expand correctly.
*/

/* 2-byte sequence: lead byte below 0xC2 (overlong) or a second byte that
   is not a continuation byte. */
#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: bad continuation bytes, EF,BF,BE / EF,BF,BF, an
   overlong form after E0, or a second byte above 0x9F after ED. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: bad continuation bytes, an overlong form after F0, or
   a second byte above 0x8F after F4. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
135 
136 static int PTRFASTCALL
137 isNever(const ENCODING *enc, const char *p) {
138   UNUSED_P(enc);
139   UNUSED_P(p);
140   return 0;
141 }
142 
143 static int PTRFASTCALL
144 utf8_isName2(const ENCODING *enc, const char *p) {
145   UNUSED_P(enc);
146   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
147 }
148 
149 static int PTRFASTCALL
150 utf8_isName3(const ENCODING *enc, const char *p) {
151   UNUSED_P(enc);
152   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
153 }
154 
155 #define utf8_isName4 isNever
156 
157 static int PTRFASTCALL
158 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
159   UNUSED_P(enc);
160   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
161 }
162 
163 static int PTRFASTCALL
164 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
165   UNUSED_P(enc);
166   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
167 }
168 
169 #define utf8_isNmstrt4 isNever
170 
171 static int PTRFASTCALL
172 utf8_isInvalid2(const ENCODING *enc, const char *p) {
173   UNUSED_P(enc);
174   return UTF8_INVALID2((const unsigned char *)p);
175 }
176 
177 static int PTRFASTCALL
178 utf8_isInvalid3(const ENCODING *enc, const char *p) {
179   UNUSED_P(enc);
180   return UTF8_INVALID3((const unsigned char *)p);
181 }
182 
183 static int PTRFASTCALL
184 utf8_isInvalid4(const ENCODING *enc, const char *p) {
185   UNUSED_P(enc);
186   return UTF8_INVALID4((const unsigned char *)p);
187 }
188 
/* An ENCODING extended with a per-byte classification table and a set of
   classifier callbacks.  The ENCODING is the first member, which is what
   lets AS_NORMAL_ENCODING cast an ENCODING pointer to this type. */
189 struct normal_encoding {
190   ENCODING enc;
  /* Byte-type code for every possible byte value; indexed with the byte
     cast to unsigned char (see SB_BYTE_TYPE). */
191   unsigned char type[256];
192 #ifdef XML_MIN_SIZE
  /* Only present in XML_MIN_SIZE builds, where the BYTE_TYPE,
     IS_*_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES macros call through
     these pointers instead of expanding inline. */
193   int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
194   int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
195   int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
196   int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
197   int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
198 #endif /* XML_MIN_SIZE */
  /* Classifiers selected by the IS_NAME_CHAR / IS_NMSTRT_CHAR /
     IS_INVALID_CHAR macros; the numeric suffix is the macro's `n`
     argument.  Tables built with NULL_VTABLE leave all nine as NULL. */
199   int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
200   int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
201   int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
202   int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
203   int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
204   int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
205   int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
206   int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
207   int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
208 };
209 
/* View an ENCODING pointer as the (const) struct normal_encoding that
   contains it. */
210 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
211 
/* Initializers for the XML_MIN_SIZE-only members of struct
   normal_encoding, built by pasting the prefix E onto each member name.
   Note the trailing comma: the expansion is meant to be followed directly
   by NORMAL_VTABLE or NULL_VTABLE.  Expands to nothing in other builds. */
212 #ifdef XML_MIN_SIZE
213 
214 #  define STANDARD_VTABLE(E)                                                   \
215     E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
216 
217 #else
218 
219 #  define STANDARD_VTABLE(E) /* as nothing */
220 
221 #endif
222 
/* Initializers for the nine classifier members, in declaration order,
   using the functions (or aliases) named with prefix E. */
223 #define NORMAL_VTABLE(E)                                                       \
224   E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
225       E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
226 
/* Same nine slots, all set to NULL. */
227 #define NULL_VTABLE                                                            \
228   /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
229       /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
230       /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
231 
232 static int FASTCALL checkCharRefNumber(int);
233 
234 #include "xmltok_impl.h"
235 #include "ascii.h"
236 
/* In XML_MIN_SIZE builds STANDARD_VTABLE(sb_) names sb_isNameMin and
   sb_isNmstrtMin; both are aliases of isNever, i.e. they always
   answer 0. */
237 #ifdef XML_MIN_SIZE
238 #  define sb_isNameMin isNever
239 #  define sb_isNmstrtMin isNever
240 #endif
241 
/* MINBPC: read from the encoding object in XML_MIN_SIZE builds, otherwise
   the compile-time constant 1 for the code instantiated below. */
242 #ifdef XML_MIN_SIZE
243 #  define MINBPC(enc) ((enc)->minBytesPerChar)
244 #else
245 /* minimum bytes per character */
246 #  define MINBPC(enc) 1
247 #endif
248 
/* Byte-type code of the single byte at p, read from the type[] table of
   the normal_encoding that contains enc.  The byte is converted to
   unsigned char before indexing so that bytes >= 0x80 select entries
   128..255.  Goes through AS_NORMAL_ENCODING so that the const
   qualification of enc is preserved instead of being cast away. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])
251 
/* BYTE_TYPE: byte-type code of the character at p.  XML_MIN_SIZE builds
   call through the encoding's byteType pointer; other builds expand
   straight to the table lookup. */
#ifdef XML_MIN_SIZE
/* Table lookup wrapped in a function so it can fill the byteType slot. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  const int byteClass = SB_BYTE_TYPE(enc, p);
  return byteClass;
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
261 
/* BYTE_TO_ASCII: value of the character at p.  XML_MIN_SIZE builds call
   through the encoding's byteToAscii pointer; other builds simply
   dereference p. */
#ifdef XML_MIN_SIZE
/* Returns the byte at p unchanged; enc is unused. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  const char byte = *p;
  UNUSED_P(enc);
  return byte;
}
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
272 
/* Multi-byte character classification: dispatch to the isName<n> /
   isNmstrt<n> callback of the normal_encoding containing enc, where n is
   pasted onto the member name. */
273 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
274 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* Validity check for an n-byte sequence.  The XML_MIN_SIZE form treats a
   NULL isInvalid<n> pointer as "not invalid"; the other form calls the
   pointer unconditionally.
   NOTE(review): tables built with NULL_VTABLE hold NULL here; presumably
   their byte-type tables never produce a multi-byte lead type, so the
   unconditional call is not reached for them -- confirm. */
275 #ifdef XML_MIN_SIZE
276 #  define IS_INVALID_CHAR(enc, p, n)                                           \
277     (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
278      && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
279 #else
280 #  define IS_INVALID_CHAR(enc, p, n)                                           \
281     (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
282 #endif
283 
/* Classification of a minimum-size character: through the isNameMin /
   isNmstrtMin callbacks in XML_MIN_SIZE builds, constant 0 otherwise. */
284 #ifdef XML_MIN_SIZE
285 #  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
286     (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
287 #  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
288     (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
289 #else
290 #  define IS_NAME_CHAR_MINBPC(enc, p) (0)
291 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
292 #endif
293 
/* CHAR_MATCHES: does the character at p equal c?  XML_MIN_SIZE builds
   call through the encoding's charMatches pointer; other builds compare
   the byte directly. */
#ifdef XML_MIN_SIZE
/* Compares the byte at p with c; enc is unused. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  const char byte = *p;
  UNUSED_P(enc);
  return byte == c;
}
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif
306 
307 #define PREFIX(ident) normal_##ident
308 #define XML_TOK_IMPL_C
309 #include "xmltok_impl.c"
310 #undef XML_TOK_IMPL_C
311 
312 #undef MINBPC
313 #undef BYTE_TYPE
314 #undef BYTE_TO_ASCII
315 #undef CHAR_MATCHES
316 #undef IS_NAME_CHAR
317 #undef IS_NAME_CHAR_MINBPC
318 #undef IS_NMSTRT_CHAR
319 #undef IS_NMSTRT_CHAR_MINBPC
320 #undef IS_INVALID_CHAR
321 
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits that are OR-ed into the lead byte. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
328 
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`.  Bytes that are
   neither ASCII nor a recognised lead byte are stepped over and counted.
   An ASCII byte stops the scan with the limit just after it.  A lead byte
   stops the scan when enough bytes were counted to complete its sequence
   (the limit is then placed just after that sequence); otherwise the lead
   byte is dropped as well, the count restarts and the scan goes on.  If
   the scan reaches `from`, the limit ends up equal to `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t seen = 0; /* bytes stepped over since the last restart */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t need; /* sequence length announced by a lead byte, else 0 */

    if ((last & 0x80u) == 0x00u) { /* 1-byte character, 0b0xxxxxxx */
      break;
    }

    if ((last & 0xf8u) == 0xf0u) { /* lead byte 0b11110xxx */
      need = 4;
    } else if ((last & 0xf0u) == 0xe0u) { /* lead byte 0b1110xxxx */
      need = 3;
    } else if ((last & 0xe0u) == 0xc0u) { /* lead byte 0b110xxxxx */
      need = 2;
    } else {
      need = 0;
    }

    if (need != 0) {
      if (seen + 1 >= need) {
        /* Complete character: keep all of it. */
        end += need - 1;
        break;
      }
      /* Truncated character: drop it and start counting afresh. */
      seen = 0;
    }

    end--;
    seen++;
  }

  *fromLimRef = end;
}
367 
368 static enum XML_Convert_Result PTRCALL
369 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
370             char **toP, const char *toLim) {
371   bool input_incomplete = false;
372   bool output_exhausted = false;
373 
374   /* Avoid copying partial characters (due to limited space). */
375   const ptrdiff_t bytesAvailable = fromLim - *fromP;
376   const ptrdiff_t bytesStorable = toLim - *toP;
377   UNUSED_P(enc);
378   if (bytesAvailable > bytesStorable) {
379     fromLim = *fromP + bytesStorable;
380     output_exhausted = true;
381   }
382 
383   /* Avoid copying partial characters (from incomplete input). */
384   {
385     const char *const fromLimBefore = fromLim;
386     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
387     if (fromLim < fromLimBefore) {
388       input_incomplete = true;
389     }
390   }
391 
392   {
393     const ptrdiff_t bytesToCopy = fromLim - *fromP;
394     memcpy(*toP, *fromP, bytesToCopy);
395     *fromP += bytesToCopy;
396     *toP += bytesToCopy;
397   }
398 
399   if (output_exhausted) /* needs to go first */
400     return XML_CONVERT_OUTPUT_EXHAUSTED;
401   else if (input_incomplete)
402     return XML_CONVERT_INPUT_INCOMPLETE;
403   else
404     return XML_CONVERT_COMPLETED;
405 }
406 
407 static enum XML_Convert_Result PTRCALL
408 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
409              unsigned short **toP, const unsigned short *toLim) {
410   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
411   unsigned short *to = *toP;
412   const char *from = *fromP;
413   while (from < fromLim && to < toLim) {
414     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
415     case BT_LEAD2:
416       if (fromLim - from < 2) {
417         res = XML_CONVERT_INPUT_INCOMPLETE;
418         goto after;
419       }
420       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
421       from += 2;
422       break;
423     case BT_LEAD3:
424       if (fromLim - from < 3) {
425         res = XML_CONVERT_INPUT_INCOMPLETE;
426         goto after;
427       }
428       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
429                                | (from[2] & 0x3f));
430       from += 3;
431       break;
432     case BT_LEAD4: {
433       unsigned long n;
434       if (toLim - to < 2) {
435         res = XML_CONVERT_OUTPUT_EXHAUSTED;
436         goto after;
437       }
438       if (fromLim - from < 4) {
439         res = XML_CONVERT_INPUT_INCOMPLETE;
440         goto after;
441       }
442       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
443           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
444       n -= 0x10000;
445       to[0] = (unsigned short)((n >> 10) | 0xD800);
446       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
447       to += 2;
448       from += 4;
449     } break;
450     default:
451       *to++ = *from++;
452       break;
453     }
454   }
455   if (from < fromLim)
456     res = XML_CONVERT_OUTPUT_EXHAUSTED;
457 after:
458   *fromP = from;
459   *toP = to;
460   return res;
461 }
462 
/* UTF-8 encoding table, variant compiled only when XML_NS is defined.
   The ENCODING part is VTABLE1 plus the two UTF-8 converters and three
   trailing integer fields (NOTE(review): presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the ENCODING declaration).  The
   byte-type table is assembled from asciitab.h followed by utf8tab.h,
   and the classifier slots are filled with the utf8_ functions. */
463 #ifdef XML_NS
464 static const struct normal_encoding utf8_encoding_ns
465     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
466        {
467 #  include "asciitab.h"
468 #  include "utf8tab.h"
469        },
470        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
471 #endif
472 
/* Same table, except that BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so any entry spelled BT_COLON in that header
   becomes BT_NMSTRT here. */
473 static const struct normal_encoding utf8_encoding
474     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
475        {
476 #define BT_COLON BT_NMSTRT
477 #include "asciitab.h"
478 #undef BT_COLON
479 #include "utf8tab.h"
480        },
481        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
482 
/* "Internal" UTF-8 encoding table, variant compiled only when XML_NS is
   defined.  Identical in shape to utf8_encoding_ns, but the first half of
   the byte-type table comes from iasciitab.h instead of asciitab.h.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   here -- see that header. */
483 #ifdef XML_NS
484 
485 static const struct normal_encoding internal_utf8_encoding_ns
486     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
487        {
488 #  include "iasciitab.h"
489 #  include "utf8tab.h"
490        },
491        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
492 
493 #endif
494 
/* Same table, with BT_COLON defined as BT_NMSTRT while iasciitab.h is
   included. */
495 static const struct normal_encoding internal_utf8_encoding
496     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
497        {
498 #define BT_COLON BT_NMSTRT
499 #include "iasciitab.h"
500 #undef BT_COLON
501 #include "utf8tab.h"
502        },
503        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
504 
505 static enum XML_Convert_Result PTRCALL
506 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
507               char **toP, const char *toLim) {
508   UNUSED_P(enc);
509   for (;;) {
510     unsigned char c;
511     if (*fromP == fromLim)
512       return XML_CONVERT_COMPLETED;
513     c = (unsigned char)**fromP;
514     if (c & 0x80) {
515       if (toLim - *toP < 2)
516         return XML_CONVERT_OUTPUT_EXHAUSTED;
517       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
518       *(*toP)++ = (char)((c & 0x3f) | 0x80);
519       (*fromP)++;
520     } else {
521       if (*toP == toLim)
522         return XML_CONVERT_OUTPUT_EXHAUSTED;
523       *(*toP)++ = *(*fromP)++;
524     }
525   }
526 }
527 
528 static enum XML_Convert_Result PTRCALL
529 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
530                unsigned short **toP, const unsigned short *toLim) {
531   UNUSED_P(enc);
532   while (*fromP < fromLim && *toP < toLim)
533     *(*toP)++ = (unsigned char)*(*fromP)++;
534 
535   if ((*toP == toLim) && (*fromP < fromLim))
536     return XML_CONVERT_OUTPUT_EXHAUSTED;
537   else
538     return XML_CONVERT_COMPLETED;
539 }
540 
/* Latin-1 encoding table, variant compiled only when XML_NS is defined.
   The ENCODING part is VTABLE1 plus the Latin-1 converters and three
   trailing integer fields (NOTE(review): presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the ENCODING declaration).  The
   byte-type table is asciitab.h followed by latin1tab.h; all classifier
   slots are NULL (NULL_VTABLE). */
541 #ifdef XML_NS
542 
543 static const struct normal_encoding latin1_encoding_ns
544     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
545        {
546 #  include "asciitab.h"
547 #  include "latin1tab.h"
548        },
549        STANDARD_VTABLE(sb_) NULL_VTABLE};
550 
551 #endif
552 
/* Same table, with BT_COLON defined as BT_NMSTRT while asciitab.h is
   included. */
553 static const struct normal_encoding latin1_encoding
554     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
555        {
556 #define BT_COLON BT_NMSTRT
557 #include "asciitab.h"
558 #undef BT_COLON
559 #include "latin1tab.h"
560        },
561        STANDARD_VTABLE(sb_) NULL_VTABLE};
562 
563 static enum XML_Convert_Result PTRCALL
564 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
565              char **toP, const char *toLim) {
566   UNUSED_P(enc);
567   while (*fromP < fromLim && *toP < toLim)
568     *(*toP)++ = *(*fromP)++;
569 
570   if ((*toP == toLim) && (*fromP < fromLim))
571     return XML_CONVERT_OUTPUT_EXHAUSTED;
572   else
573     return XML_CONVERT_COMPLETED;
574 }
575 
/* US-ASCII encoding table, variant compiled only when XML_NS is defined.
   It uses ascii_toUtf8 for UTF-8 output and reuses latin1_toUtf16 for
   UTF-16 output.  Only asciitab.h is included, so the remaining entries
   of the 256-entry type table are zero-initialized, which is BT_NONXML
   as the comment inside the initializer notes.  All classifier slots
   are NULL (NULL_VTABLE). */
576 #ifdef XML_NS
577 
578 static const struct normal_encoding ascii_encoding_ns
579     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
580        {
581 #  include "asciitab.h"
582            /* BT_NONXML == 0 */
583        },
584        STANDARD_VTABLE(sb_) NULL_VTABLE};
585 
586 #endif
587 
/* Same table, with BT_COLON defined as BT_NMSTRT while asciitab.h is
   included. */
588 static const struct normal_encoding ascii_encoding
589     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
590        {
591 #define BT_COLON BT_NMSTRT
592 #include "asciitab.h"
593 #undef BT_COLON
594            /* BT_NONXML == 0 */
595        },
596        STANDARD_VTABLE(sb_) NULL_VTABLE};
597 
598 static int PTRFASTCALL
599 unicode_byte_type(char hi, char lo) {
600   switch ((unsigned char)hi) {
601   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
602   case 0xD8:
603   case 0xD9:
604   case 0xDA:
605   case 0xDB:
606     return BT_LEAD4;
607   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
608   case 0xDC:
609   case 0xDD:
610   case 0xDE:
611   case 0xDF:
612     return BT_TRAIL;
613   case 0xFF:
614     switch ((unsigned char)lo) {
615     case 0xFF: /* noncharacter-FFFF */
616     case 0xFE: /* noncharacter-FFFE */
617       return BT_NONXML;
618     }
619     break;
620   }
621   return BT_NONASCII;
622 }
623 
/* Generates the function E##toUtf8, which converts UTF-16 input to UTF-8.
   GET_LO and GET_HI must be defined at the point of expansion; they
   select the byte order.

   The generated function first rounds fromLim down so that an even
   number of input bytes is processed, then handles one 16-bit unit per
   iteration, switching on its high byte:
     - hi == 0 with lo < 0x80: one output byte;
     - hi == 0 with lo >= 0x80 (falls through) and hi 0x01..0x07: two
       output bytes;
     - hi 0xD8..0xDB (high surrogate): four output bytes built from this
       unit and the next one; the extra "from += 2" in that case plus the
       loop increment consume both units;
     - any other hi: three output bytes.
   The surrogate cases are written after `default`; this does not affect
   which case a value selects.

   Before anything is written for a character the remaining output space
   is checked; if it is too small *fromP is set to the start of that
   character and XML_CONVERT_OUTPUT_EXHAUSTED is returned.  For a high
   surrogate the output check comes first, then fewer than 4 remaining
   input bytes give XML_CONVERT_INPUT_INCOMPLETE.  *toP is advanced as
   bytes are written.

   NOTE(review): because fromLim is rounded down before the loop and the
   loop only exits once from is no longer below fromLim, the final
   "from < fromLim" test appears never to be true, so a trailing odd
   byte ends with XML_CONVERT_COMPLETED -- confirm this is intended. */
624 #define DEFINE_UTF16_TO_UTF8(E)                                                \
625   static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
626       const ENCODING *enc, const char **fromP, const char *fromLim,            \
627       char **toP, const char *toLim) {                                         \
628     const char *from = *fromP;                                                 \
629     UNUSED_P(enc);                                                             \
630     fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
631     for (; from < fromLim; from += 2) {                                        \
632       int plane;                                                               \
633       unsigned char lo2;                                                       \
634       unsigned char lo = GET_LO(from);                                         \
635       unsigned char hi = GET_HI(from);                                         \
636       switch (hi) {                                                            \
637       case 0:                                                                  \
638         if (lo < 0x80) {                                                       \
639           if (*toP == toLim) {                                                 \
640             *fromP = from;                                                     \
641             return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
642           }                                                                    \
643           *(*toP)++ = lo;                                                      \
644           break;                                                               \
645         }                                                                      \
646         /* fall through */                                                     \
647       case 0x1:                                                                \
648       case 0x2:                                                                \
649       case 0x3:                                                                \
650       case 0x4:                                                                \
651       case 0x5:                                                                \
652       case 0x6:                                                                \
653       case 0x7:                                                                \
654         if (toLim - *toP < 2) {                                                \
655           *fromP = from;                                                       \
656           return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
657         }                                                                      \
658         *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
659         *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
660         break;                                                                 \
661       default:                                                                 \
662         if (toLim - *toP < 3) {                                                \
663           *fromP = from;                                                       \
664           return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
665         }                                                                      \
666         /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
667         *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
668         *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
669         *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
670         break;                                                                 \
671       case 0xD8:                                                               \
672       case 0xD9:                                                               \
673       case 0xDA:                                                               \
674       case 0xDB:                                                               \
675         if (toLim - *toP < 4) {                                                \
676           *fromP = from;                                                       \
677           return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
678         }                                                                      \
679         if (fromLim - from < 4) {                                              \
680           *fromP = from;                                                       \
681           return XML_CONVERT_INPUT_INCOMPLETE;                                 \
682         }                                                                      \
683         plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
684         *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
685         *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
686         from += 2;                                                             \
687         lo2 = GET_LO(from);                                                    \
688         *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
689                      | (lo2 >> 6) | 0x80);                                     \
690         *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
691         break;                                                                 \
692       }                                                                        \
693     }                                                                          \
694     *fromP = from;                                                             \
695     if (from < fromLim)                                                        \
696       return XML_CONVERT_INPUT_INCOMPLETE;                                     \
697     else                                                                       \
698       return XML_CONVERT_COMPLETED;                                            \
699   }
700 
/* Generates the function E##toUtf16, which copies UTF-16 input held as
   bytes into an array of unsigned short code units.  GET_LO and GET_HI
   must be defined at the point of expansion; they select the byte order.

   The generated function rounds fromLim down to an even number of input
   bytes.  If the input holds more units than the output has room for and
   the final input unit has a high byte in 0xD8..0xDF (the "& 0xF8" test),
   that unit is excluded and the result defaults to
   XML_CONVERT_INPUT_INCOMPLETE.  Units are then assembled as
   (hi << 8) | lo until input or output runs out, advancing *fromP by two
   bytes and *toP by one unit each time.  Returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the output is exactly full while
   input remains, otherwise the default result. */
701 #define DEFINE_UTF16_TO_UTF16(E)                                               \
702   static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
703       const ENCODING *enc, const char **fromP, const char *fromLim,            \
704       unsigned short **toP, const unsigned short *toLim) {                     \
705     enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
706     UNUSED_P(enc);                                                             \
707     fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
708     /* Avoid copying first half only of surrogate */                           \
709     if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
710         && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
711       fromLim -= 2;                                                            \
712       res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
713     }                                                                          \
714     for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
715       *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
716     if ((*toP == toLim) && (*fromP < fromLim))                                 \
717       return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
718     else                                                                       \
719       return res;                                                              \
720   }
721 
/* Little-endian byte access: the low byte of a 16-bit unit is at ptr[0]
   and the high byte at ptr[1].  SET2 stores the two bytes of ch in that
   order; GET_LO / GET_HI read them back as unsigned char. */
722 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
723 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
724 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
725 
/* Instantiate little2_toUtf8 and little2_toUtf16 with the accessors
   above. */
726 DEFINE_UTF16_TO_UTF8(little2_)
727 DEFINE_UTF16_TO_UTF16(little2_)
728 
729 #undef SET2
730 #undef GET_LO
731 #undef GET_HI
732 
/* Big-endian byte access: the high byte is at ptr[0] and the low byte at
   ptr[1]. */
733 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
734 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
735 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
736 
/* Instantiate big2_toUtf8 and big2_toUtf16 with the accessors above. */
737 DEFINE_UTF16_TO_UTF8(big2_)
738 DEFINE_UTF16_TO_UTF16(big2_)
739 
740 #undef SET2
741 #undef GET_LO
742 #undef GET_HI
743 
/* Helpers for little-endian 2-byte characters: p[0] is the low byte and
   p[1] the high byte.  All macro arguments are parenthesized so that
   expressions such as `ptr + 2` or `flag ? 'a' : 'b'` expand correctly;
   p is evaluated more than once and must be free of side effects. */

/* Byte-type code: characters whose high byte is zero use the encoding's
   type[] table (reached through AS_NORMAL_ENCODING, so constness of enc
   is kept); all others are classified by unicode_byte_type. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)]            \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups with the high byte as page index. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
753 
/* Little-endian 2-byte scanner support.
   With XML_MIN_SIZE defined, the LITTLE2_* accessor macros are wrapped in
   static functions and VTABLE is redefined as VTABLE1 followed by the two
   little2 converters.  Without it, the generic accessor macros are mapped
   onto the LITTLE2_* macros and xmltok_impl.c is included with PREFIX
   generating little2_-prefixed identifiers. */
754 #ifdef XML_MIN_SIZE
755 
/* Function form of LITTLE2_BYTE_TYPE. */
756 static int PTRFASTCALL
757 little2_byteType(const ENCODING *enc, const char *p) {
758   return LITTLE2_BYTE_TYPE(enc, p);
759 }
760 
/* Function form of LITTLE2_BYTE_TO_ASCII; enc is not used. */
761 static int PTRFASTCALL
762 little2_byteToAscii(const ENCODING *enc, const char *p) {
763   UNUSED_P(enc);
764   return LITTLE2_BYTE_TO_ASCII(p);
765 }
766 
/* Function form of LITTLE2_CHAR_MATCHES; enc is not used. */
767 static int PTRCALL
768 little2_charMatches(const ENCODING *enc, const char *p, int c) {
769   UNUSED_P(enc);
770   return LITTLE2_CHAR_MATCHES(p, c);
771 }
772 
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC; enc is not used. */
773 static int PTRFASTCALL
774 little2_isNameMin(const ENCODING *enc, const char *p) {
775   UNUSED_P(enc);
776   return LITTLE2_IS_NAME_CHAR_MINBPC(p);
777 }
778 
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; enc is not used. */
779 static int PTRFASTCALL
780 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
781   UNUSED_P(enc);
782   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
783 }
784 
785 #  undef VTABLE
786 #  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
787 
788 #else /* not XML_MIN_SIZE */
789 
790 #  undef PREFIX
791 #  define PREFIX(ident) little2_##ident
792 #  define MINBPC(enc) 2
793 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
794 #  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
795 #  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
796 #  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The n-byte name checks are constant 0 here; only the MINBPC variants do a
   lookup.  NOTE(review): presumably because this encoding has no multi-byte
   lead sequences -- confirm against xmltok_impl.c. */
797 #  define IS_NAME_CHAR(enc, p, n) 0
798 #  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
799 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
800 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
801 
802 #  define XML_TOK_IMPL_C
803 #  include "xmltok_impl.c"
804 #  undef XML_TOK_IMPL_C
805 
806 #  undef MINBPC
807 #  undef BYTE_TYPE
808 #  undef BYTE_TO_ASCII
809 #  undef CHAR_MATCHES
810 #  undef IS_NAME_CHAR
811 #  undef IS_NAME_CHAR_MINBPC
812 #  undef IS_NMSTRT_CHAR
813 #  undef IS_NMSTRT_CHAR_MINBPC
/* IS_INVALID_CHAR is not defined in this section; the #undef is harmless if
   it is not defined at this point. */
814 #  undef IS_INVALID_CHAR
815 
816 #endif /* not XML_MIN_SIZE */
817 
/* Little-endian 2-byte encoding object, only built when XML_NS is defined.
   The byte-type table is assembled from asciitab.h and latin1tab.h with
   BT_COLON left as it is (compare little2_encoding, which remaps it).
   The last scalar of the first sub-initializer is 1 only when
   BYTEORDER == 1234.
   NOTE(review): the scalars "2, 0, flag" are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the ENCODING struct. */
818 #ifdef XML_NS
819 
820 static const struct normal_encoding little2_encoding_ns
821     = {{VTABLE, 2, 0,
822 #  if BYTEORDER == 1234
823         1
824 #  else
825         0
826 #  endif
827        },
828        {
829 #  include "asciitab.h"
830 #  include "latin1tab.h"
831        },
832        STANDARD_VTABLE(little2_) NULL_VTABLE};
833 
834 #endif
835 
/* Little-endian 2-byte encoding object, always built.
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so any
   table entry spelled BT_COLON in that header becomes BT_NMSTRT here.
   The last scalar of the first sub-initializer is 1 only when
   BYTEORDER == 1234. */
836 static const struct normal_encoding little2_encoding
837     = {{VTABLE, 2, 0,
838 #if BYTEORDER == 1234
839         1
840 #else
841         0
842 #endif
843        },
844        {
845 #define BT_COLON BT_NMSTRT
846 #include "asciitab.h"
847 #undef BT_COLON
848 #include "latin1tab.h"
849        },
850        STANDARD_VTABLE(little2_) NULL_VTABLE};
851 
/* "Internal" little-endian encoding objects, compiled unless BYTEORDER is
   4321.  They differ from little2_encoding[_ns] in two visible ways: the
   ASCII part of the type table comes from iasciitab.h instead of
   asciitab.h, and the last scalar of the first sub-initializer is always 1.
   The _ns variant exists only with XML_NS and keeps BT_COLON; the plain
   variant maps BT_COLON to BT_NMSTRT while iasciitab.h is included. */
852 #if BYTEORDER != 4321
853 
854 #  ifdef XML_NS
855 
856 static const struct normal_encoding internal_little2_encoding_ns
857     = {{VTABLE, 2, 0, 1},
858        {
859 #    include "iasciitab.h"
860 #    include "latin1tab.h"
861        },
862        STANDARD_VTABLE(little2_) NULL_VTABLE};
863 
864 #  endif
865 
866 static const struct normal_encoding internal_little2_encoding
867     = {{VTABLE, 2, 0, 1},
868        {
869 #  define BT_COLON BT_NMSTRT
870 #  include "iasciitab.h"
871 #  undef BT_COLON
872 #  include "latin1tab.h"
873        },
874        STANDARD_VTABLE(little2_) NULL_VTABLE};
875 
876 #endif
877 
/* Accessors for a big-endian 2-byte code unit at p: p[0] is the high byte
   and p[1] the low byte.

   BIG2_BYTE_TYPE          byte type: looked up in the encoding's type[]
                           table when the high byte is 0, otherwise
                           delegated to unicode_byte_type(hi, lo).
   BIG2_BYTE_TO_ASCII      the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES       non-zero when the unit equals the character c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC
                           naming-table lookups for (hi, lo).

   Every macro parameter is parenthesized so that compound arguments such as
   `cond ? 'a' : 'b'` or `buf + 2` expand with the intended precedence. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
888 
/* Big-endian 2-byte scanner support.
   With XML_MIN_SIZE defined, the BIG2_* accessor macros are wrapped in
   static functions and VTABLE is redefined as VTABLE1 followed by the two
   big2 converters.  Without it, the generic accessor macros are mapped onto
   the BIG2_* macros and xmltok_impl.c is included with PREFIX generating
   big2_-prefixed identifiers. */
889 #ifdef XML_MIN_SIZE
890 
/* Function form of BIG2_BYTE_TYPE. */
891 static int PTRFASTCALL
892 big2_byteType(const ENCODING *enc, const char *p) {
893   return BIG2_BYTE_TYPE(enc, p);
894 }
895 
/* Function form of BIG2_BYTE_TO_ASCII; enc is not used. */
896 static int PTRFASTCALL
897 big2_byteToAscii(const ENCODING *enc, const char *p) {
898   UNUSED_P(enc);
899   return BIG2_BYTE_TO_ASCII(p);
900 }
901 
/* Function form of BIG2_CHAR_MATCHES; enc is not used. */
902 static int PTRCALL
903 big2_charMatches(const ENCODING *enc, const char *p, int c) {
904   UNUSED_P(enc);
905   return BIG2_CHAR_MATCHES(p, c);
906 }
907 
/* Function form of BIG2_IS_NAME_CHAR_MINBPC; enc is not used. */
908 static int PTRFASTCALL
909 big2_isNameMin(const ENCODING *enc, const char *p) {
910   UNUSED_P(enc);
911   return BIG2_IS_NAME_CHAR_MINBPC(p);
912 }
913 
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC; enc is not used. */
914 static int PTRFASTCALL
915 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
916   UNUSED_P(enc);
917   return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
918 }
919 
920 #  undef VTABLE
921 #  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
922 
923 #else /* not XML_MIN_SIZE */
924 
925 #  undef PREFIX
926 #  define PREFIX(ident) big2_##ident
927 #  define MINBPC(enc) 2
928 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
929 #  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
930 #  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
931 #  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name checks are constant 0 here; only the MINBPC variants do a
   lookup.  NOTE(review): presumably because this encoding has no multi-byte
   lead sequences -- confirm against xmltok_impl.c. */
932 #  define IS_NAME_CHAR(enc, p, n) 0
933 #  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
934 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
935 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
936 
937 #  define XML_TOK_IMPL_C
938 #  include "xmltok_impl.c"
939 #  undef XML_TOK_IMPL_C
940 
941 #  undef MINBPC
942 #  undef BYTE_TYPE
943 #  undef BYTE_TO_ASCII
944 #  undef CHAR_MATCHES
945 #  undef IS_NAME_CHAR
946 #  undef IS_NAME_CHAR_MINBPC
947 #  undef IS_NMSTRT_CHAR
948 #  undef IS_NMSTRT_CHAR_MINBPC
/* IS_INVALID_CHAR is not defined in this section; the #undef is harmless if
   it is not defined at this point. */
949 #  undef IS_INVALID_CHAR
950 
951 #endif /* not XML_MIN_SIZE */
952 
/* Big-endian 2-byte encoding object, only built when XML_NS is defined.
   The byte-type table is assembled from asciitab.h and latin1tab.h with
   BT_COLON left as it is (compare big2_encoding, which remaps it).
   The last scalar of the first sub-initializer is 1 only when
   BYTEORDER == 4321.
   NOTE(review): the scalars "2, 0, flag" are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the ENCODING struct. */
953 #ifdef XML_NS
954 
955 static const struct normal_encoding big2_encoding_ns
956     = {{VTABLE, 2, 0,
957 #  if BYTEORDER == 4321
958         1
959 #  else
960         0
961 #  endif
962        },
963        {
964 #  include "asciitab.h"
965 #  include "latin1tab.h"
966        },
967        STANDARD_VTABLE(big2_) NULL_VTABLE};
968 
969 #endif
970 
/* Big-endian 2-byte encoding object, always built.
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so any
   table entry spelled BT_COLON in that header becomes BT_NMSTRT here.
   The last scalar of the first sub-initializer is 1 only when
   BYTEORDER == 4321. */
971 static const struct normal_encoding big2_encoding
972     = {{VTABLE, 2, 0,
973 #if BYTEORDER == 4321
974         1
975 #else
976         0
977 #endif
978        },
979        {
980 #define BT_COLON BT_NMSTRT
981 #include "asciitab.h"
982 #undef BT_COLON
983 #include "latin1tab.h"
984        },
985        STANDARD_VTABLE(big2_) NULL_VTABLE};
986 
/* "Internal" big-endian encoding objects, compiled unless BYTEORDER is
   1234.  They differ from big2_encoding[_ns] in two visible ways: the ASCII
   part of the type table comes from iasciitab.h instead of asciitab.h, and
   the last scalar of the first sub-initializer is always 1.
   The _ns variant exists only with XML_NS and keeps BT_COLON; the plain
   variant maps BT_COLON to BT_NMSTRT while iasciitab.h is included. */
987 #if BYTEORDER != 1234
988 
989 #  ifdef XML_NS
990 
991 static const struct normal_encoding internal_big2_encoding_ns
992     = {{VTABLE, 2, 0, 1},
993        {
994 #    include "iasciitab.h"
995 #    include "latin1tab.h"
996        },
997        STANDARD_VTABLE(big2_) NULL_VTABLE};
998 
999 #  endif
1000 
1001 static const struct normal_encoding internal_big2_encoding
1002     = {{VTABLE, 2, 0, 1},
1003        {
1004 #  define BT_COLON BT_NMSTRT
1005 #  include "iasciitab.h"
1006 #  undef BT_COLON
1007 #  include "latin1tab.h"
1008        },
1009        STANDARD_VTABLE(big2_) NULL_VTABLE};
1010 
1011 #endif
1012 
1013 #undef PREFIX
1014 
1015 static int FASTCALL
1016 streqci(const char *s1, const char *s2) {
1017   for (;;) {
1018     char c1 = *s1++;
1019     char c2 = *s2++;
1020     if (ASCII_a <= c1 && c1 <= ASCII_z)
1021       c1 += ASCII_A - ASCII_a;
1022     if (ASCII_a <= c2 && c2 <= ASCII_z)
1023       /* The following line will never get executed.  streqci() is
1024        * only called from two places, both of which guarantee to put
1025        * upper-case strings into s2.
1026        */
1027       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1028     if (c1 != c2)
1029       return 0;
1030     if (! c1)
1031       break;
1032   }
1033   return 1;
1034 }
1035 
1036 static void PTRCALL
1037 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1038                    POSITION *pos) {
1039   UNUSED_P(enc);
1040   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1041 }
1042 
1043 static int
1044 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1045   char buf[1];
1046   char *p = buf;
1047   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1048   if (p == buf)
1049     return -1;
1050   else
1051     return buf[0];
1052 }
1053 
1054 static int FASTCALL
1055 isSpace(int c) {
1056   switch (c) {
1057   case 0x20:
1058   case 0xD:
1059   case 0xA:
1060   case 0x9:
1061     return 1;
1062   }
1063   return 0;
1064 }
1065 
1066 /* Return 1 if there's just optional white space or there's an S
1067    followed by name=val.
1068 */
1069 static int
1070 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1071                      const char **namePtr, const char **nameEndPtr,
1072                      const char **valPtr, const char **nextTokPtr) {
1073   int c;
1074   char open;
1075   if (ptr == end) {
1076     *namePtr = NULL;
1077     return 1;
1078   }
1079   if (! isSpace(toAscii(enc, ptr, end))) {
1080     *nextTokPtr = ptr;
1081     return 0;
1082   }
1083   do {
1084     ptr += enc->minBytesPerChar;
1085   } while (isSpace(toAscii(enc, ptr, end)));
1086   if (ptr == end) {
1087     *namePtr = NULL;
1088     return 1;
1089   }
1090   *namePtr = ptr;
1091   for (;;) {
1092     c = toAscii(enc, ptr, end);
1093     if (c == -1) {
1094       *nextTokPtr = ptr;
1095       return 0;
1096     }
1097     if (c == ASCII_EQUALS) {
1098       *nameEndPtr = ptr;
1099       break;
1100     }
1101     if (isSpace(c)) {
1102       *nameEndPtr = ptr;
1103       do {
1104         ptr += enc->minBytesPerChar;
1105       } while (isSpace(c = toAscii(enc, ptr, end)));
1106       if (c != ASCII_EQUALS) {
1107         *nextTokPtr = ptr;
1108         return 0;
1109       }
1110       break;
1111     }
1112     ptr += enc->minBytesPerChar;
1113   }
1114   if (ptr == *namePtr) {
1115     *nextTokPtr = ptr;
1116     return 0;
1117   }
1118   ptr += enc->minBytesPerChar;
1119   c = toAscii(enc, ptr, end);
1120   while (isSpace(c)) {
1121     ptr += enc->minBytesPerChar;
1122     c = toAscii(enc, ptr, end);
1123   }
1124   if (c != ASCII_QUOT && c != ASCII_APOS) {
1125     *nextTokPtr = ptr;
1126     return 0;
1127   }
1128   open = (char)c;
1129   ptr += enc->minBytesPerChar;
1130   *valPtr = ptr;
1131   for (;; ptr += enc->minBytesPerChar) {
1132     c = toAscii(enc, ptr, end);
1133     if (c == open)
1134       break;
1135     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1136         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1137         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1138       *nextTokPtr = ptr;
1139       return 0;
1140     }
1141   }
1142   *nextTokPtr = ptr + enc->minBytesPerChar;
1143   return 1;
1144 }
1145 
/* NUL-terminated keyword strings, spelled with the ASCII_* character
   constants.  doParseXmlDecl() passes them to XmlNameMatchesAscii() to
   recognise the pseudo-attribute names "version", "encoding" and
   "standalone" and the standalone values "yes" and "no". */
1146 static const char KW_version[]
1147     = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1148 
1149 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1150                                    ASCII_i, ASCII_n, ASCII_g, '\0'};
1151 
1152 static const char KW_standalone[]
1153     = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1154        ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1155 
1156 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1157 
1158 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1159 
1160 static int
1161 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1162                                                  const char *),
1163                int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1164                const char *end, const char **badPtr, const char **versionPtr,
1165                const char **versionEndPtr, const char **encodingName,
1166                const ENCODING **encoding, int *standalone) {
1167   const char *val = NULL;
1168   const char *name = NULL;
1169   const char *nameEnd = NULL;
1170   ptr += 5 * enc->minBytesPerChar;
1171   end -= 2 * enc->minBytesPerChar;
1172   if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1173       || ! name) {
1174     *badPtr = ptr;
1175     return 0;
1176   }
1177   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1178     if (! isGeneralTextEntity) {
1179       *badPtr = name;
1180       return 0;
1181     }
1182   } else {
1183     if (versionPtr)
1184       *versionPtr = val;
1185     if (versionEndPtr)
1186       *versionEndPtr = ptr;
1187     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1188       *badPtr = ptr;
1189       return 0;
1190     }
1191     if (! name) {
1192       if (isGeneralTextEntity) {
1193         /* a TextDecl must have an EncodingDecl */
1194         *badPtr = ptr;
1195         return 0;
1196       }
1197       return 1;
1198     }
1199   }
1200   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1201     int c = toAscii(enc, val, end);
1202     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1203       *badPtr = val;
1204       return 0;
1205     }
1206     if (encodingName)
1207       *encodingName = val;
1208     if (encoding)
1209       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1210     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1211       *badPtr = ptr;
1212       return 0;
1213     }
1214     if (! name)
1215       return 1;
1216   }
1217   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1218       || isGeneralTextEntity) {
1219     *badPtr = name;
1220     return 0;
1221   }
1222   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1223     if (standalone)
1224       *standalone = 1;
1225   } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1226     if (standalone)
1227       *standalone = 0;
1228   } else {
1229     *badPtr = val;
1230     return 0;
1231   }
1232   while (isSpace(toAscii(enc, ptr, end)))
1233     ptr += enc->minBytesPerChar;
1234   if (ptr != end) {
1235     *badPtr = ptr;
1236     return 0;
1237   }
1238   return 1;
1239 }
1240 
1241 static int FASTCALL
1242 checkCharRefNumber(int result) {
1243   switch (result >> 8) {
1244   case 0xD8:
1245   case 0xD9:
1246   case 0xDA:
1247   case 0xDB:
1248   case 0xDC:
1249   case 0xDD:
1250   case 0xDE:
1251   case 0xDF:
1252     return -1;
1253   case 0:
1254     if (latin1_encoding.type[result] == BT_NONXML)
1255       return -1;
1256     break;
1257   case 0xFF:
1258     if (result == 0xFFFE || result == 0xFFFF)
1259       return -1;
1260     break;
1261   }
1262   return result;
1263 }
1264 
1265 int FASTCALL
1266 XmlUtf8Encode(int c, char *buf) {
1267   enum {
1268     /* minN is minimum legal resulting value for N byte sequence */
1269     min2 = 0x80,
1270     min3 = 0x800,
1271     min4 = 0x10000
1272   };
1273 
1274   if (c < 0)
1275     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1276   if (c < min2) {
1277     buf[0] = (char)(c | UTF8_cval1);
1278     return 1;
1279   }
1280   if (c < min3) {
1281     buf[0] = (char)((c >> 6) | UTF8_cval2);
1282     buf[1] = (char)((c & 0x3f) | 0x80);
1283     return 2;
1284   }
1285   if (c < min4) {
1286     buf[0] = (char)((c >> 12) | UTF8_cval3);
1287     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1288     buf[2] = (char)((c & 0x3f) | 0x80);
1289     return 3;
1290   }
1291   if (c < 0x110000) {
1292     buf[0] = (char)((c >> 18) | UTF8_cval4);
1293     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1294     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1295     buf[3] = (char)((c & 0x3f) | 0x80);
1296     return 4;
1297   }
1298   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1299 }
1300 
1301 int FASTCALL
1302 XmlUtf16Encode(int charNum, unsigned short *buf) {
1303   if (charNum < 0)
1304     return 0;
1305   if (charNum < 0x10000) {
1306     buf[0] = (unsigned short)charNum;
1307     return 1;
1308   }
1309   if (charNum < 0x110000) {
1310     charNum -= 0x10000;
1311     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1312     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1313     return 2;
1314   }
1315   return 0;
1316 }
1317 
/* Encoding object for a user-supplied ("unknown") encoding, filled in by
   XmlInitUnknownEncoding().
     normal    the embedded normal_encoding; a pointer to its enc member is
               what XmlInitUnknownEncoding() returns, and
               AS_UNKNOWN_ENCODING() casts such a pointer back to this
               struct, which relies on `normal` being the first member
     convert   callback used for bytes that start a multi-byte sequence
     userData  passed as the first argument of convert
     utf16     per-byte UTF-16 code unit; 0 means "call convert"
     utf8      per-byte UTF-8 form: element [0] is the length and the bytes
               follow; length 0 means "call convert" */
1318 struct unknown_encoding {
1319   struct normal_encoding normal;
1320   CONVERTER convert;
1321   void *userData;
1322   unsigned short utf16[256];
1323   char utf8[256][4];
1324 };
1325 
/* View an ENCODING pointer as the enclosing unknown_encoding (read-only). */
1326 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1327 
1328 int
1329 XmlSizeOfUnknownEncoding(void) {
1330   return sizeof(struct unknown_encoding);
1331 }
1332 
1333 static int PTRFASTCALL
1334 unknown_isName(const ENCODING *enc, const char *p) {
1335   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336   int c = uenc->convert(uenc->userData, p);
1337   if (c & ~0xFFFF)
1338     return 0;
1339   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1340 }
1341 
1342 static int PTRFASTCALL
1343 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1344   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345   int c = uenc->convert(uenc->userData, p);
1346   if (c & ~0xFFFF)
1347     return 0;
1348   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1349 }
1350 
1351 static int PTRFASTCALL
1352 unknown_isInvalid(const ENCODING *enc, const char *p) {
1353   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1354   int c = uenc->convert(uenc->userData, p);
1355   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1356 }
1357 
1358 static enum XML_Convert_Result PTRCALL
1359 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1360                char **toP, const char *toLim) {
1361   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1362   char buf[XML_UTF8_ENCODE_MAX];
1363   for (;;) {
1364     const char *utf8;
1365     int n;
1366     if (*fromP == fromLim)
1367       return XML_CONVERT_COMPLETED;
1368     utf8 = uenc->utf8[(unsigned char)**fromP];
1369     n = *utf8++;
1370     if (n == 0) {
1371       int c = uenc->convert(uenc->userData, *fromP);
1372       n = XmlUtf8Encode(c, buf);
1373       if (n > toLim - *toP)
1374         return XML_CONVERT_OUTPUT_EXHAUSTED;
1375       utf8 = buf;
1376       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1377                  - (BT_LEAD2 - 2));
1378     } else {
1379       if (n > toLim - *toP)
1380         return XML_CONVERT_OUTPUT_EXHAUSTED;
1381       (*fromP)++;
1382     }
1383     memcpy(*toP, utf8, n);
1384     *toP += n;
1385   }
1386 }
1387 
1388 static enum XML_Convert_Result PTRCALL
1389 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1390                 unsigned short **toP, const unsigned short *toLim) {
1391   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1392   while (*fromP < fromLim && *toP < toLim) {
1393     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1394     if (c == 0) {
1395       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1396       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1397                  - (BT_LEAD2 - 2));
1398     } else
1399       (*fromP)++;
1400     *(*toP)++ = c;
1401   }
1402 
1403   if ((*toP == toLim) && (*fromP < fromLim))
1404     return XML_CONVERT_OUTPUT_EXHAUSTED;
1405   else
1406     return XML_CONVERT_COMPLETED;
1407 }
1408 
1409 ENCODING *
1410 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1411                        void *userData) {
1412   int i;
1413   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1414   memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1415   for (i = 0; i < 128; i++)
1416     if (latin1_encoding.type[i] != BT_OTHER
1417         && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1418       return 0;
1419   for (i = 0; i < 256; i++) {
1420     int c = table[i];
1421     if (c == -1) {
1422       e->normal.type[i] = BT_MALFORM;
1423       /* This shouldn't really get used. */
1424       e->utf16[i] = 0xFFFF;
1425       e->utf8[i][0] = 1;
1426       e->utf8[i][1] = 0;
1427     } else if (c < 0) {
1428       if (c < -4)
1429         return 0;
1430       /* Multi-byte sequences need a converter function */
1431       if (! convert)
1432         return 0;
1433       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1434       e->utf8[i][0] = 0;
1435       e->utf16[i] = 0;
1436     } else if (c < 0x80) {
1437       if (latin1_encoding.type[c] != BT_OTHER
1438           && latin1_encoding.type[c] != BT_NONXML && c != i)
1439         return 0;
1440       e->normal.type[i] = latin1_encoding.type[c];
1441       e->utf8[i][0] = 1;
1442       e->utf8[i][1] = (char)c;
1443       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1444     } else if (checkCharRefNumber(c) < 0) {
1445       e->normal.type[i] = BT_NONXML;
1446       /* This shouldn't really get used. */
1447       e->utf16[i] = 0xFFFF;
1448       e->utf8[i][0] = 1;
1449       e->utf8[i][1] = 0;
1450     } else {
1451       if (c > 0xFFFF)
1452         return 0;
1453       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1454         e->normal.type[i] = BT_NMSTRT;
1455       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1456         e->normal.type[i] = BT_NAME;
1457       else
1458         e->normal.type[i] = BT_OTHER;
1459       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1460       e->utf16[i] = (unsigned short)c;
1461     }
1462   }
1463   e->userData = userData;
1464   e->convert = convert;
1465   if (convert) {
1466     e->normal.isName2 = unknown_isName;
1467     e->normal.isName3 = unknown_isName;
1468     e->normal.isName4 = unknown_isName;
1469     e->normal.isNmstrt2 = unknown_isNmstrt;
1470     e->normal.isNmstrt3 = unknown_isNmstrt;
1471     e->normal.isNmstrt4 = unknown_isNmstrt;
1472     e->normal.isInvalid2 = unknown_isInvalid;
1473     e->normal.isInvalid3 = unknown_isInvalid;
1474     e->normal.isInvalid4 = unknown_isInvalid;
1475   }
1476   e->normal.enc.utf8Convert = unknown_toUtf8;
1477   e->normal.enc.utf16Convert = unknown_toUtf16;
1478   return &(e->normal.enc);
1479 }
1480 
1481 /* If this enumeration is changed, getEncodingIndex and encodings
1482 must also be changed. */
/* Encoding indices.  The values from ISO_8859_1_ENC up to UTF_16LE_ENC are
   the positions of the matching names in getEncodingIndex()'s encodingNames
   array and are also used by initScan() to index its encodingTable argument.
   getEncodingIndex() returns UNKNOWN_ENC for an unrecognised name and
   NO_ENC for a NULL name. */
1483 enum {
1484   UNKNOWN_ENC = -1,
1485   ISO_8859_1_ENC = 0,
1486   US_ASCII_ENC,
1487   UTF_8_ENC,
1488   UTF_16_ENC,
1489   UTF_16BE_ENC,
1490   UTF_16LE_ENC,
1491   /* must match encodingNames up to here */
1492   NO_ENC
1493 };
1494 
/* NUL-terminated, upper-case names of the built-in encodings, spelled with
   the ASCII_* character constants.  getEncodingIndex() compares a candidate
   name against them with streqci(). */
1495 static const char KW_ISO_8859_1[]
1496     = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1497        ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1498 static const char KW_US_ASCII[]
1499     = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1500        ASCII_C, ASCII_I, ASCII_I,     '\0'};
1501 static const char KW_UTF_8[]
1502     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1503 static const char KW_UTF_16[]
1504     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1505 static const char KW_UTF_16BE[]
1506     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1507        ASCII_6, ASCII_B, ASCII_E, '\0'};
1508 static const char KW_UTF_16LE[]
1509     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1510        ASCII_6, ASCII_L, ASCII_E, '\0'};
1511 
1512 static int FASTCALL
1513 getEncodingIndex(const char *name) {
1514   static const char *const encodingNames[] = {
1515       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1516   };
1517   int i;
1518   if (name == NULL)
1519     return NO_ENC;
1520   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1521     if (streqci(name, encodingNames[i]))
1522       return i;
1523   return UNKNOWN_ENC;
1524 }
1525 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX stores
   i after converting it to char.  The argument i is parenthesized so that
   the cast applies to the whole expression passed in, not just to its
   first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1532 
1533 /* This is what detects the encoding.  encodingTable maps from
1534    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1535    the external (protocol) specified encoding; state is
1536    XML_CONTENT_STATE if we're parsing an external text entity, and
1537    XML_PROLOG_STATE otherwise.
1538 */
/* Result summary:
     XML_TOK_NONE     no input (ptr >= end)
     XML_TOK_PARTIAL  more bytes are needed before a decision can be made
     XML_TOK_BOM      a byte order mark was recognised; *nextTokPtr points
                      just past it and *enc->encPtr holds the detected
                      encoding
     otherwise        the result of XmlTok() run with the encoding that was
                      selected and stored through enc->encPtr
*/
1539 
1540 static int
1541 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1542          int state, const char *ptr, const char *end, const char **nextTokPtr) {
1543   const ENCODING **encPtr;
1544 
1545   if (ptr >= end)
1546     return XML_TOK_NONE;
1547   encPtr = enc->encPtr;
1548   if (ptr + 1 == end) {
1549     /* only a single byte available for auto-detection */
1550 #ifndef XML_DTD /* FIXME */
1551     /* a well-formed document entity must have more than one byte */
1552     if (state != XML_CONTENT_STATE)
1553       return XML_TOK_PARTIAL;
1554 #endif
1555     /* so we're parsing an external text entity... */
1556     /* if UTF-16 was externally specified, then we need at least 2 bytes */
1557     switch (INIT_ENC_INDEX(enc)) {
1558     case UTF_16_ENC:
1559     case UTF_16LE_ENC:
1560     case UTF_16BE_ENC:
1561       return XML_TOK_PARTIAL;
1562     }
    /* A byte that could begin a BOM, a 0x00 byte or '<' (0x3C) cannot be
       classified alone; wait for more input.  The BOM candidates are taken
       as data when ISO-8859-1 was specified for an external entity. */
1563     switch ((unsigned char)*ptr) {
1564     case 0xFE:
1565     case 0xFF:
1566     case 0xEF: /* possibly first byte of UTF-8 BOM */
1567       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1568         break;
1569       /* fall through */
1570     case 0x00:
1571     case 0x3C:
1572       return XML_TOK_PARTIAL;
1573     }
1574   } else {
    /* At least two bytes: dispatch on the first two, read as a big-endian
       16-bit value. */
1575     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1576     case 0xFEFF:
1577       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1578         break;
1579       *nextTokPtr = ptr + 2;
1580       *encPtr = encodingTable[UTF_16BE_ENC];
1581       return XML_TOK_BOM;
1582     /* 00 3C is handled in the default case */
1583     case 0x3C00:
1584       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1585            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1586           && state == XML_CONTENT_STATE)
1587         break;
1588       *encPtr = encodingTable[UTF_16LE_ENC];
1589       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1590     case 0xFFFE:
1591       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1592         break;
1593       *nextTokPtr = ptr + 2;
1594       *encPtr = encodingTable[UTF_16LE_ENC];
1595       return XML_TOK_BOM;
1596     case 0xEFBB:
1597       /* Maybe a UTF-8 BOM (EF BB BF) */
1598       /* If there's an explicitly specified (external) encoding
1599          of ISO-8859-1 or some flavour of UTF-16
1600          and this is an external text entity,
1601          don't look for the BOM,
1602          because it might be legal data.
1603       */
1604       if (state == XML_CONTENT_STATE) {
1605         int e = INIT_ENC_INDEX(enc);
1606         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1607             || e == UTF_16_ENC)
1608           break;
1609       }
1610       if (ptr + 2 == end)
1611         return XML_TOK_PARTIAL;
1612       if ((unsigned char)ptr[2] == 0xBF) {
1613         *nextTokPtr = ptr + 3;
1614         *encPtr = encodingTable[UTF_8_ENC];
1615         return XML_TOK_BOM;
1616       }
1617       break;
1618     default:
1619       if (ptr[0] == '\0') {
1620         /* 0 isn't a legal data character. Furthermore a document
1621            entity can only start with ASCII characters.  So the only
1622            way this can fail to be big-endian UTF-16 is if it's an
1623            external parsed general entity that's labelled as
1624            UTF-16LE.
1625         */
1626         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1627           break;
1628         *encPtr = encodingTable[UTF_16BE_ENC];
1629         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1630       } else if (ptr[1] == '\0') {
1631         /* We could recover here in the case:
1632             - parsing an external entity
1633             - second byte is 0
1634             - no externally specified encoding
1635             - no encoding declaration
1636            by assuming UTF-16LE.  But we don't, because this would mean when
1637            presented just with a single byte, we couldn't reliably determine
1638            whether we needed further bytes.
1639         */
1640         if (state == XML_CONTENT_STATE)
1641           break;
1642         *encPtr = encodingTable[UTF_16LE_ENC];
1643         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1644       }
1645       break;
1646     }
1647   }
  /* Nothing was auto-detected: use the externally specified encoding. */
1648   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1649   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1650 }
1651 
/* First inclusion of xmltok_ns.c: NS() and ns() expand to their argument
   unchanged, so identifiers built with them carry no suffix.
   NOTE(review): presumably this yields the non-namespace variants of
   whatever xmltok_ns.c defines -- confirm in that file. */
1652 #define NS(x) x
1653 #define ns(x) x
1654 #define XML_TOK_NS_C
1655 #include "xmltok_ns.c"
1656 #undef XML_TOK_NS_C
1657 #undef NS
1658 #undef ns
1659 
1660 #ifdef XML_NS
1661 
/* Second inclusion of xmltok_ns.c (only with XML_NS): NS(x) appends "NS"
   and ns(x) appends "_ns" to the identifier, producing a second, suffixed
   set of definitions from the same template. */
1662 #  define NS(x) x##NS
1663 #  define ns(x) x##_ns
1664 
1665 #  define XML_TOK_NS_C
1666 #  include "xmltok_ns.c"
1667 #  undef XML_TOK_NS_C
1668 
1669 #  undef NS
1670 #  undef ns
1671 
1672 ENCODING *
1673 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1674                          void *userData) {
1675   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1676   if (enc)
1677     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1678   return enc;
1679 }
1680 
1681 #endif /* XML_NS */
1682