xref: /freebsd/contrib/expat/lib/xmltok.c (revision bce40c0242b167a541cc051b6b0dbcc5f3d04319)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14    Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15    Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
16    Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17    Copyright (c) 2016      Don Lewis <truckman@apache.org>
18    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19    Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20    Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21    Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23    Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
24    Licensed under the MIT license:
25 
26    Permission is  hereby granted,  free of charge,  to any  person obtaining
27    a  copy  of  this  software   and  associated  documentation  files  (the
28    "Software"),  to  deal in  the  Software  without restriction,  including
29    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
30    distribute, sublicense, and/or sell copies of the Software, and to permit
31    persons  to whom  the Software  is  furnished to  do so,  subject to  the
32    following conditions:
33 
34    The above copyright  notice and this permission notice  shall be included
35    in all copies or substantial portions of the Software.
36 
37    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
38    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
39    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
42    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43    USE OR OTHER DEALINGS IN THE SOFTWARE.
44 */
45 
46 #include <expat_config.h>
47 
48 #include <stddef.h>
49 #include <string.h> /* memcpy */
50 #include <stdbool.h>
51 
52 #ifdef _WIN32
53 #  include "winconfig.h"
54 #endif
55 
56 #include "expat_external.h"
57 #include "internal.h"
58 #include "xmltok.h"
59 #include "nametab.h"
60 
/* Optional fourth entry of the first brace group in VTABLE1: present only
   when DTD support (XML_DTD) is compiled in, otherwise it expands to
   nothing. */
#ifndef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#else
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#endif

/* Initializer fragment naming the per-encoding functions generated under the
   current PREFIX: first a brace group of scanners, then a brace group of
   literal scanners, then eight individual helpers. */
#define VTABLE1                                                                \
  {PREFIX(prologTok),                                                          \
   PREFIX(contentTok),                                                         \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok),                                              \
       PREFIX(entityValueTok)},                                                \
      PREFIX(nameMatchesAscii),                                                \
      PREFIX(nameLength),                                                      \
      PREFIX(skipS),                                                           \
      PREFIX(getAtts),                                                         \
      PREFIX(charRefNumber),                                                   \
      PREFIX(predefinedEntityName),                                            \
      PREFIX(updatePosition),                                                  \
      PREFIX(isPublicId)

/* VTABLE1 followed by the two converters generated under the same PREFIX. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76 
/* Tests the naming bit for a 16-bit code unit given as high byte `hi` and
   low byte `lo`: pages[hi] selects a group of eight 32-bit words in
   namingBitmap, the top 3 bits of `lo` pick the word and the bottom 5 bits
   pick the bit.  Yields nonzero when the bit is set.  All arguments are
   parenthesized so that expressions such as `tbl + 1` expand correctly. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
100 
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned bytes positioned on the lead byte
   and yields nonzero when the 2-, 3- or 4-byte sequence is invalid.  The
   lead byte is read as (p)[0] rather than (*p) so that an argument such as
   `buf + 1` is dereferenced as a whole.
*/

#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
130 
131 static int PTRFASTCALL
132 isNever(const ENCODING *enc, const char *p) {
133   UNUSED_P(enc);
134   UNUSED_P(p);
135   return 0;
136 }
137 
138 static int PTRFASTCALL
139 utf8_isName2(const ENCODING *enc, const char *p) {
140   UNUSED_P(enc);
141   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
142 }
143 
144 static int PTRFASTCALL
145 utf8_isName3(const ENCODING *enc, const char *p) {
146   UNUSED_P(enc);
147   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
148 }
149 
150 #define utf8_isName4 isNever
151 
152 static int PTRFASTCALL
153 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
154   UNUSED_P(enc);
155   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
156 }
157 
158 static int PTRFASTCALL
159 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
160   UNUSED_P(enc);
161   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
162 }
163 
164 #define utf8_isNmstrt4 isNever
165 
166 static int PTRFASTCALL
167 utf8_isInvalid2(const ENCODING *enc, const char *p) {
168   UNUSED_P(enc);
169   return UTF8_INVALID2((const unsigned char *)p);
170 }
171 
172 static int PTRFASTCALL
173 utf8_isInvalid3(const ENCODING *enc, const char *p) {
174   UNUSED_P(enc);
175   return UTF8_INVALID3((const unsigned char *)p);
176 }
177 
178 static int PTRFASTCALL
179 utf8_isInvalid4(const ENCODING *enc, const char *p) {
180   UNUSED_P(enc);
181   return UTF8_INVALID4((const unsigned char *)p);
182 }
183 
184 struct normal_encoding {
185   ENCODING enc;
186   unsigned char type[256];
187 #ifdef XML_MIN_SIZE
188   int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
189   int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
190   int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
191   int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
192   int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
193 #endif /* XML_MIN_SIZE */
194   int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
195   int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
196   int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
197   int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
198   int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
199   int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
200   int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
201   int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
202   int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
203 };
204 
/* View a `const ENCODING *` as the struct normal_encoding that contains it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding
   (with a trailing comma); expands to nothing in regular builds. */
#ifndef XML_MIN_SIZE

#  define STANDARD_VTABLE(prefix) /* as nothing */

#else

#  define STANDARD_VTABLE(prefix)                                              \
    prefix##byteType, prefix##isNameMin, prefix##isNmstrtMin,                  \
        prefix##byteToAscii, prefix##charMatches,

#endif

/* Initializers for the nine multi-byte callbacks, in member order. */
#define NORMAL_VTABLE(prefix)                                                  \
  prefix##isName2, prefix##isName3, prefix##isName4,                           \
      prefix##isNmstrt2, prefix##isNmstrt3, prefix##isNmstrt4,                 \
      prefix##isInvalid2, prefix##isInvalid3, prefix##isInvalid4

/* Same nine slots, all left empty. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL,                                                          \
      /* isName3 */ NULL,                                                      \
      /* isName4 */ NULL,                                                      \
      /* isNmstrt2 */ NULL,                                                    \
      /* isNmstrt3 */ NULL,                                                    \
      /* isNmstrt4 */ NULL,                                                    \
      /* isInvalid2 */ NULL,                                                   \
      /* isInvalid3 */ NULL,                                                   \
      /* isInvalid4 */ NULL
226 
227 static int FASTCALL checkCharRefNumber(int);
228 
229 #include "xmltok_impl.h"
230 #include "ascii.h"
231 
/* Macro environment for the single-byte ("normal_") instantiation of
   xmltok_impl.c. */

#ifdef XML_MIN_SIZE
/* No character is handled at the MINBPC level for single-byte encodings. */
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, read from the encoding's type[] table.  Uses
   AS_NORMAL_ENCODING so that the const qualifier of enc is preserved. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, reached through the byteType pointer. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* For single-byte encodings the byte itself is returned. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
267 
/* Multi-byte character class tests: dispatch to the isName<n>, isNmstrt<n>
   and isInvalid<n> callbacks of the enclosing struct normal_encoding. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
/* In XML_MIN_SIZE builds a NULL isInvalid<n> callback means "never
   invalid". */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* Nonzero when the byte at p equals c. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character; it is parenthesized so that an expression
   argument (e.g. a conditional) is compared as a whole. */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
301 
/* Instantiate the tokenizer template for single-byte-unit encodings: every
   PREFIX(name) in xmltok_impl.c becomes normal_name, using the MINBPC /
   BYTE_TYPE / CHAR_MATCHES / IS_* macros defined just above. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the per-instantiation macros so that a later instantiation can define
   its own versions.  PREFIX is intentionally left defined here. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
316 
/* UTF8_cvalN is the value of the masked first byte of an N byte sequence,
   i.e. the marker bits that are OR-ed into the lead byte when encoding. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
323 
/* Moves *fromLimRef backwards (never before `from`) so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`.  An ASCII byte ends the
   scan at once.  A lead byte ends the scan when enough bytes were walked
   over to complete its sequence, in which case the limit is placed right
   after that sequence; otherwise the lead byte is dropped too and scanning
   continues.  Bytes matching none of the lead/ASCII patterns are simply
   walked over. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t trailing = 0; /* bytes walked over since the last restart */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen = 0; /* 0: not a lead byte */

    if ((last & 0x80u) == 0x00u)
      break; /* 1-byte character, matching 0b0xxxxxxx */

    if ((last & 0xf8u) == 0xf0u)
      seqLen = 4; /* lead by 0b11110xxx byte */
    else if ((last & 0xf0u) == 0xe0u)
      seqLen = 3; /* lead by 0b1110xxxx byte */
    else if ((last & 0xe0u) == 0xc0u)
      seqLen = 2; /* lead by 0b110xxxxx byte */

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* The sequence is complete: keep all of it. */
        end += seqLen - 1;
        break;
      }
      /* Incomplete sequence: restart the count before stepping over it. */
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
362 
363 static enum XML_Convert_Result PTRCALL
364 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
365             char **toP, const char *toLim) {
366   bool input_incomplete = false;
367   bool output_exhausted = false;
368 
369   /* Avoid copying partial characters (due to limited space). */
370   const ptrdiff_t bytesAvailable = fromLim - *fromP;
371   const ptrdiff_t bytesStorable = toLim - *toP;
372   UNUSED_P(enc);
373   if (bytesAvailable > bytesStorable) {
374     fromLim = *fromP + bytesStorable;
375     output_exhausted = true;
376   }
377 
378   /* Avoid copying partial characters (from incomplete input). */
379   {
380     const char *const fromLimBefore = fromLim;
381     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
382     if (fromLim < fromLimBefore) {
383       input_incomplete = true;
384     }
385   }
386 
387   {
388     const ptrdiff_t bytesToCopy = fromLim - *fromP;
389     memcpy(*toP, *fromP, bytesToCopy);
390     *fromP += bytesToCopy;
391     *toP += bytesToCopy;
392   }
393 
394   if (output_exhausted) /* needs to go first */
395     return XML_CONVERT_OUTPUT_EXHAUSTED;
396   else if (input_incomplete)
397     return XML_CONVERT_INPUT_INCOMPLETE;
398   else
399     return XML_CONVERT_COMPLETED;
400 }
401 
402 static enum XML_Convert_Result PTRCALL
403 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
404              unsigned short **toP, const unsigned short *toLim) {
405   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
406   unsigned short *to = *toP;
407   const char *from = *fromP;
408   while (from < fromLim && to < toLim) {
409     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
410     case BT_LEAD2:
411       if (fromLim - from < 2) {
412         res = XML_CONVERT_INPUT_INCOMPLETE;
413         goto after;
414       }
415       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
416       from += 2;
417       break;
418     case BT_LEAD3:
419       if (fromLim - from < 3) {
420         res = XML_CONVERT_INPUT_INCOMPLETE;
421         goto after;
422       }
423       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
424                                | (from[2] & 0x3f));
425       from += 3;
426       break;
427     case BT_LEAD4: {
428       unsigned long n;
429       if (toLim - to < 2) {
430         res = XML_CONVERT_OUTPUT_EXHAUSTED;
431         goto after;
432       }
433       if (fromLim - from < 4) {
434         res = XML_CONVERT_INPUT_INCOMPLETE;
435         goto after;
436       }
437       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
438           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
439       n -= 0x10000;
440       to[0] = (unsigned short)((n >> 10) | 0xD800);
441       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
442       to += 2;
443       from += 4;
444     } break;
445     default:
446       *to++ = *from++;
447       break;
448     }
449   }
450   if (from < fromLim)
451     res = XML_CONVERT_OUTPUT_EXHAUSTED;
452 after:
453   *fromP = from;
454   *toP = to;
455   return res;
456 }
457 
/* UTF-8 encoding descriptors.  Each initializer supplies, in order: the
   embedded ENCODING (the VTABLE1 entries, the two UTF-8 converters and three
   integer fields -- NOTE(review): presumably minBytesPerChar followed by two
   flags; confirm against xmltok.h), the contents of type[] taken from the
   two included table headers, and the callback members via STANDARD_VTABLE
   and NORMAL_VTABLE.

   The variants without the _ns suffix include the ASCII table with BT_COLON
   temporarily defined as BT_NMSTRT; the _ns variants exist only when XML_NS
   is defined and include the table without that override. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

/* Same as utf8_encoding_ns but built from iasciitab.h instead of
   asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

/* Same as utf8_encoding but built from iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
499 
500 static enum XML_Convert_Result PTRCALL
501 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
502               char **toP, const char *toLim) {
503   UNUSED_P(enc);
504   for (;;) {
505     unsigned char c;
506     if (*fromP == fromLim)
507       return XML_CONVERT_COMPLETED;
508     c = (unsigned char)**fromP;
509     if (c & 0x80) {
510       if (toLim - *toP < 2)
511         return XML_CONVERT_OUTPUT_EXHAUSTED;
512       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
513       *(*toP)++ = (char)((c & 0x3f) | 0x80);
514       (*fromP)++;
515     } else {
516       if (*toP == toLim)
517         return XML_CONVERT_OUTPUT_EXHAUSTED;
518       *(*toP)++ = *(*fromP)++;
519     }
520   }
521 }
522 
523 static enum XML_Convert_Result PTRCALL
524 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
525                unsigned short **toP, const unsigned short *toLim) {
526   UNUSED_P(enc);
527   while (*fromP < fromLim && *toP < toLim)
528     *(*toP)++ = (unsigned char)*(*fromP)++;
529 
530   if ((*toP == toLim) && (*fromP < fromLim))
531     return XML_CONVERT_OUTPUT_EXHAUSTED;
532   else
533     return XML_CONVERT_COMPLETED;
534 }
535 
/* Latin-1 encoding descriptors: the Latin-1 converters, a type[] table built
   from asciitab.h followed by latin1tab.h, and no multi-byte callbacks
   (NULL_VTABLE).  As with the other tables, the variant without the _ns
   suffix includes the ASCII table with BT_COLON defined as BT_NMSTRT, and
   the _ns variant exists only when XML_NS is defined. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
557 
558 static enum XML_Convert_Result PTRCALL
559 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
560              char **toP, const char *toLim) {
561   UNUSED_P(enc);
562   while (*fromP < fromLim && *toP < toLim)
563     *(*toP)++ = *(*fromP)++;
564 
565   if ((*toP == toLim) && (*fromP < fromLim))
566     return XML_CONVERT_OUTPUT_EXHAUSTED;
567   else
568     return XML_CONVERT_COMPLETED;
569 }
570 
/* US-ASCII encoding descriptors: ascii_toUtf8 for UTF-8 output and the
   Latin-1 routine for UTF-16 output, no multi-byte callbacks (NULL_VTABLE).
   Only the asciitab.h part of type[] is initialized explicitly; the
   remaining entries are left to zero-initialization, which the comment below
   relies on being BT_NONXML.  The _ns variant exists only when XML_NS is
   defined; the other variant includes the table with BT_COLON defined as
   BT_NMSTRT. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
592 
593 static int PTRFASTCALL
594 unicode_byte_type(char hi, char lo) {
595   switch ((unsigned char)hi) {
596   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
597   case 0xD8:
598   case 0xD9:
599   case 0xDA:
600   case 0xDB:
601     return BT_LEAD4;
602   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
603   case 0xDC:
604   case 0xDD:
605   case 0xDE:
606   case 0xDF:
607     return BT_TRAIL;
608   case 0xFF:
609     switch ((unsigned char)lo) {
610     case 0xFF: /* noncharacter-FFFF */
611     case 0xFE: /* noncharacter-FFFE */
612       return BT_NONXML;
613     }
614     break;
615   }
616   return BT_NONASCII;
617 }
618 
/* Generates the function E##toUtf8, which converts 16-bit code units to
   UTF-8.  The byte order is supplied by whatever GET_LO / GET_HI are defined
   as at the point of expansion.

   The generated function first rounds the input length down to an even
   number of bytes, then handles one 2-byte unit per iteration, selecting the
   output length from the high byte:
     hi == 0 and lo < 0x80 -> 1 byte
     hi in 0x00..0x07      -> 2 bytes
     hi in 0xD8..0xDB      -> 4 bytes; also consumes the following unit
     anything else         -> 3 bytes
   *toP is advanced as bytes are written.  When the next character does not
   fit, *fromP is set to the start of that character and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned.  When a 0xD8..0xDB unit is not
   followed by another complete unit, XML_CONVERT_INPUT_INCOMPLETE is
   returned the same way; note that the output-space check comes first in
   that case.  Otherwise *fromP is set to the final read position. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
695 
/* Generates the function E##toUtf16, which copies 16-bit code units from a
   byte buffer into an array of unsigned short, assembling each unit from
   GET_HI / GET_LO as defined at the point of expansion.

   The generated function rounds the input length down to an even number of
   bytes.  If the input holds more units than the output has room for and the
   last input unit has a high byte in 0xD8..0xDF, that unit is excluded and
   the provisional result becomes XML_CONVERT_INPUT_INCOMPLETE.  Units are
   then copied until input or output is used up, advancing *fromP and *toP.
   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input
   remains, otherwise the provisional result. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
716 
/* Little-endian byte access: the low byte of a 16-bit unit is stored first.
   SET2 stores ch at ptr, GET_LO / GET_HI read the two bytes as unsigned. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte access: the high byte of a 16-bit unit is stored first. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
738 
/* Character helpers for little-endian 16-bit units: (p)[0] is the low byte
   and (p)[1] the high byte.  Every macro argument is parenthesized so that
   pointer expressions (`buf + 2`) and value expressions (`x ? 'a' : 'b'`)
   expand as a whole. */

/* Byte type of the unit at p: units with a zero high byte use the encoding's
   type[] table, all others are classified by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)]            \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Nonzero when the unit at p has a zero high byte and low byte equal to c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name / name-start tests for the unit at p via the UCS-2 naming tables. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
748 
/* Two build flavours for the little-endian two-byte tokenizer support.
   With XML_MIN_SIZE the LITTLE2_* macros are wrapped in small functions;
   without it they are mapped onto the generic macro names and
   xmltok_impl.c is included with PREFIX(ident) expanding to
   little2_##ident. */
749 #ifdef XML_MIN_SIZE
750 
/* Function form of LITTLE2_BYTE_TYPE. */
751 static int PTRFASTCALL
752 little2_byteType(const ENCODING *enc, const char *p) {
753   return LITTLE2_BYTE_TYPE(enc, p);
754 }
755 
/* Function form of LITTLE2_BYTE_TO_ASCII; enc is not consulted. */
756 static int PTRFASTCALL
757 little2_byteToAscii(const ENCODING *enc, const char *p) {
758   UNUSED_P(enc);
759   return LITTLE2_BYTE_TO_ASCII(p);
760 }
761 
/* Function form of LITTLE2_CHAR_MATCHES; enc is not consulted. */
762 static int PTRCALL
763 little2_charMatches(const ENCODING *enc, const char *p, int c) {
764   UNUSED_P(enc);
765   return LITTLE2_CHAR_MATCHES(p, c);
766 }
767 
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC; enc is not consulted. */
768 static int PTRFASTCALL
769 little2_isNameMin(const ENCODING *enc, const char *p) {
770   UNUSED_P(enc);
771   return LITTLE2_IS_NAME_CHAR_MINBPC(p);
772 }
773 
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; enc is not consulted. */
774 static int PTRFASTCALL
775 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
776   UNUSED_P(enc);
777   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
778 }
779 
/* Redefine VTABLE so the encoding objects below pick up the little2_
   converters after VTABLE1 (VTABLE1 is defined outside this section). */
780 #  undef VTABLE
781 #  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
782 
783 #else /* not XML_MIN_SIZE */
784 
785 #  undef PREFIX
786 #  define PREFIX(ident) little2_##ident
787 #  define MINBPC(enc) 2
788 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
789 #  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
790 #  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
791 #  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The n-byte forms are constant false here; only the MINBPC forms do a
   real lookup. */
792 #  define IS_NAME_CHAR(enc, p, n) 0
793 #  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
794 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
795 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
796 
/* Pull in the tokenizer implementation with the macros above in effect. */
797 #  define XML_TOK_IMPL_C
798 #  include "xmltok_impl.c"
799 #  undef XML_TOK_IMPL_C
800 
/* Remove the per-encoding macro bindings again. */
801 #  undef MINBPC
802 #  undef BYTE_TYPE
803 #  undef BYTE_TO_ASCII
804 #  undef CHAR_MATCHES
805 #  undef IS_NAME_CHAR
806 #  undef IS_NAME_CHAR_MINBPC
807 #  undef IS_NMSTRT_CHAR
808 #  undef IS_NMSTRT_CHAR_MINBPC
809 #  undef IS_INVALID_CHAR
810 
811 #endif /* not XML_MIN_SIZE */
812 
/* Encoding objects for little-endian two-byte input.  Each initializer is
   {VTABLE, 2, 0, flag}, a byte-type table built from the included *tab.h
   files, and the little2_ standard vtable.  NOTE(review): the trailing
   flag is presumably the isUtf16 member and BYTEORDER the host byte order
   (1234 = little-endian) -- confirm against the ENCODING declaration. */

/* Namespace-aware variant: asciitab.h is included unmodified. */
812 #ifdef XML_NS
813 
814 static const struct normal_encoding little2_encoding_ns
815     = {{VTABLE, 2, 0,
816 #  if BYTEORDER == 1234
817         1
818 #  else
819         0
820 #  endif
821        },
822        {
823 #  include "asciitab.h"
824 #  include "latin1tab.h"
825        },
826        STANDARD_VTABLE(little2_) NULL_VTABLE};
827 
828 #endif
829 
/* Non-namespace variant: BT_COLON is defined as BT_NMSTRT while asciitab.h
   is included, so any entry that table tags as BT_COLON gets the
   name-start type instead. */
830 static const struct normal_encoding little2_encoding
831     = {{VTABLE, 2, 0,
832 #if BYTEORDER == 1234
833         1
834 #else
835         0
836 #endif
837        },
838        {
839 #define BT_COLON BT_NMSTRT
840 #include "asciitab.h"
841 #undef BT_COLON
842 #include "latin1tab.h"
843        },
844        STANDARD_VTABLE(little2_) NULL_VTABLE};
845 
/* "internal" variants, compiled unless BYTEORDER is 4321.  They use
   iasciitab.h in place of asciitab.h and always set the trailing flag
   to 1. */
846 #if BYTEORDER != 4321
847 
848 #  ifdef XML_NS
849 
850 static const struct normal_encoding internal_little2_encoding_ns
851     = {{VTABLE, 2, 0, 1},
852        {
853 #    include "iasciitab.h"
854 #    include "latin1tab.h"
855        },
856        STANDARD_VTABLE(little2_) NULL_VTABLE};
857 
858 #  endif
859 
860 static const struct normal_encoding internal_little2_encoding
861     = {{VTABLE, 2, 0, 1},
862        {
863 #  define BT_COLON BT_NMSTRT
864 #  include "iasciitab.h"
865 #  undef BT_COLON
866 #  include "latin1tab.h"
867        },
868        STANDARD_VTABLE(little2_) NULL_VTABLE};
869 
870 #endif
872 
/* Accessors for big-endian two-byte units: p[0] holds the high-order byte
   and p[1] the low-order byte.  Every macro parameter is parenthesized at
   each use so that arbitrary expressions (e.g. `cond ? 'a' : 'b'` or
   `base + n`) expand with the intended grouping. */

/* Byte type of the unit at p: when the high byte is zero the low byte is
   looked up in the encoding's own type table, otherwise the unit is
   classified by unicode_byte_type(high, low). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]  \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and its low byte equals c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name / name-start tests via the naming bitmaps, indexed by (high, low). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
883 
/* Two build flavours for the big-endian two-byte tokenizer support.
   With XML_MIN_SIZE the BIG2_* macros are wrapped in small functions;
   without it they are mapped onto the generic macro names and
   xmltok_impl.c is included with PREFIX(ident) expanding to
   big2_##ident. */
884 #ifdef XML_MIN_SIZE
885 
/* Function form of BIG2_BYTE_TYPE. */
886 static int PTRFASTCALL
887 big2_byteType(const ENCODING *enc, const char *p) {
888   return BIG2_BYTE_TYPE(enc, p);
889 }
890 
/* Function form of BIG2_BYTE_TO_ASCII; enc is not consulted. */
891 static int PTRFASTCALL
892 big2_byteToAscii(const ENCODING *enc, const char *p) {
893   UNUSED_P(enc);
894   return BIG2_BYTE_TO_ASCII(p);
895 }
896 
/* Function form of BIG2_CHAR_MATCHES; enc is not consulted. */
897 static int PTRCALL
898 big2_charMatches(const ENCODING *enc, const char *p, int c) {
899   UNUSED_P(enc);
900   return BIG2_CHAR_MATCHES(p, c);
901 }
902 
/* Function form of BIG2_IS_NAME_CHAR_MINBPC; enc is not consulted. */
903 static int PTRFASTCALL
904 big2_isNameMin(const ENCODING *enc, const char *p) {
905   UNUSED_P(enc);
906   return BIG2_IS_NAME_CHAR_MINBPC(p);
907 }
908 
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC; enc is not consulted. */
909 static int PTRFASTCALL
910 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
911   UNUSED_P(enc);
912   return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
913 }
914 
/* Redefine VTABLE so the encoding objects below pick up the big2_
   converters after VTABLE1 (VTABLE1 is defined outside this section). */
915 #  undef VTABLE
916 #  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
917 
918 #else /* not XML_MIN_SIZE */
919 
920 #  undef PREFIX
921 #  define PREFIX(ident) big2_##ident
922 #  define MINBPC(enc) 2
923 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
924 #  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
925 #  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
926 #  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte forms are constant false here; only the MINBPC forms do a
   real lookup. */
927 #  define IS_NAME_CHAR(enc, p, n) 0
928 #  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
929 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
930 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
931 
/* Pull in the tokenizer implementation with the macros above in effect. */
932 #  define XML_TOK_IMPL_C
933 #  include "xmltok_impl.c"
934 #  undef XML_TOK_IMPL_C
935 
/* Remove the per-encoding macro bindings again. */
936 #  undef MINBPC
937 #  undef BYTE_TYPE
938 #  undef BYTE_TO_ASCII
939 #  undef CHAR_MATCHES
940 #  undef IS_NAME_CHAR
941 #  undef IS_NAME_CHAR_MINBPC
942 #  undef IS_NMSTRT_CHAR
943 #  undef IS_NMSTRT_CHAR_MINBPC
944 #  undef IS_INVALID_CHAR
945 
946 #endif /* not XML_MIN_SIZE */
947 
/* Encoding objects for big-endian two-byte input.  Each initializer is
   {VTABLE, 2, 0, flag}, a byte-type table built from the included *tab.h
   files, and the big2_ standard vtable.  NOTE(review): the trailing flag
   is presumably the isUtf16 member and BYTEORDER the host byte order
   (4321 = big-endian) -- confirm against the ENCODING declaration. */

/* Namespace-aware variant: asciitab.h is included unmodified. */
948 #ifdef XML_NS
949 
950 static const struct normal_encoding big2_encoding_ns
951     = {{VTABLE, 2, 0,
952 #  if BYTEORDER == 4321
953         1
954 #  else
955         0
956 #  endif
957        },
958        {
959 #  include "asciitab.h"
960 #  include "latin1tab.h"
961        },
962        STANDARD_VTABLE(big2_) NULL_VTABLE};
963 
964 #endif
965 
/* Non-namespace variant: BT_COLON is defined as BT_NMSTRT while asciitab.h
   is included, so any entry that table tags as BT_COLON gets the
   name-start type instead. */
966 static const struct normal_encoding big2_encoding
967     = {{VTABLE, 2, 0,
968 #if BYTEORDER == 4321
969         1
970 #else
971         0
972 #endif
973        },
974        {
975 #define BT_COLON BT_NMSTRT
976 #include "asciitab.h"
977 #undef BT_COLON
978 #include "latin1tab.h"
979        },
980        STANDARD_VTABLE(big2_) NULL_VTABLE};
981 
/* "internal" variants, compiled unless BYTEORDER is 1234.  They use
   iasciitab.h in place of asciitab.h and always set the trailing flag
   to 1. */
982 #if BYTEORDER != 1234
983 
984 #  ifdef XML_NS
985 
986 static const struct normal_encoding internal_big2_encoding_ns
987     = {{VTABLE, 2, 0, 1},
988        {
989 #    include "iasciitab.h"
990 #    include "latin1tab.h"
991        },
992        STANDARD_VTABLE(big2_) NULL_VTABLE};
993 
994 #  endif
995 
996 static const struct normal_encoding internal_big2_encoding
997     = {{VTABLE, 2, 0, 1},
998        {
999 #  define BT_COLON BT_NMSTRT
1000 #  include "iasciitab.h"
1001 #  undef BT_COLON
1002 #  include "latin1tab.h"
1003        },
1004        STANDARD_VTABLE(big2_) NULL_VTABLE};
1005 
1006 #endif
1007 
/* PREFIX is no longer needed past the per-encoding sections. */
1008 #undef PREFIX
1009 
1010 static int FASTCALL
1011 streqci(const char *s1, const char *s2) {
1012   for (;;) {
1013     char c1 = *s1++;
1014     char c2 = *s2++;
1015     if (ASCII_a <= c1 && c1 <= ASCII_z)
1016       c1 += ASCII_A - ASCII_a;
1017     if (ASCII_a <= c2 && c2 <= ASCII_z)
1018       /* The following line will never get executed.  streqci() is
1019        * only called from two places, both of which guarantee to put
1020        * upper-case strings into s2.
1021        */
1022       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1023     if (c1 != c2)
1024       return 0;
1025     if (! c1)
1026       break;
1027   }
1028   return 1;
1029 }
1030 
1031 static void PTRCALL
1032 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1033                    POSITION *pos) {
1034   UNUSED_P(enc);
1035   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1036 }
1037 
1038 static int
1039 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1040   char buf[1];
1041   char *p = buf;
1042   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1043   if (p == buf)
1044     return -1;
1045   else
1046     return buf[0];
1047 }
1048 
1049 static int FASTCALL
1050 isSpace(int c) {
1051   switch (c) {
1052   case 0x20:
1053   case 0xD:
1054   case 0xA:
1055   case 0x9:
1056     return 1;
1057   }
1058   return 0;
1059 }
1060 
1061 /* Return 1 if there's just optional white space or there's an S
1062    followed by name=val.
1063 */
1064 static int
1065 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1066                      const char **namePtr, const char **nameEndPtr,
1067                      const char **valPtr, const char **nextTokPtr) {
1068   int c;
1069   char open;
1070   if (ptr == end) {
1071     *namePtr = NULL;
1072     return 1;
1073   }
1074   if (! isSpace(toAscii(enc, ptr, end))) {
1075     *nextTokPtr = ptr;
1076     return 0;
1077   }
1078   do {
1079     ptr += enc->minBytesPerChar;
1080   } while (isSpace(toAscii(enc, ptr, end)));
1081   if (ptr == end) {
1082     *namePtr = NULL;
1083     return 1;
1084   }
1085   *namePtr = ptr;
1086   for (;;) {
1087     c = toAscii(enc, ptr, end);
1088     if (c == -1) {
1089       *nextTokPtr = ptr;
1090       return 0;
1091     }
1092     if (c == ASCII_EQUALS) {
1093       *nameEndPtr = ptr;
1094       break;
1095     }
1096     if (isSpace(c)) {
1097       *nameEndPtr = ptr;
1098       do {
1099         ptr += enc->minBytesPerChar;
1100       } while (isSpace(c = toAscii(enc, ptr, end)));
1101       if (c != ASCII_EQUALS) {
1102         *nextTokPtr = ptr;
1103         return 0;
1104       }
1105       break;
1106     }
1107     ptr += enc->minBytesPerChar;
1108   }
1109   if (ptr == *namePtr) {
1110     *nextTokPtr = ptr;
1111     return 0;
1112   }
1113   ptr += enc->minBytesPerChar;
1114   c = toAscii(enc, ptr, end);
1115   while (isSpace(c)) {
1116     ptr += enc->minBytesPerChar;
1117     c = toAscii(enc, ptr, end);
1118   }
1119   if (c != ASCII_QUOT && c != ASCII_APOS) {
1120     *nextTokPtr = ptr;
1121     return 0;
1122   }
1123   open = (char)c;
1124   ptr += enc->minBytesPerChar;
1125   *valPtr = ptr;
1126   for (;; ptr += enc->minBytesPerChar) {
1127     c = toAscii(enc, ptr, end);
1128     if (c == open)
1129       break;
1130     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1131         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1132         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1133       *nextTokPtr = ptr;
1134       return 0;
1135     }
1136   }
1137   *nextTokPtr = ptr + enc->minBytesPerChar;
1138   return 1;
1139 }
1140 
/* NUL-terminated keywords spelled with the ASCII_* constants; they are
   matched against pseudo-attribute names and values in doParseXmlDecl(). */
1141 static const char KW_version[]
1142     = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1143 
1144 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1145                                    ASCII_i, ASCII_n, ASCII_g, '\0'};
1146 
1147 static const char KW_standalone[]
1148     = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1149        ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1150 
/* The two values accepted for the standalone pseudo-attribute. */
1151 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1152 
1153 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1154 
1155 static int
1156 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1157                                                  const char *),
1158                int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1159                const char *end, const char **badPtr, const char **versionPtr,
1160                const char **versionEndPtr, const char **encodingName,
1161                const ENCODING **encoding, int *standalone) {
1162   const char *val = NULL;
1163   const char *name = NULL;
1164   const char *nameEnd = NULL;
1165   ptr += 5 * enc->minBytesPerChar;
1166   end -= 2 * enc->minBytesPerChar;
1167   if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1168       || ! name) {
1169     *badPtr = ptr;
1170     return 0;
1171   }
1172   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1173     if (! isGeneralTextEntity) {
1174       *badPtr = name;
1175       return 0;
1176     }
1177   } else {
1178     if (versionPtr)
1179       *versionPtr = val;
1180     if (versionEndPtr)
1181       *versionEndPtr = ptr;
1182     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1183       *badPtr = ptr;
1184       return 0;
1185     }
1186     if (! name) {
1187       if (isGeneralTextEntity) {
1188         /* a TextDecl must have an EncodingDecl */
1189         *badPtr = ptr;
1190         return 0;
1191       }
1192       return 1;
1193     }
1194   }
1195   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1196     int c = toAscii(enc, val, end);
1197     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1198       *badPtr = val;
1199       return 0;
1200     }
1201     if (encodingName)
1202       *encodingName = val;
1203     if (encoding)
1204       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1205     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1206       *badPtr = ptr;
1207       return 0;
1208     }
1209     if (! name)
1210       return 1;
1211   }
1212   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1213       || isGeneralTextEntity) {
1214     *badPtr = name;
1215     return 0;
1216   }
1217   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1218     if (standalone)
1219       *standalone = 1;
1220   } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1221     if (standalone)
1222       *standalone = 0;
1223   } else {
1224     *badPtr = val;
1225     return 0;
1226   }
1227   while (isSpace(toAscii(enc, ptr, end)))
1228     ptr += enc->minBytesPerChar;
1229   if (ptr != end) {
1230     *badPtr = ptr;
1231     return 0;
1232   }
1233   return 1;
1234 }
1235 
1236 static int FASTCALL
1237 checkCharRefNumber(int result) {
1238   switch (result >> 8) {
1239   case 0xD8:
1240   case 0xD9:
1241   case 0xDA:
1242   case 0xDB:
1243   case 0xDC:
1244   case 0xDD:
1245   case 0xDE:
1246   case 0xDF:
1247     return -1;
1248   case 0:
1249     if (latin1_encoding.type[result] == BT_NONXML)
1250       return -1;
1251     break;
1252   case 0xFF:
1253     if (result == 0xFFFE || result == 0xFFFF)
1254       return -1;
1255     break;
1256   }
1257   return result;
1258 }
1259 
1260 int FASTCALL
1261 XmlUtf8Encode(int c, char *buf) {
1262   enum {
1263     /* minN is minimum legal resulting value for N byte sequence */
1264     min2 = 0x80,
1265     min3 = 0x800,
1266     min4 = 0x10000
1267   };
1268 
1269   if (c < 0)
1270     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1271   if (c < min2) {
1272     buf[0] = (char)(c | UTF8_cval1);
1273     return 1;
1274   }
1275   if (c < min3) {
1276     buf[0] = (char)((c >> 6) | UTF8_cval2);
1277     buf[1] = (char)((c & 0x3f) | 0x80);
1278     return 2;
1279   }
1280   if (c < min4) {
1281     buf[0] = (char)((c >> 12) | UTF8_cval3);
1282     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1283     buf[2] = (char)((c & 0x3f) | 0x80);
1284     return 3;
1285   }
1286   if (c < 0x110000) {
1287     buf[0] = (char)((c >> 18) | UTF8_cval4);
1288     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1289     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1290     buf[3] = (char)((c & 0x3f) | 0x80);
1291     return 4;
1292   }
1293   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1294 }
1295 
1296 int FASTCALL
1297 XmlUtf16Encode(int charNum, unsigned short *buf) {
1298   if (charNum < 0)
1299     return 0;
1300   if (charNum < 0x10000) {
1301     buf[0] = (unsigned short)charNum;
1302     return 1;
1303   }
1304   if (charNum < 0x110000) {
1305     charNum -= 0x10000;
1306     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1307     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1308     return 2;
1309   }
1310   return 0;
1311 }
1312 
/* State for a user-supplied encoding, filled in by XmlInitUnknownEncoding().
   `normal` is the first member and is returned to callers as an ENCODING;
   AS_UNKNOWN_ENCODING() casts such a pointer back to this struct. */
1313 struct unknown_encoding {
1314   struct normal_encoding normal;
  /* Caller's converter and its context; convert is invoked as
     convert(userData, p). */
1315   CONVERTER convert;
1316   void *userData;
  /* Per input byte: UTF-16 value, or 0 when the byte starts a multi-byte
     sequence that has to go through convert. */
1317   unsigned short utf16[256];
  /* Per input byte: [0] is the UTF-8 length (0 for multi-byte lead bytes),
     the encoded bytes follow from [1]. */
1318   char utf8[256][4];
1319 };
1320 
1321 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1322 
1323 int
1324 XmlSizeOfUnknownEncoding(void) {
1325   return sizeof(struct unknown_encoding);
1326 }
1327 
1328 static int PTRFASTCALL
1329 unknown_isName(const ENCODING *enc, const char *p) {
1330   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331   int c = uenc->convert(uenc->userData, p);
1332   if (c & ~0xFFFF)
1333     return 0;
1334   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1335 }
1336 
1337 static int PTRFASTCALL
1338 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1339   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1340   int c = uenc->convert(uenc->userData, p);
1341   if (c & ~0xFFFF)
1342     return 0;
1343   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1344 }
1345 
1346 static int PTRFASTCALL
1347 unknown_isInvalid(const ENCODING *enc, const char *p) {
1348   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1349   int c = uenc->convert(uenc->userData, p);
1350   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1351 }
1352 
1353 static enum XML_Convert_Result PTRCALL
1354 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1355                char **toP, const char *toLim) {
1356   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1357   char buf[XML_UTF8_ENCODE_MAX];
1358   for (;;) {
1359     const char *utf8;
1360     int n;
1361     if (*fromP == fromLim)
1362       return XML_CONVERT_COMPLETED;
1363     utf8 = uenc->utf8[(unsigned char)**fromP];
1364     n = *utf8++;
1365     if (n == 0) {
1366       int c = uenc->convert(uenc->userData, *fromP);
1367       n = XmlUtf8Encode(c, buf);
1368       if (n > toLim - *toP)
1369         return XML_CONVERT_OUTPUT_EXHAUSTED;
1370       utf8 = buf;
1371       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1372                  - (BT_LEAD2 - 2));
1373     } else {
1374       if (n > toLim - *toP)
1375         return XML_CONVERT_OUTPUT_EXHAUSTED;
1376       (*fromP)++;
1377     }
1378     memcpy(*toP, utf8, n);
1379     *toP += n;
1380   }
1381 }
1382 
1383 static enum XML_Convert_Result PTRCALL
1384 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1385                 unsigned short **toP, const unsigned short *toLim) {
1386   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1387   while (*fromP < fromLim && *toP < toLim) {
1388     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1389     if (c == 0) {
1390       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1391       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1392                  - (BT_LEAD2 - 2));
1393     } else
1394       (*fromP)++;
1395     *(*toP)++ = c;
1396   }
1397 
1398   if ((*toP == toLim) && (*fromP < fromLim))
1399     return XML_CONVERT_OUTPUT_EXHAUSTED;
1400   else
1401     return XML_CONVERT_COMPLETED;
1402 }
1403 
1404 ENCODING *
1405 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1406                        void *userData) {
1407   int i;
1408   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1409   memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1410   for (i = 0; i < 128; i++)
1411     if (latin1_encoding.type[i] != BT_OTHER
1412         && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1413       return 0;
1414   for (i = 0; i < 256; i++) {
1415     int c = table[i];
1416     if (c == -1) {
1417       e->normal.type[i] = BT_MALFORM;
1418       /* This shouldn't really get used. */
1419       e->utf16[i] = 0xFFFF;
1420       e->utf8[i][0] = 1;
1421       e->utf8[i][1] = 0;
1422     } else if (c < 0) {
1423       if (c < -4)
1424         return 0;
1425       /* Multi-byte sequences need a converter function */
1426       if (! convert)
1427         return 0;
1428       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1429       e->utf8[i][0] = 0;
1430       e->utf16[i] = 0;
1431     } else if (c < 0x80) {
1432       if (latin1_encoding.type[c] != BT_OTHER
1433           && latin1_encoding.type[c] != BT_NONXML && c != i)
1434         return 0;
1435       e->normal.type[i] = latin1_encoding.type[c];
1436       e->utf8[i][0] = 1;
1437       e->utf8[i][1] = (char)c;
1438       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1439     } else if (checkCharRefNumber(c) < 0) {
1440       e->normal.type[i] = BT_NONXML;
1441       /* This shouldn't really get used. */
1442       e->utf16[i] = 0xFFFF;
1443       e->utf8[i][0] = 1;
1444       e->utf8[i][1] = 0;
1445     } else {
1446       if (c > 0xFFFF)
1447         return 0;
1448       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1449         e->normal.type[i] = BT_NMSTRT;
1450       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1451         e->normal.type[i] = BT_NAME;
1452       else
1453         e->normal.type[i] = BT_OTHER;
1454       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1455       e->utf16[i] = (unsigned short)c;
1456     }
1457   }
1458   e->userData = userData;
1459   e->convert = convert;
1460   if (convert) {
1461     e->normal.isName2 = unknown_isName;
1462     e->normal.isName3 = unknown_isName;
1463     e->normal.isName4 = unknown_isName;
1464     e->normal.isNmstrt2 = unknown_isNmstrt;
1465     e->normal.isNmstrt3 = unknown_isNmstrt;
1466     e->normal.isNmstrt4 = unknown_isNmstrt;
1467     e->normal.isInvalid2 = unknown_isInvalid;
1468     e->normal.isInvalid3 = unknown_isInvalid;
1469     e->normal.isInvalid4 = unknown_isInvalid;
1470   }
1471   e->normal.enc.utf8Convert = unknown_toUtf8;
1472   e->normal.enc.utf16Convert = unknown_toUtf16;
1473   return &(e->normal.enc);
1474 }
1475 
/* Encoding indices.  getEncodingIndex() returns the position of a name in
   its encodingNames array as one of these values, NO_ENC for a NULL name
   and UNKNOWN_ENC for an unrecognised one. */
1476 /* If this enumeration is changed, getEncodingIndex and encodings
1477 must also be changed. */
1478 enum {
1479   UNKNOWN_ENC = -1,
1480   ISO_8859_1_ENC = 0,
1481   US_ASCII_ENC,
1482   UTF_8_ENC,
1483   UTF_16_ENC,
1484   UTF_16BE_ENC,
1485   UTF_16LE_ENC,
1486   /* must match encodingNames up to here */
1487   NO_ENC
1488 };
1489 
/* Upper-case, NUL-terminated encoding names spelled with the ASCII_*
   constants; getEncodingIndex() compares against them case-insensitively. */
1490 static const char KW_ISO_8859_1[]
1491     = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1492        ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1493 static const char KW_US_ASCII[]
1494     = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1495        ASCII_C, ASCII_I, ASCII_I,     '\0'};
1496 static const char KW_UTF_8[]
1497     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1498 static const char KW_UTF_16[]
1499     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1500 static const char KW_UTF_16BE[]
1501     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502        ASCII_6, ASCII_B, ASCII_E, '\0'};
1503 static const char KW_UTF_16LE[]
1504     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505        ASCII_6, ASCII_L, ASCII_E, '\0'};
1506 
1507 static int FASTCALL
1508 getEncodingIndex(const char *name) {
1509   static const char *const encodingNames[] = {
1510       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1511   };
1512   int i;
1513   if (name == NULL)
1514     return NO_ENC;
1515   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1516     if (streqci(name, encodingNames[i]))
1517       return i;
1518   return UNKNOWN_ENC;
1519 }
1520 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

/* Read the stored encoding index back as an int. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store encoding index i.  The argument is parenthesized so that the
   (char) cast applies to the whole expression passed in, not merely to
   its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1527 
1528 /* This is what detects the encoding.  encodingTable maps from
1529    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1530    the external (protocol) specified encoding; state is
1531    XML_CONTENT_STATE if we're parsing an external text entity, and
1532    XML_PROLOG_STATE otherwise.
1533 */
/* Returns XML_TOK_NONE when no input is available, XML_TOK_PARTIAL when
   more bytes are needed to decide, XML_TOK_BOM when a byte order mark was
   recognised (*nextTokPtr then points just past it), and otherwise the
   result of XmlTok() run with the selected encoding.  Whenever an encoding
   is selected it is stored through enc->encPtr.
*/
1534 
1535 static int
1536 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1537          int state, const char *ptr, const char *end, const char **nextTokPtr) {
1538   const ENCODING **encPtr;
1539 
1540   if (ptr >= end)
1541     return XML_TOK_NONE;
1542   encPtr = enc->encPtr;
1543   if (ptr + 1 == end) {
1544     /* only a single byte available for auto-detection */
1545 #ifndef XML_DTD /* FIXME */
1546     /* a well-formed document entity must have more than one byte */
1547     if (state != XML_CONTENT_STATE)
1548       return XML_TOK_PARTIAL;
1549 #endif
1550     /* so we're parsing an external text entity... */
1551     /* if UTF-16 was externally specified, then we need at least 2 bytes */
1552     switch (INIT_ENC_INDEX(enc)) {
1553     case UTF_16_ENC:
1554     case UTF_16LE_ENC:
1555     case UTF_16BE_ENC:
1556       return XML_TOK_PARTIAL;
1557     }
    /* A byte that could begin a BOM, a UTF-16 '<' or a NUL-led unit cannot
       be classified alone: ask for more input. */
1558     switch ((unsigned char)*ptr) {
1559     case 0xFE:
1560     case 0xFF:
1561     case 0xEF: /* possibly first byte of UTF-8 BOM */
1562       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1563         break;
1564       /* fall through */
1565     case 0x00:
1566     case 0x3C:
1567       return XML_TOK_PARTIAL;
1568     }
1569   } else {
    /* At least two bytes: dispatch on them read as a big-endian pair. */
1570     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1571     case 0xFEFF:
      /* FE FF: UTF-16 big-endian BOM */
1572       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1573         break;
1574       *nextTokPtr = ptr + 2;
1575       *encPtr = encodingTable[UTF_16BE_ENC];
1576       return XML_TOK_BOM;
1577     /* 00 3C is handled in the default case */
1578     case 0x3C00:
      /* 3C 00: '<' as a little-endian two-byte unit */
1579       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1580            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1581           && state == XML_CONTENT_STATE)
1582         break;
1583       *encPtr = encodingTable[UTF_16LE_ENC];
1584       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1585     case 0xFFFE:
      /* FF FE: UTF-16 little-endian BOM */
1586       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1587         break;
1588       *nextTokPtr = ptr + 2;
1589       *encPtr = encodingTable[UTF_16LE_ENC];
1590       return XML_TOK_BOM;
1591     case 0xEFBB:
1592       /* Maybe a UTF-8 BOM (EF BB BF) */
1593       /* If there's an explicitly specified (external) encoding
1594          of ISO-8859-1 or some flavour of UTF-16
1595          and this is an external text entity,
1596          don't look for the BOM,
1597          because it might be legal data.
1598       */
1599       if (state == XML_CONTENT_STATE) {
1600         int e = INIT_ENC_INDEX(enc);
1601         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1602             || e == UTF_16_ENC)
1603           break;
1604       }
      /* The third BOM byte has not arrived yet. */
1605       if (ptr + 2 == end)
1606         return XML_TOK_PARTIAL;
1607       if ((unsigned char)ptr[2] == 0xBF) {
1608         *nextTokPtr = ptr + 3;
1609         *encPtr = encodingTable[UTF_8_ENC];
1610         return XML_TOK_BOM;
1611       }
1612       break;
1613     default:
1614       if (ptr[0] == '\0') {
1615         /* 0 isn't a legal data character. Furthermore a document
1616            entity can only start with ASCII characters.  So the only
1617            way this can fail to be big-endian UTF-16 is if it's an
1618            external parsed general entity that's labelled as
1619            UTF-16LE.
1620         */
1621         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622           break;
1623         *encPtr = encodingTable[UTF_16BE_ENC];
1624         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625       } else if (ptr[1] == '\0') {
1626         /* We could recover here in the case:
1627             - parsing an external entity
1628             - second byte is 0
1629             - no externally specified encoding
1630             - no encoding declaration
1631            by assuming UTF-16LE.  But we don't, because this would mean when
1632            presented just with a single byte, we couldn't reliably determine
1633            whether we needed further bytes.
1634         */
1635         if (state == XML_CONTENT_STATE)
1636           break;
1637         *encPtr = encodingTable[UTF_16LE_ENC];
1638         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1639       }
1640       break;
1641     }
1642   }
  /* Nothing detected: use the encoding selected at initialization. */
1643   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1644   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1645 }
1646 
/* xmltok_ns.c is included as a template, parameterised by NS() and ns().
   First pass: both macros are the identity, so names inside the template
   are left unsuffixed.  (The template's contents are not visible here.) */
1647 #define NS(x) x
1648 #define ns(x) x
1649 #define XML_TOK_NS_C
1650 #include "xmltok_ns.c"
1651 #undef XML_TOK_NS_C
1652 #undef NS
1653 #undef ns
1654 
/* Second pass, namespace builds only: NS(x) appends "NS" and ns(x) appends
   "_ns" to the names the template produces or references. */
1655 #ifdef XML_NS
1656 
1657 #  define NS(x) x##NS
1658 #  define ns(x) x##_ns
1659 
1660 #  define XML_TOK_NS_C
1661 #  include "xmltok_ns.c"
1662 #  undef XML_TOK_NS_C
1663 
1664 #  undef NS
1665 #  undef ns
1666 
1667 ENCODING *
1668 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1669                          void *userData) {
1670   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1671   if (enc)
1672     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1673   return enc;
1674 }
1675 
1676 #endif /* XML_NS */
1677