xref: /freebsd/contrib/expat/lib/xmltok.c (revision 7029da5c36f2d3cf6bb6c81bf551229f416399e8)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000-2017 Expat development team
11    Licensed under the MIT license:
12 
13    Permission is  hereby granted,  free of charge,  to any  person obtaining
14    a  copy  of  this  software   and  associated  documentation  files  (the
15    "Software"),  to  deal in  the  Software  without restriction,  including
16    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17    distribute, sublicense, and/or sell copies of the Software, and to permit
18    persons  to whom  the Software  is  furnished to  do so,  subject to  the
19    following conditions:
20 
21    The above copyright  notice and this permission notice  shall be included
22    in all copies or substantial portions of the Software.
23 
24    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30    USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32 
33 #include <stddef.h>
34 #include <string.h> /* memcpy */
35 
36 #if defined(_MSC_VER) && (_MSC_VER <= 1700)
37 /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
38 #  define bool int
39 #  define false 0
40 #  define true 1
41 #else
42 #  include <stdbool.h>
43 #endif
44 
45 #ifdef _WIN32
46 #  include "winconfig.h"
47 #else
48 #  ifdef HAVE_EXPAT_CONFIG_H
49 #    include <expat_config.h>
50 #  endif
51 #endif /* ndef _WIN32 */
52 
53 #include "expat_external.h"
54 #include "internal.h"
55 #include "xmltok.h"
56 #include "nametab.h"
57 
58 #ifdef XML_DTD
59 #  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
60 #else
61 #  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
62 #endif
63 
64 #define VTABLE1                                                                \
65   {PREFIX(prologTok), PREFIX(contentTok),                                      \
66    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
67       {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
68       PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
69       PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
70       PREFIX(updatePosition), PREFIX(isPublicId)
71 
72 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
73 
/* Test the namingBitmap bit for the 16-bit character whose high byte is
   `hi` and whose low byte is `lo`.  (pages)[(hi)] selects a group of 8
   bitmap entries, the top 3 bits of `lo` select the entry inside that group
   and the bottom 5 bits of `lo` select the bit.  Evaluates to non-zero when
   the bit is set.  `lo` is evaluated twice, so it must have no side effects.
   Every parameter is parenthesized so that expressions such as `table + 1`
   can be passed safely. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
76 
/* Naming-bitmap lookup for a 2-byte UTF-8 sequence.  `byte` must point at
   unsigned bytes.  The character bits are the low 5 bits of byte[0] and the
   low 6 bits of byte[1]: the top 3 of them index `pages`, the next 3 choose
   one of the 8 bitmap entries of that page and the last 5 choose the bit.
   Evaluates to non-zero when the bit is set. */
#define UTF8_GET_NAMING2(pages, byte)                                          \
  ((1u << ((byte)[1] & 0x1F))                                                  \
   & namingBitmap[((pages)[((byte)[0] >> 2) & 0x7] << 3)                       \
                  + (((byte)[0] & 0x3) << 1) + (((byte)[1] >> 5) & 0x1)])
85 
/* Naming-bitmap lookup for a 3-byte UTF-8 sequence.  `byte` must point at
   unsigned bytes.  The 16 character bits are the low 4 bits of byte[0] and
   the low 6 bits of byte[1] and byte[2]: the top 8 index `pages`, the next 3
   choose one of the 8 bitmap entries of that page and the last 5 choose the
   bit.  Evaluates to non-zero when the bit is set. */
#define UTF8_GET_NAMING3(pages, byte)                                          \
  ((1u << ((byte)[2] & 0x1F))                                                  \
   & namingBitmap[((pages)[(((byte)[0] & 0xF) << 4)                            \
                           + (((byte)[1] >> 2) & 0xF)]                         \
                   << 3)                                                       \
                  + (((byte)[1] & 0x3) << 1) + (((byte)[2] >> 5) & 0x1)])
97 
/* Naming-bitmap lookup for the n-byte UTF-8 sequence at p.  Only n == 2 and
   n == 3 are looked up; any other length evaluates to 0.  p may be a plain
   char pointer, it is cast to unsigned bytes here. */
#define UTF8_GET_NAMING(pages, p, n)                                           \
  ((n) == 3                                                                    \
       ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p))                   \
       : ((n) == 2 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) : 0))
102 
103 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
104    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
105    with the additional restriction of not allowing the Unicode
106    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
107    Implementation details:
108      (A & 0x80) == 0     means A < 0x80
109    and
110      (A & 0xC0) == 0xC0  means A > 0xBF
111 */
112 
/* Non-zero if the 2-byte sequence at p (a pointer to unsigned bytes) is not
   valid UTF-8: the lead byte is below 0xC2, or the second byte lies outside
   0x80..0xBF.  The parameter is fully parenthesized so that pointer
   expressions such as `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID2(p)                                                       \
  (*(p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
115 
/* Non-zero if the 3-byte sequence at p (a pointer to unsigned bytes) is not
   acceptable:
     - the third byte lies outside 0x80..0xBF, or is above 0xBD after
       EF,BF (which rejects EF,BF,BE and EF,BF,BF);
     - after lead byte 0xE0 the second byte lies outside 0xA0..0xBF;
     - after lead byte 0xED the second byte lies outside 0x80..0x9F;
     - after any other lead byte the second byte lies outside 0x80..0xBF.
   The parameter is fully parenthesized so that pointer expressions such as
   `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || (*(p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
                                      : ((p)[2] & 0xC0) == 0xC0)               \
   || (*(p) == 0xE0                                                            \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || (*(p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
124 
/* Non-zero if the 4-byte sequence at p (a pointer to unsigned bytes) is not
   valid UTF-8:
     - the third or fourth byte lies outside 0x80..0xBF;
     - after lead byte 0xF0 the second byte lies outside 0x90..0xBF;
     - after lead byte 0xF4 the second byte lies outside 0x80..0x8F;
     - after any other lead byte the second byte lies outside 0x80..0xBF.
   The parameter is fully parenthesized so that pointer expressions such as
   `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || (*(p) == 0xF0                                                            \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || (*(p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
132 
133 static int PTRFASTCALL
134 isNever(const ENCODING *enc, const char *p) {
135   UNUSED_P(enc);
136   UNUSED_P(p);
137   return 0;
138 }
139 
140 static int PTRFASTCALL
141 utf8_isName2(const ENCODING *enc, const char *p) {
142   UNUSED_P(enc);
143   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
144 }
145 
146 static int PTRFASTCALL
147 utf8_isName3(const ENCODING *enc, const char *p) {
148   UNUSED_P(enc);
149   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
150 }
151 
152 #define utf8_isName4 isNever
153 
154 static int PTRFASTCALL
155 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
156   UNUSED_P(enc);
157   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
158 }
159 
160 static int PTRFASTCALL
161 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
162   UNUSED_P(enc);
163   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
164 }
165 
166 #define utf8_isNmstrt4 isNever
167 
168 static int PTRFASTCALL
169 utf8_isInvalid2(const ENCODING *enc, const char *p) {
170   UNUSED_P(enc);
171   return UTF8_INVALID2((const unsigned char *)p);
172 }
173 
174 static int PTRFASTCALL
175 utf8_isInvalid3(const ENCODING *enc, const char *p) {
176   UNUSED_P(enc);
177   return UTF8_INVALID3((const unsigned char *)p);
178 }
179 
180 static int PTRFASTCALL
181 utf8_isInvalid4(const ENCODING *enc, const char *p) {
182   UNUSED_P(enc);
183   return UTF8_INVALID4((const unsigned char *)p);
184 }
185 
186 struct normal_encoding {
187   ENCODING enc;
188   unsigned char type[256];
189 #ifdef XML_MIN_SIZE
190   int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
191   int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
192   int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
193   int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
194   int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
195 #endif /* XML_MIN_SIZE */
196   int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
197   int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
198   int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
199   int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
200   int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
201   int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
202   int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
203   int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
204   int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
205 };
206 
207 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
208 
209 #ifdef XML_MIN_SIZE
210 
211 #  define STANDARD_VTABLE(E)                                                   \
212     E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
213 
214 #else
215 
216 #  define STANDARD_VTABLE(E) /* as nothing */
217 
218 #endif
219 
220 #define NORMAL_VTABLE(E)                                                       \
221   E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
222       E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
223 
224 #define NULL_VTABLE                                                            \
225   /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
226       /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
227       /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
228 
229 static int FASTCALL checkCharRefNumber(int);
230 
231 #include "xmltok_impl.h"
232 #include "ascii.h"
233 
234 #ifdef XML_MIN_SIZE
235 #  define sb_isNameMin isNever
236 #  define sb_isNmstrtMin isNever
237 #endif
238 
239 #ifdef XML_MIN_SIZE
240 #  define MINBPC(enc) ((enc)->minBytesPerChar)
241 #else
242 /* minimum bytes per character */
243 #  define MINBPC(enc) 1
244 #endif
245 
/* Byte type of the single byte at p, read from the type[] table of the
   normal_encoding that `enc` points to.  Uses AS_NORMAL_ENCODING so that the
   const qualifier of `enc` is kept instead of being cast away. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])
248 
249 #ifdef XML_MIN_SIZE
250 static int PTRFASTCALL
251 sb_byteType(const ENCODING *enc, const char *p) {
252   return SB_BYTE_TYPE(enc, p);
253 }
254 #  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
255 #else
256 #  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
257 #endif
258 
259 #ifdef XML_MIN_SIZE
260 #  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
261 static int PTRFASTCALL
262 sb_byteToAscii(const ENCODING *enc, const char *p) {
263   UNUSED_P(enc);
264   return *p;
265 }
266 #else
267 #  define BYTE_TO_ASCII(enc, p) (*(p))
268 #endif
269 
270 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
271 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
272 #define IS_INVALID_CHAR(enc, p, n)                                             \
273   (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
274 
275 #ifdef XML_MIN_SIZE
276 #  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
277     (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
278 #  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
279     (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
280 #else
281 #  define IS_NAME_CHAR_MINBPC(enc, p) (0)
282 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
283 #endif
284 
285 #ifdef XML_MIN_SIZE
286 #  define CHAR_MATCHES(enc, p, c)                                              \
287     (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
288 static int PTRCALL
289 sb_charMatches(const ENCODING *enc, const char *p, int c) {
290   UNUSED_P(enc);
291   return *p == c;
292 }
293 #else
294 /* c is an ASCII character */
295 #  define CHAR_MATCHES(enc, p, c) (*(p) == c)
296 #endif
297 
298 #define PREFIX(ident) normal_##ident
299 #define XML_TOK_IMPL_C
300 #include "xmltok_impl.c"
301 #undef XML_TOK_IMPL_C
302 
303 #undef MINBPC
304 #undef BYTE_TYPE
305 #undef BYTE_TO_ASCII
306 #undef CHAR_MATCHES
307 #undef IS_NAME_CHAR
308 #undef IS_NAME_CHAR_MINBPC
309 #undef IS_NMSTRT_CHAR
310 #undef IS_NMSTRT_CHAR_MINBPC
311 #undef IS_INVALID_CHAR
312 
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits that are OR-ed onto the payload bits. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
319 
/* Move *fromLimRef backwards so that the range [from, *fromLimRef) does not
   end in the middle of a multi-byte UTF-8 character.

   The range is scanned from its end towards `from`.  An ASCII byte stops
   the scan at the current position.  A lead byte stops the scan as well if
   enough bytes follow it, and the limit is then placed right behind that
   character; otherwise the incomplete character is dropped and the scan
   continues in front of it.  Any other byte is stepped over.  If nothing
   stops the scan the limit ends up equal to `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen = 0; /* stays 0 unless `last` is a 2-, 3- or 4-byte lead */

    if ((last & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((last & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    } else if ((last & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((last & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    }

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* complete character: keep it and stop right behind it */
        end += seqLen - 1;
        break;
      }
      /* incomplete character: drop it and keep scanning */
      stepped = 0;
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
358 
359 static enum XML_Convert_Result PTRCALL
360 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
361             char **toP, const char *toLim) {
362   bool input_incomplete = false;
363   bool output_exhausted = false;
364 
365   /* Avoid copying partial characters (due to limited space). */
366   const ptrdiff_t bytesAvailable = fromLim - *fromP;
367   const ptrdiff_t bytesStorable = toLim - *toP;
368   UNUSED_P(enc);
369   if (bytesAvailable > bytesStorable) {
370     fromLim = *fromP + bytesStorable;
371     output_exhausted = true;
372   }
373 
374   /* Avoid copying partial characters (from incomplete input). */
375   {
376     const char *const fromLimBefore = fromLim;
377     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
378     if (fromLim < fromLimBefore) {
379       input_incomplete = true;
380     }
381   }
382 
383   {
384     const ptrdiff_t bytesToCopy = fromLim - *fromP;
385     memcpy(*toP, *fromP, bytesToCopy);
386     *fromP += bytesToCopy;
387     *toP += bytesToCopy;
388   }
389 
390   if (output_exhausted) /* needs to go first */
391     return XML_CONVERT_OUTPUT_EXHAUSTED;
392   else if (input_incomplete)
393     return XML_CONVERT_INPUT_INCOMPLETE;
394   else
395     return XML_CONVERT_COMPLETED;
396 }
397 
398 static enum XML_Convert_Result PTRCALL
399 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
400              unsigned short **toP, const unsigned short *toLim) {
401   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
402   unsigned short *to = *toP;
403   const char *from = *fromP;
404   while (from < fromLim && to < toLim) {
405     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
406     case BT_LEAD2:
407       if (fromLim - from < 2) {
408         res = XML_CONVERT_INPUT_INCOMPLETE;
409         goto after;
410       }
411       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
412       from += 2;
413       break;
414     case BT_LEAD3:
415       if (fromLim - from < 3) {
416         res = XML_CONVERT_INPUT_INCOMPLETE;
417         goto after;
418       }
419       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
420                                | (from[2] & 0x3f));
421       from += 3;
422       break;
423     case BT_LEAD4: {
424       unsigned long n;
425       if (toLim - to < 2) {
426         res = XML_CONVERT_OUTPUT_EXHAUSTED;
427         goto after;
428       }
429       if (fromLim - from < 4) {
430         res = XML_CONVERT_INPUT_INCOMPLETE;
431         goto after;
432       }
433       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
434           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
435       n -= 0x10000;
436       to[0] = (unsigned short)((n >> 10) | 0xD800);
437       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
438       to += 2;
439       from += 4;
440     } break;
441     default:
442       *to++ = *from++;
443       break;
444     }
445   }
446   if (from < fromLim)
447     res = XML_CONVERT_OUTPUT_EXHAUSTED;
448 after:
449   *fromP = from;
450   *toP = to;
451   return res;
452 }
453 
454 #ifdef XML_NS
455 static const struct normal_encoding utf8_encoding_ns
456     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
457        {
458 #  include "asciitab.h"
459 #  include "utf8tab.h"
460        },
461        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
462 #endif
463 
464 static const struct normal_encoding utf8_encoding
465     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
466        {
467 #define BT_COLON BT_NMSTRT
468 #include "asciitab.h"
469 #undef BT_COLON
470 #include "utf8tab.h"
471        },
472        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
473 
474 #ifdef XML_NS
475 
476 static const struct normal_encoding internal_utf8_encoding_ns
477     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
478        {
479 #  include "iasciitab.h"
480 #  include "utf8tab.h"
481        },
482        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
483 
484 #endif
485 
486 static const struct normal_encoding internal_utf8_encoding
487     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
488        {
489 #define BT_COLON BT_NMSTRT
490 #include "iasciitab.h"
491 #undef BT_COLON
492 #include "utf8tab.h"
493        },
494        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
495 
496 static enum XML_Convert_Result PTRCALL
497 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
498               char **toP, const char *toLim) {
499   UNUSED_P(enc);
500   for (;;) {
501     unsigned char c;
502     if (*fromP == fromLim)
503       return XML_CONVERT_COMPLETED;
504     c = (unsigned char)**fromP;
505     if (c & 0x80) {
506       if (toLim - *toP < 2)
507         return XML_CONVERT_OUTPUT_EXHAUSTED;
508       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
509       *(*toP)++ = (char)((c & 0x3f) | 0x80);
510       (*fromP)++;
511     } else {
512       if (*toP == toLim)
513         return XML_CONVERT_OUTPUT_EXHAUSTED;
514       *(*toP)++ = *(*fromP)++;
515     }
516   }
517 }
518 
519 static enum XML_Convert_Result PTRCALL
520 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
521                unsigned short **toP, const unsigned short *toLim) {
522   UNUSED_P(enc);
523   while (*fromP < fromLim && *toP < toLim)
524     *(*toP)++ = (unsigned char)*(*fromP)++;
525 
526   if ((*toP == toLim) && (*fromP < fromLim))
527     return XML_CONVERT_OUTPUT_EXHAUSTED;
528   else
529     return XML_CONVERT_COMPLETED;
530 }
531 
532 #ifdef XML_NS
533 
534 static const struct normal_encoding latin1_encoding_ns
535     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
536        {
537 #  include "asciitab.h"
538 #  include "latin1tab.h"
539        },
540        STANDARD_VTABLE(sb_) NULL_VTABLE};
541 
542 #endif
543 
544 static const struct normal_encoding latin1_encoding
545     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
546        {
547 #define BT_COLON BT_NMSTRT
548 #include "asciitab.h"
549 #undef BT_COLON
550 #include "latin1tab.h"
551        },
552        STANDARD_VTABLE(sb_) NULL_VTABLE};
553 
554 static enum XML_Convert_Result PTRCALL
555 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
556              char **toP, const char *toLim) {
557   UNUSED_P(enc);
558   while (*fromP < fromLim && *toP < toLim)
559     *(*toP)++ = *(*fromP)++;
560 
561   if ((*toP == toLim) && (*fromP < fromLim))
562     return XML_CONVERT_OUTPUT_EXHAUSTED;
563   else
564     return XML_CONVERT_COMPLETED;
565 }
566 
567 #ifdef XML_NS
568 
569 static const struct normal_encoding ascii_encoding_ns
570     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
571        {
572 #  include "asciitab.h"
573            /* BT_NONXML == 0 */
574        },
575        STANDARD_VTABLE(sb_) NULL_VTABLE};
576 
577 #endif
578 
579 static const struct normal_encoding ascii_encoding
580     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
581        {
582 #define BT_COLON BT_NMSTRT
583 #include "asciitab.h"
584 #undef BT_COLON
585            /* BT_NONXML == 0 */
586        },
587        STANDARD_VTABLE(sb_) NULL_VTABLE};
588 
589 static int PTRFASTCALL
590 unicode_byte_type(char hi, char lo) {
591   switch ((unsigned char)hi) {
592   /* 0xD800–0xDBFF first 16-bit code unit or high surrogate (W1) */
593   case 0xD8:
594   case 0xD9:
595   case 0xDA:
596   case 0xDB:
597     return BT_LEAD4;
598   /* 0xDC00–0xDFFF second 16-bit code unit or low surrogate (W2) */
599   case 0xDC:
600   case 0xDD:
601   case 0xDE:
602   case 0xDF:
603     return BT_TRAIL;
604   case 0xFF:
605     switch ((unsigned char)lo) {
606     case 0xFF: /* noncharacter-FFFF */
607     case 0xFE: /* noncharacter-FFFE */
608       return BT_NONXML;
609     }
610     break;
611   }
612   return BT_NONASCII;
613 }
614 
/* Define the static function E##toUtf8, which converts 16-bit units to
   UTF-8.  GET_LO and GET_HI must be defined at the point of expansion; they
   select the low and high byte of the unit at a given position.

   Units with high byte 0 and low byte below 0x80 produce 1 byte, other
   units with high byte 0..7 produce 2 bytes, units with high byte
   0xD8..0xDB are combined with the following unit into 4 bytes and every
   other unit produces 3 bytes.  A trailing odd input byte is ignored.

   *toP is advanced as bytes are written; *fromP is stored on return.  When
   a character does not fit into the output, *fromP is left at that
   character and XML_CONVERT_OUTPUT_EXHAUSTED is returned.  When the unit
   following a 0xD8..0xDB unit is missing, XML_CONVERT_INPUT_INCOMPLETE is
   returned. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    /* only whole units are converted: shrink the input to an even size */     \
    fromLim = from + (((fromLim - from) >> 1) << 1);                           \
    while (from < fromLim) {                                                   \
      const unsigned char lo = GET_LO(from);                                   \
      const unsigned char hi = GET_HI(from);                                   \
      unsigned char trailLo;                                                   \
      int plane;                                                               \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          /* 7 bits: a single byte */                                          \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        /* up to 11 bits divided 5, 6 amongst 2 bytes */                       \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        /* two units are combined into 4 bytes */                              \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2; /* step to the second unit of the pair */                   \
        trailLo = GET_LO(from);                                                \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (trailLo >> 6) | 0x80);                                 \
        *(*toP)++ = ((trailLo & 0x3f) | 0x80);                                 \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      }                                                                        \
      from += 2;                                                               \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
691 
/* Define the static function E##toUtf16, which copies 16-bit units from a
   byte buffer into an unsigned short buffer.  GET_LO and GET_HI must be
   defined at the point of expansion; they select the low and high byte of
   the unit at a given position.  A trailing odd input byte is ignored.

   *fromP and *toP are advanced as units are copied.  Returns
   XML_CONVERT_OUTPUT_EXHAUSTED if the output filled up while input was
   still left; otherwise XML_CONVERT_INPUT_INCOMPLETE if the last unit was
   held back (see below), else XML_CONVERT_COMPLETED. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result result = XML_CONVERT_COMPLETED;                    \
    UNUSED_P(enc);                                                             \
    /* only whole units are converted: shrink the input to an even size */     \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);                       \
    /* Avoid copying first half only of surrogate: with more input than */     \
    /* output room, hold back a last unit whose high byte is 0xD8..0xDF */     \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      result = XML_CONVERT_INPUT_INCOMPLETE;                                   \
    }                                                                          \
    while (*fromP < fromLim && *toP < toLim) {                                 \
      **toP = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                          \
      ++*toP;                                                                  \
      *fromP += 2;                                                             \
    }                                                                          \
    if (*toP != toLim || *fromP >= fromLim)                                    \
      return result;                                                           \
    return XML_CONVERT_OUTPUT_EXHAUSTED;                                       \
  }
712 
713 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
714 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
715 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
716 
717 DEFINE_UTF16_TO_UTF8(little2_)
718 DEFINE_UTF16_TO_UTF16(little2_)
719 
720 #undef SET2
721 #undef GET_LO
722 #undef GET_HI
723 
724 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
725 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
726 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
727 
728 DEFINE_UTF16_TO_UTF8(big2_)
729 DEFINE_UTF16_TO_UTF16(big2_)
730 
731 #undef SET2
732 #undef GET_LO
733 #undef GET_HI
734 
/* Byte type of the little-endian 16-bit unit at p.  If the high byte
   (p)[1] is zero the low byte is looked up in the type[] table of enc,
   otherwise the unit is classified by unicode_byte_type().  Uses
   AS_NORMAL_ENCODING so that the const qualifier of enc is kept instead of
   being cast away. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)]            \
               : unicode_byte_type((p)[1], (p)[0]))
/* Helpers for the little-endian 16-bit unit at p (low byte first).  All
   parameters are parenthesized so that arbitrary expressions can be passed
   for p and c. */

/* The low byte if the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero if the high byte is zero and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups (namePages / nmstrtPages) for the unit at p. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
744 
/* Provide the UTF-16LE character-classification hooks in one of two ways,
   selected by XML_MIN_SIZE. */
745 #ifdef XML_MIN_SIZE
746 
/* XML_MIN_SIZE build: wrap each LITTLE2_* accessor macro in a small static
   function with the little2_ prefix. */

/* Byte type of the code unit at p (see LITTLE2_BYTE_TYPE). */
747 static int PTRFASTCALL
748 little2_byteType(const ENCODING *enc, const char *p) {
749   return LITTLE2_BYTE_TYPE(enc, p);
750 }
751 
/* p[0] when the high byte is zero, otherwise -1; enc is unused. */
752 static int PTRFASTCALL
753 little2_byteToAscii(const ENCODING *enc, const char *p) {
754   UNUSED_P(enc);
755   return LITTLE2_BYTE_TO_ASCII(p);
756 }
757 
/* Non-zero when the code unit at p equals c; enc is unused. */
758 static int PTRCALL
759 little2_charMatches(const ENCODING *enc, const char *p, int c) {
760   UNUSED_P(enc);
761   return LITTLE2_CHAR_MATCHES(p, c);
762 }
763 
/* Name-character lookup for the code unit at p; enc is unused. */
764 static int PTRFASTCALL
765 little2_isNameMin(const ENCODING *enc, const char *p) {
766   UNUSED_P(enc);
767   return LITTLE2_IS_NAME_CHAR_MINBPC(p);
768 }
769 
/* Name-start-character lookup for the code unit at p; enc is unused. */
770 static int PTRFASTCALL
771 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
772   UNUSED_P(enc);
773   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
774 }
775 
/* VTABLE for the encoding tables that follow: VTABLE1 (defined outside this
   section) plus the little-endian converters. */
776 #  undef VTABLE
777 #  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
778 
779 #else /* not XML_MIN_SIZE */
780 
/* Regular build: define PREFIX and the per-encoding accessor macros, then
   include xmltok_impl.c, which presumably expands into the little2_-prefixed
   tokenizer functions (the included file is not visible here). */
781 #  undef PREFIX
782 #  define PREFIX(ident) little2_##ident
783 #  define MINBPC(enc) 2
784 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
785 #  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
786 #  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
787 #  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The multi-byte variants are constant 0 here; only the *_MINBPC forms do a
   real lookup. */
788 #  define IS_NAME_CHAR(enc, p, n) 0
789 #  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
790 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
791 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
792 
793 #  define XML_TOK_IMPL_C
794 #  include "xmltok_impl.c"
795 #  undef XML_TOK_IMPL_C
796 
/* Clean up so the next encoding can define its own versions. */
797 #  undef MINBPC
798 #  undef BYTE_TYPE
799 #  undef BYTE_TO_ASCII
800 #  undef CHAR_MATCHES
801 #  undef IS_NAME_CHAR
802 #  undef IS_NAME_CHAR_MINBPC
803 #  undef IS_NMSTRT_CHAR
804 #  undef IS_NMSTRT_CHAR_MINBPC
/* NOTE(review): IS_INVALID_CHAR is not defined in this branch, so this #undef
   is a harmless no-op unless an earlier part of the file left it defined. */
805 #  undef IS_INVALID_CHAR
806 
807 #endif /* not XML_MIN_SIZE */
808 
/* Namespace-aware UTF-16LE encoding table; only built when XML_NS is defined.
   The first brace group is VTABLE followed by 2, 0 and a flag that is 1 only
   when BYTEORDER == 1234 (presumably minBytesPerChar, isUtf8 and isUtf16 --
   confirm against the ENCODING declaration).  The second group is the byte
   type table assembled from asciitab.h and latin1tab.h. */
809 #ifdef XML_NS
810 
811 static const struct normal_encoding little2_encoding_ns
812     = {{VTABLE, 2, 0,
813 #  if BYTEORDER == 1234
814         1
815 #  else
816         0
817 #  endif
818        },
819        {
820 #  include "asciitab.h"
821 #  include "latin1tab.h"
822        },
823        STANDARD_VTABLE(little2_) NULL_VTABLE};
824 
825 #endif
826 
/* UTF-16LE encoding table without namespace processing.  Same layout as the
   _ns variant: VTABLE, then 2, 0 and a flag that is 1 only when
   BYTEORDER == 1234 (presumably minBytesPerChar, isUtf8, isUtf16 -- confirm
   against the ENCODING declaration). */
827 static const struct normal_encoding little2_encoding
828     = {{VTABLE, 2, 0,
829 #if BYTEORDER == 1234
830         1
831 #else
832         0
833 #endif
834        },
835        {
/* While asciitab.h is being included BT_COLON expands to BT_NMSTRT, so any
   entry written as BT_COLON there ends up as BT_NMSTRT in this table. */
836 #define BT_COLON BT_NMSTRT
837 #include "asciitab.h"
838 #undef BT_COLON
839 #include "latin1tab.h"
840        },
841        STANDARD_VTABLE(little2_) NULL_VTABLE};
842 
/* "Internal" UTF-16LE encodings, omitted when BYTEORDER == 4321.  They differ
   from little2_encoding(_ns) in using iasciitab.h instead of asciitab.h and in
   always having the last header value set to 1. */
843 #if BYTEORDER != 4321
844 
845 #  ifdef XML_NS
846 
/* Namespace-aware variant (XML_NS only). */
847 static const struct normal_encoding internal_little2_encoding_ns
848     = {{VTABLE, 2, 0, 1},
849        {
850 #    include "iasciitab.h"
851 #    include "latin1tab.h"
852        },
853        STANDARD_VTABLE(little2_) NULL_VTABLE};
854 
855 #  endif
856 
/* Non-namespace variant: BT_COLON expands to BT_NMSTRT while iasciitab.h is
   included. */
857 static const struct normal_encoding internal_little2_encoding
858     = {{VTABLE, 2, 0, 1},
859        {
860 #  define BT_COLON BT_NMSTRT
861 #  include "iasciitab.h"
862 #  undef BT_COLON
863 #  include "latin1tab.h"
864        },
865        STANDARD_VTABLE(little2_) NULL_VTABLE};
866 
867 #endif
868 
/* Accessors for big-endian UTF-16 input: p[0] is the high-order byte and p[1]
   the low-order byte of the code unit at p.  Every macro parameter is
   parenthesized so that expression arguments expand correctly. */

/* Byte type of the code unit at p.  When the high byte is zero the type comes
   from the encoding's own table indexed by the low byte; otherwise it is
   computed by unicode_byte_type(high, low). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
/* Yields p[1] when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the code unit at p has a zero high byte and its low byte
   equals c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Look the code unit up in the name-character bitmap (high byte selects the
   page, low byte the entry). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Same lookup against the name-start-character bitmap. */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
879 
/* Provide the UTF-16BE character-classification hooks in one of two ways,
   selected by XML_MIN_SIZE. */
880 #ifdef XML_MIN_SIZE
881 
/* XML_MIN_SIZE build: wrap each BIG2_* accessor macro in a small static
   function with the big2_ prefix. */

/* Byte type of the code unit at p (see BIG2_BYTE_TYPE). */
882 static int PTRFASTCALL
883 big2_byteType(const ENCODING *enc, const char *p) {
884   return BIG2_BYTE_TYPE(enc, p);
885 }
886 
/* p[1] when the high byte is zero, otherwise -1; enc is unused. */
887 static int PTRFASTCALL
888 big2_byteToAscii(const ENCODING *enc, const char *p) {
889   UNUSED_P(enc);
890   return BIG2_BYTE_TO_ASCII(p);
891 }
892 
/* Non-zero when the code unit at p equals c; enc is unused. */
893 static int PTRCALL
894 big2_charMatches(const ENCODING *enc, const char *p, int c) {
895   UNUSED_P(enc);
896   return BIG2_CHAR_MATCHES(p, c);
897 }
898 
/* Name-character lookup for the code unit at p; enc is unused. */
899 static int PTRFASTCALL
900 big2_isNameMin(const ENCODING *enc, const char *p) {
901   UNUSED_P(enc);
902   return BIG2_IS_NAME_CHAR_MINBPC(p);
903 }
904 
/* Name-start-character lookup for the code unit at p; enc is unused. */
905 static int PTRFASTCALL
906 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
907   UNUSED_P(enc);
908   return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
909 }
910 
/* VTABLE for the encoding tables that follow: VTABLE1 (defined outside this
   section) plus the big-endian converters. */
911 #  undef VTABLE
912 #  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
913 
914 #else /* not XML_MIN_SIZE */
915 
/* Regular build: define PREFIX and the per-encoding accessor macros, then
   include xmltok_impl.c, which presumably expands into the big2_-prefixed
   tokenizer functions (the included file is not visible here). */
916 #  undef PREFIX
917 #  define PREFIX(ident) big2_##ident
918 #  define MINBPC(enc) 2
919 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
920 #  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
921 #  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
922 #  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The multi-byte variants are constant 0 here; only the *_MINBPC forms do a
   real lookup. */
923 #  define IS_NAME_CHAR(enc, p, n) 0
924 #  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
925 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
926 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
927 
928 #  define XML_TOK_IMPL_C
929 #  include "xmltok_impl.c"
930 #  undef XML_TOK_IMPL_C
931 
/* Clean up the per-encoding macros. */
932 #  undef MINBPC
933 #  undef BYTE_TYPE
934 #  undef BYTE_TO_ASCII
935 #  undef CHAR_MATCHES
936 #  undef IS_NAME_CHAR
937 #  undef IS_NAME_CHAR_MINBPC
938 #  undef IS_NMSTRT_CHAR
939 #  undef IS_NMSTRT_CHAR_MINBPC
/* NOTE(review): IS_INVALID_CHAR is not defined in this branch, so this #undef
   is a harmless no-op unless an earlier part of the file left it defined. */
940 #  undef IS_INVALID_CHAR
941 
942 #endif /* not XML_MIN_SIZE */
943 
/* Namespace-aware UTF-16BE encoding table; only built when XML_NS is defined.
   The first brace group is VTABLE followed by 2, 0 and a flag that is 1 only
   when BYTEORDER == 4321 (presumably minBytesPerChar, isUtf8 and isUtf16 --
   confirm against the ENCODING declaration).  The second group is the byte
   type table assembled from asciitab.h and latin1tab.h. */
944 #ifdef XML_NS
945 
946 static const struct normal_encoding big2_encoding_ns
947     = {{VTABLE, 2, 0,
948 #  if BYTEORDER == 4321
949         1
950 #  else
951         0
952 #  endif
953        },
954        {
955 #  include "asciitab.h"
956 #  include "latin1tab.h"
957        },
958        STANDARD_VTABLE(big2_) NULL_VTABLE};
959 
960 #endif
961 
/* UTF-16BE encoding table without namespace processing.  Same layout as the
   _ns variant: VTABLE, then 2, 0 and a flag that is 1 only when
   BYTEORDER == 4321 (presumably minBytesPerChar, isUtf8, isUtf16 -- confirm
   against the ENCODING declaration). */
962 static const struct normal_encoding big2_encoding
963     = {{VTABLE, 2, 0,
964 #if BYTEORDER == 4321
965         1
966 #else
967         0
968 #endif
969        },
970        {
/* While asciitab.h is being included BT_COLON expands to BT_NMSTRT, so any
   entry written as BT_COLON there ends up as BT_NMSTRT in this table. */
971 #define BT_COLON BT_NMSTRT
972 #include "asciitab.h"
973 #undef BT_COLON
974 #include "latin1tab.h"
975        },
976        STANDARD_VTABLE(big2_) NULL_VTABLE};
977 
/* "Internal" UTF-16BE encodings, omitted when BYTEORDER == 1234.  They differ
   from big2_encoding(_ns) in using iasciitab.h instead of asciitab.h and in
   always having the last header value set to 1. */
978 #if BYTEORDER != 1234
979 
980 #  ifdef XML_NS
981 
/* Namespace-aware variant (XML_NS only). */
982 static const struct normal_encoding internal_big2_encoding_ns
983     = {{VTABLE, 2, 0, 1},
984        {
985 #    include "iasciitab.h"
986 #    include "latin1tab.h"
987        },
988        STANDARD_VTABLE(big2_) NULL_VTABLE};
989 
990 #  endif
991 
/* Non-namespace variant: BT_COLON expands to BT_NMSTRT while iasciitab.h is
   included. */
992 static const struct normal_encoding internal_big2_encoding
993     = {{VTABLE, 2, 0, 1},
994        {
995 #  define BT_COLON BT_NMSTRT
996 #  include "iasciitab.h"
997 #  undef BT_COLON
998 #  include "latin1tab.h"
999        },
1000        STANDARD_VTABLE(big2_) NULL_VTABLE};
1001 
1002 #endif
1003 
1004 #undef PREFIX
1005 
1006 static int FASTCALL
1007 streqci(const char *s1, const char *s2) {
1008   for (;;) {
1009     char c1 = *s1++;
1010     char c2 = *s2++;
1011     if (ASCII_a <= c1 && c1 <= ASCII_z)
1012       c1 += ASCII_A - ASCII_a;
1013     if (ASCII_a <= c2 && c2 <= ASCII_z)
1014       /* The following line will never get executed.  streqci() is
1015        * only called from two places, both of which guarantee to put
1016        * upper-case strings into s2.
1017        */
1018       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019     if (c1 != c2)
1020       return 0;
1021     if (! c1)
1022       break;
1023   }
1024   return 1;
1025 }
1026 
/* Position-update hook that ignores the encoding it is given and always
   delegates to normal_updatePosition() using utf8_encoding.  Judging by the
   name it is presumably installed on the initial (auto-detecting) encoding --
   confirm at the place where it is registered. */
1027 static void PTRCALL
1028 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1029                    POSITION *pos) {
1030   UNUSED_P(enc);
1031   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1032 }
1033 
1034 static int
1035 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036   char buf[1];
1037   char *p = buf;
1038   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039   if (p == buf)
1040     return -1;
1041   else
1042     return buf[0];
1043 }
1044 
1045 static int FASTCALL
1046 isSpace(int c) {
1047   switch (c) {
1048   case 0x20:
1049   case 0xD:
1050   case 0xA:
1051   case 0x9:
1052     return 1;
1053   }
1054   return 0;
1055 }
1056 
/* Parse one pseudo-attribute (name = "value" or name = 'value') starting at
   ptr and not reading past end.

   Outputs:
     *namePtr     start of the name, or NULL when only white space (or
                  nothing) remained;
     *nameEndPtr  first position after the name;
     *valPtr      first position after the opening quote;
     *nextTokPtr  on success, the position after the closing quote; on
                  failure, the offending position.  It is not written when
                  *namePtr is set to NULL.

   The value may only contain ASCII letters, digits, '.', '-' and '_'.
   Positions advance by enc->minBytesPerChar. */
1057 /* Return 1 if there's just optional white space or there's an S
1058    followed by name=val.
1059 */
1060 static int
1061 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062                      const char **namePtr, const char **nameEndPtr,
1063                      const char **valPtr, const char **nextTokPtr) {
1064   int c;
1065   char open;
       /* Nothing left at all: report "no attribute". */
1066   if (ptr == end) {
1067     *namePtr = NULL;
1068     return 1;
1069   }
       /* A pseudo-attribute must be preceded by white space. */
1070   if (! isSpace(toAscii(enc, ptr, end))) {
1071     *nextTokPtr = ptr;
1072     return 0;
1073   }
       /* Skip the run of leading white space. */
1074   do {
1075     ptr += enc->minBytesPerChar;
1076   } while (isSpace(toAscii(enc, ptr, end)));
       /* Only white space remained: report "no attribute". */
1077   if (ptr == end) {
1078     *namePtr = NULL;
1079     return 1;
1080   }
1081   *namePtr = ptr;
       /* Scan the name up to '=' or white space followed by '='. */
1082   for (;;) {
1083     c = toAscii(enc, ptr, end);
         /* -1 means the character could not be converted to a single byte. */
1084     if (c == -1) {
1085       *nextTokPtr = ptr;
1086       return 0;
1087     }
1088     if (c == ASCII_EQUALS) {
1089       *nameEndPtr = ptr;
1090       break;
1091     }
1092     if (isSpace(c)) {
1093       *nameEndPtr = ptr;
1094       do {
1095         ptr += enc->minBytesPerChar;
1096       } while (isSpace(c = toAscii(enc, ptr, end)));
1097       if (c != ASCII_EQUALS) {
1098         *nextTokPtr = ptr;
1099         return 0;
1100       }
1101       break;
1102     }
1103     ptr += enc->minBytesPerChar;
1104   }
       /* An empty name ('=' immediately after the white space) is an error. */
1105   if (ptr == *namePtr) {
1106     *nextTokPtr = ptr;
1107     return 0;
1108   }
       /* Step over '=' and any white space before the opening quote. */
1109   ptr += enc->minBytesPerChar;
1110   c = toAscii(enc, ptr, end);
1111   while (isSpace(c)) {
1112     ptr += enc->minBytesPerChar;
1113     c = toAscii(enc, ptr, end);
1114   }
1115   if (c != ASCII_QUOT && c != ASCII_APOS) {
1116     *nextTokPtr = ptr;
1117     return 0;
1118   }
       /* Remember which quote opened the value; the same one must close it. */
1119   open = (char)c;
1120   ptr += enc->minBytesPerChar;
1121   *valPtr = ptr;
       /* Walk the value until the matching quote, rejecting any character
          outside [A-Za-z0-9._-]. */
1122   for (;; ptr += enc->minBytesPerChar) {
1123     c = toAscii(enc, ptr, end);
1124     if (c == open)
1125       break;
1126     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129       *nextTokPtr = ptr;
1130       return 0;
1131     }
1132   }
       /* Success: continue after the closing quote. */
1133   *nextTokPtr = ptr + enc->minBytesPerChar;
1134   return 1;
1135 }
1136 
/* NUL-terminated keyword strings spelled with ASCII_* constants.  They are
   matched against pseudo-attribute names and values in doParseXmlDecl(). */

/* "version" */
1137 static const char KW_version[]
1138     = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1139 
/* "encoding" */
1140 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1141                                    ASCII_i, ASCII_n, ASCII_g, '\0'};
1142 
/* "standalone" */
1143 static const char KW_standalone[]
1144     = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1145        ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1146 
/* "yes" */
1147 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1148 
/* "no" */
1149 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150 
/* Parse the pseudo-attributes (version, encoding, standalone) of a
   declaration lying between ptr and end.

   encodingFinder       maps an encoding name (start, end) to an ENCODING;
   isGeneralTextEntity  non-zero selects the text-declaration rules: version
                        is optional, encoding is required, standalone is
                        forbidden.

   Optional outputs (each may be NULL): versionPtr / versionEndPtr,
   encodingName, encoding, standalone.

   Returns 1 on success.  Returns 0 on error with *badPtr pointing at the
   offending position. */
1151 static int
1152 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1153                                                  const char *),
1154                int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1155                const char *end, const char **badPtr, const char **versionPtr,
1156                const char **versionEndPtr, const char **encodingName,
1157                const ENCODING **encoding, int *standalone) {
1158   const char *val = NULL;
1159   const char *name = NULL;
1160   const char *nameEnd = NULL;
       /* Skip five characters at the front and two at the back (presumably
          the "<?xml" opener and the "?>" closer -- confirm with callers). */
1161   ptr += 5 * enc->minBytesPerChar;
1162   end -= 2 * enc->minBytesPerChar;
       /* At least one pseudo-attribute is required. */
1163   if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1164       || ! name) {
1165     *badPtr = ptr;
1166     return 0;
1167   }
1168   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
         /* The first attribute may be something other than version only in
            a text declaration. */
1169     if (! isGeneralTextEntity) {
1170       *badPtr = name;
1171       return 0;
1172     }
1173   } else {
         /* ptr now sits after the closing quote of the version value. */
1174     if (versionPtr)
1175       *versionPtr = val;
1176     if (versionEndPtr)
1177       *versionEndPtr = ptr;
1178     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1179       *badPtr = ptr;
1180       return 0;
1181     }
1182     if (! name) {
1183       if (isGeneralTextEntity) {
1184         /* a TextDecl must have an EncodingDecl */
1185         *badPtr = ptr;
1186         return 0;
1187       }
1188       return 1;
1189     }
1190   }
1191   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
         /* The encoding name has to start with an ASCII letter. */
1192     int c = toAscii(enc, val, end);
1193     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1194       *badPtr = val;
1195       return 0;
1196     }
1197     if (encodingName)
1198       *encodingName = val;
         /* ptr - minBytesPerChar is the closing quote, i.e. the end of the
            encoding name. */
1199     if (encoding)
1200       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1201     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1202       *badPtr = ptr;
1203       return 0;
1204     }
1205     if (! name)
1206       return 1;
1207   }
       /* Whatever is left must be standalone, which a text declaration does
          not allow. */
1208   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1209       || isGeneralTextEntity) {
1210     *badPtr = name;
1211     return 0;
1212   }
1213   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1214     if (standalone)
1215       *standalone = 1;
1216   } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1217     if (standalone)
1218       *standalone = 0;
1219   } else {
1220     *badPtr = val;
1221     return 0;
1222   }
       /* Only white space may follow the standalone value. */
1223   while (isSpace(toAscii(enc, ptr, end)))
1224     ptr += enc->minBytesPerChar;
1225   if (ptr != end) {
1226     *badPtr = ptr;
1227     return 0;
1228   }
1229   return 1;
1230 }
1231 
1232 static int FASTCALL
1233 checkCharRefNumber(int result) {
1234   switch (result >> 8) {
1235   case 0xD8:
1236   case 0xD9:
1237   case 0xDA:
1238   case 0xDB:
1239   case 0xDC:
1240   case 0xDD:
1241   case 0xDE:
1242   case 0xDF:
1243     return -1;
1244   case 0:
1245     if (latin1_encoding.type[result] == BT_NONXML)
1246       return -1;
1247     break;
1248   case 0xFF:
1249     if (result == 0xFFFE || result == 0xFFFF)
1250       return -1;
1251     break;
1252   }
1253   return result;
1254 }
1255 
1256 int FASTCALL
1257 XmlUtf8Encode(int c, char *buf) {
1258   enum {
1259     /* minN is minimum legal resulting value for N byte sequence */
1260     min2 = 0x80,
1261     min3 = 0x800,
1262     min4 = 0x10000
1263   };
1264 
1265   if (c < 0)
1266     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267   if (c < min2) {
1268     buf[0] = (char)(c | UTF8_cval1);
1269     return 1;
1270   }
1271   if (c < min3) {
1272     buf[0] = (char)((c >> 6) | UTF8_cval2);
1273     buf[1] = (char)((c & 0x3f) | 0x80);
1274     return 2;
1275   }
1276   if (c < min4) {
1277     buf[0] = (char)((c >> 12) | UTF8_cval3);
1278     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279     buf[2] = (char)((c & 0x3f) | 0x80);
1280     return 3;
1281   }
1282   if (c < 0x110000) {
1283     buf[0] = (char)((c >> 18) | UTF8_cval4);
1284     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286     buf[3] = (char)((c & 0x3f) | 0x80);
1287     return 4;
1288   }
1289   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290 }
1291 
1292 int FASTCALL
1293 XmlUtf16Encode(int charNum, unsigned short *buf) {
1294   if (charNum < 0)
1295     return 0;
1296   if (charNum < 0x10000) {
1297     buf[0] = (unsigned short)charNum;
1298     return 1;
1299   }
1300   if (charNum < 0x110000) {
1301     charNum -= 0x10000;
1302     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304     return 2;
1305   }
1306   return 0;
1307 }
1308 
/* State for a caller-described ("unknown") encoding; filled in by
   XmlInitUnknownEncoding(). */
1309 struct unknown_encoding {
       /* Starts as a copy of latin1_encoding; the byte type table and the
          multi-byte hooks are then overridden. */
1310   struct normal_encoding normal;
       /* Callback that decodes a multi-byte sequence, invoked as
          convert(userData, p); may be NULL when no lead bytes are declared. */
1311   CONVERTER convert;
       /* Opaque pointer handed back to convert. */
1312   void *userData;
       /* Per-byte UTF-16 value; 0 means the byte starts a multi-byte sequence
          and convert must be used. */
1313   unsigned short utf16[256];
       /* Per-byte UTF-8 form: element [0] is the length, followed by the
          encoded bytes; length 0 means convert must be used. */
1314   char utf8[256][4];
1315 };
1316 
/* View an ENCODING pointer as the unknown_encoding that contains it. */
1317 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318 
1319 int
1320 XmlSizeOfUnknownEncoding(void) {
1321   return sizeof(struct unknown_encoding);
1322 }
1323 
1324 static int PTRFASTCALL
1325 unknown_isName(const ENCODING *enc, const char *p) {
1326   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327   int c = uenc->convert(uenc->userData, p);
1328   if (c & ~0xFFFF)
1329     return 0;
1330   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331 }
1332 
1333 static int PTRFASTCALL
1334 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336   int c = uenc->convert(uenc->userData, p);
1337   if (c & ~0xFFFF)
1338     return 0;
1339   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340 }
1341 
1342 static int PTRFASTCALL
1343 unknown_isInvalid(const ENCODING *enc, const char *p) {
1344   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345   int c = uenc->convert(uenc->userData, p);
1346   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347 }
1348 
1349 static enum XML_Convert_Result PTRCALL
1350 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351                char **toP, const char *toLim) {
1352   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353   char buf[XML_UTF8_ENCODE_MAX];
1354   for (;;) {
1355     const char *utf8;
1356     int n;
1357     if (*fromP == fromLim)
1358       return XML_CONVERT_COMPLETED;
1359     utf8 = uenc->utf8[(unsigned char)**fromP];
1360     n = *utf8++;
1361     if (n == 0) {
1362       int c = uenc->convert(uenc->userData, *fromP);
1363       n = XmlUtf8Encode(c, buf);
1364       if (n > toLim - *toP)
1365         return XML_CONVERT_OUTPUT_EXHAUSTED;
1366       utf8 = buf;
1367       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368                  - (BT_LEAD2 - 2));
1369     } else {
1370       if (n > toLim - *toP)
1371         return XML_CONVERT_OUTPUT_EXHAUSTED;
1372       (*fromP)++;
1373     }
1374     memcpy(*toP, utf8, n);
1375     *toP += n;
1376   }
1377 }
1378 
1379 static enum XML_Convert_Result PTRCALL
1380 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381                 unsigned short **toP, const unsigned short *toLim) {
1382   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383   while (*fromP < fromLim && *toP < toLim) {
1384     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385     if (c == 0) {
1386       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388                  - (BT_LEAD2 - 2));
1389     } else
1390       (*fromP)++;
1391     *(*toP)++ = c;
1392   }
1393 
1394   if ((*toP == toLim) && (*fromP < fromLim))
1395     return XML_CONVERT_OUTPUT_EXHAUSTED;
1396   else
1397     return XML_CONVERT_COMPLETED;
1398 }
1399 
1400 ENCODING *
1401 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402                        void *userData) {
1403   int i;
1404   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1405   memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1406   for (i = 0; i < 128; i++)
1407     if (latin1_encoding.type[i] != BT_OTHER
1408         && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1409       return 0;
1410   for (i = 0; i < 256; i++) {
1411     int c = table[i];
1412     if (c == -1) {
1413       e->normal.type[i] = BT_MALFORM;
1414       /* This shouldn't really get used. */
1415       e->utf16[i] = 0xFFFF;
1416       e->utf8[i][0] = 1;
1417       e->utf8[i][1] = 0;
1418     } else if (c < 0) {
1419       if (c < -4)
1420         return 0;
1421       /* Multi-byte sequences need a converter function */
1422       if (! convert)
1423         return 0;
1424       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1425       e->utf8[i][0] = 0;
1426       e->utf16[i] = 0;
1427     } else if (c < 0x80) {
1428       if (latin1_encoding.type[c] != BT_OTHER
1429           && latin1_encoding.type[c] != BT_NONXML && c != i)
1430         return 0;
1431       e->normal.type[i] = latin1_encoding.type[c];
1432       e->utf8[i][0] = 1;
1433       e->utf8[i][1] = (char)c;
1434       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1435     } else if (checkCharRefNumber(c) < 0) {
1436       e->normal.type[i] = BT_NONXML;
1437       /* This shouldn't really get used. */
1438       e->utf16[i] = 0xFFFF;
1439       e->utf8[i][0] = 1;
1440       e->utf8[i][1] = 0;
1441     } else {
1442       if (c > 0xFFFF)
1443         return 0;
1444       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1445         e->normal.type[i] = BT_NMSTRT;
1446       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1447         e->normal.type[i] = BT_NAME;
1448       else
1449         e->normal.type[i] = BT_OTHER;
1450       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1451       e->utf16[i] = (unsigned short)c;
1452     }
1453   }
1454   e->userData = userData;
1455   e->convert = convert;
1456   if (convert) {
1457     e->normal.isName2 = unknown_isName;
1458     e->normal.isName3 = unknown_isName;
1459     e->normal.isName4 = unknown_isName;
1460     e->normal.isNmstrt2 = unknown_isNmstrt;
1461     e->normal.isNmstrt3 = unknown_isNmstrt;
1462     e->normal.isNmstrt4 = unknown_isNmstrt;
1463     e->normal.isInvalid2 = unknown_isInvalid;
1464     e->normal.isInvalid3 = unknown_isInvalid;
1465     e->normal.isInvalid4 = unknown_isInvalid;
1466   }
1467   e->normal.enc.utf8Convert = unknown_toUtf8;
1468   e->normal.enc.utf16Convert = unknown_toUtf16;
1469   return &(e->normal.enc);
1470 }
1471 
/* Indices of the built-in encodings.  The values from ISO_8859_1_ENC through
   UTF_16LE_ENC double as indices into the encodingNames array of
   getEncodingIndex() and into the encodingTable passed to initScan(). */
1472 /* If this enumeration is changed, getEncodingIndex and encodings
1473 must also be changed. */
1474 enum {
       /* returned by getEncodingIndex() for a name it does not recognize */
1475   UNKNOWN_ENC = -1,
1476   ISO_8859_1_ENC = 0,
1477   US_ASCII_ENC,
1478   UTF_8_ENC,
1479   UTF_16_ENC,
1480   UTF_16BE_ENC,
1481   UTF_16LE_ENC,
1482   /* must match encodingNames up to here */
       /* returned by getEncodingIndex() when the name is NULL */
1483   NO_ENC
1484 };
1485 
/* Upper-case, NUL-terminated names of the built-in encodings, spelled with
   ASCII_* constants.  getEncodingIndex() compares against them. */

/* "ISO-8859-1" */
1486 static const char KW_ISO_8859_1[]
1487     = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1488        ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
/* "US-ASCII" */
1489 static const char KW_US_ASCII[]
1490     = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1491        ASCII_C, ASCII_I, ASCII_I,     '\0'};
/* "UTF-8" */
1492 static const char KW_UTF_8[]
1493     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
1494 static const char KW_UTF_16[]
1495     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
1496 static const char KW_UTF_16BE[]
1497     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1498        ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
1499 static const char KW_UTF_16LE[]
1500     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1501        ASCII_6, ASCII_L, ASCII_E, '\0'};
1502 
1503 static int FASTCALL
1504 getEncodingIndex(const char *name) {
1505   static const char *const encodingNames[] = {
1506       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507   };
1508   int i;
1509   if (name == NULL)
1510     return NO_ENC;
1511   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512     if (streqci(name, encodingNames[i]))
1513       return i;
1514   return UNKNOWN_ENC;
1515 }
1516 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX stores
   it.  The index argument is parenthesized before the cast so that an
   expression argument is converted as a whole.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523 
/* Result summary for initScan():
     XML_TOK_NONE     when no input is available (ptr >= end);
     XML_TOK_PARTIAL  when more bytes are needed before deciding;
     XML_TOK_BOM      when a byte order mark was recognized; *nextTokPtr is
                      set after it and *enc->encPtr to the detected encoding;
     otherwise        *enc->encPtr is set to the chosen encoding and the
                      result of XmlTok() with that encoding is returned. */
1524 /* This is what detects the encoding.  encodingTable maps from
1525    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1526    the external (protocol) specified encoding; state is
1527    XML_CONTENT_STATE if we're parsing an external text entity, and
1528    XML_PROLOG_STATE otherwise.
1529 */
1530 
1531 static int
1532 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1533          int state, const char *ptr, const char *end, const char **nextTokPtr) {
1534   const ENCODING **encPtr;
1535 
1536   if (ptr >= end)
1537     return XML_TOK_NONE;
1538   encPtr = enc->encPtr;
1539   if (ptr + 1 == end) {
1540     /* only a single byte available for auto-detection */
1541 #ifndef XML_DTD /* FIXME */
1542     /* a well-formed document entity must have more than one byte */
1543     if (state != XML_CONTENT_STATE)
1544       return XML_TOK_PARTIAL;
1545 #endif
1546     /* so we're parsing an external text entity... */
1547     /* if UTF-16 was externally specified, then we need at least 2 bytes */
1548     switch (INIT_ENC_INDEX(enc)) {
1549     case UTF_16_ENC:
1550     case UTF_16LE_ENC:
1551     case UTF_16BE_ENC:
1552       return XML_TOK_PARTIAL;
1553     }
         /* A byte that could start a BOM, a NUL byte or '<' needs a second
            byte before anything can be decided. */
1554     switch ((unsigned char)*ptr) {
1555     case 0xFE:
1556     case 0xFF:
1557     case 0xEF: /* possibly first byte of UTF-8 BOM */
1558       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1559         break;
1560       /* fall through */
1561     case 0x00:
1562     case 0x3C:
1563       return XML_TOK_PARTIAL;
1564     }
1565   } else {
         /* Two or more bytes: dispatch on the first two as a big-endian
            16-bit value. */
1566     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
         /* FE FF: UTF-16 big-endian byte order mark */
1567     case 0xFEFF:
1568       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1569         break;
1570       *nextTokPtr = ptr + 2;
1571       *encPtr = encodingTable[UTF_16BE_ENC];
1572       return XML_TOK_BOM;
1573     /* 00 3C is handled in the default case */
         /* 3C 00: '<' in little-endian UTF-16 */
1574     case 0x3C00:
1575       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1576            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1577           && state == XML_CONTENT_STATE)
1578         break;
1579       *encPtr = encodingTable[UTF_16LE_ENC];
1580       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
         /* FF FE: UTF-16 little-endian byte order mark */
1581     case 0xFFFE:
1582       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1583         break;
1584       *nextTokPtr = ptr + 2;
1585       *encPtr = encodingTable[UTF_16LE_ENC];
1586       return XML_TOK_BOM;
1587     case 0xEFBB:
1588       /* Maybe a UTF-8 BOM (EF BB BF) */
1589       /* If there's an explicitly specified (external) encoding
1590          of ISO-8859-1 or some flavour of UTF-16
1591          and this is an external text entity,
1592          don't look for the BOM,
1593          because it might be legal data.
1594       */
1595       if (state == XML_CONTENT_STATE) {
1596         int e = INIT_ENC_INDEX(enc);
1597         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1598             || e == UTF_16_ENC)
1599           break;
1600       }
           /* The third BOM byte has not arrived yet. */
1601       if (ptr + 2 == end)
1602         return XML_TOK_PARTIAL;
1603       if ((unsigned char)ptr[2] == 0xBF) {
1604         *nextTokPtr = ptr + 3;
1605         *encPtr = encodingTable[UTF_8_ENC];
1606         return XML_TOK_BOM;
1607       }
1608       break;
1609     default:
1610       if (ptr[0] == '\0') {
1611         /* 0 isn't a legal data character. Furthermore a document
1612            entity can only start with ASCII characters.  So the only
1613            way this can fail to be big-endian UTF-16 is if it's an
1614            external parsed general entity that's labelled as
1615            UTF-16LE.
1616         */
1617         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1618           break;
1619         *encPtr = encodingTable[UTF_16BE_ENC];
1620         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621       } else if (ptr[1] == '\0') {
1622         /* We could recover here in the case:
1623             - parsing an external entity
1624             - second byte is 0
1625             - no externally specified encoding
1626             - no encoding declaration
1627            by assuming UTF-16LE.  But we don't, because this would mean when
1628            presented just with a single byte, we couldn't reliably determine
1629            whether we needed further bytes.
1630         */
1631         if (state == XML_CONTENT_STATE)
1632           break;
1633         *encPtr = encodingTable[UTF_16LE_ENC];
1634         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1635       }
1636       break;
1637     }
1638   }
       /* Nothing detected: fall back to the encoding selected at
          initialization. */
1639   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1640   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1641 }
1642 
/* First inclusion of xmltok_ns.c: NS() and ns() leave identifiers unchanged,
   so the included code is presumably emitted under its plain, non-namespace
   names (the included file is not visible here). */
1643 #define NS(x) x
1644 #define ns(x) x
1645 #define XML_TOK_NS_C
1646 #include "xmltok_ns.c"
1647 #undef XML_TOK_NS_C
1648 #undef NS
1649 #undef ns
1650 
1651 #ifdef XML_NS
1652 
/* Second inclusion of xmltok_ns.c for namespace-aware builds: NS() appends
   "NS" and ns() appends "_ns" to identifiers, so the included code presumably
   gets a separate set of names bound to the *_ns encoding tables. */
1653 #  define NS(x) x##NS
1654 #  define ns(x) x##_ns
1655 
1656 #  define XML_TOK_NS_C
1657 #  include "xmltok_ns.c"
1658 #  undef XML_TOK_NS_C
1659 
1660 #  undef NS
1661 #  undef ns
1662 
1663 ENCODING *
1664 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665                          void *userData) {
1666   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667   if (enc)
1668     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669   return enc;
1670 }
1671 
1672 #endif /* XML_NS */
1673