xref: /freebsd/contrib/expat/lib/xmltok.c (revision c82aeee8a6d39371006f5eeb1b51704e7b97e2b7)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14    Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15    Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16    Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17    Copyright (c) 2016      Don Lewis <truckman@apache.org>
18    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19    Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20    Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21    Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23    Copyright (c) 2021      Donghee Na <donghee.na@python.org>
24    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25    Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
26    Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
27    Copyright (c) 2025      Alfonso Gregory <gfunni234@gmail.com>
28    Copyright (c) 2026      Nick Begg <nick@stunttruck.net>
29    Licensed under the MIT license:
30 
31    Permission is  hereby granted,  free of charge,  to any  person obtaining
32    a  copy  of  this  software   and  associated  documentation  files  (the
33    "Software"),  to  deal in  the  Software  without restriction,  including
34    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
35    distribute, sublicense, and/or sell copies of the Software, and to permit
36    persons  to whom  the Software  is  furnished to  do so,  subject to  the
37    following conditions:
38 
39    The above copyright  notice and this permission notice  shall be included
40    in all copies or substantial portions of the Software.
41 
42    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
43    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
44    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
45    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
46    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
47    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
48    USE OR OTHER DEALINGS IN THE SOFTWARE.
49 */
50 
51 #include "expat_config.h"
52 
53 #include <stddef.h>
54 #include <string.h> /* memcpy */
55 #include <stdbool.h>
56 
57 #ifdef _WIN32
58 #  include "winconfig.h"
59 #endif
60 
61 #include "internal.h"
62 #include "fallthrough.h"
63 #include "xmltok.h"
64 #include "nametab.h"
65 
/* Optional extra tokenizer slot: in DTD-enabled builds it expands to a
   trailing ", PREFIX(ignoreSectionTok)" initializer, otherwise to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment listing the tokenizer entry points of one
   instantiation; each name is built with the PREFIX() macro that is in
   effect where the fragment is expanded.  The encoding tables in this file
   follow it with the two converters and three numeric fields. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 plus the PREFIX()-named toUtf8/toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
81 
/* Test whether the character with high byte hi and low byte lo has its bit
   set in namingBitmap.  (pages)[hi] selects a group of eight bitmap words,
   the top three bits of lo select the word inside that group and the low
   five bits of lo select the bit inside the word.  The result is non-zero
   when the bit is set.

   Every argument is parenthesized so that expression arguments such as
   "table + 1" expand with the intended precedence. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
84 
/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.

   byte is subscripted directly, so it must point at the first byte of the
   sequence; the callers in this file pass a const unsigned char pointer.
   The result is non-zero when the character's bit is set in namingBitmap.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.

   Same contract as UTF8_GET_NAMING2, reading three bytes instead of two.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
105 
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to the lead byte of an N byte sequence and
   yields non-zero when the sequence is malformed.  p must point at
   unsigned char (the comparisons against 0xC2, 0xBD, ... rely on it).
   The argument is parenthesized everywhere, including the lead byte
   dereference, so pointer expressions such as "buf + 1" are safe.
*/

#define UTF8_INVALID2(p)                                                       \
  (*(p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || (*(p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
                                      : ((p)[2] & 0xC0) == 0xC0)               \
   || (*(p) == 0xE0                                                            \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || (*(p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || (*(p) == 0xF0                                                            \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || (*(p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
135 
136 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)137 isNever(const ENCODING *enc, const char *p) {
138   UNUSED_P(enc);
139   UNUSED_P(p);
140   return 0;
141 }
142 
143 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)144 utf8_isName2(const ENCODING *enc, const char *p) {
145   UNUSED_P(enc);
146   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
147 }
148 
149 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)150 utf8_isName3(const ENCODING *enc, const char *p) {
151   UNUSED_P(enc);
152   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
153 }
154 
/* The 4 byte name check is an alias of isNever, i.e. it always answers 0. */
#define utf8_isName4 isNever
156 
157 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)158 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
159   UNUSED_P(enc);
160   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
161 }
162 
163 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)164 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
165   UNUSED_P(enc);
166   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
167 }
168 
/* The 4 byte name-start check is an alias of isNever, i.e. it always
   answers 0. */
#define utf8_isNmstrt4 isNever
170 
171 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)172 utf8_isInvalid2(const ENCODING *enc, const char *p) {
173   UNUSED_P(enc);
174   return UTF8_INVALID2((const unsigned char *)p);
175 }
176 
177 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)178 utf8_isInvalid3(const ENCODING *enc, const char *p) {
179   UNUSED_P(enc);
180   return UTF8_INVALID3((const unsigned char *)p);
181 }
182 
183 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)184 utf8_isInvalid4(const ENCODING *enc, const char *p) {
185   UNUSED_P(enc);
186   return UTF8_INVALID4((const unsigned char *)p);
187 }
188 
/* Encoding record used by the tokenizer instantiations in this file.  The
   generic ENCODING comes first, and AS_NORMAL_ENCODING() casts an ENCODING
   pointer back to this type to reach the extra members. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type for each possible byte value; read by SB_BYTE_TYPE(). */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* In XML_MIN_SIZE builds the per-character helpers are reached through
     these pointers (see the BYTE_TYPE, BYTE_TO_ASCII, CHAR_MATCHES and
     IS_*_CHAR_MINBPC macros) instead of being expanded inline. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character checks for 2, 3 and 4 byte sequences (IS_NAME_CHAR). */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name-start-character checks (IS_NMSTRT_CHAR). */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Malformed-sequence checks (IS_INVALID_CHAR).  Tables built with
     NULL_VTABLE leave these, and the six pointers above, NULL. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
209 
/* View a generic ENCODING pointer as the enclosing struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* STANDARD_VTABLE(E): initializers for the five XML_MIN_SIZE-only function
   pointers of struct normal_encoding, named with prefix E.  Note the
   trailing comma: the expansion is meant to be followed directly by
   NORMAL_VTABLE() or NULL_VTABLE.  Expands to nothing in other builds. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte classification pointers, using the
   functions (or aliases) whose names start with prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all left NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
231 
232 static int FASTCALL checkCharRefNumber(int result);
233 
234 #include "xmltok_impl.h"
235 #include "ascii.h"
236 
/* In XML_MIN_SIZE builds the sb_ tables fill the isNameMin/isNmstrtMin
   slots (see STANDARD_VTABLE) with isNever, which always answers 0. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* MINBPC(enc): read from the encoding record in XML_MIN_SIZE builds,
   the constant 1 otherwise. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
248 
/* Byte type of the byte at p, looked up in the type[] table of the
   struct normal_encoding that enc points to.  The byte is converted to
   unsigned char first so it is a valid index 0..255. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* BYTE_TYPE(enc, p): in XML_MIN_SIZE builds an indirect call through the
   encoding's byteType pointer (sb_byteType being the table-lookup
   implementation); otherwise the table lookup expanded inline. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
261 
/* BYTE_TO_ASCII(enc, p): here simply the byte at p.  XML_MIN_SIZE builds
   go through the encoding's byteToAscii pointer, for which sb_byteToAscii
   is the implementation that returns *p unchanged. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
272 
/* Multi-byte classification: n (2, 3 or 4) is pasted onto the member name,
   so IS_NAME_CHAR(enc, p, 3) calls the encoding's isName3 pointer. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* The XML_MIN_SIZE variant tolerates a NULL isInvalidN pointer and treats
   it as "not invalid"; the other variant calls the pointer unconditionally. */
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Name checks for characters of minimum width: dispatched through the
   encoding record in XML_MIN_SIZE builds, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
293 
/* CHAR_MATCHES(enc, p, c): true when the byte at p equals c.  XML_MIN_SIZE
   builds call the encoding's charMatches pointer, for which sb_charMatches
   is the plain byte comparison. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
306 
/* Instantiate the tokenizer template: with this PREFIX every
   PREFIX(name) used by xmltok_impl.c (and by VTABLE1) becomes normal_name.
   XML_TOK_IMPL_C is defined only around the inclusion. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Remove the helper macros configured above for this instantiation.
   NOTE(review): presumably so that a later section can define its own
   versions before including the template again; confirm further down. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
321 
/* Marker bits of a UTF-8 lead byte.  The converters below OR the matching
   constant into the payload bits of the first byte they emit, e.g.
   (c >> 6) | UTF8_cval2 for a 2 byte sequence. */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
328 
/* Move *fromLimRef backwards so that [from, *fromLimRef) does not end in
   the middle of a multi-byte UTF-8 character.

   The buffer is scanned from its end towards from:
   - an ASCII byte ends the scan, with the limit placed right after it;
   - a lead byte ends the scan when enough bytes have been stepped over to
     complete its sequence, with the limit placed right after that
     sequence; otherwise the lead byte is dropped too and the scan goes on;
   - any other byte is stepped over.
   If the scan reaches from, the limit ends up equal to from. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes stepped over since the last restart */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen = 0; /* stays 0 for bytes that are not lead bytes */

    if ((byte & 0x80u) == 0x00u) {
      break; /* 0b0xxxxxxx: a 1-byte character is always complete */
    }

    if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* 0b11110xxx */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* 0b1110xxxx */
    } else if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* 0b110xxxxx */
    }

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* end sits right after the lead byte; keep the whole sequence. */
        end += seqLen - 1;
        break;
      }
      stepped = 0; /* truncated sequence: drop it and restart counting */
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
367 
368 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)369 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
370             char **toP, const char *toLim) {
371   bool input_incomplete = false;
372   bool output_exhausted = false;
373 
374   /* Avoid copying partial characters (due to limited space). */
375   const ptrdiff_t bytesAvailable = fromLim - *fromP;
376   const ptrdiff_t bytesStorable = toLim - *toP;
377   UNUSED_P(enc);
378   if (bytesAvailable > bytesStorable) {
379     fromLim = *fromP + bytesStorable;
380     output_exhausted = true;
381   }
382 
383   /* Avoid copying partial characters (from incomplete input). */
384   {
385     const char *const fromLimBefore = fromLim;
386     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
387     if (fromLim < fromLimBefore) {
388       input_incomplete = true;
389     }
390   }
391 
392   {
393     const ptrdiff_t bytesToCopy = fromLim - *fromP;
394     memcpy(*toP, *fromP, bytesToCopy);
395     *fromP += bytesToCopy;
396     *toP += bytesToCopy;
397   }
398 
399   if (output_exhausted) /* needs to go first */
400     return XML_CONVERT_OUTPUT_EXHAUSTED;
401   else if (input_incomplete)
402     return XML_CONVERT_INPUT_INCOMPLETE;
403   else
404     return XML_CONVERT_COMPLETED;
405 }
406 
407 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)408 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
409              unsigned short **toP, const unsigned short *toLim) {
410   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
411   unsigned short *to = *toP;
412   const char *from = *fromP;
413   while (from < fromLim && to < toLim) {
414     switch (SB_BYTE_TYPE(enc, from)) {
415     case BT_LEAD2:
416       if (fromLim - from < 2) {
417         res = XML_CONVERT_INPUT_INCOMPLETE;
418         goto after;
419       }
420       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
421       from += 2;
422       break;
423     case BT_LEAD3:
424       if (fromLim - from < 3) {
425         res = XML_CONVERT_INPUT_INCOMPLETE;
426         goto after;
427       }
428       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
429                                | (from[2] & 0x3f));
430       from += 3;
431       break;
432     case BT_LEAD4: {
433       unsigned long n;
434       if (toLim - to < 2) {
435         res = XML_CONVERT_OUTPUT_EXHAUSTED;
436         goto after;
437       }
438       if (fromLim - from < 4) {
439         res = XML_CONVERT_INPUT_INCOMPLETE;
440         goto after;
441       }
442       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
443           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
444       n -= 0x10000;
445       to[0] = (unsigned short)((n >> 10) | 0xD800);
446       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
447       to += 2;
448       from += 4;
449     } break;
450     default:
451       *to++ = *from++;
452       break;
453     }
454   }
455   if (from < fromLim)
456     res = XML_CONVERT_OUTPUT_EXHAUSTED;
457 after:
458   *fromP = from;
459   *toP = to;
460   return res;
461 }
462 
/* UTF-8 encoding records.  Each one combines the normal_ tokenizer entry
   points (VTABLE1), the UTF-8 converters, three numeric ENCODING fields
   (presumably minBytesPerChar, isUtf8, isUtf16 as declared in xmltok.h;
   confirm there), a 256 entry byte-type table assembled from two included
   headers, and the utf8_ classification functions. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

/* Same record, but BT_COLON is redefined to BT_NMSTRT while asciitab.h is
   included, so any BT_COLON entry in that table becomes BT_NMSTRT here. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
482 
/* "Internal" UTF-8 encoding records: identical in layout to the records
   above, except that the first half of the byte-type table comes from
   iasciitab.h instead of asciitab.h. */
#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while iasciitab.h is
   included. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
504 
505 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)506 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
507               char **toP, const char *toLim) {
508   UNUSED_P(enc);
509   for (;;) {
510     unsigned char c;
511     if (*fromP == fromLim)
512       return XML_CONVERT_COMPLETED;
513     c = (unsigned char)**fromP;
514     if (c & 0x80) {
515       if (toLim - *toP < 2)
516         return XML_CONVERT_OUTPUT_EXHAUSTED;
517       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
518       *(*toP)++ = (char)((c & 0x3f) | 0x80);
519       (*fromP)++;
520     } else {
521       if (*toP == toLim)
522         return XML_CONVERT_OUTPUT_EXHAUSTED;
523       *(*toP)++ = *(*fromP)++;
524     }
525   }
526 }
527 
528 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)529 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
530                unsigned short **toP, const unsigned short *toLim) {
531   UNUSED_P(enc);
532   while (*fromP < fromLim && *toP < toLim)
533     *(*toP)++ = (unsigned char)*(*fromP)++;
534 
535   if ((*toP == toLim) && (*fromP < fromLim))
536     return XML_CONVERT_OUTPUT_EXHAUSTED;
537   else
538     return XML_CONVERT_COMPLETED;
539 }
540 
/* Latin-1 encoding records: normal_ tokenizer entry points, the Latin-1
   converters, a byte-type table built from asciitab.h and latin1tab.h, and
   NULL for all multi-byte classification pointers (NULL_VTABLE).  The
   numeric fields are 1, 0, 0 (presumably minBytesPerChar, isUtf8, isUtf16;
   confirm against xmltok.h). */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
562 
563 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)564 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
565              char **toP, const char *toLim) {
566   UNUSED_P(enc);
567   while (*fromP < fromLim && *toP < toLim)
568     *(*toP)++ = *(*fromP)++;
569 
570   if ((*toP == toLim) && (*fromP < fromLim))
571     return XML_CONVERT_OUTPUT_EXHAUSTED;
572   else
573     return XML_CONVERT_COMPLETED;
574 }
575 
/* ASCII encoding records.  Only the asciitab.h half of the byte-type table
   is initialized; the remaining entries are zero-initialized, which per
   the note inside the table equals BT_NONXML.  The UTF-16 converter is
   shared with Latin-1 and all multi-byte classification pointers are NULL.
   The numeric fields are 1, 1, 0 (presumably minBytesPerChar, isUtf8,
   isUtf16; confirm against xmltok.h). */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Variant with BT_COLON mapped to BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
597 
598 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)599 unicode_byte_type(char hi, char lo) {
600   switch ((unsigned char)hi) {
601   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
602   case 0xD8:
603   case 0xD9:
604   case 0xDA:
605   case 0xDB:
606     return BT_LEAD4;
607   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
608   case 0xDC:
609   case 0xDD:
610   case 0xDE:
611   case 0xDF:
612     return BT_TRAIL;
613   case 0xFF:
614     switch ((unsigned char)lo) {
615     case 0xFF: /* noncharacter-FFFF */
616     case 0xFE: /* noncharacter-FFFE */
617       return BT_NONXML;
618     }
619     break;
620   }
621   return BT_NONASCII;
622 }
623 
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   2 byte code units to UTF-8.  Byte order is supplied by whatever GET_LO
   and GET_HI are defined as at the point of expansion.

   The input length is first rounded down to a whole number of units.  Per
   unit, selected on its high byte:
     - 0x00 with low byte < 0x80: one output byte;
     - 0x00 (low byte >= 0x80) through 0x07: two output bytes;
     - 0xD8 through 0xDB: combined with the following unit into four
       output bytes;
     - everything else: three output bytes.
   Before writing, the room left in the output is checked; if it is too
   small *fromP is set to the current unit and XML_CONVERT_OUTPUT_EXHAUSTED
   is returned.  If fewer than four input bytes remain for a pair,
   XML_CONVERT_INPUT_INCOMPLETE is returned the same way.
   NOTE(review): the loop only ends once from >= fromLim, so the final
   "from < fromLim" branch looks unreachable; confirm before relying on
   it. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        EXPAT_FALLTHROUGH;                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
700 
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   2 byte code units into an unsigned short buffer, assembling each unit as
   (GET_HI << 8) | GET_LO with the byte order macros in effect at the point
   of expansion.

   The input length is first rounded down to a whole number of units.  If
   the input holds more units than the output has room for and the high
   byte of the last input unit is in 0xD8..0xDF, that unit is left out and
   the pending result becomes XML_CONVERT_INPUT_INCOMPLETE.  The function
   returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is exactly full
   while input remains, otherwise the pending result. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
721 
/* Little-endian layout: the low byte is stored first, the high byte
   second. */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Instantiate the little2_ converters with the byte order above. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

/* The accessors are redefined for the big-endian instantiation. */
#undef GET_LO
#undef GET_HI
730 
/* Big-endian layout: the high byte is stored first, the low byte
   second. */
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Instantiate the big2_ converters with the byte order above. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef GET_LO
#undef GET_HI
739 
/* Accessors for one two-byte little-endian unit at p: p[0] is the low
   byte and p[1] the high byte.

   LITTLE2_BYTE_TYPE          byte type; uses the single-byte table when
                              the high byte is zero, unicode_byte_type()
                              otherwise.
   LITTLE2_BYTE_TO_ASCII      the low byte when the high byte is zero,
                              else -1.
   LITTLE2_CHAR_MATCHES       true when the unit equals the character c
                              with a zero high byte.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC
                              lookup in namePages / nmstrtPages.

   Every use of p is parenthesized so that pointer expressions such as
   "buf + 2" expand correctly. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, (p))                                        \
               : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
748 
749 #ifdef XML_MIN_SIZE
750 
/* Function form of LITTLE2_BYTE_TYPE (XML_MIN_SIZE builds only):
   returns the byte type of the two-byte little-endian unit at p. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}
755 
/* Function form of LITTLE2_BYTE_TO_ASCII (XML_MIN_SIZE builds only):
   returns the low byte of the unit at p when its high byte is zero,
   otherwise -1.  enc is not used. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}
761 
/* Function form of LITTLE2_CHAR_MATCHES (XML_MIN_SIZE builds only):
   nonzero when the unit at p has a zero high byte and a low byte equal
   to c.  enc is not used. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}
767 
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC (XML_MIN_SIZE builds
   only): looks the unit at p up in namePages.  enc is not used. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}
773 
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC (XML_MIN_SIZE builds
   only): looks the unit at p up in nmstrtPages.  enc is not used. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}
779 
/* XML_MIN_SIZE build: the vtable is VTABLE1 followed by the little2_
   converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: instantiate the tokenizer template in xmltok_impl.c
   with every generated identifier prefixed by little2_ and the
   character accessors bound to the LITTLE2_* macros. */
#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The n-byte variants are constant 0 here; presumably they only matter
   for encodings with multi-byte lead sequences -- TODO confirm against
   xmltok_impl.c. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the template parameters so the next instantiation starts clean. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
812 
#ifdef XML_NS

/* Little-endian two-byte encoding used by the namespace-aware (XML_NS)
   build.  The scalar fields after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against struct
   encoding; the last one is 1 only when BYTEORDER is 1234.  The type
   table is built from asciitab.h followed by latin1tab.h. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
830 
/* Little-endian two-byte encoding without namespace processing.  Same
   layout as the _ns variant, except that BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so table entries spelled
   BT_COLON there get the name-start type. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
846 
/* Internal little-endian encodings, compiled unless BYTEORDER is 4321.
   They differ from the external tables above in using iasciitab.h
   instead of asciitab.h and in always setting the last scalar field
   to 1. */
#if BYTEORDER != 4321

#  ifdef XML_NS

/* Namespace-aware internal variant. */
static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

/* Non-namespace internal variant: BT_COLON is mapped to BT_NMSTRT while
   the ASCII table is included. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
872 
/* Accessors for one two-byte big-endian unit at p: p[0] is the high
   byte and p[1] the low byte.

   BIG2_BYTE_TYPE             byte type; uses the single-byte table on
                              the low byte when the high byte is zero,
                              unicode_byte_type() otherwise.
   BIG2_BYTE_TO_ASCII         the low byte when the high byte is zero,
                              else -1.
   BIG2_CHAR_MATCHES          true when the unit equals the character c
                              with a zero high byte.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC
                              lookup in namePages / nmstrtPages.

   Every use of p is parenthesized so that pointer expressions such as
   "buf + 2" expand correctly. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
881 
882 #ifdef XML_MIN_SIZE
883 
/* Function form of BIG2_BYTE_TYPE (XML_MIN_SIZE builds only): returns
   the byte type of the two-byte big-endian unit at p. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}
888 
/* Function form of BIG2_BYTE_TO_ASCII (XML_MIN_SIZE builds only):
   returns the low byte of the unit at p when its high byte is zero,
   otherwise -1.  enc is not used. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}
894 
/* Function form of BIG2_CHAR_MATCHES (XML_MIN_SIZE builds only):
   nonzero when the unit at p has a zero high byte and a low byte equal
   to c.  enc is not used. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}
900 
/* Function form of BIG2_IS_NAME_CHAR_MINBPC (XML_MIN_SIZE builds only):
   looks the unit at p up in namePages.  enc is not used. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}
906 
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC (XML_MIN_SIZE builds
   only): looks the unit at p up in nmstrtPages.  enc is not used. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}
912 
/* XML_MIN_SIZE build: the vtable is VTABLE1 followed by the big2_
   converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: instantiate the tokenizer template in xmltok_impl.c
   with every generated identifier prefixed by big2_ and the character
   accessors bound to the BIG2_* macros. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte variants are constant 0 here; presumably they only matter
   for encodings with multi-byte lead sequences -- TODO confirm against
   xmltok_impl.c. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the template parameters again. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
945 
#ifdef XML_NS

/* Big-endian two-byte encoding used by the namespace-aware (XML_NS)
   build.  The scalar fields after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against struct
   encoding; the last one is 1 only when BYTEORDER is 4321.  The type
   table is built from asciitab.h followed by latin1tab.h. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
963 
/* Big-endian two-byte encoding without namespace processing.  Same
   layout as the _ns variant, except that BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so table entries spelled
   BT_COLON there get the name-start type. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
979 
/* Internal big-endian encodings, compiled unless BYTEORDER is 1234.
   They differ from the external tables above in using iasciitab.h
   instead of asciitab.h and in always setting the last scalar field
   to 1. */
#if BYTEORDER != 1234

#  ifdef XML_NS

/* Namespace-aware internal variant. */
static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

/* Non-namespace internal variant: BT_COLON is mapped to BT_NMSTRT while
   the ASCII table is included. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
1005 
1006 #undef PREFIX
1007 
1008 static int FASTCALL
streqci(const char * s1,const char * s2)1009 streqci(const char *s1, const char *s2) {
1010   for (;;) {
1011     char c1 = *s1++;
1012     char c2 = *s2++;
1013     if (ASCII_a <= c1 && c1 <= ASCII_z)
1014       c1 += ASCII_A - ASCII_a;
1015     if (ASCII_a <= c2 && c2 <= ASCII_z)
1016       /* The following line will never get executed.  streqci() is
1017        * only called from two places, both of which guarantee to put
1018        * upper-case strings into s2.
1019        */
1020       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1021     if (c1 != c2)
1022       return 0;
1023     if (! c1)
1024       break;
1025   }
1026   return 1;
1027 }
1028 
/* Position updater that ignores the encoding it is given and always
   delegates to normal_updatePosition() with the UTF-8 encoding. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
                   POSITION *pos) {
  UNUSED_P(enc);
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
1035 
1036 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1037 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1038   char buf[1];
1039   char *p = buf;
1040   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1041   if (p == buf)
1042     return -1;
1043   else
1044     return buf[0];
1045 }
1046 
1047 static int FASTCALL
isSpace(int c)1048 isSpace(int c) {
1049   switch (c) {
1050   case 0x20:
1051   case 0xD:
1052   case 0xA:
1053   case 0x9:
1054     return 1;
1055   }
1056   return 0;
1057 }
1058 
1059 /* Return 1 if there's just optional white space or there's an S
1060    followed by name=val.
1061 */
1062 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1063 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1064                      const char **namePtr, const char **nameEndPtr,
1065                      const char **valPtr, const char **nextTokPtr) {
1066   int c;
1067   char open;
1068   if (ptr == end) {
1069     *namePtr = NULL;
1070     return 1;
1071   }
1072   if (! isSpace(toAscii(enc, ptr, end))) {
1073     *nextTokPtr = ptr;
1074     return 0;
1075   }
1076   do {
1077     ptr += enc->minBytesPerChar;
1078   } while (isSpace(toAscii(enc, ptr, end)));
1079   if (ptr == end) {
1080     *namePtr = NULL;
1081     return 1;
1082   }
1083   *namePtr = ptr;
1084   for (;;) {
1085     c = toAscii(enc, ptr, end);
1086     if (c == -1) {
1087       *nextTokPtr = ptr;
1088       return 0;
1089     }
1090     if (c == ASCII_EQUALS) {
1091       *nameEndPtr = ptr;
1092       break;
1093     }
1094     if (isSpace(c)) {
1095       *nameEndPtr = ptr;
1096       do {
1097         ptr += enc->minBytesPerChar;
1098       } while (isSpace(c = toAscii(enc, ptr, end)));
1099       if (c != ASCII_EQUALS) {
1100         *nextTokPtr = ptr;
1101         return 0;
1102       }
1103       break;
1104     }
1105     ptr += enc->minBytesPerChar;
1106   }
1107   if (ptr == *namePtr) {
1108     *nextTokPtr = ptr;
1109     return 0;
1110   }
1111   ptr += enc->minBytesPerChar;
1112   c = toAscii(enc, ptr, end);
1113   while (isSpace(c)) {
1114     ptr += enc->minBytesPerChar;
1115     c = toAscii(enc, ptr, end);
1116   }
1117   if (c != ASCII_QUOT && c != ASCII_APOS) {
1118     *nextTokPtr = ptr;
1119     return 0;
1120   }
1121   open = (char)c;
1122   ptr += enc->minBytesPerChar;
1123   *valPtr = ptr;
1124   for (;; ptr += enc->minBytesPerChar) {
1125     c = toAscii(enc, ptr, end);
1126     if (c == open)
1127       break;
1128     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1129         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1130         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1131       *nextTokPtr = ptr;
1132       return 0;
1133     }
1134   }
1135   *nextTokPtr = ptr + enc->minBytesPerChar;
1136   return 1;
1137 }
1138 
/* Keywords recognised in an XML or text declaration.  They are spelled
   with the ASCII_* constants rather than string literals, presumably so
   that they hold ASCII code values whatever the compiler's execution
   character set is -- TODO confirm. */

/* "version" */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* "encoding" */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* "standalone" */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* "yes" */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

/* "no" */
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1152 
1153 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1154 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1155                                                  const char *),
1156                int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1157                const char *end, const char **badPtr, const char **versionPtr,
1158                const char **versionEndPtr, const char **encodingName,
1159                const ENCODING **encoding, int *standalone) {
1160   const char *val = NULL;
1161   const char *name = NULL;
1162   const char *nameEnd = NULL;
1163   ptr += 5 * enc->minBytesPerChar;
1164   end -= 2 * enc->minBytesPerChar;
1165   if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1166       || ! name) {
1167     *badPtr = ptr;
1168     return 0;
1169   }
1170   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1171     if (! isGeneralTextEntity) {
1172       *badPtr = name;
1173       return 0;
1174     }
1175   } else {
1176     if (versionPtr)
1177       *versionPtr = val;
1178     if (versionEndPtr)
1179       *versionEndPtr = ptr;
1180     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1181       *badPtr = ptr;
1182       return 0;
1183     }
1184     if (! name) {
1185       if (isGeneralTextEntity) {
1186         /* a TextDecl must have an EncodingDecl */
1187         *badPtr = ptr;
1188         return 0;
1189       }
1190       return 1;
1191     }
1192   }
1193   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1194     int c = toAscii(enc, val, end);
1195     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1196       *badPtr = val;
1197       return 0;
1198     }
1199     if (encodingName)
1200       *encodingName = val;
1201     if (encoding)
1202       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1203     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1204       *badPtr = ptr;
1205       return 0;
1206     }
1207     if (! name)
1208       return 1;
1209   }
1210   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1211       || isGeneralTextEntity) {
1212     *badPtr = name;
1213     return 0;
1214   }
1215   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1216     if (standalone)
1217       *standalone = 1;
1218   } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1219     if (standalone)
1220       *standalone = 0;
1221   } else {
1222     *badPtr = val;
1223     return 0;
1224   }
1225   while (isSpace(toAscii(enc, ptr, end)))
1226     ptr += enc->minBytesPerChar;
1227   if (ptr != end) {
1228     *badPtr = ptr;
1229     return 0;
1230   }
1231   return 1;
1232 }
1233 
1234 static int FASTCALL
checkCharRefNumber(int result)1235 checkCharRefNumber(int result) {
1236   switch (result >> 8) {
1237   case 0xD8:
1238   case 0xD9:
1239   case 0xDA:
1240   case 0xDB:
1241   case 0xDC:
1242   case 0xDD:
1243   case 0xDE:
1244   case 0xDF:
1245     return -1;
1246   case 0:
1247     if (latin1_encoding.type[result] == BT_NONXML)
1248       return -1;
1249     break;
1250   case 0xFF:
1251     if (result == 0xFFFE || result == 0xFFFF)
1252       return -1;
1253     break;
1254   }
1255   return result;
1256 }
1257 
1258 int FASTCALL
XmlUtf8Encode(int c,char * buf)1259 XmlUtf8Encode(int c, char *buf) {
1260   enum {
1261     /* minN is minimum legal resulting value for N byte sequence */
1262     min2 = 0x80,
1263     min3 = 0x800,
1264     min4 = 0x10000
1265   };
1266 
1267   if (c < 0)
1268     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1269   if (c < min2) {
1270     buf[0] = (char)(c | UTF8_cval1);
1271     return 1;
1272   }
1273   if (c < min3) {
1274     buf[0] = (char)((c >> 6) | UTF8_cval2);
1275     buf[1] = (char)((c & 0x3f) | 0x80);
1276     return 2;
1277   }
1278   if (c < min4) {
1279     buf[0] = (char)((c >> 12) | UTF8_cval3);
1280     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1281     buf[2] = (char)((c & 0x3f) | 0x80);
1282     return 3;
1283   }
1284   if (c < 0x110000) {
1285     buf[0] = (char)((c >> 18) | UTF8_cval4);
1286     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1287     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1288     buf[3] = (char)((c & 0x3f) | 0x80);
1289     return 4;
1290   }
1291   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1292 }
1293 
1294 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1295 XmlUtf16Encode(int charNum, unsigned short *buf) {
1296   if (charNum < 0)
1297     return 0;
1298   if (charNum < 0x10000) {
1299     buf[0] = (unsigned short)charNum;
1300     return 1;
1301   }
1302   if (charNum < 0x110000) {
1303     charNum -= 0x10000;
1304     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1305     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1306     return 2;
1307   }
1308   return 0;
1309 }
1310 
/* State of an application-defined encoding, filled in by
   XmlInitUnknownEncoding().  Code in this file casts an ENCODING pointer
   to this type, so `normal` has to stay the first member. */
struct unknown_encoding {
  struct normal_encoding normal; /* starts as a copy of latin1_encoding */
  CONVERTER convert;             /* converter for multi-byte sequences */
  void *userData;                /* passed as first argument to convert */
  /* UTF-16 unit for each single byte; 0 means "call convert" */
  unsigned short utf16[256];
  /* per byte: [0] is the UTF-8 length (0 means "call convert"), the
     encoded bytes follow from [1] */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1320 
/* Returns the size in bytes of struct unknown_encoding, i.e. how much
   memory a caller has to provide to XmlInitUnknownEncoding(). */
int
XmlSizeOfUnknownEncoding(void) {
  return sizeof(struct unknown_encoding);
}
1325 
1326 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1327 unknown_isName(const ENCODING *enc, const char *p) {
1328   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1329   int c = uenc->convert(uenc->userData, p);
1330   if (c & ~0xFFFF)
1331     return 0;
1332   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1333 }
1334 
1335 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1336 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1337   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1338   int c = uenc->convert(uenc->userData, p);
1339   if (c & ~0xFFFF)
1340     return 0;
1341   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1342 }
1343 
1344 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1345 unknown_isInvalid(const ENCODING *enc, const char *p) {
1346   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1347   int c = uenc->convert(uenc->userData, p);
1348   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1349 }
1350 
1351 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1352 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1353                char **toP, const char *toLim) {
1354   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1355   char buf[XML_UTF8_ENCODE_MAX];
1356   for (;;) {
1357     const char *utf8;
1358     int n;
1359     if (*fromP == fromLim)
1360       return XML_CONVERT_COMPLETED;
1361     utf8 = uenc->utf8[(unsigned char)**fromP];
1362     n = *utf8++;
1363     if (n == 0) {
1364       int c = uenc->convert(uenc->userData, *fromP);
1365       n = XmlUtf8Encode(c, buf);
1366       if (n > toLim - *toP)
1367         return XML_CONVERT_OUTPUT_EXHAUSTED;
1368       utf8 = buf;
1369       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1370                  - (BT_LEAD2 - 2));
1371     } else {
1372       if (n > toLim - *toP)
1373         return XML_CONVERT_OUTPUT_EXHAUSTED;
1374       (*fromP)++;
1375     }
1376     memcpy(*toP, utf8, n);
1377     *toP += n;
1378   }
1379 }
1380 
1381 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1382 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1383                 unsigned short **toP, const unsigned short *toLim) {
1384   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1385   while (*fromP < fromLim && *toP < toLim) {
1386     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1387     if (c == 0) {
1388       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1389       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1390                  - (BT_LEAD2 - 2));
1391     } else
1392       (*fromP)++;
1393     *(*toP)++ = c;
1394   }
1395 
1396   if ((*toP == toLim) && (*fromP < fromLim))
1397     return XML_CONVERT_OUTPUT_EXHAUSTED;
1398   else
1399     return XML_CONVERT_COMPLETED;
1400 }
1401 
/* Builds an application-defined encoding in the caller-provided memory
   mem, which is treated as a struct unknown_encoding.

   table has 256 entries, one per byte value:
     >= 0       the byte stands alone and maps to that character number
     -1         the byte is malformed
     -2 .. -4   the byte starts a sequence of 2 .. 4 bytes that has to be
                decoded by convert (which must then be non-NULL)
     < -4       rejected
   convert and userData are stored; convert is called as
   convert(userData, p) for multi-byte sequences.

   Returns a pointer to the initialised ENCODING inside mem, or 0 if the
   table is rejected.  Note that mem is written to even when 0 is
   returned. */
ENCODING *
XmlInitUnknownEncoding(void *mem, const int *table, CONVERTER convert,
                       void *userData) {
  int i;
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
  /* Start from the Latin-1 encoding and override what differs. */
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
  /* In the lower half, every byte whose Latin-1 type is neither BT_OTHER
     nor BT_NONXML has to map to itself. */
  for (i = 0; i < 128; i++)
    if (latin1_encoding.type[i] != BT_OTHER
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
      return 0;
  for (i = 0; i < 256; i++) {
    int c = table[i];
    if (c == -1) {
      e->normal.type[i] = BT_MALFORM;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else if (c < 0) {
      if (c < -4)
        return 0;
      /* Multi-byte sequences need a converter function */
      if (! convert)
        return 0;
      /* -2 becomes BT_LEAD2, -3 and -4 the two values following it */
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
      /* zero in both tables tells the converters to call convert */
      e->utf8[i][0] = 0;
      e->utf16[i] = 0;
    } else if (c < 0x80) {
      /* A byte may only map to an ASCII character with a special type if
         it is that very character. */
      if (latin1_encoding.type[c] != BT_OTHER
          && latin1_encoding.type[c] != BT_NONXML && c != i)
        return 0;
      e->normal.type[i] = latin1_encoding.type[c];
      e->utf8[i][0] = 1;
      e->utf8[i][1] = (char)c;
      /* 0 is reserved as the "call convert" marker in utf16[] */
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
    } else if (checkCharRefNumber(c) < 0) {
      e->normal.type[i] = BT_NONXML;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else {
      /* single bytes can only map to values that fit into 16 bits */
      if (c > 0xFFFF)
        return 0;
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NMSTRT;
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NAME;
      else
        e->normal.type[i] = BT_OTHER;
      /* length in [0], encoded bytes from [1] */
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
      e->utf16[i] = (unsigned short)c;
    }
  }
  e->userData = userData;
  e->convert = convert;
  if (convert) {
    /* One set of converter-based predicates serves all sequence lengths. */
    e->normal.isName2 = unknown_isName;
    e->normal.isName3 = unknown_isName;
    e->normal.isName4 = unknown_isName;
    e->normal.isNmstrt2 = unknown_isNmstrt;
    e->normal.isNmstrt3 = unknown_isNmstrt;
    e->normal.isNmstrt4 = unknown_isNmstrt;
    e->normal.isInvalid2 = unknown_isInvalid;
    e->normal.isInvalid3 = unknown_isInvalid;
    e->normal.isInvalid4 = unknown_isInvalid;
  }
  e->normal.enc.utf8Convert = unknown_toUtf8;
  e->normal.enc.utf16Convert = unknown_toUtf16;
  return &(e->normal.enc);
}
1473 
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed.  The values from
   ISO_8859_1_ENC up to UTF_16LE_ENC double as indices into the
   encodingNames array of getEncodingIndex(). */
enum {
  UNKNOWN_ENC = -1, /* name not recognised */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC /* no encoding name was given */
};
1487 
/* Upper-case names of the built-in encodings, spelled with the ASCII_*
   constants. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
/* "US-ASCII" */
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
/* "UTF-8" */
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1504 
1505 static int FASTCALL
getEncodingIndex(const char * name)1506 getEncodingIndex(const char *name) {
1507   static const char *const encodingNames[] = {
1508       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1509   };
1510   int i;
1511   if (name == NULL)
1512     return NO_ENC;
1513   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1514     if (streqci(name, encodingNames[i]))
1515       return i;
1516   return UNKNOWN_ENC;
1517 }
1518 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument
   is parenthesized so that the cast applies to the whole expression
   passed in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1525 
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one or two bytes of [ptr, end) are inspected.  Possible
   outcomes:
     XML_TOK_NONE     no input at all
     XML_TOK_PARTIAL  not enough bytes yet to decide
     XML_TOK_BOM      a byte order mark was consumed; *nextTokPtr points
                      past it and *enc->encPtr holds the detected encoding
     anything else    *enc->encPtr was set to the chosen encoding and the
                      value is the result of XmlTok() with that encoding
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      EXPAT_FALLTHROUGH;
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* at least two bytes: dispatch on the first two, read big-endian */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* big-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* '<' followed by a zero byte */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* little-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      /* the third byte of the BOM has not arrived yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing was detected: use the encoding given at initialization. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1644 
/* First instantiation of xmltok_ns.c: NS() and ns() leave identifiers
   unchanged, so the code is generated under its plain names. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1652 
1653 #ifdef XML_NS
1654 
/* Second instantiation of xmltok_ns.c for the XML_NS build: NS()
   appends "NS" and ns() appends "_ns" to each identifier. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1664 
1665 ENCODING *
XmlInitUnknownEncodingNS(void * mem,const int * table,CONVERTER convert,void * userData)1666 XmlInitUnknownEncodingNS(void *mem, const int *table, CONVERTER convert,
1667                          void *userData) {
1668   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1669   if (enc)
1670     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1671   return enc;
1672 }
1673 
1674 #endif /* XML_NS */
1675