xref: /freebsd/contrib/expat/lib/xmltok_impl.c (revision 4543ef516683042d46f3bd3bb8a4f3f746e00499)
/* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
   Copyright (c) 2002      Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
   Copyright (c) 2018      Benjamin Peterson <benjamin@python.org>
   Copyright (c) 2018      Anton Maklakov <antmak.pub@gmail.com>
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
   Copyright (c) 2020      Boris Kolpackov <boris@codesynthesis.com>
   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
   Licensed under the MIT license:

   Permission is  hereby granted,  free of charge,  to any  person obtaining
   a  copy  of  this  software   and  associated  documentation  files  (the
   "Software"),  to  deal in  the  Software  without restriction,  including
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
   following conditions:

   The above copyright  notice and this permission notice  shall be included
   in all copies or substantial portions of the Software.

   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
41 
42 #ifdef XML_TOK_IMPL_C
43 
/* Fallback used when the including translation unit supplies no
   IS_INVALID_CHAR of its own: every character is reported as valid.
   The arguments are discarded without being evaluated. */
#  ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif
47 
/* Expands to the `case BT_LEAD<n>:` arm of a tokenizer switch for an
   n-byte character.  `enc` and `end` are taken from the expansion site.

   - fewer than n bytes remain before `end`: returns XML_TOK_PARTIAL_CHAR
   - IS_INVALID_CHAR() flags the sequence: stores ptr through nextTokPtr
     and returns XML_TOK_INVALID
   - otherwise: advances ptr by n and leaves the switch.

   ptr must be a modifiable lvalue; it is evaluated more than once. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if (end - (ptr) < (n))                                                     \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;
58 
/* Expands to the switch arms that reject invalid input while scanning
   free-form text: the 2-, 3- and 4-byte lead arms from INVALID_LEAD_CASE,
   followed by BT_NONXML / BT_MALFORM / BT_TRAIL, which store ptr through
   nextTokPtr and return XML_TOK_INVALID.  Like INVALID_LEAD_CASE, it uses
   `enc` and `end` from the expansion site. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_NONXML:                                                              \
  case BT_MALFORM:                                                             \
  case BT_TRAIL:                                                               \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
68 
/* Expands to the `case BT_LEAD<n>:` arm used while scanning the body of a
   name.

   - fewer than n bytes remain before end: returns XML_TOK_PARTIAL_CHAR
   - the n-byte sequence is invalid or is not a name character: stores ptr
     through nextTokPtr and returns XML_TOK_INVALID
   - otherwise: advances ptr by n and leaves the switch.

   ptr must be a modifiable lvalue; arguments are evaluated more than once. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;
79 
/* Expands to every switch arm that accepts one more character of a name:

   - BT_NONASCII: must satisfy IS_NAME_CHAR_MINBPC(), otherwise ptr is stored
     through nextTokPtr and XML_TOK_INVALID is returned; on success it falls
     through to the single-unit arm below
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME, BT_MINUS: advance ptr by
     MINBPC(enc) and leave the switch
   - BT_LEAD2/3/4: handled by CHECK_NAME_CASE.

   ptr must be a modifiable lvalue. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    (ptr) += MINBPC(enc);                                                      \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
97 
/* Expands to the `case BT_LEAD<n>:` arm used for the first character of a
   name.

   - fewer than n bytes remain before end: returns XML_TOK_PARTIAL_CHAR
   - the n-byte sequence is invalid or is not a name-start character: stores
     ptr through nextTokPtr and returns XML_TOK_INVALID
   - otherwise: advances ptr by n and leaves the switch.

   ptr must be a modifiable lvalue; arguments are evaluated more than once. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;
108 
/* Expands to every switch arm that accepts the first character of a name:

   - BT_NONASCII: must satisfy IS_NMSTRT_CHAR_MINBPC(), otherwise ptr is
     stored through nextTokPtr and XML_TOK_INVALID is returned; on success it
     falls through to the single-unit arm below
   - BT_NMSTRT, BT_HEX: advance ptr by MINBPC(enc) and leave the switch
   - BT_LEAD2/3/4: handled by CHECK_NMSTRT_CASE.

   ptr must be a modifiable lvalue. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    (ptr) += MINBPC(enc);                                                      \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
123 
/* Name-mangling hook for the functions defined below.  When the including
   file has not defined PREFIX, identifiers are used unchanged. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif
127 
/* True when at least `count` minimum-size code units (count * MINBPC(enc)
   bytes) lie between ptr and end. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-size code unit lies between ptr and end. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless HAS_CHARS()
   holds.  This expands to a braced block, not a do/while(0) statement, so
   `REQUIRE_CHARS(...);` leaves an extra empty statement behind; do not use
   it as the unbraced body of an `if` that has an `else`. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

/* Single-unit form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
141 
142 /* ptr points to character following "<!-" */
143 
144 static int PTRCALL
PREFIX(scanComment)145 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
146                     const char **nextTokPtr) {
147   if (HAS_CHAR(enc, ptr, end)) {
148     if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
149       *nextTokPtr = ptr;
150       return XML_TOK_INVALID;
151     }
152     ptr += MINBPC(enc);
153     while (HAS_CHAR(enc, ptr, end)) {
154       switch (BYTE_TYPE(enc, ptr)) {
155         INVALID_CASES(ptr, nextTokPtr)
156       case BT_MINUS:
157         ptr += MINBPC(enc);
158         REQUIRE_CHAR(enc, ptr, end);
159         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
160           ptr += MINBPC(enc);
161           REQUIRE_CHAR(enc, ptr, end);
162           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
163             *nextTokPtr = ptr;
164             return XML_TOK_INVALID;
165           }
166           *nextTokPtr = ptr + MINBPC(enc);
167           return XML_TOK_COMMENT;
168         }
169         break;
170       default:
171         ptr += MINBPC(enc);
172         break;
173       }
174     }
175   }
176   return XML_TOK_PARTIAL;
177 }
178 
179 /* ptr points to character following "<!" */
180 
181 static int PTRCALL
PREFIX(scanDecl)182 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
183                  const char **nextTokPtr) {
184   REQUIRE_CHAR(enc, ptr, end);
185   switch (BYTE_TYPE(enc, ptr)) {
186   case BT_MINUS:
187     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
188   case BT_LSQB:
189     *nextTokPtr = ptr + MINBPC(enc);
190     return XML_TOK_COND_SECT_OPEN;
191   case BT_NMSTRT:
192   case BT_HEX:
193     ptr += MINBPC(enc);
194     break;
195   default:
196     *nextTokPtr = ptr;
197     return XML_TOK_INVALID;
198   }
199   while (HAS_CHAR(enc, ptr, end)) {
200     switch (BYTE_TYPE(enc, ptr)) {
201     case BT_PERCNT:
202       REQUIRE_CHARS(enc, ptr, end, 2);
203       /* don't allow <!ENTITY% foo "whatever"> */
204       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
205       case BT_S:
206       case BT_CR:
207       case BT_LF:
208       case BT_PERCNT:
209         *nextTokPtr = ptr;
210         return XML_TOK_INVALID;
211       }
212       /* fall through */
213     case BT_S:
214     case BT_CR:
215     case BT_LF:
216       *nextTokPtr = ptr;
217       return XML_TOK_DECL_OPEN;
218     case BT_NMSTRT:
219     case BT_HEX:
220       ptr += MINBPC(enc);
221       break;
222     default:
223       *nextTokPtr = ptr;
224       return XML_TOK_INVALID;
225     }
226   }
227   return XML_TOK_PARTIAL;
228 }
229 
230 static int PTRCALL
PREFIX(checkPiTarget)231 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
232                       int *tokPtr) {
233   int upper = 0;
234   UNUSED_P(enc);
235   *tokPtr = XML_TOK_PI;
236   if (end - ptr != MINBPC(enc) * 3)
237     return 1;
238   switch (BYTE_TO_ASCII(enc, ptr)) {
239   case ASCII_x:
240     break;
241   case ASCII_X:
242     upper = 1;
243     break;
244   default:
245     return 1;
246   }
247   ptr += MINBPC(enc);
248   switch (BYTE_TO_ASCII(enc, ptr)) {
249   case ASCII_m:
250     break;
251   case ASCII_M:
252     upper = 1;
253     break;
254   default:
255     return 1;
256   }
257   ptr += MINBPC(enc);
258   switch (BYTE_TO_ASCII(enc, ptr)) {
259   case ASCII_l:
260     break;
261   case ASCII_L:
262     upper = 1;
263     break;
264   default:
265     return 1;
266   }
267   if (upper)
268     return 0;
269   *tokPtr = XML_TOK_XML_DECL;
270   return 1;
271 }
272 
273 /* ptr points to character following "<?" */
274 
275 static int PTRCALL
PREFIX(scanPi)276 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
277                const char **nextTokPtr) {
278   int tok;
279   const char *target = ptr;
280   REQUIRE_CHAR(enc, ptr, end);
281   switch (BYTE_TYPE(enc, ptr)) {
282     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
283   default:
284     *nextTokPtr = ptr;
285     return XML_TOK_INVALID;
286   }
287   while (HAS_CHAR(enc, ptr, end)) {
288     switch (BYTE_TYPE(enc, ptr)) {
289       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
290     case BT_S:
291     case BT_CR:
292     case BT_LF:
293       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
294         *nextTokPtr = ptr;
295         return XML_TOK_INVALID;
296       }
297       ptr += MINBPC(enc);
298       while (HAS_CHAR(enc, ptr, end)) {
299         switch (BYTE_TYPE(enc, ptr)) {
300           INVALID_CASES(ptr, nextTokPtr)
301         case BT_QUEST:
302           ptr += MINBPC(enc);
303           REQUIRE_CHAR(enc, ptr, end);
304           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
305             *nextTokPtr = ptr + MINBPC(enc);
306             return tok;
307           }
308           break;
309         default:
310           ptr += MINBPC(enc);
311           break;
312         }
313       }
314       return XML_TOK_PARTIAL;
315     case BT_QUEST:
316       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
317         *nextTokPtr = ptr;
318         return XML_TOK_INVALID;
319       }
320       ptr += MINBPC(enc);
321       REQUIRE_CHAR(enc, ptr, end);
322       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
323         *nextTokPtr = ptr + MINBPC(enc);
324         return tok;
325       }
326       /* fall through */
327     default:
328       *nextTokPtr = ptr;
329       return XML_TOK_INVALID;
330     }
331   }
332   return XML_TOK_PARTIAL;
333 }
334 
335 static int PTRCALL
PREFIX(scanCdataSection)336 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
337                          const char **nextTokPtr) {
338   static const char CDATA_LSQB[]
339       = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
340   int i;
341   UNUSED_P(enc);
342   /* CDATA[ */
343   REQUIRE_CHARS(enc, ptr, end, 6);
344   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
345     if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
346       *nextTokPtr = ptr;
347       return XML_TOK_INVALID;
348     }
349   }
350   *nextTokPtr = ptr;
351   return XML_TOK_CDATA_SECT_OPEN;
352 }
353 
354 static int PTRCALL
PREFIX(cdataSectionTok)355 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
356                         const char **nextTokPtr) {
357   if (ptr >= end)
358     return XML_TOK_NONE;
359   if (MINBPC(enc) > 1) {
360     size_t n = end - ptr;
361     if (n & (MINBPC(enc) - 1)) {
362       n &= ~(MINBPC(enc) - 1);
363       if (n == 0)
364         return XML_TOK_PARTIAL;
365       end = ptr + n;
366     }
367   }
368   switch (BYTE_TYPE(enc, ptr)) {
369   case BT_RSQB:
370     ptr += MINBPC(enc);
371     REQUIRE_CHAR(enc, ptr, end);
372     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
373       break;
374     ptr += MINBPC(enc);
375     REQUIRE_CHAR(enc, ptr, end);
376     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
377       ptr -= MINBPC(enc);
378       break;
379     }
380     *nextTokPtr = ptr + MINBPC(enc);
381     return XML_TOK_CDATA_SECT_CLOSE;
382   case BT_CR:
383     ptr += MINBPC(enc);
384     REQUIRE_CHAR(enc, ptr, end);
385     if (BYTE_TYPE(enc, ptr) == BT_LF)
386       ptr += MINBPC(enc);
387     *nextTokPtr = ptr;
388     return XML_TOK_DATA_NEWLINE;
389   case BT_LF:
390     *nextTokPtr = ptr + MINBPC(enc);
391     return XML_TOK_DATA_NEWLINE;
392     INVALID_CASES(ptr, nextTokPtr)
393   default:
394     ptr += MINBPC(enc);
395     break;
396   }
397   while (HAS_CHAR(enc, ptr, end)) {
398     switch (BYTE_TYPE(enc, ptr)) {
399 #  define LEAD_CASE(n)                                                         \
400   case BT_LEAD##n:                                                             \
401     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
402       *nextTokPtr = ptr;                                                       \
403       return XML_TOK_DATA_CHARS;                                               \
404     }                                                                          \
405     ptr += n;                                                                  \
406     break;
407       LEAD_CASE(2)
408       LEAD_CASE(3)
409       LEAD_CASE(4)
410 #  undef LEAD_CASE
411     case BT_NONXML:
412     case BT_MALFORM:
413     case BT_TRAIL:
414     case BT_CR:
415     case BT_LF:
416     case BT_RSQB:
417       *nextTokPtr = ptr;
418       return XML_TOK_DATA_CHARS;
419     default:
420       ptr += MINBPC(enc);
421       break;
422     }
423   }
424   *nextTokPtr = ptr;
425   return XML_TOK_DATA_CHARS;
426 }
427 
428 /* ptr points to character following "</" */
429 
430 static int PTRCALL
PREFIX(scanEndTag)431 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
432                    const char **nextTokPtr) {
433   REQUIRE_CHAR(enc, ptr, end);
434   switch (BYTE_TYPE(enc, ptr)) {
435     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
436   default:
437     *nextTokPtr = ptr;
438     return XML_TOK_INVALID;
439   }
440   while (HAS_CHAR(enc, ptr, end)) {
441     switch (BYTE_TYPE(enc, ptr)) {
442       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
443     case BT_S:
444     case BT_CR:
445     case BT_LF:
446       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
447         switch (BYTE_TYPE(enc, ptr)) {
448         case BT_S:
449         case BT_CR:
450         case BT_LF:
451           break;
452         case BT_GT:
453           *nextTokPtr = ptr + MINBPC(enc);
454           return XML_TOK_END_TAG;
455         default:
456           *nextTokPtr = ptr;
457           return XML_TOK_INVALID;
458         }
459       }
460       return XML_TOK_PARTIAL;
461 #  ifdef XML_NS
462     case BT_COLON:
463       /* no need to check qname syntax here,
464          since end-tag must match exactly */
465       ptr += MINBPC(enc);
466       break;
467 #  endif
468     case BT_GT:
469       *nextTokPtr = ptr + MINBPC(enc);
470       return XML_TOK_END_TAG;
471     default:
472       *nextTokPtr = ptr;
473       return XML_TOK_INVALID;
474     }
475   }
476   return XML_TOK_PARTIAL;
477 }
478 
479 /* ptr points to character following "&#X" */
480 
481 static int PTRCALL
PREFIX(scanHexCharRef)482 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
483                        const char **nextTokPtr) {
484   if (HAS_CHAR(enc, ptr, end)) {
485     switch (BYTE_TYPE(enc, ptr)) {
486     case BT_DIGIT:
487     case BT_HEX:
488       break;
489     default:
490       *nextTokPtr = ptr;
491       return XML_TOK_INVALID;
492     }
493     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
494       switch (BYTE_TYPE(enc, ptr)) {
495       case BT_DIGIT:
496       case BT_HEX:
497         break;
498       case BT_SEMI:
499         *nextTokPtr = ptr + MINBPC(enc);
500         return XML_TOK_CHAR_REF;
501       default:
502         *nextTokPtr = ptr;
503         return XML_TOK_INVALID;
504       }
505     }
506   }
507   return XML_TOK_PARTIAL;
508 }
509 
510 /* ptr points to character following "&#" */
511 
512 static int PTRCALL
PREFIX(scanCharRef)513 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
514                     const char **nextTokPtr) {
515   if (HAS_CHAR(enc, ptr, end)) {
516     if (CHAR_MATCHES(enc, ptr, ASCII_x))
517       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
518     switch (BYTE_TYPE(enc, ptr)) {
519     case BT_DIGIT:
520       break;
521     default:
522       *nextTokPtr = ptr;
523       return XML_TOK_INVALID;
524     }
525     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
526       switch (BYTE_TYPE(enc, ptr)) {
527       case BT_DIGIT:
528         break;
529       case BT_SEMI:
530         *nextTokPtr = ptr + MINBPC(enc);
531         return XML_TOK_CHAR_REF;
532       default:
533         *nextTokPtr = ptr;
534         return XML_TOK_INVALID;
535       }
536     }
537   }
538   return XML_TOK_PARTIAL;
539 }
540 
541 /* ptr points to character following "&" */
542 
543 static int PTRCALL
PREFIX(scanRef)544 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
545                 const char **nextTokPtr) {
546   REQUIRE_CHAR(enc, ptr, end);
547   switch (BYTE_TYPE(enc, ptr)) {
548     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
549   case BT_NUM:
550     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
551   default:
552     *nextTokPtr = ptr;
553     return XML_TOK_INVALID;
554   }
555   while (HAS_CHAR(enc, ptr, end)) {
556     switch (BYTE_TYPE(enc, ptr)) {
557       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
558     case BT_SEMI:
559       *nextTokPtr = ptr + MINBPC(enc);
560       return XML_TOK_ENTITY_REF;
561     default:
562       *nextTokPtr = ptr;
563       return XML_TOK_INVALID;
564     }
565   }
566   return XML_TOK_PARTIAL;
567 }
568 
569 /* ptr points to character following first character of attribute name */
570 
571 static int PTRCALL
PREFIX(scanAtts)572 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
573                  const char **nextTokPtr) {
574 #  ifdef XML_NS
575   int hadColon = 0;
576 #  endif
577   while (HAS_CHAR(enc, ptr, end)) {
578     switch (BYTE_TYPE(enc, ptr)) {
579       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
580 #  ifdef XML_NS
581     case BT_COLON:
582       if (hadColon) {
583         *nextTokPtr = ptr;
584         return XML_TOK_INVALID;
585       }
586       hadColon = 1;
587       ptr += MINBPC(enc);
588       REQUIRE_CHAR(enc, ptr, end);
589       switch (BYTE_TYPE(enc, ptr)) {
590         CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
591       default:
592         *nextTokPtr = ptr;
593         return XML_TOK_INVALID;
594       }
595       break;
596 #  endif
597     case BT_S:
598     case BT_CR:
599     case BT_LF:
600       for (;;) {
601         int t;
602 
603         ptr += MINBPC(enc);
604         REQUIRE_CHAR(enc, ptr, end);
605         t = BYTE_TYPE(enc, ptr);
606         if (t == BT_EQUALS)
607           break;
608         switch (t) {
609         case BT_S:
610         case BT_LF:
611         case BT_CR:
612           break;
613         default:
614           *nextTokPtr = ptr;
615           return XML_TOK_INVALID;
616         }
617       }
618       /* fall through */
619     case BT_EQUALS: {
620       int open;
621 #  ifdef XML_NS
622       hadColon = 0;
623 #  endif
624       for (;;) {
625         ptr += MINBPC(enc);
626         REQUIRE_CHAR(enc, ptr, end);
627         open = BYTE_TYPE(enc, ptr);
628         if (open == BT_QUOT || open == BT_APOS)
629           break;
630         switch (open) {
631         case BT_S:
632         case BT_LF:
633         case BT_CR:
634           break;
635         default:
636           *nextTokPtr = ptr;
637           return XML_TOK_INVALID;
638         }
639       }
640       ptr += MINBPC(enc);
641       /* in attribute value */
642       for (;;) {
643         int t;
644         REQUIRE_CHAR(enc, ptr, end);
645         t = BYTE_TYPE(enc, ptr);
646         if (t == open)
647           break;
648         switch (t) {
649           INVALID_CASES(ptr, nextTokPtr)
650         case BT_AMP: {
651           int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
652           if (tok <= 0) {
653             if (tok == XML_TOK_INVALID)
654               *nextTokPtr = ptr;
655             return tok;
656           }
657           break;
658         }
659         case BT_LT:
660           *nextTokPtr = ptr;
661           return XML_TOK_INVALID;
662         default:
663           ptr += MINBPC(enc);
664           break;
665         }
666       }
667       ptr += MINBPC(enc);
668       REQUIRE_CHAR(enc, ptr, end);
669       switch (BYTE_TYPE(enc, ptr)) {
670       case BT_S:
671       case BT_CR:
672       case BT_LF:
673         break;
674       case BT_SOL:
675         goto sol;
676       case BT_GT:
677         goto gt;
678       default:
679         *nextTokPtr = ptr;
680         return XML_TOK_INVALID;
681       }
682       /* ptr points to closing quote */
683       for (;;) {
684         ptr += MINBPC(enc);
685         REQUIRE_CHAR(enc, ptr, end);
686         switch (BYTE_TYPE(enc, ptr)) {
687           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
688         case BT_S:
689         case BT_CR:
690         case BT_LF:
691           continue;
692         case BT_GT:
693         gt:
694           *nextTokPtr = ptr + MINBPC(enc);
695           return XML_TOK_START_TAG_WITH_ATTS;
696         case BT_SOL:
697         sol:
698           ptr += MINBPC(enc);
699           REQUIRE_CHAR(enc, ptr, end);
700           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
701             *nextTokPtr = ptr;
702             return XML_TOK_INVALID;
703           }
704           *nextTokPtr = ptr + MINBPC(enc);
705           return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
706         default:
707           *nextTokPtr = ptr;
708           return XML_TOK_INVALID;
709         }
710         break;
711       }
712       break;
713     }
714     default:
715       *nextTokPtr = ptr;
716       return XML_TOK_INVALID;
717     }
718   }
719   return XML_TOK_PARTIAL;
720 }
721 
722 /* ptr points to character following "<" */
723 
724 static int PTRCALL
PREFIX(scanLt)725 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
726                const char **nextTokPtr) {
727 #  ifdef XML_NS
728   int hadColon;
729 #  endif
730   REQUIRE_CHAR(enc, ptr, end);
731   switch (BYTE_TYPE(enc, ptr)) {
732     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
733   case BT_EXCL:
734     ptr += MINBPC(enc);
735     REQUIRE_CHAR(enc, ptr, end);
736     switch (BYTE_TYPE(enc, ptr)) {
737     case BT_MINUS:
738       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739     case BT_LSQB:
740       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
741     }
742     *nextTokPtr = ptr;
743     return XML_TOK_INVALID;
744   case BT_QUEST:
745     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
746   case BT_SOL:
747     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
748   default:
749     *nextTokPtr = ptr;
750     return XML_TOK_INVALID;
751   }
752 #  ifdef XML_NS
753   hadColon = 0;
754 #  endif
755   /* we have a start-tag */
756   while (HAS_CHAR(enc, ptr, end)) {
757     switch (BYTE_TYPE(enc, ptr)) {
758       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
759 #  ifdef XML_NS
760     case BT_COLON:
761       if (hadColon) {
762         *nextTokPtr = ptr;
763         return XML_TOK_INVALID;
764       }
765       hadColon = 1;
766       ptr += MINBPC(enc);
767       REQUIRE_CHAR(enc, ptr, end);
768       switch (BYTE_TYPE(enc, ptr)) {
769         CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
770       default:
771         *nextTokPtr = ptr;
772         return XML_TOK_INVALID;
773       }
774       break;
775 #  endif
776     case BT_S:
777     case BT_CR:
778     case BT_LF: {
779       ptr += MINBPC(enc);
780       while (HAS_CHAR(enc, ptr, end)) {
781         switch (BYTE_TYPE(enc, ptr)) {
782           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
783         case BT_GT:
784           goto gt;
785         case BT_SOL:
786           goto sol;
787         case BT_S:
788         case BT_CR:
789         case BT_LF:
790           ptr += MINBPC(enc);
791           continue;
792         default:
793           *nextTokPtr = ptr;
794           return XML_TOK_INVALID;
795         }
796         return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
797       }
798       return XML_TOK_PARTIAL;
799     }
800     case BT_GT:
801     gt:
802       *nextTokPtr = ptr + MINBPC(enc);
803       return XML_TOK_START_TAG_NO_ATTS;
804     case BT_SOL:
805     sol:
806       ptr += MINBPC(enc);
807       REQUIRE_CHAR(enc, ptr, end);
808       if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
809         *nextTokPtr = ptr;
810         return XML_TOK_INVALID;
811       }
812       *nextTokPtr = ptr + MINBPC(enc);
813       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
814     default:
815       *nextTokPtr = ptr;
816       return XML_TOK_INVALID;
817     }
818   }
819   return XML_TOK_PARTIAL;
820 }
821 
822 static int PTRCALL
PREFIX(contentTok)823 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
824                    const char **nextTokPtr) {
825   if (ptr >= end)
826     return XML_TOK_NONE;
827   if (MINBPC(enc) > 1) {
828     size_t n = end - ptr;
829     if (n & (MINBPC(enc) - 1)) {
830       n &= ~(MINBPC(enc) - 1);
831       if (n == 0)
832         return XML_TOK_PARTIAL;
833       end = ptr + n;
834     }
835   }
836   switch (BYTE_TYPE(enc, ptr)) {
837   case BT_LT:
838     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
839   case BT_AMP:
840     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
841   case BT_CR:
842     ptr += MINBPC(enc);
843     if (! HAS_CHAR(enc, ptr, end))
844       return XML_TOK_TRAILING_CR;
845     if (BYTE_TYPE(enc, ptr) == BT_LF)
846       ptr += MINBPC(enc);
847     *nextTokPtr = ptr;
848     return XML_TOK_DATA_NEWLINE;
849   case BT_LF:
850     *nextTokPtr = ptr + MINBPC(enc);
851     return XML_TOK_DATA_NEWLINE;
852   case BT_RSQB:
853     ptr += MINBPC(enc);
854     if (! HAS_CHAR(enc, ptr, end))
855       return XML_TOK_TRAILING_RSQB;
856     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
857       break;
858     ptr += MINBPC(enc);
859     if (! HAS_CHAR(enc, ptr, end))
860       return XML_TOK_TRAILING_RSQB;
861     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
862       ptr -= MINBPC(enc);
863       break;
864     }
865     *nextTokPtr = ptr;
866     return XML_TOK_INVALID;
867     INVALID_CASES(ptr, nextTokPtr)
868   default:
869     ptr += MINBPC(enc);
870     break;
871   }
872   while (HAS_CHAR(enc, ptr, end)) {
873     switch (BYTE_TYPE(enc, ptr)) {
874 #  define LEAD_CASE(n)                                                         \
875   case BT_LEAD##n:                                                             \
876     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
877       *nextTokPtr = ptr;                                                       \
878       return XML_TOK_DATA_CHARS;                                               \
879     }                                                                          \
880     ptr += n;                                                                  \
881     break;
882       LEAD_CASE(2)
883       LEAD_CASE(3)
884       LEAD_CASE(4)
885 #  undef LEAD_CASE
886     case BT_RSQB:
887       if (HAS_CHARS(enc, ptr, end, 2)) {
888         if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
889           ptr += MINBPC(enc);
890           break;
891         }
892         if (HAS_CHARS(enc, ptr, end, 3)) {
893           if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
894             ptr += MINBPC(enc);
895             break;
896           }
897           *nextTokPtr = ptr + 2 * MINBPC(enc);
898           return XML_TOK_INVALID;
899         }
900       }
901       /* fall through */
902     case BT_AMP:
903     case BT_LT:
904     case BT_NONXML:
905     case BT_MALFORM:
906     case BT_TRAIL:
907     case BT_CR:
908     case BT_LF:
909       *nextTokPtr = ptr;
910       return XML_TOK_DATA_CHARS;
911     default:
912       ptr += MINBPC(enc);
913       break;
914     }
915   }
916   *nextTokPtr = ptr;
917   return XML_TOK_DATA_CHARS;
918 }
919 
920 /* ptr points to character following "%" */
921 
922 static int PTRCALL
PREFIX(scanPercent)923 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
924                     const char **nextTokPtr) {
925   REQUIRE_CHAR(enc, ptr, end);
926   switch (BYTE_TYPE(enc, ptr)) {
927     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
928   case BT_S:
929   case BT_LF:
930   case BT_CR:
931   case BT_PERCNT:
932     *nextTokPtr = ptr;
933     return XML_TOK_PERCENT;
934   default:
935     *nextTokPtr = ptr;
936     return XML_TOK_INVALID;
937   }
938   while (HAS_CHAR(enc, ptr, end)) {
939     switch (BYTE_TYPE(enc, ptr)) {
940       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
941     case BT_SEMI:
942       *nextTokPtr = ptr + MINBPC(enc);
943       return XML_TOK_PARAM_ENTITY_REF;
944     default:
945       *nextTokPtr = ptr;
946       return XML_TOK_INVALID;
947     }
948   }
949   return XML_TOK_PARTIAL;
950 }
951 
952 static int PTRCALL
PREFIX(scanPoundName)953 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
954                       const char **nextTokPtr) {
955   REQUIRE_CHAR(enc, ptr, end);
956   switch (BYTE_TYPE(enc, ptr)) {
957     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
958   default:
959     *nextTokPtr = ptr;
960     return XML_TOK_INVALID;
961   }
962   while (HAS_CHAR(enc, ptr, end)) {
963     switch (BYTE_TYPE(enc, ptr)) {
964       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
965     case BT_CR:
966     case BT_LF:
967     case BT_S:
968     case BT_RPAR:
969     case BT_GT:
970     case BT_PERCNT:
971     case BT_VERBAR:
972       *nextTokPtr = ptr;
973       return XML_TOK_POUND_NAME;
974     default:
975       *nextTokPtr = ptr;
976       return XML_TOK_INVALID;
977     }
978   }
979   return -XML_TOK_POUND_NAME;
980 }
981 
982 static int PTRCALL
PREFIX(scanLit)983 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
984                 const char **nextTokPtr) {
985   while (HAS_CHAR(enc, ptr, end)) {
986     int t = BYTE_TYPE(enc, ptr);
987     switch (t) {
988       INVALID_CASES(ptr, nextTokPtr)
989     case BT_QUOT:
990     case BT_APOS:
991       ptr += MINBPC(enc);
992       if (t != open)
993         break;
994       if (! HAS_CHAR(enc, ptr, end))
995         return -XML_TOK_LITERAL;
996       *nextTokPtr = ptr;
997       switch (BYTE_TYPE(enc, ptr)) {
998       case BT_S:
999       case BT_CR:
1000       case BT_LF:
1001       case BT_GT:
1002       case BT_PERCNT:
1003       case BT_LSQB:
1004         return XML_TOK_LITERAL;
1005       default:
1006         return XML_TOK_INVALID;
1007       }
1008     default:
1009       ptr += MINBPC(enc);
1010       break;
1011     }
1012   }
1013   return XML_TOK_PARTIAL;
1014 }
1015 
1016 static int PTRCALL
PREFIX(prologTok)1017 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1018                   const char **nextTokPtr) {
1019   int tok;
1020   if (ptr >= end)
1021     return XML_TOK_NONE;
1022   if (MINBPC(enc) > 1) {
1023     size_t n = end - ptr;
1024     if (n & (MINBPC(enc) - 1)) {
1025       n &= ~(MINBPC(enc) - 1);
1026       if (n == 0)
1027         return XML_TOK_PARTIAL;
1028       end = ptr + n;
1029     }
1030   }
1031   switch (BYTE_TYPE(enc, ptr)) {
1032   case BT_QUOT:
1033     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1034   case BT_APOS:
1035     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1036   case BT_LT: {
1037     ptr += MINBPC(enc);
1038     REQUIRE_CHAR(enc, ptr, end);
1039     switch (BYTE_TYPE(enc, ptr)) {
1040     case BT_EXCL:
1041       return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1042     case BT_QUEST:
1043       return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1044     case BT_NMSTRT:
1045     case BT_HEX:
1046     case BT_NONASCII:
1047     case BT_LEAD2:
1048     case BT_LEAD3:
1049     case BT_LEAD4:
1050       *nextTokPtr = ptr - MINBPC(enc);
1051       return XML_TOK_INSTANCE_START;
1052     }
1053     *nextTokPtr = ptr;
1054     return XML_TOK_INVALID;
1055   }
1056   case BT_CR:
1057     if (ptr + MINBPC(enc) == end) {
1058       *nextTokPtr = end;
1059       /* indicate that this might be part of a CR/LF pair */
1060       return -XML_TOK_PROLOG_S;
1061     }
1062     /* fall through */
1063   case BT_S:
1064   case BT_LF:
1065     for (;;) {
1066       ptr += MINBPC(enc);
1067       if (! HAS_CHAR(enc, ptr, end))
1068         break;
1069       switch (BYTE_TYPE(enc, ptr)) {
1070       case BT_S:
1071       case BT_LF:
1072         break;
1073       case BT_CR:
1074         /* don't split CR/LF pair */
1075         if (ptr + MINBPC(enc) != end)
1076           break;
1077         /* fall through */
1078       default:
1079         *nextTokPtr = ptr;
1080         return XML_TOK_PROLOG_S;
1081       }
1082     }
1083     *nextTokPtr = ptr;
1084     return XML_TOK_PROLOG_S;
1085   case BT_PERCNT:
1086     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1087   case BT_COMMA:
1088     *nextTokPtr = ptr + MINBPC(enc);
1089     return XML_TOK_COMMA;
1090   case BT_LSQB:
1091     *nextTokPtr = ptr + MINBPC(enc);
1092     return XML_TOK_OPEN_BRACKET;
1093   case BT_RSQB:
1094     ptr += MINBPC(enc);
1095     if (! HAS_CHAR(enc, ptr, end))
1096       return -XML_TOK_CLOSE_BRACKET;
1097     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1098       REQUIRE_CHARS(enc, ptr, end, 2);
1099       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1100         *nextTokPtr = ptr + 2 * MINBPC(enc);
1101         return XML_TOK_COND_SECT_CLOSE;
1102       }
1103     }
1104     *nextTokPtr = ptr;
1105     return XML_TOK_CLOSE_BRACKET;
1106   case BT_LPAR:
1107     *nextTokPtr = ptr + MINBPC(enc);
1108     return XML_TOK_OPEN_PAREN;
1109   case BT_RPAR:
1110     ptr += MINBPC(enc);
1111     if (! HAS_CHAR(enc, ptr, end))
1112       return -XML_TOK_CLOSE_PAREN;
1113     switch (BYTE_TYPE(enc, ptr)) {
1114     case BT_AST:
1115       *nextTokPtr = ptr + MINBPC(enc);
1116       return XML_TOK_CLOSE_PAREN_ASTERISK;
1117     case BT_QUEST:
1118       *nextTokPtr = ptr + MINBPC(enc);
1119       return XML_TOK_CLOSE_PAREN_QUESTION;
1120     case BT_PLUS:
1121       *nextTokPtr = ptr + MINBPC(enc);
1122       return XML_TOK_CLOSE_PAREN_PLUS;
1123     case BT_CR:
1124     case BT_LF:
1125     case BT_S:
1126     case BT_GT:
1127     case BT_COMMA:
1128     case BT_VERBAR:
1129     case BT_RPAR:
1130       *nextTokPtr = ptr;
1131       return XML_TOK_CLOSE_PAREN;
1132     }
1133     *nextTokPtr = ptr;
1134     return XML_TOK_INVALID;
1135   case BT_VERBAR:
1136     *nextTokPtr = ptr + MINBPC(enc);
1137     return XML_TOK_OR;
1138   case BT_GT:
1139     *nextTokPtr = ptr + MINBPC(enc);
1140     return XML_TOK_DECL_CLOSE;
1141   case BT_NUM:
1142     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1143 #  define LEAD_CASE(n)                                                         \
1144   case BT_LEAD##n:                                                             \
1145     if (end - ptr < n)                                                         \
1146       return XML_TOK_PARTIAL_CHAR;                                             \
1147     if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
1148       *nextTokPtr = ptr;                                                       \
1149       return XML_TOK_INVALID;                                                  \
1150     }                                                                          \
1151     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
1152       ptr += n;                                                                \
1153       tok = XML_TOK_NAME;                                                      \
1154       break;                                                                   \
1155     }                                                                          \
1156     if (IS_NAME_CHAR(enc, ptr, n)) {                                           \
1157       ptr += n;                                                                \
1158       tok = XML_TOK_NMTOKEN;                                                   \
1159       break;                                                                   \
1160     }                                                                          \
1161     *nextTokPtr = ptr;                                                         \
1162     return XML_TOK_INVALID;
1163     LEAD_CASE(2)
1164     LEAD_CASE(3)
1165     LEAD_CASE(4)
1166 #  undef LEAD_CASE
1167   case BT_NMSTRT:
1168   case BT_HEX:
1169     tok = XML_TOK_NAME;
1170     ptr += MINBPC(enc);
1171     break;
1172   case BT_DIGIT:
1173   case BT_NAME:
1174   case BT_MINUS:
1175 #  ifdef XML_NS
1176   case BT_COLON:
1177 #  endif
1178     tok = XML_TOK_NMTOKEN;
1179     ptr += MINBPC(enc);
1180     break;
1181   case BT_NONASCII:
1182     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1183       ptr += MINBPC(enc);
1184       tok = XML_TOK_NAME;
1185       break;
1186     }
1187     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1188       ptr += MINBPC(enc);
1189       tok = XML_TOK_NMTOKEN;
1190       break;
1191     }
1192     /* fall through */
1193   default:
1194     *nextTokPtr = ptr;
1195     return XML_TOK_INVALID;
1196   }
1197   while (HAS_CHAR(enc, ptr, end)) {
1198     switch (BYTE_TYPE(enc, ptr)) {
1199       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1200     case BT_GT:
1201     case BT_RPAR:
1202     case BT_COMMA:
1203     case BT_VERBAR:
1204     case BT_LSQB:
1205     case BT_PERCNT:
1206     case BT_S:
1207     case BT_CR:
1208     case BT_LF:
1209       *nextTokPtr = ptr;
1210       return tok;
1211 #  ifdef XML_NS
1212     case BT_COLON:
1213       ptr += MINBPC(enc);
1214       switch (tok) {
1215       case XML_TOK_NAME:
1216         REQUIRE_CHAR(enc, ptr, end);
1217         tok = XML_TOK_PREFIXED_NAME;
1218         switch (BYTE_TYPE(enc, ptr)) {
1219           CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1220         default:
1221           tok = XML_TOK_NMTOKEN;
1222           break;
1223         }
1224         break;
1225       case XML_TOK_PREFIXED_NAME:
1226         tok = XML_TOK_NMTOKEN;
1227         break;
1228       }
1229       break;
1230 #  endif
1231     case BT_PLUS:
1232       if (tok == XML_TOK_NMTOKEN) {
1233         *nextTokPtr = ptr;
1234         return XML_TOK_INVALID;
1235       }
1236       *nextTokPtr = ptr + MINBPC(enc);
1237       return XML_TOK_NAME_PLUS;
1238     case BT_AST:
1239       if (tok == XML_TOK_NMTOKEN) {
1240         *nextTokPtr = ptr;
1241         return XML_TOK_INVALID;
1242       }
1243       *nextTokPtr = ptr + MINBPC(enc);
1244       return XML_TOK_NAME_ASTERISK;
1245     case BT_QUEST:
1246       if (tok == XML_TOK_NMTOKEN) {
1247         *nextTokPtr = ptr;
1248         return XML_TOK_INVALID;
1249       }
1250       *nextTokPtr = ptr + MINBPC(enc);
1251       return XML_TOK_NAME_QUESTION;
1252     default:
1253       *nextTokPtr = ptr;
1254       return XML_TOK_INVALID;
1255     }
1256   }
1257   return -tok;
1258 }
1259 
1260 static int PTRCALL
PREFIX(attributeValueTok)1261 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1262                           const char **nextTokPtr) {
1263   const char *start;
1264   if (ptr >= end)
1265     return XML_TOK_NONE;
1266   else if (! HAS_CHAR(enc, ptr, end)) {
1267     /* This line cannot be executed.  The incoming data has already
1268      * been tokenized once, so incomplete characters like this have
1269      * already been eliminated from the input.  Retaining the paranoia
1270      * check is still valuable, however.
1271      */
1272     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1273   }
1274   start = ptr;
1275   while (HAS_CHAR(enc, ptr, end)) {
1276     switch (BYTE_TYPE(enc, ptr)) {
1277 #  define LEAD_CASE(n)                                                         \
1278   case BT_LEAD##n:                                                             \
1279     ptr += n; /* NOTE: The encoding has already been validated. */             \
1280     break;
1281       LEAD_CASE(2)
1282       LEAD_CASE(3)
1283       LEAD_CASE(4)
1284 #  undef LEAD_CASE
1285     case BT_AMP:
1286       if (ptr == start)
1287         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1288       *nextTokPtr = ptr;
1289       return XML_TOK_DATA_CHARS;
1290     case BT_LT:
1291       /* this is for inside entity references */
1292       *nextTokPtr = ptr;
1293       return XML_TOK_INVALID;
1294     case BT_LF:
1295       if (ptr == start) {
1296         *nextTokPtr = ptr + MINBPC(enc);
1297         return XML_TOK_DATA_NEWLINE;
1298       }
1299       *nextTokPtr = ptr;
1300       return XML_TOK_DATA_CHARS;
1301     case BT_CR:
1302       if (ptr == start) {
1303         ptr += MINBPC(enc);
1304         if (! HAS_CHAR(enc, ptr, end))
1305           return XML_TOK_TRAILING_CR;
1306         if (BYTE_TYPE(enc, ptr) == BT_LF)
1307           ptr += MINBPC(enc);
1308         *nextTokPtr = ptr;
1309         return XML_TOK_DATA_NEWLINE;
1310       }
1311       *nextTokPtr = ptr;
1312       return XML_TOK_DATA_CHARS;
1313     case BT_S:
1314       if (ptr == start) {
1315         *nextTokPtr = ptr + MINBPC(enc);
1316         return XML_TOK_ATTRIBUTE_VALUE_S;
1317       }
1318       *nextTokPtr = ptr;
1319       return XML_TOK_DATA_CHARS;
1320     default:
1321       ptr += MINBPC(enc);
1322       break;
1323     }
1324   }
1325   *nextTokPtr = ptr;
1326   return XML_TOK_DATA_CHARS;
1327 }
1328 
1329 static int PTRCALL
PREFIX(entityValueTok)1330 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1331                        const char **nextTokPtr) {
1332   const char *start;
1333   if (ptr >= end)
1334     return XML_TOK_NONE;
1335   else if (! HAS_CHAR(enc, ptr, end)) {
1336     /* This line cannot be executed.  The incoming data has already
1337      * been tokenized once, so incomplete characters like this have
1338      * already been eliminated from the input.  Retaining the paranoia
1339      * check is still valuable, however.
1340      */
1341     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1342   }
1343   start = ptr;
1344   while (HAS_CHAR(enc, ptr, end)) {
1345     switch (BYTE_TYPE(enc, ptr)) {
1346 #  define LEAD_CASE(n)                                                         \
1347   case BT_LEAD##n:                                                             \
1348     ptr += n; /* NOTE: The encoding has already been validated. */             \
1349     break;
1350       LEAD_CASE(2)
1351       LEAD_CASE(3)
1352       LEAD_CASE(4)
1353 #  undef LEAD_CASE
1354     case BT_AMP:
1355       if (ptr == start)
1356         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1357       *nextTokPtr = ptr;
1358       return XML_TOK_DATA_CHARS;
1359     case BT_PERCNT:
1360       if (ptr == start) {
1361         int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1362         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1363       }
1364       *nextTokPtr = ptr;
1365       return XML_TOK_DATA_CHARS;
1366     case BT_LF:
1367       if (ptr == start) {
1368         *nextTokPtr = ptr + MINBPC(enc);
1369         return XML_TOK_DATA_NEWLINE;
1370       }
1371       *nextTokPtr = ptr;
1372       return XML_TOK_DATA_CHARS;
1373     case BT_CR:
1374       if (ptr == start) {
1375         ptr += MINBPC(enc);
1376         if (! HAS_CHAR(enc, ptr, end))
1377           return XML_TOK_TRAILING_CR;
1378         if (BYTE_TYPE(enc, ptr) == BT_LF)
1379           ptr += MINBPC(enc);
1380         *nextTokPtr = ptr;
1381         return XML_TOK_DATA_NEWLINE;
1382       }
1383       *nextTokPtr = ptr;
1384       return XML_TOK_DATA_CHARS;
1385     default:
1386       ptr += MINBPC(enc);
1387       break;
1388     }
1389   }
1390   *nextTokPtr = ptr;
1391   return XML_TOK_DATA_CHARS;
1392 }
1393 
1394 #  ifdef XML_DTD
1395 
1396 static int PTRCALL
PREFIX(ignoreSectionTok)1397 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1398                          const char **nextTokPtr) {
1399   int level = 0;
1400   if (MINBPC(enc) > 1) {
1401     size_t n = end - ptr;
1402     if (n & (MINBPC(enc) - 1)) {
1403       n &= ~(MINBPC(enc) - 1);
1404       end = ptr + n;
1405     }
1406   }
1407   while (HAS_CHAR(enc, ptr, end)) {
1408     switch (BYTE_TYPE(enc, ptr)) {
1409       INVALID_CASES(ptr, nextTokPtr)
1410     case BT_LT:
1411       ptr += MINBPC(enc);
1412       REQUIRE_CHAR(enc, ptr, end);
1413       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1414         ptr += MINBPC(enc);
1415         REQUIRE_CHAR(enc, ptr, end);
1416         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1417           ++level;
1418           ptr += MINBPC(enc);
1419         }
1420       }
1421       break;
1422     case BT_RSQB:
1423       ptr += MINBPC(enc);
1424       REQUIRE_CHAR(enc, ptr, end);
1425       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1426         ptr += MINBPC(enc);
1427         REQUIRE_CHAR(enc, ptr, end);
1428         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1429           ptr += MINBPC(enc);
1430           if (level == 0) {
1431             *nextTokPtr = ptr;
1432             return XML_TOK_IGNORE_SECT;
1433           }
1434           --level;
1435         }
1436       }
1437       break;
1438     default:
1439       ptr += MINBPC(enc);
1440       break;
1441     }
1442   }
1443   return XML_TOK_PARTIAL;
1444 }
1445 
1446 #  endif /* XML_DTD */
1447 
1448 static int PTRCALL
PREFIX(isPublicId)1449 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1450                    const char **badPtr) {
1451   ptr += MINBPC(enc);
1452   end -= MINBPC(enc);
1453   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1454     switch (BYTE_TYPE(enc, ptr)) {
1455     case BT_DIGIT:
1456     case BT_HEX:
1457     case BT_MINUS:
1458     case BT_APOS:
1459     case BT_LPAR:
1460     case BT_RPAR:
1461     case BT_PLUS:
1462     case BT_COMMA:
1463     case BT_SOL:
1464     case BT_EQUALS:
1465     case BT_QUEST:
1466     case BT_CR:
1467     case BT_LF:
1468     case BT_SEMI:
1469     case BT_EXCL:
1470     case BT_AST:
1471     case BT_PERCNT:
1472     case BT_NUM:
1473 #  ifdef XML_NS
1474     case BT_COLON:
1475 #  endif
1476       break;
1477     case BT_S:
1478       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1479         *badPtr = ptr;
1480         return 0;
1481       }
1482       break;
1483     case BT_NAME:
1484     case BT_NMSTRT:
1485       if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1486         break;
1487       /* fall through */
1488     default:
1489       switch (BYTE_TO_ASCII(enc, ptr)) {
1490       case 0x24: /* $ */
1491       case 0x40: /* @ */
1492         break;
1493       default:
1494         *badPtr = ptr;
1495         return 0;
1496       }
1497       break;
1498     }
1499   }
1500   return 1;
1501 }
1502 
1503 /* This must only be called for a well-formed start-tag or empty
1504    element tag.  Returns the number of attributes.  Pointers to the
1505    first attsMax attributes are stored in atts.
1506 */
1507 
1508 static int PTRCALL
PREFIX(getAtts)1509 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1510                 ATTRIBUTE *atts) {
1511   enum { other, inName, inValue } state = inName;
1512   int nAtts = 0;
1513   int open = 0; /* defined when state == inValue;
1514                    initialization just to shut up compilers */
1515 
1516   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1517     switch (BYTE_TYPE(enc, ptr)) {
1518 #  define START_NAME                                                           \
1519     if (state == other) {                                                      \
1520       if (nAtts < attsMax) {                                                   \
1521         atts[nAtts].name = ptr;                                                \
1522         atts[nAtts].normalized = 1;                                            \
1523       }                                                                        \
1524       state = inName;                                                          \
1525     }
1526 #  define LEAD_CASE(n)                                                         \
1527   case BT_LEAD##n: /* NOTE: The encoding has already been validated. */        \
1528     START_NAME ptr += (n - MINBPC(enc));                                       \
1529     break;
1530       LEAD_CASE(2)
1531       LEAD_CASE(3)
1532       LEAD_CASE(4)
1533 #  undef LEAD_CASE
1534     case BT_NONASCII:
1535     case BT_NMSTRT:
1536     case BT_HEX:
1537       START_NAME
1538       break;
1539 #  undef START_NAME
1540     case BT_QUOT:
1541       if (state != inValue) {
1542         if (nAtts < attsMax)
1543           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1544         state = inValue;
1545         open = BT_QUOT;
1546       } else if (open == BT_QUOT) {
1547         state = other;
1548         if (nAtts < attsMax)
1549           atts[nAtts].valueEnd = ptr;
1550         nAtts++;
1551       }
1552       break;
1553     case BT_APOS:
1554       if (state != inValue) {
1555         if (nAtts < attsMax)
1556           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1557         state = inValue;
1558         open = BT_APOS;
1559       } else if (open == BT_APOS) {
1560         state = other;
1561         if (nAtts < attsMax)
1562           atts[nAtts].valueEnd = ptr;
1563         nAtts++;
1564       }
1565       break;
1566     case BT_AMP:
1567       if (nAtts < attsMax)
1568         atts[nAtts].normalized = 0;
1569       break;
1570     case BT_S:
1571       if (state == inName)
1572         state = other;
1573       else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1574                && (ptr == atts[nAtts].valuePtr
1575                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1576                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1577                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1578         atts[nAtts].normalized = 0;
1579       break;
1580     case BT_CR:
1581     case BT_LF:
1582       /* This case ensures that the first attribute name is counted
1583          Apart from that we could just change state on the quote. */
1584       if (state == inName)
1585         state = other;
1586       else if (state == inValue && nAtts < attsMax)
1587         atts[nAtts].normalized = 0;
1588       break;
1589     case BT_GT:
1590     case BT_SOL:
1591       if (state != inValue)
1592         return nAtts;
1593       break;
1594     default:
1595       break;
1596     }
1597   }
1598   /* not reached */
1599 }
1600 
1601 static int PTRFASTCALL
PREFIX(charRefNumber)1602 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1603   int result = 0;
1604   /* skip &# */
1605   UNUSED_P(enc);
1606   ptr += 2 * MINBPC(enc);
1607   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1608     for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1609          ptr += MINBPC(enc)) {
1610       int c = BYTE_TO_ASCII(enc, ptr);
1611       switch (c) {
1612       case ASCII_0:
1613       case ASCII_1:
1614       case ASCII_2:
1615       case ASCII_3:
1616       case ASCII_4:
1617       case ASCII_5:
1618       case ASCII_6:
1619       case ASCII_7:
1620       case ASCII_8:
1621       case ASCII_9:
1622         result <<= 4;
1623         result |= (c - ASCII_0);
1624         break;
1625       case ASCII_A:
1626       case ASCII_B:
1627       case ASCII_C:
1628       case ASCII_D:
1629       case ASCII_E:
1630       case ASCII_F:
1631         result <<= 4;
1632         result += 10 + (c - ASCII_A);
1633         break;
1634       case ASCII_a:
1635       case ASCII_b:
1636       case ASCII_c:
1637       case ASCII_d:
1638       case ASCII_e:
1639       case ASCII_f:
1640         result <<= 4;
1641         result += 10 + (c - ASCII_a);
1642         break;
1643       }
1644       if (result >= 0x110000)
1645         return -1;
1646     }
1647   } else {
1648     for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1649       int c = BYTE_TO_ASCII(enc, ptr);
1650       result *= 10;
1651       result += (c - ASCII_0);
1652       if (result >= 0x110000)
1653         return -1;
1654     }
1655   }
1656   return checkCharRefNumber(result);
1657 }
1658 
1659 static int PTRCALL
PREFIX(predefinedEntityName)1660 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1661                              const char *end) {
1662   UNUSED_P(enc);
1663   switch ((end - ptr) / MINBPC(enc)) {
1664   case 2:
1665     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1666       switch (BYTE_TO_ASCII(enc, ptr)) {
1667       case ASCII_l:
1668         return ASCII_LT;
1669       case ASCII_g:
1670         return ASCII_GT;
1671       }
1672     }
1673     break;
1674   case 3:
1675     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1676       ptr += MINBPC(enc);
1677       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1678         ptr += MINBPC(enc);
1679         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1680           return ASCII_AMP;
1681       }
1682     }
1683     break;
1684   case 4:
1685     switch (BYTE_TO_ASCII(enc, ptr)) {
1686     case ASCII_q:
1687       ptr += MINBPC(enc);
1688       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1689         ptr += MINBPC(enc);
1690         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1691           ptr += MINBPC(enc);
1692           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1693             return ASCII_QUOT;
1694         }
1695       }
1696       break;
1697     case ASCII_a:
1698       ptr += MINBPC(enc);
1699       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1700         ptr += MINBPC(enc);
1701         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1702           ptr += MINBPC(enc);
1703           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1704             return ASCII_APOS;
1705         }
1706       }
1707       break;
1708     }
1709   }
1710   return 0;
1711 }
1712 
1713 static int PTRCALL
PREFIX(nameMatchesAscii)1714 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1715                          const char *end1, const char *ptr2) {
1716   UNUSED_P(enc);
1717   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1718     if (end1 - ptr1 < MINBPC(enc)) {
1719       /* This line cannot be executed.  The incoming data has already
1720        * been tokenized once, so incomplete characters like this have
1721        * already been eliminated from the input.  Retaining the
1722        * paranoia check is still valuable, however.
1723        */
1724       return 0; /* LCOV_EXCL_LINE */
1725     }
1726     if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1727       return 0;
1728   }
1729   return ptr1 == end1;
1730 }
1731 
1732 static int PTRFASTCALL
PREFIX(nameLength)1733 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1734   const char *start = ptr;
1735   for (;;) {
1736     switch (BYTE_TYPE(enc, ptr)) {
1737 #  define LEAD_CASE(n)                                                         \
1738   case BT_LEAD##n:                                                             \
1739     ptr += n; /* NOTE: The encoding has already been validated. */             \
1740     break;
1741       LEAD_CASE(2)
1742       LEAD_CASE(3)
1743       LEAD_CASE(4)
1744 #  undef LEAD_CASE
1745     case BT_NONASCII:
1746     case BT_NMSTRT:
1747 #  ifdef XML_NS
1748     case BT_COLON:
1749 #  endif
1750     case BT_HEX:
1751     case BT_DIGIT:
1752     case BT_NAME:
1753     case BT_MINUS:
1754       ptr += MINBPC(enc);
1755       break;
1756     default:
1757       return (int)(ptr - start);
1758     }
1759   }
1760 }
1761 
1762 static const char *PTRFASTCALL
PREFIX(skipS)1763 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1764   for (;;) {
1765     switch (BYTE_TYPE(enc, ptr)) {
1766     case BT_LF:
1767     case BT_CR:
1768     case BT_S:
1769       ptr += MINBPC(enc);
1770       break;
1771     default:
1772       return ptr;
1773     }
1774   }
1775 }
1776 
1777 static void PTRCALL
PREFIX(updatePosition)1778 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1779                        POSITION *pos) {
1780   while (HAS_CHAR(enc, ptr, end)) {
1781     switch (BYTE_TYPE(enc, ptr)) {
1782 #  define LEAD_CASE(n)                                                         \
1783   case BT_LEAD##n:                                                             \
1784     ptr += n; /* NOTE: The encoding has already been validated. */             \
1785     pos->columnNumber++;                                                       \
1786     break;
1787       LEAD_CASE(2)
1788       LEAD_CASE(3)
1789       LEAD_CASE(4)
1790 #  undef LEAD_CASE
1791     case BT_LF:
1792       pos->columnNumber = 0;
1793       pos->lineNumber++;
1794       ptr += MINBPC(enc);
1795       break;
1796     case BT_CR:
1797       pos->lineNumber++;
1798       ptr += MINBPC(enc);
1799       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1800         ptr += MINBPC(enc);
1801       pos->columnNumber = 0;
1802       break;
1803     default:
1804       ptr += MINBPC(enc);
1805       pos->columnNumber++;
1806       break;
1807     }
1808   }
1809 }
1810 
1811 #  undef DO_LEAD_CASE
1812 #  undef MULTIBYTE_CASES
1813 #  undef INVALID_CASES
1814 #  undef CHECK_NAME_CASE
1815 #  undef CHECK_NAME_CASES
1816 #  undef CHECK_NMSTRT_CASE
1817 #  undef CHECK_NMSTRT_CASES
1818 
1819 #endif /* XML_TOK_IMPL_C */
1820