xref: /freebsd/contrib/expat/lib/xmltok_impl.c (revision 3e8eb5c7f4909209c042403ddee340b2ee7003a5)
1 /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2002      Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
13    Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
14    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
15    Copyright (c) 2018      Benjamin Peterson <benjamin@python.org>
16    Copyright (c) 2018      Anton Maklakov <antmak.pub@gmail.com>
17    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
18    Copyright (c) 2020      Boris Kolpackov <boris@codesynthesis.com>
19    Licensed under the MIT license:
20 
21    Permission is  hereby granted,  free of charge,  to any  person obtaining
22    a  copy  of  this  software   and  associated  documentation  files  (the
23    "Software"),  to  deal in  the  Software  without restriction,  including
24    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
25    distribute, sublicense, and/or sell copies of the Software, and to permit
26    persons  to whom  the Software  is  furnished to  do so,  subject to  the
27    following conditions:
28 
29    The above copyright  notice and this permission notice  shall be included
30    in all copies or substantial portions of the Software.
31 
32    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
33    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
34    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
35    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
36    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
37    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
38    USE OR OTHER DEALINGS IN THE SOFTWARE.
39 */
40 
41 #ifdef XML_TOK_IMPL_C
42 
#  ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif

/* Switch case for an n-byte lead byte (BT_LEAD2/3/4) inside arbitrary
   content: returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain,
   returns XML_TOK_INVALID (storing ptr in *nextTokPtr) when
   IS_INVALID_CHAR rejects the sequence, and otherwise advances ptr by n and
   leaves the switch.  The expansion uses `enc` and `end` from the scope
   where the macro is expanded. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* The three lead-byte cases above plus the byte types that are always
   rejected (BT_NONXML, BT_MALFORM, BT_TRAIL).  The expansion ends with a
   return, so a case label placed right after it is not fallen into. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_NONXML:                                                              \
  case BT_MALFORM:                                                             \
  case BT_TRAIL:                                                               \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;

/* Switch case for an n-byte lead byte that must encode a name character:
   like INVALID_LEAD_CASE, but the sequence is also rejected when
   IS_NAME_CHAR is false for it. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Switch cases that consume one name character.  BT_NONASCII is checked
   with IS_NAME_CHAR_MINBPC first; the single-unit byte types advance ptr by
   MINBPC(enc); multi-byte lead bytes go through CHECK_NAME_CASE.  Every
   path either returns or ends in `break`. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)

/* Same as CHECK_NAME_CASE, but the sequence must satisfy IS_NMSTRT_CHAR
   (a character allowed at the start of a name). */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Switch cases that consume one name-start character; the counterpart of
   CHECK_NAME_CASES with the narrower set BT_NMSTRT / BT_HEX. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)

/* Identity name decoration unless the including file supplies its own. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif

/* True when at least `count` minimum-size characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  Arguments and the whole expansion are
   parenthesized so that compound expressions such as `n + 1` or
   `cond ? a : b` keep their meaning after substitution. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   characters are available.  Expands to a braced block, so it is written
   with a trailing semicolon at the call site. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
139 
140 /* ptr points to character following "<!-" */
141 
142 static int PTRCALL
143 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
144                     const char **nextTokPtr) {
145   if (HAS_CHAR(enc, ptr, end)) {
146     if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
147       *nextTokPtr = ptr;
148       return XML_TOK_INVALID;
149     }
150     ptr += MINBPC(enc);
151     while (HAS_CHAR(enc, ptr, end)) {
152       switch (BYTE_TYPE(enc, ptr)) {
153         INVALID_CASES(ptr, nextTokPtr)
154       case BT_MINUS:
155         ptr += MINBPC(enc);
156         REQUIRE_CHAR(enc, ptr, end);
157         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
158           ptr += MINBPC(enc);
159           REQUIRE_CHAR(enc, ptr, end);
160           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
161             *nextTokPtr = ptr;
162             return XML_TOK_INVALID;
163           }
164           *nextTokPtr = ptr + MINBPC(enc);
165           return XML_TOK_COMMENT;
166         }
167         break;
168       default:
169         ptr += MINBPC(enc);
170         break;
171       }
172     }
173   }
174   return XML_TOK_PARTIAL;
175 }
176 
177 /* ptr points to character following "<!" */
178 
179 static int PTRCALL
180 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
181                  const char **nextTokPtr) {
182   REQUIRE_CHAR(enc, ptr, end);
183   switch (BYTE_TYPE(enc, ptr)) {
184   case BT_MINUS:
185     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
186   case BT_LSQB:
187     *nextTokPtr = ptr + MINBPC(enc);
188     return XML_TOK_COND_SECT_OPEN;
189   case BT_NMSTRT:
190   case BT_HEX:
191     ptr += MINBPC(enc);
192     break;
193   default:
194     *nextTokPtr = ptr;
195     return XML_TOK_INVALID;
196   }
197   while (HAS_CHAR(enc, ptr, end)) {
198     switch (BYTE_TYPE(enc, ptr)) {
199     case BT_PERCNT:
200       REQUIRE_CHARS(enc, ptr, end, 2);
201       /* don't allow <!ENTITY% foo "whatever"> */
202       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
203       case BT_S:
204       case BT_CR:
205       case BT_LF:
206       case BT_PERCNT:
207         *nextTokPtr = ptr;
208         return XML_TOK_INVALID;
209       }
210       /* fall through */
211     case BT_S:
212     case BT_CR:
213     case BT_LF:
214       *nextTokPtr = ptr;
215       return XML_TOK_DECL_OPEN;
216     case BT_NMSTRT:
217     case BT_HEX:
218       ptr += MINBPC(enc);
219       break;
220     default:
221       *nextTokPtr = ptr;
222       return XML_TOK_INVALID;
223     }
224   }
225   return XML_TOK_PARTIAL;
226 }
227 
228 static int PTRCALL
229 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
230                       int *tokPtr) {
231   int upper = 0;
232   UNUSED_P(enc);
233   *tokPtr = XML_TOK_PI;
234   if (end - ptr != MINBPC(enc) * 3)
235     return 1;
236   switch (BYTE_TO_ASCII(enc, ptr)) {
237   case ASCII_x:
238     break;
239   case ASCII_X:
240     upper = 1;
241     break;
242   default:
243     return 1;
244   }
245   ptr += MINBPC(enc);
246   switch (BYTE_TO_ASCII(enc, ptr)) {
247   case ASCII_m:
248     break;
249   case ASCII_M:
250     upper = 1;
251     break;
252   default:
253     return 1;
254   }
255   ptr += MINBPC(enc);
256   switch (BYTE_TO_ASCII(enc, ptr)) {
257   case ASCII_l:
258     break;
259   case ASCII_L:
260     upper = 1;
261     break;
262   default:
263     return 1;
264   }
265   if (upper)
266     return 0;
267   *tokPtr = XML_TOK_XML_DECL;
268   return 1;
269 }
270 
271 /* ptr points to character following "<?" */
272 
273 static int PTRCALL
274 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
275                const char **nextTokPtr) {
276   int tok;
277   const char *target = ptr;
278   REQUIRE_CHAR(enc, ptr, end);
279   switch (BYTE_TYPE(enc, ptr)) {
280     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
281   default:
282     *nextTokPtr = ptr;
283     return XML_TOK_INVALID;
284   }
285   while (HAS_CHAR(enc, ptr, end)) {
286     switch (BYTE_TYPE(enc, ptr)) {
287       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
288     case BT_S:
289     case BT_CR:
290     case BT_LF:
291       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
292         *nextTokPtr = ptr;
293         return XML_TOK_INVALID;
294       }
295       ptr += MINBPC(enc);
296       while (HAS_CHAR(enc, ptr, end)) {
297         switch (BYTE_TYPE(enc, ptr)) {
298           INVALID_CASES(ptr, nextTokPtr)
299         case BT_QUEST:
300           ptr += MINBPC(enc);
301           REQUIRE_CHAR(enc, ptr, end);
302           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
303             *nextTokPtr = ptr + MINBPC(enc);
304             return tok;
305           }
306           break;
307         default:
308           ptr += MINBPC(enc);
309           break;
310         }
311       }
312       return XML_TOK_PARTIAL;
313     case BT_QUEST:
314       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
315         *nextTokPtr = ptr;
316         return XML_TOK_INVALID;
317       }
318       ptr += MINBPC(enc);
319       REQUIRE_CHAR(enc, ptr, end);
320       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
321         *nextTokPtr = ptr + MINBPC(enc);
322         return tok;
323       }
324       /* fall through */
325     default:
326       *nextTokPtr = ptr;
327       return XML_TOK_INVALID;
328     }
329   }
330   return XML_TOK_PARTIAL;
331 }
332 
333 static int PTRCALL
334 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
335                          const char **nextTokPtr) {
336   static const char CDATA_LSQB[]
337       = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
338   int i;
339   UNUSED_P(enc);
340   /* CDATA[ */
341   REQUIRE_CHARS(enc, ptr, end, 6);
342   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
343     if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
344       *nextTokPtr = ptr;
345       return XML_TOK_INVALID;
346     }
347   }
348   *nextTokPtr = ptr;
349   return XML_TOK_CDATA_SECT_OPEN;
350 }
351 
352 static int PTRCALL
353 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
354                         const char **nextTokPtr) {
355   if (ptr >= end)
356     return XML_TOK_NONE;
357   if (MINBPC(enc) > 1) {
358     size_t n = end - ptr;
359     if (n & (MINBPC(enc) - 1)) {
360       n &= ~(MINBPC(enc) - 1);
361       if (n == 0)
362         return XML_TOK_PARTIAL;
363       end = ptr + n;
364     }
365   }
366   switch (BYTE_TYPE(enc, ptr)) {
367   case BT_RSQB:
368     ptr += MINBPC(enc);
369     REQUIRE_CHAR(enc, ptr, end);
370     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
371       break;
372     ptr += MINBPC(enc);
373     REQUIRE_CHAR(enc, ptr, end);
374     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
375       ptr -= MINBPC(enc);
376       break;
377     }
378     *nextTokPtr = ptr + MINBPC(enc);
379     return XML_TOK_CDATA_SECT_CLOSE;
380   case BT_CR:
381     ptr += MINBPC(enc);
382     REQUIRE_CHAR(enc, ptr, end);
383     if (BYTE_TYPE(enc, ptr) == BT_LF)
384       ptr += MINBPC(enc);
385     *nextTokPtr = ptr;
386     return XML_TOK_DATA_NEWLINE;
387   case BT_LF:
388     *nextTokPtr = ptr + MINBPC(enc);
389     return XML_TOK_DATA_NEWLINE;
390     INVALID_CASES(ptr, nextTokPtr)
391   default:
392     ptr += MINBPC(enc);
393     break;
394   }
395   while (HAS_CHAR(enc, ptr, end)) {
396     switch (BYTE_TYPE(enc, ptr)) {
397 #  define LEAD_CASE(n)                                                         \
398   case BT_LEAD##n:                                                             \
399     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
400       *nextTokPtr = ptr;                                                       \
401       return XML_TOK_DATA_CHARS;                                               \
402     }                                                                          \
403     ptr += n;                                                                  \
404     break;
405       LEAD_CASE(2)
406       LEAD_CASE(3)
407       LEAD_CASE(4)
408 #  undef LEAD_CASE
409     case BT_NONXML:
410     case BT_MALFORM:
411     case BT_TRAIL:
412     case BT_CR:
413     case BT_LF:
414     case BT_RSQB:
415       *nextTokPtr = ptr;
416       return XML_TOK_DATA_CHARS;
417     default:
418       ptr += MINBPC(enc);
419       break;
420     }
421   }
422   *nextTokPtr = ptr;
423   return XML_TOK_DATA_CHARS;
424 }
425 
426 /* ptr points to character following "</" */
427 
428 static int PTRCALL
429 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
430                    const char **nextTokPtr) {
431   REQUIRE_CHAR(enc, ptr, end);
432   switch (BYTE_TYPE(enc, ptr)) {
433     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
434   default:
435     *nextTokPtr = ptr;
436     return XML_TOK_INVALID;
437   }
438   while (HAS_CHAR(enc, ptr, end)) {
439     switch (BYTE_TYPE(enc, ptr)) {
440       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
441     case BT_S:
442     case BT_CR:
443     case BT_LF:
444       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
445         switch (BYTE_TYPE(enc, ptr)) {
446         case BT_S:
447         case BT_CR:
448         case BT_LF:
449           break;
450         case BT_GT:
451           *nextTokPtr = ptr + MINBPC(enc);
452           return XML_TOK_END_TAG;
453         default:
454           *nextTokPtr = ptr;
455           return XML_TOK_INVALID;
456         }
457       }
458       return XML_TOK_PARTIAL;
459 #  ifdef XML_NS
460     case BT_COLON:
461       /* no need to check qname syntax here,
462          since end-tag must match exactly */
463       ptr += MINBPC(enc);
464       break;
465 #  endif
466     case BT_GT:
467       *nextTokPtr = ptr + MINBPC(enc);
468       return XML_TOK_END_TAG;
469     default:
470       *nextTokPtr = ptr;
471       return XML_TOK_INVALID;
472     }
473   }
474   return XML_TOK_PARTIAL;
475 }
476 
477 /* ptr points to character following "&#X" */
478 
479 static int PTRCALL
480 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
481                        const char **nextTokPtr) {
482   if (HAS_CHAR(enc, ptr, end)) {
483     switch (BYTE_TYPE(enc, ptr)) {
484     case BT_DIGIT:
485     case BT_HEX:
486       break;
487     default:
488       *nextTokPtr = ptr;
489       return XML_TOK_INVALID;
490     }
491     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
492       switch (BYTE_TYPE(enc, ptr)) {
493       case BT_DIGIT:
494       case BT_HEX:
495         break;
496       case BT_SEMI:
497         *nextTokPtr = ptr + MINBPC(enc);
498         return XML_TOK_CHAR_REF;
499       default:
500         *nextTokPtr = ptr;
501         return XML_TOK_INVALID;
502       }
503     }
504   }
505   return XML_TOK_PARTIAL;
506 }
507 
508 /* ptr points to character following "&#" */
509 
510 static int PTRCALL
511 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
512                     const char **nextTokPtr) {
513   if (HAS_CHAR(enc, ptr, end)) {
514     if (CHAR_MATCHES(enc, ptr, ASCII_x))
515       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
516     switch (BYTE_TYPE(enc, ptr)) {
517     case BT_DIGIT:
518       break;
519     default:
520       *nextTokPtr = ptr;
521       return XML_TOK_INVALID;
522     }
523     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
524       switch (BYTE_TYPE(enc, ptr)) {
525       case BT_DIGIT:
526         break;
527       case BT_SEMI:
528         *nextTokPtr = ptr + MINBPC(enc);
529         return XML_TOK_CHAR_REF;
530       default:
531         *nextTokPtr = ptr;
532         return XML_TOK_INVALID;
533       }
534     }
535   }
536   return XML_TOK_PARTIAL;
537 }
538 
539 /* ptr points to character following "&" */
540 
541 static int PTRCALL
542 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
543                 const char **nextTokPtr) {
544   REQUIRE_CHAR(enc, ptr, end);
545   switch (BYTE_TYPE(enc, ptr)) {
546     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
547   case BT_NUM:
548     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
549   default:
550     *nextTokPtr = ptr;
551     return XML_TOK_INVALID;
552   }
553   while (HAS_CHAR(enc, ptr, end)) {
554     switch (BYTE_TYPE(enc, ptr)) {
555       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
556     case BT_SEMI:
557       *nextTokPtr = ptr + MINBPC(enc);
558       return XML_TOK_ENTITY_REF;
559     default:
560       *nextTokPtr = ptr;
561       return XML_TOK_INVALID;
562     }
563   }
564   return XML_TOK_PARTIAL;
565 }
566 
567 /* ptr points to character following first character of attribute name */
568 
569 static int PTRCALL
570 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
571                  const char **nextTokPtr) {
572 #  ifdef XML_NS
573   int hadColon = 0;
574 #  endif
575   while (HAS_CHAR(enc, ptr, end)) {
576     switch (BYTE_TYPE(enc, ptr)) {
577       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
578 #  ifdef XML_NS
579     case BT_COLON:
580       if (hadColon) {
581         *nextTokPtr = ptr;
582         return XML_TOK_INVALID;
583       }
584       hadColon = 1;
585       ptr += MINBPC(enc);
586       REQUIRE_CHAR(enc, ptr, end);
587       switch (BYTE_TYPE(enc, ptr)) {
588         CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
589       default:
590         *nextTokPtr = ptr;
591         return XML_TOK_INVALID;
592       }
593       break;
594 #  endif
595     case BT_S:
596     case BT_CR:
597     case BT_LF:
598       for (;;) {
599         int t;
600 
601         ptr += MINBPC(enc);
602         REQUIRE_CHAR(enc, ptr, end);
603         t = BYTE_TYPE(enc, ptr);
604         if (t == BT_EQUALS)
605           break;
606         switch (t) {
607         case BT_S:
608         case BT_LF:
609         case BT_CR:
610           break;
611         default:
612           *nextTokPtr = ptr;
613           return XML_TOK_INVALID;
614         }
615       }
616       /* fall through */
617     case BT_EQUALS: {
618       int open;
619 #  ifdef XML_NS
620       hadColon = 0;
621 #  endif
622       for (;;) {
623         ptr += MINBPC(enc);
624         REQUIRE_CHAR(enc, ptr, end);
625         open = BYTE_TYPE(enc, ptr);
626         if (open == BT_QUOT || open == BT_APOS)
627           break;
628         switch (open) {
629         case BT_S:
630         case BT_LF:
631         case BT_CR:
632           break;
633         default:
634           *nextTokPtr = ptr;
635           return XML_TOK_INVALID;
636         }
637       }
638       ptr += MINBPC(enc);
639       /* in attribute value */
640       for (;;) {
641         int t;
642         REQUIRE_CHAR(enc, ptr, end);
643         t = BYTE_TYPE(enc, ptr);
644         if (t == open)
645           break;
646         switch (t) {
647           INVALID_CASES(ptr, nextTokPtr)
648         case BT_AMP: {
649           int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
650           if (tok <= 0) {
651             if (tok == XML_TOK_INVALID)
652               *nextTokPtr = ptr;
653             return tok;
654           }
655           break;
656         }
657         case BT_LT:
658           *nextTokPtr = ptr;
659           return XML_TOK_INVALID;
660         default:
661           ptr += MINBPC(enc);
662           break;
663         }
664       }
665       ptr += MINBPC(enc);
666       REQUIRE_CHAR(enc, ptr, end);
667       switch (BYTE_TYPE(enc, ptr)) {
668       case BT_S:
669       case BT_CR:
670       case BT_LF:
671         break;
672       case BT_SOL:
673         goto sol;
674       case BT_GT:
675         goto gt;
676       default:
677         *nextTokPtr = ptr;
678         return XML_TOK_INVALID;
679       }
680       /* ptr points to closing quote */
681       for (;;) {
682         ptr += MINBPC(enc);
683         REQUIRE_CHAR(enc, ptr, end);
684         switch (BYTE_TYPE(enc, ptr)) {
685           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
686         case BT_S:
687         case BT_CR:
688         case BT_LF:
689           continue;
690         case BT_GT:
691         gt:
692           *nextTokPtr = ptr + MINBPC(enc);
693           return XML_TOK_START_TAG_WITH_ATTS;
694         case BT_SOL:
695         sol:
696           ptr += MINBPC(enc);
697           REQUIRE_CHAR(enc, ptr, end);
698           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
699             *nextTokPtr = ptr;
700             return XML_TOK_INVALID;
701           }
702           *nextTokPtr = ptr + MINBPC(enc);
703           return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
704         default:
705           *nextTokPtr = ptr;
706           return XML_TOK_INVALID;
707         }
708         break;
709       }
710       break;
711     }
712     default:
713       *nextTokPtr = ptr;
714       return XML_TOK_INVALID;
715     }
716   }
717   return XML_TOK_PARTIAL;
718 }
719 
720 /* ptr points to character following "<" */
721 
722 static int PTRCALL
723 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
724                const char **nextTokPtr) {
725 #  ifdef XML_NS
726   int hadColon;
727 #  endif
728   REQUIRE_CHAR(enc, ptr, end);
729   switch (BYTE_TYPE(enc, ptr)) {
730     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
731   case BT_EXCL:
732     ptr += MINBPC(enc);
733     REQUIRE_CHAR(enc, ptr, end);
734     switch (BYTE_TYPE(enc, ptr)) {
735     case BT_MINUS:
736       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
737     case BT_LSQB:
738       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739     }
740     *nextTokPtr = ptr;
741     return XML_TOK_INVALID;
742   case BT_QUEST:
743     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
744   case BT_SOL:
745     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
746   default:
747     *nextTokPtr = ptr;
748     return XML_TOK_INVALID;
749   }
750 #  ifdef XML_NS
751   hadColon = 0;
752 #  endif
753   /* we have a start-tag */
754   while (HAS_CHAR(enc, ptr, end)) {
755     switch (BYTE_TYPE(enc, ptr)) {
756       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
757 #  ifdef XML_NS
758     case BT_COLON:
759       if (hadColon) {
760         *nextTokPtr = ptr;
761         return XML_TOK_INVALID;
762       }
763       hadColon = 1;
764       ptr += MINBPC(enc);
765       REQUIRE_CHAR(enc, ptr, end);
766       switch (BYTE_TYPE(enc, ptr)) {
767         CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
768       default:
769         *nextTokPtr = ptr;
770         return XML_TOK_INVALID;
771       }
772       break;
773 #  endif
774     case BT_S:
775     case BT_CR:
776     case BT_LF: {
777       ptr += MINBPC(enc);
778       while (HAS_CHAR(enc, ptr, end)) {
779         switch (BYTE_TYPE(enc, ptr)) {
780           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
781         case BT_GT:
782           goto gt;
783         case BT_SOL:
784           goto sol;
785         case BT_S:
786         case BT_CR:
787         case BT_LF:
788           ptr += MINBPC(enc);
789           continue;
790         default:
791           *nextTokPtr = ptr;
792           return XML_TOK_INVALID;
793         }
794         return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
795       }
796       return XML_TOK_PARTIAL;
797     }
798     case BT_GT:
799     gt:
800       *nextTokPtr = ptr + MINBPC(enc);
801       return XML_TOK_START_TAG_NO_ATTS;
802     case BT_SOL:
803     sol:
804       ptr += MINBPC(enc);
805       REQUIRE_CHAR(enc, ptr, end);
806       if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
807         *nextTokPtr = ptr;
808         return XML_TOK_INVALID;
809       }
810       *nextTokPtr = ptr + MINBPC(enc);
811       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
812     default:
813       *nextTokPtr = ptr;
814       return XML_TOK_INVALID;
815     }
816   }
817   return XML_TOK_PARTIAL;
818 }
819 
820 static int PTRCALL
821 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
822                    const char **nextTokPtr) {
823   if (ptr >= end)
824     return XML_TOK_NONE;
825   if (MINBPC(enc) > 1) {
826     size_t n = end - ptr;
827     if (n & (MINBPC(enc) - 1)) {
828       n &= ~(MINBPC(enc) - 1);
829       if (n == 0)
830         return XML_TOK_PARTIAL;
831       end = ptr + n;
832     }
833   }
834   switch (BYTE_TYPE(enc, ptr)) {
835   case BT_LT:
836     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
837   case BT_AMP:
838     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
839   case BT_CR:
840     ptr += MINBPC(enc);
841     if (! HAS_CHAR(enc, ptr, end))
842       return XML_TOK_TRAILING_CR;
843     if (BYTE_TYPE(enc, ptr) == BT_LF)
844       ptr += MINBPC(enc);
845     *nextTokPtr = ptr;
846     return XML_TOK_DATA_NEWLINE;
847   case BT_LF:
848     *nextTokPtr = ptr + MINBPC(enc);
849     return XML_TOK_DATA_NEWLINE;
850   case BT_RSQB:
851     ptr += MINBPC(enc);
852     if (! HAS_CHAR(enc, ptr, end))
853       return XML_TOK_TRAILING_RSQB;
854     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
855       break;
856     ptr += MINBPC(enc);
857     if (! HAS_CHAR(enc, ptr, end))
858       return XML_TOK_TRAILING_RSQB;
859     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
860       ptr -= MINBPC(enc);
861       break;
862     }
863     *nextTokPtr = ptr;
864     return XML_TOK_INVALID;
865     INVALID_CASES(ptr, nextTokPtr)
866   default:
867     ptr += MINBPC(enc);
868     break;
869   }
870   while (HAS_CHAR(enc, ptr, end)) {
871     switch (BYTE_TYPE(enc, ptr)) {
872 #  define LEAD_CASE(n)                                                         \
873   case BT_LEAD##n:                                                             \
874     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
875       *nextTokPtr = ptr;                                                       \
876       return XML_TOK_DATA_CHARS;                                               \
877     }                                                                          \
878     ptr += n;                                                                  \
879     break;
880       LEAD_CASE(2)
881       LEAD_CASE(3)
882       LEAD_CASE(4)
883 #  undef LEAD_CASE
884     case BT_RSQB:
885       if (HAS_CHARS(enc, ptr, end, 2)) {
886         if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
887           ptr += MINBPC(enc);
888           break;
889         }
890         if (HAS_CHARS(enc, ptr, end, 3)) {
891           if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
892             ptr += MINBPC(enc);
893             break;
894           }
895           *nextTokPtr = ptr + 2 * MINBPC(enc);
896           return XML_TOK_INVALID;
897         }
898       }
899       /* fall through */
900     case BT_AMP:
901     case BT_LT:
902     case BT_NONXML:
903     case BT_MALFORM:
904     case BT_TRAIL:
905     case BT_CR:
906     case BT_LF:
907       *nextTokPtr = ptr;
908       return XML_TOK_DATA_CHARS;
909     default:
910       ptr += MINBPC(enc);
911       break;
912     }
913   }
914   *nextTokPtr = ptr;
915   return XML_TOK_DATA_CHARS;
916 }
917 
918 /* ptr points to character following "%" */
919 
920 static int PTRCALL
921 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
922                     const char **nextTokPtr) {
923   REQUIRE_CHAR(enc, ptr, end);
924   switch (BYTE_TYPE(enc, ptr)) {
925     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
926   case BT_S:
927   case BT_LF:
928   case BT_CR:
929   case BT_PERCNT:
930     *nextTokPtr = ptr;
931     return XML_TOK_PERCENT;
932   default:
933     *nextTokPtr = ptr;
934     return XML_TOK_INVALID;
935   }
936   while (HAS_CHAR(enc, ptr, end)) {
937     switch (BYTE_TYPE(enc, ptr)) {
938       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
939     case BT_SEMI:
940       *nextTokPtr = ptr + MINBPC(enc);
941       return XML_TOK_PARAM_ENTITY_REF;
942     default:
943       *nextTokPtr = ptr;
944       return XML_TOK_INVALID;
945     }
946   }
947   return XML_TOK_PARTIAL;
948 }
949 
950 static int PTRCALL
951 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
952                       const char **nextTokPtr) {
953   REQUIRE_CHAR(enc, ptr, end);
954   switch (BYTE_TYPE(enc, ptr)) {
955     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
956   default:
957     *nextTokPtr = ptr;
958     return XML_TOK_INVALID;
959   }
960   while (HAS_CHAR(enc, ptr, end)) {
961     switch (BYTE_TYPE(enc, ptr)) {
962       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
963     case BT_CR:
964     case BT_LF:
965     case BT_S:
966     case BT_RPAR:
967     case BT_GT:
968     case BT_PERCNT:
969     case BT_VERBAR:
970       *nextTokPtr = ptr;
971       return XML_TOK_POUND_NAME;
972     default:
973       *nextTokPtr = ptr;
974       return XML_TOK_INVALID;
975     }
976   }
977   return -XML_TOK_POUND_NAME;
978 }
979 
980 static int PTRCALL
981 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
982                 const char **nextTokPtr) {
983   while (HAS_CHAR(enc, ptr, end)) {
984     int t = BYTE_TYPE(enc, ptr);
985     switch (t) {
986       INVALID_CASES(ptr, nextTokPtr)
987     case BT_QUOT:
988     case BT_APOS:
989       ptr += MINBPC(enc);
990       if (t != open)
991         break;
992       if (! HAS_CHAR(enc, ptr, end))
993         return -XML_TOK_LITERAL;
994       *nextTokPtr = ptr;
995       switch (BYTE_TYPE(enc, ptr)) {
996       case BT_S:
997       case BT_CR:
998       case BT_LF:
999       case BT_GT:
1000       case BT_PERCNT:
1001       case BT_LSQB:
1002         return XML_TOK_LITERAL;
1003       default:
1004         return XML_TOK_INVALID;
1005       }
1006     default:
1007       ptr += MINBPC(enc);
1008       break;
1009     }
1010   }
1011   return XML_TOK_PARTIAL;
1012 }
1013 
/* Scans one token starting at ptr, dispatching on the byte type of the
   first character, and reports where the token ends through *nextTokPtr.

   Returns XML_TOK_NONE when ptr >= end.  Several paths return a negated
   token code (-XML_TOK_PROLOG_S, -XML_TOK_CLOSE_BRACKET,
   -XML_TOK_CLOSE_PAREN, -tok) when the input ends at a point where more
   data could still change the token.  XML_TOK_INVALID is returned with
   *nextTokPtr on the offending character.  The REQUIRE_* and
   CHECK_NAME_CASES macros (defined earlier in this file) may return on
   their own as well.
*/
1014 static int PTRCALL
1015 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1016                   const char **nextTokPtr) {
       /* Token kind chosen by the first name character; the name loop at the
          bottom may still change it. */
1017   int tok;
1018   if (ptr >= end)
1019     return XML_TOK_NONE;
       /* For encodings whose unit is wider than one byte, ignore trailing
          bytes that do not fill a whole unit; if nothing is left the input
          is incomplete.  NOTE(review): the masking presumes MINBPC(enc) is
          a power of two -- confirm. */
1020   if (MINBPC(enc) > 1) {
1021     size_t n = end - ptr;
1022     if (n & (MINBPC(enc) - 1)) {
1023       n &= ~(MINBPC(enc) - 1);
1024       if (n == 0)
1025         return XML_TOK_PARTIAL;
1026       end = ptr + n;
1027     }
1028   }
1029   switch (BYTE_TYPE(enc, ptr)) {
       /* Quoted literal: scanLit gets the quote type and the position just
          after the opening quote. */
1030   case BT_QUOT:
1031     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1032   case BT_APOS:
1033     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
       /* "<": the following character selects between scanDecl ("<!"),
          scanPi ("<?") and the start of the document instance. */
1034   case BT_LT: {
1035     ptr += MINBPC(enc);
1036     REQUIRE_CHAR(enc, ptr, end);
1037     switch (BYTE_TYPE(enc, ptr)) {
1038     case BT_EXCL:
1039       return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040     case BT_QUEST:
1041       return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1042     case BT_NMSTRT:
1043     case BT_HEX:
1044     case BT_NONASCII:
1045     case BT_LEAD2:
1046     case BT_LEAD3:
1047     case BT_LEAD4:
           /* Step back so that *nextTokPtr addresses the "<" itself. */
1048       *nextTokPtr = ptr - MINBPC(enc);
1049       return XML_TOK_INSTANCE_START;
1050     }
1051     *nextTokPtr = ptr;
1052     return XML_TOK_INVALID;
1053   }
1054   case BT_CR:
1055     if (ptr + MINBPC(enc) == end) {
1056       *nextTokPtr = end;
1057       /* indicate that this might be part of a CR/LF pair */
1058       return -XML_TOK_PROLOG_S;
1059     }
1060     /* fall through */
1061   case BT_S:
1062   case BT_LF:
         /* Consume a run of BT_S / BT_LF / BT_CR characters. */
1063     for (;;) {
1064       ptr += MINBPC(enc);
1065       if (! HAS_CHAR(enc, ptr, end))
1066         break;
1067       switch (BYTE_TYPE(enc, ptr)) {
1068       case BT_S:
1069       case BT_LF:
1070         break;
1071       case BT_CR:
1072         /* don't split CR/LF pair */
1073         if (ptr + MINBPC(enc) != end)
1074           break;
1075         /* fall through */
1076       default:
1077         *nextTokPtr = ptr;
1078         return XML_TOK_PROLOG_S;
1079       }
1080     }
1081     *nextTokPtr = ptr;
1082     return XML_TOK_PROLOG_S;
1083   case BT_PERCNT:
1084     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1085   case BT_COMMA:
1086     *nextTokPtr = ptr + MINBPC(enc);
1087     return XML_TOK_COMMA;
1088   case BT_LSQB:
1089     *nextTokPtr = ptr + MINBPC(enc);
1090     return XML_TOK_OPEN_BRACKET;
       /* "]": either a lone close bracket or, when followed by "]>", the
          close of a conditional section.  A "]" that is the last character
          available yields the negated token. */
1091   case BT_RSQB:
1092     ptr += MINBPC(enc);
1093     if (! HAS_CHAR(enc, ptr, end))
1094       return -XML_TOK_CLOSE_BRACKET;
1095     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1096       REQUIRE_CHARS(enc, ptr, end, 2);
1097       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1098         *nextTokPtr = ptr + 2 * MINBPC(enc);
1099         return XML_TOK_COND_SECT_CLOSE;
1100       }
1101     }
1102     *nextTokPtr = ptr;
1103     return XML_TOK_CLOSE_BRACKET;
1104   case BT_LPAR:
1105     *nextTokPtr = ptr + MINBPC(enc);
1106     return XML_TOK_OPEN_PAREN;
       /* ")": an immediately following "*", "?" or "+" is folded into the
          token; the listed delimiters end a plain ")" without being
          consumed; anything else is invalid. */
1107   case BT_RPAR:
1108     ptr += MINBPC(enc);
1109     if (! HAS_CHAR(enc, ptr, end))
1110       return -XML_TOK_CLOSE_PAREN;
1111     switch (BYTE_TYPE(enc, ptr)) {
1112     case BT_AST:
1113       *nextTokPtr = ptr + MINBPC(enc);
1114       return XML_TOK_CLOSE_PAREN_ASTERISK;
1115     case BT_QUEST:
1116       *nextTokPtr = ptr + MINBPC(enc);
1117       return XML_TOK_CLOSE_PAREN_QUESTION;
1118     case BT_PLUS:
1119       *nextTokPtr = ptr + MINBPC(enc);
1120       return XML_TOK_CLOSE_PAREN_PLUS;
1121     case BT_CR:
1122     case BT_LF:
1123     case BT_S:
1124     case BT_GT:
1125     case BT_COMMA:
1126     case BT_VERBAR:
1127     case BT_RPAR:
1128       *nextTokPtr = ptr;
1129       return XML_TOK_CLOSE_PAREN;
1130     }
1131     *nextTokPtr = ptr;
1132     return XML_TOK_INVALID;
1133   case BT_VERBAR:
1134     *nextTokPtr = ptr + MINBPC(enc);
1135     return XML_TOK_OR;
1136   case BT_GT:
1137     *nextTokPtr = ptr + MINBPC(enc);
1138     return XML_TOK_DECL_CLOSE;
1139   case BT_NUM:
1140     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
       /* Multi-byte lead bytes: a truncated sequence is XML_TOK_PARTIAL_CHAR;
          a name-start character begins an XML_TOK_NAME, any other name
          character an XML_TOK_NMTOKEN; everything else is invalid. */
1141 #  define LEAD_CASE(n)                                                         \
1142   case BT_LEAD##n:                                                             \
1143     if (end - ptr < n)                                                         \
1144       return XML_TOK_PARTIAL_CHAR;                                             \
1145     if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
1146       *nextTokPtr = ptr;                                                       \
1147       return XML_TOK_INVALID;                                                  \
1148     }                                                                          \
1149     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
1150       ptr += n;                                                                \
1151       tok = XML_TOK_NAME;                                                      \
1152       break;                                                                   \
1153     }                                                                          \
1154     if (IS_NAME_CHAR(enc, ptr, n)) {                                           \
1155       ptr += n;                                                                \
1156       tok = XML_TOK_NMTOKEN;                                                   \
1157       break;                                                                   \
1158     }                                                                          \
1159     *nextTokPtr = ptr;                                                         \
1160     return XML_TOK_INVALID;
1161     LEAD_CASE(2)
1162     LEAD_CASE(3)
1163     LEAD_CASE(4)
1164 #  undef LEAD_CASE
1165   case BT_NMSTRT:
1166   case BT_HEX:
1167     tok = XML_TOK_NAME;
1168     ptr += MINBPC(enc);
1169     break;
       /* These may appear inside a name but cannot start one, so the token
          begins as an NMTOKEN. */
1170   case BT_DIGIT:
1171   case BT_NAME:
1172   case BT_MINUS:
1173 #  ifdef XML_NS
1174   case BT_COLON:
1175 #  endif
1176     tok = XML_TOK_NMTOKEN;
1177     ptr += MINBPC(enc);
1178     break;
1179   case BT_NONASCII:
1180     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1181       ptr += MINBPC(enc);
1182       tok = XML_TOK_NAME;
1183       break;
1184     }
1185     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1186       ptr += MINBPC(enc);
1187       tok = XML_TOK_NMTOKEN;
1188       break;
1189     }
1190     /* fall through */
1191   default:
1192     *nextTokPtr = ptr;
1193     return XML_TOK_INVALID;
1194   }
       /* Only the name / nmtoken cases above reach this point (all other
          cases return); tok holds the kind determined so far.  Continue
          over the remaining name characters. */
1195   while (HAS_CHAR(enc, ptr, end)) {
1196     switch (BYTE_TYPE(enc, ptr)) {
1197       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
         /* Delimiters end the token without being consumed. */
1198     case BT_GT:
1199     case BT_RPAR:
1200     case BT_COMMA:
1201     case BT_VERBAR:
1202     case BT_LSQB:
1203     case BT_PERCNT:
1204     case BT_S:
1205     case BT_CR:
1206     case BT_LF:
1207       *nextTokPtr = ptr;
1208       return tok;
1209 #  ifdef XML_NS
         /* First colon inside a NAME: it becomes a PREFIXED_NAME provided a
            name character follows, otherwise an NMTOKEN.  A colon inside a
            PREFIXED_NAME demotes it to an NMTOKEN. */
1210     case BT_COLON:
1211       ptr += MINBPC(enc);
1212       switch (tok) {
1213       case XML_TOK_NAME:
1214         REQUIRE_CHAR(enc, ptr, end);
1215         tok = XML_TOK_PREFIXED_NAME;
1216         switch (BYTE_TYPE(enc, ptr)) {
1217           CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1218         default:
1219           tok = XML_TOK_NMTOKEN;
1220           break;
1221         }
1222         break;
1223       case XML_TOK_PREFIXED_NAME:
1224         tok = XML_TOK_NMTOKEN;
1225         break;
1226       }
1227       break;
1228 #  endif
         /* A trailing "+", "*" or "?" is folded into the token, but is
            rejected after an NMTOKEN. */
1229     case BT_PLUS:
1230       if (tok == XML_TOK_NMTOKEN) {
1231         *nextTokPtr = ptr;
1232         return XML_TOK_INVALID;
1233       }
1234       *nextTokPtr = ptr + MINBPC(enc);
1235       return XML_TOK_NAME_PLUS;
1236     case BT_AST:
1237       if (tok == XML_TOK_NMTOKEN) {
1238         *nextTokPtr = ptr;
1239         return XML_TOK_INVALID;
1240       }
1241       *nextTokPtr = ptr + MINBPC(enc);
1242       return XML_TOK_NAME_ASTERISK;
1243     case BT_QUEST:
1244       if (tok == XML_TOK_NMTOKEN) {
1245         *nextTokPtr = ptr;
1246         return XML_TOK_INVALID;
1247       }
1248       *nextTokPtr = ptr + MINBPC(enc);
1249       return XML_TOK_NAME_QUESTION;
1250     default:
1251       *nextTokPtr = ptr;
1252       return XML_TOK_INVALID;
1253     }
1254   }
       /* The input ended while still inside the name: report the token kind
          negated.  *nextTokPtr is not written on this path. */
1255   return -tok;
1256 }
1257 
1258 static int PTRCALL
1259 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1260                           const char **nextTokPtr) {
1261   const char *start;
1262   if (ptr >= end)
1263     return XML_TOK_NONE;
1264   else if (! HAS_CHAR(enc, ptr, end)) {
1265     /* This line cannot be executed.  The incoming data has already
1266      * been tokenized once, so incomplete characters like this have
1267      * already been eliminated from the input.  Retaining the paranoia
1268      * check is still valuable, however.
1269      */
1270     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1271   }
1272   start = ptr;
1273   while (HAS_CHAR(enc, ptr, end)) {
1274     switch (BYTE_TYPE(enc, ptr)) {
1275 #  define LEAD_CASE(n)                                                         \
1276   case BT_LEAD##n:                                                             \
1277     ptr += n; /* NOTE: The encoding has already been validated. */             \
1278     break;
1279       LEAD_CASE(2)
1280       LEAD_CASE(3)
1281       LEAD_CASE(4)
1282 #  undef LEAD_CASE
1283     case BT_AMP:
1284       if (ptr == start)
1285         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1286       *nextTokPtr = ptr;
1287       return XML_TOK_DATA_CHARS;
1288     case BT_LT:
1289       /* this is for inside entity references */
1290       *nextTokPtr = ptr;
1291       return XML_TOK_INVALID;
1292     case BT_LF:
1293       if (ptr == start) {
1294         *nextTokPtr = ptr + MINBPC(enc);
1295         return XML_TOK_DATA_NEWLINE;
1296       }
1297       *nextTokPtr = ptr;
1298       return XML_TOK_DATA_CHARS;
1299     case BT_CR:
1300       if (ptr == start) {
1301         ptr += MINBPC(enc);
1302         if (! HAS_CHAR(enc, ptr, end))
1303           return XML_TOK_TRAILING_CR;
1304         if (BYTE_TYPE(enc, ptr) == BT_LF)
1305           ptr += MINBPC(enc);
1306         *nextTokPtr = ptr;
1307         return XML_TOK_DATA_NEWLINE;
1308       }
1309       *nextTokPtr = ptr;
1310       return XML_TOK_DATA_CHARS;
1311     case BT_S:
1312       if (ptr == start) {
1313         *nextTokPtr = ptr + MINBPC(enc);
1314         return XML_TOK_ATTRIBUTE_VALUE_S;
1315       }
1316       *nextTokPtr = ptr;
1317       return XML_TOK_DATA_CHARS;
1318     default:
1319       ptr += MINBPC(enc);
1320       break;
1321     }
1322   }
1323   *nextTokPtr = ptr;
1324   return XML_TOK_DATA_CHARS;
1325 }
1326 
1327 static int PTRCALL
1328 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1329                        const char **nextTokPtr) {
1330   const char *start;
1331   if (ptr >= end)
1332     return XML_TOK_NONE;
1333   else if (! HAS_CHAR(enc, ptr, end)) {
1334     /* This line cannot be executed.  The incoming data has already
1335      * been tokenized once, so incomplete characters like this have
1336      * already been eliminated from the input.  Retaining the paranoia
1337      * check is still valuable, however.
1338      */
1339     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1340   }
1341   start = ptr;
1342   while (HAS_CHAR(enc, ptr, end)) {
1343     switch (BYTE_TYPE(enc, ptr)) {
1344 #  define LEAD_CASE(n)                                                         \
1345   case BT_LEAD##n:                                                             \
1346     ptr += n; /* NOTE: The encoding has already been validated. */             \
1347     break;
1348       LEAD_CASE(2)
1349       LEAD_CASE(3)
1350       LEAD_CASE(4)
1351 #  undef LEAD_CASE
1352     case BT_AMP:
1353       if (ptr == start)
1354         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1355       *nextTokPtr = ptr;
1356       return XML_TOK_DATA_CHARS;
1357     case BT_PERCNT:
1358       if (ptr == start) {
1359         int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1360         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1361       }
1362       *nextTokPtr = ptr;
1363       return XML_TOK_DATA_CHARS;
1364     case BT_LF:
1365       if (ptr == start) {
1366         *nextTokPtr = ptr + MINBPC(enc);
1367         return XML_TOK_DATA_NEWLINE;
1368       }
1369       *nextTokPtr = ptr;
1370       return XML_TOK_DATA_CHARS;
1371     case BT_CR:
1372       if (ptr == start) {
1373         ptr += MINBPC(enc);
1374         if (! HAS_CHAR(enc, ptr, end))
1375           return XML_TOK_TRAILING_CR;
1376         if (BYTE_TYPE(enc, ptr) == BT_LF)
1377           ptr += MINBPC(enc);
1378         *nextTokPtr = ptr;
1379         return XML_TOK_DATA_NEWLINE;
1380       }
1381       *nextTokPtr = ptr;
1382       return XML_TOK_DATA_CHARS;
1383     default:
1384       ptr += MINBPC(enc);
1385       break;
1386     }
1387   }
1388   *nextTokPtr = ptr;
1389   return XML_TOK_DATA_CHARS;
1390 }
1391 
1392 #  ifdef XML_DTD
1393 
1394 static int PTRCALL
1395 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1396                          const char **nextTokPtr) {
1397   int level = 0;
1398   if (MINBPC(enc) > 1) {
1399     size_t n = end - ptr;
1400     if (n & (MINBPC(enc) - 1)) {
1401       n &= ~(MINBPC(enc) - 1);
1402       end = ptr + n;
1403     }
1404   }
1405   while (HAS_CHAR(enc, ptr, end)) {
1406     switch (BYTE_TYPE(enc, ptr)) {
1407       INVALID_CASES(ptr, nextTokPtr)
1408     case BT_LT:
1409       ptr += MINBPC(enc);
1410       REQUIRE_CHAR(enc, ptr, end);
1411       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1412         ptr += MINBPC(enc);
1413         REQUIRE_CHAR(enc, ptr, end);
1414         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1415           ++level;
1416           ptr += MINBPC(enc);
1417         }
1418       }
1419       break;
1420     case BT_RSQB:
1421       ptr += MINBPC(enc);
1422       REQUIRE_CHAR(enc, ptr, end);
1423       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1424         ptr += MINBPC(enc);
1425         REQUIRE_CHAR(enc, ptr, end);
1426         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1427           ptr += MINBPC(enc);
1428           if (level == 0) {
1429             *nextTokPtr = ptr;
1430             return XML_TOK_IGNORE_SECT;
1431           }
1432           --level;
1433         }
1434       }
1435       break;
1436     default:
1437       ptr += MINBPC(enc);
1438       break;
1439     }
1440   }
1441   return XML_TOK_PARTIAL;
1442 }
1443 
1444 #  endif /* XML_DTD */
1445 
1446 static int PTRCALL
1447 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1448                    const char **badPtr) {
1449   ptr += MINBPC(enc);
1450   end -= MINBPC(enc);
1451   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1452     switch (BYTE_TYPE(enc, ptr)) {
1453     case BT_DIGIT:
1454     case BT_HEX:
1455     case BT_MINUS:
1456     case BT_APOS:
1457     case BT_LPAR:
1458     case BT_RPAR:
1459     case BT_PLUS:
1460     case BT_COMMA:
1461     case BT_SOL:
1462     case BT_EQUALS:
1463     case BT_QUEST:
1464     case BT_CR:
1465     case BT_LF:
1466     case BT_SEMI:
1467     case BT_EXCL:
1468     case BT_AST:
1469     case BT_PERCNT:
1470     case BT_NUM:
1471 #  ifdef XML_NS
1472     case BT_COLON:
1473 #  endif
1474       break;
1475     case BT_S:
1476       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1477         *badPtr = ptr;
1478         return 0;
1479       }
1480       break;
1481     case BT_NAME:
1482     case BT_NMSTRT:
1483       if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1484         break;
1485       /* fall through */
1486     default:
1487       switch (BYTE_TO_ASCII(enc, ptr)) {
1488       case 0x24: /* $ */
1489       case 0x40: /* @ */
1490         break;
1491       default:
1492         *badPtr = ptr;
1493         return 0;
1494       }
1495       break;
1496     }
1497   }
1498   return 1;
1499 }
1500 
1501 /* This must only be called for a well-formed start-tag or empty
1502    element tag.  Returns the number of attributes.  Pointers to the
1503    first attsMax attributes are stored in atts.
1504 */
1505 
1506 static int PTRCALL
1507 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1508                 ATTRIBUTE *atts) {
1509   enum { other, inName, inValue } state = inName;
1510   int nAtts = 0;
1511   int open = 0; /* defined when state == inValue;
1512                    initialization just to shut up compilers */
1513 
1514   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1515     switch (BYTE_TYPE(enc, ptr)) {
1516 #  define START_NAME                                                           \
1517     if (state == other) {                                                      \
1518       if (nAtts < attsMax) {                                                   \
1519         atts[nAtts].name = ptr;                                                \
1520         atts[nAtts].normalized = 1;                                            \
1521       }                                                                        \
1522       state = inName;                                                          \
1523     }
1524 #  define LEAD_CASE(n)                                                         \
1525   case BT_LEAD##n: /* NOTE: The encoding has already been validated. */        \
1526     START_NAME ptr += (n - MINBPC(enc));                                       \
1527     break;
1528       LEAD_CASE(2)
1529       LEAD_CASE(3)
1530       LEAD_CASE(4)
1531 #  undef LEAD_CASE
1532     case BT_NONASCII:
1533     case BT_NMSTRT:
1534     case BT_HEX:
1535       START_NAME
1536       break;
1537 #  undef START_NAME
1538     case BT_QUOT:
1539       if (state != inValue) {
1540         if (nAtts < attsMax)
1541           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1542         state = inValue;
1543         open = BT_QUOT;
1544       } else if (open == BT_QUOT) {
1545         state = other;
1546         if (nAtts < attsMax)
1547           atts[nAtts].valueEnd = ptr;
1548         nAtts++;
1549       }
1550       break;
1551     case BT_APOS:
1552       if (state != inValue) {
1553         if (nAtts < attsMax)
1554           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1555         state = inValue;
1556         open = BT_APOS;
1557       } else if (open == BT_APOS) {
1558         state = other;
1559         if (nAtts < attsMax)
1560           atts[nAtts].valueEnd = ptr;
1561         nAtts++;
1562       }
1563       break;
1564     case BT_AMP:
1565       if (nAtts < attsMax)
1566         atts[nAtts].normalized = 0;
1567       break;
1568     case BT_S:
1569       if (state == inName)
1570         state = other;
1571       else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1572                && (ptr == atts[nAtts].valuePtr
1573                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1574                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1575                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1576         atts[nAtts].normalized = 0;
1577       break;
1578     case BT_CR:
1579     case BT_LF:
1580       /* This case ensures that the first attribute name is counted
1581          Apart from that we could just change state on the quote. */
1582       if (state == inName)
1583         state = other;
1584       else if (state == inValue && nAtts < attsMax)
1585         atts[nAtts].normalized = 0;
1586       break;
1587     case BT_GT:
1588     case BT_SOL:
1589       if (state != inValue)
1590         return nAtts;
1591       break;
1592     default:
1593       break;
1594     }
1595   }
1596   /* not reached */
1597 }
1598 
1599 static int PTRFASTCALL
1600 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1601   int result = 0;
1602   /* skip &# */
1603   UNUSED_P(enc);
1604   ptr += 2 * MINBPC(enc);
1605   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1606     for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1607          ptr += MINBPC(enc)) {
1608       int c = BYTE_TO_ASCII(enc, ptr);
1609       switch (c) {
1610       case ASCII_0:
1611       case ASCII_1:
1612       case ASCII_2:
1613       case ASCII_3:
1614       case ASCII_4:
1615       case ASCII_5:
1616       case ASCII_6:
1617       case ASCII_7:
1618       case ASCII_8:
1619       case ASCII_9:
1620         result <<= 4;
1621         result |= (c - ASCII_0);
1622         break;
1623       case ASCII_A:
1624       case ASCII_B:
1625       case ASCII_C:
1626       case ASCII_D:
1627       case ASCII_E:
1628       case ASCII_F:
1629         result <<= 4;
1630         result += 10 + (c - ASCII_A);
1631         break;
1632       case ASCII_a:
1633       case ASCII_b:
1634       case ASCII_c:
1635       case ASCII_d:
1636       case ASCII_e:
1637       case ASCII_f:
1638         result <<= 4;
1639         result += 10 + (c - ASCII_a);
1640         break;
1641       }
1642       if (result >= 0x110000)
1643         return -1;
1644     }
1645   } else {
1646     for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1647       int c = BYTE_TO_ASCII(enc, ptr);
1648       result *= 10;
1649       result += (c - ASCII_0);
1650       if (result >= 0x110000)
1651         return -1;
1652     }
1653   }
1654   return checkCharRefNumber(result);
1655 }
1656 
1657 static int PTRCALL
1658 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1659                              const char *end) {
1660   UNUSED_P(enc);
1661   switch ((end - ptr) / MINBPC(enc)) {
1662   case 2:
1663     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1664       switch (BYTE_TO_ASCII(enc, ptr)) {
1665       case ASCII_l:
1666         return ASCII_LT;
1667       case ASCII_g:
1668         return ASCII_GT;
1669       }
1670     }
1671     break;
1672   case 3:
1673     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1674       ptr += MINBPC(enc);
1675       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1676         ptr += MINBPC(enc);
1677         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1678           return ASCII_AMP;
1679       }
1680     }
1681     break;
1682   case 4:
1683     switch (BYTE_TO_ASCII(enc, ptr)) {
1684     case ASCII_q:
1685       ptr += MINBPC(enc);
1686       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1687         ptr += MINBPC(enc);
1688         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1689           ptr += MINBPC(enc);
1690           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1691             return ASCII_QUOT;
1692         }
1693       }
1694       break;
1695     case ASCII_a:
1696       ptr += MINBPC(enc);
1697       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1698         ptr += MINBPC(enc);
1699         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1700           ptr += MINBPC(enc);
1701           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1702             return ASCII_APOS;
1703         }
1704       }
1705       break;
1706     }
1707   }
1708   return 0;
1709 }
1710 
1711 static int PTRCALL
1712 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1713                          const char *end1, const char *ptr2) {
1714   UNUSED_P(enc);
1715   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1716     if (end1 - ptr1 < MINBPC(enc)) {
1717       /* This line cannot be executed.  The incoming data has already
1718        * been tokenized once, so incomplete characters like this have
1719        * already been eliminated from the input.  Retaining the
1720        * paranoia check is still valuable, however.
1721        */
1722       return 0; /* LCOV_EXCL_LINE */
1723     }
1724     if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1725       return 0;
1726   }
1727   return ptr1 == end1;
1728 }
1729 
1730 static int PTRFASTCALL
1731 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1732   const char *start = ptr;
1733   for (;;) {
1734     switch (BYTE_TYPE(enc, ptr)) {
1735 #  define LEAD_CASE(n)                                                         \
1736   case BT_LEAD##n:                                                             \
1737     ptr += n; /* NOTE: The encoding has already been validated. */             \
1738     break;
1739       LEAD_CASE(2)
1740       LEAD_CASE(3)
1741       LEAD_CASE(4)
1742 #  undef LEAD_CASE
1743     case BT_NONASCII:
1744     case BT_NMSTRT:
1745 #  ifdef XML_NS
1746     case BT_COLON:
1747 #  endif
1748     case BT_HEX:
1749     case BT_DIGIT:
1750     case BT_NAME:
1751     case BT_MINUS:
1752       ptr += MINBPC(enc);
1753       break;
1754     default:
1755       return (int)(ptr - start);
1756     }
1757   }
1758 }
1759 
1760 static const char *PTRFASTCALL
1761 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1762   for (;;) {
1763     switch (BYTE_TYPE(enc, ptr)) {
1764     case BT_LF:
1765     case BT_CR:
1766     case BT_S:
1767       ptr += MINBPC(enc);
1768       break;
1769     default:
1770       return ptr;
1771     }
1772   }
1773 }
1774 
1775 static void PTRCALL
1776 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1777                        POSITION *pos) {
1778   while (HAS_CHAR(enc, ptr, end)) {
1779     switch (BYTE_TYPE(enc, ptr)) {
1780 #  define LEAD_CASE(n)                                                         \
1781   case BT_LEAD##n:                                                             \
1782     ptr += n; /* NOTE: The encoding has already been validated. */             \
1783     pos->columnNumber++;                                                       \
1784     break;
1785       LEAD_CASE(2)
1786       LEAD_CASE(3)
1787       LEAD_CASE(4)
1788 #  undef LEAD_CASE
1789     case BT_LF:
1790       pos->columnNumber = 0;
1791       pos->lineNumber++;
1792       ptr += MINBPC(enc);
1793       break;
1794     case BT_CR:
1795       pos->lineNumber++;
1796       ptr += MINBPC(enc);
1797       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1798         ptr += MINBPC(enc);
1799       pos->columnNumber = 0;
1800       break;
1801     default:
1802       ptr += MINBPC(enc);
1803       pos->columnNumber++;
1804       break;
1805     }
1806   }
1807 }
1808 
1809 #  undef DO_LEAD_CASE
1810 #  undef MULTIBYTE_CASES
1811 #  undef INVALID_CASES
1812 #  undef CHECK_NAME_CASE
1813 #  undef CHECK_NAME_CASES
1814 #  undef CHECK_NMSTRT_CASE
1815 #  undef CHECK_NMSTRT_CASES
1816 
1817 #endif /* XML_TOK_IMPL_C */
1818