xref: /freebsd/contrib/expat/lib/xmltok_impl.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2    See the file COPYING for copying permission.
3 */
4 
5 /* This file is included! */
6 #ifdef XML_TOK_IMPL_C
7 
/* Fallback for including files that do not define IS_INVALID_CHAR:
   no multi-byte sequence is ever reported as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Switch case for an n-byte lead byte (BT_LEAD2, BT_LEAD3, BT_LEAD4).
   Relies on `enc` and `end` being in scope at the expansion site.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, reports
   XML_TOK_INVALID at ptr when IS_INVALID_CHAR rejects the sequence, and
   otherwise advances ptr by n bytes and leaves the switch.
   Arguments are parenthesized uniformly so that expression arguments
   expand safely. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - (ptr) < (n)) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += (n); \
      break;

/* Switch cases shared by every scanner that must reject bad bytes:
   the three multi-byte lead cases above, plus BT_NONXML, BT_MALFORM and
   BT_TRAIL, which always report XML_TOK_INVALID at ptr. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
32 
/* Switch case for an n-byte lead byte inside a name.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, reports
   XML_TOK_INVALID at ptr when the character is not a name character,
   and otherwise advances ptr by n bytes and leaves the switch. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;

/* Switch cases that consume one name character.
   BT_NONASCII is validated with IS_NAME_CHAR_MINBPC and then shares the
   advance with the single-unit name byte types; multi-byte characters
   are handled by CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60 
/* Switch case for an n-byte lead byte at the start of a name.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, reports
   XML_TOK_INVALID at ptr when the character is not a name-start
   character, and otherwise advances ptr by n bytes and leaves the
   switch. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;

/* Switch cases that consume one name-start character.
   BT_NONASCII is validated with IS_NMSTRT_CHAR_MINBPC and then shares
   the advance with BT_NMSTRT and BT_HEX; multi-byte characters are
   handled by CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85 
/* Name decoration applied to every function defined below.  When the
   including file has not supplied its own PREFIX, names are used as
   written. */
#if !defined(PREFIX)
#define PREFIX(name) name
#endif
89 
90 /* ptr points to character following "<!-" */
91 
92 static int PTRCALL
93 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
94                     const char *end, const char **nextTokPtr)
95 {
96   if (ptr != end) {
97     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
98       *nextTokPtr = ptr;
99       return XML_TOK_INVALID;
100     }
101     ptr += MINBPC(enc);
102     while (ptr != end) {
103       switch (BYTE_TYPE(enc, ptr)) {
104       INVALID_CASES(ptr, nextTokPtr)
105       case BT_MINUS:
106         if ((ptr += MINBPC(enc)) == end)
107           return XML_TOK_PARTIAL;
108         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
109           if ((ptr += MINBPC(enc)) == end)
110             return XML_TOK_PARTIAL;
111           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
112             *nextTokPtr = ptr;
113             return XML_TOK_INVALID;
114           }
115           *nextTokPtr = ptr + MINBPC(enc);
116           return XML_TOK_COMMENT;
117         }
118         break;
119       default:
120         ptr += MINBPC(enc);
121         break;
122       }
123     }
124   }
125   return XML_TOK_PARTIAL;
126 }
127 
128 /* ptr points to character following "<!" */
129 
130 static int PTRCALL
131 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
132                  const char *end, const char **nextTokPtr)
133 {
134   if (ptr == end)
135     return XML_TOK_PARTIAL;
136   switch (BYTE_TYPE(enc, ptr)) {
137   case BT_MINUS:
138     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
139   case BT_LSQB:
140     *nextTokPtr = ptr + MINBPC(enc);
141     return XML_TOK_COND_SECT_OPEN;
142   case BT_NMSTRT:
143   case BT_HEX:
144     ptr += MINBPC(enc);
145     break;
146   default:
147     *nextTokPtr = ptr;
148     return XML_TOK_INVALID;
149   }
150   while (ptr != end) {
151     switch (BYTE_TYPE(enc, ptr)) {
152     case BT_PERCNT:
153       if (ptr + MINBPC(enc) == end)
154         return XML_TOK_PARTIAL;
155       /* don't allow <!ENTITY% foo "whatever"> */
156       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
157       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
158         *nextTokPtr = ptr;
159         return XML_TOK_INVALID;
160       }
161       /* fall through */
162     case BT_S: case BT_CR: case BT_LF:
163       *nextTokPtr = ptr;
164       return XML_TOK_DECL_OPEN;
165     case BT_NMSTRT:
166     case BT_HEX:
167       ptr += MINBPC(enc);
168       break;
169     default:
170       *nextTokPtr = ptr;
171       return XML_TOK_INVALID;
172     }
173   }
174   return XML_TOK_PARTIAL;
175 }
176 
177 static int PTRCALL
178 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
179                       const char *end, int *tokPtr)
180 {
181   int upper = 0;
182   *tokPtr = XML_TOK_PI;
183   if (end - ptr != MINBPC(enc)*3)
184     return 1;
185   switch (BYTE_TO_ASCII(enc, ptr)) {
186   case ASCII_x:
187     break;
188   case ASCII_X:
189     upper = 1;
190     break;
191   default:
192     return 1;
193   }
194   ptr += MINBPC(enc);
195   switch (BYTE_TO_ASCII(enc, ptr)) {
196   case ASCII_m:
197     break;
198   case ASCII_M:
199     upper = 1;
200     break;
201   default:
202     return 1;
203   }
204   ptr += MINBPC(enc);
205   switch (BYTE_TO_ASCII(enc, ptr)) {
206   case ASCII_l:
207     break;
208   case ASCII_L:
209     upper = 1;
210     break;
211   default:
212     return 1;
213   }
214   if (upper)
215     return 0;
216   *tokPtr = XML_TOK_XML_DECL;
217   return 1;
218 }
219 
220 /* ptr points to character following "<?" */
221 
222 static int PTRCALL
223 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
224                const char *end, const char **nextTokPtr)
225 {
226   int tok;
227   const char *target = ptr;
228   if (ptr == end)
229     return XML_TOK_PARTIAL;
230   switch (BYTE_TYPE(enc, ptr)) {
231   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
232   default:
233     *nextTokPtr = ptr;
234     return XML_TOK_INVALID;
235   }
236   while (ptr != end) {
237     switch (BYTE_TYPE(enc, ptr)) {
238     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
239     case BT_S: case BT_CR: case BT_LF:
240       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
241         *nextTokPtr = ptr;
242         return XML_TOK_INVALID;
243       }
244       ptr += MINBPC(enc);
245       while (ptr != end) {
246         switch (BYTE_TYPE(enc, ptr)) {
247         INVALID_CASES(ptr, nextTokPtr)
248         case BT_QUEST:
249           ptr += MINBPC(enc);
250           if (ptr == end)
251             return XML_TOK_PARTIAL;
252           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
253             *nextTokPtr = ptr + MINBPC(enc);
254             return tok;
255           }
256           break;
257         default:
258           ptr += MINBPC(enc);
259           break;
260         }
261       }
262       return XML_TOK_PARTIAL;
263     case BT_QUEST:
264       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
265         *nextTokPtr = ptr;
266         return XML_TOK_INVALID;
267       }
268       ptr += MINBPC(enc);
269       if (ptr == end)
270         return XML_TOK_PARTIAL;
271       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
272         *nextTokPtr = ptr + MINBPC(enc);
273         return tok;
274       }
275       /* fall through */
276     default:
277       *nextTokPtr = ptr;
278       return XML_TOK_INVALID;
279     }
280   }
281   return XML_TOK_PARTIAL;
282 }
283 
284 static int PTRCALL
285 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
286                          const char *end, const char **nextTokPtr)
287 {
288   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
289                                      ASCII_T, ASCII_A, ASCII_LSQB };
290   int i;
291   /* CDATA[ */
292   if (end - ptr < 6 * MINBPC(enc))
293     return XML_TOK_PARTIAL;
294   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
295     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
296       *nextTokPtr = ptr;
297       return XML_TOK_INVALID;
298     }
299   }
300   *nextTokPtr = ptr;
301   return XML_TOK_CDATA_SECT_OPEN;
302 }
303 
304 static int PTRCALL
305 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
306                         const char *end, const char **nextTokPtr)
307 {
308   if (ptr == end)
309     return XML_TOK_NONE;
310   if (MINBPC(enc) > 1) {
311     size_t n = end - ptr;
312     if (n & (MINBPC(enc) - 1)) {
313       n &= ~(MINBPC(enc) - 1);
314       if (n == 0)
315         return XML_TOK_PARTIAL;
316       end = ptr + n;
317     }
318   }
319   switch (BYTE_TYPE(enc, ptr)) {
320   case BT_RSQB:
321     ptr += MINBPC(enc);
322     if (ptr == end)
323       return XML_TOK_PARTIAL;
324     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
325       break;
326     ptr += MINBPC(enc);
327     if (ptr == end)
328       return XML_TOK_PARTIAL;
329     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
330       ptr -= MINBPC(enc);
331       break;
332     }
333     *nextTokPtr = ptr + MINBPC(enc);
334     return XML_TOK_CDATA_SECT_CLOSE;
335   case BT_CR:
336     ptr += MINBPC(enc);
337     if (ptr == end)
338       return XML_TOK_PARTIAL;
339     if (BYTE_TYPE(enc, ptr) == BT_LF)
340       ptr += MINBPC(enc);
341     *nextTokPtr = ptr;
342     return XML_TOK_DATA_NEWLINE;
343   case BT_LF:
344     *nextTokPtr = ptr + MINBPC(enc);
345     return XML_TOK_DATA_NEWLINE;
346   INVALID_CASES(ptr, nextTokPtr)
347   default:
348     ptr += MINBPC(enc);
349     break;
350   }
351   while (ptr != end) {
352     switch (BYTE_TYPE(enc, ptr)) {
353 #define LEAD_CASE(n) \
354     case BT_LEAD ## n: \
355       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
356         *nextTokPtr = ptr; \
357         return XML_TOK_DATA_CHARS; \
358       } \
359       ptr += n; \
360       break;
361     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
362 #undef LEAD_CASE
363     case BT_NONXML:
364     case BT_MALFORM:
365     case BT_TRAIL:
366     case BT_CR:
367     case BT_LF:
368     case BT_RSQB:
369       *nextTokPtr = ptr;
370       return XML_TOK_DATA_CHARS;
371     default:
372       ptr += MINBPC(enc);
373       break;
374     }
375   }
376   *nextTokPtr = ptr;
377   return XML_TOK_DATA_CHARS;
378 }
379 
380 /* ptr points to character following "</" */
381 
382 static int PTRCALL
383 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
384                    const char *end, const char **nextTokPtr)
385 {
386   if (ptr == end)
387     return XML_TOK_PARTIAL;
388   switch (BYTE_TYPE(enc, ptr)) {
389   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
390   default:
391     *nextTokPtr = ptr;
392     return XML_TOK_INVALID;
393   }
394   while (ptr != end) {
395     switch (BYTE_TYPE(enc, ptr)) {
396     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
397     case BT_S: case BT_CR: case BT_LF:
398       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
399         switch (BYTE_TYPE(enc, ptr)) {
400         case BT_S: case BT_CR: case BT_LF:
401           break;
402         case BT_GT:
403           *nextTokPtr = ptr + MINBPC(enc);
404           return XML_TOK_END_TAG;
405         default:
406           *nextTokPtr = ptr;
407           return XML_TOK_INVALID;
408         }
409       }
410       return XML_TOK_PARTIAL;
411 #ifdef XML_NS
412     case BT_COLON:
413       /* no need to check qname syntax here,
414          since end-tag must match exactly */
415       ptr += MINBPC(enc);
416       break;
417 #endif
418     case BT_GT:
419       *nextTokPtr = ptr + MINBPC(enc);
420       return XML_TOK_END_TAG;
421     default:
422       *nextTokPtr = ptr;
423       return XML_TOK_INVALID;
424     }
425   }
426   return XML_TOK_PARTIAL;
427 }
428 
429 /* ptr points to character following "&#X" */
430 
431 static int PTRCALL
432 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
433                        const char *end, const char **nextTokPtr)
434 {
435   if (ptr != end) {
436     switch (BYTE_TYPE(enc, ptr)) {
437     case BT_DIGIT:
438     case BT_HEX:
439       break;
440     default:
441       *nextTokPtr = ptr;
442       return XML_TOK_INVALID;
443     }
444     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
445       switch (BYTE_TYPE(enc, ptr)) {
446       case BT_DIGIT:
447       case BT_HEX:
448         break;
449       case BT_SEMI:
450         *nextTokPtr = ptr + MINBPC(enc);
451         return XML_TOK_CHAR_REF;
452       default:
453         *nextTokPtr = ptr;
454         return XML_TOK_INVALID;
455       }
456     }
457   }
458   return XML_TOK_PARTIAL;
459 }
460 
461 /* ptr points to character following "&#" */
462 
463 static int PTRCALL
464 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
465                     const char *end, const char **nextTokPtr)
466 {
467   if (ptr != end) {
468     if (CHAR_MATCHES(enc, ptr, ASCII_x))
469       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
470     switch (BYTE_TYPE(enc, ptr)) {
471     case BT_DIGIT:
472       break;
473     default:
474       *nextTokPtr = ptr;
475       return XML_TOK_INVALID;
476     }
477     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
478       switch (BYTE_TYPE(enc, ptr)) {
479       case BT_DIGIT:
480         break;
481       case BT_SEMI:
482         *nextTokPtr = ptr + MINBPC(enc);
483         return XML_TOK_CHAR_REF;
484       default:
485         *nextTokPtr = ptr;
486         return XML_TOK_INVALID;
487       }
488     }
489   }
490   return XML_TOK_PARTIAL;
491 }
492 
493 /* ptr points to character following "&" */
494 
495 static int PTRCALL
496 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
497                 const char **nextTokPtr)
498 {
499   if (ptr == end)
500     return XML_TOK_PARTIAL;
501   switch (BYTE_TYPE(enc, ptr)) {
502   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
503   case BT_NUM:
504     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
505   default:
506     *nextTokPtr = ptr;
507     return XML_TOK_INVALID;
508   }
509   while (ptr != end) {
510     switch (BYTE_TYPE(enc, ptr)) {
511     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
512     case BT_SEMI:
513       *nextTokPtr = ptr + MINBPC(enc);
514       return XML_TOK_ENTITY_REF;
515     default:
516       *nextTokPtr = ptr;
517       return XML_TOK_INVALID;
518     }
519   }
520   return XML_TOK_PARTIAL;
521 }
522 
523 /* ptr points to character following first character of attribute name */
524 
525 static int PTRCALL
526 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
527                  const char **nextTokPtr)
528 {
529 #ifdef XML_NS
530   int hadColon = 0;
531 #endif
532   while (ptr != end) {
533     switch (BYTE_TYPE(enc, ptr)) {
534     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
535 #ifdef XML_NS
536     case BT_COLON:
537       if (hadColon) {
538         *nextTokPtr = ptr;
539         return XML_TOK_INVALID;
540       }
541       hadColon = 1;
542       ptr += MINBPC(enc);
543       if (ptr == end)
544         return XML_TOK_PARTIAL;
545       switch (BYTE_TYPE(enc, ptr)) {
546       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
547       default:
548         *nextTokPtr = ptr;
549         return XML_TOK_INVALID;
550       }
551       break;
552 #endif
553     case BT_S: case BT_CR: case BT_LF:
554       for (;;) {
555         int t;
556 
557         ptr += MINBPC(enc);
558         if (ptr == end)
559           return XML_TOK_PARTIAL;
560         t = BYTE_TYPE(enc, ptr);
561         if (t == BT_EQUALS)
562           break;
563         switch (t) {
564         case BT_S:
565         case BT_LF:
566         case BT_CR:
567           break;
568         default:
569           *nextTokPtr = ptr;
570           return XML_TOK_INVALID;
571         }
572       }
573     /* fall through */
574     case BT_EQUALS:
575       {
576         int open;
577 #ifdef XML_NS
578         hadColon = 0;
579 #endif
580         for (;;) {
581           ptr += MINBPC(enc);
582           if (ptr == end)
583             return XML_TOK_PARTIAL;
584           open = BYTE_TYPE(enc, ptr);
585           if (open == BT_QUOT || open == BT_APOS)
586             break;
587           switch (open) {
588           case BT_S:
589           case BT_LF:
590           case BT_CR:
591             break;
592           default:
593             *nextTokPtr = ptr;
594             return XML_TOK_INVALID;
595           }
596         }
597         ptr += MINBPC(enc);
598         /* in attribute value */
599         for (;;) {
600           int t;
601           if (ptr == end)
602             return XML_TOK_PARTIAL;
603           t = BYTE_TYPE(enc, ptr);
604           if (t == open)
605             break;
606           switch (t) {
607           INVALID_CASES(ptr, nextTokPtr)
608           case BT_AMP:
609             {
610               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
611               if (tok <= 0) {
612                 if (tok == XML_TOK_INVALID)
613                   *nextTokPtr = ptr;
614                 return tok;
615               }
616               break;
617             }
618           case BT_LT:
619             *nextTokPtr = ptr;
620             return XML_TOK_INVALID;
621           default:
622             ptr += MINBPC(enc);
623             break;
624           }
625         }
626         ptr += MINBPC(enc);
627         if (ptr == end)
628           return XML_TOK_PARTIAL;
629         switch (BYTE_TYPE(enc, ptr)) {
630         case BT_S:
631         case BT_CR:
632         case BT_LF:
633           break;
634         case BT_SOL:
635           goto sol;
636         case BT_GT:
637           goto gt;
638         default:
639           *nextTokPtr = ptr;
640           return XML_TOK_INVALID;
641         }
642         /* ptr points to closing quote */
643         for (;;) {
644           ptr += MINBPC(enc);
645           if (ptr == end)
646             return XML_TOK_PARTIAL;
647           switch (BYTE_TYPE(enc, ptr)) {
648           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
649           case BT_S: case BT_CR: case BT_LF:
650             continue;
651           case BT_GT:
652           gt:
653             *nextTokPtr = ptr + MINBPC(enc);
654             return XML_TOK_START_TAG_WITH_ATTS;
655           case BT_SOL:
656           sol:
657             ptr += MINBPC(enc);
658             if (ptr == end)
659               return XML_TOK_PARTIAL;
660             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
661               *nextTokPtr = ptr;
662               return XML_TOK_INVALID;
663             }
664             *nextTokPtr = ptr + MINBPC(enc);
665             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
666           default:
667             *nextTokPtr = ptr;
668             return XML_TOK_INVALID;
669           }
670           break;
671         }
672         break;
673       }
674     default:
675       *nextTokPtr = ptr;
676       return XML_TOK_INVALID;
677     }
678   }
679   return XML_TOK_PARTIAL;
680 }
681 
682 /* ptr points to character following "<" */
683 
684 static int PTRCALL
685 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686                const char **nextTokPtr)
687 {
688 #ifdef XML_NS
689   int hadColon;
690 #endif
691   if (ptr == end)
692     return XML_TOK_PARTIAL;
693   switch (BYTE_TYPE(enc, ptr)) {
694   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
695   case BT_EXCL:
696     if ((ptr += MINBPC(enc)) == end)
697       return XML_TOK_PARTIAL;
698     switch (BYTE_TYPE(enc, ptr)) {
699     case BT_MINUS:
700       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
701     case BT_LSQB:
702       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
703                                       end, nextTokPtr);
704     }
705     *nextTokPtr = ptr;
706     return XML_TOK_INVALID;
707   case BT_QUEST:
708     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
709   case BT_SOL:
710     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
711   default:
712     *nextTokPtr = ptr;
713     return XML_TOK_INVALID;
714   }
715 #ifdef XML_NS
716   hadColon = 0;
717 #endif
718   /* we have a start-tag */
719   while (ptr != end) {
720     switch (BYTE_TYPE(enc, ptr)) {
721     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
722 #ifdef XML_NS
723     case BT_COLON:
724       if (hadColon) {
725         *nextTokPtr = ptr;
726         return XML_TOK_INVALID;
727       }
728       hadColon = 1;
729       ptr += MINBPC(enc);
730       if (ptr == end)
731         return XML_TOK_PARTIAL;
732       switch (BYTE_TYPE(enc, ptr)) {
733       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
734       default:
735         *nextTokPtr = ptr;
736         return XML_TOK_INVALID;
737       }
738       break;
739 #endif
740     case BT_S: case BT_CR: case BT_LF:
741       {
742         ptr += MINBPC(enc);
743         while (ptr != end) {
744           switch (BYTE_TYPE(enc, ptr)) {
745           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
746           case BT_GT:
747             goto gt;
748           case BT_SOL:
749             goto sol;
750           case BT_S: case BT_CR: case BT_LF:
751             ptr += MINBPC(enc);
752             continue;
753           default:
754             *nextTokPtr = ptr;
755             return XML_TOK_INVALID;
756           }
757           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
758         }
759         return XML_TOK_PARTIAL;
760       }
761     case BT_GT:
762     gt:
763       *nextTokPtr = ptr + MINBPC(enc);
764       return XML_TOK_START_TAG_NO_ATTS;
765     case BT_SOL:
766     sol:
767       ptr += MINBPC(enc);
768       if (ptr == end)
769         return XML_TOK_PARTIAL;
770       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
771         *nextTokPtr = ptr;
772         return XML_TOK_INVALID;
773       }
774       *nextTokPtr = ptr + MINBPC(enc);
775       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
776     default:
777       *nextTokPtr = ptr;
778       return XML_TOK_INVALID;
779     }
780   }
781   return XML_TOK_PARTIAL;
782 }
783 
784 static int PTRCALL
785 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
786                    const char **nextTokPtr)
787 {
788   if (ptr == end)
789     return XML_TOK_NONE;
790   if (MINBPC(enc) > 1) {
791     size_t n = end - ptr;
792     if (n & (MINBPC(enc) - 1)) {
793       n &= ~(MINBPC(enc) - 1);
794       if (n == 0)
795         return XML_TOK_PARTIAL;
796       end = ptr + n;
797     }
798   }
799   switch (BYTE_TYPE(enc, ptr)) {
800   case BT_LT:
801     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
802   case BT_AMP:
803     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
804   case BT_CR:
805     ptr += MINBPC(enc);
806     if (ptr == end)
807       return XML_TOK_TRAILING_CR;
808     if (BYTE_TYPE(enc, ptr) == BT_LF)
809       ptr += MINBPC(enc);
810     *nextTokPtr = ptr;
811     return XML_TOK_DATA_NEWLINE;
812   case BT_LF:
813     *nextTokPtr = ptr + MINBPC(enc);
814     return XML_TOK_DATA_NEWLINE;
815   case BT_RSQB:
816     ptr += MINBPC(enc);
817     if (ptr == end)
818       return XML_TOK_TRAILING_RSQB;
819     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
820       break;
821     ptr += MINBPC(enc);
822     if (ptr == end)
823       return XML_TOK_TRAILING_RSQB;
824     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
825       ptr -= MINBPC(enc);
826       break;
827     }
828     *nextTokPtr = ptr;
829     return XML_TOK_INVALID;
830   INVALID_CASES(ptr, nextTokPtr)
831   default:
832     ptr += MINBPC(enc);
833     break;
834   }
835   while (ptr != end) {
836     switch (BYTE_TYPE(enc, ptr)) {
837 #define LEAD_CASE(n) \
838     case BT_LEAD ## n: \
839       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
840         *nextTokPtr = ptr; \
841         return XML_TOK_DATA_CHARS; \
842       } \
843       ptr += n; \
844       break;
845     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
846 #undef LEAD_CASE
847     case BT_RSQB:
848       if (ptr + MINBPC(enc) != end) {
849          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
850            ptr += MINBPC(enc);
851            break;
852          }
853          if (ptr + 2*MINBPC(enc) != end) {
854            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
855              ptr += MINBPC(enc);
856              break;
857            }
858            *nextTokPtr = ptr + 2*MINBPC(enc);
859            return XML_TOK_INVALID;
860          }
861       }
862       /* fall through */
863     case BT_AMP:
864     case BT_LT:
865     case BT_NONXML:
866     case BT_MALFORM:
867     case BT_TRAIL:
868     case BT_CR:
869     case BT_LF:
870       *nextTokPtr = ptr;
871       return XML_TOK_DATA_CHARS;
872     default:
873       ptr += MINBPC(enc);
874       break;
875     }
876   }
877   *nextTokPtr = ptr;
878   return XML_TOK_DATA_CHARS;
879 }
880 
881 /* ptr points to character following "%" */
882 
883 static int PTRCALL
884 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
885                     const char **nextTokPtr)
886 {
887   if (ptr == end)
888     return -XML_TOK_PERCENT;
889   switch (BYTE_TYPE(enc, ptr)) {
890   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
891   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
892     *nextTokPtr = ptr;
893     return XML_TOK_PERCENT;
894   default:
895     *nextTokPtr = ptr;
896     return XML_TOK_INVALID;
897   }
898   while (ptr != end) {
899     switch (BYTE_TYPE(enc, ptr)) {
900     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
901     case BT_SEMI:
902       *nextTokPtr = ptr + MINBPC(enc);
903       return XML_TOK_PARAM_ENTITY_REF;
904     default:
905       *nextTokPtr = ptr;
906       return XML_TOK_INVALID;
907     }
908   }
909   return XML_TOK_PARTIAL;
910 }
911 
912 static int PTRCALL
913 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
914                       const char **nextTokPtr)
915 {
916   if (ptr == end)
917     return XML_TOK_PARTIAL;
918   switch (BYTE_TYPE(enc, ptr)) {
919   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
920   default:
921     *nextTokPtr = ptr;
922     return XML_TOK_INVALID;
923   }
924   while (ptr != end) {
925     switch (BYTE_TYPE(enc, ptr)) {
926     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
927     case BT_CR: case BT_LF: case BT_S:
928     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
929       *nextTokPtr = ptr;
930       return XML_TOK_POUND_NAME;
931     default:
932       *nextTokPtr = ptr;
933       return XML_TOK_INVALID;
934     }
935   }
936   return -XML_TOK_POUND_NAME;
937 }
938 
939 static int PTRCALL
940 PREFIX(scanLit)(int open, const ENCODING *enc,
941                 const char *ptr, const char *end,
942                 const char **nextTokPtr)
943 {
944   while (ptr != end) {
945     int t = BYTE_TYPE(enc, ptr);
946     switch (t) {
947     INVALID_CASES(ptr, nextTokPtr)
948     case BT_QUOT:
949     case BT_APOS:
950       ptr += MINBPC(enc);
951       if (t != open)
952         break;
953       if (ptr == end)
954         return -XML_TOK_LITERAL;
955       *nextTokPtr = ptr;
956       switch (BYTE_TYPE(enc, ptr)) {
957       case BT_S: case BT_CR: case BT_LF:
958       case BT_GT: case BT_PERCNT: case BT_LSQB:
959         return XML_TOK_LITERAL;
960       default:
961         return XML_TOK_INVALID;
962       }
963     default:
964       ptr += MINBPC(enc);
965       break;
966     }
967   }
968   return XML_TOK_PARTIAL;
969 }
970 
971 static int PTRCALL
972 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
973                   const char **nextTokPtr)
974 {
975   int tok;
976   if (ptr == end)
977     return XML_TOK_NONE;
978   if (MINBPC(enc) > 1) {
979     size_t n = end - ptr;
980     if (n & (MINBPC(enc) - 1)) {
981       n &= ~(MINBPC(enc) - 1);
982       if (n == 0)
983         return XML_TOK_PARTIAL;
984       end = ptr + n;
985     }
986   }
987   switch (BYTE_TYPE(enc, ptr)) {
988   case BT_QUOT:
989     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
990   case BT_APOS:
991     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
992   case BT_LT:
993     {
994       ptr += MINBPC(enc);
995       if (ptr == end)
996         return XML_TOK_PARTIAL;
997       switch (BYTE_TYPE(enc, ptr)) {
998       case BT_EXCL:
999         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1000       case BT_QUEST:
1001         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1002       case BT_NMSTRT:
1003       case BT_HEX:
1004       case BT_NONASCII:
1005       case BT_LEAD2:
1006       case BT_LEAD3:
1007       case BT_LEAD4:
1008         *nextTokPtr = ptr - MINBPC(enc);
1009         return XML_TOK_INSTANCE_START;
1010       }
1011       *nextTokPtr = ptr;
1012       return XML_TOK_INVALID;
1013     }
1014   case BT_CR:
1015     if (ptr + MINBPC(enc) == end) {
1016       *nextTokPtr = end;
1017       /* indicate that this might be part of a CR/LF pair */
1018       return -XML_TOK_PROLOG_S;
1019     }
1020     /* fall through */
1021   case BT_S: case BT_LF:
1022     for (;;) {
1023       ptr += MINBPC(enc);
1024       if (ptr == end)
1025         break;
1026       switch (BYTE_TYPE(enc, ptr)) {
1027       case BT_S: case BT_LF:
1028         break;
1029       case BT_CR:
1030         /* don't split CR/LF pair */
1031         if (ptr + MINBPC(enc) != end)
1032           break;
1033         /* fall through */
1034       default:
1035         *nextTokPtr = ptr;
1036         return XML_TOK_PROLOG_S;
1037       }
1038     }
1039     *nextTokPtr = ptr;
1040     return XML_TOK_PROLOG_S;
1041   case BT_PERCNT:
1042     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1043   case BT_COMMA:
1044     *nextTokPtr = ptr + MINBPC(enc);
1045     return XML_TOK_COMMA;
1046   case BT_LSQB:
1047     *nextTokPtr = ptr + MINBPC(enc);
1048     return XML_TOK_OPEN_BRACKET;
1049   case BT_RSQB:
1050     ptr += MINBPC(enc);
1051     if (ptr == end)
1052       return -XML_TOK_CLOSE_BRACKET;
1053     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1054       if (ptr + MINBPC(enc) == end)
1055         return XML_TOK_PARTIAL;
1056       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1057         *nextTokPtr = ptr + 2*MINBPC(enc);
1058         return XML_TOK_COND_SECT_CLOSE;
1059       }
1060     }
1061     *nextTokPtr = ptr;
1062     return XML_TOK_CLOSE_BRACKET;
1063   case BT_LPAR:
1064     *nextTokPtr = ptr + MINBPC(enc);
1065     return XML_TOK_OPEN_PAREN;
1066   case BT_RPAR:
1067     ptr += MINBPC(enc);
1068     if (ptr == end)
1069       return -XML_TOK_CLOSE_PAREN;
1070     switch (BYTE_TYPE(enc, ptr)) {
1071     case BT_AST:
1072       *nextTokPtr = ptr + MINBPC(enc);
1073       return XML_TOK_CLOSE_PAREN_ASTERISK;
1074     case BT_QUEST:
1075       *nextTokPtr = ptr + MINBPC(enc);
1076       return XML_TOK_CLOSE_PAREN_QUESTION;
1077     case BT_PLUS:
1078       *nextTokPtr = ptr + MINBPC(enc);
1079       return XML_TOK_CLOSE_PAREN_PLUS;
1080     case BT_CR: case BT_LF: case BT_S:
1081     case BT_GT: case BT_COMMA: case BT_VERBAR:
1082     case BT_RPAR:
1083       *nextTokPtr = ptr;
1084       return XML_TOK_CLOSE_PAREN;
1085     }
1086     *nextTokPtr = ptr;
1087     return XML_TOK_INVALID;
1088   case BT_VERBAR:
1089     *nextTokPtr = ptr + MINBPC(enc);
1090     return XML_TOK_OR;
1091   case BT_GT:
1092     *nextTokPtr = ptr + MINBPC(enc);
1093     return XML_TOK_DECL_CLOSE;
1094   case BT_NUM:
1095     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1096 #define LEAD_CASE(n) \
1097   case BT_LEAD ## n: \
1098     if (end - ptr < n) \
1099       return XML_TOK_PARTIAL_CHAR; \
1100     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1101       ptr += n; \
1102       tok = XML_TOK_NAME; \
1103       break; \
1104     } \
1105     if (IS_NAME_CHAR(enc, ptr, n)) { \
1106       ptr += n; \
1107       tok = XML_TOK_NMTOKEN; \
1108       break; \
1109     } \
1110     *nextTokPtr = ptr; \
1111     return XML_TOK_INVALID;
1112     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1113 #undef LEAD_CASE
1114   case BT_NMSTRT:
1115   case BT_HEX:
1116     tok = XML_TOK_NAME;
1117     ptr += MINBPC(enc);
1118     break;
1119   case BT_DIGIT:
1120   case BT_NAME:
1121   case BT_MINUS:
1122 #ifdef XML_NS
1123   case BT_COLON:
1124 #endif
1125     tok = XML_TOK_NMTOKEN;
1126     ptr += MINBPC(enc);
1127     break;
1128   case BT_NONASCII:
1129     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1130       ptr += MINBPC(enc);
1131       tok = XML_TOK_NAME;
1132       break;
1133     }
1134     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1135       ptr += MINBPC(enc);
1136       tok = XML_TOK_NMTOKEN;
1137       break;
1138     }
1139     /* fall through */
1140   default:
1141     *nextTokPtr = ptr;
1142     return XML_TOK_INVALID;
1143   }
1144   while (ptr != end) {
1145     switch (BYTE_TYPE(enc, ptr)) {
1146     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1147     case BT_GT: case BT_RPAR: case BT_COMMA:
1148     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1149     case BT_S: case BT_CR: case BT_LF:
1150       *nextTokPtr = ptr;
1151       return tok;
1152 #ifdef XML_NS
1153     case BT_COLON:
1154       ptr += MINBPC(enc);
1155       switch (tok) {
1156       case XML_TOK_NAME:
1157         if (ptr == end)
1158           return XML_TOK_PARTIAL;
1159         tok = XML_TOK_PREFIXED_NAME;
1160         switch (BYTE_TYPE(enc, ptr)) {
1161         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1162         default:
1163           tok = XML_TOK_NMTOKEN;
1164           break;
1165         }
1166         break;
1167       case XML_TOK_PREFIXED_NAME:
1168         tok = XML_TOK_NMTOKEN;
1169         break;
1170       }
1171       break;
1172 #endif
1173     case BT_PLUS:
1174       if (tok == XML_TOK_NMTOKEN)  {
1175         *nextTokPtr = ptr;
1176         return XML_TOK_INVALID;
1177       }
1178       *nextTokPtr = ptr + MINBPC(enc);
1179       return XML_TOK_NAME_PLUS;
1180     case BT_AST:
1181       if (tok == XML_TOK_NMTOKEN)  {
1182         *nextTokPtr = ptr;
1183         return XML_TOK_INVALID;
1184       }
1185       *nextTokPtr = ptr + MINBPC(enc);
1186       return XML_TOK_NAME_ASTERISK;
1187     case BT_QUEST:
1188       if (tok == XML_TOK_NMTOKEN)  {
1189         *nextTokPtr = ptr;
1190         return XML_TOK_INVALID;
1191       }
1192       *nextTokPtr = ptr + MINBPC(enc);
1193       return XML_TOK_NAME_QUESTION;
1194     default:
1195       *nextTokPtr = ptr;
1196       return XML_TOK_INVALID;
1197     }
1198   }
1199   return -tok;
1200 }
1201 
1202 static int PTRCALL
1203 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1204                           const char *end, const char **nextTokPtr)
1205 {
1206   const char *start;
1207   if (ptr == end)
1208     return XML_TOK_NONE;
1209   start = ptr;
1210   while (ptr != end) {
1211     switch (BYTE_TYPE(enc, ptr)) {
1212 #define LEAD_CASE(n) \
1213     case BT_LEAD ## n: ptr += n; break;
1214     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1215 #undef LEAD_CASE
1216     case BT_AMP:
1217       if (ptr == start)
1218         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1219       *nextTokPtr = ptr;
1220       return XML_TOK_DATA_CHARS;
1221     case BT_LT:
1222       /* this is for inside entity references */
1223       *nextTokPtr = ptr;
1224       return XML_TOK_INVALID;
1225     case BT_LF:
1226       if (ptr == start) {
1227         *nextTokPtr = ptr + MINBPC(enc);
1228         return XML_TOK_DATA_NEWLINE;
1229       }
1230       *nextTokPtr = ptr;
1231       return XML_TOK_DATA_CHARS;
1232     case BT_CR:
1233       if (ptr == start) {
1234         ptr += MINBPC(enc);
1235         if (ptr == end)
1236           return XML_TOK_TRAILING_CR;
1237         if (BYTE_TYPE(enc, ptr) == BT_LF)
1238           ptr += MINBPC(enc);
1239         *nextTokPtr = ptr;
1240         return XML_TOK_DATA_NEWLINE;
1241       }
1242       *nextTokPtr = ptr;
1243       return XML_TOK_DATA_CHARS;
1244     case BT_S:
1245       if (ptr == start) {
1246         *nextTokPtr = ptr + MINBPC(enc);
1247         return XML_TOK_ATTRIBUTE_VALUE_S;
1248       }
1249       *nextTokPtr = ptr;
1250       return XML_TOK_DATA_CHARS;
1251     default:
1252       ptr += MINBPC(enc);
1253       break;
1254     }
1255   }
1256   *nextTokPtr = ptr;
1257   return XML_TOK_DATA_CHARS;
1258 }
1259 
1260 static int PTRCALL
1261 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1262                        const char *end, const char **nextTokPtr)
1263 {
1264   const char *start;
1265   if (ptr == end)
1266     return XML_TOK_NONE;
1267   start = ptr;
1268   while (ptr != end) {
1269     switch (BYTE_TYPE(enc, ptr)) {
1270 #define LEAD_CASE(n) \
1271     case BT_LEAD ## n: ptr += n; break;
1272     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1273 #undef LEAD_CASE
1274     case BT_AMP:
1275       if (ptr == start)
1276         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1277       *nextTokPtr = ptr;
1278       return XML_TOK_DATA_CHARS;
1279     case BT_PERCNT:
1280       if (ptr == start) {
1281         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1282                                        end, nextTokPtr);
1283         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1284       }
1285       *nextTokPtr = ptr;
1286       return XML_TOK_DATA_CHARS;
1287     case BT_LF:
1288       if (ptr == start) {
1289         *nextTokPtr = ptr + MINBPC(enc);
1290         return XML_TOK_DATA_NEWLINE;
1291       }
1292       *nextTokPtr = ptr;
1293       return XML_TOK_DATA_CHARS;
1294     case BT_CR:
1295       if (ptr == start) {
1296         ptr += MINBPC(enc);
1297         if (ptr == end)
1298           return XML_TOK_TRAILING_CR;
1299         if (BYTE_TYPE(enc, ptr) == BT_LF)
1300           ptr += MINBPC(enc);
1301         *nextTokPtr = ptr;
1302         return XML_TOK_DATA_NEWLINE;
1303       }
1304       *nextTokPtr = ptr;
1305       return XML_TOK_DATA_CHARS;
1306     default:
1307       ptr += MINBPC(enc);
1308       break;
1309     }
1310   }
1311   *nextTokPtr = ptr;
1312   return XML_TOK_DATA_CHARS;
1313 }
1314 
1315 #ifdef XML_DTD
1316 
1317 static int PTRCALL
1318 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1319                          const char *end, const char **nextTokPtr)
1320 {
1321   int level = 0;
1322   if (MINBPC(enc) > 1) {
1323     size_t n = end - ptr;
1324     if (n & (MINBPC(enc) - 1)) {
1325       n &= ~(MINBPC(enc) - 1);
1326       end = ptr + n;
1327     }
1328   }
1329   while (ptr != end) {
1330     switch (BYTE_TYPE(enc, ptr)) {
1331     INVALID_CASES(ptr, nextTokPtr)
1332     case BT_LT:
1333       if ((ptr += MINBPC(enc)) == end)
1334         return XML_TOK_PARTIAL;
1335       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1336         if ((ptr += MINBPC(enc)) == end)
1337           return XML_TOK_PARTIAL;
1338         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1339           ++level;
1340           ptr += MINBPC(enc);
1341         }
1342       }
1343       break;
1344     case BT_RSQB:
1345       if ((ptr += MINBPC(enc)) == end)
1346         return XML_TOK_PARTIAL;
1347       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1348         if ((ptr += MINBPC(enc)) == end)
1349           return XML_TOK_PARTIAL;
1350         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1351           ptr += MINBPC(enc);
1352           if (level == 0) {
1353             *nextTokPtr = ptr;
1354             return XML_TOK_IGNORE_SECT;
1355           }
1356           --level;
1357         }
1358       }
1359       break;
1360     default:
1361       ptr += MINBPC(enc);
1362       break;
1363     }
1364   }
1365   return XML_TOK_PARTIAL;
1366 }
1367 
1368 #endif /* XML_DTD */
1369 
1370 static int PTRCALL
1371 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1372                    const char **badPtr)
1373 {
1374   ptr += MINBPC(enc);
1375   end -= MINBPC(enc);
1376   for (; ptr != end; ptr += MINBPC(enc)) {
1377     switch (BYTE_TYPE(enc, ptr)) {
1378     case BT_DIGIT:
1379     case BT_HEX:
1380     case BT_MINUS:
1381     case BT_APOS:
1382     case BT_LPAR:
1383     case BT_RPAR:
1384     case BT_PLUS:
1385     case BT_COMMA:
1386     case BT_SOL:
1387     case BT_EQUALS:
1388     case BT_QUEST:
1389     case BT_CR:
1390     case BT_LF:
1391     case BT_SEMI:
1392     case BT_EXCL:
1393     case BT_AST:
1394     case BT_PERCNT:
1395     case BT_NUM:
1396 #ifdef XML_NS
1397     case BT_COLON:
1398 #endif
1399       break;
1400     case BT_S:
1401       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1402         *badPtr = ptr;
1403         return 0;
1404       }
1405       break;
1406     case BT_NAME:
1407     case BT_NMSTRT:
1408       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1409         break;
1410     default:
1411       switch (BYTE_TO_ASCII(enc, ptr)) {
1412       case 0x24: /* $ */
1413       case 0x40: /* @ */
1414         break;
1415       default:
1416         *badPtr = ptr;
1417         return 0;
1418       }
1419       break;
1420     }
1421   }
1422   return 1;
1423 }
1424 
1425 /* This must only be called for a well-formed start-tag or empty
1426    element tag.  Returns the number of attributes.  Pointers to the
1427    first attsMax attributes are stored in atts.
1428 */
1429 
1430 static int PTRCALL
1431 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1432                 int attsMax, ATTRIBUTE *atts)
1433 {
1434   enum { other, inName, inValue } state = inName;
1435   int nAtts = 0;
1436   int open = 0; /* defined when state == inValue;
1437                    initialization just to shut up compilers */
1438 
1439   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1440     switch (BYTE_TYPE(enc, ptr)) {
1441 #define START_NAME \
1442       if (state == other) { \
1443         if (nAtts < attsMax) { \
1444           atts[nAtts].name = ptr; \
1445           atts[nAtts].normalized = 1; \
1446         } \
1447         state = inName; \
1448       }
1449 #define LEAD_CASE(n) \
1450     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1451     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1452 #undef LEAD_CASE
1453     case BT_NONASCII:
1454     case BT_NMSTRT:
1455     case BT_HEX:
1456       START_NAME
1457       break;
1458 #undef START_NAME
1459     case BT_QUOT:
1460       if (state != inValue) {
1461         if (nAtts < attsMax)
1462           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1463         state = inValue;
1464         open = BT_QUOT;
1465       }
1466       else if (open == BT_QUOT) {
1467         state = other;
1468         if (nAtts < attsMax)
1469           atts[nAtts].valueEnd = ptr;
1470         nAtts++;
1471       }
1472       break;
1473     case BT_APOS:
1474       if (state != inValue) {
1475         if (nAtts < attsMax)
1476           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1477         state = inValue;
1478         open = BT_APOS;
1479       }
1480       else if (open == BT_APOS) {
1481         state = other;
1482         if (nAtts < attsMax)
1483           atts[nAtts].valueEnd = ptr;
1484         nAtts++;
1485       }
1486       break;
1487     case BT_AMP:
1488       if (nAtts < attsMax)
1489         atts[nAtts].normalized = 0;
1490       break;
1491     case BT_S:
1492       if (state == inName)
1493         state = other;
1494       else if (state == inValue
1495                && nAtts < attsMax
1496                && atts[nAtts].normalized
1497                && (ptr == atts[nAtts].valuePtr
1498                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1499                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1500                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1501         atts[nAtts].normalized = 0;
1502       break;
1503     case BT_CR: case BT_LF:
1504       /* This case ensures that the first attribute name is counted
1505          Apart from that we could just change state on the quote. */
1506       if (state == inName)
1507         state = other;
1508       else if (state == inValue && nAtts < attsMax)
1509         atts[nAtts].normalized = 0;
1510       break;
1511     case BT_GT:
1512     case BT_SOL:
1513       if (state != inValue)
1514         return nAtts;
1515       break;
1516     default:
1517       break;
1518     }
1519   }
1520   /* not reached */
1521 }
1522 
1523 static int PTRFASTCALL
1524 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1525 {
1526   int result = 0;
1527   /* skip &# */
1528   ptr += 2*MINBPC(enc);
1529   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1530     for (ptr += MINBPC(enc);
1531          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1532          ptr += MINBPC(enc)) {
1533       int c = BYTE_TO_ASCII(enc, ptr);
1534       switch (c) {
1535       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1536       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1537         result <<= 4;
1538         result |= (c - ASCII_0);
1539         break;
1540       case ASCII_A: case ASCII_B: case ASCII_C:
1541       case ASCII_D: case ASCII_E: case ASCII_F:
1542         result <<= 4;
1543         result += 10 + (c - ASCII_A);
1544         break;
1545       case ASCII_a: case ASCII_b: case ASCII_c:
1546       case ASCII_d: case ASCII_e: case ASCII_f:
1547         result <<= 4;
1548         result += 10 + (c - ASCII_a);
1549         break;
1550       }
1551       if (result >= 0x110000)
1552         return -1;
1553     }
1554   }
1555   else {
1556     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1557       int c = BYTE_TO_ASCII(enc, ptr);
1558       result *= 10;
1559       result += (c - ASCII_0);
1560       if (result >= 0x110000)
1561         return -1;
1562     }
1563   }
1564   return checkCharRefNumber(result);
1565 }
1566 
1567 static int PTRCALL
1568 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1569                              const char *end)
1570 {
1571   switch ((end - ptr)/MINBPC(enc)) {
1572   case 2:
1573     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1574       switch (BYTE_TO_ASCII(enc, ptr)) {
1575       case ASCII_l:
1576         return ASCII_LT;
1577       case ASCII_g:
1578         return ASCII_GT;
1579       }
1580     }
1581     break;
1582   case 3:
1583     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1584       ptr += MINBPC(enc);
1585       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1586         ptr += MINBPC(enc);
1587         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1588           return ASCII_AMP;
1589       }
1590     }
1591     break;
1592   case 4:
1593     switch (BYTE_TO_ASCII(enc, ptr)) {
1594     case ASCII_q:
1595       ptr += MINBPC(enc);
1596       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1597         ptr += MINBPC(enc);
1598         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1599           ptr += MINBPC(enc);
1600           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1601             return ASCII_QUOT;
1602         }
1603       }
1604       break;
1605     case ASCII_a:
1606       ptr += MINBPC(enc);
1607       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1608         ptr += MINBPC(enc);
1609         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1610           ptr += MINBPC(enc);
1611           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1612             return ASCII_APOS;
1613         }
1614       }
1615       break;
1616     }
1617   }
1618   return 0;
1619 }
1620 
1621 static int PTRCALL
1622 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1623 {
1624   for (;;) {
1625     switch (BYTE_TYPE(enc, ptr1)) {
1626 #define LEAD_CASE(n) \
1627     case BT_LEAD ## n: \
1628       if (*ptr1++ != *ptr2++) \
1629         return 0;
1630     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1631 #undef LEAD_CASE
1632       /* fall through */
1633       if (*ptr1++ != *ptr2++)
1634         return 0;
1635       break;
1636     case BT_NONASCII:
1637     case BT_NMSTRT:
1638 #ifdef XML_NS
1639     case BT_COLON:
1640 #endif
1641     case BT_HEX:
1642     case BT_DIGIT:
1643     case BT_NAME:
1644     case BT_MINUS:
1645       if (*ptr2++ != *ptr1++)
1646         return 0;
1647       if (MINBPC(enc) > 1) {
1648         if (*ptr2++ != *ptr1++)
1649           return 0;
1650         if (MINBPC(enc) > 2) {
1651           if (*ptr2++ != *ptr1++)
1652             return 0;
1653           if (MINBPC(enc) > 3) {
1654             if (*ptr2++ != *ptr1++)
1655               return 0;
1656           }
1657         }
1658       }
1659       break;
1660     default:
1661       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1662         return 1;
1663       switch (BYTE_TYPE(enc, ptr2)) {
1664       case BT_LEAD2:
1665       case BT_LEAD3:
1666       case BT_LEAD4:
1667       case BT_NONASCII:
1668       case BT_NMSTRT:
1669 #ifdef XML_NS
1670       case BT_COLON:
1671 #endif
1672       case BT_HEX:
1673       case BT_DIGIT:
1674       case BT_NAME:
1675       case BT_MINUS:
1676         return 0;
1677       default:
1678         return 1;
1679       }
1680     }
1681   }
1682   /* not reached */
1683 }
1684 
1685 static int PTRCALL
1686 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1687                          const char *end1, const char *ptr2)
1688 {
1689   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1690     if (ptr1 == end1)
1691       return 0;
1692     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1693       return 0;
1694   }
1695   return ptr1 == end1;
1696 }
1697 
1698 static int PTRFASTCALL
1699 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1700 {
1701   const char *start = ptr;
1702   for (;;) {
1703     switch (BYTE_TYPE(enc, ptr)) {
1704 #define LEAD_CASE(n) \
1705     case BT_LEAD ## n: ptr += n; break;
1706     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1707 #undef LEAD_CASE
1708     case BT_NONASCII:
1709     case BT_NMSTRT:
1710 #ifdef XML_NS
1711     case BT_COLON:
1712 #endif
1713     case BT_HEX:
1714     case BT_DIGIT:
1715     case BT_NAME:
1716     case BT_MINUS:
1717       ptr += MINBPC(enc);
1718       break;
1719     default:
1720       return (int)(ptr - start);
1721     }
1722   }
1723 }
1724 
1725 static const char * PTRFASTCALL
1726 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1727 {
1728   for (;;) {
1729     switch (BYTE_TYPE(enc, ptr)) {
1730     case BT_LF:
1731     case BT_CR:
1732     case BT_S:
1733       ptr += MINBPC(enc);
1734       break;
1735     default:
1736       return ptr;
1737     }
1738   }
1739 }
1740 
1741 static void PTRCALL
1742 PREFIX(updatePosition)(const ENCODING *enc,
1743                        const char *ptr,
1744                        const char *end,
1745                        POSITION *pos)
1746 {
1747   while (ptr < end) {
1748     switch (BYTE_TYPE(enc, ptr)) {
1749 #define LEAD_CASE(n) \
1750     case BT_LEAD ## n: \
1751       ptr += n; \
1752       break;
1753     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1754 #undef LEAD_CASE
1755     case BT_LF:
1756       pos->columnNumber = (XML_Size)-1;
1757       pos->lineNumber++;
1758       ptr += MINBPC(enc);
1759       break;
1760     case BT_CR:
1761       pos->lineNumber++;
1762       ptr += MINBPC(enc);
1763       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1764         ptr += MINBPC(enc);
1765       pos->columnNumber = (XML_Size)-1;
1766       break;
1767     default:
1768       ptr += MINBPC(enc);
1769       break;
1770     }
1771     pos->columnNumber++;
1772   }
1773 }
1774 
1775 #undef DO_LEAD_CASE
1776 #undef MULTIBYTE_CASES
1777 #undef INVALID_CASES
1778 #undef CHECK_NAME_CASE
1779 #undef CHECK_NAME_CASES
1780 #undef CHECK_NMSTRT_CASE
1781 #undef CHECK_NMSTRT_CASES
1782 
1783 #endif /* XML_TOK_IMPL_C */
1784