xref: /freebsd/contrib/expat/lib/xmltok_impl.c (revision 31d62a73c2e6ac0ff413a7a17700ffc7dce254ef)
1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2    See the file COPYING for copying permission.
3 */
4 
5 /* This file is included! */
6 #ifdef XML_TOK_IMPL_C
7 
/* Fallback definition: if the file that includes this one has not supplied
   IS_INVALID_CHAR, no multi-byte sequence is ever reported as invalid. */
8 #ifndef IS_INVALID_CHAR
9 #define IS_INVALID_CHAR(enc, ptr, n) (0)
10 #endif
11 
/* Expands to the `case BT_LEADn:` arm of a byte-type switch for an n-byte
   lead byte.  Uses the variables `end` and `enc` of the enclosing function.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR reports the sequence invalid: stores ptr in *nextTokPtr
     and returns XML_TOK_INVALID;
   - otherwise advances ptr by n and leaves the switch. */
12 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
13     case BT_LEAD ## n: \
14       if (end - ptr < n) \
15         return XML_TOK_PARTIAL_CHAR; \
16       if (IS_INVALID_CHAR(enc, ptr, n)) { \
17         *(nextTokPtr) = (ptr); \
18         return XML_TOK_INVALID; \
19       } \
20       ptr += n; \
21       break;
22 
/* Expands to the switch arms shared by scanners that accept arbitrary
   character data: the 2-, 3- and 4-byte INVALID_LEAD_CASE arms, plus arms
   for BT_NONXML, BT_MALFORM and BT_TRAIL that store ptr in *nextTokPtr and
   return XML_TOK_INVALID. */
23 #define INVALID_CASES(ptr, nextTokPtr) \
24   INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
25   INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
26   INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
27   case BT_NONXML: \
28   case BT_MALFORM: \
29   case BT_TRAIL: \
30     *(nextTokPtr) = (ptr); \
31     return XML_TOK_INVALID;
32 
/* Expands to the `case BT_LEADn:` arm of a byte-type switch that accepts an
   n-byte sequence only if it is a name character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is malformed (IS_INVALID_CHAR) or is not a name character
     (IS_NAME_CHAR): stores ptr in *nextTokPtr and returns XML_TOK_INVALID;
   - otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that a malformed multi-byte
   sequence is never handed to the name-character lookup or accepted as part
   of a name. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
43 
/* Expands to the switch arms that consume one name character.
   BT_NONASCII is rejected with XML_TOK_INVALID (ptr stored in *nextTokPtr)
   unless IS_NAME_CHAR_MINBPC accepts it; when accepted it deliberately falls
   through into the single-unit arm shared with BT_NMSTRT, BT_HEX, BT_DIGIT,
   BT_NAME and BT_MINUS, which advances ptr by MINBPC(enc).  The 2-, 3- and
   4-byte lead cases are handled by CHECK_NAME_CASE. */
44 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
45   case BT_NONASCII: \
46     if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
47       *nextTokPtr = ptr; \
48       return XML_TOK_INVALID; \
49     } \
50   case BT_NMSTRT: \
51   case BT_HEX: \
52   case BT_DIGIT: \
53   case BT_NAME: \
54   case BT_MINUS: \
55     ptr += MINBPC(enc); \
56     break; \
57   CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
58   CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
59   CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60 
/* Expands to the `case BT_LEADn:` arm of a byte-type switch that accepts an
   n-byte sequence only if it is a name-start character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is malformed (IS_INVALID_CHAR) or is not a name-start
     character (IS_NMSTRT_CHAR): stores ptr in *nextTokPtr and returns
     XML_TOK_INVALID;
   - otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that a malformed multi-byte
   sequence is never handed to the name-start lookup or accepted as the
   start of a name. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
71 
/* Expands to the switch arms that consume one name-start character.
   BT_NONASCII is rejected with XML_TOK_INVALID (ptr stored in *nextTokPtr)
   unless IS_NMSTRT_CHAR_MINBPC accepts it; when accepted it deliberately
   falls through into the arm shared with BT_NMSTRT and BT_HEX, which
   advances ptr by MINBPC(enc).  The 2-, 3- and 4-byte lead cases are handled
   by CHECK_NMSTRT_CASE. */
72 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
73   case BT_NONASCII: \
74     if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
75       *nextTokPtr = ptr; \
76       return XML_TOK_INVALID; \
77     } \
78   case BT_NMSTRT: \
79   case BT_HEX: \
80     ptr += MINBPC(enc); \
81     break; \
82   CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
83   CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
84   CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85 
/* PREFIX(ident) wraps the name of every function defined in this file.  If
   the including file has not defined it, names are used unchanged. */
86 #ifndef PREFIX
87 #define PREFIX(ident) ident
88 #endif
89 
90 
/* True when at least `count` minimum-size characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  Every argument is parenthesized so that
   an expression argument such as `a + b` is multiplied as a whole. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= (count) * MINBPC(enc))

/* True when at least one minimum-size character lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)
96 
/* Statement macro: makes the enclosing function return XML_TOK_PARTIAL
   unless HAS_CHARS(enc, ptr, end, count) holds.  It expands to a bare
   `{ ... }` block, so it must not be used as the unbraced body of an `if`
   that is followed by `else`. */
97 #define REQUIRE_CHARS(enc, ptr, end, count) \
98     { \
99       if (! HAS_CHARS(enc, ptr, end, count)) { \
100         return XML_TOK_PARTIAL; \
101       } \
102     }
103 
/* Single-character form of REQUIRE_CHARS. */
104 #define REQUIRE_CHAR(enc, ptr, end) \
105     REQUIRE_CHARS(enc, ptr, end, 1)
106 
107 
108 /* ptr points to character following "<!-" */
109 
/* Scans the rest of a comment.  The character at ptr must be '-'; the body
   is then scanned up to the first "--".
   Returns:
     XML_TOK_COMMENT       "--" was followed by '>'; *nextTokPtr is set just
                           past the '>'.
     XML_TOK_INVALID       the first character is not '-', an invalid byte
                           type was met (INVALID_CASES), or "--" was not
                           followed by '>'; *nextTokPtr is set to the
                           offending position.
     XML_TOK_PARTIAL       input ended before the comment was complete.
     XML_TOK_PARTIAL_CHAR  input ended inside a multi-byte sequence
                           (from INVALID_CASES). */
110 static int PTRCALL
111 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
112                     const char *end, const char **nextTokPtr)
113 {
114   if (HAS_CHAR(enc, ptr, end)) {
115     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
116       *nextTokPtr = ptr;
117       return XML_TOK_INVALID;
118     }
119     ptr += MINBPC(enc);
120     while (HAS_CHAR(enc, ptr, end)) {
121       switch (BYTE_TYPE(enc, ptr)) {
      /* multi-byte sequences are validated and skipped here */
122       INVALID_CASES(ptr, nextTokPtr)
123       case BT_MINUS:
124         ptr += MINBPC(enc);
125         REQUIRE_CHAR(enc, ptr, end);
126         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
127           ptr += MINBPC(enc);
128           REQUIRE_CHAR(enc, ptr, end);
129           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
130             *nextTokPtr = ptr;
131             return XML_TOK_INVALID;
132           }
133           *nextTokPtr = ptr + MINBPC(enc);
134           return XML_TOK_COMMENT;
135         }
        /* a lone '-': the character after it is re-examined by the loop */
136         break;
137       default:
138         ptr += MINBPC(enc);
139         break;
140       }
141     }
142   }
143   return XML_TOK_PARTIAL;
144 }
145 
146 /* ptr points to character following "<!" */
147 
148 static int PTRCALL
149 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
150                  const char *end, const char **nextTokPtr)
151 {
152   REQUIRE_CHAR(enc, ptr, end);
153   switch (BYTE_TYPE(enc, ptr)) {
154   case BT_MINUS:
155     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
156   case BT_LSQB:
157     *nextTokPtr = ptr + MINBPC(enc);
158     return XML_TOK_COND_SECT_OPEN;
159   case BT_NMSTRT:
160   case BT_HEX:
161     ptr += MINBPC(enc);
162     break;
163   default:
164     *nextTokPtr = ptr;
165     return XML_TOK_INVALID;
166   }
167   while (HAS_CHAR(enc, ptr, end)) {
168     switch (BYTE_TYPE(enc, ptr)) {
169     case BT_PERCNT:
170       REQUIRE_CHARS(enc, ptr, end, 2);
171       /* don't allow <!ENTITY% foo "whatever"> */
172       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
173       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
174         *nextTokPtr = ptr;
175         return XML_TOK_INVALID;
176       }
177       /* fall through */
178     case BT_S: case BT_CR: case BT_LF:
179       *nextTokPtr = ptr;
180       return XML_TOK_DECL_OPEN;
181     case BT_NMSTRT:
182     case BT_HEX:
183       ptr += MINBPC(enc);
184       break;
185     default:
186       *nextTokPtr = ptr;
187       return XML_TOK_INVALID;
188     }
189   }
190   return XML_TOK_PARTIAL;
191 }
192 
193 static int PTRCALL
194 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
195                       const char *end, int *tokPtr)
196 {
197   int upper = 0;
198   *tokPtr = XML_TOK_PI;
199   if (end - ptr != MINBPC(enc)*3)
200     return 1;
201   switch (BYTE_TO_ASCII(enc, ptr)) {
202   case ASCII_x:
203     break;
204   case ASCII_X:
205     upper = 1;
206     break;
207   default:
208     return 1;
209   }
210   ptr += MINBPC(enc);
211   switch (BYTE_TO_ASCII(enc, ptr)) {
212   case ASCII_m:
213     break;
214   case ASCII_M:
215     upper = 1;
216     break;
217   default:
218     return 1;
219   }
220   ptr += MINBPC(enc);
221   switch (BYTE_TO_ASCII(enc, ptr)) {
222   case ASCII_l:
223     break;
224   case ASCII_L:
225     upper = 1;
226     break;
227   default:
228     return 1;
229   }
230   if (upper)
231     return 0;
232   *tokPtr = XML_TOK_XML_DECL;
233   return 1;
234 }
235 
236 /* ptr points to character following "<?" */
237 
/* Scans a processing instruction, starting at the first character of its
   target.  The target must begin with a name-start character and continue
   with name characters; it is then classified by checkPiTarget.
   Returns:
     the token chosen by checkPiTarget   when "?>" is found, either directly
                                         after the target or after white
                                         space and arbitrary content;
                                         *nextTokPtr is set just past '>'.
     XML_TOK_INVALID                     bad target character, target
                                         rejected by checkPiTarget, invalid
                                         byte type in the content, or '?'
                                         directly after the target not
                                         followed by '>'; *nextTokPtr is set
                                         to the offending position.
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR   input ended too early. */
238 static int PTRCALL
239 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
240                const char *end, const char **nextTokPtr)
241 {
242   int tok;
  /* remembered so the whole target can be handed to checkPiTarget */
243   const char *target = ptr;
244   REQUIRE_CHAR(enc, ptr, end);
245   switch (BYTE_TYPE(enc, ptr)) {
246   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
247   default:
248     *nextTokPtr = ptr;
249     return XML_TOK_INVALID;
250   }
251   while (HAS_CHAR(enc, ptr, end)) {
252     switch (BYTE_TYPE(enc, ptr)) {
253     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
254     case BT_S: case BT_CR: case BT_LF:
255       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
256         *nextTokPtr = ptr;
257         return XML_TOK_INVALID;
258       }
259       ptr += MINBPC(enc);
      /* skip the instruction's content until "?>" */
260       while (HAS_CHAR(enc, ptr, end)) {
261         switch (BYTE_TYPE(enc, ptr)) {
262         INVALID_CASES(ptr, nextTokPtr)
263         case BT_QUEST:
264           ptr += MINBPC(enc);
265           REQUIRE_CHAR(enc, ptr, end);
266           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
267             *nextTokPtr = ptr + MINBPC(enc);
268             return tok;
269           }
270           break;
271         default:
272           ptr += MINBPC(enc);
273           break;
274         }
275       }
276       return XML_TOK_PARTIAL;
277     case BT_QUEST:
278       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
279         *nextTokPtr = ptr;
280         return XML_TOK_INVALID;
281       }
282       ptr += MINBPC(enc);
283       REQUIRE_CHAR(enc, ptr, end);
284       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
285         *nextTokPtr = ptr + MINBPC(enc);
286         return tok;
287       }
288       /* fall through */
289     default:
290       *nextTokPtr = ptr;
291       return XML_TOK_INVALID;
292     }
293   }
294   return XML_TOK_PARTIAL;
295 }
296 
297 static int PTRCALL
298 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
299                          const char *end, const char **nextTokPtr)
300 {
301   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
302                                      ASCII_T, ASCII_A, ASCII_LSQB };
303   int i;
304   /* CDATA[ */
305   REQUIRE_CHARS(enc, ptr, end, 6);
306   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
307     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
308       *nextTokPtr = ptr;
309       return XML_TOK_INVALID;
310     }
311   }
312   *nextTokPtr = ptr;
313   return XML_TOK_CDATA_SECT_OPEN;
314 }
315 
/* Returns the next token inside a CDATA section.
   Returns:
     XML_TOK_NONE              ptr >= end.
     XML_TOK_PARTIAL           not enough input for a decision (including
                               less than one whole character when
                               MINBPC(enc) > 1).
     XML_TOK_CDATA_SECT_CLOSE  the input starts with "]]>"; *nextTokPtr is
                               set just past the '>'.
     XML_TOK_DATA_NEWLINE      the input starts with CR, CR LF or LF;
                               *nextTokPtr is set just past the line break.
     XML_TOK_INVALID           the first character has an invalid byte type
                               (INVALID_CASES); *nextTokPtr points at it.
     XML_TOK_DATA_CHARS        a run of ordinary characters; *nextTokPtr is
                               set to the first character that ends the run
                               (']', CR, LF, an invalid or truncated
                               sequence) or to the end of the input. */
316 static int PTRCALL
317 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
318                         const char *end, const char **nextTokPtr)
319 {
320   if (ptr >= end)
321     return XML_TOK_NONE;
  /* for multi-byte units, ignore a trailing fraction of a character */
322   if (MINBPC(enc) > 1) {
323     size_t n = end - ptr;
324     if (n & (MINBPC(enc) - 1)) {
325       n &= ~(MINBPC(enc) - 1);
326       if (n == 0)
327         return XML_TOK_PARTIAL;
328       end = ptr + n;
329     }
330   }
331   switch (BYTE_TYPE(enc, ptr)) {
332   case BT_RSQB:
333     ptr += MINBPC(enc);
334     REQUIRE_CHAR(enc, ptr, end);
335     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
336       break;
337     ptr += MINBPC(enc);
338     REQUIRE_CHAR(enc, ptr, end);
339     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so the second ']' is scanned again */
340       ptr -= MINBPC(enc);
341       break;
342     }
343     *nextTokPtr = ptr + MINBPC(enc);
344     return XML_TOK_CDATA_SECT_CLOSE;
345   case BT_CR:
346     ptr += MINBPC(enc);
347     REQUIRE_CHAR(enc, ptr, end);
348     if (BYTE_TYPE(enc, ptr) == BT_LF)
349       ptr += MINBPC(enc);
350     *nextTokPtr = ptr;
351     return XML_TOK_DATA_NEWLINE;
352   case BT_LF:
353     *nextTokPtr = ptr + MINBPC(enc);
354     return XML_TOK_DATA_NEWLINE;
355   INVALID_CASES(ptr, nextTokPtr)
356   default:
357     ptr += MINBPC(enc);
358     break;
359   }
  /* extend the run of data characters as far as possible */
360   while (HAS_CHAR(enc, ptr, end)) {
361     switch (BYTE_TYPE(enc, ptr)) {
362 #define LEAD_CASE(n) \
363     case BT_LEAD ## n: \
364       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
365         *nextTokPtr = ptr; \
366         return XML_TOK_DATA_CHARS; \
367       } \
368       ptr += n; \
369       break;
370     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
371 #undef LEAD_CASE
372     case BT_NONXML:
373     case BT_MALFORM:
374     case BT_TRAIL:
375     case BT_CR:
376     case BT_LF:
377     case BT_RSQB:
378       *nextTokPtr = ptr;
379       return XML_TOK_DATA_CHARS;
380     default:
381       ptr += MINBPC(enc);
382       break;
383     }
384   }
385   *nextTokPtr = ptr;
386   return XML_TOK_DATA_CHARS;
387 }
388 
389 /* ptr points to character following "</" */
390 
/* Scans an end tag, starting at the first character of the element name.
   The name must begin with a name-start character and continue with name
   characters (and, when XML_NS is defined, colons); white space is allowed
   between the name and '>'.
   Returns:
     XML_TOK_END_TAG   the closing '>' was found; *nextTokPtr is set just
                       past it.
     XML_TOK_INVALID   an unexpected character was met; *nextTokPtr points
                       at it.
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR   input ended too early. */
391 static int PTRCALL
392 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
393                    const char *end, const char **nextTokPtr)
394 {
395   REQUIRE_CHAR(enc, ptr, end);
396   switch (BYTE_TYPE(enc, ptr)) {
397   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
398   default:
399     *nextTokPtr = ptr;
400     return XML_TOK_INVALID;
401   }
402   while (HAS_CHAR(enc, ptr, end)) {
403     switch (BYTE_TYPE(enc, ptr)) {
404     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
405     case BT_S: case BT_CR: case BT_LF:
      /* after the name only white space and the closing '>' may follow */
406       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
407         switch (BYTE_TYPE(enc, ptr)) {
408         case BT_S: case BT_CR: case BT_LF:
409           break;
410         case BT_GT:
411           *nextTokPtr = ptr + MINBPC(enc);
412           return XML_TOK_END_TAG;
413         default:
414           *nextTokPtr = ptr;
415           return XML_TOK_INVALID;
416         }
417       }
418       return XML_TOK_PARTIAL;
419 #ifdef XML_NS
420     case BT_COLON:
421       /* no need to check qname syntax here,
422          since end-tag must match exactly */
423       ptr += MINBPC(enc);
424       break;
425 #endif
426     case BT_GT:
427       *nextTokPtr = ptr + MINBPC(enc);
428       return XML_TOK_END_TAG;
429     default:
430       *nextTokPtr = ptr;
431       return XML_TOK_INVALID;
432     }
433   }
434   return XML_TOK_PARTIAL;
435 }
436 
437 /* ptr points to character following "&#X" */
438 
439 static int PTRCALL
440 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
441                        const char *end, const char **nextTokPtr)
442 {
443   if (HAS_CHAR(enc, ptr, end)) {
444     switch (BYTE_TYPE(enc, ptr)) {
445     case BT_DIGIT:
446     case BT_HEX:
447       break;
448     default:
449       *nextTokPtr = ptr;
450       return XML_TOK_INVALID;
451     }
452     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
453       switch (BYTE_TYPE(enc, ptr)) {
454       case BT_DIGIT:
455       case BT_HEX:
456         break;
457       case BT_SEMI:
458         *nextTokPtr = ptr + MINBPC(enc);
459         return XML_TOK_CHAR_REF;
460       default:
461         *nextTokPtr = ptr;
462         return XML_TOK_INVALID;
463       }
464     }
465   }
466   return XML_TOK_PARTIAL;
467 }
468 
469 /* ptr points to character following "&#" */
470 
471 static int PTRCALL
472 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
473                     const char *end, const char **nextTokPtr)
474 {
475   if (HAS_CHAR(enc, ptr, end)) {
476     if (CHAR_MATCHES(enc, ptr, ASCII_x))
477       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
478     switch (BYTE_TYPE(enc, ptr)) {
479     case BT_DIGIT:
480       break;
481     default:
482       *nextTokPtr = ptr;
483       return XML_TOK_INVALID;
484     }
485     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
486       switch (BYTE_TYPE(enc, ptr)) {
487       case BT_DIGIT:
488         break;
489       case BT_SEMI:
490         *nextTokPtr = ptr + MINBPC(enc);
491         return XML_TOK_CHAR_REF;
492       default:
493         *nextTokPtr = ptr;
494         return XML_TOK_INVALID;
495       }
496     }
497   }
498   return XML_TOK_PARTIAL;
499 }
500 
501 /* ptr points to character following "&" */
502 
/* Scans a reference, starting at the character after '&'.  A '#'
   (BT_NUM) hands the rest over to scanCharRef; otherwise a name terminated
   by ';' is expected.
   Returns:
     XML_TOK_ENTITY_REF   a name followed by ';' was found; *nextTokPtr is
                          set just past the ';'.
     the result of scanCharRef   when the first character is '#'.
     XML_TOK_INVALID      an unexpected character was met; *nextTokPtr
                          points at it.
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR   input ended too early. */
503 static int PTRCALL
504 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
505                 const char **nextTokPtr)
506 {
507   REQUIRE_CHAR(enc, ptr, end);
508   switch (BYTE_TYPE(enc, ptr)) {
509   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
510   case BT_NUM:
511     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
512   default:
513     *nextTokPtr = ptr;
514     return XML_TOK_INVALID;
515   }
516   while (HAS_CHAR(enc, ptr, end)) {
517     switch (BYTE_TYPE(enc, ptr)) {
518     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
519     case BT_SEMI:
520       *nextTokPtr = ptr + MINBPC(enc);
521       return XML_TOK_ENTITY_REF;
522     default:
523       *nextTokPtr = ptr;
524       return XML_TOK_INVALID;
525     }
526   }
527   return XML_TOK_PARTIAL;
528 }
529 
530 /* ptr points to character following first character of attribute name */
531 
/* Scans the attributes of a start tag through to the end of the tag.  On
   entry ptr is inside the first attribute name.  Each attribute is parsed
   as: name, optional white space, '=', optional white space, a value quoted
   with '"' or '\'', then white space, '>' or "/>".
   When XML_NS is defined, at most one colon is accepted per attribute name
   and it must be followed by a name-start character.
   Returns:
     XML_TOK_START_TAG_WITH_ATTS      the tag ended with '>'; *nextTokPtr is
                                      set just past it.
     XML_TOK_EMPTY_ELEMENT_WITH_ATTS  the tag ended with "/>"; *nextTokPtr
                                      is set just past the '>'.
     XML_TOK_INVALID                  a syntax error; *nextTokPtr points at
                                      the offending character.
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR   input ended too early.
   A non-positive result from scanRef inside an attribute value is returned
   unchanged. */
532 static int PTRCALL
533 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
534                  const char **nextTokPtr)
535 {
536 #ifdef XML_NS
537   int hadColon = 0;
538 #endif
539   while (HAS_CHAR(enc, ptr, end)) {
540     switch (BYTE_TYPE(enc, ptr)) {
541     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
542 #ifdef XML_NS
543     case BT_COLON:
544       if (hadColon) {
545         *nextTokPtr = ptr;
546         return XML_TOK_INVALID;
547       }
548       hadColon = 1;
549       ptr += MINBPC(enc);
550       REQUIRE_CHAR(enc, ptr, end);
551       switch (BYTE_TYPE(enc, ptr)) {
552       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
553       default:
554         *nextTokPtr = ptr;
555         return XML_TOK_INVALID;
556       }
557       break;
558 #endif
559     case BT_S: case BT_CR: case BT_LF:
      /* white space after the name: only more white space or '=' may follow */
560       for (;;) {
561         int t;
562 
563         ptr += MINBPC(enc);
564         REQUIRE_CHAR(enc, ptr, end);
565         t = BYTE_TYPE(enc, ptr);
566         if (t == BT_EQUALS)
567           break;
568         switch (t) {
569         case BT_S:
570         case BT_LF:
571         case BT_CR:
572           break;
573         default:
574           *nextTokPtr = ptr;
575           return XML_TOK_INVALID;
576         }
577       }
578     /* fall through */
579     case BT_EQUALS:
580       {
        /* byte type of the quote character that opened the value */
581         int open;
582 #ifdef XML_NS
583         hadColon = 0;
584 #endif
        /* skip white space between '=' and the opening quote */
585         for (;;) {
586           ptr += MINBPC(enc);
587           REQUIRE_CHAR(enc, ptr, end);
588           open = BYTE_TYPE(enc, ptr);
589           if (open == BT_QUOT || open == BT_APOS)
590             break;
591           switch (open) {
592           case BT_S:
593           case BT_LF:
594           case BT_CR:
595             break;
596           default:
597             *nextTokPtr = ptr;
598             return XML_TOK_INVALID;
599           }
600         }
601         ptr += MINBPC(enc);
602         /* in attribute value */
603         for (;;) {
604           int t;
605           REQUIRE_CHAR(enc, ptr, end);
606           t = BYTE_TYPE(enc, ptr);
607           if (t == open)
608             break;
609           switch (t) {
610           INVALID_CASES(ptr, nextTokPtr)
611           case BT_AMP:
612             {
              /* scanRef stores its next-token position directly in ptr */
613               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
614               if (tok <= 0) {
615                 if (tok == XML_TOK_INVALID)
616                   *nextTokPtr = ptr;
617                 return tok;
618               }
619               break;
620             }
621           case BT_LT:
622             *nextTokPtr = ptr;
623             return XML_TOK_INVALID;
624           default:
625             ptr += MINBPC(enc);
626             break;
627           }
628         }
629         ptr += MINBPC(enc);
630         REQUIRE_CHAR(enc, ptr, end);
        /* the closing quote must be followed by white space, '/' or '>' */
631         switch (BYTE_TYPE(enc, ptr)) {
632         case BT_S:
633         case BT_CR:
634         case BT_LF:
635           break;
636         case BT_SOL:
637           goto sol;
638         case BT_GT:
639           goto gt;
640         default:
641           *nextTokPtr = ptr;
642           return XML_TOK_INVALID;
643         }
644         /* ptr points to closing quote */
645         for (;;) {
646           ptr += MINBPC(enc);
647           REQUIRE_CHAR(enc, ptr, end);
648           switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the next attribute name */
649           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
650           case BT_S: case BT_CR: case BT_LF:
651             continue;
652           case BT_GT:
653           gt:
654             *nextTokPtr = ptr + MINBPC(enc);
655             return XML_TOK_START_TAG_WITH_ATTS;
656           case BT_SOL:
657           sol:
658             ptr += MINBPC(enc);
659             REQUIRE_CHAR(enc, ptr, end);
660             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
661               *nextTokPtr = ptr;
662               return XML_TOK_INVALID;
663             }
664             *nextTokPtr = ptr + MINBPC(enc);
665             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
666           default:
667             *nextTokPtr = ptr;
668             return XML_TOK_INVALID;
669           }
670           break;
671         }
672         break;
673       }
674     default:
675       *nextTokPtr = ptr;
676       return XML_TOK_INVALID;
677     }
678   }
679   return XML_TOK_PARTIAL;
680 }
681 
682 /* ptr points to character following "<" */
683 
/* Scans markup that begins with '<', starting at the character after it.
   Dispatches on that character:
     '!'  followed by '-' -> scanComment, followed by '[' -> scanCdataSection,
          anything else is XML_TOK_INVALID;
     '?'  -> scanPi;
     '/'  -> scanEndTag;
     a name-start character begins a start tag, which is scanned here.
   For a start tag the result is:
     XML_TOK_START_TAG_NO_ATTS      name (plus optional white space) then
                                    '>'; *nextTokPtr just past the '>'.
     XML_TOK_EMPTY_ELEMENT_NO_ATTS  name (plus optional white space) then
                                    "/>"; *nextTokPtr just past the '>'.
     the result of scanAtts         when white space after the name is
                                    followed by a name-start character.
     XML_TOK_INVALID                a syntax error; *nextTokPtr points at
                                    the offending character.
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR   input ended too early.
   When XML_NS is defined, at most one colon is accepted in the element
   name and it must be followed by a name-start character. */
684 static int PTRCALL
685 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686                const char **nextTokPtr)
687 {
688 #ifdef XML_NS
689   int hadColon;
690 #endif
691   REQUIRE_CHAR(enc, ptr, end);
692   switch (BYTE_TYPE(enc, ptr)) {
693   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
694   case BT_EXCL:
695     ptr += MINBPC(enc);
696     REQUIRE_CHAR(enc, ptr, end);
697     switch (BYTE_TYPE(enc, ptr)) {
698     case BT_MINUS:
699       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700     case BT_LSQB:
701       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
702                                       end, nextTokPtr);
703     }
704     *nextTokPtr = ptr;
705     return XML_TOK_INVALID;
706   case BT_QUEST:
707     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
708   case BT_SOL:
709     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710   default:
711     *nextTokPtr = ptr;
712     return XML_TOK_INVALID;
713   }
714 #ifdef XML_NS
715   hadColon = 0;
716 #endif
717   /* we have a start-tag */
718   while (HAS_CHAR(enc, ptr, end)) {
719     switch (BYTE_TYPE(enc, ptr)) {
720     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
721 #ifdef XML_NS
722     case BT_COLON:
723       if (hadColon) {
724         *nextTokPtr = ptr;
725         return XML_TOK_INVALID;
726       }
727       hadColon = 1;
728       ptr += MINBPC(enc);
729       REQUIRE_CHAR(enc, ptr, end);
730       switch (BYTE_TYPE(enc, ptr)) {
731       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
732       default:
733         *nextTokPtr = ptr;
734         return XML_TOK_INVALID;
735       }
736       break;
737 #endif
738     case BT_S: case BT_CR: case BT_LF:
739       {
740         ptr += MINBPC(enc);
741         while (HAS_CHAR(enc, ptr, end)) {
742           switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character here is the first character of an
             attribute name; control leaves the switch and reaches the
             scanAtts call below */
743           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
744           case BT_GT:
745             goto gt;
746           case BT_SOL:
747             goto sol;
748           case BT_S: case BT_CR: case BT_LF:
749             ptr += MINBPC(enc);
750             continue;
751           default:
752             *nextTokPtr = ptr;
753             return XML_TOK_INVALID;
754           }
755           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
756         }
757         return XML_TOK_PARTIAL;
758       }
759     case BT_GT:
760     gt:
761       *nextTokPtr = ptr + MINBPC(enc);
762       return XML_TOK_START_TAG_NO_ATTS;
763     case BT_SOL:
764     sol:
765       ptr += MINBPC(enc);
766       REQUIRE_CHAR(enc, ptr, end);
767       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
768         *nextTokPtr = ptr;
769         return XML_TOK_INVALID;
770       }
771       *nextTokPtr = ptr + MINBPC(enc);
772       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
773     default:
774       *nextTokPtr = ptr;
775       return XML_TOK_INVALID;
776     }
777   }
778   return XML_TOK_PARTIAL;
779 }
780 
/* Returns the next token in element content.
   Returns:
     XML_TOK_NONE            ptr >= end.
     XML_TOK_PARTIAL         less than one whole character is available when
                             MINBPC(enc) > 1.
     the result of scanLt    when the input starts with '<'.
     the result of scanRef   when the input starts with '&'.
     XML_TOK_TRAILING_CR     the input is a CR with nothing after it.
     XML_TOK_DATA_NEWLINE    the input starts with CR, CR LF or LF;
                             *nextTokPtr is set just past the line break.
     XML_TOK_TRAILING_RSQB   the input is "]" or "]]" with nothing after it.
     XML_TOK_INVALID         the input contains "]]>" (at the start,
                             *nextTokPtr points at the '>'; inside a data
                             run, at the second ']') or starts with an
                             invalid byte type (*nextTokPtr points at it).
     XML_TOK_DATA_CHARS      a run of ordinary characters; *nextTokPtr is
                             set to the character that ends the run or to
                             the end of the input. */
781 static int PTRCALL
782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783                    const char **nextTokPtr)
784 {
785   if (ptr >= end)
786     return XML_TOK_NONE;
  /* for multi-byte units, ignore a trailing fraction of a character */
787   if (MINBPC(enc) > 1) {
788     size_t n = end - ptr;
789     if (n & (MINBPC(enc) - 1)) {
790       n &= ~(MINBPC(enc) - 1);
791       if (n == 0)
792         return XML_TOK_PARTIAL;
793       end = ptr + n;
794     }
795   }
796   switch (BYTE_TYPE(enc, ptr)) {
797   case BT_LT:
798     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
799   case BT_AMP:
800     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
801   case BT_CR:
802     ptr += MINBPC(enc);
803     if (! HAS_CHAR(enc, ptr, end))
804       return XML_TOK_TRAILING_CR;
805     if (BYTE_TYPE(enc, ptr) == BT_LF)
806       ptr += MINBPC(enc);
807     *nextTokPtr = ptr;
808     return XML_TOK_DATA_NEWLINE;
809   case BT_LF:
810     *nextTokPtr = ptr + MINBPC(enc);
811     return XML_TOK_DATA_NEWLINE;
812   case BT_RSQB:
813     ptr += MINBPC(enc);
814     if (! HAS_CHAR(enc, ptr, end))
815       return XML_TOK_TRAILING_RSQB;
816     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
817       break;
818     ptr += MINBPC(enc);
819     if (! HAS_CHAR(enc, ptr, end))
820       return XML_TOK_TRAILING_RSQB;
821     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so the second ']' is scanned again */
822       ptr -= MINBPC(enc);
823       break;
824     }
825     *nextTokPtr = ptr;
826     return XML_TOK_INVALID;
827   INVALID_CASES(ptr, nextTokPtr)
828   default:
829     ptr += MINBPC(enc);
830     break;
831   }
  /* extend the run of data characters as far as possible */
832   while (HAS_CHAR(enc, ptr, end)) {
833     switch (BYTE_TYPE(enc, ptr)) {
834 #define LEAD_CASE(n) \
835     case BT_LEAD ## n: \
836       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
837         *nextTokPtr = ptr; \
838         return XML_TOK_DATA_CHARS; \
839       } \
840       ptr += n; \
841       break;
842     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
843 #undef LEAD_CASE
844     case BT_RSQB:
      /* a ']' stays in the run only when the available lookahead proves
         it does not start "]]>"; with too little lookahead the run ends
         before it */
845       if (HAS_CHARS(enc, ptr, end, 2)) {
846          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
847            ptr += MINBPC(enc);
848            break;
849          }
850          if (HAS_CHARS(enc, ptr, end, 3)) {
851            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
852              ptr += MINBPC(enc);
853              break;
854            }
855            *nextTokPtr = ptr + 2*MINBPC(enc);
856            return XML_TOK_INVALID;
857          }
858       }
859       /* fall through */
860     case BT_AMP:
861     case BT_LT:
862     case BT_NONXML:
863     case BT_MALFORM:
864     case BT_TRAIL:
865     case BT_CR:
866     case BT_LF:
867       *nextTokPtr = ptr;
868       return XML_TOK_DATA_CHARS;
869     default:
870       ptr += MINBPC(enc);
871       break;
872     }
873   }
874   *nextTokPtr = ptr;
875   return XML_TOK_DATA_CHARS;
876 }
877 
878 /* ptr points to character following "%" */
879 
/* Scans what follows a '%', starting at the character after it.
   Returns:
     XML_TOK_PERCENT           the '%' is followed by white space or another
                               '%'; *nextTokPtr points at that character.
     XML_TOK_PARAM_ENTITY_REF  a name terminated by ';' follows; *nextTokPtr
                               is set just past the ';'.
     XML_TOK_INVALID           an unexpected character was met; *nextTokPtr
                               points at it.
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR   input ended too early. */
880 static int PTRCALL
881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882                     const char **nextTokPtr)
883 {
884   REQUIRE_CHAR(enc, ptr, end);
885   switch (BYTE_TYPE(enc, ptr)) {
886   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888     *nextTokPtr = ptr;
889     return XML_TOK_PERCENT;
890   default:
891     *nextTokPtr = ptr;
892     return XML_TOK_INVALID;
893   }
894   while (HAS_CHAR(enc, ptr, end)) {
895     switch (BYTE_TYPE(enc, ptr)) {
896     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897     case BT_SEMI:
898       *nextTokPtr = ptr + MINBPC(enc);
899       return XML_TOK_PARAM_ENTITY_REF;
900     default:
901       *nextTokPtr = ptr;
902       return XML_TOK_INVALID;
903     }
904   }
905   return XML_TOK_PARTIAL;
906 }
907 
/* Scans a name that follows a '#' (the caller in this file passes the
   position just past a BT_NUM character).
   Returns:
     XML_TOK_POUND_NAME    the name is ended by white space, ')', '>', '%'
                           or '|'; *nextTokPtr points at that terminator.
     -XML_TOK_POUND_NAME   the input ended while still inside the name;
                           *nextTokPtr is not written on this path.
     XML_TOK_INVALID       an unexpected character was met; *nextTokPtr
                           points at it.
     XML_TOK_PARTIAL       no character is available after the '#'.
     XML_TOK_PARTIAL_CHAR  input ended inside a multi-byte sequence. */
908 static int PTRCALL
909 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910                       const char **nextTokPtr)
911 {
912   REQUIRE_CHAR(enc, ptr, end);
913   switch (BYTE_TYPE(enc, ptr)) {
914   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
915   default:
916     *nextTokPtr = ptr;
917     return XML_TOK_INVALID;
918   }
919   while (HAS_CHAR(enc, ptr, end)) {
920     switch (BYTE_TYPE(enc, ptr)) {
921     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
922     case BT_CR: case BT_LF: case BT_S:
923     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
924       *nextTokPtr = ptr;
925       return XML_TOK_POUND_NAME;
926     default:
927       *nextTokPtr = ptr;
928       return XML_TOK_INVALID;
929     }
930   }
931   return -XML_TOK_POUND_NAME;
932 }
933 
/* Scans a quoted literal, starting just after the opening quote.  `open`
   is the byte type (BT_QUOT or BT_APOS) of the quote that opened it; a
   quote of the other kind is treated as ordinary content.
   Returns:
     XML_TOK_LITERAL    the closing quote is followed by white space, '>',
                        '%' or '['; *nextTokPtr is set just past the quote.
     -XML_TOK_LITERAL   the closing quote is the last character of the
                        input; *nextTokPtr is not written on this path.
     XML_TOK_INVALID    the closing quote is followed by any other
                        character (*nextTokPtr just past the quote), or an
                        invalid byte type was met inside the literal
                        (*nextTokPtr at that position).
     XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR   input ended inside the
                        literal. */
934 static int PTRCALL
935 PREFIX(scanLit)(int open, const ENCODING *enc,
936                 const char *ptr, const char *end,
937                 const char **nextTokPtr)
938 {
939   while (HAS_CHAR(enc, ptr, end)) {
940     int t = BYTE_TYPE(enc, ptr);
941     switch (t) {
942     INVALID_CASES(ptr, nextTokPtr)
943     case BT_QUOT:
944     case BT_APOS:
945       ptr += MINBPC(enc);
      /* the other kind of quote is just part of the literal */
946       if (t != open)
947         break;
948       if (! HAS_CHAR(enc, ptr, end))
949         return -XML_TOK_LITERAL;
950       *nextTokPtr = ptr;
951       switch (BYTE_TYPE(enc, ptr)) {
952       case BT_S: case BT_CR: case BT_LF:
953       case BT_GT: case BT_PERCNT: case BT_LSQB:
954         return XML_TOK_LITERAL;
955       default:
956         return XML_TOK_INVALID;
957       }
958     default:
959       ptr += MINBPC(enc);
960       break;
961     }
962   }
963   return XML_TOK_PARTIAL;
964 }
965 
966 static int PTRCALL
967 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
968                   const char **nextTokPtr)
969 {
970   int tok;
971   if (ptr >= end)
972     return XML_TOK_NONE;
973   if (MINBPC(enc) > 1) {
974     size_t n = end - ptr;
975     if (n & (MINBPC(enc) - 1)) {
976       n &= ~(MINBPC(enc) - 1);
977       if (n == 0)
978         return XML_TOK_PARTIAL;
979       end = ptr + n;
980     }
981   }
982   switch (BYTE_TYPE(enc, ptr)) {
983   case BT_QUOT:
984     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
985   case BT_APOS:
986     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
987   case BT_LT:
988     {
989       ptr += MINBPC(enc);
990       REQUIRE_CHAR(enc, ptr, end);
991       switch (BYTE_TYPE(enc, ptr)) {
992       case BT_EXCL:
993         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
994       case BT_QUEST:
995         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996       case BT_NMSTRT:
997       case BT_HEX:
998       case BT_NONASCII:
999       case BT_LEAD2:
1000       case BT_LEAD3:
1001       case BT_LEAD4:
1002         *nextTokPtr = ptr - MINBPC(enc);
1003         return XML_TOK_INSTANCE_START;
1004       }
1005       *nextTokPtr = ptr;
1006       return XML_TOK_INVALID;
1007     }
1008   case BT_CR:
1009     if (ptr + MINBPC(enc) == end) {
1010       *nextTokPtr = end;
1011       /* indicate that this might be part of a CR/LF pair */
1012       return -XML_TOK_PROLOG_S;
1013     }
1014     /* fall through */
1015   case BT_S: case BT_LF:
1016     for (;;) {
1017       ptr += MINBPC(enc);
1018       if (! HAS_CHAR(enc, ptr, end))
1019         break;
1020       switch (BYTE_TYPE(enc, ptr)) {
1021       case BT_S: case BT_LF:
1022         break;
1023       case BT_CR:
1024         /* don't split CR/LF pair */
1025         if (ptr + MINBPC(enc) != end)
1026           break;
1027         /* fall through */
1028       default:
1029         *nextTokPtr = ptr;
1030         return XML_TOK_PROLOG_S;
1031       }
1032     }
1033     *nextTokPtr = ptr;
1034     return XML_TOK_PROLOG_S;
1035   case BT_PERCNT:
1036     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1037   case BT_COMMA:
1038     *nextTokPtr = ptr + MINBPC(enc);
1039     return XML_TOK_COMMA;
1040   case BT_LSQB:
1041     *nextTokPtr = ptr + MINBPC(enc);
1042     return XML_TOK_OPEN_BRACKET;
1043   case BT_RSQB:
1044     ptr += MINBPC(enc);
1045     if (! HAS_CHAR(enc, ptr, end))
1046       return -XML_TOK_CLOSE_BRACKET;
1047     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1048       REQUIRE_CHARS(enc, ptr, end, 2);
1049       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050         *nextTokPtr = ptr + 2*MINBPC(enc);
1051         return XML_TOK_COND_SECT_CLOSE;
1052       }
1053     }
1054     *nextTokPtr = ptr;
1055     return XML_TOK_CLOSE_BRACKET;
1056   case BT_LPAR:
1057     *nextTokPtr = ptr + MINBPC(enc);
1058     return XML_TOK_OPEN_PAREN;
1059   case BT_RPAR:
1060     ptr += MINBPC(enc);
1061     if (! HAS_CHAR(enc, ptr, end))
1062       return -XML_TOK_CLOSE_PAREN;
1063     switch (BYTE_TYPE(enc, ptr)) {
1064     case BT_AST:
1065       *nextTokPtr = ptr + MINBPC(enc);
1066       return XML_TOK_CLOSE_PAREN_ASTERISK;
1067     case BT_QUEST:
1068       *nextTokPtr = ptr + MINBPC(enc);
1069       return XML_TOK_CLOSE_PAREN_QUESTION;
1070     case BT_PLUS:
1071       *nextTokPtr = ptr + MINBPC(enc);
1072       return XML_TOK_CLOSE_PAREN_PLUS;
1073     case BT_CR: case BT_LF: case BT_S:
1074     case BT_GT: case BT_COMMA: case BT_VERBAR:
1075     case BT_RPAR:
1076       *nextTokPtr = ptr;
1077       return XML_TOK_CLOSE_PAREN;
1078     }
1079     *nextTokPtr = ptr;
1080     return XML_TOK_INVALID;
1081   case BT_VERBAR:
1082     *nextTokPtr = ptr + MINBPC(enc);
1083     return XML_TOK_OR;
1084   case BT_GT:
1085     *nextTokPtr = ptr + MINBPC(enc);
1086     return XML_TOK_DECL_CLOSE;
1087   case BT_NUM:
1088     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089 #define LEAD_CASE(n) \
1090   case BT_LEAD ## n: \
1091     if (end - ptr < n) \
1092       return XML_TOK_PARTIAL_CHAR; \
1093     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094       ptr += n; \
1095       tok = XML_TOK_NAME; \
1096       break; \
1097     } \
1098     if (IS_NAME_CHAR(enc, ptr, n)) { \
1099       ptr += n; \
1100       tok = XML_TOK_NMTOKEN; \
1101       break; \
1102     } \
1103     *nextTokPtr = ptr; \
1104     return XML_TOK_INVALID;
1105     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106 #undef LEAD_CASE
1107   case BT_NMSTRT:
1108   case BT_HEX:
1109     tok = XML_TOK_NAME;
1110     ptr += MINBPC(enc);
1111     break;
1112   case BT_DIGIT:
1113   case BT_NAME:
1114   case BT_MINUS:
1115 #ifdef XML_NS
1116   case BT_COLON:
1117 #endif
1118     tok = XML_TOK_NMTOKEN;
1119     ptr += MINBPC(enc);
1120     break;
1121   case BT_NONASCII:
1122     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123       ptr += MINBPC(enc);
1124       tok = XML_TOK_NAME;
1125       break;
1126     }
1127     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128       ptr += MINBPC(enc);
1129       tok = XML_TOK_NMTOKEN;
1130       break;
1131     }
1132     /* fall through */
1133   default:
1134     *nextTokPtr = ptr;
1135     return XML_TOK_INVALID;
1136   }
1137   while (HAS_CHAR(enc, ptr, end)) {
1138     switch (BYTE_TYPE(enc, ptr)) {
1139     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140     case BT_GT: case BT_RPAR: case BT_COMMA:
1141     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142     case BT_S: case BT_CR: case BT_LF:
1143       *nextTokPtr = ptr;
1144       return tok;
1145 #ifdef XML_NS
1146     case BT_COLON:
1147       ptr += MINBPC(enc);
1148       switch (tok) {
1149       case XML_TOK_NAME:
1150         REQUIRE_CHAR(enc, ptr, end);
1151         tok = XML_TOK_PREFIXED_NAME;
1152         switch (BYTE_TYPE(enc, ptr)) {
1153         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1154         default:
1155           tok = XML_TOK_NMTOKEN;
1156           break;
1157         }
1158         break;
1159       case XML_TOK_PREFIXED_NAME:
1160         tok = XML_TOK_NMTOKEN;
1161         break;
1162       }
1163       break;
1164 #endif
1165     case BT_PLUS:
1166       if (tok == XML_TOK_NMTOKEN)  {
1167         *nextTokPtr = ptr;
1168         return XML_TOK_INVALID;
1169       }
1170       *nextTokPtr = ptr + MINBPC(enc);
1171       return XML_TOK_NAME_PLUS;
1172     case BT_AST:
1173       if (tok == XML_TOK_NMTOKEN)  {
1174         *nextTokPtr = ptr;
1175         return XML_TOK_INVALID;
1176       }
1177       *nextTokPtr = ptr + MINBPC(enc);
1178       return XML_TOK_NAME_ASTERISK;
1179     case BT_QUEST:
1180       if (tok == XML_TOK_NMTOKEN)  {
1181         *nextTokPtr = ptr;
1182         return XML_TOK_INVALID;
1183       }
1184       *nextTokPtr = ptr + MINBPC(enc);
1185       return XML_TOK_NAME_QUESTION;
1186     default:
1187       *nextTokPtr = ptr;
1188       return XML_TOK_INVALID;
1189     }
1190   }
1191   return -tok;
1192 }
1193 
1194 static int PTRCALL
1195 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1196                           const char *end, const char **nextTokPtr)
1197 {
1198   const char *start;
1199   if (ptr >= end)
1200     return XML_TOK_NONE;
1201   else if (! HAS_CHAR(enc, ptr, end))
1202     return XML_TOK_PARTIAL;
1203   start = ptr;
1204   while (HAS_CHAR(enc, ptr, end)) {
1205     switch (BYTE_TYPE(enc, ptr)) {
1206 #define LEAD_CASE(n) \
1207     case BT_LEAD ## n: ptr += n; break;
1208     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1209 #undef LEAD_CASE
1210     case BT_AMP:
1211       if (ptr == start)
1212         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1213       *nextTokPtr = ptr;
1214       return XML_TOK_DATA_CHARS;
1215     case BT_LT:
1216       /* this is for inside entity references */
1217       *nextTokPtr = ptr;
1218       return XML_TOK_INVALID;
1219     case BT_LF:
1220       if (ptr == start) {
1221         *nextTokPtr = ptr + MINBPC(enc);
1222         return XML_TOK_DATA_NEWLINE;
1223       }
1224       *nextTokPtr = ptr;
1225       return XML_TOK_DATA_CHARS;
1226     case BT_CR:
1227       if (ptr == start) {
1228         ptr += MINBPC(enc);
1229         if (! HAS_CHAR(enc, ptr, end))
1230           return XML_TOK_TRAILING_CR;
1231         if (BYTE_TYPE(enc, ptr) == BT_LF)
1232           ptr += MINBPC(enc);
1233         *nextTokPtr = ptr;
1234         return XML_TOK_DATA_NEWLINE;
1235       }
1236       *nextTokPtr = ptr;
1237       return XML_TOK_DATA_CHARS;
1238     case BT_S:
1239       if (ptr == start) {
1240         *nextTokPtr = ptr + MINBPC(enc);
1241         return XML_TOK_ATTRIBUTE_VALUE_S;
1242       }
1243       *nextTokPtr = ptr;
1244       return XML_TOK_DATA_CHARS;
1245     default:
1246       ptr += MINBPC(enc);
1247       break;
1248     }
1249   }
1250   *nextTokPtr = ptr;
1251   return XML_TOK_DATA_CHARS;
1252 }
1253 
1254 static int PTRCALL
1255 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1256                        const char *end, const char **nextTokPtr)
1257 {
1258   const char *start;
1259   if (ptr >= end)
1260     return XML_TOK_NONE;
1261   else if (! HAS_CHAR(enc, ptr, end))
1262     return XML_TOK_PARTIAL;
1263   start = ptr;
1264   while (HAS_CHAR(enc, ptr, end)) {
1265     switch (BYTE_TYPE(enc, ptr)) {
1266 #define LEAD_CASE(n) \
1267     case BT_LEAD ## n: ptr += n; break;
1268     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1269 #undef LEAD_CASE
1270     case BT_AMP:
1271       if (ptr == start)
1272         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1273       *nextTokPtr = ptr;
1274       return XML_TOK_DATA_CHARS;
1275     case BT_PERCNT:
1276       if (ptr == start) {
1277         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1278                                        end, nextTokPtr);
1279         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1280       }
1281       *nextTokPtr = ptr;
1282       return XML_TOK_DATA_CHARS;
1283     case BT_LF:
1284       if (ptr == start) {
1285         *nextTokPtr = ptr + MINBPC(enc);
1286         return XML_TOK_DATA_NEWLINE;
1287       }
1288       *nextTokPtr = ptr;
1289       return XML_TOK_DATA_CHARS;
1290     case BT_CR:
1291       if (ptr == start) {
1292         ptr += MINBPC(enc);
1293         if (! HAS_CHAR(enc, ptr, end))
1294           return XML_TOK_TRAILING_CR;
1295         if (BYTE_TYPE(enc, ptr) == BT_LF)
1296           ptr += MINBPC(enc);
1297         *nextTokPtr = ptr;
1298         return XML_TOK_DATA_NEWLINE;
1299       }
1300       *nextTokPtr = ptr;
1301       return XML_TOK_DATA_CHARS;
1302     default:
1303       ptr += MINBPC(enc);
1304       break;
1305     }
1306   }
1307   *nextTokPtr = ptr;
1308   return XML_TOK_DATA_CHARS;
1309 }
1310 
1311 #ifdef XML_DTD
1312 
1313 static int PTRCALL
1314 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1315                          const char *end, const char **nextTokPtr)
1316 {
1317   int level = 0;
1318   if (MINBPC(enc) > 1) {
1319     size_t n = end - ptr;
1320     if (n & (MINBPC(enc) - 1)) {
1321       n &= ~(MINBPC(enc) - 1);
1322       end = ptr + n;
1323     }
1324   }
1325   while (HAS_CHAR(enc, ptr, end)) {
1326     switch (BYTE_TYPE(enc, ptr)) {
1327     INVALID_CASES(ptr, nextTokPtr)
1328     case BT_LT:
1329       ptr += MINBPC(enc);
1330       REQUIRE_CHAR(enc, ptr, end);
1331       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1332         ptr += MINBPC(enc);
1333         REQUIRE_CHAR(enc, ptr, end);
1334         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1335           ++level;
1336           ptr += MINBPC(enc);
1337         }
1338       }
1339       break;
1340     case BT_RSQB:
1341       ptr += MINBPC(enc);
1342       REQUIRE_CHAR(enc, ptr, end);
1343       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1344         ptr += MINBPC(enc);
1345         REQUIRE_CHAR(enc, ptr, end);
1346         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1347           ptr += MINBPC(enc);
1348           if (level == 0) {
1349             *nextTokPtr = ptr;
1350             return XML_TOK_IGNORE_SECT;
1351           }
1352           --level;
1353         }
1354       }
1355       break;
1356     default:
1357       ptr += MINBPC(enc);
1358       break;
1359     }
1360   }
1361   return XML_TOK_PARTIAL;
1362 }
1363 
1364 #endif /* XML_DTD */
1365 
1366 static int PTRCALL
1367 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1368                    const char **badPtr)
1369 {
1370   ptr += MINBPC(enc);
1371   end -= MINBPC(enc);
1372   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1373     switch (BYTE_TYPE(enc, ptr)) {
1374     case BT_DIGIT:
1375     case BT_HEX:
1376     case BT_MINUS:
1377     case BT_APOS:
1378     case BT_LPAR:
1379     case BT_RPAR:
1380     case BT_PLUS:
1381     case BT_COMMA:
1382     case BT_SOL:
1383     case BT_EQUALS:
1384     case BT_QUEST:
1385     case BT_CR:
1386     case BT_LF:
1387     case BT_SEMI:
1388     case BT_EXCL:
1389     case BT_AST:
1390     case BT_PERCNT:
1391     case BT_NUM:
1392 #ifdef XML_NS
1393     case BT_COLON:
1394 #endif
1395       break;
1396     case BT_S:
1397       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1398         *badPtr = ptr;
1399         return 0;
1400       }
1401       break;
1402     case BT_NAME:
1403     case BT_NMSTRT:
1404       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1405         break;
1406     default:
1407       switch (BYTE_TO_ASCII(enc, ptr)) {
1408       case 0x24: /* $ */
1409       case 0x40: /* @ */
1410         break;
1411       default:
1412         *badPtr = ptr;
1413         return 0;
1414       }
1415       break;
1416     }
1417   }
1418   return 1;
1419 }
1420 
1421 /* This must only be called for a well-formed start-tag or empty
1422    element tag.  Returns the number of attributes.  Pointers to the
1423    first attsMax attributes are stored in atts.
1424 */
1425 
1426 static int PTRCALL
1427 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1428                 int attsMax, ATTRIBUTE *atts)
1429 {
1430   enum { other, inName, inValue } state = inName;
1431   int nAtts = 0;
1432   int open = 0; /* defined when state == inValue;
1433                    initialization just to shut up compilers */
1434 
1435   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1436     switch (BYTE_TYPE(enc, ptr)) {
1437 #define START_NAME \
1438       if (state == other) { \
1439         if (nAtts < attsMax) { \
1440           atts[nAtts].name = ptr; \
1441           atts[nAtts].normalized = 1; \
1442         } \
1443         state = inName; \
1444       }
1445 #define LEAD_CASE(n) \
1446     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1448 #undef LEAD_CASE
1449     case BT_NONASCII:
1450     case BT_NMSTRT:
1451     case BT_HEX:
1452       START_NAME
1453       break;
1454 #undef START_NAME
1455     case BT_QUOT:
1456       if (state != inValue) {
1457         if (nAtts < attsMax)
1458           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1459         state = inValue;
1460         open = BT_QUOT;
1461       }
1462       else if (open == BT_QUOT) {
1463         state = other;
1464         if (nAtts < attsMax)
1465           atts[nAtts].valueEnd = ptr;
1466         nAtts++;
1467       }
1468       break;
1469     case BT_APOS:
1470       if (state != inValue) {
1471         if (nAtts < attsMax)
1472           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1473         state = inValue;
1474         open = BT_APOS;
1475       }
1476       else if (open == BT_APOS) {
1477         state = other;
1478         if (nAtts < attsMax)
1479           atts[nAtts].valueEnd = ptr;
1480         nAtts++;
1481       }
1482       break;
1483     case BT_AMP:
1484       if (nAtts < attsMax)
1485         atts[nAtts].normalized = 0;
1486       break;
1487     case BT_S:
1488       if (state == inName)
1489         state = other;
1490       else if (state == inValue
1491                && nAtts < attsMax
1492                && atts[nAtts].normalized
1493                && (ptr == atts[nAtts].valuePtr
1494                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1495                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1496                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1497         atts[nAtts].normalized = 0;
1498       break;
1499     case BT_CR: case BT_LF:
1500       /* This case ensures that the first attribute name is counted
1501          Apart from that we could just change state on the quote. */
1502       if (state == inName)
1503         state = other;
1504       else if (state == inValue && nAtts < attsMax)
1505         atts[nAtts].normalized = 0;
1506       break;
1507     case BT_GT:
1508     case BT_SOL:
1509       if (state != inValue)
1510         return nAtts;
1511       break;
1512     default:
1513       break;
1514     }
1515   }
1516   /* not reached */
1517 }
1518 
1519 static int PTRFASTCALL
1520 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1521 {
1522   int result = 0;
1523   /* skip &# */
1524   ptr += 2*MINBPC(enc);
1525   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1526     for (ptr += MINBPC(enc);
1527          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1528          ptr += MINBPC(enc)) {
1529       int c = BYTE_TO_ASCII(enc, ptr);
1530       switch (c) {
1531       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1532       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1533         result <<= 4;
1534         result |= (c - ASCII_0);
1535         break;
1536       case ASCII_A: case ASCII_B: case ASCII_C:
1537       case ASCII_D: case ASCII_E: case ASCII_F:
1538         result <<= 4;
1539         result += 10 + (c - ASCII_A);
1540         break;
1541       case ASCII_a: case ASCII_b: case ASCII_c:
1542       case ASCII_d: case ASCII_e: case ASCII_f:
1543         result <<= 4;
1544         result += 10 + (c - ASCII_a);
1545         break;
1546       }
1547       if (result >= 0x110000)
1548         return -1;
1549     }
1550   }
1551   else {
1552     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1553       int c = BYTE_TO_ASCII(enc, ptr);
1554       result *= 10;
1555       result += (c - ASCII_0);
1556       if (result >= 0x110000)
1557         return -1;
1558     }
1559   }
1560   return checkCharRefNumber(result);
1561 }
1562 
1563 static int PTRCALL
1564 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1565                              const char *end)
1566 {
1567   switch ((end - ptr)/MINBPC(enc)) {
1568   case 2:
1569     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1570       switch (BYTE_TO_ASCII(enc, ptr)) {
1571       case ASCII_l:
1572         return ASCII_LT;
1573       case ASCII_g:
1574         return ASCII_GT;
1575       }
1576     }
1577     break;
1578   case 3:
1579     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1580       ptr += MINBPC(enc);
1581       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1582         ptr += MINBPC(enc);
1583         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1584           return ASCII_AMP;
1585       }
1586     }
1587     break;
1588   case 4:
1589     switch (BYTE_TO_ASCII(enc, ptr)) {
1590     case ASCII_q:
1591       ptr += MINBPC(enc);
1592       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1593         ptr += MINBPC(enc);
1594         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1595           ptr += MINBPC(enc);
1596           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1597             return ASCII_QUOT;
1598         }
1599       }
1600       break;
1601     case ASCII_a:
1602       ptr += MINBPC(enc);
1603       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1604         ptr += MINBPC(enc);
1605         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1606           ptr += MINBPC(enc);
1607           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1608             return ASCII_APOS;
1609         }
1610       }
1611       break;
1612     }
1613   }
1614   return 0;
1615 }
1616 
1617 static int PTRCALL
1618 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1619 {
1620   for (;;) {
1621     switch (BYTE_TYPE(enc, ptr1)) {
1622 #define LEAD_CASE(n) \
1623     case BT_LEAD ## n: \
1624       if (*ptr1++ != *ptr2++) \
1625         return 0;
1626     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1627 #undef LEAD_CASE
1628       /* fall through */
1629       if (*ptr1++ != *ptr2++)
1630         return 0;
1631       break;
1632     case BT_NONASCII:
1633     case BT_NMSTRT:
1634 #ifdef XML_NS
1635     case BT_COLON:
1636 #endif
1637     case BT_HEX:
1638     case BT_DIGIT:
1639     case BT_NAME:
1640     case BT_MINUS:
1641       if (*ptr2++ != *ptr1++)
1642         return 0;
1643       if (MINBPC(enc) > 1) {
1644         if (*ptr2++ != *ptr1++)
1645           return 0;
1646         if (MINBPC(enc) > 2) {
1647           if (*ptr2++ != *ptr1++)
1648             return 0;
1649           if (MINBPC(enc) > 3) {
1650             if (*ptr2++ != *ptr1++)
1651               return 0;
1652           }
1653         }
1654       }
1655       break;
1656     default:
1657       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1658         return 1;
1659       switch (BYTE_TYPE(enc, ptr2)) {
1660       case BT_LEAD2:
1661       case BT_LEAD3:
1662       case BT_LEAD4:
1663       case BT_NONASCII:
1664       case BT_NMSTRT:
1665 #ifdef XML_NS
1666       case BT_COLON:
1667 #endif
1668       case BT_HEX:
1669       case BT_DIGIT:
1670       case BT_NAME:
1671       case BT_MINUS:
1672         return 0;
1673       default:
1674         return 1;
1675       }
1676     }
1677   }
1678   /* not reached */
1679 }
1680 
1681 static int PTRCALL
1682 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1683                          const char *end1, const char *ptr2)
1684 {
1685   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1686     if (end1 - ptr1 < MINBPC(enc))
1687       return 0;
1688     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1689       return 0;
1690   }
1691   return ptr1 == end1;
1692 }
1693 
1694 static int PTRFASTCALL
1695 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1696 {
1697   const char *start = ptr;
1698   for (;;) {
1699     switch (BYTE_TYPE(enc, ptr)) {
1700 #define LEAD_CASE(n) \
1701     case BT_LEAD ## n: ptr += n; break;
1702     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1703 #undef LEAD_CASE
1704     case BT_NONASCII:
1705     case BT_NMSTRT:
1706 #ifdef XML_NS
1707     case BT_COLON:
1708 #endif
1709     case BT_HEX:
1710     case BT_DIGIT:
1711     case BT_NAME:
1712     case BT_MINUS:
1713       ptr += MINBPC(enc);
1714       break;
1715     default:
1716       return (int)(ptr - start);
1717     }
1718   }
1719 }
1720 
1721 static const char * PTRFASTCALL
1722 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1723 {
1724   for (;;) {
1725     switch (BYTE_TYPE(enc, ptr)) {
1726     case BT_LF:
1727     case BT_CR:
1728     case BT_S:
1729       ptr += MINBPC(enc);
1730       break;
1731     default:
1732       return ptr;
1733     }
1734   }
1735 }
1736 
1737 static void PTRCALL
1738 PREFIX(updatePosition)(const ENCODING *enc,
1739                        const char *ptr,
1740                        const char *end,
1741                        POSITION *pos)
1742 {
1743   while (HAS_CHAR(enc, ptr, end)) {
1744     switch (BYTE_TYPE(enc, ptr)) {
1745 #define LEAD_CASE(n) \
1746     case BT_LEAD ## n: \
1747       ptr += n; \
1748       break;
1749     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1750 #undef LEAD_CASE
1751     case BT_LF:
1752       pos->columnNumber = (XML_Size)-1;
1753       pos->lineNumber++;
1754       ptr += MINBPC(enc);
1755       break;
1756     case BT_CR:
1757       pos->lineNumber++;
1758       ptr += MINBPC(enc);
1759       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1760         ptr += MINBPC(enc);
1761       pos->columnNumber = (XML_Size)-1;
1762       break;
1763     default:
1764       ptr += MINBPC(enc);
1765       break;
1766     }
1767     pos->columnNumber++;
1768   }
1769 }
1770 
/* Remove the helper macros of this file again (presumably so that it
   can be included more than once -- see the note at the top of the
   file).  INVALID_LEAD_CASE is defined alongside INVALID_CASES at the
   top of the file and is removed together with it. */
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_LEAD_CASE
#undef INVALID_CASES
#undef CHECK_NAME_CASE
#undef CHECK_NAME_CASES
#undef CHECK_NMSTRT_CASE
#undef CHECK_NMSTRT_CASES
1778 
1779 #endif /* XML_TOK_IMPL_C */
1780