1 /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2002 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
13 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
14 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
15 Copyright (c) 2018 Benjamin Peterson <benjamin@python.org>
16 Copyright (c) 2018 Anton Maklakov <antmak.pub@gmail.com>
17 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
18 Copyright (c) 2020 Boris Kolpackov <boris@codesynthesis.com>
19 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
20 Licensed under the MIT license:
21
22 Permission is hereby granted, free of charge, to any person obtaining
23 a copy of this software and associated documentation files (the
24 "Software"), to deal in the Software without restriction, including
25 without limitation the rights to use, copy, modify, merge, publish,
26 distribute, sublicense, and/or sell copies of the Software, and to permit
27 persons to whom the Software is furnished to do so, subject to the
28 following conditions:
29
30 The above copyright notice and this permission notice shall be included
31 in all copies or substantial portions of the Software.
32
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
34 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
35 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
36 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
37 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
38 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
39 USE OR OTHER DEALINGS IN THE SOFTWARE.
40 */
41
42 #ifdef XML_TOK_IMPL_C
43
/* Fallback used when the including file has not supplied its own
   IS_INVALID_CHAR: every n-byte sequence is then treated as valid. */
#  ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif
47
/* Switch-case fragment for byte type BT_LEAD<n> (first byte of an n-byte
   character).  It
     - returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end,
     - stores ptr in *nextTokPtr and returns XML_TOK_INVALID when
       IS_INVALID_CHAR rejects the sequence,
     - otherwise advances ptr by n and breaks out of the switch.
   NOTE: the expansion uses `enc` and `end` from the enclosing scope; they are
   not macro parameters. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* All switch cases that reject malformed input: the three multi-byte lead
   cases above plus BT_NONXML, BT_MALFORM and BT_TRAIL, which always store ptr
   in *nextTokPtr and return XML_TOK_INVALID. */
#  define INVALID_CASES(ptr, nextTokPtr) \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
68
/* Switch-case fragment for byte type BT_LEAD<n> while scanning a name.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain; stores ptr in
   *nextTokPtr and returns XML_TOK_INVALID when the sequence is invalid or is
   not a name character; otherwise advances ptr by n and breaks. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* All switch cases that consume one name character.  BT_NONASCII is first
   checked with IS_NAME_CHAR_MINBPC and then shares the single-unit advance
   with the ASCII name byte types; multi-byte lead bytes are handled by
   CHECK_NAME_CASE. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
97
/* Switch-case fragment for byte type BT_LEAD<n> where a name-start character
   is required.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain;
   stores ptr in *nextTokPtr and returns XML_TOK_INVALID when the sequence is
   invalid or is not a name-start character; otherwise advances ptr by n and
   breaks. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* All switch cases that consume one name-start character.  BT_NONASCII is
   first checked with IS_NMSTRT_CHAR_MINBPC and then shares the single-unit
   advance with BT_NMSTRT and BT_HEX; multi-byte lead bytes are handled by
   CHECK_NMSTRT_CASE. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
123
/* Default name decoration for the functions in this file: when the including
   file has not defined PREFIX, identifiers are used unchanged. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif
127
/* True when at least `count` units of MINBPC(enc) bytes lie in [ptr, end). */
#  define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one unit of MINBPC(enc) bytes lies in [ptr, end). */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Statement macro: makes the enclosing function return XML_TOK_PARTIAL when
   fewer than `count` units remain in [ptr, end). */
#  define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Single-unit form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
141
142 /* ptr points to character following "<!-" */
143
144 static int PTRCALL
PREFIX(scanComment)145 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
146 const char **nextTokPtr) {
147 if (HAS_CHAR(enc, ptr, end)) {
148 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
149 *nextTokPtr = ptr;
150 return XML_TOK_INVALID;
151 }
152 ptr += MINBPC(enc);
153 while (HAS_CHAR(enc, ptr, end)) {
154 switch (BYTE_TYPE(enc, ptr)) {
155 INVALID_CASES(ptr, nextTokPtr)
156 case BT_MINUS:
157 ptr += MINBPC(enc);
158 REQUIRE_CHAR(enc, ptr, end);
159 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
160 ptr += MINBPC(enc);
161 REQUIRE_CHAR(enc, ptr, end);
162 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
163 *nextTokPtr = ptr;
164 return XML_TOK_INVALID;
165 }
166 *nextTokPtr = ptr + MINBPC(enc);
167 return XML_TOK_COMMENT;
168 }
169 break;
170 default:
171 ptr += MINBPC(enc);
172 break;
173 }
174 }
175 }
176 return XML_TOK_PARTIAL;
177 }
178
179 /* ptr points to character following "<!" */
180
181 static int PTRCALL
PREFIX(scanDecl)182 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
183 const char **nextTokPtr) {
184 REQUIRE_CHAR(enc, ptr, end);
185 switch (BYTE_TYPE(enc, ptr)) {
186 case BT_MINUS:
187 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
188 case BT_LSQB:
189 *nextTokPtr = ptr + MINBPC(enc);
190 return XML_TOK_COND_SECT_OPEN;
191 case BT_NMSTRT:
192 case BT_HEX:
193 ptr += MINBPC(enc);
194 break;
195 default:
196 *nextTokPtr = ptr;
197 return XML_TOK_INVALID;
198 }
199 while (HAS_CHAR(enc, ptr, end)) {
200 switch (BYTE_TYPE(enc, ptr)) {
201 case BT_PERCNT:
202 REQUIRE_CHARS(enc, ptr, end, 2);
203 /* don't allow <!ENTITY% foo "whatever"> */
204 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
205 case BT_S:
206 case BT_CR:
207 case BT_LF:
208 case BT_PERCNT:
209 *nextTokPtr = ptr;
210 return XML_TOK_INVALID;
211 }
212 /* fall through */
213 case BT_S:
214 case BT_CR:
215 case BT_LF:
216 *nextTokPtr = ptr;
217 return XML_TOK_DECL_OPEN;
218 case BT_NMSTRT:
219 case BT_HEX:
220 ptr += MINBPC(enc);
221 break;
222 default:
223 *nextTokPtr = ptr;
224 return XML_TOK_INVALID;
225 }
226 }
227 return XML_TOK_PARTIAL;
228 }
229
230 static int PTRCALL
PREFIX(checkPiTarget)231 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
232 int *tokPtr) {
233 int upper = 0;
234 UNUSED_P(enc);
235 *tokPtr = XML_TOK_PI;
236 if (end - ptr != MINBPC(enc) * 3)
237 return 1;
238 switch (BYTE_TO_ASCII(enc, ptr)) {
239 case ASCII_x:
240 break;
241 case ASCII_X:
242 upper = 1;
243 break;
244 default:
245 return 1;
246 }
247 ptr += MINBPC(enc);
248 switch (BYTE_TO_ASCII(enc, ptr)) {
249 case ASCII_m:
250 break;
251 case ASCII_M:
252 upper = 1;
253 break;
254 default:
255 return 1;
256 }
257 ptr += MINBPC(enc);
258 switch (BYTE_TO_ASCII(enc, ptr)) {
259 case ASCII_l:
260 break;
261 case ASCII_L:
262 upper = 1;
263 break;
264 default:
265 return 1;
266 }
267 if (upper)
268 return 0;
269 *tokPtr = XML_TOK_XML_DECL;
270 return 1;
271 }
272
273 /* ptr points to character following "<?" */
274
275 static int PTRCALL
PREFIX(scanPi)276 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
277 const char **nextTokPtr) {
278 int tok;
279 const char *target = ptr;
280 REQUIRE_CHAR(enc, ptr, end);
281 switch (BYTE_TYPE(enc, ptr)) {
282 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
283 default:
284 *nextTokPtr = ptr;
285 return XML_TOK_INVALID;
286 }
287 while (HAS_CHAR(enc, ptr, end)) {
288 switch (BYTE_TYPE(enc, ptr)) {
289 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
290 case BT_S:
291 case BT_CR:
292 case BT_LF:
293 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
294 *nextTokPtr = ptr;
295 return XML_TOK_INVALID;
296 }
297 ptr += MINBPC(enc);
298 while (HAS_CHAR(enc, ptr, end)) {
299 switch (BYTE_TYPE(enc, ptr)) {
300 INVALID_CASES(ptr, nextTokPtr)
301 case BT_QUEST:
302 ptr += MINBPC(enc);
303 REQUIRE_CHAR(enc, ptr, end);
304 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
305 *nextTokPtr = ptr + MINBPC(enc);
306 return tok;
307 }
308 break;
309 default:
310 ptr += MINBPC(enc);
311 break;
312 }
313 }
314 return XML_TOK_PARTIAL;
315 case BT_QUEST:
316 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
317 *nextTokPtr = ptr;
318 return XML_TOK_INVALID;
319 }
320 ptr += MINBPC(enc);
321 REQUIRE_CHAR(enc, ptr, end);
322 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
323 *nextTokPtr = ptr + MINBPC(enc);
324 return tok;
325 }
326 /* fall through */
327 default:
328 *nextTokPtr = ptr;
329 return XML_TOK_INVALID;
330 }
331 }
332 return XML_TOK_PARTIAL;
333 }
334
335 static int PTRCALL
PREFIX(scanCdataSection)336 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
337 const char **nextTokPtr) {
338 static const char CDATA_LSQB[]
339 = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
340 int i;
341 UNUSED_P(enc);
342 /* CDATA[ */
343 REQUIRE_CHARS(enc, ptr, end, 6);
344 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
345 if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
346 *nextTokPtr = ptr;
347 return XML_TOK_INVALID;
348 }
349 }
350 *nextTokPtr = ptr;
351 return XML_TOK_CDATA_SECT_OPEN;
352 }
353
354 static int PTRCALL
PREFIX(cdataSectionTok)355 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
356 const char **nextTokPtr) {
357 if (ptr >= end)
358 return XML_TOK_NONE;
359 if (MINBPC(enc) > 1) {
360 size_t n = end - ptr;
361 if (n & (MINBPC(enc) - 1)) {
362 n &= ~(MINBPC(enc) - 1);
363 if (n == 0)
364 return XML_TOK_PARTIAL;
365 end = ptr + n;
366 }
367 }
368 switch (BYTE_TYPE(enc, ptr)) {
369 case BT_RSQB:
370 ptr += MINBPC(enc);
371 REQUIRE_CHAR(enc, ptr, end);
372 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
373 break;
374 ptr += MINBPC(enc);
375 REQUIRE_CHAR(enc, ptr, end);
376 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
377 ptr -= MINBPC(enc);
378 break;
379 }
380 *nextTokPtr = ptr + MINBPC(enc);
381 return XML_TOK_CDATA_SECT_CLOSE;
382 case BT_CR:
383 ptr += MINBPC(enc);
384 REQUIRE_CHAR(enc, ptr, end);
385 if (BYTE_TYPE(enc, ptr) == BT_LF)
386 ptr += MINBPC(enc);
387 *nextTokPtr = ptr;
388 return XML_TOK_DATA_NEWLINE;
389 case BT_LF:
390 *nextTokPtr = ptr + MINBPC(enc);
391 return XML_TOK_DATA_NEWLINE;
392 INVALID_CASES(ptr, nextTokPtr)
393 default:
394 ptr += MINBPC(enc);
395 break;
396 }
397 while (HAS_CHAR(enc, ptr, end)) {
398 switch (BYTE_TYPE(enc, ptr)) {
399 # define LEAD_CASE(n) \
400 case BT_LEAD##n: \
401 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
402 *nextTokPtr = ptr; \
403 return XML_TOK_DATA_CHARS; \
404 } \
405 ptr += n; \
406 break;
407 LEAD_CASE(2)
408 LEAD_CASE(3)
409 LEAD_CASE(4)
410 # undef LEAD_CASE
411 case BT_NONXML:
412 case BT_MALFORM:
413 case BT_TRAIL:
414 case BT_CR:
415 case BT_LF:
416 case BT_RSQB:
417 *nextTokPtr = ptr;
418 return XML_TOK_DATA_CHARS;
419 default:
420 ptr += MINBPC(enc);
421 break;
422 }
423 }
424 *nextTokPtr = ptr;
425 return XML_TOK_DATA_CHARS;
426 }
427
428 /* ptr points to character following "</" */
429
430 static int PTRCALL
PREFIX(scanEndTag)431 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
432 const char **nextTokPtr) {
433 REQUIRE_CHAR(enc, ptr, end);
434 switch (BYTE_TYPE(enc, ptr)) {
435 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
436 default:
437 *nextTokPtr = ptr;
438 return XML_TOK_INVALID;
439 }
440 while (HAS_CHAR(enc, ptr, end)) {
441 switch (BYTE_TYPE(enc, ptr)) {
442 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
443 case BT_S:
444 case BT_CR:
445 case BT_LF:
446 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
447 switch (BYTE_TYPE(enc, ptr)) {
448 case BT_S:
449 case BT_CR:
450 case BT_LF:
451 break;
452 case BT_GT:
453 *nextTokPtr = ptr + MINBPC(enc);
454 return XML_TOK_END_TAG;
455 default:
456 *nextTokPtr = ptr;
457 return XML_TOK_INVALID;
458 }
459 }
460 return XML_TOK_PARTIAL;
461 # ifdef XML_NS
462 case BT_COLON:
463 /* no need to check qname syntax here,
464 since end-tag must match exactly */
465 ptr += MINBPC(enc);
466 break;
467 # endif
468 case BT_GT:
469 *nextTokPtr = ptr + MINBPC(enc);
470 return XML_TOK_END_TAG;
471 default:
472 *nextTokPtr = ptr;
473 return XML_TOK_INVALID;
474 }
475 }
476 return XML_TOK_PARTIAL;
477 }
478
479 /* ptr points to character following "&#X" */
480
481 static int PTRCALL
PREFIX(scanHexCharRef)482 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
483 const char **nextTokPtr) {
484 if (HAS_CHAR(enc, ptr, end)) {
485 switch (BYTE_TYPE(enc, ptr)) {
486 case BT_DIGIT:
487 case BT_HEX:
488 break;
489 default:
490 *nextTokPtr = ptr;
491 return XML_TOK_INVALID;
492 }
493 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
494 switch (BYTE_TYPE(enc, ptr)) {
495 case BT_DIGIT:
496 case BT_HEX:
497 break;
498 case BT_SEMI:
499 *nextTokPtr = ptr + MINBPC(enc);
500 return XML_TOK_CHAR_REF;
501 default:
502 *nextTokPtr = ptr;
503 return XML_TOK_INVALID;
504 }
505 }
506 }
507 return XML_TOK_PARTIAL;
508 }
509
510 /* ptr points to character following "&#" */
511
512 static int PTRCALL
PREFIX(scanCharRef)513 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
514 const char **nextTokPtr) {
515 if (HAS_CHAR(enc, ptr, end)) {
516 if (CHAR_MATCHES(enc, ptr, ASCII_x))
517 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
518 switch (BYTE_TYPE(enc, ptr)) {
519 case BT_DIGIT:
520 break;
521 default:
522 *nextTokPtr = ptr;
523 return XML_TOK_INVALID;
524 }
525 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
526 switch (BYTE_TYPE(enc, ptr)) {
527 case BT_DIGIT:
528 break;
529 case BT_SEMI:
530 *nextTokPtr = ptr + MINBPC(enc);
531 return XML_TOK_CHAR_REF;
532 default:
533 *nextTokPtr = ptr;
534 return XML_TOK_INVALID;
535 }
536 }
537 }
538 return XML_TOK_PARTIAL;
539 }
540
541 /* ptr points to character following "&" */
542
543 static int PTRCALL
PREFIX(scanRef)544 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
545 const char **nextTokPtr) {
546 REQUIRE_CHAR(enc, ptr, end);
547 switch (BYTE_TYPE(enc, ptr)) {
548 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
549 case BT_NUM:
550 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
551 default:
552 *nextTokPtr = ptr;
553 return XML_TOK_INVALID;
554 }
555 while (HAS_CHAR(enc, ptr, end)) {
556 switch (BYTE_TYPE(enc, ptr)) {
557 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
558 case BT_SEMI:
559 *nextTokPtr = ptr + MINBPC(enc);
560 return XML_TOK_ENTITY_REF;
561 default:
562 *nextTokPtr = ptr;
563 return XML_TOK_INVALID;
564 }
565 }
566 return XML_TOK_PARTIAL;
567 }
568
569 /* ptr points to character following first character of attribute name */
570
571 static int PTRCALL
PREFIX(scanAtts)572 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
573 const char **nextTokPtr) {
574 # ifdef XML_NS
575 int hadColon = 0;
576 # endif
577 while (HAS_CHAR(enc, ptr, end)) {
578 switch (BYTE_TYPE(enc, ptr)) {
579 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
580 # ifdef XML_NS
581 case BT_COLON:
582 if (hadColon) {
583 *nextTokPtr = ptr;
584 return XML_TOK_INVALID;
585 }
586 hadColon = 1;
587 ptr += MINBPC(enc);
588 REQUIRE_CHAR(enc, ptr, end);
589 switch (BYTE_TYPE(enc, ptr)) {
590 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
591 default:
592 *nextTokPtr = ptr;
593 return XML_TOK_INVALID;
594 }
595 break;
596 # endif
597 case BT_S:
598 case BT_CR:
599 case BT_LF:
600 for (;;) {
601 int t;
602
603 ptr += MINBPC(enc);
604 REQUIRE_CHAR(enc, ptr, end);
605 t = BYTE_TYPE(enc, ptr);
606 if (t == BT_EQUALS)
607 break;
608 switch (t) {
609 case BT_S:
610 case BT_LF:
611 case BT_CR:
612 break;
613 default:
614 *nextTokPtr = ptr;
615 return XML_TOK_INVALID;
616 }
617 }
618 /* fall through */
619 case BT_EQUALS: {
620 int open;
621 # ifdef XML_NS
622 hadColon = 0;
623 # endif
624 for (;;) {
625 ptr += MINBPC(enc);
626 REQUIRE_CHAR(enc, ptr, end);
627 open = BYTE_TYPE(enc, ptr);
628 if (open == BT_QUOT || open == BT_APOS)
629 break;
630 switch (open) {
631 case BT_S:
632 case BT_LF:
633 case BT_CR:
634 break;
635 default:
636 *nextTokPtr = ptr;
637 return XML_TOK_INVALID;
638 }
639 }
640 ptr += MINBPC(enc);
641 /* in attribute value */
642 for (;;) {
643 int t;
644 REQUIRE_CHAR(enc, ptr, end);
645 t = BYTE_TYPE(enc, ptr);
646 if (t == open)
647 break;
648 switch (t) {
649 INVALID_CASES(ptr, nextTokPtr)
650 case BT_AMP: {
651 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
652 if (tok <= 0) {
653 if (tok == XML_TOK_INVALID)
654 *nextTokPtr = ptr;
655 return tok;
656 }
657 break;
658 }
659 case BT_LT:
660 *nextTokPtr = ptr;
661 return XML_TOK_INVALID;
662 default:
663 ptr += MINBPC(enc);
664 break;
665 }
666 }
667 ptr += MINBPC(enc);
668 REQUIRE_CHAR(enc, ptr, end);
669 switch (BYTE_TYPE(enc, ptr)) {
670 case BT_S:
671 case BT_CR:
672 case BT_LF:
673 break;
674 case BT_SOL:
675 goto sol;
676 case BT_GT:
677 goto gt;
678 default:
679 *nextTokPtr = ptr;
680 return XML_TOK_INVALID;
681 }
682 /* ptr points to closing quote */
683 for (;;) {
684 ptr += MINBPC(enc);
685 REQUIRE_CHAR(enc, ptr, end);
686 switch (BYTE_TYPE(enc, ptr)) {
687 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
688 case BT_S:
689 case BT_CR:
690 case BT_LF:
691 continue;
692 case BT_GT:
693 gt:
694 *nextTokPtr = ptr + MINBPC(enc);
695 return XML_TOK_START_TAG_WITH_ATTS;
696 case BT_SOL:
697 sol:
698 ptr += MINBPC(enc);
699 REQUIRE_CHAR(enc, ptr, end);
700 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
701 *nextTokPtr = ptr;
702 return XML_TOK_INVALID;
703 }
704 *nextTokPtr = ptr + MINBPC(enc);
705 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
706 default:
707 *nextTokPtr = ptr;
708 return XML_TOK_INVALID;
709 }
710 break;
711 }
712 break;
713 }
714 default:
715 *nextTokPtr = ptr;
716 return XML_TOK_INVALID;
717 }
718 }
719 return XML_TOK_PARTIAL;
720 }
721
722 /* ptr points to character following "<" */
723
724 static int PTRCALL
PREFIX(scanLt)725 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
726 const char **nextTokPtr) {
727 # ifdef XML_NS
728 int hadColon;
729 # endif
730 REQUIRE_CHAR(enc, ptr, end);
731 switch (BYTE_TYPE(enc, ptr)) {
732 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
733 case BT_EXCL:
734 ptr += MINBPC(enc);
735 REQUIRE_CHAR(enc, ptr, end);
736 switch (BYTE_TYPE(enc, ptr)) {
737 case BT_MINUS:
738 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739 case BT_LSQB:
740 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
741 }
742 *nextTokPtr = ptr;
743 return XML_TOK_INVALID;
744 case BT_QUEST:
745 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
746 case BT_SOL:
747 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
748 default:
749 *nextTokPtr = ptr;
750 return XML_TOK_INVALID;
751 }
752 # ifdef XML_NS
753 hadColon = 0;
754 # endif
755 /* we have a start-tag */
756 while (HAS_CHAR(enc, ptr, end)) {
757 switch (BYTE_TYPE(enc, ptr)) {
758 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
759 # ifdef XML_NS
760 case BT_COLON:
761 if (hadColon) {
762 *nextTokPtr = ptr;
763 return XML_TOK_INVALID;
764 }
765 hadColon = 1;
766 ptr += MINBPC(enc);
767 REQUIRE_CHAR(enc, ptr, end);
768 switch (BYTE_TYPE(enc, ptr)) {
769 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
770 default:
771 *nextTokPtr = ptr;
772 return XML_TOK_INVALID;
773 }
774 break;
775 # endif
776 case BT_S:
777 case BT_CR:
778 case BT_LF: {
779 ptr += MINBPC(enc);
780 while (HAS_CHAR(enc, ptr, end)) {
781 switch (BYTE_TYPE(enc, ptr)) {
782 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
783 case BT_GT:
784 goto gt;
785 case BT_SOL:
786 goto sol;
787 case BT_S:
788 case BT_CR:
789 case BT_LF:
790 ptr += MINBPC(enc);
791 continue;
792 default:
793 *nextTokPtr = ptr;
794 return XML_TOK_INVALID;
795 }
796 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
797 }
798 return XML_TOK_PARTIAL;
799 }
800 case BT_GT:
801 gt:
802 *nextTokPtr = ptr + MINBPC(enc);
803 return XML_TOK_START_TAG_NO_ATTS;
804 case BT_SOL:
805 sol:
806 ptr += MINBPC(enc);
807 REQUIRE_CHAR(enc, ptr, end);
808 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
809 *nextTokPtr = ptr;
810 return XML_TOK_INVALID;
811 }
812 *nextTokPtr = ptr + MINBPC(enc);
813 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
814 default:
815 *nextTokPtr = ptr;
816 return XML_TOK_INVALID;
817 }
818 }
819 return XML_TOK_PARTIAL;
820 }
821
822 static int PTRCALL
PREFIX(contentTok)823 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
824 const char **nextTokPtr) {
825 if (ptr >= end)
826 return XML_TOK_NONE;
827 if (MINBPC(enc) > 1) {
828 size_t n = end - ptr;
829 if (n & (MINBPC(enc) - 1)) {
830 n &= ~(MINBPC(enc) - 1);
831 if (n == 0)
832 return XML_TOK_PARTIAL;
833 end = ptr + n;
834 }
835 }
836 switch (BYTE_TYPE(enc, ptr)) {
837 case BT_LT:
838 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
839 case BT_AMP:
840 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
841 case BT_CR:
842 ptr += MINBPC(enc);
843 if (! HAS_CHAR(enc, ptr, end))
844 return XML_TOK_TRAILING_CR;
845 if (BYTE_TYPE(enc, ptr) == BT_LF)
846 ptr += MINBPC(enc);
847 *nextTokPtr = ptr;
848 return XML_TOK_DATA_NEWLINE;
849 case BT_LF:
850 *nextTokPtr = ptr + MINBPC(enc);
851 return XML_TOK_DATA_NEWLINE;
852 case BT_RSQB:
853 ptr += MINBPC(enc);
854 if (! HAS_CHAR(enc, ptr, end))
855 return XML_TOK_TRAILING_RSQB;
856 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
857 break;
858 ptr += MINBPC(enc);
859 if (! HAS_CHAR(enc, ptr, end))
860 return XML_TOK_TRAILING_RSQB;
861 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
862 ptr -= MINBPC(enc);
863 break;
864 }
865 *nextTokPtr = ptr;
866 return XML_TOK_INVALID;
867 INVALID_CASES(ptr, nextTokPtr)
868 default:
869 ptr += MINBPC(enc);
870 break;
871 }
872 while (HAS_CHAR(enc, ptr, end)) {
873 switch (BYTE_TYPE(enc, ptr)) {
874 # define LEAD_CASE(n) \
875 case BT_LEAD##n: \
876 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
877 *nextTokPtr = ptr; \
878 return XML_TOK_DATA_CHARS; \
879 } \
880 ptr += n; \
881 break;
882 LEAD_CASE(2)
883 LEAD_CASE(3)
884 LEAD_CASE(4)
885 # undef LEAD_CASE
886 case BT_RSQB:
887 if (HAS_CHARS(enc, ptr, end, 2)) {
888 if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
889 ptr += MINBPC(enc);
890 break;
891 }
892 if (HAS_CHARS(enc, ptr, end, 3)) {
893 if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
894 ptr += MINBPC(enc);
895 break;
896 }
897 *nextTokPtr = ptr + 2 * MINBPC(enc);
898 return XML_TOK_INVALID;
899 }
900 }
901 /* fall through */
902 case BT_AMP:
903 case BT_LT:
904 case BT_NONXML:
905 case BT_MALFORM:
906 case BT_TRAIL:
907 case BT_CR:
908 case BT_LF:
909 *nextTokPtr = ptr;
910 return XML_TOK_DATA_CHARS;
911 default:
912 ptr += MINBPC(enc);
913 break;
914 }
915 }
916 *nextTokPtr = ptr;
917 return XML_TOK_DATA_CHARS;
918 }
919
920 /* ptr points to character following "%" */
921
922 static int PTRCALL
PREFIX(scanPercent)923 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
924 const char **nextTokPtr) {
925 REQUIRE_CHAR(enc, ptr, end);
926 switch (BYTE_TYPE(enc, ptr)) {
927 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
928 case BT_S:
929 case BT_LF:
930 case BT_CR:
931 case BT_PERCNT:
932 *nextTokPtr = ptr;
933 return XML_TOK_PERCENT;
934 default:
935 *nextTokPtr = ptr;
936 return XML_TOK_INVALID;
937 }
938 while (HAS_CHAR(enc, ptr, end)) {
939 switch (BYTE_TYPE(enc, ptr)) {
940 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
941 case BT_SEMI:
942 *nextTokPtr = ptr + MINBPC(enc);
943 return XML_TOK_PARAM_ENTITY_REF;
944 default:
945 *nextTokPtr = ptr;
946 return XML_TOK_INVALID;
947 }
948 }
949 return XML_TOK_PARTIAL;
950 }
951
952 static int PTRCALL
PREFIX(scanPoundName)953 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
954 const char **nextTokPtr) {
955 REQUIRE_CHAR(enc, ptr, end);
956 switch (BYTE_TYPE(enc, ptr)) {
957 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
958 default:
959 *nextTokPtr = ptr;
960 return XML_TOK_INVALID;
961 }
962 while (HAS_CHAR(enc, ptr, end)) {
963 switch (BYTE_TYPE(enc, ptr)) {
964 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
965 case BT_CR:
966 case BT_LF:
967 case BT_S:
968 case BT_RPAR:
969 case BT_GT:
970 case BT_PERCNT:
971 case BT_VERBAR:
972 *nextTokPtr = ptr;
973 return XML_TOK_POUND_NAME;
974 default:
975 *nextTokPtr = ptr;
976 return XML_TOK_INVALID;
977 }
978 }
979 return -XML_TOK_POUND_NAME;
980 }
981
982 static int PTRCALL
PREFIX(scanLit)983 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
984 const char **nextTokPtr) {
985 while (HAS_CHAR(enc, ptr, end)) {
986 int t = BYTE_TYPE(enc, ptr);
987 switch (t) {
988 INVALID_CASES(ptr, nextTokPtr)
989 case BT_QUOT:
990 case BT_APOS:
991 ptr += MINBPC(enc);
992 if (t != open)
993 break;
994 if (! HAS_CHAR(enc, ptr, end))
995 return -XML_TOK_LITERAL;
996 *nextTokPtr = ptr;
997 switch (BYTE_TYPE(enc, ptr)) {
998 case BT_S:
999 case BT_CR:
1000 case BT_LF:
1001 case BT_GT:
1002 case BT_PERCNT:
1003 case BT_LSQB:
1004 return XML_TOK_LITERAL;
1005 default:
1006 return XML_TOK_INVALID;
1007 }
1008 default:
1009 ptr += MINBPC(enc);
1010 break;
1011 }
1012 }
1013 return XML_TOK_PARTIAL;
1014 }
1015
1016 static int PTRCALL
PREFIX(prologTok)1017 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1018 const char **nextTokPtr) {
1019 int tok;
1020 if (ptr >= end)
1021 return XML_TOK_NONE;
1022 if (MINBPC(enc) > 1) {
1023 size_t n = end - ptr;
1024 if (n & (MINBPC(enc) - 1)) {
1025 n &= ~(MINBPC(enc) - 1);
1026 if (n == 0)
1027 return XML_TOK_PARTIAL;
1028 end = ptr + n;
1029 }
1030 }
1031 switch (BYTE_TYPE(enc, ptr)) {
1032 case BT_QUOT:
1033 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1034 case BT_APOS:
1035 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1036 case BT_LT: {
1037 ptr += MINBPC(enc);
1038 REQUIRE_CHAR(enc, ptr, end);
1039 switch (BYTE_TYPE(enc, ptr)) {
1040 case BT_EXCL:
1041 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1042 case BT_QUEST:
1043 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1044 case BT_NMSTRT:
1045 case BT_HEX:
1046 case BT_NONASCII:
1047 case BT_LEAD2:
1048 case BT_LEAD3:
1049 case BT_LEAD4:
1050 *nextTokPtr = ptr - MINBPC(enc);
1051 return XML_TOK_INSTANCE_START;
1052 }
1053 *nextTokPtr = ptr;
1054 return XML_TOK_INVALID;
1055 }
1056 case BT_CR:
1057 if (ptr + MINBPC(enc) == end) {
1058 *nextTokPtr = end;
1059 /* indicate that this might be part of a CR/LF pair */
1060 return -XML_TOK_PROLOG_S;
1061 }
1062 /* fall through */
1063 case BT_S:
1064 case BT_LF:
1065 for (;;) {
1066 ptr += MINBPC(enc);
1067 if (! HAS_CHAR(enc, ptr, end))
1068 break;
1069 switch (BYTE_TYPE(enc, ptr)) {
1070 case BT_S:
1071 case BT_LF:
1072 break;
1073 case BT_CR:
1074 /* don't split CR/LF pair */
1075 if (ptr + MINBPC(enc) != end)
1076 break;
1077 /* fall through */
1078 default:
1079 *nextTokPtr = ptr;
1080 return XML_TOK_PROLOG_S;
1081 }
1082 }
1083 *nextTokPtr = ptr;
1084 return XML_TOK_PROLOG_S;
1085 case BT_PERCNT:
1086 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1087 case BT_COMMA:
1088 *nextTokPtr = ptr + MINBPC(enc);
1089 return XML_TOK_COMMA;
1090 case BT_LSQB:
1091 *nextTokPtr = ptr + MINBPC(enc);
1092 return XML_TOK_OPEN_BRACKET;
1093 case BT_RSQB:
1094 ptr += MINBPC(enc);
1095 if (! HAS_CHAR(enc, ptr, end))
1096 return -XML_TOK_CLOSE_BRACKET;
1097 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1098 REQUIRE_CHARS(enc, ptr, end, 2);
1099 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1100 *nextTokPtr = ptr + 2 * MINBPC(enc);
1101 return XML_TOK_COND_SECT_CLOSE;
1102 }
1103 }
1104 *nextTokPtr = ptr;
1105 return XML_TOK_CLOSE_BRACKET;
1106 case BT_LPAR:
1107 *nextTokPtr = ptr + MINBPC(enc);
1108 return XML_TOK_OPEN_PAREN;
1109 case BT_RPAR:
1110 ptr += MINBPC(enc);
1111 if (! HAS_CHAR(enc, ptr, end))
1112 return -XML_TOK_CLOSE_PAREN;
1113 switch (BYTE_TYPE(enc, ptr)) {
1114 case BT_AST:
1115 *nextTokPtr = ptr + MINBPC(enc);
1116 return XML_TOK_CLOSE_PAREN_ASTERISK;
1117 case BT_QUEST:
1118 *nextTokPtr = ptr + MINBPC(enc);
1119 return XML_TOK_CLOSE_PAREN_QUESTION;
1120 case BT_PLUS:
1121 *nextTokPtr = ptr + MINBPC(enc);
1122 return XML_TOK_CLOSE_PAREN_PLUS;
1123 case BT_CR:
1124 case BT_LF:
1125 case BT_S:
1126 case BT_GT:
1127 case BT_COMMA:
1128 case BT_VERBAR:
1129 case BT_RPAR:
1130 *nextTokPtr = ptr;
1131 return XML_TOK_CLOSE_PAREN;
1132 }
1133 *nextTokPtr = ptr;
1134 return XML_TOK_INVALID;
1135 case BT_VERBAR:
1136 *nextTokPtr = ptr + MINBPC(enc);
1137 return XML_TOK_OR;
1138 case BT_GT:
1139 *nextTokPtr = ptr + MINBPC(enc);
1140 return XML_TOK_DECL_CLOSE;
1141 case BT_NUM:
1142 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1143 # define LEAD_CASE(n) \
1144 case BT_LEAD##n: \
1145 if (end - ptr < n) \
1146 return XML_TOK_PARTIAL_CHAR; \
1147 if (IS_INVALID_CHAR(enc, ptr, n)) { \
1148 *nextTokPtr = ptr; \
1149 return XML_TOK_INVALID; \
1150 } \
1151 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1152 ptr += n; \
1153 tok = XML_TOK_NAME; \
1154 break; \
1155 } \
1156 if (IS_NAME_CHAR(enc, ptr, n)) { \
1157 ptr += n; \
1158 tok = XML_TOK_NMTOKEN; \
1159 break; \
1160 } \
1161 *nextTokPtr = ptr; \
1162 return XML_TOK_INVALID;
1163 LEAD_CASE(2)
1164 LEAD_CASE(3)
1165 LEAD_CASE(4)
1166 # undef LEAD_CASE
1167 case BT_NMSTRT:
1168 case BT_HEX:
1169 tok = XML_TOK_NAME;
1170 ptr += MINBPC(enc);
1171 break;
1172 case BT_DIGIT:
1173 case BT_NAME:
1174 case BT_MINUS:
1175 # ifdef XML_NS
1176 case BT_COLON:
1177 # endif
1178 tok = XML_TOK_NMTOKEN;
1179 ptr += MINBPC(enc);
1180 break;
1181 case BT_NONASCII:
1182 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1183 ptr += MINBPC(enc);
1184 tok = XML_TOK_NAME;
1185 break;
1186 }
1187 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1188 ptr += MINBPC(enc);
1189 tok = XML_TOK_NMTOKEN;
1190 break;
1191 }
1192 /* fall through */
1193 default:
1194 *nextTokPtr = ptr;
1195 return XML_TOK_INVALID;
1196 }
1197 while (HAS_CHAR(enc, ptr, end)) {
1198 switch (BYTE_TYPE(enc, ptr)) {
1199 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1200 case BT_GT:
1201 case BT_RPAR:
1202 case BT_COMMA:
1203 case BT_VERBAR:
1204 case BT_LSQB:
1205 case BT_PERCNT:
1206 case BT_S:
1207 case BT_CR:
1208 case BT_LF:
1209 *nextTokPtr = ptr;
1210 return tok;
1211 # ifdef XML_NS
1212 case BT_COLON:
1213 ptr += MINBPC(enc);
1214 switch (tok) {
1215 case XML_TOK_NAME:
1216 REQUIRE_CHAR(enc, ptr, end);
1217 tok = XML_TOK_PREFIXED_NAME;
1218 switch (BYTE_TYPE(enc, ptr)) {
1219 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1220 default:
1221 tok = XML_TOK_NMTOKEN;
1222 break;
1223 }
1224 break;
1225 case XML_TOK_PREFIXED_NAME:
1226 tok = XML_TOK_NMTOKEN;
1227 break;
1228 }
1229 break;
1230 # endif
1231 case BT_PLUS:
1232 if (tok == XML_TOK_NMTOKEN) {
1233 *nextTokPtr = ptr;
1234 return XML_TOK_INVALID;
1235 }
1236 *nextTokPtr = ptr + MINBPC(enc);
1237 return XML_TOK_NAME_PLUS;
1238 case BT_AST:
1239 if (tok == XML_TOK_NMTOKEN) {
1240 *nextTokPtr = ptr;
1241 return XML_TOK_INVALID;
1242 }
1243 *nextTokPtr = ptr + MINBPC(enc);
1244 return XML_TOK_NAME_ASTERISK;
1245 case BT_QUEST:
1246 if (tok == XML_TOK_NMTOKEN) {
1247 *nextTokPtr = ptr;
1248 return XML_TOK_INVALID;
1249 }
1250 *nextTokPtr = ptr + MINBPC(enc);
1251 return XML_TOK_NAME_QUESTION;
1252 default:
1253 *nextTokPtr = ptr;
1254 return XML_TOK_INVALID;
1255 }
1256 }
1257 return -tok;
1258 }
1259
1260 static int PTRCALL
PREFIX(attributeValueTok)1261 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1262 const char **nextTokPtr) {
1263 const char *start;
1264 if (ptr >= end)
1265 return XML_TOK_NONE;
1266 else if (! HAS_CHAR(enc, ptr, end)) {
1267 /* This line cannot be executed. The incoming data has already
1268 * been tokenized once, so incomplete characters like this have
1269 * already been eliminated from the input. Retaining the paranoia
1270 * check is still valuable, however.
1271 */
1272 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1273 }
1274 start = ptr;
1275 while (HAS_CHAR(enc, ptr, end)) {
1276 switch (BYTE_TYPE(enc, ptr)) {
1277 # define LEAD_CASE(n) \
1278 case BT_LEAD##n: \
1279 ptr += n; /* NOTE: The encoding has already been validated. */ \
1280 break;
1281 LEAD_CASE(2)
1282 LEAD_CASE(3)
1283 LEAD_CASE(4)
1284 # undef LEAD_CASE
1285 case BT_AMP:
1286 if (ptr == start)
1287 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1288 *nextTokPtr = ptr;
1289 return XML_TOK_DATA_CHARS;
1290 case BT_LT:
1291 /* this is for inside entity references */
1292 *nextTokPtr = ptr;
1293 return XML_TOK_INVALID;
1294 case BT_LF:
1295 if (ptr == start) {
1296 *nextTokPtr = ptr + MINBPC(enc);
1297 return XML_TOK_DATA_NEWLINE;
1298 }
1299 *nextTokPtr = ptr;
1300 return XML_TOK_DATA_CHARS;
1301 case BT_CR:
1302 if (ptr == start) {
1303 ptr += MINBPC(enc);
1304 if (! HAS_CHAR(enc, ptr, end))
1305 return XML_TOK_TRAILING_CR;
1306 if (BYTE_TYPE(enc, ptr) == BT_LF)
1307 ptr += MINBPC(enc);
1308 *nextTokPtr = ptr;
1309 return XML_TOK_DATA_NEWLINE;
1310 }
1311 *nextTokPtr = ptr;
1312 return XML_TOK_DATA_CHARS;
1313 case BT_S:
1314 if (ptr == start) {
1315 *nextTokPtr = ptr + MINBPC(enc);
1316 return XML_TOK_ATTRIBUTE_VALUE_S;
1317 }
1318 *nextTokPtr = ptr;
1319 return XML_TOK_DATA_CHARS;
1320 default:
1321 ptr += MINBPC(enc);
1322 break;
1323 }
1324 }
1325 *nextTokPtr = ptr;
1326 return XML_TOK_DATA_CHARS;
1327 }
1328
1329 static int PTRCALL
PREFIX(entityValueTok)1330 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1331 const char **nextTokPtr) {
1332 const char *start;
1333 if (ptr >= end)
1334 return XML_TOK_NONE;
1335 else if (! HAS_CHAR(enc, ptr, end)) {
1336 /* This line cannot be executed. The incoming data has already
1337 * been tokenized once, so incomplete characters like this have
1338 * already been eliminated from the input. Retaining the paranoia
1339 * check is still valuable, however.
1340 */
1341 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1342 }
1343 start = ptr;
1344 while (HAS_CHAR(enc, ptr, end)) {
1345 switch (BYTE_TYPE(enc, ptr)) {
1346 # define LEAD_CASE(n) \
1347 case BT_LEAD##n: \
1348 ptr += n; /* NOTE: The encoding has already been validated. */ \
1349 break;
1350 LEAD_CASE(2)
1351 LEAD_CASE(3)
1352 LEAD_CASE(4)
1353 # undef LEAD_CASE
1354 case BT_AMP:
1355 if (ptr == start)
1356 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1357 *nextTokPtr = ptr;
1358 return XML_TOK_DATA_CHARS;
1359 case BT_PERCNT:
1360 if (ptr == start) {
1361 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1362 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1363 }
1364 *nextTokPtr = ptr;
1365 return XML_TOK_DATA_CHARS;
1366 case BT_LF:
1367 if (ptr == start) {
1368 *nextTokPtr = ptr + MINBPC(enc);
1369 return XML_TOK_DATA_NEWLINE;
1370 }
1371 *nextTokPtr = ptr;
1372 return XML_TOK_DATA_CHARS;
1373 case BT_CR:
1374 if (ptr == start) {
1375 ptr += MINBPC(enc);
1376 if (! HAS_CHAR(enc, ptr, end))
1377 return XML_TOK_TRAILING_CR;
1378 if (BYTE_TYPE(enc, ptr) == BT_LF)
1379 ptr += MINBPC(enc);
1380 *nextTokPtr = ptr;
1381 return XML_TOK_DATA_NEWLINE;
1382 }
1383 *nextTokPtr = ptr;
1384 return XML_TOK_DATA_CHARS;
1385 default:
1386 ptr += MINBPC(enc);
1387 break;
1388 }
1389 }
1390 *nextTokPtr = ptr;
1391 return XML_TOK_DATA_CHARS;
1392 }
1393
1394 # ifdef XML_DTD
1395
1396 static int PTRCALL
PREFIX(ignoreSectionTok)1397 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1398 const char **nextTokPtr) {
1399 int level = 0;
1400 if (MINBPC(enc) > 1) {
1401 size_t n = end - ptr;
1402 if (n & (MINBPC(enc) - 1)) {
1403 n &= ~(MINBPC(enc) - 1);
1404 end = ptr + n;
1405 }
1406 }
1407 while (HAS_CHAR(enc, ptr, end)) {
1408 switch (BYTE_TYPE(enc, ptr)) {
1409 INVALID_CASES(ptr, nextTokPtr)
1410 case BT_LT:
1411 ptr += MINBPC(enc);
1412 REQUIRE_CHAR(enc, ptr, end);
1413 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1414 ptr += MINBPC(enc);
1415 REQUIRE_CHAR(enc, ptr, end);
1416 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1417 ++level;
1418 ptr += MINBPC(enc);
1419 }
1420 }
1421 break;
1422 case BT_RSQB:
1423 ptr += MINBPC(enc);
1424 REQUIRE_CHAR(enc, ptr, end);
1425 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1426 ptr += MINBPC(enc);
1427 REQUIRE_CHAR(enc, ptr, end);
1428 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1429 ptr += MINBPC(enc);
1430 if (level == 0) {
1431 *nextTokPtr = ptr;
1432 return XML_TOK_IGNORE_SECT;
1433 }
1434 --level;
1435 }
1436 }
1437 break;
1438 default:
1439 ptr += MINBPC(enc);
1440 break;
1441 }
1442 }
1443 return XML_TOK_PARTIAL;
1444 }
1445
1446 # endif /* XML_DTD */
1447
1448 static int PTRCALL
PREFIX(isPublicId)1449 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1450 const char **badPtr) {
1451 ptr += MINBPC(enc);
1452 end -= MINBPC(enc);
1453 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1454 switch (BYTE_TYPE(enc, ptr)) {
1455 case BT_DIGIT:
1456 case BT_HEX:
1457 case BT_MINUS:
1458 case BT_APOS:
1459 case BT_LPAR:
1460 case BT_RPAR:
1461 case BT_PLUS:
1462 case BT_COMMA:
1463 case BT_SOL:
1464 case BT_EQUALS:
1465 case BT_QUEST:
1466 case BT_CR:
1467 case BT_LF:
1468 case BT_SEMI:
1469 case BT_EXCL:
1470 case BT_AST:
1471 case BT_PERCNT:
1472 case BT_NUM:
1473 # ifdef XML_NS
1474 case BT_COLON:
1475 # endif
1476 break;
1477 case BT_S:
1478 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1479 *badPtr = ptr;
1480 return 0;
1481 }
1482 break;
1483 case BT_NAME:
1484 case BT_NMSTRT:
1485 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1486 break;
1487 /* fall through */
1488 default:
1489 switch (BYTE_TO_ASCII(enc, ptr)) {
1490 case 0x24: /* $ */
1491 case 0x40: /* @ */
1492 break;
1493 default:
1494 *badPtr = ptr;
1495 return 0;
1496 }
1497 break;
1498 }
1499 }
1500 return 1;
1501 }
1502
1503 /* This must only be called for a well-formed start-tag or empty
1504 element tag. Returns the number of attributes. Pointers to the
1505 first attsMax attributes are stored in atts.
1506 */
1507
1508 static int PTRCALL
PREFIX(getAtts)1509 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1510 ATTRIBUTE *atts) {
1511 enum { other, inName, inValue } state = inName;
1512 int nAtts = 0;
1513 int open = 0; /* defined when state == inValue;
1514 initialization just to shut up compilers */
1515
1516 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1517 switch (BYTE_TYPE(enc, ptr)) {
1518 # define START_NAME \
1519 if (state == other) { \
1520 if (nAtts < attsMax) { \
1521 atts[nAtts].name = ptr; \
1522 atts[nAtts].normalized = 1; \
1523 } \
1524 state = inName; \
1525 }
1526 # define LEAD_CASE(n) \
1527 case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \
1528 START_NAME ptr += (n - MINBPC(enc)); \
1529 break;
1530 LEAD_CASE(2)
1531 LEAD_CASE(3)
1532 LEAD_CASE(4)
1533 # undef LEAD_CASE
1534 case BT_NONASCII:
1535 case BT_NMSTRT:
1536 case BT_HEX:
1537 START_NAME
1538 break;
1539 # undef START_NAME
1540 case BT_QUOT:
1541 if (state != inValue) {
1542 if (nAtts < attsMax)
1543 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1544 state = inValue;
1545 open = BT_QUOT;
1546 } else if (open == BT_QUOT) {
1547 state = other;
1548 if (nAtts < attsMax)
1549 atts[nAtts].valueEnd = ptr;
1550 nAtts++;
1551 }
1552 break;
1553 case BT_APOS:
1554 if (state != inValue) {
1555 if (nAtts < attsMax)
1556 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1557 state = inValue;
1558 open = BT_APOS;
1559 } else if (open == BT_APOS) {
1560 state = other;
1561 if (nAtts < attsMax)
1562 atts[nAtts].valueEnd = ptr;
1563 nAtts++;
1564 }
1565 break;
1566 case BT_AMP:
1567 if (nAtts < attsMax)
1568 atts[nAtts].normalized = 0;
1569 break;
1570 case BT_S:
1571 if (state == inName)
1572 state = other;
1573 else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1574 && (ptr == atts[nAtts].valuePtr
1575 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1576 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1577 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1578 atts[nAtts].normalized = 0;
1579 break;
1580 case BT_CR:
1581 case BT_LF:
1582 /* This case ensures that the first attribute name is counted
1583 Apart from that we could just change state on the quote. */
1584 if (state == inName)
1585 state = other;
1586 else if (state == inValue && nAtts < attsMax)
1587 atts[nAtts].normalized = 0;
1588 break;
1589 case BT_GT:
1590 case BT_SOL:
1591 if (state != inValue)
1592 return nAtts;
1593 break;
1594 default:
1595 break;
1596 }
1597 }
1598 /* not reached */
1599 }
1600
1601 static int PTRFASTCALL
PREFIX(charRefNumber)1602 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1603 int result = 0;
1604 /* skip &# */
1605 UNUSED_P(enc);
1606 ptr += 2 * MINBPC(enc);
1607 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1608 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1609 ptr += MINBPC(enc)) {
1610 int c = BYTE_TO_ASCII(enc, ptr);
1611 switch (c) {
1612 case ASCII_0:
1613 case ASCII_1:
1614 case ASCII_2:
1615 case ASCII_3:
1616 case ASCII_4:
1617 case ASCII_5:
1618 case ASCII_6:
1619 case ASCII_7:
1620 case ASCII_8:
1621 case ASCII_9:
1622 result <<= 4;
1623 result |= (c - ASCII_0);
1624 break;
1625 case ASCII_A:
1626 case ASCII_B:
1627 case ASCII_C:
1628 case ASCII_D:
1629 case ASCII_E:
1630 case ASCII_F:
1631 result <<= 4;
1632 result += 10 + (c - ASCII_A);
1633 break;
1634 case ASCII_a:
1635 case ASCII_b:
1636 case ASCII_c:
1637 case ASCII_d:
1638 case ASCII_e:
1639 case ASCII_f:
1640 result <<= 4;
1641 result += 10 + (c - ASCII_a);
1642 break;
1643 }
1644 if (result >= 0x110000)
1645 return -1;
1646 }
1647 } else {
1648 for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1649 int c = BYTE_TO_ASCII(enc, ptr);
1650 result *= 10;
1651 result += (c - ASCII_0);
1652 if (result >= 0x110000)
1653 return -1;
1654 }
1655 }
1656 return checkCharRefNumber(result);
1657 }
1658
1659 static int PTRCALL
PREFIX(predefinedEntityName)1660 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1661 const char *end) {
1662 UNUSED_P(enc);
1663 switch ((end - ptr) / MINBPC(enc)) {
1664 case 2:
1665 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1666 switch (BYTE_TO_ASCII(enc, ptr)) {
1667 case ASCII_l:
1668 return ASCII_LT;
1669 case ASCII_g:
1670 return ASCII_GT;
1671 }
1672 }
1673 break;
1674 case 3:
1675 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1676 ptr += MINBPC(enc);
1677 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1678 ptr += MINBPC(enc);
1679 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1680 return ASCII_AMP;
1681 }
1682 }
1683 break;
1684 case 4:
1685 switch (BYTE_TO_ASCII(enc, ptr)) {
1686 case ASCII_q:
1687 ptr += MINBPC(enc);
1688 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1689 ptr += MINBPC(enc);
1690 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1691 ptr += MINBPC(enc);
1692 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1693 return ASCII_QUOT;
1694 }
1695 }
1696 break;
1697 case ASCII_a:
1698 ptr += MINBPC(enc);
1699 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1700 ptr += MINBPC(enc);
1701 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1702 ptr += MINBPC(enc);
1703 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1704 return ASCII_APOS;
1705 }
1706 }
1707 break;
1708 }
1709 }
1710 return 0;
1711 }
1712
1713 static int PTRCALL
PREFIX(nameMatchesAscii)1714 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1715 const char *end1, const char *ptr2) {
1716 UNUSED_P(enc);
1717 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1718 if (end1 - ptr1 < MINBPC(enc)) {
1719 /* This line cannot be executed. The incoming data has already
1720 * been tokenized once, so incomplete characters like this have
1721 * already been eliminated from the input. Retaining the
1722 * paranoia check is still valuable, however.
1723 */
1724 return 0; /* LCOV_EXCL_LINE */
1725 }
1726 if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1727 return 0;
1728 }
1729 return ptr1 == end1;
1730 }
1731
1732 static int PTRFASTCALL
PREFIX(nameLength)1733 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1734 const char *start = ptr;
1735 for (;;) {
1736 switch (BYTE_TYPE(enc, ptr)) {
1737 # define LEAD_CASE(n) \
1738 case BT_LEAD##n: \
1739 ptr += n; /* NOTE: The encoding has already been validated. */ \
1740 break;
1741 LEAD_CASE(2)
1742 LEAD_CASE(3)
1743 LEAD_CASE(4)
1744 # undef LEAD_CASE
1745 case BT_NONASCII:
1746 case BT_NMSTRT:
1747 # ifdef XML_NS
1748 case BT_COLON:
1749 # endif
1750 case BT_HEX:
1751 case BT_DIGIT:
1752 case BT_NAME:
1753 case BT_MINUS:
1754 ptr += MINBPC(enc);
1755 break;
1756 default:
1757 return (int)(ptr - start);
1758 }
1759 }
1760 }
1761
1762 static const char *PTRFASTCALL
PREFIX(skipS)1763 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1764 for (;;) {
1765 switch (BYTE_TYPE(enc, ptr)) {
1766 case BT_LF:
1767 case BT_CR:
1768 case BT_S:
1769 ptr += MINBPC(enc);
1770 break;
1771 default:
1772 return ptr;
1773 }
1774 }
1775 }
1776
1777 static void PTRCALL
PREFIX(updatePosition)1778 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1779 POSITION *pos) {
1780 while (HAS_CHAR(enc, ptr, end)) {
1781 switch (BYTE_TYPE(enc, ptr)) {
1782 # define LEAD_CASE(n) \
1783 case BT_LEAD##n: \
1784 ptr += n; /* NOTE: The encoding has already been validated. */ \
1785 pos->columnNumber++; \
1786 break;
1787 LEAD_CASE(2)
1788 LEAD_CASE(3)
1789 LEAD_CASE(4)
1790 # undef LEAD_CASE
1791 case BT_LF:
1792 pos->columnNumber = 0;
1793 pos->lineNumber++;
1794 ptr += MINBPC(enc);
1795 break;
1796 case BT_CR:
1797 pos->lineNumber++;
1798 ptr += MINBPC(enc);
1799 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1800 ptr += MINBPC(enc);
1801 pos->columnNumber = 0;
1802 break;
1803 default:
1804 ptr += MINBPC(enc);
1805 pos->columnNumber++;
1806 break;
1807 }
1808 }
1809 }
1810
1811 # undef DO_LEAD_CASE
1812 # undef MULTIBYTE_CASES
1813 # undef INVALID_CASES
1814 # undef CHECK_NAME_CASE
1815 # undef CHECK_NAME_CASES
1816 # undef CHECK_NMSTRT_CASE
1817 # undef CHECK_NMSTRT_CASES
1818
1819 #endif /* XML_TOK_IMPL_C */
1820