1 /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)! 2 __ __ _ 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parser 8 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net> 11 Copyright (c) 2002 Fred L. Drake, Jr. <fdrake@users.sourceforge.net> 12 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net> 13 Copyright (c) 2016-2021 Sebastian Pipping <sebastian@pipping.org> 14 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk> 15 Copyright (c) 2018 Benjamin Peterson <benjamin@python.org> 16 Copyright (c) 2018 Anton Maklakov <antmak.pub@gmail.com> 17 Copyright (c) 2019 David Loffredo <loffredo@steptools.com> 18 Copyright (c) 2020 Boris Kolpackov <boris@codesynthesis.com> 19 Licensed under the MIT license: 20 21 Permission is hereby granted, free of charge, to any person obtaining 22 a copy of this software and associated documentation files (the 23 "Software"), to deal in the Software without restriction, including 24 without limitation the rights to use, copy, modify, merge, publish, 25 distribute, sublicense, and/or sell copies of the Software, and to permit 26 persons to whom the Software is furnished to do so, subject to the 27 following conditions: 28 29 The above copyright notice and this permission notice shall be included 30 in all copies or substantial portions of the Software. 31 32 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 33 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 34 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 35 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 36 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 37 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 38 USE OR OTHER DEALINGS IN THE SOFTWARE. 
39 */ 40 41 #ifdef XML_TOK_IMPL_C 42 43 # ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined 44 # define IS_INVALID_CHAR(enc, ptr, n) (0) 45 # endif 46 47 # define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ 48 case BT_LEAD##n: \ 49 if (end - ptr < n) \ 50 return XML_TOK_PARTIAL_CHAR; \ 51 if (IS_INVALID_CHAR(enc, ptr, n)) { \ 52 *(nextTokPtr) = (ptr); \ 53 return XML_TOK_INVALID; \ 54 } \ 55 ptr += n; \ 56 break; 57 58 # define INVALID_CASES(ptr, nextTokPtr) \ 59 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ 60 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ 61 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ 62 case BT_NONXML: \ 63 case BT_MALFORM: \ 64 case BT_TRAIL: \ 65 *(nextTokPtr) = (ptr); \ 66 return XML_TOK_INVALID; 67 68 # define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ 69 case BT_LEAD##n: \ 70 if (end - ptr < n) \ 71 return XML_TOK_PARTIAL_CHAR; \ 72 if (! IS_NAME_CHAR(enc, ptr, n)) { \ 73 *nextTokPtr = ptr; \ 74 return XML_TOK_INVALID; \ 75 } \ 76 ptr += n; \ 77 break; 78 79 # define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ 80 case BT_NONASCII: \ 81 if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \ 82 *nextTokPtr = ptr; \ 83 return XML_TOK_INVALID; \ 84 } \ 85 /* fall through */ \ 86 case BT_NMSTRT: \ 87 case BT_HEX: \ 88 case BT_DIGIT: \ 89 case BT_NAME: \ 90 case BT_MINUS: \ 91 ptr += MINBPC(enc); \ 92 break; \ 93 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ 94 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ 95 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) 96 97 # define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ 98 case BT_LEAD##n: \ 99 if (end - ptr < n) \ 100 return XML_TOK_PARTIAL_CHAR; \ 101 if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \ 102 *nextTokPtr = ptr; \ 103 return XML_TOK_INVALID; \ 104 } \ 105 ptr += n; \ 106 break; 107 108 # define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ 109 case BT_NONASCII: \ 110 if (! 
IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ 111 *nextTokPtr = ptr; \ 112 return XML_TOK_INVALID; \ 113 } \ 114 /* fall through */ \ 115 case BT_NMSTRT: \ 116 case BT_HEX: \ 117 ptr += MINBPC(enc); \ 118 break; \ 119 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ 120 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ 121 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) 122 123 # ifndef PREFIX 124 # define PREFIX(ident) ident 125 # endif 126 127 # define HAS_CHARS(enc, ptr, end, count) (end - ptr >= count * MINBPC(enc)) 128 129 # define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1) 130 131 # define REQUIRE_CHARS(enc, ptr, end, count) \ 132 { \ 133 if (! HAS_CHARS(enc, ptr, end, count)) { \ 134 return XML_TOK_PARTIAL; \ 135 } \ 136 } 137 138 # define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1) 139 140 /* ptr points to character following "<!-" */ 141 142 static int PTRCALL 143 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end, 144 const char **nextTokPtr) { 145 if (HAS_CHAR(enc, ptr, end)) { 146 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 147 *nextTokPtr = ptr; 148 return XML_TOK_INVALID; 149 } 150 ptr += MINBPC(enc); 151 while (HAS_CHAR(enc, ptr, end)) { 152 switch (BYTE_TYPE(enc, ptr)) { 153 INVALID_CASES(ptr, nextTokPtr) 154 case BT_MINUS: 155 ptr += MINBPC(enc); 156 REQUIRE_CHAR(enc, ptr, end); 157 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 158 ptr += MINBPC(enc); 159 REQUIRE_CHAR(enc, ptr, end); 160 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) { 161 *nextTokPtr = ptr; 162 return XML_TOK_INVALID; 163 } 164 *nextTokPtr = ptr + MINBPC(enc); 165 return XML_TOK_COMMENT; 166 } 167 break; 168 default: 169 ptr += MINBPC(enc); 170 break; 171 } 172 } 173 } 174 return XML_TOK_PARTIAL; 175 } 176 177 /* ptr points to character following "<!" 
*/ 178 179 static int PTRCALL 180 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end, 181 const char **nextTokPtr) { 182 REQUIRE_CHAR(enc, ptr, end); 183 switch (BYTE_TYPE(enc, ptr)) { 184 case BT_MINUS: 185 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 186 case BT_LSQB: 187 *nextTokPtr = ptr + MINBPC(enc); 188 return XML_TOK_COND_SECT_OPEN; 189 case BT_NMSTRT: 190 case BT_HEX: 191 ptr += MINBPC(enc); 192 break; 193 default: 194 *nextTokPtr = ptr; 195 return XML_TOK_INVALID; 196 } 197 while (HAS_CHAR(enc, ptr, end)) { 198 switch (BYTE_TYPE(enc, ptr)) { 199 case BT_PERCNT: 200 REQUIRE_CHARS(enc, ptr, end, 2); 201 /* don't allow <!ENTITY% foo "whatever"> */ 202 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { 203 case BT_S: 204 case BT_CR: 205 case BT_LF: 206 case BT_PERCNT: 207 *nextTokPtr = ptr; 208 return XML_TOK_INVALID; 209 } 210 /* fall through */ 211 case BT_S: 212 case BT_CR: 213 case BT_LF: 214 *nextTokPtr = ptr; 215 return XML_TOK_DECL_OPEN; 216 case BT_NMSTRT: 217 case BT_HEX: 218 ptr += MINBPC(enc); 219 break; 220 default: 221 *nextTokPtr = ptr; 222 return XML_TOK_INVALID; 223 } 224 } 225 return XML_TOK_PARTIAL; 226 } 227 228 static int PTRCALL 229 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, 230 int *tokPtr) { 231 int upper = 0; 232 UNUSED_P(enc); 233 *tokPtr = XML_TOK_PI; 234 if (end - ptr != MINBPC(enc) * 3) 235 return 1; 236 switch (BYTE_TO_ASCII(enc, ptr)) { 237 case ASCII_x: 238 break; 239 case ASCII_X: 240 upper = 1; 241 break; 242 default: 243 return 1; 244 } 245 ptr += MINBPC(enc); 246 switch (BYTE_TO_ASCII(enc, ptr)) { 247 case ASCII_m: 248 break; 249 case ASCII_M: 250 upper = 1; 251 break; 252 default: 253 return 1; 254 } 255 ptr += MINBPC(enc); 256 switch (BYTE_TO_ASCII(enc, ptr)) { 257 case ASCII_l: 258 break; 259 case ASCII_L: 260 upper = 1; 261 break; 262 default: 263 return 1; 264 } 265 if (upper) 266 return 0; 267 *tokPtr = XML_TOK_XML_DECL; 268 return 1; 269 } 270 271 
/* ptr points to character following "<?" */

/* Scans a processing instruction.
 *
 * The target name is validated with the name-character case macros and
 * then classified by checkPiTarget; when that returns 0 the result is
 * XML_TOK_INVALID with *nextTokPtr at the end of the target.  After
 * whitespace, content is skipped up to "?>".
 *
 * Returns the token chosen by checkPiTarget with *nextTokPtr just past
 * "?>", XML_TOK_INVALID with *nextTokPtr at the offending character, or
 * XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */
static int PTRCALL
PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr) {
  int tok;
  const char *target = ptr; /* start of the PI target name */
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* target ends at whitespace; [target, ptr) is the name */
      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      /* skip the PI content until "?>" */
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
          INVALID_CASES(ptr, nextTokPtr)
        case BT_QUEST:
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr + MINBPC(enc);
            return tok;
          }
          break;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      return XML_TOK_PARTIAL;
    case BT_QUEST:
      /* target directly followed by '?': only "?>" is acceptable */
      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr + MINBPC(enc);
        return tok;
      }
      /* fall through */
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* Matches the six characters "CDATA[" starting at ptr.
 *
 * Returns XML_TOK_CDATA_SECT_OPEN with *nextTokPtr just past the '[',
 * XML_TOK_INVALID with *nextTokPtr at the first mismatching character, or
 * XML_TOK_PARTIAL when fewer than six characters are available. */
static int PTRCALL
PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
                         const char **nextTokPtr) {
  static const char CDATA_LSQB[]
      = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
  int i;
  UNUSED_P(enc);
  /* CDATA[ */
  REQUIRE_CHARS(enc, ptr, end, 6);
  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
    if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_CDATA_SECT_OPEN;
}

/* Tokenizes the interior of a CDATA section.
 *
 * Returns XML_TOK_NONE when [ptr, end) is empty, XML_TOK_CDATA_SECT_CLOSE
 * for "]]>", XML_TOK_DATA_NEWLINE for CR, CR LF or LF, and otherwise
 * XML_TOK_DATA_CHARS for a run of data that stops before the next
 * newline, ']' or problematic byte.  XML_TOK_PARTIAL /
 * XML_TOK_PARTIAL_CHAR signal that more input is needed and
 * XML_TOK_INVALID marks an invalid first character. */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                        const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Drop trailing bytes that do not form a whole minimum-width unit.
     * The mask arithmetic presumably relies on MINBPC(enc) being a power
     * of two -- confirm against the including file. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': step back so the second ']' is
       * examined again by the data loop below */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* extend the data run as far as possible */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
#  define LEAD_CASE(n) \
  case BT_LEAD##n: \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_DATA_CHARS; \
    } \
    ptr += n; \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      /* end the run here; the next call deals with this character */
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}

/* ptr points to character following "</" */

static
int PTRCALL
PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr) {
  /* Scans an end tag: a name, optional trailing whitespace, then '>'.
   * Returns XML_TOK_END_TAG with *nextTokPtr just past '>',
   * XML_TOK_INVALID with *nextTokPtr at the offending character, or
   * XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* after the name only whitespace and the closing '>' may follow */
      for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_GT:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_END_TAG;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      return XML_TOK_PARTIAL;
#  ifdef XML_NS
    case BT_COLON:
      /* no need to check qname syntax here,
         since end-tag must match exactly */
      ptr += MINBPC(enc);
      break;
#  endif
    case BT_GT:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_END_TAG;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "&#X" */

/* Scans the digits of a hexadecimal character reference up to ';'.
 * The first character must have byte type BT_DIGIT or BT_HEX.
 * Returns XML_TOK_CHAR_REF with *nextTokPtr just past ';',
 * XML_TOK_INVALID with *nextTokPtr at the offending character, or
 * XML_TOK_PARTIAL when the input ends first. */
static int PTRCALL
PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
                       const char **nextTokPtr) {
  if (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_DIGIT:
    case BT_HEX:
      break;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_DIGIT:
      case BT_HEX:
        break;
      case BT_SEMI:
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_CHAR_REF;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "&#" */

static
int PTRCALL 511 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end, 512 const char **nextTokPtr) { 513 if (HAS_CHAR(enc, ptr, end)) { 514 if (CHAR_MATCHES(enc, ptr, ASCII_x)) 515 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 516 switch (BYTE_TYPE(enc, ptr)) { 517 case BT_DIGIT: 518 break; 519 default: 520 *nextTokPtr = ptr; 521 return XML_TOK_INVALID; 522 } 523 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 524 switch (BYTE_TYPE(enc, ptr)) { 525 case BT_DIGIT: 526 break; 527 case BT_SEMI: 528 *nextTokPtr = ptr + MINBPC(enc); 529 return XML_TOK_CHAR_REF; 530 default: 531 *nextTokPtr = ptr; 532 return XML_TOK_INVALID; 533 } 534 } 535 } 536 return XML_TOK_PARTIAL; 537 } 538 539 /* ptr points to character following "&" */ 540 541 static int PTRCALL 542 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, 543 const char **nextTokPtr) { 544 REQUIRE_CHAR(enc, ptr, end); 545 switch (BYTE_TYPE(enc, ptr)) { 546 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 547 case BT_NUM: 548 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 549 default: 550 *nextTokPtr = ptr; 551 return XML_TOK_INVALID; 552 } 553 while (HAS_CHAR(enc, ptr, end)) { 554 switch (BYTE_TYPE(enc, ptr)) { 555 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 556 case BT_SEMI: 557 *nextTokPtr = ptr + MINBPC(enc); 558 return XML_TOK_ENTITY_REF; 559 default: 560 *nextTokPtr = ptr; 561 return XML_TOK_INVALID; 562 } 563 } 564 return XML_TOK_PARTIAL; 565 } 566 567 /* ptr points to character following first character of attribute name */ 568 569 static int PTRCALL 570 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, 571 const char **nextTokPtr) { 572 # ifdef XML_NS 573 int hadColon = 0; 574 # endif 575 while (HAS_CHAR(enc, ptr, end)) { 576 switch (BYTE_TYPE(enc, ptr)) { 577 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 578 # ifdef XML_NS 579 case BT_COLON: 580 if (hadColon) { 581 *nextTokPtr = 
ptr; 582 return XML_TOK_INVALID; 583 } 584 hadColon = 1; 585 ptr += MINBPC(enc); 586 REQUIRE_CHAR(enc, ptr, end); 587 switch (BYTE_TYPE(enc, ptr)) { 588 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 589 default: 590 *nextTokPtr = ptr; 591 return XML_TOK_INVALID; 592 } 593 break; 594 # endif 595 case BT_S: 596 case BT_CR: 597 case BT_LF: 598 for (;;) { 599 int t; 600 601 ptr += MINBPC(enc); 602 REQUIRE_CHAR(enc, ptr, end); 603 t = BYTE_TYPE(enc, ptr); 604 if (t == BT_EQUALS) 605 break; 606 switch (t) { 607 case BT_S: 608 case BT_LF: 609 case BT_CR: 610 break; 611 default: 612 *nextTokPtr = ptr; 613 return XML_TOK_INVALID; 614 } 615 } 616 /* fall through */ 617 case BT_EQUALS: { 618 int open; 619 # ifdef XML_NS 620 hadColon = 0; 621 # endif 622 for (;;) { 623 ptr += MINBPC(enc); 624 REQUIRE_CHAR(enc, ptr, end); 625 open = BYTE_TYPE(enc, ptr); 626 if (open == BT_QUOT || open == BT_APOS) 627 break; 628 switch (open) { 629 case BT_S: 630 case BT_LF: 631 case BT_CR: 632 break; 633 default: 634 *nextTokPtr = ptr; 635 return XML_TOK_INVALID; 636 } 637 } 638 ptr += MINBPC(enc); 639 /* in attribute value */ 640 for (;;) { 641 int t; 642 REQUIRE_CHAR(enc, ptr, end); 643 t = BYTE_TYPE(enc, ptr); 644 if (t == open) 645 break; 646 switch (t) { 647 INVALID_CASES(ptr, nextTokPtr) 648 case BT_AMP: { 649 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr); 650 if (tok <= 0) { 651 if (tok == XML_TOK_INVALID) 652 *nextTokPtr = ptr; 653 return tok; 654 } 655 break; 656 } 657 case BT_LT: 658 *nextTokPtr = ptr; 659 return XML_TOK_INVALID; 660 default: 661 ptr += MINBPC(enc); 662 break; 663 } 664 } 665 ptr += MINBPC(enc); 666 REQUIRE_CHAR(enc, ptr, end); 667 switch (BYTE_TYPE(enc, ptr)) { 668 case BT_S: 669 case BT_CR: 670 case BT_LF: 671 break; 672 case BT_SOL: 673 goto sol; 674 case BT_GT: 675 goto gt; 676 default: 677 *nextTokPtr = ptr; 678 return XML_TOK_INVALID; 679 } 680 /* ptr points to closing quote */ 681 for (;;) { 682 ptr += MINBPC(enc); 683 REQUIRE_CHAR(enc, ptr, 
end); 684 switch (BYTE_TYPE(enc, ptr)) { 685 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 686 case BT_S: 687 case BT_CR: 688 case BT_LF: 689 continue; 690 case BT_GT: 691 gt: 692 *nextTokPtr = ptr + MINBPC(enc); 693 return XML_TOK_START_TAG_WITH_ATTS; 694 case BT_SOL: 695 sol: 696 ptr += MINBPC(enc); 697 REQUIRE_CHAR(enc, ptr, end); 698 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) { 699 *nextTokPtr = ptr; 700 return XML_TOK_INVALID; 701 } 702 *nextTokPtr = ptr + MINBPC(enc); 703 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; 704 default: 705 *nextTokPtr = ptr; 706 return XML_TOK_INVALID; 707 } 708 break; 709 } 710 break; 711 } 712 default: 713 *nextTokPtr = ptr; 714 return XML_TOK_INVALID; 715 } 716 } 717 return XML_TOK_PARTIAL; 718 } 719 720 /* ptr points to character following "<" */ 721 722 static int PTRCALL 723 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, 724 const char **nextTokPtr) { 725 # ifdef XML_NS 726 int hadColon; 727 # endif 728 REQUIRE_CHAR(enc, ptr, end); 729 switch (BYTE_TYPE(enc, ptr)) { 730 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 731 case BT_EXCL: 732 ptr += MINBPC(enc); 733 REQUIRE_CHAR(enc, ptr, end); 734 switch (BYTE_TYPE(enc, ptr)) { 735 case BT_MINUS: 736 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 737 case BT_LSQB: 738 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr); 739 } 740 *nextTokPtr = ptr; 741 return XML_TOK_INVALID; 742 case BT_QUEST: 743 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 744 case BT_SOL: 745 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr); 746 default: 747 *nextTokPtr = ptr; 748 return XML_TOK_INVALID; 749 } 750 # ifdef XML_NS 751 hadColon = 0; 752 # endif 753 /* we have a start-tag */ 754 while (HAS_CHAR(enc, ptr, end)) { 755 switch (BYTE_TYPE(enc, ptr)) { 756 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 757 # ifdef XML_NS 758 case BT_COLON: 759 if (hadColon) { 760 *nextTokPtr = ptr; 761 return XML_TOK_INVALID; 762 
} 763 hadColon = 1; 764 ptr += MINBPC(enc); 765 REQUIRE_CHAR(enc, ptr, end); 766 switch (BYTE_TYPE(enc, ptr)) { 767 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 768 default: 769 *nextTokPtr = ptr; 770 return XML_TOK_INVALID; 771 } 772 break; 773 # endif 774 case BT_S: 775 case BT_CR: 776 case BT_LF: { 777 ptr += MINBPC(enc); 778 while (HAS_CHAR(enc, ptr, end)) { 779 switch (BYTE_TYPE(enc, ptr)) { 780 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 781 case BT_GT: 782 goto gt; 783 case BT_SOL: 784 goto sol; 785 case BT_S: 786 case BT_CR: 787 case BT_LF: 788 ptr += MINBPC(enc); 789 continue; 790 default: 791 *nextTokPtr = ptr; 792 return XML_TOK_INVALID; 793 } 794 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr); 795 } 796 return XML_TOK_PARTIAL; 797 } 798 case BT_GT: 799 gt: 800 *nextTokPtr = ptr + MINBPC(enc); 801 return XML_TOK_START_TAG_NO_ATTS; 802 case BT_SOL: 803 sol: 804 ptr += MINBPC(enc); 805 REQUIRE_CHAR(enc, ptr, end); 806 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) { 807 *nextTokPtr = ptr; 808 return XML_TOK_INVALID; 809 } 810 *nextTokPtr = ptr + MINBPC(enc); 811 return XML_TOK_EMPTY_ELEMENT_NO_ATTS; 812 default: 813 *nextTokPtr = ptr; 814 return XML_TOK_INVALID; 815 } 816 } 817 return XML_TOK_PARTIAL; 818 } 819 820 static int PTRCALL 821 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, 822 const char **nextTokPtr) { 823 if (ptr >= end) 824 return XML_TOK_NONE; 825 if (MINBPC(enc) > 1) { 826 size_t n = end - ptr; 827 if (n & (MINBPC(enc) - 1)) { 828 n &= ~(MINBPC(enc) - 1); 829 if (n == 0) 830 return XML_TOK_PARTIAL; 831 end = ptr + n; 832 } 833 } 834 switch (BYTE_TYPE(enc, ptr)) { 835 case BT_LT: 836 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr); 837 case BT_AMP: 838 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 839 case BT_CR: 840 ptr += MINBPC(enc); 841 if (! 
HAS_CHAR(enc, ptr, end)) 842 return XML_TOK_TRAILING_CR; 843 if (BYTE_TYPE(enc, ptr) == BT_LF) 844 ptr += MINBPC(enc); 845 *nextTokPtr = ptr; 846 return XML_TOK_DATA_NEWLINE; 847 case BT_LF: 848 *nextTokPtr = ptr + MINBPC(enc); 849 return XML_TOK_DATA_NEWLINE; 850 case BT_RSQB: 851 ptr += MINBPC(enc); 852 if (! HAS_CHAR(enc, ptr, end)) 853 return XML_TOK_TRAILING_RSQB; 854 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 855 break; 856 ptr += MINBPC(enc); 857 if (! HAS_CHAR(enc, ptr, end)) 858 return XML_TOK_TRAILING_RSQB; 859 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) { 860 ptr -= MINBPC(enc); 861 break; 862 } 863 *nextTokPtr = ptr; 864 return XML_TOK_INVALID; 865 INVALID_CASES(ptr, nextTokPtr) 866 default: 867 ptr += MINBPC(enc); 868 break; 869 } 870 while (HAS_CHAR(enc, ptr, end)) { 871 switch (BYTE_TYPE(enc, ptr)) { 872 # define LEAD_CASE(n) \ 873 case BT_LEAD##n: \ 874 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 875 *nextTokPtr = ptr; \ 876 return XML_TOK_DATA_CHARS; \ 877 } \ 878 ptr += n; \ 879 break; 880 LEAD_CASE(2) 881 LEAD_CASE(3) 882 LEAD_CASE(4) 883 # undef LEAD_CASE 884 case BT_RSQB: 885 if (HAS_CHARS(enc, ptr, end, 2)) { 886 if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) { 887 ptr += MINBPC(enc); 888 break; 889 } 890 if (HAS_CHARS(enc, ptr, end, 3)) { 891 if (! 
CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) { 892 ptr += MINBPC(enc); 893 break; 894 } 895 *nextTokPtr = ptr + 2 * MINBPC(enc); 896 return XML_TOK_INVALID; 897 } 898 } 899 /* fall through */ 900 case BT_AMP: 901 case BT_LT: 902 case BT_NONXML: 903 case BT_MALFORM: 904 case BT_TRAIL: 905 case BT_CR: 906 case BT_LF: 907 *nextTokPtr = ptr; 908 return XML_TOK_DATA_CHARS; 909 default: 910 ptr += MINBPC(enc); 911 break; 912 } 913 } 914 *nextTokPtr = ptr; 915 return XML_TOK_DATA_CHARS; 916 } 917 918 /* ptr points to character following "%" */ 919 920 static int PTRCALL 921 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, 922 const char **nextTokPtr) { 923 REQUIRE_CHAR(enc, ptr, end); 924 switch (BYTE_TYPE(enc, ptr)) { 925 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 926 case BT_S: 927 case BT_LF: 928 case BT_CR: 929 case BT_PERCNT: 930 *nextTokPtr = ptr; 931 return XML_TOK_PERCENT; 932 default: 933 *nextTokPtr = ptr; 934 return XML_TOK_INVALID; 935 } 936 while (HAS_CHAR(enc, ptr, end)) { 937 switch (BYTE_TYPE(enc, ptr)) { 938 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 939 case BT_SEMI: 940 *nextTokPtr = ptr + MINBPC(enc); 941 return XML_TOK_PARAM_ENTITY_REF; 942 default: 943 *nextTokPtr = ptr; 944 return XML_TOK_INVALID; 945 } 946 } 947 return XML_TOK_PARTIAL; 948 } 949 950 static int PTRCALL 951 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, 952 const char **nextTokPtr) { 953 REQUIRE_CHAR(enc, ptr, end); 954 switch (BYTE_TYPE(enc, ptr)) { 955 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 956 default: 957 *nextTokPtr = ptr; 958 return XML_TOK_INVALID; 959 } 960 while (HAS_CHAR(enc, ptr, end)) { 961 switch (BYTE_TYPE(enc, ptr)) { 962 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 963 case BT_CR: 964 case BT_LF: 965 case BT_S: 966 case BT_RPAR: 967 case BT_GT: 968 case BT_PERCNT: 969 case BT_VERBAR: 970 *nextTokPtr = ptr; 971 return XML_TOK_POUND_NAME; 972 default: 973 *nextTokPtr = ptr; 974 return 
XML_TOK_INVALID; 975 } 976 } 977 return -XML_TOK_POUND_NAME; 978 } 979 980 static int PTRCALL 981 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end, 982 const char **nextTokPtr) { 983 while (HAS_CHAR(enc, ptr, end)) { 984 int t = BYTE_TYPE(enc, ptr); 985 switch (t) { 986 INVALID_CASES(ptr, nextTokPtr) 987 case BT_QUOT: 988 case BT_APOS: 989 ptr += MINBPC(enc); 990 if (t != open) 991 break; 992 if (! HAS_CHAR(enc, ptr, end)) 993 return -XML_TOK_LITERAL; 994 *nextTokPtr = ptr; 995 switch (BYTE_TYPE(enc, ptr)) { 996 case BT_S: 997 case BT_CR: 998 case BT_LF: 999 case BT_GT: 1000 case BT_PERCNT: 1001 case BT_LSQB: 1002 return XML_TOK_LITERAL; 1003 default: 1004 return XML_TOK_INVALID; 1005 } 1006 default: 1007 ptr += MINBPC(enc); 1008 break; 1009 } 1010 } 1011 return XML_TOK_PARTIAL; 1012 } 1013 1014 static int PTRCALL 1015 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, 1016 const char **nextTokPtr) { 1017 int tok; 1018 if (ptr >= end) 1019 return XML_TOK_NONE; 1020 if (MINBPC(enc) > 1) { 1021 size_t n = end - ptr; 1022 if (n & (MINBPC(enc) - 1)) { 1023 n &= ~(MINBPC(enc) - 1); 1024 if (n == 0) 1025 return XML_TOK_PARTIAL; 1026 end = ptr + n; 1027 } 1028 } 1029 switch (BYTE_TYPE(enc, ptr)) { 1030 case BT_QUOT: 1031 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); 1032 case BT_APOS: 1033 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); 1034 case BT_LT: { 1035 ptr += MINBPC(enc); 1036 REQUIRE_CHAR(enc, ptr, end); 1037 switch (BYTE_TYPE(enc, ptr)) { 1038 case BT_EXCL: 1039 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1040 case BT_QUEST: 1041 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1042 case BT_NMSTRT: 1043 case BT_HEX: 1044 case BT_NONASCII: 1045 case BT_LEAD2: 1046 case BT_LEAD3: 1047 case BT_LEAD4: 1048 *nextTokPtr = ptr - MINBPC(enc); 1049 return XML_TOK_INSTANCE_START; 1050 } 1051 *nextTokPtr = ptr; 1052 return 
XML_TOK_INVALID; 1053 } 1054 case BT_CR: 1055 if (ptr + MINBPC(enc) == end) { 1056 *nextTokPtr = end; 1057 /* indicate that this might be part of a CR/LF pair */ 1058 return -XML_TOK_PROLOG_S; 1059 } 1060 /* fall through */ 1061 case BT_S: 1062 case BT_LF: 1063 for (;;) { 1064 ptr += MINBPC(enc); 1065 if (! HAS_CHAR(enc, ptr, end)) 1066 break; 1067 switch (BYTE_TYPE(enc, ptr)) { 1068 case BT_S: 1069 case BT_LF: 1070 break; 1071 case BT_CR: 1072 /* don't split CR/LF pair */ 1073 if (ptr + MINBPC(enc) != end) 1074 break; 1075 /* fall through */ 1076 default: 1077 *nextTokPtr = ptr; 1078 return XML_TOK_PROLOG_S; 1079 } 1080 } 1081 *nextTokPtr = ptr; 1082 return XML_TOK_PROLOG_S; 1083 case BT_PERCNT: 1084 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1085 case BT_COMMA: 1086 *nextTokPtr = ptr + MINBPC(enc); 1087 return XML_TOK_COMMA; 1088 case BT_LSQB: 1089 *nextTokPtr = ptr + MINBPC(enc); 1090 return XML_TOK_OPEN_BRACKET; 1091 case BT_RSQB: 1092 ptr += MINBPC(enc); 1093 if (! HAS_CHAR(enc, ptr, end)) 1094 return -XML_TOK_CLOSE_BRACKET; 1095 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1096 REQUIRE_CHARS(enc, ptr, end, 2); 1097 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { 1098 *nextTokPtr = ptr + 2 * MINBPC(enc); 1099 return XML_TOK_COND_SECT_CLOSE; 1100 } 1101 } 1102 *nextTokPtr = ptr; 1103 return XML_TOK_CLOSE_BRACKET; 1104 case BT_LPAR: 1105 *nextTokPtr = ptr + MINBPC(enc); 1106 return XML_TOK_OPEN_PAREN; 1107 case BT_RPAR: 1108 ptr += MINBPC(enc); 1109 if (! 
HAS_CHAR(enc, ptr, end)) 1110 return -XML_TOK_CLOSE_PAREN; 1111 switch (BYTE_TYPE(enc, ptr)) { 1112 case BT_AST: 1113 *nextTokPtr = ptr + MINBPC(enc); 1114 return XML_TOK_CLOSE_PAREN_ASTERISK; 1115 case BT_QUEST: 1116 *nextTokPtr = ptr + MINBPC(enc); 1117 return XML_TOK_CLOSE_PAREN_QUESTION; 1118 case BT_PLUS: 1119 *nextTokPtr = ptr + MINBPC(enc); 1120 return XML_TOK_CLOSE_PAREN_PLUS; 1121 case BT_CR: 1122 case BT_LF: 1123 case BT_S: 1124 case BT_GT: 1125 case BT_COMMA: 1126 case BT_VERBAR: 1127 case BT_RPAR: 1128 *nextTokPtr = ptr; 1129 return XML_TOK_CLOSE_PAREN; 1130 } 1131 *nextTokPtr = ptr; 1132 return XML_TOK_INVALID; 1133 case BT_VERBAR: 1134 *nextTokPtr = ptr + MINBPC(enc); 1135 return XML_TOK_OR; 1136 case BT_GT: 1137 *nextTokPtr = ptr + MINBPC(enc); 1138 return XML_TOK_DECL_CLOSE; 1139 case BT_NUM: 1140 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1141 # define LEAD_CASE(n) \ 1142 case BT_LEAD##n: \ 1143 if (end - ptr < n) \ 1144 return XML_TOK_PARTIAL_CHAR; \ 1145 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ 1146 ptr += n; \ 1147 tok = XML_TOK_NAME; \ 1148 break; \ 1149 } \ 1150 if (IS_NAME_CHAR(enc, ptr, n)) { \ 1151 ptr += n; \ 1152 tok = XML_TOK_NMTOKEN; \ 1153 break; \ 1154 } \ 1155 *nextTokPtr = ptr; \ 1156 return XML_TOK_INVALID; 1157 LEAD_CASE(2) 1158 LEAD_CASE(3) 1159 LEAD_CASE(4) 1160 # undef LEAD_CASE 1161 case BT_NMSTRT: 1162 case BT_HEX: 1163 tok = XML_TOK_NAME; 1164 ptr += MINBPC(enc); 1165 break; 1166 case BT_DIGIT: 1167 case BT_NAME: 1168 case BT_MINUS: 1169 # ifdef XML_NS 1170 case BT_COLON: 1171 # endif 1172 tok = XML_TOK_NMTOKEN; 1173 ptr += MINBPC(enc); 1174 break; 1175 case BT_NONASCII: 1176 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { 1177 ptr += MINBPC(enc); 1178 tok = XML_TOK_NAME; 1179 break; 1180 } 1181 if (IS_NAME_CHAR_MINBPC(enc, ptr)) { 1182 ptr += MINBPC(enc); 1183 tok = XML_TOK_NMTOKEN; 1184 break; 1185 } 1186 /* fall through */ 1187 default: 1188 *nextTokPtr = ptr; 1189 return XML_TOK_INVALID; 1190 } 1191 while 
(HAS_CHAR(enc, ptr, end)) { 1192 switch (BYTE_TYPE(enc, ptr)) { 1193 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1194 case BT_GT: 1195 case BT_RPAR: 1196 case BT_COMMA: 1197 case BT_VERBAR: 1198 case BT_LSQB: 1199 case BT_PERCNT: 1200 case BT_S: 1201 case BT_CR: 1202 case BT_LF: 1203 *nextTokPtr = ptr; 1204 return tok; 1205 # ifdef XML_NS 1206 case BT_COLON: 1207 ptr += MINBPC(enc); 1208 switch (tok) { 1209 case XML_TOK_NAME: 1210 REQUIRE_CHAR(enc, ptr, end); 1211 tok = XML_TOK_PREFIXED_NAME; 1212 switch (BYTE_TYPE(enc, ptr)) { 1213 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1214 default: 1215 tok = XML_TOK_NMTOKEN; 1216 break; 1217 } 1218 break; 1219 case XML_TOK_PREFIXED_NAME: 1220 tok = XML_TOK_NMTOKEN; 1221 break; 1222 } 1223 break; 1224 # endif 1225 case BT_PLUS: 1226 if (tok == XML_TOK_NMTOKEN) { 1227 *nextTokPtr = ptr; 1228 return XML_TOK_INVALID; 1229 } 1230 *nextTokPtr = ptr + MINBPC(enc); 1231 return XML_TOK_NAME_PLUS; 1232 case BT_AST: 1233 if (tok == XML_TOK_NMTOKEN) { 1234 *nextTokPtr = ptr; 1235 return XML_TOK_INVALID; 1236 } 1237 *nextTokPtr = ptr + MINBPC(enc); 1238 return XML_TOK_NAME_ASTERISK; 1239 case BT_QUEST: 1240 if (tok == XML_TOK_NMTOKEN) { 1241 *nextTokPtr = ptr; 1242 return XML_TOK_INVALID; 1243 } 1244 *nextTokPtr = ptr + MINBPC(enc); 1245 return XML_TOK_NAME_QUESTION; 1246 default: 1247 *nextTokPtr = ptr; 1248 return XML_TOK_INVALID; 1249 } 1250 } 1251 return -tok; 1252 } 1253 1254 static int PTRCALL 1255 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, 1256 const char **nextTokPtr) { 1257 const char *start; 1258 if (ptr >= end) 1259 return XML_TOK_NONE; 1260 else if (! HAS_CHAR(enc, ptr, end)) { 1261 /* This line cannot be executed. The incoming data has already 1262 * been tokenized once, so incomplete characters like this have 1263 * already been eliminated from the input. Retaining the paranoia 1264 * check is still valuable, however. 
1265 */ 1266 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */ 1267 } 1268 start = ptr; 1269 while (HAS_CHAR(enc, ptr, end)) { 1270 switch (BYTE_TYPE(enc, ptr)) { 1271 # define LEAD_CASE(n) \ 1272 case BT_LEAD##n: \ 1273 ptr += n; \ 1274 break; 1275 LEAD_CASE(2) 1276 LEAD_CASE(3) 1277 LEAD_CASE(4) 1278 # undef LEAD_CASE 1279 case BT_AMP: 1280 if (ptr == start) 1281 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1282 *nextTokPtr = ptr; 1283 return XML_TOK_DATA_CHARS; 1284 case BT_LT: 1285 /* this is for inside entity references */ 1286 *nextTokPtr = ptr; 1287 return XML_TOK_INVALID; 1288 case BT_LF: 1289 if (ptr == start) { 1290 *nextTokPtr = ptr + MINBPC(enc); 1291 return XML_TOK_DATA_NEWLINE; 1292 } 1293 *nextTokPtr = ptr; 1294 return XML_TOK_DATA_CHARS; 1295 case BT_CR: 1296 if (ptr == start) { 1297 ptr += MINBPC(enc); 1298 if (! HAS_CHAR(enc, ptr, end)) 1299 return XML_TOK_TRAILING_CR; 1300 if (BYTE_TYPE(enc, ptr) == BT_LF) 1301 ptr += MINBPC(enc); 1302 *nextTokPtr = ptr; 1303 return XML_TOK_DATA_NEWLINE; 1304 } 1305 *nextTokPtr = ptr; 1306 return XML_TOK_DATA_CHARS; 1307 case BT_S: 1308 if (ptr == start) { 1309 *nextTokPtr = ptr + MINBPC(enc); 1310 return XML_TOK_ATTRIBUTE_VALUE_S; 1311 } 1312 *nextTokPtr = ptr; 1313 return XML_TOK_DATA_CHARS; 1314 default: 1315 ptr += MINBPC(enc); 1316 break; 1317 } 1318 } 1319 *nextTokPtr = ptr; 1320 return XML_TOK_DATA_CHARS; 1321 } 1322 1323 static int PTRCALL 1324 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, 1325 const char **nextTokPtr) { 1326 const char *start; 1327 if (ptr >= end) 1328 return XML_TOK_NONE; 1329 else if (! HAS_CHAR(enc, ptr, end)) { 1330 /* This line cannot be executed. The incoming data has already 1331 * been tokenized once, so incomplete characters like this have 1332 * already been eliminated from the input. Retaining the paranoia 1333 * check is still valuable, however. 
1334 */ 1335 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */ 1336 } 1337 start = ptr; 1338 while (HAS_CHAR(enc, ptr, end)) { 1339 switch (BYTE_TYPE(enc, ptr)) { 1340 # define LEAD_CASE(n) \ 1341 case BT_LEAD##n: \ 1342 ptr += n; \ 1343 break; 1344 LEAD_CASE(2) 1345 LEAD_CASE(3) 1346 LEAD_CASE(4) 1347 # undef LEAD_CASE 1348 case BT_AMP: 1349 if (ptr == start) 1350 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1351 *nextTokPtr = ptr; 1352 return XML_TOK_DATA_CHARS; 1353 case BT_PERCNT: 1354 if (ptr == start) { 1355 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1356 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; 1357 } 1358 *nextTokPtr = ptr; 1359 return XML_TOK_DATA_CHARS; 1360 case BT_LF: 1361 if (ptr == start) { 1362 *nextTokPtr = ptr + MINBPC(enc); 1363 return XML_TOK_DATA_NEWLINE; 1364 } 1365 *nextTokPtr = ptr; 1366 return XML_TOK_DATA_CHARS; 1367 case BT_CR: 1368 if (ptr == start) { 1369 ptr += MINBPC(enc); 1370 if (! HAS_CHAR(enc, ptr, end)) 1371 return XML_TOK_TRAILING_CR; 1372 if (BYTE_TYPE(enc, ptr) == BT_LF) 1373 ptr += MINBPC(enc); 1374 *nextTokPtr = ptr; 1375 return XML_TOK_DATA_NEWLINE; 1376 } 1377 *nextTokPtr = ptr; 1378 return XML_TOK_DATA_CHARS; 1379 default: 1380 ptr += MINBPC(enc); 1381 break; 1382 } 1383 } 1384 *nextTokPtr = ptr; 1385 return XML_TOK_DATA_CHARS; 1386 } 1387 1388 # ifdef XML_DTD 1389 1390 static int PTRCALL 1391 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end, 1392 const char **nextTokPtr) { 1393 int level = 0; 1394 if (MINBPC(enc) > 1) { 1395 size_t n = end - ptr; 1396 if (n & (MINBPC(enc) - 1)) { 1397 n &= ~(MINBPC(enc) - 1); 1398 end = ptr + n; 1399 } 1400 } 1401 while (HAS_CHAR(enc, ptr, end)) { 1402 switch (BYTE_TYPE(enc, ptr)) { 1403 INVALID_CASES(ptr, nextTokPtr) 1404 case BT_LT: 1405 ptr += MINBPC(enc); 1406 REQUIRE_CHAR(enc, ptr, end); 1407 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { 1408 ptr += MINBPC(enc); 1409 REQUIRE_CHAR(enc, ptr, end); 
1410 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { 1411 ++level; 1412 ptr += MINBPC(enc); 1413 } 1414 } 1415 break; 1416 case BT_RSQB: 1417 ptr += MINBPC(enc); 1418 REQUIRE_CHAR(enc, ptr, end); 1419 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1420 ptr += MINBPC(enc); 1421 REQUIRE_CHAR(enc, ptr, end); 1422 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 1423 ptr += MINBPC(enc); 1424 if (level == 0) { 1425 *nextTokPtr = ptr; 1426 return XML_TOK_IGNORE_SECT; 1427 } 1428 --level; 1429 } 1430 } 1431 break; 1432 default: 1433 ptr += MINBPC(enc); 1434 break; 1435 } 1436 } 1437 return XML_TOK_PARTIAL; 1438 } 1439 1440 # endif /* XML_DTD */ 1441 1442 static int PTRCALL 1443 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, 1444 const char **badPtr) { 1445 ptr += MINBPC(enc); 1446 end -= MINBPC(enc); 1447 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 1448 switch (BYTE_TYPE(enc, ptr)) { 1449 case BT_DIGIT: 1450 case BT_HEX: 1451 case BT_MINUS: 1452 case BT_APOS: 1453 case BT_LPAR: 1454 case BT_RPAR: 1455 case BT_PLUS: 1456 case BT_COMMA: 1457 case BT_SOL: 1458 case BT_EQUALS: 1459 case BT_QUEST: 1460 case BT_CR: 1461 case BT_LF: 1462 case BT_SEMI: 1463 case BT_EXCL: 1464 case BT_AST: 1465 case BT_PERCNT: 1466 case BT_NUM: 1467 # ifdef XML_NS 1468 case BT_COLON: 1469 # endif 1470 break; 1471 case BT_S: 1472 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { 1473 *badPtr = ptr; 1474 return 0; 1475 } 1476 break; 1477 case BT_NAME: 1478 case BT_NMSTRT: 1479 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f)) 1480 break; 1481 /* fall through */ 1482 default: 1483 switch (BYTE_TO_ASCII(enc, ptr)) { 1484 case 0x24: /* $ */ 1485 case 0x40: /* @ */ 1486 break; 1487 default: 1488 *badPtr = ptr; 1489 return 0; 1490 } 1491 break; 1492 } 1493 } 1494 return 1; 1495 } 1496 1497 /* This must only be called for a well-formed start-tag or empty 1498 element tag. Returns the number of attributes. Pointers to the 1499 first attsMax attributes are stored in atts. 
1500 */ 1501 1502 static int PTRCALL 1503 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax, 1504 ATTRIBUTE *atts) { 1505 enum { other, inName, inValue } state = inName; 1506 int nAtts = 0; 1507 int open = 0; /* defined when state == inValue; 1508 initialization just to shut up compilers */ 1509 1510 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { 1511 switch (BYTE_TYPE(enc, ptr)) { 1512 # define START_NAME \ 1513 if (state == other) { \ 1514 if (nAtts < attsMax) { \ 1515 atts[nAtts].name = ptr; \ 1516 atts[nAtts].normalized = 1; \ 1517 } \ 1518 state = inName; \ 1519 } 1520 # define LEAD_CASE(n) \ 1521 case BT_LEAD##n: \ 1522 START_NAME ptr += (n - MINBPC(enc)); \ 1523 break; 1524 LEAD_CASE(2) 1525 LEAD_CASE(3) 1526 LEAD_CASE(4) 1527 # undef LEAD_CASE 1528 case BT_NONASCII: 1529 case BT_NMSTRT: 1530 case BT_HEX: 1531 START_NAME 1532 break; 1533 # undef START_NAME 1534 case BT_QUOT: 1535 if (state != inValue) { 1536 if (nAtts < attsMax) 1537 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1538 state = inValue; 1539 open = BT_QUOT; 1540 } else if (open == BT_QUOT) { 1541 state = other; 1542 if (nAtts < attsMax) 1543 atts[nAtts].valueEnd = ptr; 1544 nAtts++; 1545 } 1546 break; 1547 case BT_APOS: 1548 if (state != inValue) { 1549 if (nAtts < attsMax) 1550 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1551 state = inValue; 1552 open = BT_APOS; 1553 } else if (open == BT_APOS) { 1554 state = other; 1555 if (nAtts < attsMax) 1556 atts[nAtts].valueEnd = ptr; 1557 nAtts++; 1558 } 1559 break; 1560 case BT_AMP: 1561 if (nAtts < attsMax) 1562 atts[nAtts].normalized = 0; 1563 break; 1564 case BT_S: 1565 if (state == inName) 1566 state = other; 1567 else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized 1568 && (ptr == atts[nAtts].valuePtr 1569 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE 1570 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE 1571 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) 1572 atts[nAtts].normalized = 0; 1573 break; 1574 case 
BT_CR: 1575 case BT_LF: 1576 /* This case ensures that the first attribute name is counted 1577 Apart from that we could just change state on the quote. */ 1578 if (state == inName) 1579 state = other; 1580 else if (state == inValue && nAtts < attsMax) 1581 atts[nAtts].normalized = 0; 1582 break; 1583 case BT_GT: 1584 case BT_SOL: 1585 if (state != inValue) 1586 return nAtts; 1587 break; 1588 default: 1589 break; 1590 } 1591 } 1592 /* not reached */ 1593 } 1594 1595 static int PTRFASTCALL 1596 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) { 1597 int result = 0; 1598 /* skip &# */ 1599 UNUSED_P(enc); 1600 ptr += 2 * MINBPC(enc); 1601 if (CHAR_MATCHES(enc, ptr, ASCII_x)) { 1602 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); 1603 ptr += MINBPC(enc)) { 1604 int c = BYTE_TO_ASCII(enc, ptr); 1605 switch (c) { 1606 case ASCII_0: 1607 case ASCII_1: 1608 case ASCII_2: 1609 case ASCII_3: 1610 case ASCII_4: 1611 case ASCII_5: 1612 case ASCII_6: 1613 case ASCII_7: 1614 case ASCII_8: 1615 case ASCII_9: 1616 result <<= 4; 1617 result |= (c - ASCII_0); 1618 break; 1619 case ASCII_A: 1620 case ASCII_B: 1621 case ASCII_C: 1622 case ASCII_D: 1623 case ASCII_E: 1624 case ASCII_F: 1625 result <<= 4; 1626 result += 10 + (c - ASCII_A); 1627 break; 1628 case ASCII_a: 1629 case ASCII_b: 1630 case ASCII_c: 1631 case ASCII_d: 1632 case ASCII_e: 1633 case ASCII_f: 1634 result <<= 4; 1635 result += 10 + (c - ASCII_a); 1636 break; 1637 } 1638 if (result >= 0x110000) 1639 return -1; 1640 } 1641 } else { 1642 for (; ! 
CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { 1643 int c = BYTE_TO_ASCII(enc, ptr); 1644 result *= 10; 1645 result += (c - ASCII_0); 1646 if (result >= 0x110000) 1647 return -1; 1648 } 1649 } 1650 return checkCharRefNumber(result); 1651 } 1652 1653 static int PTRCALL 1654 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, 1655 const char *end) { 1656 UNUSED_P(enc); 1657 switch ((end - ptr) / MINBPC(enc)) { 1658 case 2: 1659 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { 1660 switch (BYTE_TO_ASCII(enc, ptr)) { 1661 case ASCII_l: 1662 return ASCII_LT; 1663 case ASCII_g: 1664 return ASCII_GT; 1665 } 1666 } 1667 break; 1668 case 3: 1669 if (CHAR_MATCHES(enc, ptr, ASCII_a)) { 1670 ptr += MINBPC(enc); 1671 if (CHAR_MATCHES(enc, ptr, ASCII_m)) { 1672 ptr += MINBPC(enc); 1673 if (CHAR_MATCHES(enc, ptr, ASCII_p)) 1674 return ASCII_AMP; 1675 } 1676 } 1677 break; 1678 case 4: 1679 switch (BYTE_TO_ASCII(enc, ptr)) { 1680 case ASCII_q: 1681 ptr += MINBPC(enc); 1682 if (CHAR_MATCHES(enc, ptr, ASCII_u)) { 1683 ptr += MINBPC(enc); 1684 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1685 ptr += MINBPC(enc); 1686 if (CHAR_MATCHES(enc, ptr, ASCII_t)) 1687 return ASCII_QUOT; 1688 } 1689 } 1690 break; 1691 case ASCII_a: 1692 ptr += MINBPC(enc); 1693 if (CHAR_MATCHES(enc, ptr, ASCII_p)) { 1694 ptr += MINBPC(enc); 1695 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1696 ptr += MINBPC(enc); 1697 if (CHAR_MATCHES(enc, ptr, ASCII_s)) 1698 return ASCII_APOS; 1699 } 1700 } 1701 break; 1702 } 1703 } 1704 return 0; 1705 } 1706 1707 static int PTRCALL 1708 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, 1709 const char *end1, const char *ptr2) { 1710 UNUSED_P(enc); 1711 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { 1712 if (end1 - ptr1 < MINBPC(enc)) { 1713 /* This line cannot be executed. The incoming data has already 1714 * been tokenized once, so incomplete characters like this have 1715 * already been eliminated from the input. 
Retaining the 1716 * paranoia check is still valuable, however. 1717 */ 1718 return 0; /* LCOV_EXCL_LINE */ 1719 } 1720 if (! CHAR_MATCHES(enc, ptr1, *ptr2)) 1721 return 0; 1722 } 1723 return ptr1 == end1; 1724 } 1725 1726 static int PTRFASTCALL 1727 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) { 1728 const char *start = ptr; 1729 for (;;) { 1730 switch (BYTE_TYPE(enc, ptr)) { 1731 # define LEAD_CASE(n) \ 1732 case BT_LEAD##n: \ 1733 ptr += n; \ 1734 break; 1735 LEAD_CASE(2) 1736 LEAD_CASE(3) 1737 LEAD_CASE(4) 1738 # undef LEAD_CASE 1739 case BT_NONASCII: 1740 case BT_NMSTRT: 1741 # ifdef XML_NS 1742 case BT_COLON: 1743 # endif 1744 case BT_HEX: 1745 case BT_DIGIT: 1746 case BT_NAME: 1747 case BT_MINUS: 1748 ptr += MINBPC(enc); 1749 break; 1750 default: 1751 return (int)(ptr - start); 1752 } 1753 } 1754 } 1755 1756 static const char *PTRFASTCALL 1757 PREFIX(skipS)(const ENCODING *enc, const char *ptr) { 1758 for (;;) { 1759 switch (BYTE_TYPE(enc, ptr)) { 1760 case BT_LF: 1761 case BT_CR: 1762 case BT_S: 1763 ptr += MINBPC(enc); 1764 break; 1765 default: 1766 return ptr; 1767 } 1768 } 1769 } 1770 1771 static void PTRCALL 1772 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end, 1773 POSITION *pos) { 1774 while (HAS_CHAR(enc, ptr, end)) { 1775 switch (BYTE_TYPE(enc, ptr)) { 1776 # define LEAD_CASE(n) \ 1777 case BT_LEAD##n: \ 1778 ptr += n; \ 1779 pos->columnNumber++; \ 1780 break; 1781 LEAD_CASE(2) 1782 LEAD_CASE(3) 1783 LEAD_CASE(4) 1784 # undef LEAD_CASE 1785 case BT_LF: 1786 pos->columnNumber = 0; 1787 pos->lineNumber++; 1788 ptr += MINBPC(enc); 1789 break; 1790 case BT_CR: 1791 pos->lineNumber++; 1792 ptr += MINBPC(enc); 1793 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF) 1794 ptr += MINBPC(enc); 1795 pos->columnNumber = 0; 1796 break; 1797 default: 1798 ptr += MINBPC(enc); 1799 pos->columnNumber++; 1800 break; 1801 } 1802 } 1803 } 1804 1805 # undef DO_LEAD_CASE 1806 # undef MULTIBYTE_CASES 1807 # undef 
INVALID_CASES 1808 # undef CHECK_NAME_CASE 1809 # undef CHECK_NAME_CASES 1810 # undef CHECK_NMSTRT_CASE 1811 # undef CHECK_NMSTRT_CASES 1812 1813 #endif /* XML_TOK_IMPL_C */ 1814