1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3 */ 4 5 #ifndef IS_INVALID_CHAR 6 #define IS_INVALID_CHAR(enc, ptr, n) (0) 7 #endif 8 9 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ 10 case BT_LEAD ## n: \ 11 if (end - ptr < n) \ 12 return XML_TOK_PARTIAL_CHAR; \ 13 if (IS_INVALID_CHAR(enc, ptr, n)) { \ 14 *(nextTokPtr) = (ptr); \ 15 return XML_TOK_INVALID; \ 16 } \ 17 ptr += n; \ 18 break; 19 20 #define INVALID_CASES(ptr, nextTokPtr) \ 21 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ 22 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ 23 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ 24 case BT_NONXML: \ 25 case BT_MALFORM: \ 26 case BT_TRAIL: \ 27 *(nextTokPtr) = (ptr); \ 28 return XML_TOK_INVALID; 29 30 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ 31 case BT_LEAD ## n: \ 32 if (end - ptr < n) \ 33 return XML_TOK_PARTIAL_CHAR; \ 34 if (!IS_NAME_CHAR(enc, ptr, n)) { \ 35 *nextTokPtr = ptr; \ 36 return XML_TOK_INVALID; \ 37 } \ 38 ptr += n; \ 39 break; 40 41 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ 42 case BT_NONASCII: \ 43 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ 44 *nextTokPtr = ptr; \ 45 return XML_TOK_INVALID; \ 46 } \ 47 case BT_NMSTRT: \ 48 case BT_HEX: \ 49 case BT_DIGIT: \ 50 case BT_NAME: \ 51 case BT_MINUS: \ 52 ptr += MINBPC(enc); \ 53 break; \ 54 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ 55 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ 56 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) 57 58 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ 59 case BT_LEAD ## n: \ 60 if (end - ptr < n) \ 61 return XML_TOK_PARTIAL_CHAR; \ 62 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ 63 *nextTokPtr = ptr; \ 64 return XML_TOK_INVALID; \ 65 } \ 66 ptr += n; \ 67 break; 68 69 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ 70 case BT_NONASCII: \ 71 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ 72 *nextTokPtr = ptr; \ 73 return XML_TOK_INVALID; \ 74 } \ 75 case BT_NMSTRT: \ 76 case BT_HEX: \ 77 ptr += MINBPC(enc); \ 78 break; \ 79 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ 80 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ 81 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) 82 83 #ifndef PREFIX 84 #define PREFIX(ident) ident 85 #endif 86 87 /* ptr points to character following "<!-" */ 88 89 static int FASTCALL 90 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, 91 const char *end, const char **nextTokPtr) 92 { 93 if (ptr != end) { 94 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 95 *nextTokPtr = ptr; 96 return XML_TOK_INVALID; 97 } 98 ptr += MINBPC(enc); 99 while (ptr != end) { 100 switch (BYTE_TYPE(enc, ptr)) { 101 INVALID_CASES(ptr, nextTokPtr) 102 case BT_MINUS: 103 if ((ptr += MINBPC(enc)) == end) 104 return XML_TOK_PARTIAL; 105 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 106 if ((ptr += MINBPC(enc)) == end) 107 return XML_TOK_PARTIAL; 108 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 109 *nextTokPtr = ptr; 110 return XML_TOK_INVALID; 111 } 112 *nextTokPtr = ptr + MINBPC(enc); 113 return XML_TOK_COMMENT; 114 } 115 break; 116 default: 117 ptr += MINBPC(enc); 118 break; 119 } 120 } 121 } 122 return XML_TOK_PARTIAL; 123 } 124 125 /* ptr points to character following "<!" */ 126 127 static int FASTCALL 128 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, 129 const char *end, const char **nextTokPtr) 130 { 131 if (ptr == end) 132 return XML_TOK_PARTIAL; 133 switch (BYTE_TYPE(enc, ptr)) { 134 case BT_MINUS: 135 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 136 case BT_LSQB: 137 *nextTokPtr = ptr + MINBPC(enc); 138 return XML_TOK_COND_SECT_OPEN; 139 case BT_NMSTRT: 140 case BT_HEX: 141 ptr += MINBPC(enc); 142 break; 143 default: 144 *nextTokPtr = ptr; 145 return XML_TOK_INVALID; 146 } 147 while (ptr != end) { 148 switch (BYTE_TYPE(enc, ptr)) { 149 case BT_PERCNT: 150 if (ptr + MINBPC(enc) == end) 151 return XML_TOK_PARTIAL; 152 /* don't allow <!ENTITY% foo "whatever"> */ 153 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { 154 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: 155 *nextTokPtr = ptr; 156 return XML_TOK_INVALID; 157 } 158 /* fall through */ 159 case BT_S: case BT_CR: case BT_LF: 160 *nextTokPtr = ptr; 161 return XML_TOK_DECL_OPEN; 162 case BT_NMSTRT: 163 case BT_HEX: 164 ptr += MINBPC(enc); 165 break; 166 default: 167 *nextTokPtr = ptr; 168 return XML_TOK_INVALID; 169 } 170 } 171 return XML_TOK_PARTIAL; 172 } 173 174 static int FASTCALL 175 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, 176 const char *end, int *tokPtr) 177 { 178 int upper = 0; 179 *tokPtr = XML_TOK_PI; 180 if (end - ptr != MINBPC(enc)*3) 181 return 1; 182 switch (BYTE_TO_ASCII(enc, ptr)) { 183 case ASCII_x: 184 break; 185 case ASCII_X: 186 upper = 1; 187 break; 188 default: 189 return 1; 190 } 191 ptr += MINBPC(enc); 192 switch (BYTE_TO_ASCII(enc, ptr)) { 193 case ASCII_m: 194 break; 195 case ASCII_M: 196 upper = 1; 197 break; 198 default: 199 return 1; 200 } 201 ptr += MINBPC(enc); 202 switch (BYTE_TO_ASCII(enc, ptr)) { 203 case ASCII_l: 204 break; 205 case ASCII_L: 206 upper = 1; 207 break; 208 default: 209 return 1; 210 } 211 if (upper) 212 return 0; 213 *tokPtr = XML_TOK_XML_DECL; 214 return 1; 215 } 216 217 /* ptr points to character following "<?" */ 218 219 static int FASTCALL 220 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, 221 const char *end, const char **nextTokPtr) 222 { 223 int tok; 224 const char *target = ptr; 225 if (ptr == end) 226 return XML_TOK_PARTIAL; 227 switch (BYTE_TYPE(enc, ptr)) { 228 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 229 default: 230 *nextTokPtr = ptr; 231 return XML_TOK_INVALID; 232 } 233 while (ptr != end) { 234 switch (BYTE_TYPE(enc, ptr)) { 235 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 236 case BT_S: case BT_CR: case BT_LF: 237 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 238 *nextTokPtr = ptr; 239 return XML_TOK_INVALID; 240 } 241 ptr += MINBPC(enc); 242 while (ptr != end) { 243 switch (BYTE_TYPE(enc, ptr)) { 244 INVALID_CASES(ptr, nextTokPtr) 245 case BT_QUEST: 246 ptr += MINBPC(enc); 247 if (ptr == end) 248 return XML_TOK_PARTIAL; 249 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 250 *nextTokPtr = ptr + MINBPC(enc); 251 return tok; 252 } 253 break; 254 default: 255 ptr += MINBPC(enc); 256 break; 257 } 258 } 259 return XML_TOK_PARTIAL; 260 case BT_QUEST: 261 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 262 *nextTokPtr = ptr; 263 return XML_TOK_INVALID; 264 } 265 ptr += MINBPC(enc); 266 if (ptr == end) 267 return XML_TOK_PARTIAL; 268 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 269 *nextTokPtr = ptr + MINBPC(enc); 270 return tok; 271 } 272 /* fall through */ 273 default: 274 *nextTokPtr = ptr; 275 return XML_TOK_INVALID; 276 } 277 } 278 return XML_TOK_PARTIAL; 279 } 280 281 static int FASTCALL 282 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, 283 const char *end, const char **nextTokPtr) 284 { 285 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, 286 ASCII_T, ASCII_A, ASCII_LSQB }; 287 int i; 288 /* CDATA[ */ 289 if (end - ptr < 6 * MINBPC(enc)) 290 return XML_TOK_PARTIAL; 291 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { 292 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) { 293 *nextTokPtr = ptr; 294 return XML_TOK_INVALID; 295 } 296 } 297 *nextTokPtr = ptr; 298 return XML_TOK_CDATA_SECT_OPEN; 299 } 300 301 static int FASTCALL 302 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, 303 const char *end, const char **nextTokPtr) 304 { 305 if (ptr == end) 306 return XML_TOK_NONE; 307 if (MINBPC(enc) > 1) { 308 size_t n = end - ptr; 309 if (n & (MINBPC(enc) - 1)) { 310 n &= ~(MINBPC(enc) - 1); 311 if (n == 0) 312 return XML_TOK_PARTIAL; 313 end = ptr + n; 314 } 315 } 316 switch (BYTE_TYPE(enc, ptr)) { 317 case BT_RSQB: 318 ptr += MINBPC(enc); 319 if (ptr == end) 320 return XML_TOK_PARTIAL; 321 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 322 break; 323 ptr += MINBPC(enc); 324 if (ptr == end) 325 return XML_TOK_PARTIAL; 326 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 327 ptr -= MINBPC(enc); 328 break; 329 } 330 *nextTokPtr = ptr + MINBPC(enc); 331 return XML_TOK_CDATA_SECT_CLOSE; 332 case BT_CR: 333 ptr += MINBPC(enc); 334 if (ptr == end) 335 return XML_TOK_PARTIAL; 336 if (BYTE_TYPE(enc, ptr) == BT_LF) 337 ptr += MINBPC(enc); 338 *nextTokPtr = ptr; 339 return XML_TOK_DATA_NEWLINE; 340 case BT_LF: 341 *nextTokPtr = ptr + MINBPC(enc); 342 return XML_TOK_DATA_NEWLINE; 343 INVALID_CASES(ptr, nextTokPtr) 344 default: 345 ptr += MINBPC(enc); 346 break; 347 } 348 while (ptr != end) { 349 switch (BYTE_TYPE(enc, ptr)) { 350 #define LEAD_CASE(n) \ 351 case BT_LEAD ## n: \ 352 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 353 *nextTokPtr = ptr; \ 354 return XML_TOK_DATA_CHARS; \ 355 } \ 356 ptr += n; \ 357 break; 358 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 359 #undef LEAD_CASE 360 case BT_NONXML: 361 case BT_MALFORM: 362 case BT_TRAIL: 363 case BT_CR: 364 case BT_LF: 365 case BT_RSQB: 366 *nextTokPtr = ptr; 367 return XML_TOK_DATA_CHARS; 368 default: 369 ptr += MINBPC(enc); 370 break; 371 } 372 } 373 *nextTokPtr = ptr; 374 return XML_TOK_DATA_CHARS; 375 } 376 377 /* ptr points to character following "</" */ 378 379 static int FASTCALL 380 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, 381 const char *end, const char **nextTokPtr) 382 { 383 if (ptr == end) 384 return XML_TOK_PARTIAL; 385 switch (BYTE_TYPE(enc, ptr)) { 386 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 387 default: 388 *nextTokPtr = ptr; 389 return XML_TOK_INVALID; 390 } 391 while (ptr != end) { 392 switch (BYTE_TYPE(enc, ptr)) { 393 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 394 case BT_S: case BT_CR: case BT_LF: 395 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { 396 switch (BYTE_TYPE(enc, ptr)) { 397 case BT_S: case BT_CR: case BT_LF: 398 break; 399 case BT_GT: 400 *nextTokPtr = ptr + MINBPC(enc); 401 return XML_TOK_END_TAG; 402 default: 403 *nextTokPtr = ptr; 404 return XML_TOK_INVALID; 405 } 406 } 407 return XML_TOK_PARTIAL; 408 #ifdef XML_NS 409 case BT_COLON: 410 /* no need to check qname syntax here, 411 since end-tag must match exactly */ 412 ptr += MINBPC(enc); 413 break; 414 #endif 415 case BT_GT: 416 *nextTokPtr = ptr + MINBPC(enc); 417 return XML_TOK_END_TAG; 418 default: 419 *nextTokPtr = ptr; 420 return XML_TOK_INVALID; 421 } 422 } 423 return XML_TOK_PARTIAL; 424 } 425 426 /* ptr points to character following "&#X" */ 427 428 static int FASTCALL 429 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, 430 const char *end, const char **nextTokPtr) 431 { 432 if (ptr != end) { 433 switch (BYTE_TYPE(enc, ptr)) { 434 case BT_DIGIT: 435 case BT_HEX: 436 break; 437 default: 438 *nextTokPtr = ptr; 439 return XML_TOK_INVALID; 440 } 441 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { 442 switch (BYTE_TYPE(enc, ptr)) { 443 case BT_DIGIT: 444 case BT_HEX: 445 break; 446 case BT_SEMI: 447 *nextTokPtr = ptr + MINBPC(enc); 448 return XML_TOK_CHAR_REF; 449 default: 450 *nextTokPtr = ptr; 451 return XML_TOK_INVALID; 452 } 453 } 454 } 455 return XML_TOK_PARTIAL; 456 } 457 458 /* ptr points to character following "&#" */ 459 460 static int FASTCALL 461 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, 462 const char *end, const char **nextTokPtr) 463 { 464 if (ptr != end) { 465 if (CHAR_MATCHES(enc, ptr, ASCII_x)) 466 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 467 switch (BYTE_TYPE(enc, ptr)) { 468 case BT_DIGIT: 469 break; 470 default: 471 *nextTokPtr = ptr; 472 return XML_TOK_INVALID; 473 } 474 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { 475 switch (BYTE_TYPE(enc, ptr)) { 476 case BT_DIGIT: 477 break; 478 case BT_SEMI: 479 *nextTokPtr = ptr + MINBPC(enc); 480 return XML_TOK_CHAR_REF; 481 default: 482 *nextTokPtr = ptr; 483 return XML_TOK_INVALID; 484 } 485 } 486 } 487 return XML_TOK_PARTIAL; 488 } 489 490 /* ptr points to character following "&" */ 491 492 static int FASTCALL 493 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, 494 const char **nextTokPtr) 495 { 496 if (ptr == end) 497 return XML_TOK_PARTIAL; 498 switch (BYTE_TYPE(enc, ptr)) { 499 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 500 case BT_NUM: 501 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 502 default: 503 *nextTokPtr = ptr; 504 return XML_TOK_INVALID; 505 } 506 while (ptr != end) { 507 switch (BYTE_TYPE(enc, ptr)) { 508 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 509 case BT_SEMI: 510 *nextTokPtr = ptr + MINBPC(enc); 511 return XML_TOK_ENTITY_REF; 512 default: 513 *nextTokPtr = ptr; 514 return XML_TOK_INVALID; 515 } 516 } 517 return XML_TOK_PARTIAL; 518 } 519 520 /* ptr points to character following first character of attribute name */ 521 522 static int FASTCALL 523 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, 524 const char **nextTokPtr) 525 { 526 #ifdef XML_NS 527 int hadColon = 0; 528 #endif 529 while (ptr != end) { 530 switch (BYTE_TYPE(enc, ptr)) { 531 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 532 #ifdef XML_NS 533 case BT_COLON: 534 if (hadColon) { 535 *nextTokPtr = ptr; 536 return XML_TOK_INVALID; 537 } 538 hadColon = 1; 539 ptr += MINBPC(enc); 540 if (ptr == end) 541 return XML_TOK_PARTIAL; 542 switch (BYTE_TYPE(enc, ptr)) { 543 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 544 default: 545 *nextTokPtr = ptr; 546 return XML_TOK_INVALID; 547 } 548 break; 549 #endif 550 case BT_S: case BT_CR: case BT_LF: 551 for (;;) { 552 int t; 553 554 ptr += MINBPC(enc); 555 if (ptr == end) 556 return XML_TOK_PARTIAL; 557 t = BYTE_TYPE(enc, ptr); 558 if (t == BT_EQUALS) 559 break; 560 switch (t) { 561 case BT_S: 562 case BT_LF: 563 case BT_CR: 564 break; 565 default: 566 *nextTokPtr = ptr; 567 return XML_TOK_INVALID; 568 } 569 } 570 /* fall through */ 571 case BT_EQUALS: 572 { 573 int open; 574 #ifdef XML_NS 575 hadColon = 0; 576 #endif 577 for (;;) { 578 ptr += MINBPC(enc); 579 if (ptr == end) 580 return XML_TOK_PARTIAL; 581 open = BYTE_TYPE(enc, ptr); 582 if (open == BT_QUOT || open == BT_APOS) 583 break; 584 switch (open) { 585 case BT_S: 586 case BT_LF: 587 case BT_CR: 588 break; 589 default: 590 *nextTokPtr = ptr; 591 return XML_TOK_INVALID; 592 } 593 } 594 ptr += MINBPC(enc); 595 /* in attribute value */ 596 for (;;) { 597 int t; 598 if (ptr == end) 599 return XML_TOK_PARTIAL; 600 t = BYTE_TYPE(enc, ptr); 601 if (t == open) 602 break; 603 switch (t) { 604 INVALID_CASES(ptr, nextTokPtr) 605 case BT_AMP: 606 { 607 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr); 608 if (tok <= 0) { 609 if (tok == XML_TOK_INVALID) 610 *nextTokPtr = ptr; 611 return tok; 612 } 613 break; 614 } 615 case BT_LT: 616 *nextTokPtr = ptr; 617 return XML_TOK_INVALID; 618 default: 619 ptr += MINBPC(enc); 620 break; 621 } 622 } 623 ptr += MINBPC(enc); 624 if (ptr == end) 625 return XML_TOK_PARTIAL; 626 switch (BYTE_TYPE(enc, ptr)) { 627 case BT_S: 628 case BT_CR: 629 case BT_LF: 630 break; 631 case BT_SOL: 632 goto sol; 633 case BT_GT: 634 goto gt; 635 default: 636 *nextTokPtr = ptr; 637 return XML_TOK_INVALID; 638 } 639 /* ptr points to closing quote */ 640 for (;;) { 641 ptr += MINBPC(enc); 642 if (ptr == end) 643 return XML_TOK_PARTIAL; 644 switch (BYTE_TYPE(enc, ptr)) { 645 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 646 case BT_S: case BT_CR: case BT_LF: 647 continue; 648 case BT_GT: 649 gt: 650 *nextTokPtr = ptr + MINBPC(enc); 651 return XML_TOK_START_TAG_WITH_ATTS; 652 case BT_SOL: 653 sol: 654 ptr += MINBPC(enc); 655 if (ptr == end) 656 return XML_TOK_PARTIAL; 657 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 658 *nextTokPtr = ptr; 659 return XML_TOK_INVALID; 660 } 661 *nextTokPtr = ptr + MINBPC(enc); 662 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; 663 default: 664 *nextTokPtr = ptr; 665 return XML_TOK_INVALID; 666 } 667 break; 668 } 669 break; 670 } 671 default: 672 *nextTokPtr = ptr; 673 return XML_TOK_INVALID; 674 } 675 } 676 return XML_TOK_PARTIAL; 677 } 678 679 /* ptr points to character following "<" */ 680 681 static int FASTCALL 682 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, 683 const char **nextTokPtr) 684 { 685 #ifdef XML_NS 686 int hadColon; 687 #endif 688 if (ptr == end) 689 return XML_TOK_PARTIAL; 690 switch (BYTE_TYPE(enc, ptr)) { 691 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 692 case BT_EXCL: 693 if ((ptr += MINBPC(enc)) == end) 694 return XML_TOK_PARTIAL; 695 switch (BYTE_TYPE(enc, ptr)) { 696 case BT_MINUS: 697 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 698 case BT_LSQB: 699 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), 700 end, nextTokPtr); 701 } 702 *nextTokPtr = ptr; 703 return XML_TOK_INVALID; 704 case BT_QUEST: 705 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 706 case BT_SOL: 707 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr); 708 default: 709 *nextTokPtr = ptr; 710 return XML_TOK_INVALID; 711 } 712 #ifdef XML_NS 713 hadColon = 0; 714 #endif 715 /* we have a start-tag */ 716 while (ptr != end) { 717 switch (BYTE_TYPE(enc, ptr)) { 718 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 719 #ifdef XML_NS 720 case BT_COLON: 721 if (hadColon) { 722 *nextTokPtr = ptr; 723 return XML_TOK_INVALID; 724 } 725 hadColon = 1; 726 ptr += MINBPC(enc); 727 if (ptr == end) 728 return XML_TOK_PARTIAL; 729 switch (BYTE_TYPE(enc, ptr)) { 730 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 731 default: 732 *nextTokPtr = ptr; 733 return XML_TOK_INVALID; 734 } 735 break; 736 #endif 737 case BT_S: case BT_CR: case BT_LF: 738 { 739 ptr += MINBPC(enc); 740 while (ptr != end) { 741 switch (BYTE_TYPE(enc, ptr)) { 742 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 743 case BT_GT: 744 goto gt; 745 case BT_SOL: 746 goto sol; 747 case BT_S: case BT_CR: case BT_LF: 748 ptr += MINBPC(enc); 749 continue; 750 default: 751 *nextTokPtr = ptr; 752 return XML_TOK_INVALID; 753 } 754 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr); 755 } 756 return XML_TOK_PARTIAL; 757 } 758 case BT_GT: 759 gt: 760 *nextTokPtr = ptr + MINBPC(enc); 761 return XML_TOK_START_TAG_NO_ATTS; 762 case BT_SOL: 763 sol: 764 ptr += MINBPC(enc); 765 if (ptr == end) 766 return XML_TOK_PARTIAL; 767 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 768 *nextTokPtr = ptr; 769 return XML_TOK_INVALID; 770 } 771 *nextTokPtr = ptr + MINBPC(enc); 772 return XML_TOK_EMPTY_ELEMENT_NO_ATTS; 773 default: 774 *nextTokPtr = ptr; 775 return XML_TOK_INVALID; 776 } 777 } 778 return XML_TOK_PARTIAL; 779 } 780 781 static int FASTCALL 782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, 783 const char **nextTokPtr) 784 { 785 if (ptr == end) 786 return XML_TOK_NONE; 787 if (MINBPC(enc) > 1) { 788 size_t n = end - ptr; 789 if (n & (MINBPC(enc) - 1)) { 790 n &= ~(MINBPC(enc) - 1); 791 if (n == 0) 792 return XML_TOK_PARTIAL; 793 end = ptr + n; 794 } 795 } 796 switch (BYTE_TYPE(enc, ptr)) { 797 case BT_LT: 798 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr); 799 case BT_AMP: 800 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 801 case BT_CR: 802 ptr += MINBPC(enc); 803 if (ptr == end) 804 return XML_TOK_TRAILING_CR; 805 if (BYTE_TYPE(enc, ptr) == BT_LF) 806 ptr += MINBPC(enc); 807 *nextTokPtr = ptr; 808 return XML_TOK_DATA_NEWLINE; 809 case BT_LF: 810 *nextTokPtr = ptr + MINBPC(enc); 811 return XML_TOK_DATA_NEWLINE; 812 case BT_RSQB: 813 ptr += MINBPC(enc); 814 if (ptr == end) 815 return XML_TOK_TRAILING_RSQB; 816 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 817 break; 818 ptr += MINBPC(enc); 819 if (ptr == end) 820 return XML_TOK_TRAILING_RSQB; 821 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 822 ptr -= MINBPC(enc); 823 break; 824 } 825 *nextTokPtr = ptr; 826 return XML_TOK_INVALID; 827 INVALID_CASES(ptr, nextTokPtr) 828 default: 829 ptr += MINBPC(enc); 830 break; 831 } 832 while (ptr != end) { 833 switch (BYTE_TYPE(enc, ptr)) { 834 #define LEAD_CASE(n) \ 835 case BT_LEAD ## n: \ 836 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 837 *nextTokPtr = ptr; \ 838 return XML_TOK_DATA_CHARS; \ 839 } \ 840 ptr += n; \ 841 break; 842 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 843 #undef LEAD_CASE 844 case BT_RSQB: 845 if (ptr + MINBPC(enc) != end) { 846 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) { 847 ptr += MINBPC(enc); 848 break; 849 } 850 if (ptr + 2*MINBPC(enc) != end) { 851 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) { 852 ptr += MINBPC(enc); 853 break; 854 } 855 *nextTokPtr = ptr + 2*MINBPC(enc); 856 return XML_TOK_INVALID; 857 } 858 } 859 /* fall through */ 860 case BT_AMP: 861 case BT_LT: 862 case BT_NONXML: 863 case BT_MALFORM: 864 case BT_TRAIL: 865 case BT_CR: 866 case BT_LF: 867 *nextTokPtr = ptr; 868 return XML_TOK_DATA_CHARS; 869 default: 870 ptr += MINBPC(enc); 871 break; 872 } 873 } 874 *nextTokPtr = ptr; 875 return XML_TOK_DATA_CHARS; 876 } 877 878 /* ptr points to character following "%" */ 879 880 static int FASTCALL 881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, 882 const char **nextTokPtr) 883 { 884 if (ptr == end) 885 return XML_TOK_PARTIAL; 886 switch (BYTE_TYPE(enc, ptr)) { 887 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 888 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: 889 *nextTokPtr = ptr; 890 return XML_TOK_PERCENT; 891 default: 892 *nextTokPtr = ptr; 893 return XML_TOK_INVALID; 894 } 895 while (ptr != end) { 896 switch (BYTE_TYPE(enc, ptr)) { 897 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 898 case BT_SEMI: 899 *nextTokPtr = ptr + MINBPC(enc); 900 return XML_TOK_PARAM_ENTITY_REF; 901 default: 902 *nextTokPtr = ptr; 903 return XML_TOK_INVALID; 904 } 905 } 906 return XML_TOK_PARTIAL; 907 } 908 909 static int FASTCALL 910 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, 911 const char **nextTokPtr) 912 { 913 if (ptr == end) 914 return XML_TOK_PARTIAL; 915 switch (BYTE_TYPE(enc, ptr)) { 916 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 917 default: 918 *nextTokPtr = ptr; 919 return XML_TOK_INVALID; 920 } 921 while (ptr != end) { 922 switch (BYTE_TYPE(enc, ptr)) { 923 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 924 case BT_CR: case BT_LF: case BT_S: 925 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: 926 *nextTokPtr = ptr; 927 return XML_TOK_POUND_NAME; 928 default: 929 *nextTokPtr = ptr; 930 return XML_TOK_INVALID; 931 } 932 } 933 return -XML_TOK_POUND_NAME; 934 } 935 936 static int FASTCALL 937 PREFIX(scanLit)(int open, const ENCODING *enc, 938 const char *ptr, const char *end, 939 const char **nextTokPtr) 940 { 941 while (ptr != end) { 942 int t = BYTE_TYPE(enc, ptr); 943 switch (t) { 944 INVALID_CASES(ptr, nextTokPtr) 945 case BT_QUOT: 946 case BT_APOS: 947 ptr += MINBPC(enc); 948 if (t != open) 949 break; 950 if (ptr == end) 951 return -XML_TOK_LITERAL; 952 *nextTokPtr = ptr; 953 switch (BYTE_TYPE(enc, ptr)) { 954 case BT_S: case BT_CR: case BT_LF: 955 case BT_GT: case BT_PERCNT: case BT_LSQB: 956 return XML_TOK_LITERAL; 957 default: 958 return XML_TOK_INVALID; 959 } 960 default: 961 ptr += MINBPC(enc); 962 break; 963 } 964 } 965 return XML_TOK_PARTIAL; 966 } 967 968 static int FASTCALL 969 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, 970 const char **nextTokPtr) 971 { 972 int tok; 973 if (ptr == end) 974 return XML_TOK_NONE; 975 if (MINBPC(enc) > 1) { 976 size_t n = end - ptr; 977 if (n & (MINBPC(enc) - 1)) { 978 n &= ~(MINBPC(enc) - 1); 979 if (n == 0) 980 return XML_TOK_PARTIAL; 981 end = ptr + n; 982 } 983 } 984 switch (BYTE_TYPE(enc, ptr)) { 985 case BT_QUOT: 986 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); 987 case BT_APOS: 988 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); 989 case BT_LT: 990 { 991 ptr += MINBPC(enc); 992 if (ptr == end) 993 return XML_TOK_PARTIAL; 994 switch (BYTE_TYPE(enc, ptr)) { 995 case BT_EXCL: 996 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); 997 case BT_QUEST: 998 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 999 case BT_NMSTRT: 1000 case BT_HEX: 1001 case BT_NONASCII: 1002 case BT_LEAD2: 1003 case BT_LEAD3: 1004 case BT_LEAD4: 1005 *nextTokPtr = ptr - MINBPC(enc); 1006 return XML_TOK_INSTANCE_START; 1007 } 1008 *nextTokPtr = ptr; 1009 return XML_TOK_INVALID; 1010 } 1011 case BT_CR: 1012 if (ptr + MINBPC(enc) == end) { 1013 *nextTokPtr = end; 1014 /* indicate that this might be part of a CR/LF pair */ 1015 return -XML_TOK_PROLOG_S; 1016 } 1017 /* fall through */ 1018 case BT_S: case BT_LF: 1019 for (;;) { 1020 ptr += MINBPC(enc); 1021 if (ptr == end) 1022 break; 1023 switch (BYTE_TYPE(enc, ptr)) { 1024 case BT_S: case BT_LF: 1025 break; 1026 case BT_CR: 1027 /* don't split CR/LF pair */ 1028 if (ptr + MINBPC(enc) != end) 1029 break; 1030 /* fall through */ 1031 default: 1032 *nextTokPtr = ptr; 1033 return XML_TOK_PROLOG_S; 1034 } 1035 } 1036 *nextTokPtr = ptr; 1037 return XML_TOK_PROLOG_S; 1038 case BT_PERCNT: 1039 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1040 case BT_COMMA: 1041 *nextTokPtr = ptr + MINBPC(enc); 1042 return XML_TOK_COMMA; 1043 case BT_LSQB: 1044 *nextTokPtr = ptr + MINBPC(enc); 1045 return XML_TOK_OPEN_BRACKET; 1046 case BT_RSQB: 1047 ptr += MINBPC(enc); 1048 if (ptr == end) 1049 return -XML_TOK_CLOSE_BRACKET; 1050 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1051 if (ptr + MINBPC(enc) == end) 1052 return XML_TOK_PARTIAL; 1053 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { 1054 *nextTokPtr = ptr + 2*MINBPC(enc); 1055 return XML_TOK_COND_SECT_CLOSE; 1056 } 1057 } 1058 *nextTokPtr = ptr; 1059 return XML_TOK_CLOSE_BRACKET; 1060 case BT_LPAR: 1061 *nextTokPtr = ptr + MINBPC(enc); 1062 return XML_TOK_OPEN_PAREN; 1063 case BT_RPAR: 1064 ptr += MINBPC(enc); 1065 if (ptr == end) 1066 return -XML_TOK_CLOSE_PAREN; 1067 switch (BYTE_TYPE(enc, ptr)) { 1068 case BT_AST: 1069 *nextTokPtr = ptr + MINBPC(enc); 1070 return XML_TOK_CLOSE_PAREN_ASTERISK; 1071 case BT_QUEST: 1072 *nextTokPtr = ptr + MINBPC(enc); 1073 return XML_TOK_CLOSE_PAREN_QUESTION; 1074 case BT_PLUS: 1075 *nextTokPtr = ptr + MINBPC(enc); 1076 return XML_TOK_CLOSE_PAREN_PLUS; 1077 case BT_CR: case BT_LF: case BT_S: 1078 case BT_GT: case BT_COMMA: case BT_VERBAR: 1079 case BT_RPAR: 1080 *nextTokPtr = ptr; 1081 return XML_TOK_CLOSE_PAREN; 1082 } 1083 *nextTokPtr = ptr; 1084 return XML_TOK_INVALID; 1085 case BT_VERBAR: 1086 *nextTokPtr = ptr + MINBPC(enc); 1087 return XML_TOK_OR; 1088 case BT_GT: 1089 *nextTokPtr = ptr + MINBPC(enc); 1090 return XML_TOK_DECL_CLOSE; 1091 case BT_NUM: 1092 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1093 #define LEAD_CASE(n) \ 1094 case BT_LEAD ## n: \ 1095 if (end - ptr < n) \ 1096 return XML_TOK_PARTIAL_CHAR; \ 1097 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ 1098 ptr += n; \ 1099 tok = XML_TOK_NAME; \ 1100 break; \ 1101 } \ 1102 if (IS_NAME_CHAR(enc, ptr, n)) { \ 1103 ptr += n; \ 1104 tok = XML_TOK_NMTOKEN; \ 1105 break; \ 1106 } \ 1107 *nextTokPtr = ptr; \ 1108 return XML_TOK_INVALID; 1109 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1110 #undef LEAD_CASE 1111 case BT_NMSTRT: 1112 case BT_HEX: 1113 tok = XML_TOK_NAME; 1114 ptr += MINBPC(enc); 1115 break; 1116 case BT_DIGIT: 1117 case BT_NAME: 1118 case BT_MINUS: 1119 #ifdef XML_NS 1120 case BT_COLON: 1121 #endif 1122 tok = XML_TOK_NMTOKEN; 1123 ptr += MINBPC(enc); 1124 break; 1125 case BT_NONASCII: 1126 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { 1127 ptr += MINBPC(enc); 1128 tok = XML_TOK_NAME; 1129 break; 1130 } 1131 if (IS_NAME_CHAR_MINBPC(enc, ptr)) { 1132 ptr += MINBPC(enc); 1133 tok = XML_TOK_NMTOKEN; 1134 break; 1135 } 1136 /* fall through */ 1137 default: 1138 *nextTokPtr = ptr; 1139 return XML_TOK_INVALID; 1140 } 1141 while (ptr != end) { 1142 switch (BYTE_TYPE(enc, ptr)) { 1143 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1144 case BT_GT: case BT_RPAR: case BT_COMMA: 1145 case BT_VERBAR: case BT_LSQB: case BT_PERCNT: 1146 case BT_S: case BT_CR: case BT_LF: 1147 *nextTokPtr = ptr; 1148 return tok; 1149 #ifdef XML_NS 1150 case BT_COLON: 1151 ptr += MINBPC(enc); 1152 switch (tok) { 1153 case XML_TOK_NAME: 1154 if (ptr == end) 1155 return XML_TOK_PARTIAL; 1156 tok = XML_TOK_PREFIXED_NAME; 1157 switch (BYTE_TYPE(enc, ptr)) { 1158 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1159 default: 1160 tok = XML_TOK_NMTOKEN; 1161 break; 1162 } 1163 break; 1164 case XML_TOK_PREFIXED_NAME: 1165 tok = XML_TOK_NMTOKEN; 1166 break; 1167 } 1168 break; 1169 #endif 1170 case BT_PLUS: 1171 if (tok == XML_TOK_NMTOKEN) { 1172 *nextTokPtr = ptr; 1173 return XML_TOK_INVALID; 1174 } 1175 *nextTokPtr = ptr + MINBPC(enc); 1176 return XML_TOK_NAME_PLUS; 1177 case BT_AST: 1178 if (tok == XML_TOK_NMTOKEN) { 1179 *nextTokPtr = ptr; 1180 return XML_TOK_INVALID; 1181 } 1182 *nextTokPtr = ptr + MINBPC(enc); 1183 return XML_TOK_NAME_ASTERISK; 1184 case BT_QUEST: 1185 if (tok == XML_TOK_NMTOKEN) { 1186 *nextTokPtr = ptr; 1187 return XML_TOK_INVALID; 1188 } 1189 *nextTokPtr = ptr + MINBPC(enc); 1190 return XML_TOK_NAME_QUESTION; 1191 default: 1192 *nextTokPtr = ptr; 1193 return XML_TOK_INVALID; 1194 } 1195 } 1196 return -tok; 1197 } 1198 1199 static int FASTCALL 1200 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, 1201 const char *end, const char **nextTokPtr) 1202 { 1203 const char *start; 1204 if (ptr == end) 1205 return XML_TOK_NONE; 1206 start = ptr; 1207 while (ptr != end) { 1208 switch (BYTE_TYPE(enc, ptr)) { 1209 #define LEAD_CASE(n) \ 1210 case BT_LEAD ## n: ptr += n; break; 1211 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1212 #undef LEAD_CASE 1213 case BT_AMP: 1214 if (ptr == start) 1215 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1216 *nextTokPtr = ptr; 1217 return XML_TOK_DATA_CHARS; 1218 case BT_LT: 1219 /* this is for inside entity references */ 1220 *nextTokPtr = ptr; 1221 return XML_TOK_INVALID; 1222 case BT_LF: 1223 if (ptr == start) { 1224 *nextTokPtr = ptr + MINBPC(enc); 1225 return XML_TOK_DATA_NEWLINE; 1226 } 1227 *nextTokPtr = ptr; 1228 return XML_TOK_DATA_CHARS; 1229 case BT_CR: 1230 if (ptr == start) { 1231 ptr += MINBPC(enc); 1232 if (ptr == end) 1233 return XML_TOK_TRAILING_CR; 1234 if (BYTE_TYPE(enc, ptr) == BT_LF) 1235 ptr += MINBPC(enc); 1236 *nextTokPtr = ptr; 1237 return XML_TOK_DATA_NEWLINE; 1238 } 1239 *nextTokPtr = ptr; 1240 return XML_TOK_DATA_CHARS; 1241 case BT_S: 1242 if (ptr == start) { 1243 *nextTokPtr = ptr + MINBPC(enc); 1244 return XML_TOK_ATTRIBUTE_VALUE_S; 1245 } 1246 *nextTokPtr = ptr; 1247 return XML_TOK_DATA_CHARS; 1248 default: 1249 ptr += MINBPC(enc); 1250 break; 1251 } 1252 } 1253 *nextTokPtr = ptr; 1254 return XML_TOK_DATA_CHARS; 1255 } 1256 1257 static int FASTCALL 1258 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, 1259 const char *end, const char **nextTokPtr) 1260 { 1261 const char *start; 1262 if (ptr == end) 1263 return XML_TOK_NONE; 1264 start = ptr; 1265 while (ptr != end) { 1266 switch (BYTE_TYPE(enc, ptr)) { 1267 #define LEAD_CASE(n) \ 1268 case BT_LEAD ## n: ptr += n; break; 1269 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1270 #undef LEAD_CASE 1271 case BT_AMP: 1272 if (ptr == start) 1273 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1274 *nextTokPtr = ptr; 1275 return XML_TOK_DATA_CHARS; 1276 case BT_PERCNT: 1277 if (ptr == start) { 1278 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), 1279 end, nextTokPtr); 1280 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; 1281 } 1282 *nextTokPtr = ptr; 1283 return XML_TOK_DATA_CHARS; 1284 case BT_LF: 1285 if (ptr == start) { 1286 *nextTokPtr = ptr + MINBPC(enc); 1287 return XML_TOK_DATA_NEWLINE; 1288 } 1289 *nextTokPtr = ptr; 1290 return XML_TOK_DATA_CHARS; 1291 case BT_CR: 1292 if (ptr == start) { 1293 ptr += MINBPC(enc); 1294 if (ptr == end) 1295 return XML_TOK_TRAILING_CR; 1296 if (BYTE_TYPE(enc, ptr) == BT_LF) 1297 ptr += MINBPC(enc); 1298 *nextTokPtr = ptr; 1299 return XML_TOK_DATA_NEWLINE; 1300 } 1301 *nextTokPtr = ptr; 1302 return XML_TOK_DATA_CHARS; 1303 default: 1304 ptr += MINBPC(enc); 1305 break; 1306 } 1307 } 1308 *nextTokPtr = ptr; 1309 return XML_TOK_DATA_CHARS; 1310 } 1311 1312 #ifdef XML_DTD 1313 1314 static int FASTCALL 1315 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, 1316 const char *end, const char **nextTokPtr) 1317 { 1318 int level = 0; 1319 if (MINBPC(enc) > 1) { 1320 size_t n = end - ptr; 1321 if (n & (MINBPC(enc) - 1)) { 1322 n &= ~(MINBPC(enc) - 1); 1323 end = ptr + n; 1324 } 1325 } 1326 while (ptr != end) { 1327 switch (BYTE_TYPE(enc, ptr)) { 1328 INVALID_CASES(ptr, nextTokPtr) 1329 case BT_LT: 1330 if ((ptr += MINBPC(enc)) == end) 1331 return XML_TOK_PARTIAL; 1332 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { 1333 if ((ptr += MINBPC(enc)) == end) 1334 return XML_TOK_PARTIAL; 1335 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { 1336 ++level; 1337 ptr += MINBPC(enc); 1338 } 1339 } 1340 break; 1341 case BT_RSQB: 1342 if ((ptr += MINBPC(enc)) == end) 1343 return XML_TOK_PARTIAL; 1344 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1345 if ((ptr += MINBPC(enc)) == end) 1346 return XML_TOK_PARTIAL; 1347 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 1348 ptr += MINBPC(enc); 1349 if (level == 0) { 1350 *nextTokPtr = ptr; 1351 return XML_TOK_IGNORE_SECT; 1352 } 1353 --level; 1354 } 1355 } 1356 break; 1357 default: 1358 ptr += MINBPC(enc); 1359 break; 1360 } 1361 } 1362 return XML_TOK_PARTIAL; 1363 } 1364 1365 #endif /* XML_DTD */ 1366 1367 static int FASTCALL 1368 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, 1369 const char **badPtr) 1370 { 1371 ptr += MINBPC(enc); 1372 end -= MINBPC(enc); 1373 for (; ptr != end; ptr += MINBPC(enc)) { 1374 switch (BYTE_TYPE(enc, ptr)) { 1375 case BT_DIGIT: 1376 case BT_HEX: 1377 case BT_MINUS: 1378 case BT_APOS: 1379 case BT_LPAR: 1380 case BT_RPAR: 1381 case BT_PLUS: 1382 case BT_COMMA: 1383 case BT_SOL: 1384 case BT_EQUALS: 1385 case BT_QUEST: 1386 case BT_CR: 1387 case BT_LF: 1388 case BT_SEMI: 1389 case BT_EXCL: 1390 case BT_AST: 1391 case BT_PERCNT: 1392 case BT_NUM: 1393 #ifdef XML_NS 1394 case BT_COLON: 1395 #endif 1396 break; 1397 case BT_S: 1398 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { 1399 *badPtr = ptr; 1400 return 0; 1401 } 1402 break; 1403 case BT_NAME: 1404 case BT_NMSTRT: 1405 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) 1406 break; 1407 default: 1408 switch (BYTE_TO_ASCII(enc, ptr)) { 1409 case 0x24: /* $ */ 1410 case 0x40: /* @ */ 1411 break; 1412 default: 1413 *badPtr = ptr; 1414 return 0; 1415 } 1416 break; 1417 } 1418 } 1419 return 1; 1420 } 1421 1422 /* This must only be called for a well-formed start-tag or empty 1423 element tag. Returns the number of attributes. Pointers to the 1424 first attsMax attributes are stored in atts. 1425 */ 1426 1427 static int FASTCALL 1428 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, 1429 int attsMax, ATTRIBUTE *atts) 1430 { 1431 enum { other, inName, inValue } state = inName; 1432 int nAtts = 0; 1433 int open = 0; /* defined when state == inValue; 1434 initialization just to shut up compilers */ 1435 1436 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { 1437 switch (BYTE_TYPE(enc, ptr)) { 1438 #define START_NAME \ 1439 if (state == other) { \ 1440 if (nAtts < attsMax) { \ 1441 atts[nAtts].name = ptr; \ 1442 atts[nAtts].normalized = 1; \ 1443 } \ 1444 state = inName; \ 1445 } 1446 #define LEAD_CASE(n) \ 1447 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; 1448 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1449 #undef LEAD_CASE 1450 case BT_NONASCII: 1451 case BT_NMSTRT: 1452 case BT_HEX: 1453 START_NAME 1454 break; 1455 #undef START_NAME 1456 case BT_QUOT: 1457 if (state != inValue) { 1458 if (nAtts < attsMax) 1459 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1460 state = inValue; 1461 open = BT_QUOT; 1462 } 1463 else if (open == BT_QUOT) { 1464 state = other; 1465 if (nAtts < attsMax) 1466 atts[nAtts].valueEnd = ptr; 1467 nAtts++; 1468 } 1469 break; 1470 case BT_APOS: 1471 if (state != inValue) { 1472 if (nAtts < attsMax) 1473 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1474 state = inValue; 1475 open = BT_APOS; 1476 } 1477 else if (open == BT_APOS) { 1478 state = other; 1479 if (nAtts < attsMax) 1480 atts[nAtts].valueEnd = ptr; 1481 nAtts++; 1482 } 1483 break; 1484 case BT_AMP: 1485 if (nAtts < attsMax) 1486 atts[nAtts].normalized = 0; 1487 break; 1488 case BT_S: 1489 if (state == inName) 1490 state = other; 1491 else if (state == inValue 1492 && nAtts < attsMax 1493 && atts[nAtts].normalized 1494 && (ptr == atts[nAtts].valuePtr 1495 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE 1496 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE 1497 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) 1498 atts[nAtts].normalized = 0; 1499 break; 1500 case BT_CR: case BT_LF: 1501 /* This case ensures that the first attribute name is counted 1502 Apart from that we could just change state on the quote. */ 1503 if (state == inName) 1504 state = other; 1505 else if (state == inValue && nAtts < attsMax) 1506 atts[nAtts].normalized = 0; 1507 break; 1508 case BT_GT: 1509 case BT_SOL: 1510 if (state != inValue) 1511 return nAtts; 1512 break; 1513 default: 1514 break; 1515 } 1516 } 1517 /* not reached */ 1518 } 1519 1520 static int FASTCALL 1521 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) 1522 { 1523 int result = 0; 1524 /* skip &# */ 1525 ptr += 2*MINBPC(enc); 1526 if (CHAR_MATCHES(enc, ptr, ASCII_x)) { 1527 for (ptr += MINBPC(enc); 1528 !CHAR_MATCHES(enc, ptr, ASCII_SEMI); 1529 ptr += MINBPC(enc)) { 1530 int c = BYTE_TO_ASCII(enc, ptr); 1531 switch (c) { 1532 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4: 1533 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9: 1534 result <<= 4; 1535 result |= (c - ASCII_0); 1536 break; 1537 case ASCII_A: case ASCII_B: case ASCII_C: 1538 case ASCII_D: case ASCII_E: case ASCII_F: 1539 result <<= 4; 1540 result += 10 + (c - ASCII_A); 1541 break; 1542 case ASCII_a: case ASCII_b: case ASCII_c: 1543 case ASCII_d: case ASCII_e: case ASCII_f: 1544 result <<= 4; 1545 result += 10 + (c - ASCII_a); 1546 break; 1547 } 1548 if (result >= 0x110000) 1549 return -1; 1550 } 1551 } 1552 else { 1553 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { 1554 int c = BYTE_TO_ASCII(enc, ptr); 1555 result *= 10; 1556 result += (c - ASCII_0); 1557 if (result >= 0x110000) 1558 return -1; 1559 } 1560 } 1561 return checkCharRefNumber(result); 1562 } 1563 1564 static int FASTCALL 1565 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, 1566 const char *end) 1567 { 1568 switch ((end - ptr)/MINBPC(enc)) { 1569 case 2: 1570 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { 1571 switch (BYTE_TO_ASCII(enc, ptr)) { 1572 case ASCII_l: 1573 return ASCII_LT; 1574 case ASCII_g: 1575 return ASCII_GT; 1576 } 1577 } 1578 break; 1579 case 3: 1580 if (CHAR_MATCHES(enc, ptr, ASCII_a)) { 1581 ptr += MINBPC(enc); 1582 if (CHAR_MATCHES(enc, ptr, ASCII_m)) { 1583 ptr += MINBPC(enc); 1584 if (CHAR_MATCHES(enc, ptr, ASCII_p)) 1585 return ASCII_AMP; 1586 } 1587 } 1588 break; 1589 case 4: 1590 switch (BYTE_TO_ASCII(enc, ptr)) { 1591 case ASCII_q: 1592 ptr += MINBPC(enc); 1593 if (CHAR_MATCHES(enc, ptr, ASCII_u)) { 1594 ptr += MINBPC(enc); 1595 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1596 ptr += MINBPC(enc); 1597 if (CHAR_MATCHES(enc, ptr, ASCII_t)) 1598 return ASCII_QUOT; 1599 } 1600 } 1601 break; 1602 case ASCII_a: 1603 ptr += MINBPC(enc); 1604 if (CHAR_MATCHES(enc, ptr, ASCII_p)) { 1605 ptr += MINBPC(enc); 1606 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1607 ptr += MINBPC(enc); 1608 if (CHAR_MATCHES(enc, ptr, ASCII_s)) 1609 return ASCII_APOS; 1610 } 1611 } 1612 break; 1613 } 1614 } 1615 return 0; 1616 } 1617 1618 static int FASTCALL 1619 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) 1620 { 1621 for (;;) { 1622 switch (BYTE_TYPE(enc, ptr1)) { 1623 #define LEAD_CASE(n) \ 1624 case BT_LEAD ## n: \ 1625 if (*ptr1++ != *ptr2++) \ 1626 return 0; 1627 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) 1628 #undef LEAD_CASE 1629 /* fall through */ 1630 if (*ptr1++ != *ptr2++) 1631 return 0; 1632 break; 1633 case BT_NONASCII: 1634 case BT_NMSTRT: 1635 #ifdef XML_NS 1636 case BT_COLON: 1637 #endif 1638 case BT_HEX: 1639 case BT_DIGIT: 1640 case BT_NAME: 1641 case BT_MINUS: 1642 if (*ptr2++ != *ptr1++) 1643 return 0; 1644 if (MINBPC(enc) > 1) { 1645 if (*ptr2++ != *ptr1++) 1646 return 0; 1647 if (MINBPC(enc) > 2) { 1648 if (*ptr2++ != *ptr1++) 1649 return 0; 1650 if (MINBPC(enc) > 3) { 1651 if (*ptr2++ != *ptr1++) 1652 return 0; 1653 } 1654 } 1655 } 1656 break; 1657 default: 1658 if (MINBPC(enc) == 1 && *ptr1 == *ptr2) 1659 return 1; 1660 switch (BYTE_TYPE(enc, ptr2)) { 1661 case BT_LEAD2: 1662 case BT_LEAD3: 1663 case BT_LEAD4: 1664 case BT_NONASCII: 1665 case BT_NMSTRT: 1666 #ifdef XML_NS 1667 case BT_COLON: 1668 #endif 1669 case BT_HEX: 1670 case BT_DIGIT: 1671 case BT_NAME: 1672 case BT_MINUS: 1673 return 0; 1674 default: 1675 return 1; 1676 } 1677 } 1678 } 1679 /* not reached */ 1680 } 1681 1682 static int FASTCALL 1683 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, 1684 const char *end1, const char *ptr2) 1685 { 1686 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { 1687 if (ptr1 == end1) 1688 return 0; 1689 if (!CHAR_MATCHES(enc, ptr1, *ptr2)) 1690 return 0; 1691 } 1692 return ptr1 == end1; 1693 } 1694 1695 static int FASTCALL 1696 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) 1697 { 1698 const char *start = ptr; 1699 for (;;) { 1700 switch (BYTE_TYPE(enc, ptr)) { 1701 #define LEAD_CASE(n) \ 1702 case BT_LEAD ## n: ptr += n; break; 1703 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1704 #undef LEAD_CASE 1705 case BT_NONASCII: 1706 case BT_NMSTRT: 1707 #ifdef XML_NS 1708 case BT_COLON: 1709 #endif 1710 case BT_HEX: 1711 case BT_DIGIT: 1712 case BT_NAME: 1713 case BT_MINUS: 1714 ptr += MINBPC(enc); 1715 break; 1716 default: 1717 return ptr - start; 1718 } 1719 } 1720 } 1721 1722 static const char * FASTCALL 1723 PREFIX(skipS)(const ENCODING *enc, const char *ptr) 1724 { 1725 for (;;) { 1726 switch (BYTE_TYPE(enc, ptr)) { 1727 case BT_LF: 1728 case BT_CR: 1729 case BT_S: 1730 ptr += MINBPC(enc); 1731 break; 1732 default: 1733 return ptr; 1734 } 1735 } 1736 } 1737 1738 static void FASTCALL 1739 PREFIX(updatePosition)(const ENCODING *enc, 1740 const char *ptr, 1741 const char *end, 1742 POSITION *pos) 1743 { 1744 while (ptr != end) { 1745 switch (BYTE_TYPE(enc, ptr)) { 1746 #define LEAD_CASE(n) \ 1747 case BT_LEAD ## n: \ 1748 ptr += n; \ 1749 break; 1750 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1751 #undef LEAD_CASE 1752 case BT_LF: 1753 pos->columnNumber = (unsigned)-1; 1754 pos->lineNumber++; 1755 ptr += MINBPC(enc); 1756 break; 1757 case BT_CR: 1758 pos->lineNumber++; 1759 ptr += MINBPC(enc); 1760 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF) 1761 ptr += MINBPC(enc); 1762 pos->columnNumber = (unsigned)-1; 1763 break; 1764 default: 1765 ptr += MINBPC(enc); 1766 break; 1767 } 1768 pos->columnNumber++; 1769 } 1770 } 1771 1772 #undef DO_LEAD_CASE 1773 #undef MULTIBYTE_CASES 1774 #undef INVALID_CASES 1775 #undef CHECK_NAME_CASE 1776 #undef CHECK_NAME_CASES 1777 #undef CHECK_NMSTRT_CASE 1778 #undef CHECK_NMSTRT_CASES 1779