1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3 */ 4 5 /* This file is included! */ 6 #ifdef XML_TOK_IMPL_C 7 8 #ifndef IS_INVALID_CHAR 9 #define IS_INVALID_CHAR(enc, ptr, n) (0) 10 #endif 11 12 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ 13 case BT_LEAD ## n: \ 14 if (end - ptr < n) \ 15 return XML_TOK_PARTIAL_CHAR; \ 16 if (IS_INVALID_CHAR(enc, ptr, n)) { \ 17 *(nextTokPtr) = (ptr); \ 18 return XML_TOK_INVALID; \ 19 } \ 20 ptr += n; \ 21 break; 22 23 #define INVALID_CASES(ptr, nextTokPtr) \ 24 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ 25 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ 26 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ 27 case BT_NONXML: \ 28 case BT_MALFORM: \ 29 case BT_TRAIL: \ 30 *(nextTokPtr) = (ptr); \ 31 return XML_TOK_INVALID; 32 33 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ 34 case BT_LEAD ## n: \ 35 if (end - ptr < n) \ 36 return XML_TOK_PARTIAL_CHAR; \ 37 if (!IS_NAME_CHAR(enc, ptr, n)) { \ 38 *nextTokPtr = ptr; \ 39 return XML_TOK_INVALID; \ 40 } \ 41 ptr += n; \ 42 break; 43 44 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ 45 case BT_NONASCII: \ 46 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ 47 *nextTokPtr = ptr; \ 48 return XML_TOK_INVALID; \ 49 } \ 50 case BT_NMSTRT: \ 51 case BT_HEX: \ 52 case BT_DIGIT: \ 53 case BT_NAME: \ 54 case BT_MINUS: \ 55 ptr += MINBPC(enc); \ 56 break; \ 57 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ 58 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ 59 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) 60 61 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ 62 case BT_LEAD ## n: \ 63 if (end - ptr < n) \ 64 return XML_TOK_PARTIAL_CHAR; \ 65 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ 66 *nextTokPtr = ptr; \ 67 return XML_TOK_INVALID; \ 68 } \ 69 ptr += n; \ 70 break; 71 72 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ 73 case BT_NONASCII: \ 74 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ 75 *nextTokPtr = ptr; \ 76 return 
XML_TOK_INVALID; \ 77 } \ 78 case BT_NMSTRT: \ 79 case BT_HEX: \ 80 ptr += MINBPC(enc); \ 81 break; \ 82 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ 83 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ 84 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) 85 86 #ifndef PREFIX 87 #define PREFIX(ident) ident 88 #endif 89 90 /* ptr points to character following "<!-" */ 91 92 static int PTRCALL 93 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, 94 const char *end, const char **nextTokPtr) 95 { 96 if (ptr != end) { 97 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 98 *nextTokPtr = ptr; 99 return XML_TOK_INVALID; 100 } 101 ptr += MINBPC(enc); 102 while (ptr != end) { 103 switch (BYTE_TYPE(enc, ptr)) { 104 INVALID_CASES(ptr, nextTokPtr) 105 case BT_MINUS: 106 if ((ptr += MINBPC(enc)) == end) 107 return XML_TOK_PARTIAL; 108 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 109 if ((ptr += MINBPC(enc)) == end) 110 return XML_TOK_PARTIAL; 111 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 112 *nextTokPtr = ptr; 113 return XML_TOK_INVALID; 114 } 115 *nextTokPtr = ptr + MINBPC(enc); 116 return XML_TOK_COMMENT; 117 } 118 break; 119 default: 120 ptr += MINBPC(enc); 121 break; 122 } 123 } 124 } 125 return XML_TOK_PARTIAL; 126 } 127 128 /* ptr points to character following "<!" 
*/ 129 130 static int PTRCALL 131 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, 132 const char *end, const char **nextTokPtr) 133 { 134 if (ptr == end) 135 return XML_TOK_PARTIAL; 136 switch (BYTE_TYPE(enc, ptr)) { 137 case BT_MINUS: 138 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 139 case BT_LSQB: 140 *nextTokPtr = ptr + MINBPC(enc); 141 return XML_TOK_COND_SECT_OPEN; 142 case BT_NMSTRT: 143 case BT_HEX: 144 ptr += MINBPC(enc); 145 break; 146 default: 147 *nextTokPtr = ptr; 148 return XML_TOK_INVALID; 149 } 150 while (ptr != end) { 151 switch (BYTE_TYPE(enc, ptr)) { 152 case BT_PERCNT: 153 if (ptr + MINBPC(enc) == end) 154 return XML_TOK_PARTIAL; 155 /* don't allow <!ENTITY% foo "whatever"> */ 156 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { 157 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: 158 *nextTokPtr = ptr; 159 return XML_TOK_INVALID; 160 } 161 /* fall through */ 162 case BT_S: case BT_CR: case BT_LF: 163 *nextTokPtr = ptr; 164 return XML_TOK_DECL_OPEN; 165 case BT_NMSTRT: 166 case BT_HEX: 167 ptr += MINBPC(enc); 168 break; 169 default: 170 *nextTokPtr = ptr; 171 return XML_TOK_INVALID; 172 } 173 } 174 return XML_TOK_PARTIAL; 175 } 176 177 static int PTRCALL 178 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, 179 const char *end, int *tokPtr) 180 { 181 int upper = 0; 182 *tokPtr = XML_TOK_PI; 183 if (end - ptr != MINBPC(enc)*3) 184 return 1; 185 switch (BYTE_TO_ASCII(enc, ptr)) { 186 case ASCII_x: 187 break; 188 case ASCII_X: 189 upper = 1; 190 break; 191 default: 192 return 1; 193 } 194 ptr += MINBPC(enc); 195 switch (BYTE_TO_ASCII(enc, ptr)) { 196 case ASCII_m: 197 break; 198 case ASCII_M: 199 upper = 1; 200 break; 201 default: 202 return 1; 203 } 204 ptr += MINBPC(enc); 205 switch (BYTE_TO_ASCII(enc, ptr)) { 206 case ASCII_l: 207 break; 208 case ASCII_L: 209 upper = 1; 210 break; 211 default: 212 return 1; 213 } 214 if (upper) 215 return 0; 216 *tokPtr = XML_TOK_XML_DECL; 217 return 1; 218 } 219 220 /* 
ptr points to character following "<?" */

/* Scans a processing instruction.  The target name must start with a
   name-start character; once the name ends (at whitespace or '?'),
   checkPiTarget() decides which token is returned on success and may
   reject the target.  On success *nextTokPtr is set just past "?>".
   Returns XML_TOK_INVALID with *nextTokPtr at the offending character,
   or XML_TOK_PARTIAL when the input ends first. */
static int PTRCALL
PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
               const char *end, const char **nextTokPtr)
{
  int tok;
  const char *target = ptr;
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S: case BT_CR: case BT_LF:
      /* target is [target, ptr); tok is set by checkPiTarget */
      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      /* skip the instruction's data up to the first "?>" */
      while (ptr != end) {
        switch (BYTE_TYPE(enc, ptr)) {
        INVALID_CASES(ptr, nextTokPtr)
        case BT_QUEST:
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr + MINBPC(enc);
            return tok;
          }
          break;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      return XML_TOK_PARTIAL;
    case BT_QUEST:
      /* '?' directly after the target: only "?>" is acceptable */
      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr + MINBPC(enc);
        return tok;
      }
      /* fall through */
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "<![" (this is how scanLt calls it).
   Requires the six characters "CDATA[" to follow.  Returns
   XML_TOK_CDATA_SECT_OPEN with *nextTokPtr just past them,
   XML_TOK_INVALID with *nextTokPtr at the first mismatching character,
   or XML_TOK_PARTIAL when fewer than six characters are available. */
static int PTRCALL
PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
                         const char *end, const char **nextTokPtr)
{
  static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
                                     ASCII_T, ASCII_A, ASCII_LSQB };
  int i;
  /* CDATA[ */
  if (end - ptr < 6 * MINBPC(enc))
    return XML_TOK_PARTIAL;
  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
    if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_CDATA_SECT_OPEN;
}

/* Returns the next token inside a CDATA section: XML_TOK_CDATA_SECT_CLOSE
   for "]]>", XML_TOK_DATA_NEWLINE for CR, LF or CR LF, and otherwise
   XML_TOK_DATA_CHARS for a run of data that stops before the next CR,
   LF, ']' or unacceptable byte.  Returns XML_TOK_NONE when ptr == end and
   XML_TOK_PARTIAL when more input is needed to decide.  An invalid first
   character yields XML_TOK_INVALID (via INVALID_CASES). */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
                        const char *end, const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
  /* For multi-byte-unit encodings, ignore trailing bytes that do not
     make up a whole unit.  The mask arithmetic assumes MINBPC(enc) is a
     power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': back up so that the second ']' is
         examined by the data loop below */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* At least one character of data has been accepted; extend the run.
     From here on a bad or truncated character merely ends the run, so
     that it is reported by the next call. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}

/* ptr points to character following "</" */

/* Scans an end tag: a name, optional trailing whitespace, then '>'.
   Returns XML_TOK_END_TAG with *nextTokPtr just past the '>',
   XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL when the input ends first. */
static int PTRCALL
PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
                   const char *end, const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S: case BT_CR: case BT_LF:
      /* after the name only whitespace and the closing '>' may follow */
      for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S: case BT_CR: case BT_LF:
          break;
        case BT_GT:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_END_TAG;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      return XML_TOK_PARTIAL;
#ifdef XML_NS
    case BT_COLON:
      /* no need to check qname syntax here,
         since end-tag must match exactly */
      ptr += MINBPC(enc);
      break;
#endif
    case BT_GT:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_END_TAG;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "&#X" */

/* Scans a hexadecimal character reference: one or more BT_DIGIT/BT_HEX
   characters followed by ';'.  Returns XML_TOK_CHAR_REF with *nextTokPtr
   just past the ';', XML_TOK_INVALID with *nextTokPtr at the offending
   character, or XML_TOK_PARTIAL when the input ends first. */
static int PTRCALL
PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
                       const char *end, const char **nextTokPtr)
{
  if (ptr != end) {
    /* at least one digit is required */
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_DIGIT:
    case BT_HEX:
      break;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
    for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_DIGIT:
      case BT_HEX:
        break;
      case BT_SEMI:
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_CHAR_REF;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "&#" */

/* Scans a character reference.  A leading lower-case 'x' hands the rest
   to scanHexCharRef; otherwise one or more BT_DIGIT characters followed
   by ';' are required.  Return values are as for scanHexCharRef. */
static int PTRCALL
PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
                    const char *end, const char **nextTokPtr)
{
  if (ptr != end) {
    if (CHAR_MATCHES(enc, ptr, ASCII_x))
      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    /* at least one digit is required */
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_DIGIT:
      break;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
    for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_DIGIT:
        break;
      case BT_SEMI:
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_CHAR_REF;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "&" */

/* Scans a reference.  '#' hands the rest to scanCharRef; otherwise a
   name followed by ';' is required and XML_TOK_ENTITY_REF is returned
   with *nextTokPtr just past the ';'.  Returns XML_TOK_INVALID with
   *nextTokPtr at the offending character, or XML_TOK_PARTIAL when the
   input ends first. */
static int PTRCALL
PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
                const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_NUM:
    return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_SEMI:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_ENTITY_REF;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following first character of attribute name */

/* Scans the attributes of a start tag through to the end of the tag.
   Each pass of the outer loop consumes the rest of one attribute name,
   the '=', the quoted value and whatever follows it.  Returns
   XML_TOK_START_TAG_WITH_ATTS at '>' or XML_TOK_EMPTY_ELEMENT_WITH_ATTS
   at "/>", with *nextTokPtr just past the tag.  Returns XML_TOK_INVALID
   with *nextTokPtr at the offending character, or XML_TOK_PARTIAL when
   the input ends first. */
static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the current attribute name has contained a ':' */
  int hadColon = 0;
#endif
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      /* whitespace after the name: only more whitespace or '=' may
         follow */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
    /* fall through */
    case BT_EQUALS:
      {
        /* byte type of the opening quote (BT_QUOT or BT_APOS) */
        int open;
#ifdef XML_NS
        hadColon = 0;
#endif
        /* skip whitespace between '=' and the opening quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          open = BYTE_TYPE(enc, ptr);
          if (open == BT_QUOT || open == BT_APOS)
            break;
          switch (open) {
          case BT_S:
          case BT_LF:
          case BT_CR:
            break;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
        }
        ptr += MINBPC(enc);
        /* in attribute value */
        for (;;) {
          int t;
          if (ptr == end)
            return XML_TOK_PARTIAL;
          t = BYTE_TYPE(enc, ptr);
          if (t == open)
            break;
          switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
          case BT_AMP:
            {
              /* scanRef stores its resume position directly in ptr */
              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
              if (tok <= 0) {
                if (tok == XML_TOK_INVALID)
                  *nextTokPtr = ptr;
                return tok;
              }
              break;
            }
          case BT_LT:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          default:
            ptr += MINBPC(enc);
            break;
          }
        }
        /* step over the closing quote */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_SOL:
          goto sol;
        case BT_GT:
          goto gt;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* ptr points to the whitespace character that follows the
           closing quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the next attribute: the
             break below leaves this loop and the outer loop resumes */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_S: case BT_CR: case BT_LF:
            continue;
          case BT_GT:
          gt:
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_START_TAG_WITH_ATTS;
          case BT_SOL:
          sol:
            ptr += MINBPC(enc);
            if (ptr == end)
              return XML_TOK_PARTIAL;
            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
              *nextTokPtr = ptr;
              return XML_TOK_INVALID;
            }
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          break;
        }
        break;
      }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "<" */

/* Scans markup that starts with '<'.  "<!-", "<![", "<?" and "</" are
   handed to scanComment, scanCdataSection, scanPi and scanEndTag.
   Otherwise a start tag is scanned: without attributes the result is
   XML_TOK_START_TAG_NO_ATTS or XML_TOK_EMPTY_ELEMENT_NO_ATTS; when a
   name-start character follows the whitespace after the element name,
   scanAtts takes over.  Returns XML_TOK_INVALID with *nextTokPtr at the
   offending character, or XML_TOK_PARTIAL when the input ends first. */
static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr)
{
#ifdef XML_NS
  int hadColon;
#endif
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    if ((ptr += MINBPC(enc)) == end)
      return XML_TOK_PARTIAL;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
                                      end, nextTokPtr);
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#ifdef XML_NS
  hadColon = 0;
#endif
  /* we have a start-tag */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' in the element name, and it must be followed
         by a name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      {
        ptr += MINBPC(enc);
        while (ptr != end) {
          switch (BYTE_TYPE(enc, ptr)) {
          /* first character of an attribute name: the break taken
             inside the macro reaches the scanAtts call below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_GT:
            goto gt;
          case BT_SOL:
            goto sol;
          case BT_S: case BT_CR: case BT_LF:
            ptr += MINBPC(enc);
            continue;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
        }
        return XML_TOK_PARTIAL;
      }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* Returns the next token of element content.  '<' and '&' are handed to
   scanLt and scanRef; CR, LF and CR LF yield XML_TOK_DATA_NEWLINE; "]]>"
   is reported as XML_TOK_INVALID; anything else starts a run of
   character data returned as XML_TOK_DATA_CHARS.  Returns XML_TOK_NONE
   when ptr == end, XML_TOK_TRAILING_CR or XML_TOK_TRAILING_RSQB when the
   input ends right after a CR or after one or two ']', and
   XML_TOK_PARTIAL when not even one whole unit is available. */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
  /* For multi-byte-unit encodings, ignore trailing bytes that do not
     make up a whole unit.  The mask arithmetic assumes MINBPC(enc) is a
     power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_CR;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': back up so that the second ']' is
         examined by the data loop below */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" is rejected here; *nextTokPtr is left at the '>' */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* At least one character of data has been accepted; extend the run.
     From here on a bad or truncated character merely ends the run, so
     that it is reported by the next call. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_RSQB:
      /* A ']' stays in the run only when it is known not to start
         "]]>".  If the input ends before that can be decided, the run
         stops in front of the ']'. */
      if (ptr + MINBPC(enc) != end) {
         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
           ptr += MINBPC(enc);
           break;
         }
         if (ptr + 2*MINBPC(enc) != end) {
           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
             ptr += MINBPC(enc);
             break;
           }
           *nextTokPtr = ptr + 2*MINBPC(enc);
           return XML_TOK_INVALID;
         }
      }
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}

/* ptr points to character following "%" */

/* Scans what follows a '%'.  A name followed by ';' yields
   XML_TOK_PARAM_ENTITY_REF with *nextTokPtr just past the ';'.  A '%'
   followed by whitespace or another '%' yields XML_TOK_PERCENT with
   *nextTokPtr at that following character.  Returns XML_TOK_INVALID with
   *nextTokPtr at the offending character, XML_TOK_PARTIAL when the input
   ends inside the name, and the negated value -XML_TOK_PERCENT when
   nothing at all follows the '%'.
   NOTE(review): the negated value presumably tells the caller that the
   token would be complete if this were the end of the data -- confirm
   against the callers. */
static int PTRCALL
PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
                    const char **nextTokPtr)
{
  if (ptr == end)
    return -XML_TOK_PERCENT;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
    *nextTokPtr = ptr;
    return XML_TOK_PERCENT;
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_SEMI:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_PARAM_ENTITY_REF;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}

/* ptr points to character following "#" (prologTok calls this on
   BT_NUM).  Scans a name and returns XML_TOK_POUND_NAME with *nextTokPtr
   at the delimiter that ends it (whitespace, ')', '>', '%' or '|').
   Returns XML_TOK_INVALID with *nextTokPtr at the offending character,
   XML_TOK_PARTIAL when nothing follows the '#', and the negated value
   -XML_TOK_POUND_NAME when the input ends inside the name. */
static int PTRCALL
PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
                      const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_CR: case BT_LF: case BT_S:
    case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
      *nextTokPtr = ptr;
      return XML_TOK_POUND_NAME;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return -XML_TOK_POUND_NAME;
}

/* Scans a quoted literal.  open is the byte type of the opening quote
   (BT_QUOT or BT_APOS) and ptr points just past it.  The other kind of
   quote is ordinary content.  When the closing quote is found,
   *nextTokPtr is set just past it and the result is XML_TOK_LITERAL if
   the next character is whitespace, '>', '%' or '[', XML_TOK_INVALID
   otherwise.  Returns -XML_TOK_LITERAL when the input ends right after
   the closing quote, XML_TOK_PARTIAL when it ends before it, and
   XML_TOK_INVALID (via INVALID_CASES) for unacceptable bytes. */
static int PTRCALL
PREFIX(scanLit)(int open, const ENCODING *enc,
                const char *ptr, const char *end,
                const char **nextTokPtr)
{
  while (ptr != end) {
    int t = BYTE_TYPE(enc, ptr);
    switch (t) {
    INVALID_CASES(ptr, nextTokPtr)
    case BT_QUOT:
    case BT_APOS:
      ptr += MINBPC(enc);
      if (t != open)
        break;
      if (ptr == end)
        return -XML_TOK_LITERAL;
      *nextTokPtr = ptr;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S: case BT_CR: case BT_LF:
      case BT_GT: case BT_PERCNT: case BT_LSQB:
        return XML_TOK_LITERAL;
      default:
        return XML_TOK_INVALID;
      }
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  return XML_TOK_PARTIAL;
}

/* Returns the next token of the prolog / DTD.  The first switch either
   returns a complete token (punctuation, whitespace, literals, markup
   openers, ...) or recognises the first character of a name or name
   token; the loop that follows consumes the rest of it.  Returns
   XML_TOK_NONE when ptr == end, XML_TOK_INVALID with *nextTokPtr at the
   offending character, and XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when
   more input is needed.  Several paths return a negated token value
   when the input ends at a point where the token seen so far could
   already be complete. */
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr)
{
  int tok;
  if (ptr == end)
    return XML_TOK_NONE;
  /* For multi-byte-unit encodings, ignore trailing bytes that do not
     make up a whole unit.  The mask arithmetic assumes MINBPC(enc) is a
     power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT:
    {
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_EXCL:
        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_QUEST:
        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_NMSTRT:
      case BT_HEX:
      case BT_NONASCII:
      case BT_LEAD2:
      case BT_LEAD3:
      case BT_LEAD4:
        /* start of an element: *nextTokPtr is put back on the '<' */
        *nextTokPtr = ptr - MINBPC(enc);
        return XML_TOK_INSTANCE_START;
      }
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S: case BT_LF:
    /* consume a run of whitespace */
    for (;;) {
      ptr += MINBPC(enc);
      if (ptr == end)
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S: case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_BRACKET;
    /* "]]>" closes a conditional section; a ']' followed by anything
       else is a plain close bracket */
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      if (ptr + MINBPC(enc) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2*MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_PAREN;
    /* a '*', '?' or '+' directly after ')' is folded into the token */
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR: case BT_LF: case BT_S:
    case BT_GT: case BT_COMMA: case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  /* Multi-byte first character: a name-start character begins a name,
     any other name character begins a name token. */
#define LEAD_CASE(n) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NAME; \
      break; \
    } \
    if (IS_NAME_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NMTOKEN; \
      break; \
    } \
    *nextTokPtr = ptr; \
    return XML_TOK_INVALID;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#ifdef XML_NS
  case BT_COLON:
#endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Rest of a name (tok == XML_TOK_NAME or XML_TOK_PREFIXED_NAME) or of
     a name token (tok == XML_TOK_NMTOKEN). */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT: case BT_RPAR: case BT_COMMA:
    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
    case BT_S: case BT_CR: case BT_LF:
      *nextTokPtr = ptr;
      return tok;
#ifdef XML_NS
    case BT_COLON:
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        /* first ':' in a name: it becomes a prefixed name if a name
           character follows, otherwise it is demoted to a name token */
        if (ptr == end)
          return XML_TOK_PARTIAL;
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        /* a second ':' demotes the token to a name token */
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#endif
    /* '+', '*' and '?' may directly follow a name but not a name token */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* input ended inside the name: report the token negated */
  return -tok;
}

/* Splits an attribute value into pieces.  A reference, newline or space
   at the very start is returned as its own token (via scanRef,
   XML_TOK_DATA_NEWLINE, or XML_TOK_ATTRIBUTE_VALUE_S); otherwise the run
   of characters up to the next '&', LF, CR or space (or to end) is
   returned as XML_TOK_DATA_CHARS.  A '<' yields XML_TOK_INVALID.
   Returns XML_TOK_NONE when ptr == end and XML_TOK_TRAILING_CR when a
   leading CR is the last character. */
static int PTRCALL
PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
                          const char *end, const char **nextTokPtr)
{
  const char *start;
  if (ptr == end)
    return XML_TOK_NONE;
  start = ptr;
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    /* NOTE(review): unlike the LEAD_CASE used in contentTok, this one
       advances by n without checking that n bytes remain before end or
       that the character is valid.  Presumably the input has already
       been validated by an earlier tokenizing pass -- confirm, since
       otherwise ptr could step past end and `ptr != end' would never
       stop the loop. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: ptr += n; break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_AMP:
      if (ptr == start)
        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_LT:
      /* this is for inside entity references */
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    case BT_LF:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_CR:
      if (ptr == start) {
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_TRAILING_CR;
        if (BYTE_TYPE(enc, ptr) == BT_LF)
          ptr += MINBPC(enc);
        *nextTokPtr = ptr;
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_S:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_ATTRIBUTE_VALUE_S;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}

/* Splits an entity value into pieces.  A reference ('&'), a
   parameter-entity reference ('%') or a newline at the very start is
   returned as its own token; otherwise the run of characters up to the
   next '&', '%', LF or CR (or to end) is returned as XML_TOK_DATA_CHARS.
   A '%' that scanPercent reports as a bare XML_TOK_PERCENT is turned
   into XML_TOK_INVALID.  Returns XML_TOK_NONE when ptr == end and
   XML_TOK_TRAILING_CR when a leading CR is the last character. */
static int PTRCALL
PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
                       const char *end, const char **nextTokPtr)
{
  const char *start;
  if (ptr == end)
    return XML_TOK_NONE;
  start = ptr;
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    /* NOTE(review): advances by n without checking that n bytes remain
       before end; presumably the input was validated by an earlier
       tokenizing pass -- confirm. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: ptr += n; break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_AMP:
      if (ptr == start)
        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_PERCNT:
      if (ptr == start) {
        int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
                                       end, nextTokPtr);
        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_LF:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_CR:
      if (ptr == start) {
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_TRAILING_CR;
        if (BYTE_TYPE(enc, ptr) == BT_LF)
          ptr += MINBPC(enc);
        *nextTokPtr = ptr;
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}

#ifdef XML_DTD

/* Skips the body of an ignored conditional section.  Every "<![" met on
   the way increases the nesting level and every "]]>" decreases it; the
   "]]>" seen at level 0 ends the section and XML_TOK_IGNORE_SECT is
   returned with *nextTokPtr just past it.  Returns XML_TOK_INVALID (via
   INVALID_CASES) for unacceptable bytes and XML_TOK_PARTIAL when the
   input ends first. */
static int PTRCALL
PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
                         const char *end, const char **nextTokPtr)
{
  int level = 0;
  /* For multi-byte-unit encodings, ignore trailing bytes that do not
     make up a whole unit.  The mask arithmetic assumes MINBPC(enc) is a
     power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      end = ptr + n;
    }
  }
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    INVALID_CASES(ptr, nextTokPtr)
    case BT_LT:
      if ((ptr += MINBPC(enc)) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
        if ((ptr += MINBPC(enc)) == end)
          return XML_TOK_PARTIAL;
        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
          ++level;
          ptr += MINBPC(enc);
        }
      }
      break;
    case BT_RSQB:
      if ((ptr += MINBPC(enc)) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
        if ((ptr += MINBPC(enc)) == end)
          return XML_TOK_PARTIAL;
        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
          ptr += MINBPC(enc);
          if (level == 0) {
            *nextTokPtr = ptr;
            return XML_TOK_IGNORE_SECT;
          }
          --level;
        }
      }
      break;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  return XML_TOK_PARTIAL;
}

#endif /* XML_DTD */

/* Checks the characters of a public identifier.  The first and the last
   character of [ptr, end) are not examined.
   NOTE(review): presumably those are the surrounding quote characters --
   confirm against the caller.
   Returns 1 when every examined character is acceptable; otherwise
   returns 0 and stores the position of the first unacceptable character
   in *badPtr. */
static int PTRCALL
PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **badPtr)
{
  ptr += MINBPC(enc);
  end -= MINBPC(enc);
  for (; ptr != end; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_DIGIT:
    case BT_HEX:
    case BT_MINUS:
    case BT_APOS:
    case BT_LPAR:
    case BT_RPAR:
    case BT_PLUS:
    case BT_COMMA:
    case BT_SOL:
    case BT_EQUALS:
    case BT_QUEST:
    case BT_CR:
    case BT_LF:
    case BT_SEMI:
    case BT_EXCL:
    case BT_AST:
    case BT_PERCNT:
    case BT_NUM:
#ifdef XML_NS
    case BT_COLON:
#endif
      break;
    case BT_S:
      /* of the BT_S characters, a tab is rejected */
      if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
        *badPtr = ptr;
        return 0;
      }
      break;
    case BT_NAME:
    case BT_NMSTRT:
      /* accepted here when the value fits in 7 bits */
      if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
        break;
      /* fall through */
    default:
      switch (BYTE_TO_ASCII(enc, ptr)) {
      case 0x24: /* $ */
      case 0x40: /* @ */
        break;
      default:
        *badPtr = ptr;
        return 0;
      }
      break;
    }
  }
  return 1;
}

/* This must only be called for a well-formed start-tag or empty
   element tag.  Returns the number of attributes.  Pointers to the
   first attsMax attributes are stored in atts.
1428 */ 1429 1430 static int PTRCALL 1431 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, 1432 int attsMax, ATTRIBUTE *atts) 1433 { 1434 enum { other, inName, inValue } state = inName; 1435 int nAtts = 0; 1436 int open = 0; /* defined when state == inValue; 1437 initialization just to shut up compilers */ 1438 1439 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { 1440 switch (BYTE_TYPE(enc, ptr)) { 1441 #define START_NAME \ 1442 if (state == other) { \ 1443 if (nAtts < attsMax) { \ 1444 atts[nAtts].name = ptr; \ 1445 atts[nAtts].normalized = 1; \ 1446 } \ 1447 state = inName; \ 1448 } 1449 #define LEAD_CASE(n) \ 1450 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; 1451 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1452 #undef LEAD_CASE 1453 case BT_NONASCII: 1454 case BT_NMSTRT: 1455 case BT_HEX: 1456 START_NAME 1457 break; 1458 #undef START_NAME 1459 case BT_QUOT: 1460 if (state != inValue) { 1461 if (nAtts < attsMax) 1462 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1463 state = inValue; 1464 open = BT_QUOT; 1465 } 1466 else if (open == BT_QUOT) { 1467 state = other; 1468 if (nAtts < attsMax) 1469 atts[nAtts].valueEnd = ptr; 1470 nAtts++; 1471 } 1472 break; 1473 case BT_APOS: 1474 if (state != inValue) { 1475 if (nAtts < attsMax) 1476 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1477 state = inValue; 1478 open = BT_APOS; 1479 } 1480 else if (open == BT_APOS) { 1481 state = other; 1482 if (nAtts < attsMax) 1483 atts[nAtts].valueEnd = ptr; 1484 nAtts++; 1485 } 1486 break; 1487 case BT_AMP: 1488 if (nAtts < attsMax) 1489 atts[nAtts].normalized = 0; 1490 break; 1491 case BT_S: 1492 if (state == inName) 1493 state = other; 1494 else if (state == inValue 1495 && nAtts < attsMax 1496 && atts[nAtts].normalized 1497 && (ptr == atts[nAtts].valuePtr 1498 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE 1499 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE 1500 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) 1501 atts[nAtts].normalized = 0; 1502 break; 1503 case BT_CR: 
case BT_LF: 1504 /* This case ensures that the first attribute name is counted 1505 Apart from that we could just change state on the quote. */ 1506 if (state == inName) 1507 state = other; 1508 else if (state == inValue && nAtts < attsMax) 1509 atts[nAtts].normalized = 0; 1510 break; 1511 case BT_GT: 1512 case BT_SOL: 1513 if (state != inValue) 1514 return nAtts; 1515 break; 1516 default: 1517 break; 1518 } 1519 } 1520 /* not reached */ 1521 } 1522 1523 static int PTRFASTCALL 1524 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) 1525 { 1526 int result = 0; 1527 /* skip &# */ 1528 ptr += 2*MINBPC(enc); 1529 if (CHAR_MATCHES(enc, ptr, ASCII_x)) { 1530 for (ptr += MINBPC(enc); 1531 !CHAR_MATCHES(enc, ptr, ASCII_SEMI); 1532 ptr += MINBPC(enc)) { 1533 int c = BYTE_TO_ASCII(enc, ptr); 1534 switch (c) { 1535 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4: 1536 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9: 1537 result <<= 4; 1538 result |= (c - ASCII_0); 1539 break; 1540 case ASCII_A: case ASCII_B: case ASCII_C: 1541 case ASCII_D: case ASCII_E: case ASCII_F: 1542 result <<= 4; 1543 result += 10 + (c - ASCII_A); 1544 break; 1545 case ASCII_a: case ASCII_b: case ASCII_c: 1546 case ASCII_d: case ASCII_e: case ASCII_f: 1547 result <<= 4; 1548 result += 10 + (c - ASCII_a); 1549 break; 1550 } 1551 if (result >= 0x110000) 1552 return -1; 1553 } 1554 } 1555 else { 1556 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { 1557 int c = BYTE_TO_ASCII(enc, ptr); 1558 result *= 10; 1559 result += (c - ASCII_0); 1560 if (result >= 0x110000) 1561 return -1; 1562 } 1563 } 1564 return checkCharRefNumber(result); 1565 } 1566 1567 static int PTRCALL 1568 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, 1569 const char *end) 1570 { 1571 switch ((end - ptr)/MINBPC(enc)) { 1572 case 2: 1573 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { 1574 switch (BYTE_TO_ASCII(enc, ptr)) { 1575 case ASCII_l: 1576 
return ASCII_LT; 1577 case ASCII_g: 1578 return ASCII_GT; 1579 } 1580 } 1581 break; 1582 case 3: 1583 if (CHAR_MATCHES(enc, ptr, ASCII_a)) { 1584 ptr += MINBPC(enc); 1585 if (CHAR_MATCHES(enc, ptr, ASCII_m)) { 1586 ptr += MINBPC(enc); 1587 if (CHAR_MATCHES(enc, ptr, ASCII_p)) 1588 return ASCII_AMP; 1589 } 1590 } 1591 break; 1592 case 4: 1593 switch (BYTE_TO_ASCII(enc, ptr)) { 1594 case ASCII_q: 1595 ptr += MINBPC(enc); 1596 if (CHAR_MATCHES(enc, ptr, ASCII_u)) { 1597 ptr += MINBPC(enc); 1598 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1599 ptr += MINBPC(enc); 1600 if (CHAR_MATCHES(enc, ptr, ASCII_t)) 1601 return ASCII_QUOT; 1602 } 1603 } 1604 break; 1605 case ASCII_a: 1606 ptr += MINBPC(enc); 1607 if (CHAR_MATCHES(enc, ptr, ASCII_p)) { 1608 ptr += MINBPC(enc); 1609 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1610 ptr += MINBPC(enc); 1611 if (CHAR_MATCHES(enc, ptr, ASCII_s)) 1612 return ASCII_APOS; 1613 } 1614 } 1615 break; 1616 } 1617 } 1618 return 0; 1619 } 1620 1621 static int PTRCALL 1622 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) 1623 { 1624 for (;;) { 1625 switch (BYTE_TYPE(enc, ptr1)) { 1626 #define LEAD_CASE(n) \ 1627 case BT_LEAD ## n: \ 1628 if (*ptr1++ != *ptr2++) \ 1629 return 0; 1630 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) 1631 #undef LEAD_CASE 1632 /* fall through */ 1633 if (*ptr1++ != *ptr2++) 1634 return 0; 1635 break; 1636 case BT_NONASCII: 1637 case BT_NMSTRT: 1638 #ifdef XML_NS 1639 case BT_COLON: 1640 #endif 1641 case BT_HEX: 1642 case BT_DIGIT: 1643 case BT_NAME: 1644 case BT_MINUS: 1645 if (*ptr2++ != *ptr1++) 1646 return 0; 1647 if (MINBPC(enc) > 1) { 1648 if (*ptr2++ != *ptr1++) 1649 return 0; 1650 if (MINBPC(enc) > 2) { 1651 if (*ptr2++ != *ptr1++) 1652 return 0; 1653 if (MINBPC(enc) > 3) { 1654 if (*ptr2++ != *ptr1++) 1655 return 0; 1656 } 1657 } 1658 } 1659 break; 1660 default: 1661 if (MINBPC(enc) == 1 && *ptr1 == *ptr2) 1662 return 1; 1663 switch (BYTE_TYPE(enc, ptr2)) { 1664 case BT_LEAD2: 1665 case BT_LEAD3: 1666 
case BT_LEAD4: 1667 case BT_NONASCII: 1668 case BT_NMSTRT: 1669 #ifdef XML_NS 1670 case BT_COLON: 1671 #endif 1672 case BT_HEX: 1673 case BT_DIGIT: 1674 case BT_NAME: 1675 case BT_MINUS: 1676 return 0; 1677 default: 1678 return 1; 1679 } 1680 } 1681 } 1682 /* not reached */ 1683 } 1684 1685 static int PTRCALL 1686 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, 1687 const char *end1, const char *ptr2) 1688 { 1689 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { 1690 if (ptr1 == end1) 1691 return 0; 1692 if (!CHAR_MATCHES(enc, ptr1, *ptr2)) 1693 return 0; 1694 } 1695 return ptr1 == end1; 1696 } 1697 1698 static int PTRFASTCALL 1699 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) 1700 { 1701 const char *start = ptr; 1702 for (;;) { 1703 switch (BYTE_TYPE(enc, ptr)) { 1704 #define LEAD_CASE(n) \ 1705 case BT_LEAD ## n: ptr += n; break; 1706 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1707 #undef LEAD_CASE 1708 case BT_NONASCII: 1709 case BT_NMSTRT: 1710 #ifdef XML_NS 1711 case BT_COLON: 1712 #endif 1713 case BT_HEX: 1714 case BT_DIGIT: 1715 case BT_NAME: 1716 case BT_MINUS: 1717 ptr += MINBPC(enc); 1718 break; 1719 default: 1720 return (int)(ptr - start); 1721 } 1722 } 1723 } 1724 1725 static const char * PTRFASTCALL 1726 PREFIX(skipS)(const ENCODING *enc, const char *ptr) 1727 { 1728 for (;;) { 1729 switch (BYTE_TYPE(enc, ptr)) { 1730 case BT_LF: 1731 case BT_CR: 1732 case BT_S: 1733 ptr += MINBPC(enc); 1734 break; 1735 default: 1736 return ptr; 1737 } 1738 } 1739 } 1740 1741 static void PTRCALL 1742 PREFIX(updatePosition)(const ENCODING *enc, 1743 const char *ptr, 1744 const char *end, 1745 POSITION *pos) 1746 { 1747 while (ptr < end) { 1748 switch (BYTE_TYPE(enc, ptr)) { 1749 #define LEAD_CASE(n) \ 1750 case BT_LEAD ## n: \ 1751 ptr += n; \ 1752 break; 1753 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1754 #undef LEAD_CASE 1755 case BT_LF: 1756 pos->columnNumber = (XML_Size)-1; 1757 pos->lineNumber++; 1758 ptr += MINBPC(enc); 1759 break; 1760 case BT_CR: 
1761 pos->lineNumber++; 1762 ptr += MINBPC(enc); 1763 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF) 1764 ptr += MINBPC(enc); 1765 pos->columnNumber = (XML_Size)-1; 1766 break; 1767 default: 1768 ptr += MINBPC(enc); 1769 break; 1770 } 1771 pos->columnNumber++; 1772 } 1773 } 1774 1775 #undef DO_LEAD_CASE 1776 #undef MULTIBYTE_CASES 1777 #undef INVALID_CASES 1778 #undef CHECK_NAME_CASE 1779 #undef CHECK_NAME_CASES 1780 #undef CHECK_NMSTRT_CASE 1781 #undef CHECK_NMSTRT_CASES 1782 1783 #endif /* XML_TOK_IMPL_C */ 1784