1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3 */ 4 5 /* This file is included! */ 6 #ifdef XML_TOK_IMPL_C 7 8 #ifndef IS_INVALID_CHAR 9 #define IS_INVALID_CHAR(enc, ptr, n) (0) 10 #endif 11 12 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ 13 case BT_LEAD ## n: \ 14 if (end - ptr < n) \ 15 return XML_TOK_PARTIAL_CHAR; \ 16 if (IS_INVALID_CHAR(enc, ptr, n)) { \ 17 *(nextTokPtr) = (ptr); \ 18 return XML_TOK_INVALID; \ 19 } \ 20 ptr += n; \ 21 break; 22 23 #define INVALID_CASES(ptr, nextTokPtr) \ 24 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ 25 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ 26 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ 27 case BT_NONXML: \ 28 case BT_MALFORM: \ 29 case BT_TRAIL: \ 30 *(nextTokPtr) = (ptr); \ 31 return XML_TOK_INVALID; 32 33 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ 34 case BT_LEAD ## n: \ 35 if (end - ptr < n) \ 36 return XML_TOK_PARTIAL_CHAR; \ 37 if (!IS_NAME_CHAR(enc, ptr, n)) { \ 38 *nextTokPtr = ptr; \ 39 return XML_TOK_INVALID; \ 40 } \ 41 ptr += n; \ 42 break; 43 44 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ 45 case BT_NONASCII: \ 46 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ 47 *nextTokPtr = ptr; \ 48 return XML_TOK_INVALID; \ 49 } \ 50 case BT_NMSTRT: \ 51 case BT_HEX: \ 52 case BT_DIGIT: \ 53 case BT_NAME: \ 54 case BT_MINUS: \ 55 ptr += MINBPC(enc); \ 56 break; \ 57 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ 58 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ 59 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) 60 61 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ 62 case BT_LEAD ## n: \ 63 if (end - ptr < n) \ 64 return XML_TOK_PARTIAL_CHAR; \ 65 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ 66 *nextTokPtr = ptr; \ 67 return XML_TOK_INVALID; \ 68 } \ 69 ptr += n; \ 70 break; 71 72 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ 73 case BT_NONASCII: \ 74 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ 75 *nextTokPtr = ptr; \ 76 return XML_TOK_INVALID; \ 77 } \ 78 case BT_NMSTRT: \ 79 case BT_HEX: \ 80 ptr += MINBPC(enc); \ 81 break; \ 82 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ 83 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ 84 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) 85 86 #ifndef PREFIX 87 #define PREFIX(ident) ident 88 #endif 89 90 91 #define HAS_CHARS(enc, ptr, end, count) \ 92 (end - ptr >= count * MINBPC(enc)) 93 94 #define HAS_CHAR(enc, ptr, end) \ 95 HAS_CHARS(enc, ptr, end, 1) 96 97 #define REQUIRE_CHARS(enc, ptr, end, count) \ 98 { \ 99 if (! HAS_CHARS(enc, ptr, end, count)) { \ 100 return XML_TOK_PARTIAL; \ 101 } \ 102 } 103 104 #define REQUIRE_CHAR(enc, ptr, end) \ 105 REQUIRE_CHARS(enc, ptr, end, 1) 106 107 108 /* ptr points to character following "<!-" */ 109 110 static int PTRCALL 111 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, 112 const char *end, const char **nextTokPtr) 113 { 114 if (HAS_CHAR(enc, ptr, end)) { 115 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 116 *nextTokPtr = ptr; 117 return XML_TOK_INVALID; 118 } 119 ptr += MINBPC(enc); 120 while (HAS_CHAR(enc, ptr, end)) { 121 switch (BYTE_TYPE(enc, ptr)) { 122 INVALID_CASES(ptr, nextTokPtr) 123 case BT_MINUS: 124 ptr += MINBPC(enc); 125 REQUIRE_CHAR(enc, ptr, end); 126 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 127 ptr += MINBPC(enc); 128 REQUIRE_CHAR(enc, ptr, end); 129 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 130 *nextTokPtr = ptr; 131 return XML_TOK_INVALID; 132 } 133 *nextTokPtr = ptr + MINBPC(enc); 134 return XML_TOK_COMMENT; 135 } 136 break; 137 default: 138 ptr += MINBPC(enc); 139 break; 140 } 141 } 142 } 143 return XML_TOK_PARTIAL; 144 } 145 146 /* ptr points to character following "<!" */ 147 148 static int PTRCALL 149 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, 150 const char *end, const char **nextTokPtr) 151 { 152 REQUIRE_CHAR(enc, ptr, end); 153 switch (BYTE_TYPE(enc, ptr)) { 154 case BT_MINUS: 155 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 156 case BT_LSQB: 157 *nextTokPtr = ptr + MINBPC(enc); 158 return XML_TOK_COND_SECT_OPEN; 159 case BT_NMSTRT: 160 case BT_HEX: 161 ptr += MINBPC(enc); 162 break; 163 default: 164 *nextTokPtr = ptr; 165 return XML_TOK_INVALID; 166 } 167 while (HAS_CHAR(enc, ptr, end)) { 168 switch (BYTE_TYPE(enc, ptr)) { 169 case BT_PERCNT: 170 REQUIRE_CHARS(enc, ptr, end, 2); 171 /* don't allow <!ENTITY% foo "whatever"> */ 172 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { 173 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: 174 *nextTokPtr = ptr; 175 return XML_TOK_INVALID; 176 } 177 /* fall through */ 178 case BT_S: case BT_CR: case BT_LF: 179 *nextTokPtr = ptr; 180 return XML_TOK_DECL_OPEN; 181 case BT_NMSTRT: 182 case BT_HEX: 183 ptr += MINBPC(enc); 184 break; 185 default: 186 *nextTokPtr = ptr; 187 return XML_TOK_INVALID; 188 } 189 } 190 return XML_TOK_PARTIAL; 191 } 192 193 static int PTRCALL 194 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr, 195 const char *end, int *tokPtr) 196 { 197 int upper = 0; 198 *tokPtr = XML_TOK_PI; 199 if (end - ptr != MINBPC(enc)*3) 200 return 1; 201 switch (BYTE_TO_ASCII(enc, ptr)) { 202 case ASCII_x: 203 break; 204 case ASCII_X: 205 upper = 1; 206 break; 207 default: 208 return 1; 209 } 210 ptr += MINBPC(enc); 211 switch (BYTE_TO_ASCII(enc, ptr)) { 212 case ASCII_m: 213 break; 214 case ASCII_M: 215 upper = 1; 216 break; 217 default: 218 return 1; 219 } 220 ptr += MINBPC(enc); 221 switch (BYTE_TO_ASCII(enc, ptr)) { 222 case ASCII_l: 223 break; 224 case ASCII_L: 225 upper = 1; 226 break; 227 default: 228 return 1; 229 } 230 if (upper) 231 return 0; 232 *tokPtr = XML_TOK_XML_DECL; 233 return 1; 234 } 235 236 /* ptr points to character following "<?" */ 237 238 static int PTRCALL 239 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, 240 const char *end, const char **nextTokPtr) 241 { 242 int tok; 243 const char *target = ptr; 244 REQUIRE_CHAR(enc, ptr, end); 245 switch (BYTE_TYPE(enc, ptr)) { 246 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 247 default: 248 *nextTokPtr = ptr; 249 return XML_TOK_INVALID; 250 } 251 while (HAS_CHAR(enc, ptr, end)) { 252 switch (BYTE_TYPE(enc, ptr)) { 253 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 254 case BT_S: case BT_CR: case BT_LF: 255 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 256 *nextTokPtr = ptr; 257 return XML_TOK_INVALID; 258 } 259 ptr += MINBPC(enc); 260 while (HAS_CHAR(enc, ptr, end)) { 261 switch (BYTE_TYPE(enc, ptr)) { 262 INVALID_CASES(ptr, nextTokPtr) 263 case BT_QUEST: 264 ptr += MINBPC(enc); 265 REQUIRE_CHAR(enc, ptr, end); 266 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 267 *nextTokPtr = ptr + MINBPC(enc); 268 return tok; 269 } 270 break; 271 default: 272 ptr += MINBPC(enc); 273 break; 274 } 275 } 276 return XML_TOK_PARTIAL; 277 case BT_QUEST: 278 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 279 *nextTokPtr = ptr; 280 return XML_TOK_INVALID; 281 } 282 ptr += MINBPC(enc); 283 REQUIRE_CHAR(enc, ptr, end); 284 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 285 *nextTokPtr = ptr + MINBPC(enc); 286 return tok; 287 } 288 /* fall through */ 289 default: 290 *nextTokPtr = ptr; 291 return XML_TOK_INVALID; 292 } 293 } 294 return XML_TOK_PARTIAL; 295 } 296 297 static int PTRCALL 298 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr, 299 const char *end, const char **nextTokPtr) 300 { 301 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, 302 ASCII_T, ASCII_A, ASCII_LSQB }; 303 int i; 304 /* CDATA[ */ 305 REQUIRE_CHARS(enc, ptr, end, 6); 306 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { 307 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) { 308 *nextTokPtr = ptr; 309 return XML_TOK_INVALID; 310 } 311 } 312 *nextTokPtr = ptr; 313 return XML_TOK_CDATA_SECT_OPEN; 314 } 315 316 static int PTRCALL 317 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, 318 const char *end, const char **nextTokPtr) 319 { 320 if (ptr >= end) 321 return XML_TOK_NONE; 322 if (MINBPC(enc) > 1) { 323 size_t n = end - ptr; 324 if (n & (MINBPC(enc) - 1)) { 325 n &= ~(MINBPC(enc) - 1); 326 if (n == 0) 327 return XML_TOK_PARTIAL; 328 end = ptr + n; 329 } 330 } 331 switch (BYTE_TYPE(enc, ptr)) { 332 case BT_RSQB: 333 ptr += MINBPC(enc); 334 REQUIRE_CHAR(enc, ptr, end); 335 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 336 break; 337 ptr += MINBPC(enc); 338 REQUIRE_CHAR(enc, ptr, end); 339 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 340 ptr -= MINBPC(enc); 341 break; 342 } 343 *nextTokPtr = ptr + MINBPC(enc); 344 return XML_TOK_CDATA_SECT_CLOSE; 345 case BT_CR: 346 ptr += MINBPC(enc); 347 REQUIRE_CHAR(enc, ptr, end); 348 if (BYTE_TYPE(enc, ptr) == BT_LF) 349 ptr += MINBPC(enc); 350 *nextTokPtr = ptr; 351 return XML_TOK_DATA_NEWLINE; 352 case BT_LF: 353 *nextTokPtr = ptr + MINBPC(enc); 354 return XML_TOK_DATA_NEWLINE; 355 INVALID_CASES(ptr, nextTokPtr) 356 default: 357 ptr += MINBPC(enc); 358 break; 359 } 360 while (HAS_CHAR(enc, ptr, end)) { 361 switch (BYTE_TYPE(enc, ptr)) { 362 #define LEAD_CASE(n) \ 363 case BT_LEAD ## n: \ 364 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 365 *nextTokPtr = ptr; \ 366 return XML_TOK_DATA_CHARS; \ 367 } \ 368 ptr += n; \ 369 break; 370 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 371 #undef LEAD_CASE 372 case BT_NONXML: 373 case BT_MALFORM: 374 case BT_TRAIL: 375 case BT_CR: 376 case BT_LF: 377 case BT_RSQB: 378 *nextTokPtr = ptr; 379 return XML_TOK_DATA_CHARS; 380 default: 381 ptr += MINBPC(enc); 382 break; 383 } 384 } 385 *nextTokPtr = ptr; 386 return XML_TOK_DATA_CHARS; 387 } 388 389 /* ptr points to character following "</" */ 390 391 static int PTRCALL 392 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, 393 const char *end, const char **nextTokPtr) 394 { 395 REQUIRE_CHAR(enc, ptr, end); 396 switch (BYTE_TYPE(enc, ptr)) { 397 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 398 default: 399 *nextTokPtr = ptr; 400 return XML_TOK_INVALID; 401 } 402 while (HAS_CHAR(enc, ptr, end)) { 403 switch (BYTE_TYPE(enc, ptr)) { 404 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 405 case BT_S: case BT_CR: case BT_LF: 406 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 407 switch (BYTE_TYPE(enc, ptr)) { 408 case BT_S: case BT_CR: case BT_LF: 409 break; 410 case BT_GT: 411 *nextTokPtr = ptr + MINBPC(enc); 412 return XML_TOK_END_TAG; 413 default: 414 *nextTokPtr = ptr; 415 return XML_TOK_INVALID; 416 } 417 } 418 return XML_TOK_PARTIAL; 419 #ifdef XML_NS 420 case BT_COLON: 421 /* no need to check qname syntax here, 422 since end-tag must match exactly */ 423 ptr += MINBPC(enc); 424 break; 425 #endif 426 case BT_GT: 427 *nextTokPtr = ptr + MINBPC(enc); 428 return XML_TOK_END_TAG; 429 default: 430 *nextTokPtr = ptr; 431 return XML_TOK_INVALID; 432 } 433 } 434 return XML_TOK_PARTIAL; 435 } 436 437 /* ptr points to character following "&#X" */ 438 439 static int PTRCALL 440 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, 441 const char *end, const char **nextTokPtr) 442 { 443 if (HAS_CHAR(enc, ptr, end)) { 444 switch (BYTE_TYPE(enc, ptr)) { 445 case BT_DIGIT: 446 case BT_HEX: 447 break; 448 default: 449 *nextTokPtr = ptr; 450 return XML_TOK_INVALID; 451 } 452 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 453 switch (BYTE_TYPE(enc, ptr)) { 454 case BT_DIGIT: 455 case BT_HEX: 456 break; 457 case BT_SEMI: 458 *nextTokPtr = ptr + MINBPC(enc); 459 return XML_TOK_CHAR_REF; 460 default: 461 *nextTokPtr = ptr; 462 return XML_TOK_INVALID; 463 } 464 } 465 } 466 return XML_TOK_PARTIAL; 467 } 468 469 /* ptr points to character following "&#" */ 470 471 static int PTRCALL 472 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, 473 const char *end, const char **nextTokPtr) 474 { 475 if (HAS_CHAR(enc, ptr, end)) { 476 if (CHAR_MATCHES(enc, ptr, ASCII_x)) 477 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 478 switch (BYTE_TYPE(enc, ptr)) { 479 case BT_DIGIT: 480 break; 481 default: 482 *nextTokPtr = ptr; 483 return XML_TOK_INVALID; 484 } 485 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 486 switch (BYTE_TYPE(enc, ptr)) { 487 case BT_DIGIT: 488 break; 489 case BT_SEMI: 490 *nextTokPtr = ptr + MINBPC(enc); 491 return XML_TOK_CHAR_REF; 492 default: 493 *nextTokPtr = ptr; 494 return XML_TOK_INVALID; 495 } 496 } 497 } 498 return XML_TOK_PARTIAL; 499 } 500 501 /* ptr points to character following "&" */ 502 503 static int PTRCALL 504 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, 505 const char **nextTokPtr) 506 { 507 REQUIRE_CHAR(enc, ptr, end); 508 switch (BYTE_TYPE(enc, ptr)) { 509 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 510 case BT_NUM: 511 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 512 default: 513 *nextTokPtr = ptr; 514 return XML_TOK_INVALID; 515 } 516 while (HAS_CHAR(enc, ptr, end)) { 517 switch (BYTE_TYPE(enc, ptr)) { 518 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 519 case BT_SEMI: 520 *nextTokPtr = ptr + MINBPC(enc); 521 return XML_TOK_ENTITY_REF; 522 default: 523 *nextTokPtr = ptr; 524 return XML_TOK_INVALID; 525 } 526 } 527 return XML_TOK_PARTIAL; 528 } 529 530 /* ptr points to character following first character of attribute name */ 531 532 static int PTRCALL 533 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, 534 const char **nextTokPtr) 535 { 536 #ifdef XML_NS 537 int hadColon = 0; 538 #endif 539 while (HAS_CHAR(enc, ptr, end)) { 540 switch (BYTE_TYPE(enc, ptr)) { 541 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 542 #ifdef XML_NS 543 case BT_COLON: 544 if (hadColon) { 545 *nextTokPtr = ptr; 546 return XML_TOK_INVALID; 547 } 548 hadColon = 1; 549 ptr += MINBPC(enc); 550 REQUIRE_CHAR(enc, ptr, end); 551 switch (BYTE_TYPE(enc, ptr)) { 552 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 553 default: 554 *nextTokPtr = ptr; 555 return XML_TOK_INVALID; 556 } 557 break; 558 #endif 559 case BT_S: case BT_CR: case BT_LF: 560 for (;;) { 561 int t; 562 563 ptr += MINBPC(enc); 564 REQUIRE_CHAR(enc, ptr, end); 565 t = BYTE_TYPE(enc, ptr); 566 if (t == BT_EQUALS) 567 break; 568 switch (t) { 569 case BT_S: 570 case BT_LF: 571 case BT_CR: 572 break; 573 default: 574 *nextTokPtr = ptr; 575 return XML_TOK_INVALID; 576 } 577 } 578 /* fall through */ 579 case BT_EQUALS: 580 { 581 int open; 582 #ifdef XML_NS 583 hadColon = 0; 584 #endif 585 for (;;) { 586 ptr += MINBPC(enc); 587 REQUIRE_CHAR(enc, ptr, end); 588 open = BYTE_TYPE(enc, ptr); 589 if (open == BT_QUOT || open == BT_APOS) 590 break; 591 switch (open) { 592 case BT_S: 593 case BT_LF: 594 case BT_CR: 595 break; 596 default: 597 *nextTokPtr = ptr; 598 return XML_TOK_INVALID; 599 } 600 } 601 ptr += MINBPC(enc); 602 /* in attribute value */ 603 for (;;) { 604 int t; 605 REQUIRE_CHAR(enc, ptr, end); 606 t = BYTE_TYPE(enc, ptr); 607 if (t == open) 608 break; 609 switch (t) { 610 INVALID_CASES(ptr, nextTokPtr) 611 case BT_AMP: 612 { 613 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr); 614 if (tok <= 0) { 615 if (tok == XML_TOK_INVALID) 616 *nextTokPtr = ptr; 617 return tok; 618 } 619 break; 620 } 621 case BT_LT: 622 *nextTokPtr = ptr; 623 return XML_TOK_INVALID; 624 default: 625 ptr += MINBPC(enc); 626 break; 627 } 628 } 629 ptr += MINBPC(enc); 630 REQUIRE_CHAR(enc, ptr, end); 631 switch (BYTE_TYPE(enc, ptr)) { 632 case BT_S: 633 case BT_CR: 634 case BT_LF: 635 break; 636 case BT_SOL: 637 goto sol; 638 case BT_GT: 639 goto gt; 640 default: 641 *nextTokPtr = ptr; 642 return XML_TOK_INVALID; 643 } 644 /* ptr points to closing quote */ 645 for (;;) { 646 ptr += MINBPC(enc); 647 REQUIRE_CHAR(enc, ptr, end); 648 switch (BYTE_TYPE(enc, ptr)) { 649 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 650 case BT_S: case BT_CR: case BT_LF: 651 continue; 652 case BT_GT: 653 gt: 654 *nextTokPtr = ptr + MINBPC(enc); 655 return XML_TOK_START_TAG_WITH_ATTS; 656 case BT_SOL: 657 sol: 658 ptr += MINBPC(enc); 659 REQUIRE_CHAR(enc, ptr, end); 660 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 661 *nextTokPtr = ptr; 662 return XML_TOK_INVALID; 663 } 664 *nextTokPtr = ptr + MINBPC(enc); 665 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; 666 default: 667 *nextTokPtr = ptr; 668 return XML_TOK_INVALID; 669 } 670 break; 671 } 672 break; 673 } 674 default: 675 *nextTokPtr = ptr; 676 return XML_TOK_INVALID; 677 } 678 } 679 return XML_TOK_PARTIAL; 680 } 681 682 /* ptr points to character following "<" */ 683 684 static int PTRCALL 685 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, 686 const char **nextTokPtr) 687 { 688 #ifdef XML_NS 689 int hadColon; 690 #endif 691 REQUIRE_CHAR(enc, ptr, end); 692 switch (BYTE_TYPE(enc, ptr)) { 693 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 694 case BT_EXCL: 695 ptr += MINBPC(enc); 696 REQUIRE_CHAR(enc, ptr, end); 697 switch (BYTE_TYPE(enc, ptr)) { 698 case BT_MINUS: 699 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 700 case BT_LSQB: 701 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), 702 end, nextTokPtr); 703 } 704 *nextTokPtr = ptr; 705 return XML_TOK_INVALID; 706 case BT_QUEST: 707 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 708 case BT_SOL: 709 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr); 710 default: 711 *nextTokPtr = ptr; 712 return XML_TOK_INVALID; 713 } 714 #ifdef XML_NS 715 hadColon = 0; 716 #endif 717 /* we have a start-tag */ 718 while (HAS_CHAR(enc, ptr, end)) { 719 switch (BYTE_TYPE(enc, ptr)) { 720 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 721 #ifdef XML_NS 722 case BT_COLON: 723 if (hadColon) { 724 *nextTokPtr = ptr; 725 return XML_TOK_INVALID; 726 } 727 hadColon = 1; 728 ptr += MINBPC(enc); 729 REQUIRE_CHAR(enc, ptr, end); 730 switch (BYTE_TYPE(enc, ptr)) { 731 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 732 default: 733 *nextTokPtr = ptr; 734 return XML_TOK_INVALID; 735 } 736 break; 737 #endif 738 case BT_S: case BT_CR: case BT_LF: 739 { 740 ptr += MINBPC(enc); 741 while (HAS_CHAR(enc, ptr, end)) { 742 switch (BYTE_TYPE(enc, ptr)) { 743 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 744 case BT_GT: 745 goto gt; 746 case BT_SOL: 747 goto sol; 748 case BT_S: case BT_CR: case BT_LF: 749 ptr += MINBPC(enc); 750 continue; 751 default: 752 *nextTokPtr = ptr; 753 return XML_TOK_INVALID; 754 } 755 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr); 756 } 757 return XML_TOK_PARTIAL; 758 } 759 case BT_GT: 760 gt: 761 *nextTokPtr = ptr + MINBPC(enc); 762 return XML_TOK_START_TAG_NO_ATTS; 763 case BT_SOL: 764 sol: 765 ptr += MINBPC(enc); 766 REQUIRE_CHAR(enc, ptr, end); 767 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 768 *nextTokPtr = ptr; 769 return XML_TOK_INVALID; 770 } 771 *nextTokPtr = ptr + MINBPC(enc); 772 return XML_TOK_EMPTY_ELEMENT_NO_ATTS; 773 default: 774 *nextTokPtr = ptr; 775 return XML_TOK_INVALID; 776 } 777 } 778 return XML_TOK_PARTIAL; 779 } 780 781 static int PTRCALL 782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, 783 const char **nextTokPtr) 784 { 785 if (ptr >= end) 786 return XML_TOK_NONE; 787 if (MINBPC(enc) > 1) { 788 size_t n = end - ptr; 789 if (n & (MINBPC(enc) - 1)) { 790 n &= ~(MINBPC(enc) - 1); 791 if (n == 0) 792 return XML_TOK_PARTIAL; 793 end = ptr + n; 794 } 795 } 796 switch (BYTE_TYPE(enc, ptr)) { 797 case BT_LT: 798 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr); 799 case BT_AMP: 800 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 801 case BT_CR: 802 ptr += MINBPC(enc); 803 if (! HAS_CHAR(enc, ptr, end)) 804 return XML_TOK_TRAILING_CR; 805 if (BYTE_TYPE(enc, ptr) == BT_LF) 806 ptr += MINBPC(enc); 807 *nextTokPtr = ptr; 808 return XML_TOK_DATA_NEWLINE; 809 case BT_LF: 810 *nextTokPtr = ptr + MINBPC(enc); 811 return XML_TOK_DATA_NEWLINE; 812 case BT_RSQB: 813 ptr += MINBPC(enc); 814 if (! HAS_CHAR(enc, ptr, end)) 815 return XML_TOK_TRAILING_RSQB; 816 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 817 break; 818 ptr += MINBPC(enc); 819 if (! HAS_CHAR(enc, ptr, end)) 820 return XML_TOK_TRAILING_RSQB; 821 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 822 ptr -= MINBPC(enc); 823 break; 824 } 825 *nextTokPtr = ptr; 826 return XML_TOK_INVALID; 827 INVALID_CASES(ptr, nextTokPtr) 828 default: 829 ptr += MINBPC(enc); 830 break; 831 } 832 while (HAS_CHAR(enc, ptr, end)) { 833 switch (BYTE_TYPE(enc, ptr)) { 834 #define LEAD_CASE(n) \ 835 case BT_LEAD ## n: \ 836 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 837 *nextTokPtr = ptr; \ 838 return XML_TOK_DATA_CHARS; \ 839 } \ 840 ptr += n; \ 841 break; 842 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 843 #undef LEAD_CASE 844 case BT_RSQB: 845 if (HAS_CHARS(enc, ptr, end, 2)) { 846 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) { 847 ptr += MINBPC(enc); 848 break; 849 } 850 if (HAS_CHARS(enc, ptr, end, 3)) { 851 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) { 852 ptr += MINBPC(enc); 853 break; 854 } 855 *nextTokPtr = ptr + 2*MINBPC(enc); 856 return XML_TOK_INVALID; 857 } 858 } 859 /* fall through */ 860 case BT_AMP: 861 case BT_LT: 862 case BT_NONXML: 863 case BT_MALFORM: 864 case BT_TRAIL: 865 case BT_CR: 866 case BT_LF: 867 *nextTokPtr = ptr; 868 return XML_TOK_DATA_CHARS; 869 default: 870 ptr += MINBPC(enc); 871 break; 872 } 873 } 874 *nextTokPtr = ptr; 875 return XML_TOK_DATA_CHARS; 876 } 877 878 /* ptr points to character following "%" */ 879 880 static int PTRCALL 881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, 882 const char **nextTokPtr) 883 { 884 REQUIRE_CHAR(enc, ptr, end); 885 switch (BYTE_TYPE(enc, ptr)) { 886 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 887 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: 888 *nextTokPtr = ptr; 889 return XML_TOK_PERCENT; 890 default: 891 *nextTokPtr = ptr; 892 return XML_TOK_INVALID; 893 } 894 while (HAS_CHAR(enc, ptr, end)) { 895 switch (BYTE_TYPE(enc, ptr)) { 896 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 897 case BT_SEMI: 898 *nextTokPtr = ptr + MINBPC(enc); 899 return XML_TOK_PARAM_ENTITY_REF; 900 default: 901 *nextTokPtr = ptr; 902 return XML_TOK_INVALID; 903 } 904 } 905 return XML_TOK_PARTIAL; 906 } 907 908 static int PTRCALL 909 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, 910 const char **nextTokPtr) 911 { 912 REQUIRE_CHAR(enc, ptr, end); 913 switch (BYTE_TYPE(enc, ptr)) { 914 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 915 default: 916 *nextTokPtr = ptr; 917 return XML_TOK_INVALID; 918 } 919 while (HAS_CHAR(enc, ptr, end)) { 920 switch (BYTE_TYPE(enc, ptr)) { 921 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 922 case BT_CR: case BT_LF: case BT_S: 923 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: 924 *nextTokPtr = ptr; 925 return XML_TOK_POUND_NAME; 926 default: 927 *nextTokPtr = ptr; 928 return XML_TOK_INVALID; 929 } 930 } 931 return -XML_TOK_POUND_NAME; 932 } 933 934 static int PTRCALL 935 PREFIX(scanLit)(int open, const ENCODING *enc, 936 const char *ptr, const char *end, 937 const char **nextTokPtr) 938 { 939 while (HAS_CHAR(enc, ptr, end)) { 940 int t = BYTE_TYPE(enc, ptr); 941 switch (t) { 942 INVALID_CASES(ptr, nextTokPtr) 943 case BT_QUOT: 944 case BT_APOS: 945 ptr += MINBPC(enc); 946 if (t != open) 947 break; 948 if (! HAS_CHAR(enc, ptr, end)) 949 return -XML_TOK_LITERAL; 950 *nextTokPtr = ptr; 951 switch (BYTE_TYPE(enc, ptr)) { 952 case BT_S: case BT_CR: case BT_LF: 953 case BT_GT: case BT_PERCNT: case BT_LSQB: 954 return XML_TOK_LITERAL; 955 default: 956 return XML_TOK_INVALID; 957 } 958 default: 959 ptr += MINBPC(enc); 960 break; 961 } 962 } 963 return XML_TOK_PARTIAL; 964 } 965 966 static int PTRCALL 967 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, 968 const char **nextTokPtr) 969 { 970 int tok; 971 if (ptr >= end) 972 return XML_TOK_NONE; 973 if (MINBPC(enc) > 1) { 974 size_t n = end - ptr; 975 if (n & (MINBPC(enc) - 1)) { 976 n &= ~(MINBPC(enc) - 1); 977 if (n == 0) 978 return XML_TOK_PARTIAL; 979 end = ptr + n; 980 } 981 } 982 switch (BYTE_TYPE(enc, ptr)) { 983 case BT_QUOT: 984 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); 985 case BT_APOS: 986 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); 987 case BT_LT: 988 { 989 ptr += MINBPC(enc); 990 REQUIRE_CHAR(enc, ptr, end); 991 switch (BYTE_TYPE(enc, ptr)) { 992 case BT_EXCL: 993 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); 994 case BT_QUEST: 995 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 996 case BT_NMSTRT: 997 case BT_HEX: 998 case BT_NONASCII: 999 case BT_LEAD2: 1000 case BT_LEAD3: 1001 case BT_LEAD4: 1002 *nextTokPtr = ptr - MINBPC(enc); 1003 return XML_TOK_INSTANCE_START; 1004 } 1005 *nextTokPtr = ptr; 1006 return XML_TOK_INVALID; 1007 } 1008 case BT_CR: 1009 if (ptr + MINBPC(enc) == end) { 1010 *nextTokPtr = end; 1011 /* indicate that this might be part of a CR/LF pair */ 1012 return -XML_TOK_PROLOG_S; 1013 } 1014 /* fall through */ 1015 case BT_S: case BT_LF: 1016 for (;;) { 1017 ptr += MINBPC(enc); 1018 if (! HAS_CHAR(enc, ptr, end)) 1019 break; 1020 switch (BYTE_TYPE(enc, ptr)) { 1021 case BT_S: case BT_LF: 1022 break; 1023 case BT_CR: 1024 /* don't split CR/LF pair */ 1025 if (ptr + MINBPC(enc) != end) 1026 break; 1027 /* fall through */ 1028 default: 1029 *nextTokPtr = ptr; 1030 return XML_TOK_PROLOG_S; 1031 } 1032 } 1033 *nextTokPtr = ptr; 1034 return XML_TOK_PROLOG_S; 1035 case BT_PERCNT: 1036 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1037 case BT_COMMA: 1038 *nextTokPtr = ptr + MINBPC(enc); 1039 return XML_TOK_COMMA; 1040 case BT_LSQB: 1041 *nextTokPtr = ptr + MINBPC(enc); 1042 return XML_TOK_OPEN_BRACKET; 1043 case BT_RSQB: 1044 ptr += MINBPC(enc); 1045 if (! HAS_CHAR(enc, ptr, end)) 1046 return -XML_TOK_CLOSE_BRACKET; 1047 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1048 REQUIRE_CHARS(enc, ptr, end, 2); 1049 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { 1050 *nextTokPtr = ptr + 2*MINBPC(enc); 1051 return XML_TOK_COND_SECT_CLOSE; 1052 } 1053 } 1054 *nextTokPtr = ptr; 1055 return XML_TOK_CLOSE_BRACKET; 1056 case BT_LPAR: 1057 *nextTokPtr = ptr + MINBPC(enc); 1058 return XML_TOK_OPEN_PAREN; 1059 case BT_RPAR: 1060 ptr += MINBPC(enc); 1061 if (! HAS_CHAR(enc, ptr, end)) 1062 return -XML_TOK_CLOSE_PAREN; 1063 switch (BYTE_TYPE(enc, ptr)) { 1064 case BT_AST: 1065 *nextTokPtr = ptr + MINBPC(enc); 1066 return XML_TOK_CLOSE_PAREN_ASTERISK; 1067 case BT_QUEST: 1068 *nextTokPtr = ptr + MINBPC(enc); 1069 return XML_TOK_CLOSE_PAREN_QUESTION; 1070 case BT_PLUS: 1071 *nextTokPtr = ptr + MINBPC(enc); 1072 return XML_TOK_CLOSE_PAREN_PLUS; 1073 case BT_CR: case BT_LF: case BT_S: 1074 case BT_GT: case BT_COMMA: case BT_VERBAR: 1075 case BT_RPAR: 1076 *nextTokPtr = ptr; 1077 return XML_TOK_CLOSE_PAREN; 1078 } 1079 *nextTokPtr = ptr; 1080 return XML_TOK_INVALID; 1081 case BT_VERBAR: 1082 *nextTokPtr = ptr + MINBPC(enc); 1083 return XML_TOK_OR; 1084 case BT_GT: 1085 *nextTokPtr = ptr + MINBPC(enc); 1086 return XML_TOK_DECL_CLOSE; 1087 case BT_NUM: 1088 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1089 #define LEAD_CASE(n) \ 1090 case BT_LEAD ## n: \ 1091 if (end - ptr < n) \ 1092 return XML_TOK_PARTIAL_CHAR; \ 1093 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ 1094 ptr += n; \ 1095 tok = XML_TOK_NAME; \ 1096 break; \ 1097 } \ 1098 if (IS_NAME_CHAR(enc, ptr, n)) { \ 1099 ptr += n; \ 1100 tok = XML_TOK_NMTOKEN; \ 1101 break; \ 1102 } \ 1103 *nextTokPtr = ptr; \ 1104 return XML_TOK_INVALID; 1105 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1106 #undef LEAD_CASE 1107 case BT_NMSTRT: 1108 case BT_HEX: 1109 tok = XML_TOK_NAME; 1110 ptr += MINBPC(enc); 1111 break; 1112 case BT_DIGIT: 1113 case BT_NAME: 1114 case BT_MINUS: 1115 #ifdef XML_NS 1116 case BT_COLON: 1117 #endif 1118 tok = XML_TOK_NMTOKEN; 1119 ptr += MINBPC(enc); 1120 break; 1121 case BT_NONASCII: 1122 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { 1123 ptr += MINBPC(enc); 1124 tok = XML_TOK_NAME; 1125 break; 1126 } 1127 if (IS_NAME_CHAR_MINBPC(enc, ptr)) { 1128 ptr += MINBPC(enc); 1129 tok = XML_TOK_NMTOKEN; 1130 break; 1131 } 1132 /* fall through */ 1133 default: 1134 *nextTokPtr = ptr; 1135 return XML_TOK_INVALID; 1136 } 1137 while (HAS_CHAR(enc, ptr, end)) { 1138 switch (BYTE_TYPE(enc, ptr)) { 1139 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1140 case BT_GT: case BT_RPAR: case BT_COMMA: 1141 case BT_VERBAR: case BT_LSQB: case BT_PERCNT: 1142 case BT_S: case BT_CR: case BT_LF: 1143 *nextTokPtr = ptr; 1144 return tok; 1145 #ifdef XML_NS 1146 case BT_COLON: 1147 ptr += MINBPC(enc); 1148 switch (tok) { 1149 case XML_TOK_NAME: 1150 REQUIRE_CHAR(enc, ptr, end); 1151 tok = XML_TOK_PREFIXED_NAME; 1152 switch (BYTE_TYPE(enc, ptr)) { 1153 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1154 default: 1155 tok = XML_TOK_NMTOKEN; 1156 break; 1157 } 1158 break; 1159 case XML_TOK_PREFIXED_NAME: 1160 tok = XML_TOK_NMTOKEN; 1161 break; 1162 } 1163 break; 1164 #endif 1165 case BT_PLUS: 1166 if (tok == XML_TOK_NMTOKEN) { 1167 *nextTokPtr = ptr; 1168 return XML_TOK_INVALID; 1169 } 1170 *nextTokPtr = ptr + MINBPC(enc); 1171 return XML_TOK_NAME_PLUS; 1172 case BT_AST: 1173 if (tok == XML_TOK_NMTOKEN) { 1174 *nextTokPtr = ptr; 1175 return XML_TOK_INVALID; 1176 } 1177 *nextTokPtr = ptr + MINBPC(enc); 1178 return XML_TOK_NAME_ASTERISK; 1179 case BT_QUEST: 1180 if (tok == XML_TOK_NMTOKEN) { 1181 *nextTokPtr = ptr; 1182 return XML_TOK_INVALID; 1183 } 1184 *nextTokPtr = ptr + MINBPC(enc); 1185 return XML_TOK_NAME_QUESTION; 1186 default: 1187 *nextTokPtr = ptr; 1188 return XML_TOK_INVALID; 1189 } 1190 } 1191 return -tok; 1192 } 1193 1194 static int PTRCALL 1195 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, 1196 const char *end, const char **nextTokPtr) 1197 { 1198 const char *start; 1199 if (ptr >= end) 1200 return XML_TOK_NONE; 1201 else if (! HAS_CHAR(enc, ptr, end)) 1202 return XML_TOK_PARTIAL; 1203 start = ptr; 1204 while (HAS_CHAR(enc, ptr, end)) { 1205 switch (BYTE_TYPE(enc, ptr)) { 1206 #define LEAD_CASE(n) \ 1207 case BT_LEAD ## n: ptr += n; break; 1208 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1209 #undef LEAD_CASE 1210 case BT_AMP: 1211 if (ptr == start) 1212 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1213 *nextTokPtr = ptr; 1214 return XML_TOK_DATA_CHARS; 1215 case BT_LT: 1216 /* this is for inside entity references */ 1217 *nextTokPtr = ptr; 1218 return XML_TOK_INVALID; 1219 case BT_LF: 1220 if (ptr == start) { 1221 *nextTokPtr = ptr + MINBPC(enc); 1222 return XML_TOK_DATA_NEWLINE; 1223 } 1224 *nextTokPtr = ptr; 1225 return XML_TOK_DATA_CHARS; 1226 case BT_CR: 1227 if (ptr == start) { 1228 ptr += MINBPC(enc); 1229 if (! HAS_CHAR(enc, ptr, end)) 1230 return XML_TOK_TRAILING_CR; 1231 if (BYTE_TYPE(enc, ptr) == BT_LF) 1232 ptr += MINBPC(enc); 1233 *nextTokPtr = ptr; 1234 return XML_TOK_DATA_NEWLINE; 1235 } 1236 *nextTokPtr = ptr; 1237 return XML_TOK_DATA_CHARS; 1238 case BT_S: 1239 if (ptr == start) { 1240 *nextTokPtr = ptr + MINBPC(enc); 1241 return XML_TOK_ATTRIBUTE_VALUE_S; 1242 } 1243 *nextTokPtr = ptr; 1244 return XML_TOK_DATA_CHARS; 1245 default: 1246 ptr += MINBPC(enc); 1247 break; 1248 } 1249 } 1250 *nextTokPtr = ptr; 1251 return XML_TOK_DATA_CHARS; 1252 } 1253 1254 static int PTRCALL 1255 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, 1256 const char *end, const char **nextTokPtr) 1257 { 1258 const char *start; 1259 if (ptr >= end) 1260 return XML_TOK_NONE; 1261 else if (! HAS_CHAR(enc, ptr, end)) 1262 return XML_TOK_PARTIAL; 1263 start = ptr; 1264 while (HAS_CHAR(enc, ptr, end)) { 1265 switch (BYTE_TYPE(enc, ptr)) { 1266 #define LEAD_CASE(n) \ 1267 case BT_LEAD ## n: ptr += n; break; 1268 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1269 #undef LEAD_CASE 1270 case BT_AMP: 1271 if (ptr == start) 1272 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1273 *nextTokPtr = ptr; 1274 return XML_TOK_DATA_CHARS; 1275 case BT_PERCNT: 1276 if (ptr == start) { 1277 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), 1278 end, nextTokPtr); 1279 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; 1280 } 1281 *nextTokPtr = ptr; 1282 return XML_TOK_DATA_CHARS; 1283 case BT_LF: 1284 if (ptr == start) { 1285 *nextTokPtr = ptr + MINBPC(enc); 1286 return XML_TOK_DATA_NEWLINE; 1287 } 1288 *nextTokPtr = ptr; 1289 return XML_TOK_DATA_CHARS; 1290 case BT_CR: 1291 if (ptr == start) { 1292 ptr += MINBPC(enc); 1293 if (! HAS_CHAR(enc, ptr, end)) 1294 return XML_TOK_TRAILING_CR; 1295 if (BYTE_TYPE(enc, ptr) == BT_LF) 1296 ptr += MINBPC(enc); 1297 *nextTokPtr = ptr; 1298 return XML_TOK_DATA_NEWLINE; 1299 } 1300 *nextTokPtr = ptr; 1301 return XML_TOK_DATA_CHARS; 1302 default: 1303 ptr += MINBPC(enc); 1304 break; 1305 } 1306 } 1307 *nextTokPtr = ptr; 1308 return XML_TOK_DATA_CHARS; 1309 } 1310 1311 #ifdef XML_DTD 1312 1313 static int PTRCALL 1314 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, 1315 const char *end, const char **nextTokPtr) 1316 { 1317 int level = 0; 1318 if (MINBPC(enc) > 1) { 1319 size_t n = end - ptr; 1320 if (n & (MINBPC(enc) - 1)) { 1321 n &= ~(MINBPC(enc) - 1); 1322 end = ptr + n; 1323 } 1324 } 1325 while (HAS_CHAR(enc, ptr, end)) { 1326 switch (BYTE_TYPE(enc, ptr)) { 1327 INVALID_CASES(ptr, nextTokPtr) 1328 case BT_LT: 1329 ptr += MINBPC(enc); 1330 REQUIRE_CHAR(enc, ptr, end); 1331 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { 1332 ptr += MINBPC(enc); 1333 REQUIRE_CHAR(enc, ptr, end); 1334 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { 1335 ++level; 1336 ptr += MINBPC(enc); 1337 } 1338 } 1339 break; 1340 case BT_RSQB: 1341 ptr += MINBPC(enc); 1342 REQUIRE_CHAR(enc, ptr, end); 1343 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1344 ptr += MINBPC(enc); 1345 REQUIRE_CHAR(enc, ptr, end); 1346 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 1347 ptr += MINBPC(enc); 1348 if (level == 0) { 1349 *nextTokPtr = ptr; 1350 return XML_TOK_IGNORE_SECT; 1351 } 1352 --level; 1353 } 1354 } 1355 break; 1356 default: 1357 ptr += MINBPC(enc); 1358 break; 1359 } 1360 } 1361 return XML_TOK_PARTIAL; 1362 } 1363 1364 #endif /* XML_DTD */ 1365 1366 static int PTRCALL 1367 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, 1368 const char **badPtr) 1369 { 1370 ptr += MINBPC(enc); 1371 end -= MINBPC(enc); 1372 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 1373 switch (BYTE_TYPE(enc, ptr)) { 1374 case BT_DIGIT: 1375 case BT_HEX: 1376 case BT_MINUS: 1377 case BT_APOS: 1378 case BT_LPAR: 1379 case BT_RPAR: 1380 case BT_PLUS: 1381 case BT_COMMA: 1382 case BT_SOL: 1383 case BT_EQUALS: 1384 case BT_QUEST: 1385 case BT_CR: 1386 case BT_LF: 1387 case BT_SEMI: 1388 case BT_EXCL: 1389 case BT_AST: 1390 case BT_PERCNT: 1391 case BT_NUM: 1392 #ifdef XML_NS 1393 case BT_COLON: 1394 #endif 1395 break; 1396 case BT_S: 1397 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { 1398 *badPtr = ptr; 1399 return 0; 1400 } 1401 break; 1402 case BT_NAME: 1403 case BT_NMSTRT: 1404 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) 1405 break; 1406 default: 1407 switch (BYTE_TO_ASCII(enc, ptr)) { 1408 case 0x24: /* $ */ 1409 case 0x40: /* @ */ 1410 break; 1411 default: 1412 *badPtr = ptr; 1413 return 0; 1414 } 1415 break; 1416 } 1417 } 1418 return 1; 1419 } 1420 1421 /* This must only be called for a well-formed start-tag or empty 1422 element tag. Returns the number of attributes. Pointers to the 1423 first attsMax attributes are stored in atts. 1424 */ 1425 1426 static int PTRCALL 1427 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, 1428 int attsMax, ATTRIBUTE *atts) 1429 { 1430 enum { other, inName, inValue } state = inName; 1431 int nAtts = 0; 1432 int open = 0; /* defined when state == inValue; 1433 initialization just to shut up compilers */ 1434 1435 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { 1436 switch (BYTE_TYPE(enc, ptr)) { 1437 #define START_NAME \ 1438 if (state == other) { \ 1439 if (nAtts < attsMax) { \ 1440 atts[nAtts].name = ptr; \ 1441 atts[nAtts].normalized = 1; \ 1442 } \ 1443 state = inName; \ 1444 } 1445 #define LEAD_CASE(n) \ 1446 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; 1447 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1448 #undef LEAD_CASE 1449 case BT_NONASCII: 1450 case BT_NMSTRT: 1451 case BT_HEX: 1452 START_NAME 1453 break; 1454 #undef START_NAME 1455 case BT_QUOT: 1456 if (state != inValue) { 1457 if (nAtts < attsMax) 1458 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1459 state = inValue; 1460 open = BT_QUOT; 1461 } 1462 else if (open == BT_QUOT) { 1463 state = other; 1464 if (nAtts < attsMax) 1465 atts[nAtts].valueEnd = ptr; 1466 nAtts++; 1467 } 1468 break; 1469 case BT_APOS: 1470 if (state != inValue) { 1471 if (nAtts < attsMax) 1472 atts[nAtts].valuePtr = ptr + MINBPC(enc); 1473 state = inValue; 1474 open = BT_APOS; 1475 } 1476 else if (open == BT_APOS) { 1477 state = other; 1478 if (nAtts < attsMax) 1479 atts[nAtts].valueEnd = ptr; 1480 nAtts++; 1481 } 1482 break; 1483 case BT_AMP: 1484 if (nAtts < attsMax) 1485 atts[nAtts].normalized = 0; 1486 break; 1487 case BT_S: 1488 if (state == inName) 1489 state = other; 1490 else if (state == inValue 1491 && nAtts < attsMax 1492 && atts[nAtts].normalized 1493 && (ptr == atts[nAtts].valuePtr 1494 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE 1495 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE 1496 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) 1497 atts[nAtts].normalized = 0; 1498 break; 1499 case BT_CR: case BT_LF: 1500 /* This case ensures that the first attribute name is counted 1501 Apart from that we could just change state on the quote. */ 1502 if (state == inName) 1503 state = other; 1504 else if (state == inValue && nAtts < attsMax) 1505 atts[nAtts].normalized = 0; 1506 break; 1507 case BT_GT: 1508 case BT_SOL: 1509 if (state != inValue) 1510 return nAtts; 1511 break; 1512 default: 1513 break; 1514 } 1515 } 1516 /* not reached */ 1517 } 1518 1519 static int PTRFASTCALL 1520 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr) 1521 { 1522 int result = 0; 1523 /* skip &# */ 1524 ptr += 2*MINBPC(enc); 1525 if (CHAR_MATCHES(enc, ptr, ASCII_x)) { 1526 for (ptr += MINBPC(enc); 1527 !CHAR_MATCHES(enc, ptr, ASCII_SEMI); 1528 ptr += MINBPC(enc)) { 1529 int c = BYTE_TO_ASCII(enc, ptr); 1530 switch (c) { 1531 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4: 1532 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9: 1533 result <<= 4; 1534 result |= (c - ASCII_0); 1535 break; 1536 case ASCII_A: case ASCII_B: case ASCII_C: 1537 case ASCII_D: case ASCII_E: case ASCII_F: 1538 result <<= 4; 1539 result += 10 + (c - ASCII_A); 1540 break; 1541 case ASCII_a: case ASCII_b: case ASCII_c: 1542 case ASCII_d: case ASCII_e: case ASCII_f: 1543 result <<= 4; 1544 result += 10 + (c - ASCII_a); 1545 break; 1546 } 1547 if (result >= 0x110000) 1548 return -1; 1549 } 1550 } 1551 else { 1552 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { 1553 int c = BYTE_TO_ASCII(enc, ptr); 1554 result *= 10; 1555 result += (c - ASCII_0); 1556 if (result >= 0x110000) 1557 return -1; 1558 } 1559 } 1560 return checkCharRefNumber(result); 1561 } 1562 1563 static int PTRCALL 1564 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr, 1565 const char *end) 1566 { 1567 switch ((end - ptr)/MINBPC(enc)) { 1568 case 2: 1569 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { 1570 switch (BYTE_TO_ASCII(enc, ptr)) { 1571 case ASCII_l: 1572 return ASCII_LT; 1573 case ASCII_g: 1574 return ASCII_GT; 1575 } 1576 } 1577 break; 1578 case 3: 1579 if (CHAR_MATCHES(enc, ptr, ASCII_a)) { 1580 ptr += MINBPC(enc); 1581 if (CHAR_MATCHES(enc, ptr, ASCII_m)) { 1582 ptr += MINBPC(enc); 1583 if (CHAR_MATCHES(enc, ptr, ASCII_p)) 1584 return ASCII_AMP; 1585 } 1586 } 1587 break; 1588 case 4: 1589 switch (BYTE_TO_ASCII(enc, ptr)) { 1590 case ASCII_q: 1591 ptr += MINBPC(enc); 1592 if (CHAR_MATCHES(enc, ptr, ASCII_u)) { 1593 ptr += MINBPC(enc); 1594 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1595 ptr += MINBPC(enc); 1596 if (CHAR_MATCHES(enc, ptr, ASCII_t)) 1597 return ASCII_QUOT; 1598 } 1599 } 1600 break; 1601 case ASCII_a: 1602 ptr += MINBPC(enc); 1603 if (CHAR_MATCHES(enc, ptr, ASCII_p)) { 1604 ptr += MINBPC(enc); 1605 if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1606 ptr += MINBPC(enc); 1607 if (CHAR_MATCHES(enc, ptr, ASCII_s)) 1608 return ASCII_APOS; 1609 } 1610 } 1611 break; 1612 } 1613 } 1614 return 0; 1615 } 1616 1617 static int PTRCALL 1618 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) 1619 { 1620 for (;;) { 1621 switch (BYTE_TYPE(enc, ptr1)) { 1622 #define LEAD_CASE(n) \ 1623 case BT_LEAD ## n: \ 1624 if (*ptr1++ != *ptr2++) \ 1625 return 0; 1626 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) 1627 #undef LEAD_CASE 1628 /* fall through */ 1629 if (*ptr1++ != *ptr2++) 1630 return 0; 1631 break; 1632 case BT_NONASCII: 1633 case BT_NMSTRT: 1634 #ifdef XML_NS 1635 case BT_COLON: 1636 #endif 1637 case BT_HEX: 1638 case BT_DIGIT: 1639 case BT_NAME: 1640 case BT_MINUS: 1641 if (*ptr2++ != *ptr1++) 1642 return 0; 1643 if (MINBPC(enc) > 1) { 1644 if (*ptr2++ != *ptr1++) 1645 return 0; 1646 if (MINBPC(enc) > 2) { 1647 if (*ptr2++ != *ptr1++) 1648 return 0; 1649 if (MINBPC(enc) > 3) { 1650 if (*ptr2++ != *ptr1++) 1651 return 0; 1652 } 1653 } 1654 } 1655 break; 1656 default: 1657 if (MINBPC(enc) == 1 && *ptr1 == *ptr2) 1658 return 1; 1659 switch (BYTE_TYPE(enc, ptr2)) { 1660 case BT_LEAD2: 1661 case BT_LEAD3: 1662 case BT_LEAD4: 1663 case BT_NONASCII: 1664 case BT_NMSTRT: 1665 #ifdef XML_NS 1666 case BT_COLON: 1667 #endif 1668 case BT_HEX: 1669 case BT_DIGIT: 1670 case BT_NAME: 1671 case BT_MINUS: 1672 return 0; 1673 default: 1674 return 1; 1675 } 1676 } 1677 } 1678 /* not reached */ 1679 } 1680 1681 static int PTRCALL 1682 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1, 1683 const char *end1, const char *ptr2) 1684 { 1685 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { 1686 if (end1 - ptr1 < MINBPC(enc)) 1687 return 0; 1688 if (!CHAR_MATCHES(enc, ptr1, *ptr2)) 1689 return 0; 1690 } 1691 return ptr1 == end1; 1692 } 1693 1694 static int PTRFASTCALL 1695 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) 1696 { 1697 const char *start = ptr; 1698 for (;;) { 1699 switch (BYTE_TYPE(enc, ptr)) { 1700 #define LEAD_CASE(n) \ 1701 case BT_LEAD ## n: ptr += n; break; 1702 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1703 #undef LEAD_CASE 1704 case BT_NONASCII: 1705 case BT_NMSTRT: 1706 #ifdef XML_NS 1707 case BT_COLON: 1708 #endif 1709 case BT_HEX: 1710 case BT_DIGIT: 1711 case BT_NAME: 1712 case BT_MINUS: 1713 ptr += MINBPC(enc); 1714 break; 1715 default: 1716 return (int)(ptr - start); 1717 } 1718 } 1719 } 1720 1721 static const char * PTRFASTCALL 1722 PREFIX(skipS)(const ENCODING *enc, const char *ptr) 1723 { 1724 for (;;) { 1725 switch (BYTE_TYPE(enc, ptr)) { 1726 case BT_LF: 1727 case BT_CR: 1728 case BT_S: 1729 ptr += MINBPC(enc); 1730 break; 1731 default: 1732 return ptr; 1733 } 1734 } 1735 } 1736 1737 static void PTRCALL 1738 PREFIX(updatePosition)(const ENCODING *enc, 1739 const char *ptr, 1740 const char *end, 1741 POSITION *pos) 1742 { 1743 while (HAS_CHAR(enc, ptr, end)) { 1744 switch (BYTE_TYPE(enc, ptr)) { 1745 #define LEAD_CASE(n) \ 1746 case BT_LEAD ## n: \ 1747 ptr += n; \ 1748 break; 1749 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1750 #undef LEAD_CASE 1751 case BT_LF: 1752 pos->columnNumber = (XML_Size)-1; 1753 pos->lineNumber++; 1754 ptr += MINBPC(enc); 1755 break; 1756 case BT_CR: 1757 pos->lineNumber++; 1758 ptr += MINBPC(enc); 1759 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF) 1760 ptr += MINBPC(enc); 1761 pos->columnNumber = (XML_Size)-1; 1762 break; 1763 default: 1764 ptr += MINBPC(enc); 1765 break; 1766 } 1767 pos->columnNumber++; 1768 } 1769 } 1770 1771 #undef DO_LEAD_CASE 1772 #undef MULTIBYTE_CASES 1773 #undef INVALID_CASES 1774 #undef CHECK_NAME_CASE 1775 #undef CHECK_NAME_CASES 1776 #undef CHECK_NMSTRT_CASE 1777 #undef CHECK_NMSTRT_CASES 1778 1779 #endif /* XML_TOK_IMPL_C */ 1780