1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2 See the file COPYING for copying permission. 3 */ 4 5 #ifdef COMPILED_FROM_DSP 6 #include "winconfig.h" 7 #elif defined(MACOS_CLASSIC) 8 #include "macconfig.h" 9 #else 10 #include <expat_config.h> 11 #endif /* ndef COMPILED_FROM_DSP */ 12 13 #include "internal.h" 14 #include "xmltok.h" 15 #include "nametab.h" 16 17 #ifdef XML_DTD 18 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 19 #else 20 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 21 #endif 22 23 #define VTABLE1 \ 24 { PREFIX(prologTok), PREFIX(contentTok), \ 25 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ 26 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 27 PREFIX(sameName), \ 28 PREFIX(nameMatchesAscii), \ 29 PREFIX(nameLength), \ 30 PREFIX(skipS), \ 31 PREFIX(getAtts), \ 32 PREFIX(charRefNumber), \ 33 PREFIX(predefinedEntityName), \ 34 PREFIX(updatePosition), \ 35 PREFIX(isPublicId) 36 37 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 38 39 #define UCS2_GET_NAMING(pages, hi, lo) \ 40 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 41 42 /* A 2 byte UTF-8 representation splits the characters 11 bits between 43 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 44 pages, 3 bits to add to that index and 5 bits to generate the mask. 45 */ 46 #define UTF8_GET_NAMING2(pages, byte) \ 47 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 48 + ((((byte)[0]) & 3) << 1) \ 49 + ((((byte)[1]) >> 5) & 1)] \ 50 & (1 << (((byte)[1]) & 0x1F))) 51 52 /* A 3 byte UTF-8 representation splits the characters 16 bits between 53 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 54 into pages, 3 bits to add to that index and 5 bits to generate the 55 mask. 56 */ 57 #define UTF8_GET_NAMING3(pages, byte) \ 58 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 59 + ((((byte)[1]) >> 2) & 0xF)] \ 60 << 3) \ 61 + ((((byte)[1]) & 3) << 1) \ 62 + ((((byte)[2]) >> 5) & 1)] \ 63 & (1 << (((byte)[2]) & 0x1F))) 64 65 #define UTF8_GET_NAMING(pages, p, n) \ 66 ((n) == 2 \ 67 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 68 : ((n) == 3 \ 69 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 70 : 0)) 71 72 /* Detection of invalid UTF-8 sequences is based on Table 3.1B 73 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 74 with the additional restriction of not allowing the Unicode 75 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 76 Implementation details: 77 (A & 0x80) == 0 means A < 0x80 78 and 79 (A & 0xC0) == 0xC0 means A > 0xBF 80 */ 81 82 #define UTF8_INVALID2(p) \ 83 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 84 85 #define UTF8_INVALID3(p) \ 86 (((p)[2] & 0x80) == 0 \ 87 || \ 88 ((*p) == 0xEF && (p)[1] == 0xBF \ 89 ? \ 90 (p)[2] > 0xBD \ 91 : \ 92 ((p)[2] & 0xC0) == 0xC0) \ 93 || \ 94 ((*p) == 0xE0 \ 95 ? \ 96 (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 97 : \ 98 ((p)[1] & 0x80) == 0 \ 99 || \ 100 ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 101 102 #define UTF8_INVALID4(p) \ 103 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ 104 || \ 105 ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ 106 || \ 107 ((*p) == 0xF0 \ 108 ? \ 109 (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 110 : \ 111 ((p)[1] & 0x80) == 0 \ 112 || \ 113 ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 114 115 static int FASTCALL 116 isNever(const ENCODING *enc, const char *p) 117 { 118 return 0; 119 } 120 121 static int FASTCALL 122 utf8_isName2(const ENCODING *enc, const char *p) 123 { 124 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 125 } 126 127 static int FASTCALL 128 utf8_isName3(const ENCODING *enc, const char *p) 129 { 130 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 131 } 132 133 #define utf8_isName4 isNever 134 135 static int FASTCALL 136 utf8_isNmstrt2(const ENCODING *enc, const char *p) 137 { 138 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 139 } 140 141 static int FASTCALL 142 utf8_isNmstrt3(const ENCODING *enc, const char *p) 143 { 144 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 145 } 146 147 #define utf8_isNmstrt4 isNever 148 149 static int FASTCALL 150 utf8_isInvalid2(const ENCODING *enc, const char *p) 151 { 152 return UTF8_INVALID2((const unsigned char *)p); 153 } 154 155 static int FASTCALL 156 utf8_isInvalid3(const ENCODING *enc, const char *p) 157 { 158 return UTF8_INVALID3((const unsigned char *)p); 159 } 160 161 static int FASTCALL 162 utf8_isInvalid4(const ENCODING *enc, const char *p) 163 { 164 return UTF8_INVALID4((const unsigned char *)p); 165 } 166 167 struct normal_encoding { 168 ENCODING enc; 169 unsigned char type[256]; 170 #ifdef XML_MIN_SIZE 171 int (FASTCALL *byteType)(const ENCODING *, const char *); 172 int (FASTCALL *isNameMin)(const ENCODING *, const char *); 173 int (FASTCALL *isNmstrtMin)(const ENCODING *, const char *); 174 int (FASTCALL *byteToAscii)(const ENCODING *, const char *); 175 int (FASTCALL *charMatches)(const ENCODING *, const char *, int); 176 #endif /* XML_MIN_SIZE */ 177 int (FASTCALL *isName2)(const ENCODING *, const char *); 178 int (FASTCALL *isName3)(const ENCODING *, const char *); 179 int (FASTCALL *isName4)(const ENCODING *, const char *); 180 int (FASTCALL *isNmstrt2)(const ENCODING *, const char *); 181 int (FASTCALL *isNmstrt3)(const ENCODING *, const char *); 182 int (FASTCALL *isNmstrt4)(const ENCODING *, const char *); 183 int (FASTCALL *isInvalid2)(const ENCODING *, const char *); 184 int (FASTCALL *isInvalid3)(const ENCODING *, const char *); 185 int (FASTCALL *isInvalid4)(const ENCODING *, const char *); 186 }; 187 188 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) 189 190 #ifdef XML_MIN_SIZE 191 192 #define STANDARD_VTABLE(E) \ 193 E ## byteType, \ 194 E ## isNameMin, \ 195 E ## isNmstrtMin, \ 196 E ## byteToAscii, \ 197 E ## charMatches, 198 199 #else 200 201 #define STANDARD_VTABLE(E) /* as nothing */ 202 203 #endif 204 205 #define NORMAL_VTABLE(E) \ 206 E ## isName2, \ 207 E ## isName3, \ 208 E ## isName4, \ 209 E ## isNmstrt2, \ 210 E ## isNmstrt3, \ 211 E ## isNmstrt4, \ 212 E ## isInvalid2, \ 213 E ## isInvalid3, \ 214 E ## isInvalid4 215 216 static int FASTCALL checkCharRefNumber(int); 217 218 #include "xmltok_impl.h" 219 #include "ascii.h" 220 221 #ifdef XML_MIN_SIZE 222 #define sb_isNameMin isNever 223 #define sb_isNmstrtMin isNever 224 #endif 225 226 #ifdef XML_MIN_SIZE 227 #define MINBPC(enc) ((enc)->minBytesPerChar) 228 #else 229 /* minimum bytes per character */ 230 #define MINBPC(enc) 1 231 #endif 232 233 #define SB_BYTE_TYPE(enc, p) \ 234 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 235 236 #ifdef XML_MIN_SIZE 237 static int FASTCALL 238 sb_byteType(const ENCODING *enc, const char *p) 239 { 240 return SB_BYTE_TYPE(enc, p); 241 } 242 #define BYTE_TYPE(enc, p) \ 243 (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 244 #else 245 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 246 #endif 247 248 #ifdef XML_MIN_SIZE 249 #define BYTE_TO_ASCII(enc, p) \ 250 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 251 static int FASTCALL 252 sb_byteToAscii(const ENCODING *enc, const char *p) 253 { 254 return *p; 255 } 256 #else 257 #define BYTE_TO_ASCII(enc, p) (*(p)) 258 #endif 259 260 #define IS_NAME_CHAR(enc, p, n) \ 261 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 262 #define IS_NMSTRT_CHAR(enc, p, n) \ 263 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 264 #define IS_INVALID_CHAR(enc, p, n) \ 265 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 266 267 #ifdef XML_MIN_SIZE 268 #define IS_NAME_CHAR_MINBPC(enc, p) \ 269 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 270 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 271 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 272 #else 273 #define IS_NAME_CHAR_MINBPC(enc, p) (0) 274 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 275 #endif 276 277 #ifdef XML_MIN_SIZE 278 #define CHAR_MATCHES(enc, p, c) \ 279 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 280 static int FASTCALL 281 sb_charMatches(const ENCODING *enc, const char *p, int c) 282 { 283 return *p == c; 284 } 285 #else 286 /* c is an ASCII character */ 287 #define CHAR_MATCHES(enc, p, c) (*(p) == c) 288 #endif 289 290 #define PREFIX(ident) normal_ ## ident 291 #include "xmltok_impl.c" 292 293 #undef MINBPC 294 #undef BYTE_TYPE 295 #undef BYTE_TO_ASCII 296 #undef CHAR_MATCHES 297 #undef IS_NAME_CHAR 298 #undef IS_NAME_CHAR_MINBPC 299 #undef IS_NMSTRT_CHAR 300 #undef IS_NMSTRT_CHAR_MINBPC 301 #undef IS_INVALID_CHAR 302 303 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 304 UTF8_cval1 = 0x00, 305 UTF8_cval2 = 0xc0, 306 UTF8_cval3 = 0xe0, 307 UTF8_cval4 = 0xf0 308 }; 309 310 static void FASTCALL 311 utf8_toUtf8(const ENCODING *enc, 312 const char **fromP, const char *fromLim, 313 char **toP, const char *toLim) 314 { 315 char *to; 316 const char *from; 317 if (fromLim - *fromP > toLim - *toP) { 318 /* Avoid copying partial characters. */ 319 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) 320 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) 321 break; 322 } 323 for (to = *toP, from = *fromP; from != fromLim; from++, to++) 324 *to = *from; 325 *fromP = from; 326 *toP = to; 327 } 328 329 static void FASTCALL 330 utf8_toUtf16(const ENCODING *enc, 331 const char **fromP, const char *fromLim, 332 unsigned short **toP, const unsigned short *toLim) 333 { 334 unsigned short *to = *toP; 335 const char *from = *fromP; 336 while (from != fromLim && to != toLim) { 337 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 338 case BT_LEAD2: 339 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 340 from += 2; 341 break; 342 case BT_LEAD3: 343 *to++ = (unsigned short)(((from[0] & 0xf) << 12) 344 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); 345 from += 3; 346 break; 347 case BT_LEAD4: 348 { 349 unsigned long n; 350 if (to + 1 == toLim) 351 goto after; 352 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 353 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 354 n -= 0x10000; 355 to[0] = (unsigned short)((n >> 10) | 0xD800); 356 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 357 to += 2; 358 from += 4; 359 } 360 break; 361 default: 362 *to++ = *from++; 363 break; 364 } 365 } 366 after: 367 *fromP = from; 368 *toP = to; 369 } 370 371 #ifdef XML_NS 372 static const struct normal_encoding utf8_encoding_ns = { 373 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 374 { 375 #include "asciitab.h" 376 #include "utf8tab.h" 377 }, 378 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 379 }; 380 #endif 381 382 static const struct normal_encoding utf8_encoding = { 383 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 384 { 385 #define BT_COLON BT_NMSTRT 386 #include "asciitab.h" 387 #undef BT_COLON 388 #include "utf8tab.h" 389 }, 390 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 391 }; 392 393 #ifdef XML_NS 394 395 static const struct normal_encoding internal_utf8_encoding_ns = { 396 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 397 { 398 #include "iasciitab.h" 399 #include "utf8tab.h" 400 }, 401 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 402 }; 403 404 #endif 405 406 static const struct normal_encoding internal_utf8_encoding = { 407 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, 408 { 409 #define BT_COLON BT_NMSTRT 410 #include "iasciitab.h" 411 #undef BT_COLON 412 #include "utf8tab.h" 413 }, 414 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) 415 }; 416 417 static void FASTCALL 418 latin1_toUtf8(const ENCODING *enc, 419 const char **fromP, const char *fromLim, 420 char **toP, const char *toLim) 421 { 422 for (;;) { 423 unsigned char c; 424 if (*fromP == fromLim) 425 break; 426 c = (unsigned char)**fromP; 427 if (c & 0x80) { 428 if (toLim - *toP < 2) 429 break; 430 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 431 *(*toP)++ = (char)((c & 0x3f) | 0x80); 432 (*fromP)++; 433 } 434 else { 435 if (*toP == toLim) 436 break; 437 *(*toP)++ = *(*fromP)++; 438 } 439 } 440 } 441 442 static void FASTCALL 443 latin1_toUtf16(const ENCODING *enc, 444 const char **fromP, const char *fromLim, 445 unsigned short **toP, const unsigned short *toLim) 446 { 447 while (*fromP != fromLim && *toP != toLim) 448 *(*toP)++ = (unsigned char)*(*fromP)++; 449 } 450 451 #ifdef XML_NS 452 453 static const struct normal_encoding latin1_encoding_ns = { 454 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 455 { 456 #include "asciitab.h" 457 #include "latin1tab.h" 458 }, 459 STANDARD_VTABLE(sb_) 460 }; 461 462 #endif 463 464 static const struct normal_encoding latin1_encoding = { 465 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, 466 { 467 #define BT_COLON BT_NMSTRT 468 #include "asciitab.h" 469 #undef BT_COLON 470 #include "latin1tab.h" 471 }, 472 STANDARD_VTABLE(sb_) 473 }; 474 475 static void FASTCALL 476 ascii_toUtf8(const ENCODING *enc, 477 const char **fromP, const char *fromLim, 478 char **toP, const char *toLim) 479 { 480 while (*fromP != fromLim && *toP != toLim) 481 *(*toP)++ = *(*fromP)++; 482 } 483 484 #ifdef XML_NS 485 486 static const struct normal_encoding ascii_encoding_ns = { 487 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 488 { 489 #include "asciitab.h" 490 /* BT_NONXML == 0 */ 491 }, 492 STANDARD_VTABLE(sb_) 493 }; 494 495 #endif 496 497 static const struct normal_encoding ascii_encoding = { 498 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, 499 { 500 #define BT_COLON BT_NMSTRT 501 #include "asciitab.h" 502 #undef BT_COLON 503 /* BT_NONXML == 0 */ 504 }, 505 STANDARD_VTABLE(sb_) 506 }; 507 508 static int FASTCALL 509 unicode_byte_type(char hi, char lo) 510 { 511 switch ((unsigned char)hi) { 512 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 513 return BT_LEAD4; 514 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 515 return BT_TRAIL; 516 case 0xFF: 517 switch ((unsigned char)lo) { 518 case 0xFF: 519 case 0xFE: 520 return BT_NONXML; 521 } 522 break; 523 } 524 return BT_NONASCII; 525 } 526 527 #define DEFINE_UTF16_TO_UTF8(E) \ 528 static void FASTCALL \ 529 E ## toUtf8(const ENCODING *enc, \ 530 const char **fromP, const char *fromLim, \ 531 char **toP, const char *toLim) \ 532 { \ 533 const char *from; \ 534 for (from = *fromP; from != fromLim; from += 2) { \ 535 int plane; \ 536 unsigned char lo2; \ 537 unsigned char lo = GET_LO(from); \ 538 unsigned char hi = GET_HI(from); \ 539 switch (hi) { \ 540 case 0: \ 541 if (lo < 0x80) { \ 542 if (*toP == toLim) { \ 543 *fromP = from; \ 544 return; \ 545 } \ 546 *(*toP)++ = lo; \ 547 break; \ 548 } \ 549 /* fall through */ \ 550 case 0x1: case 0x2: case 0x3: \ 551 case 0x4: case 0x5: case 0x6: case 0x7: \ 552 if (toLim - *toP < 2) { \ 553 *fromP = from; \ 554 return; \ 555 } \ 556 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 557 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 558 break; \ 559 default: \ 560 if (toLim - *toP < 3) { \ 561 *fromP = from; \ 562 return; \ 563 } \ 564 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 565 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 566 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 567 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 568 break; \ 569 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 570 if (toLim - *toP < 4) { \ 571 *fromP = from; \ 572 return; \ 573 } \ 574 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 575 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 576 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 577 from += 2; \ 578 lo2 = GET_LO(from); \ 579 *(*toP)++ = (((lo & 0x3) << 4) \ 580 | ((GET_HI(from) & 0x3) << 2) \ 581 | (lo2 >> 6) \ 582 | 0x80); \ 583 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 584 break; \ 585 } \ 586 } \ 587 *fromP = from; \ 588 } 589 590 #define DEFINE_UTF16_TO_UTF16(E) \ 591 static void FASTCALL \ 592 E ## toUtf16(const ENCODING *enc, \ 593 const char **fromP, const char *fromLim, \ 594 unsigned short **toP, const unsigned short *toLim) \ 595 { \ 596 /* Avoid copying first half only of surrogate */ \ 597 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 598 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ 599 fromLim -= 2; \ 600 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ 601 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 602 } 603 604 #define SET2(ptr, ch) \ 605 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 606 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 607 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 608 609 DEFINE_UTF16_TO_UTF8(little2_) 610 DEFINE_UTF16_TO_UTF16(little2_) 611 612 #undef SET2 613 #undef GET_LO 614 #undef GET_HI 615 616 #define SET2(ptr, ch) \ 617 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 618 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 619 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 620 621 DEFINE_UTF16_TO_UTF8(big2_) 622 DEFINE_UTF16_TO_UTF16(big2_) 623 624 #undef SET2 625 #undef GET_LO 626 #undef GET_HI 627 628 #define LITTLE2_BYTE_TYPE(enc, p) \ 629 ((p)[1] == 0 \ 630 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 631 : unicode_byte_type((p)[1], (p)[0])) 632 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 633 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 634 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 635 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 636 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 637 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 638 639 #ifdef XML_MIN_SIZE 640 641 static int FASTCALL 642 little2_byteType(const ENCODING *enc, const char *p) 643 { 644 return LITTLE2_BYTE_TYPE(enc, p); 645 } 646 647 static int FASTCALL 648 little2_byteToAscii(const ENCODING *enc, const char *p) 649 { 650 return LITTLE2_BYTE_TO_ASCII(enc, p); 651 } 652 653 static int FASTCALL 654 little2_charMatches(const ENCODING *enc, const char *p, int c) 655 { 656 return LITTLE2_CHAR_MATCHES(enc, p, c); 657 } 658 659 static int FASTCALL 660 little2_isNameMin(const ENCODING *enc, const char *p) 661 { 662 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); 663 } 664 665 static int FASTCALL 666 little2_isNmstrtMin(const ENCODING *enc, const char *p) 667 { 668 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); 669 } 670 671 #undef VTABLE 672 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 673 674 #else /* not XML_MIN_SIZE */ 675 676 #undef PREFIX 677 #define PREFIX(ident) little2_ ## ident 678 #define MINBPC(enc) 2 679 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 680 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 681 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 682 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) 683 #define IS_NAME_CHAR(enc, p, n) 0 684 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) 685 #define IS_NMSTRT_CHAR(enc, p, n) (0) 686 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) 687 688 #include "xmltok_impl.c" 689 690 #undef MINBPC 691 #undef BYTE_TYPE 692 #undef BYTE_TO_ASCII 693 #undef CHAR_MATCHES 694 #undef IS_NAME_CHAR 695 #undef IS_NAME_CHAR_MINBPC 696 #undef IS_NMSTRT_CHAR 697 #undef IS_NMSTRT_CHAR_MINBPC 698 #undef IS_INVALID_CHAR 699 700 #endif /* not XML_MIN_SIZE */ 701 702 #ifdef XML_NS 703 704 static const struct normal_encoding little2_encoding_ns = { 705 { VTABLE, 2, 0, 706 #if BYTEORDER == 1234 707 1 708 #else 709 0 710 #endif 711 }, 712 { 713 #include "asciitab.h" 714 #include "latin1tab.h" 715 }, 716 STANDARD_VTABLE(little2_) 717 }; 718 719 #endif 720 721 static const struct normal_encoding little2_encoding = { 722 { VTABLE, 2, 0, 723 #if BYTEORDER == 1234 724 1 725 #else 726 0 727 #endif 728 }, 729 { 730 #define BT_COLON BT_NMSTRT 731 #include "asciitab.h" 732 #undef BT_COLON 733 #include "latin1tab.h" 734 }, 735 STANDARD_VTABLE(little2_) 736 }; 737 738 #if BYTEORDER != 4321 739 740 #ifdef XML_NS 741 742 static const struct normal_encoding internal_little2_encoding_ns = { 743 { VTABLE, 2, 0, 1 }, 744 { 745 #include "iasciitab.h" 746 #include "latin1tab.h" 747 }, 748 STANDARD_VTABLE(little2_) 749 }; 750 751 #endif 752 753 static const struct normal_encoding internal_little2_encoding = { 754 { VTABLE, 2, 0, 1 }, 755 { 756 #define BT_COLON BT_NMSTRT 757 #include "iasciitab.h" 758 #undef BT_COLON 759 #include "latin1tab.h" 760 }, 761 STANDARD_VTABLE(little2_) 762 }; 763 764 #endif 765 766 767 #define BIG2_BYTE_TYPE(enc, p) \ 768 ((p)[0] == 0 \ 769 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 770 : unicode_byte_type((p)[0], (p)[1])) 771 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 772 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 773 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ 774 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 775 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 776 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 777 778 #ifdef XML_MIN_SIZE 779 780 static int FASTCALL 781 big2_byteType(const ENCODING *enc, const char *p) 782 { 783 return BIG2_BYTE_TYPE(enc, p); 784 } 785 786 static int FASTCALL 787 big2_byteToAscii(const ENCODING *enc, const char *p) 788 { 789 return BIG2_BYTE_TO_ASCII(enc, p); 790 } 791 792 static int FASTCALL 793 big2_charMatches(const ENCODING *enc, const char *p, int c) 794 { 795 return BIG2_CHAR_MATCHES(enc, p, c); 796 } 797 798 static int FASTCALL 799 big2_isNameMin(const ENCODING *enc, const char *p) 800 { 801 return BIG2_IS_NAME_CHAR_MINBPC(enc, p); 802 } 803 804 static int FASTCALL 805 big2_isNmstrtMin(const ENCODING *enc, const char *p) 806 { 807 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); 808 } 809 810 #undef VTABLE 811 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 812 813 #else /* not XML_MIN_SIZE */ 814 815 #undef PREFIX 816 #define PREFIX(ident) big2_ ## ident 817 #define MINBPC(enc) 2 818 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 819 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 820 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 821 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 822 #define IS_NAME_CHAR(enc, p, n) 0 823 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 824 #define IS_NMSTRT_CHAR(enc, p, n) (0) 825 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 826 827 #include "xmltok_impl.c" 828 829 #undef MINBPC 830 #undef BYTE_TYPE 831 #undef BYTE_TO_ASCII 832 #undef CHAR_MATCHES 833 #undef IS_NAME_CHAR 834 #undef IS_NAME_CHAR_MINBPC 835 #undef IS_NMSTRT_CHAR 836 #undef IS_NMSTRT_CHAR_MINBPC 837 #undef IS_INVALID_CHAR 838 839 #endif /* not XML_MIN_SIZE */ 840 841 #ifdef XML_NS 842 843 static const struct normal_encoding big2_encoding_ns = { 844 { VTABLE, 2, 0, 845 #if BYTEORDER == 4321 846 1 847 #else 848 0 849 #endif 850 }, 851 { 852 #include "asciitab.h" 853 #include "latin1tab.h" 854 }, 855 STANDARD_VTABLE(big2_) 856 }; 857 858 #endif 859 860 static const struct normal_encoding big2_encoding = { 861 { VTABLE, 2, 0, 862 #if BYTEORDER == 4321 863 1 864 #else 865 0 866 #endif 867 }, 868 { 869 #define BT_COLON BT_NMSTRT 870 #include "asciitab.h" 871 #undef BT_COLON 872 #include "latin1tab.h" 873 }, 874 STANDARD_VTABLE(big2_) 875 }; 876 877 #if BYTEORDER != 1234 878 879 #ifdef XML_NS 880 881 static const struct normal_encoding internal_big2_encoding_ns = { 882 { VTABLE, 2, 0, 1 }, 883 { 884 #include "iasciitab.h" 885 #include "latin1tab.h" 886 }, 887 STANDARD_VTABLE(big2_) 888 }; 889 890 #endif 891 892 static const struct normal_encoding internal_big2_encoding = { 893 { VTABLE, 2, 0, 1 }, 894 { 895 #define BT_COLON BT_NMSTRT 896 #include "iasciitab.h" 897 #undef BT_COLON 898 #include "latin1tab.h" 899 }, 900 STANDARD_VTABLE(big2_) 901 }; 902 903 #endif 904 905 #undef PREFIX 906 907 static int FASTCALL 908 streqci(const char *s1, const char *s2) 909 { 910 for (;;) { 911 char c1 = *s1++; 912 char c2 = *s2++; 913 if (ASCII_a <= c1 && c1 <= ASCII_z) 914 c1 += ASCII_A - ASCII_a; 915 if (ASCII_a <= c2 && c2 <= ASCII_z) 916 c2 += ASCII_A - ASCII_a; 917 if (c1 != c2) 918 return 0; 919 if (!c1) 920 break; 921 } 922 return 1; 923 } 924 925 static void FASTCALL 926 initUpdatePosition(const ENCODING *enc, const char *ptr, 927 const char *end, POSITION *pos) 928 { 929 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 930 } 931 932 static int FASTCALL 933 toAscii(const ENCODING *enc, const char *ptr, const char *end) 934 { 935 char buf[1]; 936 char *p = buf; 937 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 938 if (p == buf) 939 return -1; 940 else 941 return buf[0]; 942 } 943 944 static int FASTCALL 945 isSpace(int c) 946 { 947 switch (c) { 948 case 0x20: 949 case 0xD: 950 case 0xA: 951 case 0x9: 952 return 1; 953 } 954 return 0; 955 } 956 957 /* Return 1 if there's just optional white space or there's an S 958 followed by name=val. 959 */ 960 static int FASTCALL 961 parsePseudoAttribute(const ENCODING *enc, 962 const char *ptr, 963 const char *end, 964 const char **namePtr, 965 const char **nameEndPtr, 966 const char **valPtr, 967 const char **nextTokPtr) 968 { 969 int c; 970 char open; 971 if (ptr == end) { 972 *namePtr = NULL; 973 return 1; 974 } 975 if (!isSpace(toAscii(enc, ptr, end))) { 976 *nextTokPtr = ptr; 977 return 0; 978 } 979 do { 980 ptr += enc->minBytesPerChar; 981 } while (isSpace(toAscii(enc, ptr, end))); 982 if (ptr == end) { 983 *namePtr = NULL; 984 return 1; 985 } 986 *namePtr = ptr; 987 for (;;) { 988 c = toAscii(enc, ptr, end); 989 if (c == -1) { 990 *nextTokPtr = ptr; 991 return 0; 992 } 993 if (c == ASCII_EQUALS) { 994 *nameEndPtr = ptr; 995 break; 996 } 997 if (isSpace(c)) { 998 *nameEndPtr = ptr; 999 do { 1000 ptr += enc->minBytesPerChar; 1001 } while (isSpace(c = toAscii(enc, ptr, end))); 1002 if (c != ASCII_EQUALS) { 1003 *nextTokPtr = ptr; 1004 return 0; 1005 } 1006 break; 1007 } 1008 ptr += enc->minBytesPerChar; 1009 } 1010 if (ptr == *namePtr) { 1011 *nextTokPtr = ptr; 1012 return 0; 1013 } 1014 ptr += enc->minBytesPerChar; 1015 c = toAscii(enc, ptr, end); 1016 while (isSpace(c)) { 1017 ptr += enc->minBytesPerChar; 1018 c = toAscii(enc, ptr, end); 1019 } 1020 if (c != ASCII_QUOT && c != ASCII_APOS) { 1021 *nextTokPtr = ptr; 1022 return 0; 1023 } 1024 open = (char)c; 1025 ptr += enc->minBytesPerChar; 1026 *valPtr = ptr; 1027 for (;; ptr += enc->minBytesPerChar) { 1028 c = toAscii(enc, ptr, end); 1029 if (c == open) 1030 break; 1031 if (!(ASCII_a <= c && c <= ASCII_z) 1032 && !(ASCII_A <= c && c <= ASCII_Z) 1033 && !(ASCII_0 <= c && c <= ASCII_9) 1034 && c != ASCII_PERIOD 1035 && c != ASCII_MINUS 1036 && c != ASCII_UNDERSCORE) { 1037 *nextTokPtr = ptr; 1038 return 0; 1039 } 1040 } 1041 *nextTokPtr = ptr + enc->minBytesPerChar; 1042 return 1; 1043 } 1044 1045 static const char KW_version[] = { 1046 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0' 1047 }; 1048 1049 static const char KW_encoding[] = { 1050 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' 1051 }; 1052 1053 static const char KW_standalone[] = { 1054 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, 1055 ASCII_n, ASCII_e, '\0' 1056 }; 1057 1058 static const char KW_yes[] = { 1059 ASCII_y, ASCII_e, ASCII_s, '\0' 1060 }; 1061 1062 static const char KW_no[] = { 1063 ASCII_n, ASCII_o, '\0' 1064 }; 1065 1066 static int 1067 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, 1068 const char *, 1069 const char *), 1070 int isGeneralTextEntity, 1071 const ENCODING *enc, 1072 const char *ptr, 1073 const char *end, 1074 const char **badPtr, 1075 const char **versionPtr, 1076 const char **versionEndPtr, 1077 const char **encodingName, 1078 const ENCODING **encoding, 1079 int *standalone) 1080 { 1081 const char *val = NULL; 1082 const char *name = NULL; 1083 const char *nameEnd = NULL; 1084 ptr += 5 * enc->minBytesPerChar; 1085 end -= 2 * enc->minBytesPerChar; 1086 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1087 || !name) { 1088 *badPtr = ptr; 1089 return 0; 1090 } 1091 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1092 if (!isGeneralTextEntity) { 1093 *badPtr = name; 1094 return 0; 1095 } 1096 } 1097 else { 1098 if (versionPtr) 1099 *versionPtr = val; 1100 if (versionEndPtr) 1101 *versionEndPtr = ptr; 1102 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1103 *badPtr = ptr; 1104 return 0; 1105 } 1106 if (!name) { 1107 if (isGeneralTextEntity) { 1108 /* a TextDecl must have an EncodingDecl */ 1109 *badPtr = ptr; 1110 return 0; 1111 } 1112 return 1; 1113 } 1114 } 1115 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1116 int c = toAscii(enc, val, end); 1117 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { 1118 *badPtr = val; 1119 return 0; 1120 } 1121 if (encodingName) 1122 *encodingName = val; 1123 if (encoding) 1124 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1125 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1126 *badPtr = ptr; 1127 return 0; 1128 } 1129 if (!name) 1130 return 1; 1131 } 1132 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1133 || isGeneralTextEntity) { 1134 *badPtr = name; 1135 return 0; 1136 } 1137 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1138 if (standalone) 1139 *standalone = 1; 1140 } 1141 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1142 if (standalone) 1143 *standalone = 0; 1144 } 1145 else { 1146 *badPtr = val; 1147 return 0; 1148 } 1149 while (isSpace(toAscii(enc, ptr, end))) 1150 ptr += enc->minBytesPerChar; 1151 if (ptr != end) { 1152 *badPtr = ptr; 1153 return 0; 1154 } 1155 return 1; 1156 } 1157 1158 static int FASTCALL 1159 checkCharRefNumber(int result) 1160 { 1161 switch (result >> 8) { 1162 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 1163 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 1164 return -1; 1165 case 0: 1166 if (latin1_encoding.type[result] == BT_NONXML) 1167 return -1; 1168 break; 1169 case 0xFF: 1170 if (result == 0xFFFE || result == 0xFFFF) 1171 return -1; 1172 break; 1173 } 1174 return result; 1175 } 1176 1177 int 1178 XmlUtf8Encode(int c, char *buf) 1179 { 1180 enum { 1181 /* minN is minimum legal resulting value for N byte sequence */ 1182 min2 = 0x80, 1183 min3 = 0x800, 1184 min4 = 0x10000 1185 }; 1186 1187 if (c < 0) 1188 return 0; 1189 if (c < min2) { 1190 buf[0] = (char)(c | UTF8_cval1); 1191 return 1; 1192 } 1193 if (c < min3) { 1194 buf[0] = (char)((c >> 6) | UTF8_cval2); 1195 buf[1] = (char)((c & 0x3f) | 0x80); 1196 return 2; 1197 } 1198 if (c < min4) { 1199 buf[0] = (char)((c >> 12) | UTF8_cval3); 1200 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1201 buf[2] = (char)((c & 0x3f) | 0x80); 1202 return 3; 1203 } 1204 if (c < 0x110000) { 1205 buf[0] = (char)((c >> 18) | UTF8_cval4); 1206 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1207 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1208 buf[3] = (char)((c & 0x3f) | 0x80); 1209 return 4; 1210 } 1211 return 0; 1212 } 1213 1214 int 1215 XmlUtf16Encode(int charNum, unsigned short *buf) 1216 { 1217 if (charNum < 0) 1218 return 0; 1219 if (charNum < 0x10000) { 1220 buf[0] = (unsigned short)charNum; 1221 return 1; 1222 } 1223 if (charNum < 0x110000) { 1224 charNum -= 0x10000; 1225 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1226 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1227 return 2; 1228 } 1229 return 0; 1230 } 1231 1232 struct unknown_encoding { 1233 struct normal_encoding normal; 1234 int (*convert)(void *userData, const char *p); 1235 void *userData; 1236 unsigned short utf16[256]; 1237 char utf8[256][4]; 1238 }; 1239 1240 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) 1241 1242 int 1243 XmlSizeOfUnknownEncoding(void) 1244 { 1245 return sizeof(struct unknown_encoding); 1246 } 1247 1248 static int FASTCALL 1249 unknown_isName(const ENCODING *enc, const char *p) 1250 { 1251 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1252 int c = uenc->convert(uenc->userData, p); 1253 if (c & ~0xFFFF) 1254 return 0; 1255 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1256 } 1257 1258 static int FASTCALL 1259 unknown_isNmstrt(const ENCODING *enc, const char *p) 1260 { 1261 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1262 int c = uenc->convert(uenc->userData, p); 1263 if (c & ~0xFFFF) 1264 return 0; 1265 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1266 } 1267 1268 static int FASTCALL 1269 unknown_isInvalid(const ENCODING *enc, const char *p) 1270 { 1271 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1272 int c = uenc->convert(uenc->userData, p); 1273 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1274 } 1275 1276 static void FASTCALL 1277 unknown_toUtf8(const ENCODING *enc, 1278 const char **fromP, const char *fromLim, 1279 char **toP, const char *toLim) 1280 { 1281 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1282 char buf[XML_UTF8_ENCODE_MAX]; 1283 for (;;) { 1284 const char *utf8; 1285 int n; 1286 if (*fromP == fromLim) 1287 break; 1288 utf8 = uenc->utf8[(unsigned char)**fromP]; 1289 n = *utf8++; 1290 if (n == 0) { 1291 int c = uenc->convert(uenc->userData, *fromP); 1292 n = XmlUtf8Encode(c, buf); 1293 if (n > toLim - *toP) 1294 break; 1295 utf8 = buf; 1296 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1297 - (BT_LEAD2 - 2)); 1298 } 1299 else { 1300 if (n > toLim - *toP) 1301 break; 1302 (*fromP)++; 1303 } 1304 do { 1305 *(*toP)++ = *utf8++; 1306 } while (--n != 0); 1307 } 1308 } 1309 1310 static void FASTCALL 1311 unknown_toUtf16(const ENCODING *enc, 1312 const char **fromP, const char *fromLim, 1313 unsigned short **toP, const unsigned short *toLim) 1314 { 1315 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1316 while (*fromP != fromLim && *toP != toLim) { 1317 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1318 if (c == 0) { 1319 c = (unsigned short) 1320 uenc->convert(uenc->userData, *fromP); 1321 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1322 - (BT_LEAD2 - 2)); 1323 } 1324 else 1325 (*fromP)++; 1326 *(*toP)++ = c; 1327 } 1328 } 1329 1330 ENCODING * 1331 XmlInitUnknownEncoding(void *mem, 1332 int *table, 1333 int (*convert)(void *userData, const char *p), 1334 void *userData) 1335 { 1336 int i; 1337 struct unknown_encoding *e = mem; 1338 for (i = 0; i < (int)sizeof(struct normal_encoding); i++) 1339 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1340 for (i = 0; i < 128; i++) 1341 if (latin1_encoding.type[i] != BT_OTHER 1342 && latin1_encoding.type[i] != BT_NONXML 1343 && table[i] != i) 1344 return 0; 1345 for (i = 0; i < 256; i++) { 1346 int c = table[i]; 1347 if (c == -1) { 1348 e->normal.type[i] = BT_MALFORM; 1349 /* This shouldn't really get used. */ 1350 e->utf16[i] = 0xFFFF; 1351 e->utf8[i][0] = 1; 1352 e->utf8[i][1] = 0; 1353 } 1354 else if (c < 0) { 1355 if (c < -4) 1356 return 0; 1357 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1358 e->utf8[i][0] = 0; 1359 e->utf16[i] = 0; 1360 } 1361 else if (c < 0x80) { 1362 if (latin1_encoding.type[c] != BT_OTHER 1363 && latin1_encoding.type[c] != BT_NONXML 1364 && c != i) 1365 return 0; 1366 e->normal.type[i] = latin1_encoding.type[c]; 1367 e->utf8[i][0] = 1; 1368 e->utf8[i][1] = (char)c; 1369 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1370 } 1371 else if (checkCharRefNumber(c) < 0) { 1372 e->normal.type[i] = BT_NONXML; 1373 /* This shouldn't really get used. */ 1374 e->utf16[i] = 0xFFFF; 1375 e->utf8[i][0] = 1; 1376 e->utf8[i][1] = 0; 1377 } 1378 else { 1379 if (c > 0xFFFF) 1380 return 0; 1381 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1382 e->normal.type[i] = BT_NMSTRT; 1383 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1384 e->normal.type[i] = BT_NAME; 1385 else 1386 e->normal.type[i] = BT_OTHER; 1387 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1388 e->utf16[i] = (unsigned short)c; 1389 } 1390 } 1391 e->userData = userData; 1392 e->convert = convert; 1393 if (convert) { 1394 e->normal.isName2 = unknown_isName; 1395 e->normal.isName3 = unknown_isName; 1396 e->normal.isName4 = unknown_isName; 1397 e->normal.isNmstrt2 = unknown_isNmstrt; 1398 e->normal.isNmstrt3 = unknown_isNmstrt; 1399 e->normal.isNmstrt4 = unknown_isNmstrt; 1400 e->normal.isInvalid2 = unknown_isInvalid; 1401 e->normal.isInvalid3 = unknown_isInvalid; 1402 e->normal.isInvalid4 = unknown_isInvalid; 1403 } 1404 e->normal.enc.utf8Convert = unknown_toUtf8; 1405 e->normal.enc.utf16Convert = unknown_toUtf16; 1406 return &(e->normal.enc); 1407 } 1408 1409 /* If this enumeration is changed, getEncodingIndex and encodings 1410 must also be changed. */ 1411 enum { 1412 UNKNOWN_ENC = -1, 1413 ISO_8859_1_ENC = 0, 1414 US_ASCII_ENC, 1415 UTF_8_ENC, 1416 UTF_16_ENC, 1417 UTF_16BE_ENC, 1418 UTF_16LE_ENC, 1419 /* must match encodingNames up to here */ 1420 NO_ENC 1421 }; 1422 1423 static const char KW_ISO_8859_1[] = { 1424 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, 1425 ASCII_MINUS, ASCII_1, '\0' 1426 }; 1427 static const char KW_US_ASCII[] = { 1428 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, 1429 '\0' 1430 }; 1431 static const char KW_UTF_8[] = { 1432 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' 1433 }; 1434 static const char KW_UTF_16[] = { 1435 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' 1436 }; 1437 static const char KW_UTF_16BE[] = { 1438 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, 1439 '\0' 1440 }; 1441 static const char KW_UTF_16LE[] = { 1442 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, 1443 '\0' 1444 }; 1445 1446 static int FASTCALL 1447 getEncodingIndex(const char *name) 1448 { 1449 static const char *encodingNames[] = { 1450 KW_ISO_8859_1, 1451 KW_US_ASCII, 1452 KW_UTF_8, 1453 KW_UTF_16, 1454 KW_UTF_16BE, 1455 KW_UTF_16LE, 1456 }; 1457 int i; 1458 if (name == NULL) 1459 return NO_ENC; 1460 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) 1461 if (streqci(name, encodingNames[i])) 1462 return i; 1463 return UNKNOWN_ENC; 1464 } 1465 1466 /* For binary compatibility, we store the index of the encoding 1467 specified at initialization in the isUtf16 member. 1468 */ 1469 1470 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1471 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1472 1473 /* This is what detects the encoding. encodingTable maps from 1474 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1475 the external (protocol) specified encoding; state is 1476 XML_CONTENT_STATE if we're parsing an external text entity, and 1477 XML_PROLOG_STATE otherwise. 1478 */ 1479 1480 1481 static int FASTCALL 1482 initScan(const ENCODING **encodingTable, 1483 const INIT_ENCODING *enc, 1484 int state, 1485 const char *ptr, 1486 const char *end, 1487 const char **nextTokPtr) 1488 { 1489 const ENCODING **encPtr; 1490 1491 if (ptr == end) 1492 return XML_TOK_NONE; 1493 encPtr = enc->encPtr; 1494 if (ptr + 1 == end) { 1495 /* only a single byte available for auto-detection */ 1496 #ifndef XML_DTD /* FIXME */ 1497 /* a well-formed document entity must have more than one byte */ 1498 if (state != XML_CONTENT_STATE) 1499 return XML_TOK_PARTIAL; 1500 #endif 1501 /* so we're parsing an external text entity... */ 1502 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1503 switch (INIT_ENC_INDEX(enc)) { 1504 case UTF_16_ENC: 1505 case UTF_16LE_ENC: 1506 case UTF_16BE_ENC: 1507 return XML_TOK_PARTIAL; 1508 } 1509 switch ((unsigned char)*ptr) { 1510 case 0xFE: 1511 case 0xFF: 1512 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1513 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1514 && state == XML_CONTENT_STATE) 1515 break; 1516 /* fall through */ 1517 case 0x00: 1518 case 0x3C: 1519 return XML_TOK_PARTIAL; 1520 } 1521 } 1522 else { 1523 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1524 case 0xFEFF: 1525 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1526 && state == XML_CONTENT_STATE) 1527 break; 1528 *nextTokPtr = ptr + 2; 1529 *encPtr = encodingTable[UTF_16BE_ENC]; 1530 return XML_TOK_BOM; 1531 /* 00 3C is handled in the default case */ 1532 case 0x3C00: 1533 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1534 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1535 && state == XML_CONTENT_STATE) 1536 break; 1537 *encPtr = encodingTable[UTF_16LE_ENC]; 1538 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1539 case 0xFFFE: 1540 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1541 && state == XML_CONTENT_STATE) 1542 break; 1543 *nextTokPtr = ptr + 2; 1544 *encPtr = encodingTable[UTF_16LE_ENC]; 1545 return XML_TOK_BOM; 1546 case 0xEFBB: 1547 /* Maybe a UTF-8 BOM (EF BB BF) */ 1548 /* If there's an explicitly specified (external) encoding 1549 of ISO-8859-1 or some flavour of UTF-16 1550 and this is an external text entity, 1551 don't look for the BOM, 1552 because it might be a legal data. 1553 */ 1554 if (state == XML_CONTENT_STATE) { 1555 int e = INIT_ENC_INDEX(enc); 1556 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC 1557 || e == UTF_16LE_ENC || e == UTF_16_ENC) 1558 break; 1559 } 1560 if (ptr + 2 == end) 1561 return XML_TOK_PARTIAL; 1562 if ((unsigned char)ptr[2] == 0xBF) { 1563 *nextTokPtr = ptr + 3; 1564 *encPtr = encodingTable[UTF_8_ENC]; 1565 return XML_TOK_BOM; 1566 } 1567 break; 1568 default: 1569 if (ptr[0] == '\0') { 1570 /* 0 isn't a legal data character. Furthermore a document 1571 entity can only start with ASCII characters. So the only 1572 way this can fail to be big-endian UTF-16 if it it's an 1573 external parsed general entity that's labelled as 1574 UTF-16LE. 1575 */ 1576 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1577 break; 1578 *encPtr = encodingTable[UTF_16BE_ENC]; 1579 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1580 } 1581 else if (ptr[1] == '\0') { 1582 /* We could recover here in the case: 1583 - parsing an external entity 1584 - second byte is 0 1585 - no externally specified encoding 1586 - no encoding declaration 1587 by assuming UTF-16LE. But we don't, because this would mean when 1588 presented just with a single byte, we couldn't reliably determine 1589 whether we needed further bytes. 1590 */ 1591 if (state == XML_CONTENT_STATE) 1592 break; 1593 *encPtr = encodingTable[UTF_16LE_ENC]; 1594 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1595 } 1596 break; 1597 } 1598 } 1599 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1600 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1601 } 1602 1603 1604 #define NS(x) x 1605 #define ns(x) x 1606 #include "xmltok_ns.c" 1607 #undef NS 1608 #undef ns 1609 1610 #ifdef XML_NS 1611 1612 #define NS(x) x ## NS 1613 #define ns(x) x ## _ns 1614 1615 #include "xmltok_ns.c" 1616 1617 #undef NS 1618 #undef ns 1619 1620 ENCODING * 1621 XmlInitUnknownEncodingNS(void *mem, 1622 int *table, 1623 int (*convert)(void *userData, const char *p), 1624 void *userData) 1625 { 1626 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1627 if (enc) 1628 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1629 return enc; 1630 } 1631 1632 #endif /* XML_NS */ 1633