/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000-2017 Expat development team
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include <stddef.h>
#include <string.h> /* memcpy */

#if defined(_MSC_VER) && (_MSC_VER <= 1700)
  /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
# define bool int
# define false 0
# define true 1
#else
# include <stdbool.h>
#endif


#ifdef _WIN32
#include "winconfig.h"
#else
#ifdef HAVE_EXPAT_CONFIG_H
#include <expat_config.h>
#endif
#endif /* ndef _WIN32 */

#include "expat_external.h"
#include "internal.h"
#include "xmltok.h"
#include "nametab.h"

/* Optional extra slot in the scanner table: only present when XML_DTD
   is defined; otherwise the macro expands to nothing.
*/
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Leading part of an ENCODING initializer.  Every entry is produced by
   PREFIX(), so the functions named here depend on how PREFIX is defined
   at the point where the macro is expanded.
*/
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the two converters of the current PREFIX. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)

/* Non-zero when the bit selected by (hi, lo) is set in namingBitmap:
   pages[hi] picks a group of 8 bitmap words, the top 3 bits of lo pick
   the word and the bottom 5 bits of lo pick the bit.
*/
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 use the lookups above,
   any other length yields 0.
*/
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))

/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   NOTE(review): these macros dereference p as "(*p)" and compare bytes
   against values above 0x7F, so p is expected to be a pointer to
   unsigned char; the wrappers below cast accordingly.
*/

#define UTF8_INVALID2(p) \
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*p) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*p) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*p) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))

/* Predicate that rejects everything: always returns 0.  Used below as
   the 4-byte name / name-start test and, under XML_MIN_SIZE, as the
   single-byte isNameMin / isNmstrtMin hooks.
*/
static int PTRFASTCALL
isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
{
  return 0;
}

/* Name-character test for the 2-byte UTF-8 sequence starting at p. */
static int PTRFASTCALL
utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
{
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}

/* Name-character test for the 3-byte UTF-8 sequence starting at p. */
static int PTRFASTCALL
utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
{
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
}

/* No 4-byte sequence is accepted as a name character. */
#define utf8_isName4 isNever

/* Name-start test for the 2-byte UTF-8 sequence starting at p. */
static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
{
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
}

/* Name-start test for the 3-byte UTF-8 sequence starting at p. */
static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
{
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
}

/* No 4-byte sequence is accepted as a name-start character. */
#define utf8_isNmstrt4 isNever

/* Non-zero if the 2-byte sequence at p is not valid UTF-8. */
static int PTRFASTCALL
utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
{
  return UTF8_INVALID2((const unsigned char *)p);
}

/* Non-zero if the 3-byte sequence at p is not valid UTF-8 (this also
   rejects EF,BF,BE and EF,BF,BF, see the comment above the macros).
*/
static int PTRFASTCALL
utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
{
  return UTF8_INVALID3((const unsigned char *)p);
}

/* Non-zero if the 4-byte sequence at p is not valid UTF-8. */
static int PTRFASTCALL
utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
{
  return UTF8_INVALID4((const unsigned char *)p);
}

/* An ENCODING extended with a per-byte type table and with hooks for
   classifying 2, 3 and 4 byte sequences.  "enc" is the first member, so
   a pointer to it can be cast back to struct normal_encoding (see
   AS_NORMAL_ENCODING).  The XML_MIN_SIZE members are hooks through which
   the shared scanner reaches per-encoding accessors at run time.
*/
struct normal_encoding {
  ENCODING enc;
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};

#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))

/* Initializers for the XML_MIN_SIZE-only members of struct
   normal_encoding; E is the name prefix of the accessor functions.
   Note the trailing comma: NORMAL_VTABLE or NULL_VTABLE must follow.
*/
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte hooks, using prefix E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4

/* Same nine slots, all NULL, for the encodings below that supply no
   multi-byte hooks.
*/
#define NULL_VTABLE \
 /* isName2 */ NULL, \
 /* isName3 */ NULL, \
 /* isName4 */ NULL, \
 /* isNmstrt2 */ NULL, \
 /* isNmstrt3 */ NULL, \
 /* isNmstrt4 */ NULL, \
 /* isInvalid2 */ NULL, \
 /* isInvalid3 */ NULL, \
 /* isInvalid4 */ NULL

/* Declared here, ahead of the includes below; defined later in this
   file.
*/
static int FASTCALL checkCharRefNumber(int);

#include "xmltok_impl.h"
#include "ascii.h"

#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Type of the single byte at p, looked up in the encoding's table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif

/* n is pasted onto the member name, so it must be a literal 2, 3 or 4. */
#define IS_NAME_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif

/* Instantiate the shared implementation with the "normal_" prefix,
   using the accessor macros defined above; they are removed again
   right after the include.
*/
#define PREFIX(ident) normal_ ## ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};

/* Adjusts *fromLimRef so that the range [from, *fromLimRef) does not
   end in the middle of a multi-byte UTF-8 character.

   The range is scanned backwards from its end.  "walked" counts the
   bytes stepped over so far; when a lead byte is met it tells whether
   enough bytes follow it.  If they do, the limit is placed right after
   that character; if not, the lead byte is dropped as well and the scan
   continues.  A byte below 0x80 ends the scan.  Bytes matching none of
   the tests (continuation bytes) are just stepped over.

   NOTE(review): only lead/continuation bit patterns are inspected here,
   so the result is presumably only meaningful for input that is
   otherwise well-formed UTF-8 - confirm against callers.
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef)
{
  const char * fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    if ((prev & 0xf8u) == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
      if (walked + 1 >= 4) {
        fromLim += 4 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      if (walked + 1 >= 3) {
        fromLim += 3 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      if (walked + 1 >= 2) {
        fromLim += 2 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }
  }
  *fromLimRef = fromLim;
}

/* UTF-8 to UTF-8 "conversion": copies bytes from *fromP to *toP,
   advancing both, without ever copying part of a character.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the input had to be cut
   because the output buffer is too small, XML_CONVERT_INPUT_INCOMPLETE
   when the (possibly cut) input ended in a partial character, and
   XML_CONVERT_COMPLETED otherwise.
*/
static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING *UNUSED_P(enc),
            const char **fromP, const char *fromLim,
            char **toP, const char *toLim)
{
  bool input_incomplete = false;
  bool output_exhausted = false;

  /* Avoid copying partial characters (due to limited space). */
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
  const ptrdiff_t bytesStorable = toLim - *toP;
  if (bytesAvailable > bytesStorable) {
    fromLim = *fromP + bytesStorable;
    output_exhausted = true;
  }

  /* Avoid copying partial characters (from incomplete input). */
  {
    const char * const fromLimBefore = fromLim;
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
    if (fromLim < fromLimBefore) {
      input_incomplete = true;
    }
  }

  {
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
    memcpy(*toP, *fromP, bytesToCopy);
    *fromP += bytesToCopy;
    *toP += bytesToCopy;
  }

  if (output_exhausted)  /* needs to go first */
    return XML_CONVERT_OUTPUT_EXHAUSTED;
  else if (input_incomplete)
    return XML_CONVERT_INPUT_INCOMPLETE;
  else
    return XML_CONVERT_COMPLETED;
}

/* Converts UTF-8 to UTF-16 code units.  The byte type of the first byte
   of each character (from the encoding's type table) selects the
   decoding.  *fromP and *toP are updated to where conversion stopped.
*/
static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING *enc,
             const char **fromP, const char *fromLim,
             unsigned short **toP, const unsigned short *toLim)
{
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
  unsigned short *to = *toP;
  const char *from = *fromP;
  while (from < fromLim && to < toLim) {
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
    case BT_LEAD2:
      if (fromLim - from < 2) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
      from += 2;
      break;
    case BT_LEAD3:
      if (fromLim - from < 3) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      *to++ = (unsigned short)(((from[0] & 0xf) << 12)
                               | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
      from += 3;
      break;
    case BT_LEAD4:
      {
        /* A 4-byte character becomes a surrogate pair, so two output
           units must be available. */
        unsigned long n;
        if (toLim - to < 2) {
          res = XML_CONVERT_OUTPUT_EXHAUSTED;
          goto after;
        }
        if (fromLim - from < 4) {
          res = XML_CONVERT_INPUT_INCOMPLETE;
          goto after;
        }
        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
            | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
        n -= 0x10000;
        to[0] = (unsigned short)((n >> 10) | 0xD800);
        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
        to += 2;
        from += 4;
      }
      break;
    default:
      *to++ = *from++;
      break;
    }
  }
  /* The loop ended normally: if input is left, output space ran out. */
  if (from < fromLim)
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
after:
  *fromP = from;
  *toP = to;
  return res;
}

/* UTF-8 encoding tables.  The "_ns" variants include asciitab.h as is;
   the plain variants include it with BT_COLON defined as BT_NMSTRT.
   NOTE(review): the three trailing values of the ENCODING initializer
   are presumably minBytesPerChar, isUtf8 and isUtf16 - confirm against
   the ENCODING declaration in xmltok.h.
*/
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

/* Latin-1 to UTF-8: bytes below 0x80 are copied, bytes with the high
   bit set are written as a 2-byte sequence.  Stops with
   XML_CONVERT_OUTPUT_EXHAUSTED as soon as the next character does not
   fit; *fromP and *toP are advanced in place.
*/
static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING *UNUSED_P(enc),
              const char **fromP, const char *fromLim,
              char **toP, const char *toLim)
{
  for (;;) {
    unsigned char c;
    if (*fromP == fromLim)
      return XML_CONVERT_COMPLETED;
    c = (unsigned char)**fromP;
    if (c & 0x80) {
      if (toLim - *toP < 2)
        return XML_CONVERT_OUTPUT_EXHAUSTED;
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
      (*fromP)++;
    }
    else {
      if (*toP == toLim)
        return XML_CONVERT_OUTPUT_EXHAUSTED;
      *(*toP)++ = *(*fromP)++;
    }
  }
}

/* Latin-1 to UTF-16: every input byte becomes one output unit with the
   same value.  Also used as the UTF-16 converter of the ASCII tables
   below.
*/
static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING *UNUSED_P(enc),
               const char **fromP, const char *fromLim,
               unsigned short **toP, const unsigned short *toLim)
{
  while (*fromP < fromLim && *toP < toLim)
    *(*toP)++ = (unsigned char)*(*fromP)++;

  if ((*toP == toLim) && (*fromP < fromLim))
    return XML_CONVERT_OUTPUT_EXHAUSTED;
  else
    return XML_CONVERT_COMPLETED;
}

#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif

static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

/* ASCII to UTF-8: a plain byte copy limited by both buffers. */
static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING *UNUSED_P(enc),
             const char **fromP, const char *fromLim,
             char **toP, const char *toLim)
{
  while (*fromP < fromLim && *toP < toLim)
    *(*toP)++ = *(*fromP)++;

  if ((*toP == toLim) && (*fromP < fromLim))
    return XML_CONVERT_OUTPUT_EXHAUSTED;
  else
    return XML_CONVERT_COMPLETED;
}

#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif

static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

/* Byte type of a 16-bit unit given as high and low byte; the 2-byte
   macros below only call this when the high byte is non-zero.
   High bytes 0xD8-0xDB give BT_LEAD4, 0xDC-0xDF give BT_TRAIL, the units
   0xFFFE and 0xFFFF give BT_NONXML, everything else BT_NONASCII.
*/
static int PTRFASTCALL
unicode_byte_type(char hi, char lo)
{
  switch ((unsigned char)hi) {
  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
    return BT_LEAD4;
  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
    return BT_TRAIL;
  case 0xFF:
    switch ((unsigned char)lo) {
    case 0xFF:
    case 0xFE:
      return BT_NONXML;
    }
    break;
  }
  return BT_NONASCII;
}

/* Defines E##toUtf8, a UTF-16 to UTF-8 converter.  Byte order comes
   from the GET_LO / GET_HI macros in effect where this macro is
   expanded.  The input length is first rounded down to an even number
   of bytes.  The high byte of each unit selects the output length:
   0 with lo < 0x80 gives 1 byte, 0x00-0x07 otherwise 2 bytes,
   0xD8-0xDB (first half of a surrogate pair, consumes two units)
   4 bytes, anything else 3 bytes.  *fromP is only moved past
   characters that were converted completely.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf8(const ENCODING *UNUSED_P(enc), \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  fromLim = from + (((fromLim - from) >> 1) << 1);  /* shrink to even */ \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return XML_CONVERT_INPUT_INCOMPLETE; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
  if (from < fromLim) \
    return XML_CONVERT_INPUT_INCOMPLETE; \
  else \
    return XML_CONVERT_COMPLETED; \
}

/* Defines E##toUtf16, which copies 16-bit units into unsigned short
   values, assembling each from GET_HI / GET_LO.  The input length is
   rounded down to an even number of bytes.  If the output buffer cannot
   hold all of it and the last input unit has a high byte in 0xD8-0xDF,
   that unit is left unconverted and XML_CONVERT_INPUT_INCOMPLETE is
   reported (unless the output then runs out first).
*/
#define DEFINE_UTF16_TO_UTF16(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf16(const ENCODING *UNUSED_P(enc), \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);  /* shrink to even */ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
    fromLim -= 2; \
    res = XML_CONVERT_INPUT_INCOMPLETE; \
  } \
  for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  if ((*toP == toLim) && (*fromP < fromLim)) \
    return XML_CONVERT_OUTPUT_EXHAUSTED; \
  else \
    return res; \
}

/* Little-endian accessors: low byte first. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian accessors: high byte first. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Accessors for 2-byte little-endian input: p[0] is the low byte and
   p[1] the high byte.  Units with a zero high byte use the encoding's
   byte type table, all others go through unicode_byte_type().
*/
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#ifdef XML_MIN_SIZE

/* With XML_MIN_SIZE the accessors are real functions stored in the
   encoding (see STANDARD_VTABLE) instead of a second instantiation of
   xmltok_impl.c.
*/
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate xmltok_impl.c a second time, with the "little2_" prefix
   and the 2-byte little-endian accessors.
*/
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */

/* Little-endian 2-byte encoding tables.  The last value of the ENCODING
   initializer is 1 only when BYTEORDER is 1234.
   NOTE(review): that value is presumably the isUtf16 flag, i.e. "input
   already has the host's UTF-16 layout" - confirm against xmltok.h.
*/
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif

static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

/* The "internal" little-endian tables are left out when BYTEORDER is
   4321.
*/
#if BYTEORDER != 4321

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif


/* Accessors for 2-byte big-endian input: p[0] is the high byte and
   p[1] the low byte.  Otherwise the same as the LITTLE2_ macros.
*/
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#ifdef XML_MIN_SIZE

/* Function forms of the big-endian accessors, stored in the encoding
   through STANDARD_VTABLE(big2_).
*/
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Third instantiation of xmltok_impl.c, with the "big2_" prefix and the
   2-byte big-endian accessors.
*/
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */

/* Big-endian 2-byte encoding tables.  The last value of the ENCODING
   initializer is 1 only when BYTEORDER is 4321.
*/
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif

static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

/* The "internal" big-endian tables are left out when BYTEORDER is
   1234.
*/
#if BYTEORDER != 1234

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif

static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif
1063 1064 #undef PREFIX 1065 1066 static int FASTCALL 1067 streqci(const char *s1, const char *s2) 1068 { 1069 for (;;) { 1070 char c1 = *s1++; 1071 char c2 = *s2++; 1072 if (ASCII_a <= c1 && c1 <= ASCII_z) 1073 c1 += ASCII_A - ASCII_a; 1074 if (ASCII_a <= c2 && c2 <= ASCII_z) 1075 /* The following line will never get executed. streqci() is 1076 * only called from two places, both of which guarantee to put 1077 * upper-case strings into s2. 1078 */ 1079 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */ 1080 if (c1 != c2) 1081 return 0; 1082 if (!c1) 1083 break; 1084 } 1085 return 1; 1086 } 1087 1088 static void PTRCALL 1089 initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr, 1090 const char *end, POSITION *pos) 1091 { 1092 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 1093 } 1094 1095 static int 1096 toAscii(const ENCODING *enc, const char *ptr, const char *end) 1097 { 1098 char buf[1]; 1099 char *p = buf; 1100 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1101 if (p == buf) 1102 return -1; 1103 else 1104 return buf[0]; 1105 } 1106 1107 static int FASTCALL 1108 isSpace(int c) 1109 { 1110 switch (c) { 1111 case 0x20: 1112 case 0xD: 1113 case 0xA: 1114 case 0x9: 1115 return 1; 1116 } 1117 return 0; 1118 } 1119 1120 /* Return 1 if there's just optional white space or there's an S 1121 followed by name=val. 
1122 */ 1123 static int 1124 parsePseudoAttribute(const ENCODING *enc, 1125 const char *ptr, 1126 const char *end, 1127 const char **namePtr, 1128 const char **nameEndPtr, 1129 const char **valPtr, 1130 const char **nextTokPtr) 1131 { 1132 int c; 1133 char open; 1134 if (ptr == end) { 1135 *namePtr = NULL; 1136 return 1; 1137 } 1138 if (!isSpace(toAscii(enc, ptr, end))) { 1139 *nextTokPtr = ptr; 1140 return 0; 1141 } 1142 do { 1143 ptr += enc->minBytesPerChar; 1144 } while (isSpace(toAscii(enc, ptr, end))); 1145 if (ptr == end) { 1146 *namePtr = NULL; 1147 return 1; 1148 } 1149 *namePtr = ptr; 1150 for (;;) { 1151 c = toAscii(enc, ptr, end); 1152 if (c == -1) { 1153 *nextTokPtr = ptr; 1154 return 0; 1155 } 1156 if (c == ASCII_EQUALS) { 1157 *nameEndPtr = ptr; 1158 break; 1159 } 1160 if (isSpace(c)) { 1161 *nameEndPtr = ptr; 1162 do { 1163 ptr += enc->minBytesPerChar; 1164 } while (isSpace(c = toAscii(enc, ptr, end))); 1165 if (c != ASCII_EQUALS) { 1166 *nextTokPtr = ptr; 1167 return 0; 1168 } 1169 break; 1170 } 1171 ptr += enc->minBytesPerChar; 1172 } 1173 if (ptr == *namePtr) { 1174 *nextTokPtr = ptr; 1175 return 0; 1176 } 1177 ptr += enc->minBytesPerChar; 1178 c = toAscii(enc, ptr, end); 1179 while (isSpace(c)) { 1180 ptr += enc->minBytesPerChar; 1181 c = toAscii(enc, ptr, end); 1182 } 1183 if (c != ASCII_QUOT && c != ASCII_APOS) { 1184 *nextTokPtr = ptr; 1185 return 0; 1186 } 1187 open = (char)c; 1188 ptr += enc->minBytesPerChar; 1189 *valPtr = ptr; 1190 for (;; ptr += enc->minBytesPerChar) { 1191 c = toAscii(enc, ptr, end); 1192 if (c == open) 1193 break; 1194 if (!(ASCII_a <= c && c <= ASCII_z) 1195 && !(ASCII_A <= c && c <= ASCII_Z) 1196 && !(ASCII_0 <= c && c <= ASCII_9) 1197 && c != ASCII_PERIOD 1198 && c != ASCII_MINUS 1199 && c != ASCII_UNDERSCORE) { 1200 *nextTokPtr = ptr; 1201 return 0; 1202 } 1203 } 1204 *nextTokPtr = ptr + enc->minBytesPerChar; 1205 return 1; 1206 } 1207 1208 static const char KW_version[] = { 1209 ASCII_v, ASCII_e, ASCII_r, ASCII_s, 
ASCII_i, ASCII_o, ASCII_n, '\0' 1210 }; 1211 1212 static const char KW_encoding[] = { 1213 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' 1214 }; 1215 1216 static const char KW_standalone[] = { 1217 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, 1218 ASCII_n, ASCII_e, '\0' 1219 }; 1220 1221 static const char KW_yes[] = { 1222 ASCII_y, ASCII_e, ASCII_s, '\0' 1223 }; 1224 1225 static const char KW_no[] = { 1226 ASCII_n, ASCII_o, '\0' 1227 }; 1228 1229 static int 1230 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, 1231 const char *, 1232 const char *), 1233 int isGeneralTextEntity, 1234 const ENCODING *enc, 1235 const char *ptr, 1236 const char *end, 1237 const char **badPtr, 1238 const char **versionPtr, 1239 const char **versionEndPtr, 1240 const char **encodingName, 1241 const ENCODING **encoding, 1242 int *standalone) 1243 { 1244 const char *val = NULL; 1245 const char *name = NULL; 1246 const char *nameEnd = NULL; 1247 ptr += 5 * enc->minBytesPerChar; 1248 end -= 2 * enc->minBytesPerChar; 1249 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1250 || !name) { 1251 *badPtr = ptr; 1252 return 0; 1253 } 1254 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1255 if (!isGeneralTextEntity) { 1256 *badPtr = name; 1257 return 0; 1258 } 1259 } 1260 else { 1261 if (versionPtr) 1262 *versionPtr = val; 1263 if (versionEndPtr) 1264 *versionEndPtr = ptr; 1265 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1266 *badPtr = ptr; 1267 return 0; 1268 } 1269 if (!name) { 1270 if (isGeneralTextEntity) { 1271 /* a TextDecl must have an EncodingDecl */ 1272 *badPtr = ptr; 1273 return 0; 1274 } 1275 return 1; 1276 } 1277 } 1278 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1279 int c = toAscii(enc, val, end); 1280 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { 1281 *badPtr = val; 1282 return 0; 1283 } 1284 if 
(encodingName) 1285 *encodingName = val; 1286 if (encoding) 1287 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1288 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1289 *badPtr = ptr; 1290 return 0; 1291 } 1292 if (!name) 1293 return 1; 1294 } 1295 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1296 || isGeneralTextEntity) { 1297 *badPtr = name; 1298 return 0; 1299 } 1300 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1301 if (standalone) 1302 *standalone = 1; 1303 } 1304 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1305 if (standalone) 1306 *standalone = 0; 1307 } 1308 else { 1309 *badPtr = val; 1310 return 0; 1311 } 1312 while (isSpace(toAscii(enc, ptr, end))) 1313 ptr += enc->minBytesPerChar; 1314 if (ptr != end) { 1315 *badPtr = ptr; 1316 return 0; 1317 } 1318 return 1; 1319 } 1320 1321 static int FASTCALL 1322 checkCharRefNumber(int result) 1323 { 1324 switch (result >> 8) { 1325 case 0xD8: case 0xD9: case 0xDA: case 0xDB: 1326 case 0xDC: case 0xDD: case 0xDE: case 0xDF: 1327 return -1; 1328 case 0: 1329 if (latin1_encoding.type[result] == BT_NONXML) 1330 return -1; 1331 break; 1332 case 0xFF: 1333 if (result == 0xFFFE || result == 0xFFFF) 1334 return -1; 1335 break; 1336 } 1337 return result; 1338 } 1339 1340 int FASTCALL 1341 XmlUtf8Encode(int c, char *buf) 1342 { 1343 enum { 1344 /* minN is minimum legal resulting value for N byte sequence */ 1345 min2 = 0x80, 1346 min3 = 0x800, 1347 min4 = 0x10000 1348 }; 1349 1350 if (c < 0) 1351 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */ 1352 if (c < min2) { 1353 buf[0] = (char)(c | UTF8_cval1); 1354 return 1; 1355 } 1356 if (c < min3) { 1357 buf[0] = (char)((c >> 6) | UTF8_cval2); 1358 buf[1] = (char)((c & 0x3f) | 0x80); 1359 return 2; 1360 } 1361 if (c < min4) { 1362 buf[0] = (char)((c >> 12) | UTF8_cval3); 1363 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1364 buf[2] = 
(char)((c & 0x3f) | 0x80); 1365 return 3; 1366 } 1367 if (c < 0x110000) { 1368 buf[0] = (char)((c >> 18) | UTF8_cval4); 1369 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1370 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1371 buf[3] = (char)((c & 0x3f) | 0x80); 1372 return 4; 1373 } 1374 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */ 1375 } 1376 1377 int FASTCALL 1378 XmlUtf16Encode(int charNum, unsigned short *buf) 1379 { 1380 if (charNum < 0) 1381 return 0; 1382 if (charNum < 0x10000) { 1383 buf[0] = (unsigned short)charNum; 1384 return 1; 1385 } 1386 if (charNum < 0x110000) { 1387 charNum -= 0x10000; 1388 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1389 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1390 return 2; 1391 } 1392 return 0; 1393 } 1394 1395 struct unknown_encoding { 1396 struct normal_encoding normal; 1397 CONVERTER convert; 1398 void *userData; 1399 unsigned short utf16[256]; 1400 char utf8[256][4]; 1401 }; 1402 1403 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) 1404 1405 int 1406 XmlSizeOfUnknownEncoding(void) 1407 { 1408 return sizeof(struct unknown_encoding); 1409 } 1410 1411 static int PTRFASTCALL 1412 unknown_isName(const ENCODING *enc, const char *p) 1413 { 1414 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1415 int c = uenc->convert(uenc->userData, p); 1416 if (c & ~0xFFFF) 1417 return 0; 1418 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1419 } 1420 1421 static int PTRFASTCALL 1422 unknown_isNmstrt(const ENCODING *enc, const char *p) 1423 { 1424 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1425 int c = uenc->convert(uenc->userData, p); 1426 if (c & ~0xFFFF) 1427 return 0; 1428 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1429 } 1430 1431 static int PTRFASTCALL 1432 unknown_isInvalid(const ENCODING *enc, const char *p) 1433 { 1434 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1435 int c = 
uenc->convert(uenc->userData, p); 1436 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1437 } 1438 1439 static enum XML_Convert_Result PTRCALL 1440 unknown_toUtf8(const ENCODING *enc, 1441 const char **fromP, const char *fromLim, 1442 char **toP, const char *toLim) 1443 { 1444 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1445 char buf[XML_UTF8_ENCODE_MAX]; 1446 for (;;) { 1447 const char *utf8; 1448 int n; 1449 if (*fromP == fromLim) 1450 return XML_CONVERT_COMPLETED; 1451 utf8 = uenc->utf8[(unsigned char)**fromP]; 1452 n = *utf8++; 1453 if (n == 0) { 1454 int c = uenc->convert(uenc->userData, *fromP); 1455 n = XmlUtf8Encode(c, buf); 1456 if (n > toLim - *toP) 1457 return XML_CONVERT_OUTPUT_EXHAUSTED; 1458 utf8 = buf; 1459 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1460 - (BT_LEAD2 - 2)); 1461 } 1462 else { 1463 if (n > toLim - *toP) 1464 return XML_CONVERT_OUTPUT_EXHAUSTED; 1465 (*fromP)++; 1466 } 1467 memcpy(*toP, utf8, n); 1468 *toP += n; 1469 } 1470 } 1471 1472 static enum XML_Convert_Result PTRCALL 1473 unknown_toUtf16(const ENCODING *enc, 1474 const char **fromP, const char *fromLim, 1475 unsigned short **toP, const unsigned short *toLim) 1476 { 1477 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1478 while (*fromP < fromLim && *toP < toLim) { 1479 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1480 if (c == 0) { 1481 c = (unsigned short) 1482 uenc->convert(uenc->userData, *fromP); 1483 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1484 - (BT_LEAD2 - 2)); 1485 } 1486 else 1487 (*fromP)++; 1488 *(*toP)++ = c; 1489 } 1490 1491 if ((*toP == toLim) && (*fromP < fromLim)) 1492 return XML_CONVERT_OUTPUT_EXHAUSTED; 1493 else 1494 return XML_CONVERT_COMPLETED; 1495 } 1496 1497 ENCODING * 1498 XmlInitUnknownEncoding(void *mem, 1499 int *table, 1500 CONVERTER convert, 1501 void *userData) 1502 { 1503 int i; 1504 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1505 for 
(i = 0; i < (int)sizeof(struct normal_encoding); i++) 1506 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; 1507 for (i = 0; i < 128; i++) 1508 if (latin1_encoding.type[i] != BT_OTHER 1509 && latin1_encoding.type[i] != BT_NONXML 1510 && table[i] != i) 1511 return 0; 1512 for (i = 0; i < 256; i++) { 1513 int c = table[i]; 1514 if (c == -1) { 1515 e->normal.type[i] = BT_MALFORM; 1516 /* This shouldn't really get used. */ 1517 e->utf16[i] = 0xFFFF; 1518 e->utf8[i][0] = 1; 1519 e->utf8[i][1] = 0; 1520 } 1521 else if (c < 0) { 1522 if (c < -4) 1523 return 0; 1524 /* Multi-byte sequences need a converter function */ 1525 if (!convert) 1526 return 0; 1527 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1528 e->utf8[i][0] = 0; 1529 e->utf16[i] = 0; 1530 } 1531 else if (c < 0x80) { 1532 if (latin1_encoding.type[c] != BT_OTHER 1533 && latin1_encoding.type[c] != BT_NONXML 1534 && c != i) 1535 return 0; 1536 e->normal.type[i] = latin1_encoding.type[c]; 1537 e->utf8[i][0] = 1; 1538 e->utf8[i][1] = (char)c; 1539 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1540 } 1541 else if (checkCharRefNumber(c) < 0) { 1542 e->normal.type[i] = BT_NONXML; 1543 /* This shouldn't really get used. 
*/ 1544 e->utf16[i] = 0xFFFF; 1545 e->utf8[i][0] = 1; 1546 e->utf8[i][1] = 0; 1547 } 1548 else { 1549 if (c > 0xFFFF) 1550 return 0; 1551 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1552 e->normal.type[i] = BT_NMSTRT; 1553 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1554 e->normal.type[i] = BT_NAME; 1555 else 1556 e->normal.type[i] = BT_OTHER; 1557 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1558 e->utf16[i] = (unsigned short)c; 1559 } 1560 } 1561 e->userData = userData; 1562 e->convert = convert; 1563 if (convert) { 1564 e->normal.isName2 = unknown_isName; 1565 e->normal.isName3 = unknown_isName; 1566 e->normal.isName4 = unknown_isName; 1567 e->normal.isNmstrt2 = unknown_isNmstrt; 1568 e->normal.isNmstrt3 = unknown_isNmstrt; 1569 e->normal.isNmstrt4 = unknown_isNmstrt; 1570 e->normal.isInvalid2 = unknown_isInvalid; 1571 e->normal.isInvalid3 = unknown_isInvalid; 1572 e->normal.isInvalid4 = unknown_isInvalid; 1573 } 1574 e->normal.enc.utf8Convert = unknown_toUtf8; 1575 e->normal.enc.utf16Convert = unknown_toUtf16; 1576 return &(e->normal.enc); 1577 } 1578 1579 /* If this enumeration is changed, getEncodingIndex and encodings 1580 must also be changed. 
*/ 1581 enum { 1582 UNKNOWN_ENC = -1, 1583 ISO_8859_1_ENC = 0, 1584 US_ASCII_ENC, 1585 UTF_8_ENC, 1586 UTF_16_ENC, 1587 UTF_16BE_ENC, 1588 UTF_16LE_ENC, 1589 /* must match encodingNames up to here */ 1590 NO_ENC 1591 }; 1592 1593 static const char KW_ISO_8859_1[] = { 1594 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, 1595 ASCII_MINUS, ASCII_1, '\0' 1596 }; 1597 static const char KW_US_ASCII[] = { 1598 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, 1599 '\0' 1600 }; 1601 static const char KW_UTF_8[] = { 1602 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' 1603 }; 1604 static const char KW_UTF_16[] = { 1605 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' 1606 }; 1607 static const char KW_UTF_16BE[] = { 1608 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, 1609 '\0' 1610 }; 1611 static const char KW_UTF_16LE[] = { 1612 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, 1613 '\0' 1614 }; 1615 1616 static int FASTCALL 1617 getEncodingIndex(const char *name) 1618 { 1619 static const char * const encodingNames[] = { 1620 KW_ISO_8859_1, 1621 KW_US_ASCII, 1622 KW_UTF_8, 1623 KW_UTF_16, 1624 KW_UTF_16BE, 1625 KW_UTF_16LE, 1626 }; 1627 int i; 1628 if (name == NULL) 1629 return NO_ENC; 1630 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) 1631 if (streqci(name, encodingNames[i])) 1632 return i; 1633 return UNKNOWN_ENC; 1634 } 1635 1636 /* For binary compatibility, we store the index of the encoding 1637 specified at initialization in the isUtf16 member. 1638 */ 1639 1640 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1641 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1642 1643 /* This is what detects the encoding. 
encodingTable maps from 1644 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1645 the external (protocol) specified encoding; state is 1646 XML_CONTENT_STATE if we're parsing an external text entity, and 1647 XML_PROLOG_STATE otherwise. 1648 */ 1649 1650 1651 static int 1652 initScan(const ENCODING * const *encodingTable, 1653 const INIT_ENCODING *enc, 1654 int state, 1655 const char *ptr, 1656 const char *end, 1657 const char **nextTokPtr) 1658 { 1659 const ENCODING **encPtr; 1660 1661 if (ptr >= end) 1662 return XML_TOK_NONE; 1663 encPtr = enc->encPtr; 1664 if (ptr + 1 == end) { 1665 /* only a single byte available for auto-detection */ 1666 #ifndef XML_DTD /* FIXME */ 1667 /* a well-formed document entity must have more than one byte */ 1668 if (state != XML_CONTENT_STATE) 1669 return XML_TOK_PARTIAL; 1670 #endif 1671 /* so we're parsing an external text entity... */ 1672 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1673 switch (INIT_ENC_INDEX(enc)) { 1674 case UTF_16_ENC: 1675 case UTF_16LE_ENC: 1676 case UTF_16BE_ENC: 1677 return XML_TOK_PARTIAL; 1678 } 1679 switch ((unsigned char)*ptr) { 1680 case 0xFE: 1681 case 0xFF: 1682 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1683 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1684 && state == XML_CONTENT_STATE) 1685 break; 1686 /* fall through */ 1687 case 0x00: 1688 case 0x3C: 1689 return XML_TOK_PARTIAL; 1690 } 1691 } 1692 else { 1693 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1694 case 0xFEFF: 1695 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1696 && state == XML_CONTENT_STATE) 1697 break; 1698 *nextTokPtr = ptr + 2; 1699 *encPtr = encodingTable[UTF_16BE_ENC]; 1700 return XML_TOK_BOM; 1701 /* 00 3C is handled in the default case */ 1702 case 0x3C00: 1703 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1704 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1705 && state == XML_CONTENT_STATE) 1706 break; 1707 *encPtr = encodingTable[UTF_16LE_ENC]; 1708 return 
XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1709 case 0xFFFE: 1710 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC 1711 && state == XML_CONTENT_STATE) 1712 break; 1713 *nextTokPtr = ptr + 2; 1714 *encPtr = encodingTable[UTF_16LE_ENC]; 1715 return XML_TOK_BOM; 1716 case 0xEFBB: 1717 /* Maybe a UTF-8 BOM (EF BB BF) */ 1718 /* If there's an explicitly specified (external) encoding 1719 of ISO-8859-1 or some flavour of UTF-16 1720 and this is an external text entity, 1721 don't look for the BOM, 1722 because it might be a legal data. 1723 */ 1724 if (state == XML_CONTENT_STATE) { 1725 int e = INIT_ENC_INDEX(enc); 1726 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC 1727 || e == UTF_16LE_ENC || e == UTF_16_ENC) 1728 break; 1729 } 1730 if (ptr + 2 == end) 1731 return XML_TOK_PARTIAL; 1732 if ((unsigned char)ptr[2] == 0xBF) { 1733 *nextTokPtr = ptr + 3; 1734 *encPtr = encodingTable[UTF_8_ENC]; 1735 return XML_TOK_BOM; 1736 } 1737 break; 1738 default: 1739 if (ptr[0] == '\0') { 1740 /* 0 isn't a legal data character. Furthermore a document 1741 entity can only start with ASCII characters. So the only 1742 way this can fail to be big-endian UTF-16 if it it's an 1743 external parsed general entity that's labelled as 1744 UTF-16LE. 1745 */ 1746 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1747 break; 1748 *encPtr = encodingTable[UTF_16BE_ENC]; 1749 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1750 } 1751 else if (ptr[1] == '\0') { 1752 /* We could recover here in the case: 1753 - parsing an external entity 1754 - second byte is 0 1755 - no externally specified encoding 1756 - no encoding declaration 1757 by assuming UTF-16LE. But we don't, because this would mean when 1758 presented just with a single byte, we couldn't reliably determine 1759 whether we needed further bytes. 
1760 */ 1761 if (state == XML_CONTENT_STATE) 1762 break; 1763 *encPtr = encodingTable[UTF_16LE_ENC]; 1764 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1765 } 1766 break; 1767 } 1768 } 1769 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1770 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1771 } 1772 1773 1774 #define NS(x) x 1775 #define ns(x) x 1776 #define XML_TOK_NS_C 1777 #include "xmltok_ns.c" 1778 #undef XML_TOK_NS_C 1779 #undef NS 1780 #undef ns 1781 1782 #ifdef XML_NS 1783 1784 #define NS(x) x ## NS 1785 #define ns(x) x ## _ns 1786 1787 #define XML_TOK_NS_C 1788 #include "xmltok_ns.c" 1789 #undef XML_TOK_NS_C 1790 1791 #undef NS 1792 #undef ns 1793 1794 ENCODING * 1795 XmlInitUnknownEncodingNS(void *mem, 1796 int *table, 1797 CONVERTER convert, 1798 void *userData) 1799 { 1800 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1801 if (enc) 1802 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1803 return enc; 1804 } 1805 1806 #endif /* XML_NS */ 1807