1 /* 2 __ __ _ 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parser 8 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000-2017 Expat development team 11 Licensed under the MIT license: 12 13 Permission is hereby granted, free of charge, to any person obtaining 14 a copy of this software and associated documentation files (the 15 "Software"), to deal in the Software without restriction, including 16 without limitation the rights to use, copy, modify, merge, publish, 17 distribute, sublicense, and/or sell copies of the Software, and to permit 18 persons to whom the Software is furnished to do so, subject to the 19 following conditions: 20 21 The above copyright notice and this permission notice shall be included 22 in all copies or substantial portions of the Software. 23 24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 30 USE OR OTHER DEALINGS IN THE SOFTWARE. 31 */ 32 33 #include <stddef.h> 34 #include <string.h> /* memcpy */ 35 36 #if defined(_MSC_VER) && (_MSC_VER <= 1700) 37 /* for vs2012/11.0/1700 and earlier Visual Studio compilers */ 38 # define bool int 39 # define false 0 40 # define true 1 41 #else 42 # include <stdbool.h> 43 #endif 44 45 #ifdef _WIN32 46 # include "winconfig.h" 47 #else 48 # ifdef HAVE_EXPAT_CONFIG_H 49 # include <expat_config.h> 50 # endif 51 #endif /* ndef _WIN32 */ 52 53 #include "expat_external.h" 54 #include "internal.h" 55 #include "xmltok.h" 56 #include "nametab.h" 57 58 #ifdef XML_DTD 59 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 60 #else 61 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 62 #endif 63 64 #define VTABLE1 \ 65 {PREFIX(prologTok), PREFIX(contentTok), \ 66 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \ 67 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \ 68 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \ 69 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \ 70 PREFIX(updatePosition), PREFIX(isPublicId) 71 72 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 73 74 #define UCS2_GET_NAMING(pages, hi, lo) \ 75 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F))) 76 77 /* A 2 byte UTF-8 representation splits the characters 11 bits between 78 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 79 pages, 3 bits to add to that index and 5 bits to generate the mask. 80 */ 81 #define UTF8_GET_NAMING2(pages, byte) \ 82 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 83 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \ 84 & (1u << (((byte)[1]) & 0x1F))) 85 86 /* A 3 byte UTF-8 representation splits the characters 16 bits between 87 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 88 into pages, 3 bits to add to that index and 5 bits to generate the 89 mask. 90 */ 91 #define UTF8_GET_NAMING3(pages, byte) \ 92 (namingBitmap \ 93 [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \ 94 << 3) \ 95 + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \ 96 & (1u << (((byte)[2]) & 0x1F))) 97 98 #define UTF8_GET_NAMING(pages, p, n) \ 99 ((n) == 2 \ 100 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 101 : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0)) 102 103 /* Detection of invalid UTF-8 sequences is based on Table 3.1B 104 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 105 with the additional restriction of not allowing the Unicode 106 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 107 Implementation details: 108 (A & 0x80) == 0 means A < 0x80 109 and 110 (A & 0xC0) == 0xC0 means A > 0xBF 111 */ 112 113 #define UTF8_INVALID2(p) \ 114 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 115 116 #define UTF8_INVALID3(p) \ 117 (((p)[2] & 0x80) == 0 \ 118 || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \ 119 : ((p)[2] & 0xC0) == 0xC0) \ 120 || ((*p) == 0xE0 \ 121 ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 122 : ((p)[1] & 0x80) == 0 \ 123 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 124 125 #define UTF8_INVALID4(p) \ 126 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \ 127 || ((p)[2] & 0xC0) == 0xC0 \ 128 || ((*p) == 0xF0 \ 129 ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 130 : ((p)[1] & 0x80) == 0 \ 131 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 132 133 static int PTRFASTCALL 134 isNever(const ENCODING *enc, const char *p) { 135 UNUSED_P(enc); 136 UNUSED_P(p); 137 return 0; 138 } 139 140 static int PTRFASTCALL 141 utf8_isName2(const ENCODING *enc, const char *p) { 142 UNUSED_P(enc); 143 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 144 } 145 146 static int PTRFASTCALL 147 utf8_isName3(const ENCODING *enc, const char *p) { 148 UNUSED_P(enc); 149 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 150 } 151 152 #define utf8_isName4 isNever 153 154 static int PTRFASTCALL 155 utf8_isNmstrt2(const ENCODING *enc, const char *p) { 156 UNUSED_P(enc); 157 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 158 } 159 160 static int PTRFASTCALL 161 utf8_isNmstrt3(const ENCODING *enc, const char *p) { 162 UNUSED_P(enc); 163 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 164 } 165 166 #define utf8_isNmstrt4 isNever 167 168 static int PTRFASTCALL 169 utf8_isInvalid2(const ENCODING *enc, const char *p) { 170 UNUSED_P(enc); 171 return UTF8_INVALID2((const unsigned char *)p); 172 } 173 174 static int PTRFASTCALL 175 utf8_isInvalid3(const ENCODING *enc, const char *p) { 176 UNUSED_P(enc); 177 return UTF8_INVALID3((const unsigned char *)p); 178 } 179 180 static int PTRFASTCALL 181 utf8_isInvalid4(const ENCODING *enc, const char *p) { 182 UNUSED_P(enc); 183 return UTF8_INVALID4((const unsigned char *)p); 184 } 185 186 struct normal_encoding { 187 ENCODING enc; 188 unsigned char type[256]; 189 #ifdef XML_MIN_SIZE 190 int(PTRFASTCALL *byteType)(const ENCODING *, const char *); 191 int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 192 int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 193 int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 194 int(PTRCALL *charMatches)(const ENCODING *, const char *, int); 195 #endif /* XML_MIN_SIZE */ 196 int(PTRFASTCALL *isName2)(const ENCODING *, const char *); 197 int(PTRFASTCALL *isName3)(const ENCODING *, const char *); 198 int(PTRFASTCALL *isName4)(const ENCODING *, const char *); 199 int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 200 int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 201 int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 202 int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 203 int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 204 int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 205 }; 206 207 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc)) 208 209 #ifdef XML_MIN_SIZE 210 211 # define STANDARD_VTABLE(E) \ 212 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches, 213 214 #else 215 216 # define STANDARD_VTABLE(E) /* as nothing */ 217 218 #endif 219 220 #define NORMAL_VTABLE(E) \ 221 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \ 222 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4 223 224 #define NULL_VTABLE \ 225 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \ 226 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \ 227 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL 228 229 static int FASTCALL checkCharRefNumber(int); 230 231 #include "xmltok_impl.h" 232 #include "ascii.h" 233 234 #ifdef XML_MIN_SIZE 235 # define sb_isNameMin isNever 236 # define sb_isNmstrtMin isNever 237 #endif 238 239 #ifdef XML_MIN_SIZE 240 # define MINBPC(enc) ((enc)->minBytesPerChar) 241 #else 242 /* minimum bytes per character */ 243 # define MINBPC(enc) 1 244 #endif 245 246 #define SB_BYTE_TYPE(enc, p) \ 247 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 248 249 #ifdef XML_MIN_SIZE 250 static int PTRFASTCALL 251 sb_byteType(const ENCODING *enc, const char *p) { 252 return SB_BYTE_TYPE(enc, p); 253 } 254 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 255 #else 256 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 257 #endif 258 259 #ifdef XML_MIN_SIZE 260 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 261 static int PTRFASTCALL 262 sb_byteToAscii(const ENCODING *enc, const char *p) { 263 UNUSED_P(enc); 264 return *p; 265 } 266 #else 267 # define BYTE_TO_ASCII(enc, p) (*(p)) 268 #endif 269 270 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p)) 271 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p)) 272 #define IS_INVALID_CHAR(enc, p, n) \ 273 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p)) 274 275 #ifdef XML_MIN_SIZE 276 # define IS_NAME_CHAR_MINBPC(enc, p) \ 277 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 278 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 279 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 280 #else 281 # define IS_NAME_CHAR_MINBPC(enc, p) (0) 282 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 283 #endif 284 285 #ifdef XML_MIN_SIZE 286 # define CHAR_MATCHES(enc, p, c) \ 287 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 288 static int PTRCALL 289 sb_charMatches(const ENCODING *enc, const char *p, int c) { 290 UNUSED_P(enc); 291 return *p == c; 292 } 293 #else 294 /* c is an ASCII character */ 295 # define CHAR_MATCHES(enc, p, c) (*(p) == c) 296 #endif 297 298 #define PREFIX(ident) normal_##ident 299 #define XML_TOK_IMPL_C 300 #include "xmltok_impl.c" 301 #undef XML_TOK_IMPL_C 302 303 #undef MINBPC 304 #undef BYTE_TYPE 305 #undef BYTE_TO_ASCII 306 #undef CHAR_MATCHES 307 #undef IS_NAME_CHAR 308 #undef IS_NAME_CHAR_MINBPC 309 #undef IS_NMSTRT_CHAR 310 #undef IS_NMSTRT_CHAR_MINBPC 311 #undef IS_INVALID_CHAR 312 313 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 314 UTF8_cval1 = 0x00, 315 UTF8_cval2 = 0xc0, 316 UTF8_cval3 = 0xe0, 317 UTF8_cval4 = 0xf0 318 }; 319 320 void 321 _INTERNAL_trim_to_complete_utf8_characters(const char *from, 322 const char **fromLimRef) { 323 const char *fromLim = *fromLimRef; 324 size_t walked = 0; 325 for (; fromLim > from; fromLim--, walked++) { 326 const unsigned char prev = (unsigned char)fromLim[-1]; 327 if ((prev & 0xf8u) 328 == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */ 329 if (walked + 1 >= 4) { 330 fromLim += 4 - 1; 331 break; 332 } else { 333 walked = 0; 334 } 335 } else if ((prev & 0xf0u) 336 == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */ 337 if (walked + 1 >= 3) { 338 fromLim += 3 - 1; 339 break; 340 } else { 341 walked = 0; 342 } 343 } else if ((prev & 0xe0u) 344 == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */ 345 if (walked + 1 >= 2) { 346 fromLim += 2 - 1; 347 break; 348 } else { 349 walked = 0; 350 } 351 } else if ((prev & 0x80u) 352 == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */ 353 break; 354 } 355 } 356 *fromLimRef = fromLim; 357 } 358 359 static enum XML_Convert_Result PTRCALL 360 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 361 char **toP, const char *toLim) { 362 bool input_incomplete = false; 363 bool output_exhausted = false; 364 365 /* Avoid copying partial characters (due to limited space). */ 366 const ptrdiff_t bytesAvailable = fromLim - *fromP; 367 const ptrdiff_t bytesStorable = toLim - *toP; 368 UNUSED_P(enc); 369 if (bytesAvailable > bytesStorable) { 370 fromLim = *fromP + bytesStorable; 371 output_exhausted = true; 372 } 373 374 /* Avoid copying partial characters (from incomplete input). */ 375 { 376 const char *const fromLimBefore = fromLim; 377 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim); 378 if (fromLim < fromLimBefore) { 379 input_incomplete = true; 380 } 381 } 382 383 { 384 const ptrdiff_t bytesToCopy = fromLim - *fromP; 385 memcpy(*toP, *fromP, bytesToCopy); 386 *fromP += bytesToCopy; 387 *toP += bytesToCopy; 388 } 389 390 if (output_exhausted) /* needs to go first */ 391 return XML_CONVERT_OUTPUT_EXHAUSTED; 392 else if (input_incomplete) 393 return XML_CONVERT_INPUT_INCOMPLETE; 394 else 395 return XML_CONVERT_COMPLETED; 396 } 397 398 static enum XML_Convert_Result PTRCALL 399 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 400 unsigned short **toP, const unsigned short *toLim) { 401 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 402 unsigned short *to = *toP; 403 const char *from = *fromP; 404 while (from < fromLim && to < toLim) { 405 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 406 case BT_LEAD2: 407 if (fromLim - from < 2) { 408 res = XML_CONVERT_INPUT_INCOMPLETE; 409 goto after; 410 } 411 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 412 from += 2; 413 break; 414 case BT_LEAD3: 415 if (fromLim - from < 3) { 416 res = XML_CONVERT_INPUT_INCOMPLETE; 417 goto after; 418 } 419 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) 420 | (from[2] & 0x3f)); 421 from += 3; 422 break; 423 case BT_LEAD4: { 424 unsigned long n; 425 if (toLim - to < 2) { 426 res = XML_CONVERT_OUTPUT_EXHAUSTED; 427 goto after; 428 } 429 if (fromLim - from < 4) { 430 res = XML_CONVERT_INPUT_INCOMPLETE; 431 goto after; 432 } 433 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 434 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 435 n -= 0x10000; 436 to[0] = (unsigned short)((n >> 10) | 0xD800); 437 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 438 to += 2; 439 from += 4; 440 } break; 441 default: 442 *to++ = *from++; 443 break; 444 } 445 } 446 if (from < fromLim) 447 res = XML_CONVERT_OUTPUT_EXHAUSTED; 448 after: 449 *fromP = from; 450 *toP = to; 451 return res; 452 } 453 454 #ifdef XML_NS 455 static const struct normal_encoding utf8_encoding_ns 456 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 457 { 458 # include "asciitab.h" 459 # include "utf8tab.h" 460 }, 461 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 462 #endif 463 464 static const struct normal_encoding utf8_encoding 465 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 466 { 467 #define BT_COLON BT_NMSTRT 468 #include "asciitab.h" 469 #undef BT_COLON 470 #include "utf8tab.h" 471 }, 472 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 473 474 #ifdef XML_NS 475 476 static const struct normal_encoding internal_utf8_encoding_ns 477 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 478 { 479 # include "iasciitab.h" 480 # include "utf8tab.h" 481 }, 482 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 483 484 #endif 485 486 static const struct normal_encoding internal_utf8_encoding 487 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 488 { 489 #define BT_COLON BT_NMSTRT 490 #include "iasciitab.h" 491 #undef BT_COLON 492 #include "utf8tab.h" 493 }, 494 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 495 496 static enum XML_Convert_Result PTRCALL 497 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 498 char **toP, const char *toLim) { 499 UNUSED_P(enc); 500 for (;;) { 501 unsigned char c; 502 if (*fromP == fromLim) 503 return XML_CONVERT_COMPLETED; 504 c = (unsigned char)**fromP; 505 if (c & 0x80) { 506 if (toLim - *toP < 2) 507 return XML_CONVERT_OUTPUT_EXHAUSTED; 508 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 509 *(*toP)++ = (char)((c & 0x3f) | 0x80); 510 (*fromP)++; 511 } else { 512 if (*toP == toLim) 513 return XML_CONVERT_OUTPUT_EXHAUSTED; 514 *(*toP)++ = *(*fromP)++; 515 } 516 } 517 } 518 519 static enum XML_Convert_Result PTRCALL 520 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 521 unsigned short **toP, const unsigned short *toLim) { 522 UNUSED_P(enc); 523 while (*fromP < fromLim && *toP < toLim) 524 *(*toP)++ = (unsigned char)*(*fromP)++; 525 526 if ((*toP == toLim) && (*fromP < fromLim)) 527 return XML_CONVERT_OUTPUT_EXHAUSTED; 528 else 529 return XML_CONVERT_COMPLETED; 530 } 531 532 #ifdef XML_NS 533 534 static const struct normal_encoding latin1_encoding_ns 535 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 536 { 537 # include "asciitab.h" 538 # include "latin1tab.h" 539 }, 540 STANDARD_VTABLE(sb_) NULL_VTABLE}; 541 542 #endif 543 544 static const struct normal_encoding latin1_encoding 545 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 546 { 547 #define BT_COLON BT_NMSTRT 548 #include "asciitab.h" 549 #undef BT_COLON 550 #include "latin1tab.h" 551 }, 552 STANDARD_VTABLE(sb_) NULL_VTABLE}; 553 554 static enum XML_Convert_Result PTRCALL 555 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 556 char **toP, const char *toLim) { 557 UNUSED_P(enc); 558 while (*fromP < fromLim && *toP < toLim) 559 *(*toP)++ = *(*fromP)++; 560 561 if ((*toP == toLim) && (*fromP < fromLim)) 562 return XML_CONVERT_OUTPUT_EXHAUSTED; 563 else 564 return XML_CONVERT_COMPLETED; 565 } 566 567 #ifdef XML_NS 568 569 static const struct normal_encoding ascii_encoding_ns 570 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 571 { 572 # include "asciitab.h" 573 /* BT_NONXML == 0 */ 574 }, 575 STANDARD_VTABLE(sb_) NULL_VTABLE}; 576 577 #endif 578 579 static const struct normal_encoding ascii_encoding 580 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 581 { 582 #define BT_COLON BT_NMSTRT 583 #include "asciitab.h" 584 #undef BT_COLON 585 /* BT_NONXML == 0 */ 586 }, 587 STANDARD_VTABLE(sb_) NULL_VTABLE}; 588 589 static int PTRFASTCALL 590 unicode_byte_type(char hi, char lo) { 591 switch ((unsigned char)hi) { 592 /* 0xD800–0xDBFF first 16-bit code unit or high surrogate (W1) */ 593 case 0xD8: 594 case 0xD9: 595 case 0xDA: 596 case 0xDB: 597 return BT_LEAD4; 598 /* 0xDC00–0xDFFF second 16-bit code unit or low surrogate (W2) */ 599 case 0xDC: 600 case 0xDD: 601 case 0xDE: 602 case 0xDF: 603 return BT_TRAIL; 604 case 0xFF: 605 switch ((unsigned char)lo) { 606 case 0xFF: /* noncharacter-FFFF */ 607 case 0xFE: /* noncharacter-FFFE */ 608 return BT_NONXML; 609 } 610 break; 611 } 612 return BT_NONASCII; 613 } 614 615 #define DEFINE_UTF16_TO_UTF8(E) \ 616 static enum XML_Convert_Result PTRCALL E##toUtf8( \ 617 const ENCODING *enc, const char **fromP, const char *fromLim, \ 618 char **toP, const char *toLim) { \ 619 const char *from = *fromP; \ 620 UNUSED_P(enc); \ 621 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ 622 for (; from < fromLim; from += 2) { \ 623 int plane; \ 624 unsigned char lo2; \ 625 unsigned char lo = GET_LO(from); \ 626 unsigned char hi = GET_HI(from); \ 627 switch (hi) { \ 628 case 0: \ 629 if (lo < 0x80) { \ 630 if (*toP == toLim) { \ 631 *fromP = from; \ 632 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 633 } \ 634 *(*toP)++ = lo; \ 635 break; \ 636 } \ 637 /* fall through */ \ 638 case 0x1: \ 639 case 0x2: \ 640 case 0x3: \ 641 case 0x4: \ 642 case 0x5: \ 643 case 0x6: \ 644 case 0x7: \ 645 if (toLim - *toP < 2) { \ 646 *fromP = from; \ 647 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 648 } \ 649 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 650 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 651 break; \ 652 default: \ 653 if (toLim - *toP < 3) { \ 654 *fromP = from; \ 655 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 656 } \ 657 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 658 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 659 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 660 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 661 break; \ 662 case 0xD8: \ 663 case 0xD9: \ 664 case 0xDA: \ 665 case 0xDB: \ 666 if (toLim - *toP < 4) { \ 667 *fromP = from; \ 668 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 669 } \ 670 if (fromLim - from < 4) { \ 671 *fromP = from; \ 672 return XML_CONVERT_INPUT_INCOMPLETE; \ 673 } \ 674 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 675 *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \ 676 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 677 from += 2; \ 678 lo2 = GET_LO(from); \ 679 *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \ 680 | (lo2 >> 6) | 0x80); \ 681 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 682 break; \ 683 } \ 684 } \ 685 *fromP = from; \ 686 if (from < fromLim) \ 687 return XML_CONVERT_INPUT_INCOMPLETE; \ 688 else \ 689 return XML_CONVERT_COMPLETED; \ 690 } 691 692 #define DEFINE_UTF16_TO_UTF16(E) \ 693 static enum XML_Convert_Result PTRCALL E##toUtf16( \ 694 const ENCODING *enc, const char **fromP, const char *fromLim, \ 695 unsigned short **toP, const unsigned short *toLim) { \ 696 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ 697 UNUSED_P(enc); \ 698 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ 699 /* Avoid copying first half only of surrogate */ \ 700 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 701 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ 702 fromLim -= 2; \ 703 res = XML_CONVERT_INPUT_INCOMPLETE; \ 704 } \ 705 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ 706 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 707 if ((*toP == toLim) && (*fromP < fromLim)) \ 708 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 709 else \ 710 return res; \ 711 } 712 713 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8))) 714 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 715 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 716 717 DEFINE_UTF16_TO_UTF8(little2_) 718 DEFINE_UTF16_TO_UTF16(little2_) 719 720 #undef SET2 721 #undef GET_LO 722 #undef GET_HI 723 724 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF))) 725 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 726 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 727 728 DEFINE_UTF16_TO_UTF8(big2_) 729 DEFINE_UTF16_TO_UTF16(big2_) 730 731 #undef SET2 732 #undef GET_LO 733 #undef GET_HI 734 735 #define LITTLE2_BYTE_TYPE(enc, p) \ 736 ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 737 : unicode_byte_type((p)[1], (p)[0])) 738 #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1) 739 #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == c) 740 #define LITTLE2_IS_NAME_CHAR_MINBPC(p) \ 741 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 742 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \ 743 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 744 745 #ifdef XML_MIN_SIZE 746 747 static int PTRFASTCALL 748 little2_byteType(const ENCODING *enc, const char *p) { 749 return LITTLE2_BYTE_TYPE(enc, p); 750 } 751 752 static int PTRFASTCALL 753 little2_byteToAscii(const ENCODING *enc, const char *p) { 754 UNUSED_P(enc); 755 return LITTLE2_BYTE_TO_ASCII(p); 756 } 757 758 static int PTRCALL 759 little2_charMatches(const ENCODING *enc, const char *p, int c) { 760 UNUSED_P(enc); 761 return LITTLE2_CHAR_MATCHES(p, c); 762 } 763 764 static int PTRFASTCALL 765 little2_isNameMin(const ENCODING *enc, const char *p) { 766 UNUSED_P(enc); 767 return LITTLE2_IS_NAME_CHAR_MINBPC(p); 768 } 769 770 static int PTRFASTCALL 771 little2_isNmstrtMin(const ENCODING *enc, const char *p) { 772 UNUSED_P(enc); 773 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p); 774 } 775 776 # undef VTABLE 777 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 778 779 #else /* not XML_MIN_SIZE */ 780 781 # undef PREFIX 782 # define PREFIX(ident) little2_##ident 783 # define MINBPC(enc) 2 784 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 785 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 786 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p) 787 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c) 788 # define IS_NAME_CHAR(enc, p, n) 0 789 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p) 790 # define IS_NMSTRT_CHAR(enc, p, n) (0) 791 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) 792 793 # define XML_TOK_IMPL_C 794 # include "xmltok_impl.c" 795 # undef XML_TOK_IMPL_C 796 797 # undef MINBPC 798 # undef BYTE_TYPE 799 # undef BYTE_TO_ASCII 800 # undef CHAR_MATCHES 801 # undef IS_NAME_CHAR 802 # undef IS_NAME_CHAR_MINBPC 803 # undef IS_NMSTRT_CHAR 804 # undef IS_NMSTRT_CHAR_MINBPC 805 # undef IS_INVALID_CHAR 806 807 #endif /* not XML_MIN_SIZE */ 808 809 #ifdef XML_NS 810 811 static const struct normal_encoding little2_encoding_ns 812 = {{VTABLE, 2, 0, 813 # if BYTEORDER == 1234 814 1 815 # else 816 0 817 # endif 818 }, 819 { 820 # include "asciitab.h" 821 # include "latin1tab.h" 822 }, 823 STANDARD_VTABLE(little2_) NULL_VTABLE}; 824 825 #endif 826 827 static const struct normal_encoding little2_encoding 828 = {{VTABLE, 2, 0, 829 #if BYTEORDER == 1234 830 1 831 #else 832 0 833 #endif 834 }, 835 { 836 #define BT_COLON BT_NMSTRT 837 #include "asciitab.h" 838 #undef BT_COLON 839 #include "latin1tab.h" 840 }, 841 STANDARD_VTABLE(little2_) NULL_VTABLE}; 842 843 #if BYTEORDER != 4321 844 845 # ifdef XML_NS 846 847 static const struct normal_encoding internal_little2_encoding_ns 848 = {{VTABLE, 2, 0, 1}, 849 { 850 # include "iasciitab.h" 851 # include "latin1tab.h" 852 }, 853 STANDARD_VTABLE(little2_) NULL_VTABLE}; 854 855 # endif 856 857 static const struct normal_encoding internal_little2_encoding 858 = {{VTABLE, 2, 0, 1}, 859 { 860 # define BT_COLON BT_NMSTRT 861 # include "iasciitab.h" 862 # undef BT_COLON 863 # include "latin1tab.h" 864 }, 865 STANDARD_VTABLE(little2_) NULL_VTABLE}; 866 867 #endif 868 869 #define BIG2_BYTE_TYPE(enc, p) \ 870 ((p)[0] == 0 \ 871 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 872 : unicode_byte_type((p)[0], (p)[1])) 873 #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1) 874 #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == c) 875 #define BIG2_IS_NAME_CHAR_MINBPC(p) \ 876 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 877 #define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \ 878 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 879 880 #ifdef XML_MIN_SIZE 881 882 static int PTRFASTCALL 883 big2_byteType(const ENCODING *enc, const char *p) { 884 return BIG2_BYTE_TYPE(enc, p); 885 } 886 887 static int PTRFASTCALL 888 big2_byteToAscii(const ENCODING *enc, const char *p) { 889 UNUSED_P(enc); 890 return BIG2_BYTE_TO_ASCII(p); 891 } 892 893 static int PTRCALL 894 big2_charMatches(const ENCODING *enc, const char *p, int c) { 895 UNUSED_P(enc); 896 return BIG2_CHAR_MATCHES(p, c); 897 } 898 899 static int PTRFASTCALL 900 big2_isNameMin(const ENCODING *enc, const char *p) { 901 UNUSED_P(enc); 902 return BIG2_IS_NAME_CHAR_MINBPC(p); 903 } 904 905 static int PTRFASTCALL 906 big2_isNmstrtMin(const ENCODING *enc, const char *p) { 907 UNUSED_P(enc); 908 return BIG2_IS_NMSTRT_CHAR_MINBPC(p); 909 } 910 911 # undef VTABLE 912 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 913 914 #else /* not XML_MIN_SIZE */ 915 916 # undef PREFIX 917 # define PREFIX(ident) big2_##ident 918 # define MINBPC(enc) 2 919 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 920 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 921 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p) 922 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c) 923 # define IS_NAME_CHAR(enc, p, n) 0 924 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p) 925 # define IS_NMSTRT_CHAR(enc, p, n) (0) 926 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p) 927 928 # define XML_TOK_IMPL_C 929 # include "xmltok_impl.c" 930 # undef XML_TOK_IMPL_C 931 932 # undef MINBPC 933 # undef BYTE_TYPE 934 # undef BYTE_TO_ASCII 935 # undef CHAR_MATCHES 936 # undef IS_NAME_CHAR 937 # undef IS_NAME_CHAR_MINBPC 938 # undef IS_NMSTRT_CHAR 939 # undef IS_NMSTRT_CHAR_MINBPC 940 # undef IS_INVALID_CHAR 941 942 #endif /* not XML_MIN_SIZE */ 943 944 #ifdef XML_NS 945 946 static const struct normal_encoding big2_encoding_ns 947 = {{VTABLE, 2, 0, 948 # if BYTEORDER == 4321 949 1 950 # else 951 0 952 # endif 953 }, 954 { 955 # include "asciitab.h" 956 # include "latin1tab.h" 957 }, 958 STANDARD_VTABLE(big2_) NULL_VTABLE}; 959 960 #endif 961 962 static const struct normal_encoding big2_encoding 963 = {{VTABLE, 2, 0, 964 #if BYTEORDER == 4321 965 1 966 #else 967 0 968 #endif 969 }, 970 { 971 #define BT_COLON BT_NMSTRT 972 #include "asciitab.h" 973 #undef BT_COLON 974 #include "latin1tab.h" 975 }, 976 STANDARD_VTABLE(big2_) NULL_VTABLE}; 977 978 #if BYTEORDER != 1234 979 980 # ifdef XML_NS 981 982 static const struct normal_encoding internal_big2_encoding_ns 983 = {{VTABLE, 2, 0, 1}, 984 { 985 # include "iasciitab.h" 986 # include "latin1tab.h" 987 }, 988 STANDARD_VTABLE(big2_) NULL_VTABLE}; 989 990 # endif 991 992 static const struct normal_encoding internal_big2_encoding 993 = {{VTABLE, 2, 0, 1}, 994 { 995 # define BT_COLON BT_NMSTRT 996 # include "iasciitab.h" 997 # undef BT_COLON 998 # include "latin1tab.h" 999 }, 1000 STANDARD_VTABLE(big2_) NULL_VTABLE}; 1001 1002 #endif 1003 1004 #undef PREFIX 1005 1006 static int FASTCALL 1007 streqci(const char *s1, const char *s2) { 1008 for (;;) { 1009 char c1 = *s1++; 1010 char c2 = *s2++; 1011 if (ASCII_a <= c1 && c1 <= ASCII_z) 1012 c1 += ASCII_A - ASCII_a; 1013 if (ASCII_a <= c2 && c2 <= ASCII_z) 1014 /* The following line will never get executed. streqci() is 1015 * only called from two places, both of which guarantee to put 1016 * upper-case strings into s2. 1017 */ 1018 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */ 1019 if (c1 != c2) 1020 return 0; 1021 if (! c1) 1022 break; 1023 } 1024 return 1; 1025 } 1026 1027 static void PTRCALL 1028 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end, 1029 POSITION *pos) { 1030 UNUSED_P(enc); 1031 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 1032 } 1033 1034 static int 1035 toAscii(const ENCODING *enc, const char *ptr, const char *end) { 1036 char buf[1]; 1037 char *p = buf; 1038 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1039 if (p == buf) 1040 return -1; 1041 else 1042 return buf[0]; 1043 } 1044 1045 static int FASTCALL 1046 isSpace(int c) { 1047 switch (c) { 1048 case 0x20: 1049 case 0xD: 1050 case 0xA: 1051 case 0x9: 1052 return 1; 1053 } 1054 return 0; 1055 } 1056 1057 /* Return 1 if there's just optional white space or there's an S 1058 followed by name=val. 1059 */ 1060 static int 1061 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end, 1062 const char **namePtr, const char **nameEndPtr, 1063 const char **valPtr, const char **nextTokPtr) { 1064 int c; 1065 char open; 1066 if (ptr == end) { 1067 *namePtr = NULL; 1068 return 1; 1069 } 1070 if (! isSpace(toAscii(enc, ptr, end))) { 1071 *nextTokPtr = ptr; 1072 return 0; 1073 } 1074 do { 1075 ptr += enc->minBytesPerChar; 1076 } while (isSpace(toAscii(enc, ptr, end))); 1077 if (ptr == end) { 1078 *namePtr = NULL; 1079 return 1; 1080 } 1081 *namePtr = ptr; 1082 for (;;) { 1083 c = toAscii(enc, ptr, end); 1084 if (c == -1) { 1085 *nextTokPtr = ptr; 1086 return 0; 1087 } 1088 if (c == ASCII_EQUALS) { 1089 *nameEndPtr = ptr; 1090 break; 1091 } 1092 if (isSpace(c)) { 1093 *nameEndPtr = ptr; 1094 do { 1095 ptr += enc->minBytesPerChar; 1096 } while (isSpace(c = toAscii(enc, ptr, end))); 1097 if (c != ASCII_EQUALS) { 1098 *nextTokPtr = ptr; 1099 return 0; 1100 } 1101 break; 1102 } 1103 ptr += enc->minBytesPerChar; 1104 } 1105 if (ptr == *namePtr) { 1106 *nextTokPtr = ptr; 1107 return 0; 1108 } 1109 ptr += enc->minBytesPerChar; 1110 c = toAscii(enc, ptr, end); 1111 while (isSpace(c)) { 1112 ptr += enc->minBytesPerChar; 1113 c = toAscii(enc, ptr, end); 1114 } 1115 if (c != ASCII_QUOT && c != ASCII_APOS) { 1116 *nextTokPtr = ptr; 1117 return 0; 1118 } 1119 open = (char)c; 1120 ptr += enc->minBytesPerChar; 1121 *valPtr = ptr; 1122 for (;; ptr += enc->minBytesPerChar) { 1123 c = toAscii(enc, ptr, end); 1124 if (c == open) 1125 break; 1126 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z) 1127 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD 1128 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) { 1129 *nextTokPtr = ptr; 1130 return 0; 1131 } 1132 } 1133 *nextTokPtr = ptr + enc->minBytesPerChar; 1134 return 1; 1135 } 1136 1137 static const char KW_version[] 1138 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'}; 1139 1140 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, 1141 ASCII_i, ASCII_n, ASCII_g, '\0'}; 1142 1143 static const char KW_standalone[] 1144 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, 1145 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'}; 1146 1147 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'}; 1148 1149 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'}; 1150 1151 static int 1152 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *, 1153 const char *), 1154 int isGeneralTextEntity, const ENCODING *enc, const char *ptr, 1155 const char *end, const char **badPtr, const char **versionPtr, 1156 const char **versionEndPtr, const char **encodingName, 1157 const ENCODING **encoding, int *standalone) { 1158 const char *val = NULL; 1159 const char *name = NULL; 1160 const char *nameEnd = NULL; 1161 ptr += 5 * enc->minBytesPerChar; 1162 end -= 2 * enc->minBytesPerChar; 1163 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1164 || ! name) { 1165 *badPtr = ptr; 1166 return 0; 1167 } 1168 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1169 if (! isGeneralTextEntity) { 1170 *badPtr = name; 1171 return 0; 1172 } 1173 } else { 1174 if (versionPtr) 1175 *versionPtr = val; 1176 if (versionEndPtr) 1177 *versionEndPtr = ptr; 1178 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1179 *badPtr = ptr; 1180 return 0; 1181 } 1182 if (! name) { 1183 if (isGeneralTextEntity) { 1184 /* a TextDecl must have an EncodingDecl */ 1185 *badPtr = ptr; 1186 return 0; 1187 } 1188 return 1; 1189 } 1190 } 1191 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1192 int c = toAscii(enc, val, end); 1193 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) { 1194 *badPtr = val; 1195 return 0; 1196 } 1197 if (encodingName) 1198 *encodingName = val; 1199 if (encoding) 1200 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1201 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1202 *badPtr = ptr; 1203 return 0; 1204 } 1205 if (! name) 1206 return 1; 1207 } 1208 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1209 || isGeneralTextEntity) { 1210 *badPtr = name; 1211 return 0; 1212 } 1213 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1214 if (standalone) 1215 *standalone = 1; 1216 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1217 if (standalone) 1218 *standalone = 0; 1219 } else { 1220 *badPtr = val; 1221 return 0; 1222 } 1223 while (isSpace(toAscii(enc, ptr, end))) 1224 ptr += enc->minBytesPerChar; 1225 if (ptr != end) { 1226 *badPtr = ptr; 1227 return 0; 1228 } 1229 return 1; 1230 } 1231 1232 static int FASTCALL 1233 checkCharRefNumber(int result) { 1234 switch (result >> 8) { 1235 case 0xD8: 1236 case 0xD9: 1237 case 0xDA: 1238 case 0xDB: 1239 case 0xDC: 1240 case 0xDD: 1241 case 0xDE: 1242 case 0xDF: 1243 return -1; 1244 case 0: 1245 if (latin1_encoding.type[result] == BT_NONXML) 1246 return -1; 1247 break; 1248 case 0xFF: 1249 if (result == 0xFFFE || result == 0xFFFF) 1250 return -1; 1251 break; 1252 } 1253 return result; 1254 } 1255 1256 int FASTCALL 1257 XmlUtf8Encode(int c, char *buf) { 1258 enum { 1259 /* minN is minimum legal resulting value for N byte sequence */ 1260 min2 = 0x80, 1261 min3 = 0x800, 1262 min4 = 0x10000 1263 }; 1264 1265 if (c < 0) 1266 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */ 1267 if (c < min2) { 1268 buf[0] = (char)(c | UTF8_cval1); 1269 return 1; 1270 } 1271 if (c < min3) { 1272 buf[0] = (char)((c >> 6) | UTF8_cval2); 1273 buf[1] = (char)((c & 0x3f) | 0x80); 1274 return 2; 1275 } 1276 if (c < min4) { 1277 buf[0] = (char)((c >> 12) | UTF8_cval3); 1278 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1279 buf[2] = (char)((c & 0x3f) | 0x80); 1280 return 3; 1281 } 1282 if (c < 0x110000) { 1283 buf[0] = (char)((c >> 18) | UTF8_cval4); 1284 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1285 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1286 buf[3] = (char)((c & 0x3f) | 0x80); 1287 return 4; 1288 } 1289 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */ 1290 } 1291 1292 int FASTCALL 1293 XmlUtf16Encode(int charNum, unsigned short *buf) { 1294 if (charNum < 0) 1295 return 0; 1296 if (charNum < 0x10000) { 1297 buf[0] = (unsigned short)charNum; 1298 return 1; 1299 } 1300 if (charNum < 0x110000) { 1301 charNum -= 0x10000; 1302 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1303 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1304 return 2; 1305 } 1306 return 0; 1307 } 1308 1309 struct unknown_encoding { 1310 struct normal_encoding normal; 1311 CONVERTER convert; 1312 void *userData; 1313 unsigned short utf16[256]; 1314 char utf8[256][4]; 1315 }; 1316 1317 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc)) 1318 1319 int 1320 XmlSizeOfUnknownEncoding(void) { 1321 return sizeof(struct unknown_encoding); 1322 } 1323 1324 static int PTRFASTCALL 1325 unknown_isName(const ENCODING *enc, const char *p) { 1326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1327 int c = uenc->convert(uenc->userData, p); 1328 if (c & ~0xFFFF) 1329 return 0; 1330 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1331 } 1332 1333 static int PTRFASTCALL 1334 unknown_isNmstrt(const ENCODING *enc, const char *p) { 1335 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1336 int c = uenc->convert(uenc->userData, p); 1337 if (c & ~0xFFFF) 1338 return 0; 1339 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1340 } 1341 1342 static int PTRFASTCALL 1343 unknown_isInvalid(const ENCODING *enc, const char *p) { 1344 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1345 int c = uenc->convert(uenc->userData, p); 1346 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1347 } 1348 1349 static enum XML_Convert_Result PTRCALL 1350 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 1351 char **toP, const char *toLim) { 1352 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1353 char buf[XML_UTF8_ENCODE_MAX]; 1354 for (;;) { 1355 const char *utf8; 1356 int n; 1357 if (*fromP == fromLim) 1358 return XML_CONVERT_COMPLETED; 1359 utf8 = uenc->utf8[(unsigned char)**fromP]; 1360 n = *utf8++; 1361 if (n == 0) { 1362 int c = uenc->convert(uenc->userData, *fromP); 1363 n = XmlUtf8Encode(c, buf); 1364 if (n > toLim - *toP) 1365 return XML_CONVERT_OUTPUT_EXHAUSTED; 1366 utf8 = buf; 1367 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1368 - (BT_LEAD2 - 2)); 1369 } else { 1370 if (n > toLim - *toP) 1371 return XML_CONVERT_OUTPUT_EXHAUSTED; 1372 (*fromP)++; 1373 } 1374 memcpy(*toP, utf8, n); 1375 *toP += n; 1376 } 1377 } 1378 1379 static enum XML_Convert_Result PTRCALL 1380 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 1381 unsigned short **toP, const unsigned short *toLim) { 1382 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1383 while (*fromP < fromLim && *toP < toLim) { 1384 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1385 if (c == 0) { 1386 c = (unsigned short)uenc->convert(uenc->userData, *fromP); 1387 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1388 - (BT_LEAD2 - 2)); 1389 } else 1390 (*fromP)++; 1391 *(*toP)++ = c; 1392 } 1393 1394 if ((*toP == toLim) && (*fromP < fromLim)) 1395 return XML_CONVERT_OUTPUT_EXHAUSTED; 1396 else 1397 return XML_CONVERT_COMPLETED; 1398 } 1399 1400 ENCODING * 1401 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, 1402 void *userData) { 1403 int i; 1404 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1405 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding)); 1406 for (i = 0; i < 128; i++) 1407 if (latin1_encoding.type[i] != BT_OTHER 1408 && latin1_encoding.type[i] != BT_NONXML && table[i] != i) 1409 return 0; 1410 for (i = 0; i < 256; i++) { 1411 int c = table[i]; 1412 if (c == -1) { 1413 e->normal.type[i] = BT_MALFORM; 1414 /* This shouldn't really get used. */ 1415 e->utf16[i] = 0xFFFF; 1416 e->utf8[i][0] = 1; 1417 e->utf8[i][1] = 0; 1418 } else if (c < 0) { 1419 if (c < -4) 1420 return 0; 1421 /* Multi-byte sequences need a converter function */ 1422 if (! convert) 1423 return 0; 1424 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1425 e->utf8[i][0] = 0; 1426 e->utf16[i] = 0; 1427 } else if (c < 0x80) { 1428 if (latin1_encoding.type[c] != BT_OTHER 1429 && latin1_encoding.type[c] != BT_NONXML && c != i) 1430 return 0; 1431 e->normal.type[i] = latin1_encoding.type[c]; 1432 e->utf8[i][0] = 1; 1433 e->utf8[i][1] = (char)c; 1434 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1435 } else if (checkCharRefNumber(c) < 0) { 1436 e->normal.type[i] = BT_NONXML; 1437 /* This shouldn't really get used. */ 1438 e->utf16[i] = 0xFFFF; 1439 e->utf8[i][0] = 1; 1440 e->utf8[i][1] = 0; 1441 } else { 1442 if (c > 0xFFFF) 1443 return 0; 1444 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1445 e->normal.type[i] = BT_NMSTRT; 1446 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1447 e->normal.type[i] = BT_NAME; 1448 else 1449 e->normal.type[i] = BT_OTHER; 1450 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1451 e->utf16[i] = (unsigned short)c; 1452 } 1453 } 1454 e->userData = userData; 1455 e->convert = convert; 1456 if (convert) { 1457 e->normal.isName2 = unknown_isName; 1458 e->normal.isName3 = unknown_isName; 1459 e->normal.isName4 = unknown_isName; 1460 e->normal.isNmstrt2 = unknown_isNmstrt; 1461 e->normal.isNmstrt3 = unknown_isNmstrt; 1462 e->normal.isNmstrt4 = unknown_isNmstrt; 1463 e->normal.isInvalid2 = unknown_isInvalid; 1464 e->normal.isInvalid3 = unknown_isInvalid; 1465 e->normal.isInvalid4 = unknown_isInvalid; 1466 } 1467 e->normal.enc.utf8Convert = unknown_toUtf8; 1468 e->normal.enc.utf16Convert = unknown_toUtf16; 1469 return &(e->normal.enc); 1470 } 1471 1472 /* If this enumeration is changed, getEncodingIndex and encodings 1473 must also be changed. */ 1474 enum { 1475 UNKNOWN_ENC = -1, 1476 ISO_8859_1_ENC = 0, 1477 US_ASCII_ENC, 1478 UTF_8_ENC, 1479 UTF_16_ENC, 1480 UTF_16BE_ENC, 1481 UTF_16LE_ENC, 1482 /* must match encodingNames up to here */ 1483 NO_ENC 1484 }; 1485 1486 static const char KW_ISO_8859_1[] 1487 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, 1488 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'}; 1489 static const char KW_US_ASCII[] 1490 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, 1491 ASCII_C, ASCII_I, ASCII_I, '\0'}; 1492 static const char KW_UTF_8[] 1493 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'}; 1494 static const char KW_UTF_16[] 1495 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'}; 1496 static const char KW_UTF_16BE[] 1497 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1498 ASCII_6, ASCII_B, ASCII_E, '\0'}; 1499 static const char KW_UTF_16LE[] 1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1501 ASCII_6, ASCII_L, ASCII_E, '\0'}; 1502 1503 static int FASTCALL 1504 getEncodingIndex(const char *name) { 1505 static const char *const encodingNames[] = { 1506 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE, 1507 }; 1508 int i; 1509 if (name == NULL) 1510 return NO_ENC; 1511 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++) 1512 if (streqci(name, encodingNames[i])) 1513 return i; 1514 return UNKNOWN_ENC; 1515 } 1516 1517 /* For binary compatibility, we store the index of the encoding 1518 specified at initialization in the isUtf16 member. 1519 */ 1520 1521 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1522 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1523 1524 /* This is what detects the encoding. encodingTable maps from 1525 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1526 the external (protocol) specified encoding; state is 1527 XML_CONTENT_STATE if we're parsing an external text entity, and 1528 XML_PROLOG_STATE otherwise. 1529 */ 1530 1531 static int 1532 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc, 1533 int state, const char *ptr, const char *end, const char **nextTokPtr) { 1534 const ENCODING **encPtr; 1535 1536 if (ptr >= end) 1537 return XML_TOK_NONE; 1538 encPtr = enc->encPtr; 1539 if (ptr + 1 == end) { 1540 /* only a single byte available for auto-detection */ 1541 #ifndef XML_DTD /* FIXME */ 1542 /* a well-formed document entity must have more than one byte */ 1543 if (state != XML_CONTENT_STATE) 1544 return XML_TOK_PARTIAL; 1545 #endif 1546 /* so we're parsing an external text entity... */ 1547 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1548 switch (INIT_ENC_INDEX(enc)) { 1549 case UTF_16_ENC: 1550 case UTF_16LE_ENC: 1551 case UTF_16BE_ENC: 1552 return XML_TOK_PARTIAL; 1553 } 1554 switch ((unsigned char)*ptr) { 1555 case 0xFE: 1556 case 0xFF: 1557 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1558 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1559 break; 1560 /* fall through */ 1561 case 0x00: 1562 case 0x3C: 1563 return XML_TOK_PARTIAL; 1564 } 1565 } else { 1566 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1567 case 0xFEFF: 1568 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1569 break; 1570 *nextTokPtr = ptr + 2; 1571 *encPtr = encodingTable[UTF_16BE_ENC]; 1572 return XML_TOK_BOM; 1573 /* 00 3C is handled in the default case */ 1574 case 0x3C00: 1575 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1576 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1577 && state == XML_CONTENT_STATE) 1578 break; 1579 *encPtr = encodingTable[UTF_16LE_ENC]; 1580 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1581 case 0xFFFE: 1582 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1583 break; 1584 *nextTokPtr = ptr + 2; 1585 *encPtr = encodingTable[UTF_16LE_ENC]; 1586 return XML_TOK_BOM; 1587 case 0xEFBB: 1588 /* Maybe a UTF-8 BOM (EF BB BF) */ 1589 /* If there's an explicitly specified (external) encoding 1590 of ISO-8859-1 or some flavour of UTF-16 1591 and this is an external text entity, 1592 don't look for the BOM, 1593 because it might be a legal data. 1594 */ 1595 if (state == XML_CONTENT_STATE) { 1596 int e = INIT_ENC_INDEX(enc); 1597 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC 1598 || e == UTF_16_ENC) 1599 break; 1600 } 1601 if (ptr + 2 == end) 1602 return XML_TOK_PARTIAL; 1603 if ((unsigned char)ptr[2] == 0xBF) { 1604 *nextTokPtr = ptr + 3; 1605 *encPtr = encodingTable[UTF_8_ENC]; 1606 return XML_TOK_BOM; 1607 } 1608 break; 1609 default: 1610 if (ptr[0] == '\0') { 1611 /* 0 isn't a legal data character. Furthermore a document 1612 entity can only start with ASCII characters. So the only 1613 way this can fail to be big-endian UTF-16 if it it's an 1614 external parsed general entity that's labelled as 1615 UTF-16LE. 1616 */ 1617 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1618 break; 1619 *encPtr = encodingTable[UTF_16BE_ENC]; 1620 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1621 } else if (ptr[1] == '\0') { 1622 /* We could recover here in the case: 1623 - parsing an external entity 1624 - second byte is 0 1625 - no externally specified encoding 1626 - no encoding declaration 1627 by assuming UTF-16LE. But we don't, because this would mean when 1628 presented just with a single byte, we couldn't reliably determine 1629 whether we needed further bytes. 1630 */ 1631 if (state == XML_CONTENT_STATE) 1632 break; 1633 *encPtr = encodingTable[UTF_16LE_ENC]; 1634 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1635 } 1636 break; 1637 } 1638 } 1639 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1640 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1641 } 1642 1643 #define NS(x) x 1644 #define ns(x) x 1645 #define XML_TOK_NS_C 1646 #include "xmltok_ns.c" 1647 #undef XML_TOK_NS_C 1648 #undef NS 1649 #undef ns 1650 1651 #ifdef XML_NS 1652 1653 # define NS(x) x##NS 1654 # define ns(x) x##_ns 1655 1656 # define XML_TOK_NS_C 1657 # include "xmltok_ns.c" 1658 # undef XML_TOK_NS_C 1659 1660 # undef NS 1661 # undef ns 1662 1663 ENCODING * 1664 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert, 1665 void *userData) { 1666 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1667 if (enc) 1668 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1669 return enc; 1670 } 1671 1672 #endif /* XML_NS */ 1673