/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
   Copyright (c) 2005-2009 Steven Solie <ssolie@users.sourceforge.net>
   Copyright (c) 2016-2021 Sebastian Pipping <sebastian@pipping.org>
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
   Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
   Licensed under the MIT license:

   Permission is  hereby granted,  free of charge,  to any  person obtaining
   a  copy  of  this  software   and  associated  documentation  files  (the
   "Software"),  to  deal in  the  Software  without restriction,  including
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
   following conditions:

   The above copyright  notice and this permission notice  shall be included
   in all copies or substantial portions of the Software.

   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include <expat_config.h>

#include <stddef.h>
#include <string.h> /* memcpy */
#include <stdbool.h>

#ifdef _WIN32
#  include "winconfig.h"
#endif

#include "expat_external.h"
#include "internal.h"
#include "xmltok.h"
#include "nametab.h"

/* Extra entry appended to the first group of VTABLE1 when DTD support is
   compiled in; expands to nothing otherwise.
*/
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-qualified tokenizer functions.
   It is used below as the leading part of the initializer of the ENCODING
   member of every encoding table, so PREFIX must be defined at the point
   where VTABLE1 / VTABLE is expanded.
*/
#define VTABLE1 \
  {PREFIX(prologTok), PREFIX(contentTok), \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-qualified converter pair. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)

/* Look up a 16-bit character, given as high byte and low byte, in
   namingBitmap via the page table `pages`.  Non-zero means the bit for
   that character is set.
*/
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \
         << 3) \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   in the bitmap, any other length yields 0.
*/
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))

/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each UTF8_INVALIDn(p) evaluates to non-zero when the n-byte sequence
   starting at p is rejected.  The callers in this file always pass a
   const unsigned char pointer.
*/

#define UTF8_INVALID2(p) \
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \
                                      : ((p)[2] & 0xC0) == 0xC0) \
   || ((*p) == 0xE0 \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \
   || ((p)[2] & 0xC0) == 0xC0 \
   || ((*p) == 0xF0 \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))

/* Predicate that ignores both arguments and always answers 0.  It fills
   the 4-byte name / name-start slots of the UTF-8 tables (see the
   utf8_isName4 / utf8_isNmstrt4 defines below) and, under XML_MIN_SIZE,
   the sb_isNameMin / sb_isNmstrtMin slots.
*/
static int PTRFASTCALL
isNever(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  UNUSED_P(p);
  return 0;
}

/* Non-zero if the 2-byte UTF-8 sequence at p is flagged in namePages. */
static int PTRFASTCALL
utf8_isName2(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}

/* Non-zero if the 3-byte UTF-8 sequence at p is flagged in namePages. */
static int PTRFASTCALL
utf8_isName3(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
}

#define utf8_isName4 isNever

/* Non-zero if the 2-byte UTF-8 sequence at p is flagged in nmstrtPages. */
static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
}

/* Non-zero if the 3-byte UTF-8 sequence at p is flagged in nmstrtPages. */
static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
}

#define utf8_isNmstrt4 isNever

/* Function wrappers around the UTF8_INVALIDn macros; they reinterpret p as
   unsigned bytes before applying the macro.
*/
static int PTRFASTCALL
utf8_isInvalid2(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_INVALID2((const unsigned char *)p);
}

static int PTRFASTCALL
utf8_isInvalid3(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_INVALID3((const unsigned char *)p);
}

static int PTRFASTCALL
utf8_isInvalid4(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return UTF8_INVALID4((const unsigned char *)p);
}

/* An ENCODING extended with a per-byte type table and per-encoding
   predicates.  `enc` is the first member, which is what allows the
   AS_NORMAL_ENCODING() cast from an ENCODING pointer below.
*/
struct normal_encoding {
  ENCODING enc;
  /* byte type (BT_* value) for each of the 256 possible byte values */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Only present in the XML_MIN_SIZE build, where the macros further down
     call through these pointers instead of expanding inline. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Predicates for 2, 3 and 4 byte sequences; the tables that use
     NULL_VTABLE leave all nine of them NULL. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};

#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializer fragment for the XML_MIN_SIZE-only members of
   struct normal_encoding (note the trailing comma); empty otherwise.
*/
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E) \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializer fragment for the nine multi-byte predicates, using the
   functions whose names start with prefix E.
*/
#define NORMAL_VTABLE(E) \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all left NULL. */
#define NULL_VTABLE \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL

/* Forward declaration; the definition is further down in this file. */
static int FASTCALL checkCharRefNumber(int);

#include "xmltok_impl.h"
#include "ascii.h"

/* The macros from here up to the #include of xmltok_impl.c parameterize
   that file for encodings handled one byte at a time; the result is the
   set of functions named normal_*.  Under XML_MIN_SIZE most of the macros
   call through the function pointers stored in struct normal_encoding
   instead of expanding to inline code.
*/

#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the single byte at p, taken from the encoding's type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif

/* n is pasted onto the member name, so it must be the literal 2, 3 or 4. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
/* This variant tolerates a NULL isInvalid pointer (treated as "valid"). */
#  define IS_INVALID_CHAR(enc, p, n) \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n) \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p) \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c) \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif

/* Instantiate the tokenizer implementation with the normal_ prefix. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the parameter macros so they can be redefined for the 2-byte
   encodings later in the file. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};

/* Adjust *fromLimRef so that the range [from, *fromLimRef) does not stop in
   the middle of a multi-byte UTF-8 character.

   The range is scanned backwards from its end.  Bytes that are neither
   ASCII nor one of the three lead-byte patterns match no branch and are
   only counted in `walked`.  When a lead byte is reached and at least as
   many bytes as its sequence length have been seen (lead byte included),
   the limit is placed directly after that character and the scan stops.
   Otherwise the incomplete character is left out and the scan continues
   towards `from`.  An ASCII byte ends the scan with the limit right
   after it.

   from        start of the input range
   fromLimRef  in: end of the input range; out: the adjusted end
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    if ((prev & 0xf8u)
        == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
      if (walked + 1 >= 4) {
        /* fromLim is one past the lead byte; move to one past the
           fourth byte of the character */
        fromLim += 4 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xf0u)
               == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      if (walked + 1 >= 3) {
        fromLim += 3 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xe0u)
               == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      if (walked + 1 >= 2) {
        fromLim += 2 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0x80u)
               == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }
  }
  *fromLimRef = fromLim;
}

/* Copy UTF-8 input to the output buffer without splitting a character.

   fromP, fromLim  input range; *fromP is advanced past the copied bytes
   toP, toLim      output range; *toP is advanced past the written bytes

   Returns XML_CONVERT_OUTPUT_EXHAUSTED if the input was longer than the
   available output space, else XML_CONVERT_INPUT_INCOMPLETE if a trailing
   partial character was left uncopied, else XML_CONVERT_COMPLETED.
*/
static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
            char **toP, const char *toLim) {
  bool input_incomplete = false;
  bool output_exhausted = false;

  /* Avoid copying partial characters (due to limited space). */
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
  const ptrdiff_t bytesStorable = toLim - *toP;
  UNUSED_P(enc);
  if (bytesAvailable > bytesStorable) {
    fromLim = *fromP + bytesStorable;
    output_exhausted = true;
  }

  /* Avoid copying partial characters (from incomplete input). */
  {
    const char *const fromLimBefore = fromLim;
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
    if (fromLim < fromLimBefore) {
      input_incomplete = true;
    }
  }

  {
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
    memcpy(*toP, *fromP, bytesToCopy);
    *fromP += bytesToCopy;
    *toP += bytesToCopy;
  }

  if (output_exhausted) /* needs to go first */
    return XML_CONVERT_OUTPUT_EXHAUSTED;
  else if (input_incomplete)
    return XML_CONVERT_INPUT_INCOMPLETE;
  else
    return XML_CONVERT_COMPLETED;
}

/* Convert UTF-8 input to 16-bit units.  The byte type of each lead byte is
   taken from the type table of enc.  2 and 3 byte sequences produce one
   unit, 4 byte sequences produce a surrogate pair, and every other byte is
   copied as a single unit.

   *fromP and *toP are advanced past what was consumed and produced.
   Returns XML_CONVERT_INPUT_INCOMPLETE if a multi-byte sequence is cut off
   by fromLim, XML_CONVERT_OUTPUT_EXHAUSTED if the output filled up first,
   else XML_CONVERT_COMPLETED.
*/
static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
             unsigned short **toP, const unsigned short *toLim) {
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
  unsigned short *to = *toP;
  const char *from = *fromP;
  while (from < fromLim && to < toLim) {
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
    case BT_LEAD2:
      if (fromLim - from < 2) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
      from += 2;
      break;
    case BT_LEAD3:
      if (fromLim - from < 3) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
                               | (from[2] & 0x3f));
      from += 3;
      break;
    case BT_LEAD4: {
      unsigned long n;
      /* a 4-byte sequence needs room for both halves of the pair */
      if (toLim - to < 2) {
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
        goto after;
      }
      if (fromLim - from < 4) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
      n -= 0x10000;
      to[0] = (unsigned short)((n >> 10) | 0xD800);
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
      to += 2;
      from += 4;
    } break;
    default:
      *to++ = *from++;
      break;
    }
  }
  /* the loop ended with input left over, so the output must be full */
  if (from < fromLim)
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
after:
  *fromP = from;
  *toP = to;
  return res;
}

/* Encoding tables.  In each table the 256-entry type array is filled by
   including table headers.  The variants without the _ns suffix define
   BT_COLON as BT_NMSTRT while including the ASCII table, so that entries
   written as BT_COLON in that header get the value BT_NMSTRT.
*/

#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

/* Convert Latin-1 input to UTF-8.  Bytes below 0x80 are copied unchanged,
   bytes with the top bit set become a 2-byte sequence.

   Returns XML_CONVERT_COMPLETED once all input is consumed, or
   XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not fit; in
   that case *fromP is left pointing at the unconverted character.
*/
static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
              char **toP, const char *toLim) {
  UNUSED_P(enc);
  for (;;) {
    unsigned char c;
    if (*fromP == fromLim)
      return XML_CONVERT_COMPLETED;
    c = (unsigned char)**fromP;
    if (c & 0x80) {
      if (toLim - *toP < 2)
        return XML_CONVERT_OUTPUT_EXHAUSTED;
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
      (*fromP)++;
    } else {
      if (*toP == toLim)
        return XML_CONVERT_OUTPUT_EXHAUSTED;
      *(*toP)++ = *(*fromP)++;
    }
  }
}

/* Convert Latin-1 input to 16-bit units by widening every byte (taken as
   unsigned).  Returns XML_CONVERT_OUTPUT_EXHAUSTED if the output filled up
   with input remaining, else XML_CONVERT_COMPLETED.
*/
static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
               unsigned short **toP, const unsigned short *toLim) {
  UNUSED_P(enc);
  while (*fromP < fromLim && *toP < toLim)
    *(*toP)++ = (unsigned char)*(*fromP)++;

  if ((*toP == toLim) && (*fromP < fromLim))
    return XML_CONVERT_OUTPUT_EXHAUSTED;
  else
    return XML_CONVERT_COMPLETED;
}

#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

/* Copy ASCII input to the UTF-8 output byte for byte.  Returns
   XML_CONVERT_OUTPUT_EXHAUSTED if the output filled up with input
   remaining, else XML_CONVERT_COMPLETED.
*/
static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
             char **toP, const char *toLim) {
  UNUSED_P(enc);
  while (*fromP < fromLim && *toP < toLim)
    *(*toP)++ = *(*fromP)++;

  if ((*toP == toLim) && (*fromP < fromLim))
    return XML_CONVERT_OUTPUT_EXHAUSTED;
  else
    return XML_CONVERT_COMPLETED;
}

/* The ASCII tables reuse latin1_toUtf16 as their 16-bit converter and only
   include the ASCII half of the type table. */

#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

/* Classify a 16-bit code unit given as its high and low byte.  Returns
   BT_LEAD4 for high bytes 0xD8-0xDB, BT_TRAIL for 0xDC-0xDF, BT_NONXML
   for 0xFFFE and 0xFFFF, and BT_NONASCII for everything else.
*/
static int PTRFASTCALL
unicode_byte_type(char hi, char lo) {
  switch ((unsigned char)hi) {
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
  case 0xD8:
  case 0xD9:
  case 0xDA:
  case 0xDB:
    return BT_LEAD4;
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
  case 0xDC:
  case 0xDD:
  case 0xDE:
  case 0xDF:
    return BT_TRAIL;
  case 0xFF:
    switch ((unsigned char)lo) {
    case 0xFF: /* noncharacter-FFFF */
    case 0xFE: /* noncharacter-FFFE */
      return BT_NONXML;
    }
    break;
  }
  return BT_NONASCII;
}

/* Generates the function E##toUtf8, which converts 2-byte-per-unit input to
   UTF-8.  GET_LO and GET_HI must be defined at the point of expansion; they
   select the low and high byte of a unit and so fix the byte order.

   Units with high byte 0 and low byte below 0x80 produce 1 byte; other
   units with high byte 0-7 produce 2 bytes; high bytes 0xD8-0xDB start a
   surrogate pair that consumes a second unit and produces 4 bytes; all
   remaining units produce 3 bytes.

   When the output has no room for the next character, the generated
   function stores the position of that character in *fromP and returns
   XML_CONVERT_OUTPUT_EXHAUSTED.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf8( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      char **toP, const char *toLim) { \
    const char *from = *fromP; \
    UNUSED_P(enc); \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \
    for (; from < fromLim; from += 2) { \
      int plane; \
      unsigned char lo2; \
      unsigned char lo = GET_LO(from); \
      unsigned char hi = GET_HI(from); \
      switch (hi) { \
      case 0: \
        if (lo < 0x80) { \
          if (*toP == toLim) { \
            *fromP = from; \
            return XML_CONVERT_OUTPUT_EXHAUSTED; \
          } \
          *(*toP)++ = lo; \
          break; \
        } \
        /* fall through */ \
      case 0x1: \
      case 0x2: \
      case 0x3: \
      case 0x4: \
      case 0x5: \
      case 0x6: \
      case 0x7: \
        if (toLim - *toP < 2) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      default: \
        if (toLim - *toP < 3) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      case 0xD8: \
      case 0xD9: \
      case 0xDA: \
      case 0xDB: \
        if (toLim - *toP < 4) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        if (fromLim - from < 4) { \
          *fromP = from; \
          return XML_CONVERT_INPUT_INCOMPLETE; \
        } \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
        from += 2; \
        lo2 = GET_LO(from); \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \
                     | (lo2 >> 6) | 0x80); \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
        break; \
      } \
    } \
    *fromP = from; \
    if (from < fromLim) \
      return XML_CONVERT_INPUT_INCOMPLETE; \
    else \
      return XML_CONVERT_COMPLETED; \
  }

/* Generates the function E##toUtf16, which copies 2-byte-per-unit input to
   16-bit output units, assembling each unit from GET_HI and GET_LO.  If the
   input holds more units than the output can take and the last input unit
   has a high byte in 0xD8-0xDF, that unit is held back and the result is
   XML_CONVERT_INPUT_INCOMPLETE, unless the output fills up first, in which
   case the result is XML_CONVERT_OUTPUT_EXHAUSTED.
*/
#define DEFINE_UTF16_TO_UTF16(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf16( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      unsigned short **toP, const unsigned short *toLim) { \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
    UNUSED_P(enc); \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
      fromLim -= 2; \
      res = XML_CONVERT_INPUT_INCOMPLETE; \
    } \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    if ((*toP == toLim) && (*fromP < fromLim)) \
      return XML_CONVERT_OUTPUT_EXHAUSTED; \
    else \
      return res; \
  }

/* Little-endian byte access: low byte first.  Used to generate
   little2_toUtf8 and little2_toUtf16. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte access: high byte first.  Used to generate big2_toUtf8
   and big2_toUtf16. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Helpers for input with two bytes per unit, low byte first.  If the high
   byte (p[1]) is zero the unit is classified through the encoding's type
   table using the low byte; otherwise unicode_byte_type() decides.
*/
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
               : unicode_byte_type((p)[1], (p)[0]))
/* Low byte if the high byte is zero, -1 otherwise. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == c)
#define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the LITTLE2_* macros are wrapped in functions that
   are stored in the encoding tables through STANDARD_VTABLE(little2_).
   xmltok_impl.c is not included again in this branch. */

static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* PREFIX is left unchanged in this branch; only the two converters in
   VTABLE are replaced by the little2_ ones. */
#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: include xmltok_impl.c again, this time producing the
   functions named little2_* with the macros below. */

#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */

/* Little-endian encoding tables.  The last value of the ENCODING
   initializer is 1 only when BYTEORDER is 1234. */

#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

/* The internal_ variants are compiled unless BYTEORDER is 4321. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

/* Helpers for input with two bytes per unit, high byte first.  They mirror
   the LITTLE2_* helpers with the roles of p[0] and p[1] swapped.
*/
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
       : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == c)
#define BIG2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: function wrappers around the BIG2_* macros, stored
   in the encoding tables through STANDARD_VTABLE(big2_). */

static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: include xmltok_impl.c once more, producing big2_*. */

#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */

/* Big-endian encoding tables.  The last value of the ENCODING initializer
   is 1 only when BYTEORDER is 4321. */

#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

/* The internal_ variants are compiled unless BYTEORDER is 1234. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

#undef PREFIX

/* Compare two NUL-terminated strings for equality, ignoring the case of
   the letters a-z.  Returns 1 if they are equal, 0 otherwise.
*/
static int FASTCALL
streqci(const char *s1, const char *s2) {
  for (;;) {
    char c1 = *s1++;
    char c2 = *s2++;
    if (ASCII_a <= c1 && c1 <= ASCII_z)
      c1 += ASCII_A - ASCII_a;
    if (ASCII_a <= c2 && c2 <= ASCII_z)
      /* The following line will never get executed.  streqci() is
       * only called from two places, both of which guarantee to put
       * upper-case strings into s2.
       */
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
    if (c1 != c2)
      return 0;
    if (! c1)
      break;
  }
  return 1;
}

/* updatePosition implementation that ignores the encoding passed in and
   always delegates to normal_updatePosition with the UTF-8 table.
*/
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
                   POSITION *pos) {
  UNUSED_P(enc);
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}

/* Convert the character at ptr into a one-byte buffer using
   XmlUtf8Convert (declared outside this file; presumably it calls the
   encoding's toUtf8 converter -- confirm in xmltok.h).  Returns the byte
   written, or -1 if nothing was written, e.g. because ptr == end or the
   character does not fit into a single output byte.
*/
static int
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
  char buf[1];
  char *p = buf;
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  if (p == buf)
    return -1;
  else
    return buf[0];
}

/* Returns 1 if c is space (0x20), CR (0xD), LF (0xA) or tab (0x9),
   0 otherwise.
*/
static int FASTCALL
isSpace(int c) {
  switch (c) {
  case 0x20:
  case 0xD:
  case 0xA:
  case 0x9:
    return 1;
  }
  return 0;
}

/* Return 1 if there's just optional white space or there's an S
   followed by name=val.

   On success with a pseudo-attribute present, *namePtr and *nameEndPtr
   delimit the name, *valPtr points at the first character after the
   opening quote, and *nextTokPtr points just past the closing quote.
   If only white space (or nothing) is left, *namePtr is set to NULL and
   the other outputs are not written.  On failure 0 is returned and
   *nextTokPtr points at the offending position.

   The value may contain only the characters a-z, A-Z, 0-9, '.', '-' and
   '_' and must be enclosed in matching single or double quotes.
*/
static int
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
                     const char **namePtr, const char **nameEndPtr,
                     const char **valPtr, const char **nextTokPtr) {
  int c;
  char open;
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* anything that follows must be separated by white space */
  if (! isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* scan the name up to '=' or white space followed by '=' */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      *nameEndPtr = ptr;
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* an empty name is an error */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* step over '=' and any white space before the opening quote */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* scan the value up to the matching closing quote */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}

/* Pseudo-attribute names and values recognized in the declaration. */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};

/* Parse the pseudo-attributes version, encoding and standalone, in that
   order, from the declaration in [ptr, end).

   Five characters are skipped at the start and two are cut off at the end
   before parsing; presumably these are the "<?xml" and "?>" delimiters --
   confirm against the callers.

   encodingFinder       called with enc and the range of the encoding name
                        when an encoding pseudo-attribute is present and
                        `encoding` is not NULL
   isGeneralTextEntity  non-zero relaxes the rules to those of a text
                        declaration: version may be missing, encoding is
                        then required, standalone is not allowed
   badPtr               on failure, receives the position of the error
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                        optional outputs; each may be NULL and is written
                        only when the corresponding item is present

   Returns 1 on success and 0 on failure.
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
                                                 const char *),
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
               const char *end, const char **badPtr, const char **versionPtr,
               const char **versionEndPtr, const char **encodingName,
               const ENCODING **encoding, int *standalone) {
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || ! name) {
    *badPtr = ptr;
    return 0;
  }
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* only a text declaration may start with something else; the name is
       then examined by the encoding check below */
    if (! isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  } else {
    if (versionPtr)
      *versionPtr = val;
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* the encoding name must start with an ASCII letter */
    int c = toAscii(enc, val, end);
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so the value ends one character
       before it */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name)
      return 1;
  }
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  } else {
    *badPtr = val;
    return 0;
  }
  /* only white space may follow the standalone pseudo-attribute */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}

/* Validate the numeric value of a character reference.  Returns -1 for
   values in 0xD800-0xDFFF, for values below 0x100 whose entry in the
   Latin-1 type table is BT_NONXML, and for 0xFFFE and 0xFFFF; any other
   value is returned unchanged.
*/
static int FASTCALL
checkCharRefNumber(int result) {
  switch (result >> 8) {
  case 0xD8:
  case 0xD9:
  case 0xDA:
  case 0xDB:
  case 0xDC:
  case 0xDD:
  case 0xDE:
  case 0xDF:
    return -1;
  case 0:
    if (latin1_encoding.type[result] == BT_NONXML)
      return -1;
    break;
  case 0xFF:
    if (result == 0xFFFE || result == 0xFFFF)
      return -1;
    break;
  }
  return result;
}

int FASTCALL
XmlUtf8Encode(int c, char *buf) {
  enum {
    /* minN is minimum legal resulting value for N byte sequence */
    min2 = 0x80,
    min3 = 0x800,
    min4 = 0x10000
  };

  if (c < 0)
    return 0; /*
LCOV_EXCL_LINE: this case is always eliminated beforehand */ 1276 if (c < min2) { 1277 buf[0] = (char)(c | UTF8_cval1); 1278 return 1; 1279 } 1280 if (c < min3) { 1281 buf[0] = (char)((c >> 6) | UTF8_cval2); 1282 buf[1] = (char)((c & 0x3f) | 0x80); 1283 return 2; 1284 } 1285 if (c < min4) { 1286 buf[0] = (char)((c >> 12) | UTF8_cval3); 1287 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1288 buf[2] = (char)((c & 0x3f) | 0x80); 1289 return 3; 1290 } 1291 if (c < 0x110000) { 1292 buf[0] = (char)((c >> 18) | UTF8_cval4); 1293 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1294 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1295 buf[3] = (char)((c & 0x3f) | 0x80); 1296 return 4; 1297 } 1298 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */ 1299 } 1300 1301 int FASTCALL 1302 XmlUtf16Encode(int charNum, unsigned short *buf) { 1303 if (charNum < 0) 1304 return 0; 1305 if (charNum < 0x10000) { 1306 buf[0] = (unsigned short)charNum; 1307 return 1; 1308 } 1309 if (charNum < 0x110000) { 1310 charNum -= 0x10000; 1311 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1312 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1313 return 2; 1314 } 1315 return 0; 1316 } 1317 1318 struct unknown_encoding { 1319 struct normal_encoding normal; 1320 CONVERTER convert; 1321 void *userData; 1322 unsigned short utf16[256]; 1323 char utf8[256][4]; 1324 }; 1325 1326 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc)) 1327 1328 int 1329 XmlSizeOfUnknownEncoding(void) { 1330 return sizeof(struct unknown_encoding); 1331 } 1332 1333 static int PTRFASTCALL 1334 unknown_isName(const ENCODING *enc, const char *p) { 1335 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1336 int c = uenc->convert(uenc->userData, p); 1337 if (c & ~0xFFFF) 1338 return 0; 1339 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1340 } 1341 1342 static int PTRFASTCALL 1343 unknown_isNmstrt(const ENCODING *enc, const char *p) { 1344 const struct unknown_encoding 
*uenc = AS_UNKNOWN_ENCODING(enc); 1345 int c = uenc->convert(uenc->userData, p); 1346 if (c & ~0xFFFF) 1347 return 0; 1348 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1349 } 1350 1351 static int PTRFASTCALL 1352 unknown_isInvalid(const ENCODING *enc, const char *p) { 1353 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1354 int c = uenc->convert(uenc->userData, p); 1355 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1356 } 1357 1358 static enum XML_Convert_Result PTRCALL 1359 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 1360 char **toP, const char *toLim) { 1361 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1362 char buf[XML_UTF8_ENCODE_MAX]; 1363 for (;;) { 1364 const char *utf8; 1365 int n; 1366 if (*fromP == fromLim) 1367 return XML_CONVERT_COMPLETED; 1368 utf8 = uenc->utf8[(unsigned char)**fromP]; 1369 n = *utf8++; 1370 if (n == 0) { 1371 int c = uenc->convert(uenc->userData, *fromP); 1372 n = XmlUtf8Encode(c, buf); 1373 if (n > toLim - *toP) 1374 return XML_CONVERT_OUTPUT_EXHAUSTED; 1375 utf8 = buf; 1376 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1377 - (BT_LEAD2 - 2)); 1378 } else { 1379 if (n > toLim - *toP) 1380 return XML_CONVERT_OUTPUT_EXHAUSTED; 1381 (*fromP)++; 1382 } 1383 memcpy(*toP, utf8, n); 1384 *toP += n; 1385 } 1386 } 1387 1388 static enum XML_Convert_Result PTRCALL 1389 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 1390 unsigned short **toP, const unsigned short *toLim) { 1391 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1392 while (*fromP < fromLim && *toP < toLim) { 1393 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1394 if (c == 0) { 1395 c = (unsigned short)uenc->convert(uenc->userData, *fromP); 1396 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1397 - (BT_LEAD2 - 2)); 1398 } else 1399 (*fromP)++; 1400 *(*toP)++ = c; 1401 } 1402 1403 if ((*toP == toLim) && 
(*fromP < fromLim)) 1404 return XML_CONVERT_OUTPUT_EXHAUSTED; 1405 else 1406 return XML_CONVERT_COMPLETED; 1407 } 1408 1409 ENCODING * 1410 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, 1411 void *userData) { 1412 int i; 1413 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1414 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding)); 1415 for (i = 0; i < 128; i++) 1416 if (latin1_encoding.type[i] != BT_OTHER 1417 && latin1_encoding.type[i] != BT_NONXML && table[i] != i) 1418 return 0; 1419 for (i = 0; i < 256; i++) { 1420 int c = table[i]; 1421 if (c == -1) { 1422 e->normal.type[i] = BT_MALFORM; 1423 /* This shouldn't really get used. */ 1424 e->utf16[i] = 0xFFFF; 1425 e->utf8[i][0] = 1; 1426 e->utf8[i][1] = 0; 1427 } else if (c < 0) { 1428 if (c < -4) 1429 return 0; 1430 /* Multi-byte sequences need a converter function */ 1431 if (! convert) 1432 return 0; 1433 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1434 e->utf8[i][0] = 0; 1435 e->utf16[i] = 0; 1436 } else if (c < 0x80) { 1437 if (latin1_encoding.type[c] != BT_OTHER 1438 && latin1_encoding.type[c] != BT_NONXML && c != i) 1439 return 0; 1440 e->normal.type[i] = latin1_encoding.type[c]; 1441 e->utf8[i][0] = 1; 1442 e->utf8[i][1] = (char)c; 1443 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1444 } else if (checkCharRefNumber(c) < 0) { 1445 e->normal.type[i] = BT_NONXML; 1446 /* This shouldn't really get used. 
*/ 1447 e->utf16[i] = 0xFFFF; 1448 e->utf8[i][0] = 1; 1449 e->utf8[i][1] = 0; 1450 } else { 1451 if (c > 0xFFFF) 1452 return 0; 1453 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1454 e->normal.type[i] = BT_NMSTRT; 1455 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1456 e->normal.type[i] = BT_NAME; 1457 else 1458 e->normal.type[i] = BT_OTHER; 1459 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1460 e->utf16[i] = (unsigned short)c; 1461 } 1462 } 1463 e->userData = userData; 1464 e->convert = convert; 1465 if (convert) { 1466 e->normal.isName2 = unknown_isName; 1467 e->normal.isName3 = unknown_isName; 1468 e->normal.isName4 = unknown_isName; 1469 e->normal.isNmstrt2 = unknown_isNmstrt; 1470 e->normal.isNmstrt3 = unknown_isNmstrt; 1471 e->normal.isNmstrt4 = unknown_isNmstrt; 1472 e->normal.isInvalid2 = unknown_isInvalid; 1473 e->normal.isInvalid3 = unknown_isInvalid; 1474 e->normal.isInvalid4 = unknown_isInvalid; 1475 } 1476 e->normal.enc.utf8Convert = unknown_toUtf8; 1477 e->normal.enc.utf16Convert = unknown_toUtf16; 1478 return &(e->normal.enc); 1479 } 1480 1481 /* If this enumeration is changed, getEncodingIndex and encodings 1482 must also be changed. 
*/ 1483 enum { 1484 UNKNOWN_ENC = -1, 1485 ISO_8859_1_ENC = 0, 1486 US_ASCII_ENC, 1487 UTF_8_ENC, 1488 UTF_16_ENC, 1489 UTF_16BE_ENC, 1490 UTF_16LE_ENC, 1491 /* must match encodingNames up to here */ 1492 NO_ENC 1493 }; 1494 1495 static const char KW_ISO_8859_1[] 1496 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, 1497 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'}; 1498 static const char KW_US_ASCII[] 1499 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, 1500 ASCII_C, ASCII_I, ASCII_I, '\0'}; 1501 static const char KW_UTF_8[] 1502 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'}; 1503 static const char KW_UTF_16[] 1504 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'}; 1505 static const char KW_UTF_16BE[] 1506 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1507 ASCII_6, ASCII_B, ASCII_E, '\0'}; 1508 static const char KW_UTF_16LE[] 1509 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1510 ASCII_6, ASCII_L, ASCII_E, '\0'}; 1511 1512 static int FASTCALL 1513 getEncodingIndex(const char *name) { 1514 static const char *const encodingNames[] = { 1515 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE, 1516 }; 1517 int i; 1518 if (name == NULL) 1519 return NO_ENC; 1520 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++) 1521 if (streqci(name, encodingNames[i])) 1522 return i; 1523 return UNKNOWN_ENC; 1524 } 1525 1526 /* For binary compatibility, we store the index of the encoding 1527 specified at initialization in the isUtf16 member. 1528 */ 1529 1530 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1531 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1532 1533 /* This is what detects the encoding. 
encodingTable maps from 1534 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1535 the external (protocol) specified encoding; state is 1536 XML_CONTENT_STATE if we're parsing an external text entity, and 1537 XML_PROLOG_STATE otherwise. 1538 */ 1539 1540 static int 1541 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc, 1542 int state, const char *ptr, const char *end, const char **nextTokPtr) { 1543 const ENCODING **encPtr; 1544 1545 if (ptr >= end) 1546 return XML_TOK_NONE; 1547 encPtr = enc->encPtr; 1548 if (ptr + 1 == end) { 1549 /* only a single byte available for auto-detection */ 1550 #ifndef XML_DTD /* FIXME */ 1551 /* a well-formed document entity must have more than one byte */ 1552 if (state != XML_CONTENT_STATE) 1553 return XML_TOK_PARTIAL; 1554 #endif 1555 /* so we're parsing an external text entity... */ 1556 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1557 switch (INIT_ENC_INDEX(enc)) { 1558 case UTF_16_ENC: 1559 case UTF_16LE_ENC: 1560 case UTF_16BE_ENC: 1561 return XML_TOK_PARTIAL; 1562 } 1563 switch ((unsigned char)*ptr) { 1564 case 0xFE: 1565 case 0xFF: 1566 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1567 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1568 break; 1569 /* fall through */ 1570 case 0x00: 1571 case 0x3C: 1572 return XML_TOK_PARTIAL; 1573 } 1574 } else { 1575 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1576 case 0xFEFF: 1577 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1578 break; 1579 *nextTokPtr = ptr + 2; 1580 *encPtr = encodingTable[UTF_16BE_ENC]; 1581 return XML_TOK_BOM; 1582 /* 00 3C is handled in the default case */ 1583 case 0x3C00: 1584 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1585 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1586 && state == XML_CONTENT_STATE) 1587 break; 1588 *encPtr = encodingTable[UTF_16LE_ENC]; 1589 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1590 case 
0xFFFE: 1591 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1592 break; 1593 *nextTokPtr = ptr + 2; 1594 *encPtr = encodingTable[UTF_16LE_ENC]; 1595 return XML_TOK_BOM; 1596 case 0xEFBB: 1597 /* Maybe a UTF-8 BOM (EF BB BF) */ 1598 /* If there's an explicitly specified (external) encoding 1599 of ISO-8859-1 or some flavour of UTF-16 1600 and this is an external text entity, 1601 don't look for the BOM, 1602 because it might be a legal data. 1603 */ 1604 if (state == XML_CONTENT_STATE) { 1605 int e = INIT_ENC_INDEX(enc); 1606 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC 1607 || e == UTF_16_ENC) 1608 break; 1609 } 1610 if (ptr + 2 == end) 1611 return XML_TOK_PARTIAL; 1612 if ((unsigned char)ptr[2] == 0xBF) { 1613 *nextTokPtr = ptr + 3; 1614 *encPtr = encodingTable[UTF_8_ENC]; 1615 return XML_TOK_BOM; 1616 } 1617 break; 1618 default: 1619 if (ptr[0] == '\0') { 1620 /* 0 isn't a legal data character. Furthermore a document 1621 entity can only start with ASCII characters. So the only 1622 way this can fail to be big-endian UTF-16 if it it's an 1623 external parsed general entity that's labelled as 1624 UTF-16LE. 1625 */ 1626 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1627 break; 1628 *encPtr = encodingTable[UTF_16BE_ENC]; 1629 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1630 } else if (ptr[1] == '\0') { 1631 /* We could recover here in the case: 1632 - parsing an external entity 1633 - second byte is 0 1634 - no externally specified encoding 1635 - no encoding declaration 1636 by assuming UTF-16LE. But we don't, because this would mean when 1637 presented just with a single byte, we couldn't reliably determine 1638 whether we needed further bytes. 
1639 */ 1640 if (state == XML_CONTENT_STATE) 1641 break; 1642 *encPtr = encodingTable[UTF_16LE_ENC]; 1643 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1644 } 1645 break; 1646 } 1647 } 1648 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1649 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1650 } 1651 1652 #define NS(x) x 1653 #define ns(x) x 1654 #define XML_TOK_NS_C 1655 #include "xmltok_ns.c" 1656 #undef XML_TOK_NS_C 1657 #undef NS 1658 #undef ns 1659 1660 #ifdef XML_NS 1661 1662 # define NS(x) x##NS 1663 # define ns(x) x##_ns 1664 1665 # define XML_TOK_NS_C 1666 # include "xmltok_ns.c" 1667 # undef XML_TOK_NS_C 1668 1669 # undef NS 1670 # undef ns 1671 1672 ENCODING * 1673 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert, 1674 void *userData) { 1675 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1676 if (enc) 1677 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1678 return enc; 1679 } 1680 1681 #endif /* XML_NS */ 1682