1 /* 2 __ __ _ 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parser 8 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net> 11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net> 12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net> 13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net> 14 Copyright (c) 2005-2009 Steven Solie <steven@solie.ca> 15 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org> 16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com> 17 Copyright (c) 2016 Don Lewis <truckman@apache.org> 18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk> 19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net> 20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com> 21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com> 22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com> 23 Copyright (c) 2021 Dong-hee Na <donghee.na@python.org> 24 Licensed under the MIT license: 25 26 Permission is hereby granted, free of charge, to any person obtaining 27 a copy of this software and associated documentation files (the 28 "Software"), to deal in the Software without restriction, including 29 without limitation the rights to use, copy, modify, merge, publish, 30 distribute, sublicense, and/or sell copies of the Software, and to permit 31 persons to whom the Software is furnished to do so, subject to the 32 following conditions: 33 34 The above copyright notice and this permission notice shall be included 35 in all copies or substantial portions of the Software. 36 37 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 38 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 39 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN 40 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 41 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 42 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 43 USE OR OTHER DEALINGS IN THE SOFTWARE. 44 */ 45 46 #include <expat_config.h> 47 48 #include <stddef.h> 49 #include <string.h> /* memcpy */ 50 #include <stdbool.h> 51 52 #ifdef _WIN32 53 # include "winconfig.h" 54 #endif 55 56 #include "expat_external.h" 57 #include "internal.h" 58 #include "xmltok.h" 59 #include "nametab.h" 60 61 #ifdef XML_DTD 62 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 63 #else 64 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 65 #endif 66 67 #define VTABLE1 \ 68 {PREFIX(prologTok), PREFIX(contentTok), \ 69 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \ 70 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \ 71 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \ 72 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \ 73 PREFIX(updatePosition), PREFIX(isPublicId) 74 75 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 76 77 #define UCS2_GET_NAMING(pages, hi, lo) \ 78 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F))) 79 80 /* A 2 byte UTF-8 representation splits the characters 11 bits between 81 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 82 pages, 3 bits to add to that index and 5 bits to generate the mask. 83 */ 84 #define UTF8_GET_NAMING2(pages, byte) \ 85 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 86 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \ 87 & (1u << (((byte)[1]) & 0x1F))) 88 89 /* A 3 byte UTF-8 representation splits the characters 16 bits between 90 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 91 into pages, 3 bits to add to that index and 5 bits to generate the 92 mask. 
/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))

/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro evaluates to non-zero when the 2-, 3- or 4-byte sequence
   starting at p is invalid.  p must address unsigned bytes (the range
   comparisons rely on values 0x80..0xFF being positive) and is
   evaluated more than once, so it must be free of side effects.  Every
   use of the argument is written as (p)[i] so that pointer expressions
   such as "buf + 1" are treated as a single operand.
*/

/* Lead bytes 0xC0/0xC1 (overlong forms) and non-continuation second
   bytes are rejected. */
#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* Third byte must be a continuation byte (and at most 0xBD after EF,BF);
   the second byte's legal range depends on the lead byte: E0 needs
   A0..BF, ED needs 80..9F, everything else needs 80..BF. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* Third and fourth bytes must be continuation bytes; the second byte's
   legal range depends on the lead byte: F0 needs 90..BF, F4 needs
   80..8F, everything else needs 80..BF. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
(p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 130 131 static int PTRFASTCALL 132 isNever(const ENCODING *enc, const char *p) { 133 UNUSED_P(enc); 134 UNUSED_P(p); 135 return 0; 136 } 137 138 static int PTRFASTCALL 139 utf8_isName2(const ENCODING *enc, const char *p) { 140 UNUSED_P(enc); 141 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 142 } 143 144 static int PTRFASTCALL 145 utf8_isName3(const ENCODING *enc, const char *p) { 146 UNUSED_P(enc); 147 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 148 } 149 150 #define utf8_isName4 isNever 151 152 static int PTRFASTCALL 153 utf8_isNmstrt2(const ENCODING *enc, const char *p) { 154 UNUSED_P(enc); 155 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 156 } 157 158 static int PTRFASTCALL 159 utf8_isNmstrt3(const ENCODING *enc, const char *p) { 160 UNUSED_P(enc); 161 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 162 } 163 164 #define utf8_isNmstrt4 isNever 165 166 static int PTRFASTCALL 167 utf8_isInvalid2(const ENCODING *enc, const char *p) { 168 UNUSED_P(enc); 169 return UTF8_INVALID2((const unsigned char *)p); 170 } 171 172 static int PTRFASTCALL 173 utf8_isInvalid3(const ENCODING *enc, const char *p) { 174 UNUSED_P(enc); 175 return UTF8_INVALID3((const unsigned char *)p); 176 } 177 178 static int PTRFASTCALL 179 utf8_isInvalid4(const ENCODING *enc, const char *p) { 180 UNUSED_P(enc); 181 return UTF8_INVALID4((const unsigned char *)p); 182 } 183 184 struct normal_encoding { 185 ENCODING enc; 186 unsigned char type[256]; 187 #ifdef XML_MIN_SIZE 188 int(PTRFASTCALL *byteType)(const ENCODING *, const char *); 189 int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 190 int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 191 int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 192 int(PTRCALL *charMatches)(const ENCODING *, const char *, int); 193 #endif /* XML_MIN_SIZE */ 194 int(PTRFASTCALL *isName2)(const ENCODING *, const char *); 
195 int(PTRFASTCALL *isName3)(const ENCODING *, const char *); 196 int(PTRFASTCALL *isName4)(const ENCODING *, const char *); 197 int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 198 int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 199 int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 200 int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 201 int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 202 int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 203 }; 204 205 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc)) 206 207 #ifdef XML_MIN_SIZE 208 209 # define STANDARD_VTABLE(E) \ 210 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches, 211 212 #else 213 214 # define STANDARD_VTABLE(E) /* as nothing */ 215 216 #endif 217 218 #define NORMAL_VTABLE(E) \ 219 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \ 220 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4 221 222 #define NULL_VTABLE \ 223 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \ 224 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \ 225 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL 226 227 static int FASTCALL checkCharRefNumber(int); 228 229 #include "xmltok_impl.h" 230 #include "ascii.h" 231 232 #ifdef XML_MIN_SIZE 233 # define sb_isNameMin isNever 234 # define sb_isNmstrtMin isNever 235 #endif 236 237 #ifdef XML_MIN_SIZE 238 # define MINBPC(enc) ((enc)->minBytesPerChar) 239 #else 240 /* minimum bytes per character */ 241 # define MINBPC(enc) 1 242 #endif 243 244 #define SB_BYTE_TYPE(enc, p) \ 245 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 246 247 #ifdef XML_MIN_SIZE 248 static int PTRFASTCALL 249 sb_byteType(const ENCODING *enc, const char *p) { 250 return SB_BYTE_TYPE(enc, p); 251 } 252 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 253 #else 254 # define BYTE_TYPE(enc, p) 
SB_BYTE_TYPE(enc, p) 255 #endif 256 257 #ifdef XML_MIN_SIZE 258 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 259 static int PTRFASTCALL 260 sb_byteToAscii(const ENCODING *enc, const char *p) { 261 UNUSED_P(enc); 262 return *p; 263 } 264 #else 265 # define BYTE_TO_ASCII(enc, p) (*(p)) 266 #endif 267 268 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p)) 269 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p)) 270 #ifdef XML_MIN_SIZE 271 # define IS_INVALID_CHAR(enc, p, n) \ 272 (AS_NORMAL_ENCODING(enc)->isInvalid##n \ 273 && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p)) 274 #else 275 # define IS_INVALID_CHAR(enc, p, n) \ 276 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p)) 277 #endif 278 279 #ifdef XML_MIN_SIZE 280 # define IS_NAME_CHAR_MINBPC(enc, p) \ 281 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 282 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 283 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 284 #else 285 # define IS_NAME_CHAR_MINBPC(enc, p) (0) 286 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 287 #endif 288 289 #ifdef XML_MIN_SIZE 290 # define CHAR_MATCHES(enc, p, c) \ 291 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 292 static int PTRCALL 293 sb_charMatches(const ENCODING *enc, const char *p, int c) { 294 UNUSED_P(enc); 295 return *p == c; 296 } 297 #else 298 /* c is an ASCII character */ 299 # define CHAR_MATCHES(enc, p, c) (*(p) == c) 300 #endif 301 302 #define PREFIX(ident) normal_##ident 303 #define XML_TOK_IMPL_C 304 #include "xmltok_impl.c" 305 #undef XML_TOK_IMPL_C 306 307 #undef MINBPC 308 #undef BYTE_TYPE 309 #undef BYTE_TO_ASCII 310 #undef CHAR_MATCHES 311 #undef IS_NAME_CHAR 312 #undef IS_NAME_CHAR_MINBPC 313 #undef IS_NMSTRT_CHAR 314 #undef IS_NMSTRT_CHAR_MINBPC 315 #undef IS_INVALID_CHAR 316 317 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 318 UTF8_cval1 = 0x00, 319 UTF8_cval2 = 0xc0, 320 UTF8_cval3 = 0xe0, 321 UTF8_cval4 = 0xf0 
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The buffer is scanned from its end towards `from`.  Bytes that are
   neither ASCII nor a lead byte are stepped over and counted.  When a
   lead byte is reached, the count of bytes seen behind it decides
   whether its character is complete: if so the limit is placed right
   after that character, otherwise the truncated character is dropped
   and scanning continues in front of it.  An ASCII byte ends the scan
   with the limit directly behind it.
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t trailing = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen = 0; /* 0: not a lead byte */

    if ((last & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    } else if ((last & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    } else if ((last & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((last & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    }

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* Character is complete: keep it, cut right behind it. */
        end += seqLen - 1;
        break;
      }
      /* Truncated character: drop it and start counting afresh. */
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
*/ 379 { 380 const char *const fromLimBefore = fromLim; 381 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim); 382 if (fromLim < fromLimBefore) { 383 input_incomplete = true; 384 } 385 } 386 387 { 388 const ptrdiff_t bytesToCopy = fromLim - *fromP; 389 memcpy(*toP, *fromP, bytesToCopy); 390 *fromP += bytesToCopy; 391 *toP += bytesToCopy; 392 } 393 394 if (output_exhausted) /* needs to go first */ 395 return XML_CONVERT_OUTPUT_EXHAUSTED; 396 else if (input_incomplete) 397 return XML_CONVERT_INPUT_INCOMPLETE; 398 else 399 return XML_CONVERT_COMPLETED; 400 } 401 402 static enum XML_Convert_Result PTRCALL 403 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 404 unsigned short **toP, const unsigned short *toLim) { 405 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 406 unsigned short *to = *toP; 407 const char *from = *fromP; 408 while (from < fromLim && to < toLim) { 409 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { 410 case BT_LEAD2: 411 if (fromLim - from < 2) { 412 res = XML_CONVERT_INPUT_INCOMPLETE; 413 goto after; 414 } 415 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 416 from += 2; 417 break; 418 case BT_LEAD3: 419 if (fromLim - from < 3) { 420 res = XML_CONVERT_INPUT_INCOMPLETE; 421 goto after; 422 } 423 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) 424 | (from[2] & 0x3f)); 425 from += 3; 426 break; 427 case BT_LEAD4: { 428 unsigned long n; 429 if (toLim - to < 2) { 430 res = XML_CONVERT_OUTPUT_EXHAUSTED; 431 goto after; 432 } 433 if (fromLim - from < 4) { 434 res = XML_CONVERT_INPUT_INCOMPLETE; 435 goto after; 436 } 437 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 438 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 439 n -= 0x10000; 440 to[0] = (unsigned short)((n >> 10) | 0xD800); 441 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 442 to += 2; 443 from += 4; 444 } break; 445 default: 446 *to++ = *from++; 447 break; 448 } 449 } 450 
if (from < fromLim) 451 res = XML_CONVERT_OUTPUT_EXHAUSTED; 452 after: 453 *fromP = from; 454 *toP = to; 455 return res; 456 } 457 458 #ifdef XML_NS 459 static const struct normal_encoding utf8_encoding_ns 460 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 461 { 462 # include "asciitab.h" 463 # include "utf8tab.h" 464 }, 465 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 466 #endif 467 468 static const struct normal_encoding utf8_encoding 469 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 470 { 471 #define BT_COLON BT_NMSTRT 472 #include "asciitab.h" 473 #undef BT_COLON 474 #include "utf8tab.h" 475 }, 476 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 477 478 #ifdef XML_NS 479 480 static const struct normal_encoding internal_utf8_encoding_ns 481 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 482 { 483 # include "iasciitab.h" 484 # include "utf8tab.h" 485 }, 486 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 487 488 #endif 489 490 static const struct normal_encoding internal_utf8_encoding 491 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 492 { 493 #define BT_COLON BT_NMSTRT 494 #include "iasciitab.h" 495 #undef BT_COLON 496 #include "utf8tab.h" 497 }, 498 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 499 500 static enum XML_Convert_Result PTRCALL 501 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 502 char **toP, const char *toLim) { 503 UNUSED_P(enc); 504 for (;;) { 505 unsigned char c; 506 if (*fromP == fromLim) 507 return XML_CONVERT_COMPLETED; 508 c = (unsigned char)**fromP; 509 if (c & 0x80) { 510 if (toLim - *toP < 2) 511 return XML_CONVERT_OUTPUT_EXHAUSTED; 512 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 513 *(*toP)++ = (char)((c & 0x3f) | 0x80); 514 (*fromP)++; 515 } else { 516 if (*toP == toLim) 517 return XML_CONVERT_OUTPUT_EXHAUSTED; 518 *(*toP)++ = *(*fromP)++; 519 } 520 } 521 } 522 523 static enum XML_Convert_Result PTRCALL 524 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 525 unsigned short 
**toP, const unsigned short *toLim) { 526 UNUSED_P(enc); 527 while (*fromP < fromLim && *toP < toLim) 528 *(*toP)++ = (unsigned char)*(*fromP)++; 529 530 if ((*toP == toLim) && (*fromP < fromLim)) 531 return XML_CONVERT_OUTPUT_EXHAUSTED; 532 else 533 return XML_CONVERT_COMPLETED; 534 } 535 536 #ifdef XML_NS 537 538 static const struct normal_encoding latin1_encoding_ns 539 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 540 { 541 # include "asciitab.h" 542 # include "latin1tab.h" 543 }, 544 STANDARD_VTABLE(sb_) NULL_VTABLE}; 545 546 #endif 547 548 static const struct normal_encoding latin1_encoding 549 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 550 { 551 #define BT_COLON BT_NMSTRT 552 #include "asciitab.h" 553 #undef BT_COLON 554 #include "latin1tab.h" 555 }, 556 STANDARD_VTABLE(sb_) NULL_VTABLE}; 557 558 static enum XML_Convert_Result PTRCALL 559 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 560 char **toP, const char *toLim) { 561 UNUSED_P(enc); 562 while (*fromP < fromLim && *toP < toLim) 563 *(*toP)++ = *(*fromP)++; 564 565 if ((*toP == toLim) && (*fromP < fromLim)) 566 return XML_CONVERT_OUTPUT_EXHAUSTED; 567 else 568 return XML_CONVERT_COMPLETED; 569 } 570 571 #ifdef XML_NS 572 573 static const struct normal_encoding ascii_encoding_ns 574 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 575 { 576 # include "asciitab.h" 577 /* BT_NONXML == 0 */ 578 }, 579 STANDARD_VTABLE(sb_) NULL_VTABLE}; 580 581 #endif 582 583 static const struct normal_encoding ascii_encoding 584 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 585 { 586 #define BT_COLON BT_NMSTRT 587 #include "asciitab.h" 588 #undef BT_COLON 589 /* BT_NONXML == 0 */ 590 }, 591 STANDARD_VTABLE(sb_) NULL_VTABLE}; 592 593 static int PTRFASTCALL 594 unicode_byte_type(char hi, char lo) { 595 switch ((unsigned char)hi) { 596 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */ 597 case 0xD8: 598 case 0xD9: 599 case 0xDA: 600 case 0xDB: 601 return 
BT_LEAD4; 602 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */ 603 case 0xDC: 604 case 0xDD: 605 case 0xDE: 606 case 0xDF: 607 return BT_TRAIL; 608 case 0xFF: 609 switch ((unsigned char)lo) { 610 case 0xFF: /* noncharacter-FFFF */ 611 case 0xFE: /* noncharacter-FFFE */ 612 return BT_NONXML; 613 } 614 break; 615 } 616 return BT_NONASCII; 617 } 618 619 #define DEFINE_UTF16_TO_UTF8(E) \ 620 static enum XML_Convert_Result PTRCALL E##toUtf8( \ 621 const ENCODING *enc, const char **fromP, const char *fromLim, \ 622 char **toP, const char *toLim) { \ 623 const char *from = *fromP; \ 624 UNUSED_P(enc); \ 625 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ 626 for (; from < fromLim; from += 2) { \ 627 int plane; \ 628 unsigned char lo2; \ 629 unsigned char lo = GET_LO(from); \ 630 unsigned char hi = GET_HI(from); \ 631 switch (hi) { \ 632 case 0: \ 633 if (lo < 0x80) { \ 634 if (*toP == toLim) { \ 635 *fromP = from; \ 636 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 637 } \ 638 *(*toP)++ = lo; \ 639 break; \ 640 } \ 641 /* fall through */ \ 642 case 0x1: \ 643 case 0x2: \ 644 case 0x3: \ 645 case 0x4: \ 646 case 0x5: \ 647 case 0x6: \ 648 case 0x7: \ 649 if (toLim - *toP < 2) { \ 650 *fromP = from; \ 651 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 652 } \ 653 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 654 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 655 break; \ 656 default: \ 657 if (toLim - *toP < 3) { \ 658 *fromP = from; \ 659 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 660 } \ 661 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 662 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 663 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 664 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 665 break; \ 666 case 0xD8: \ 667 case 0xD9: \ 668 case 0xDA: \ 669 case 0xDB: \ 670 if (toLim - *toP < 4) { \ 671 *fromP = from; \ 672 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 673 } \ 674 if (fromLim - from < 4) { \ 675 *fromP = from; \ 676 return XML_CONVERT_INPUT_INCOMPLETE; \ 677 
} \ 678 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 679 *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \ 680 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 681 from += 2; \ 682 lo2 = GET_LO(from); \ 683 *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \ 684 | (lo2 >> 6) | 0x80); \ 685 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 686 break; \ 687 } \ 688 } \ 689 *fromP = from; \ 690 if (from < fromLim) \ 691 return XML_CONVERT_INPUT_INCOMPLETE; \ 692 else \ 693 return XML_CONVERT_COMPLETED; \ 694 } 695 696 #define DEFINE_UTF16_TO_UTF16(E) \ 697 static enum XML_Convert_Result PTRCALL E##toUtf16( \ 698 const ENCODING *enc, const char **fromP, const char *fromLim, \ 699 unsigned short **toP, const unsigned short *toLim) { \ 700 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ 701 UNUSED_P(enc); \ 702 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ 703 /* Avoid copying first half only of surrogate */ \ 704 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 705 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ 706 fromLim -= 2; \ 707 res = XML_CONVERT_INPUT_INCOMPLETE; \ 708 } \ 709 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ 710 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 711 if ((*toP == toLim) && (*fromP < fromLim)) \ 712 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 713 else \ 714 return res; \ 715 } 716 717 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8))) 718 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 719 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 720 721 DEFINE_UTF16_TO_UTF8(little2_) 722 DEFINE_UTF16_TO_UTF16(little2_) 723 724 #undef SET2 725 #undef GET_LO 726 #undef GET_HI 727 728 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF))) 729 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 730 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 731 732 DEFINE_UTF16_TO_UTF8(big2_) 733 DEFINE_UTF16_TO_UTF16(big2_) 734 735 #undef SET2 736 #undef GET_LO 
/* Helpers for little-endian 2-byte encodings: p addresses one code
   unit whose low byte is (p)[0] and whose high byte is (p)[1].
   Every macro argument is parenthesised so that pointer expressions
   and compound character expressions expand as a single operand;
   p is evaluated more than once and must be free of side effects. */

/* Byte type of the code unit: table lookup on the low byte when the
   high byte is zero, unicode_byte_type() otherwise. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)]            \
               : unicode_byte_type((p)[1], (p)[0]))
/* Low byte when the high byte is zero, -1 otherwise. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups with the high byte as page, low byte as index. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
/* Helpers for big-endian 2-byte encodings: p addresses one code unit
   whose high byte is (p)[0] and whose low byte is (p)[1].
   Every macro argument is parenthesised so that pointer expressions
   and compound character expressions expand as a single operand;
   p is evaluated more than once and must be free of side effects. */

/* Byte type of the code unit: table lookup on the low byte when the
   high byte is zero, unicode_byte_type() otherwise. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)(p)[1]]          \
               : unicode_byte_type((p)[0], (p)[1]))
/* Low byte when the high byte is zero, -1 otherwise. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming-bitmap lookups with the high byte as page, low byte as index. */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
*/ 924 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 925 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p) 926 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c) 927 # define IS_NAME_CHAR(enc, p, n) 0 928 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p) 929 # define IS_NMSTRT_CHAR(enc, p, n) (0) 930 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p) 931 932 # define XML_TOK_IMPL_C 933 # include "xmltok_impl.c" 934 # undef XML_TOK_IMPL_C 935 936 # undef MINBPC 937 # undef BYTE_TYPE 938 # undef BYTE_TO_ASCII 939 # undef CHAR_MATCHES 940 # undef IS_NAME_CHAR 941 # undef IS_NAME_CHAR_MINBPC 942 # undef IS_NMSTRT_CHAR 943 # undef IS_NMSTRT_CHAR_MINBPC 944 # undef IS_INVALID_CHAR 945 946 #endif /* not XML_MIN_SIZE */ 947 948 #ifdef XML_NS 949 950 static const struct normal_encoding big2_encoding_ns 951 = {{VTABLE, 2, 0, 952 # if BYTEORDER == 4321 953 1 954 # else 955 0 956 # endif 957 }, 958 { 959 # include "asciitab.h" 960 # include "latin1tab.h" 961 }, 962 STANDARD_VTABLE(big2_) NULL_VTABLE}; 963 964 #endif 965 966 static const struct normal_encoding big2_encoding 967 = {{VTABLE, 2, 0, 968 #if BYTEORDER == 4321 969 1 970 #else 971 0 972 #endif 973 }, 974 { 975 #define BT_COLON BT_NMSTRT 976 #include "asciitab.h" 977 #undef BT_COLON 978 #include "latin1tab.h" 979 }, 980 STANDARD_VTABLE(big2_) NULL_VTABLE}; 981 982 #if BYTEORDER != 1234 983 984 # ifdef XML_NS 985 986 static const struct normal_encoding internal_big2_encoding_ns 987 = {{VTABLE, 2, 0, 1}, 988 { 989 # include "iasciitab.h" 990 # include "latin1tab.h" 991 }, 992 STANDARD_VTABLE(big2_) NULL_VTABLE}; 993 994 # endif 995 996 static const struct normal_encoding internal_big2_encoding 997 = {{VTABLE, 2, 0, 1}, 998 { 999 # define BT_COLON BT_NMSTRT 1000 # include "iasciitab.h" 1001 # undef BT_COLON 1002 # include "latin1tab.h" 1003 }, 1004 STANDARD_VTABLE(big2_) NULL_VTABLE}; 1005 1006 #endif 1007 1008 #undef PREFIX 1009 1010 static int FASTCALL 1011 streqci(const 
char *s1, const char *s2) { 1012 for (;;) { 1013 char c1 = *s1++; 1014 char c2 = *s2++; 1015 if (ASCII_a <= c1 && c1 <= ASCII_z) 1016 c1 += ASCII_A - ASCII_a; 1017 if (ASCII_a <= c2 && c2 <= ASCII_z) 1018 /* The following line will never get executed. streqci() is 1019 * only called from two places, both of which guarantee to put 1020 * upper-case strings into s2. 1021 */ 1022 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */ 1023 if (c1 != c2) 1024 return 0; 1025 if (! c1) 1026 break; 1027 } 1028 return 1; 1029 } 1030 1031 static void PTRCALL 1032 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end, 1033 POSITION *pos) { 1034 UNUSED_P(enc); 1035 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 1036 } 1037 1038 static int 1039 toAscii(const ENCODING *enc, const char *ptr, const char *end) { 1040 char buf[1]; 1041 char *p = buf; 1042 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1043 if (p == buf) 1044 return -1; 1045 else 1046 return buf[0]; 1047 } 1048 1049 static int FASTCALL 1050 isSpace(int c) { 1051 switch (c) { 1052 case 0x20: 1053 case 0xD: 1054 case 0xA: 1055 case 0x9: 1056 return 1; 1057 } 1058 return 0; 1059 } 1060 1061 /* Return 1 if there's just optional white space or there's an S 1062 followed by name=val. 1063 */ 1064 static int 1065 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end, 1066 const char **namePtr, const char **nameEndPtr, 1067 const char **valPtr, const char **nextTokPtr) { 1068 int c; 1069 char open; 1070 if (ptr == end) { 1071 *namePtr = NULL; 1072 return 1; 1073 } 1074 if (! 
isSpace(toAscii(enc, ptr, end))) { 1075 *nextTokPtr = ptr; 1076 return 0; 1077 } 1078 do { 1079 ptr += enc->minBytesPerChar; 1080 } while (isSpace(toAscii(enc, ptr, end))); 1081 if (ptr == end) { 1082 *namePtr = NULL; 1083 return 1; 1084 } 1085 *namePtr = ptr; 1086 for (;;) { 1087 c = toAscii(enc, ptr, end); 1088 if (c == -1) { 1089 *nextTokPtr = ptr; 1090 return 0; 1091 } 1092 if (c == ASCII_EQUALS) { 1093 *nameEndPtr = ptr; 1094 break; 1095 } 1096 if (isSpace(c)) { 1097 *nameEndPtr = ptr; 1098 do { 1099 ptr += enc->minBytesPerChar; 1100 } while (isSpace(c = toAscii(enc, ptr, end))); 1101 if (c != ASCII_EQUALS) { 1102 *nextTokPtr = ptr; 1103 return 0; 1104 } 1105 break; 1106 } 1107 ptr += enc->minBytesPerChar; 1108 } 1109 if (ptr == *namePtr) { 1110 *nextTokPtr = ptr; 1111 return 0; 1112 } 1113 ptr += enc->minBytesPerChar; 1114 c = toAscii(enc, ptr, end); 1115 while (isSpace(c)) { 1116 ptr += enc->minBytesPerChar; 1117 c = toAscii(enc, ptr, end); 1118 } 1119 if (c != ASCII_QUOT && c != ASCII_APOS) { 1120 *nextTokPtr = ptr; 1121 return 0; 1122 } 1123 open = (char)c; 1124 ptr += enc->minBytesPerChar; 1125 *valPtr = ptr; 1126 for (;; ptr += enc->minBytesPerChar) { 1127 c = toAscii(enc, ptr, end); 1128 if (c == open) 1129 break; 1130 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z) 1131 && ! 
(ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD 1132 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) { 1133 *nextTokPtr = ptr; 1134 return 0; 1135 } 1136 } 1137 *nextTokPtr = ptr + enc->minBytesPerChar; 1138 return 1; 1139 } 1140 1141 static const char KW_version[] 1142 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'}; 1143 1144 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, 1145 ASCII_i, ASCII_n, ASCII_g, '\0'}; 1146 1147 static const char KW_standalone[] 1148 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, 1149 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'}; 1150 1151 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'}; 1152 1153 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'}; 1154 1155 static int 1156 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *, 1157 const char *), 1158 int isGeneralTextEntity, const ENCODING *enc, const char *ptr, 1159 const char *end, const char **badPtr, const char **versionPtr, 1160 const char **versionEndPtr, const char **encodingName, 1161 const ENCODING **encoding, int *standalone) { 1162 const char *val = NULL; 1163 const char *name = NULL; 1164 const char *nameEnd = NULL; 1165 ptr += 5 * enc->minBytesPerChar; 1166 end -= 2 * enc->minBytesPerChar; 1167 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1168 || ! name) { 1169 *badPtr = ptr; 1170 return 0; 1171 } 1172 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1173 if (! isGeneralTextEntity) { 1174 *badPtr = name; 1175 return 0; 1176 } 1177 } else { 1178 if (versionPtr) 1179 *versionPtr = val; 1180 if (versionEndPtr) 1181 *versionEndPtr = ptr; 1182 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1183 *badPtr = ptr; 1184 return 0; 1185 } 1186 if (! 
name) { 1187 if (isGeneralTextEntity) { 1188 /* a TextDecl must have an EncodingDecl */ 1189 *badPtr = ptr; 1190 return 0; 1191 } 1192 return 1; 1193 } 1194 } 1195 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1196 int c = toAscii(enc, val, end); 1197 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) { 1198 *badPtr = val; 1199 return 0; 1200 } 1201 if (encodingName) 1202 *encodingName = val; 1203 if (encoding) 1204 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1205 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1206 *badPtr = ptr; 1207 return 0; 1208 } 1209 if (! name) 1210 return 1; 1211 } 1212 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1213 || isGeneralTextEntity) { 1214 *badPtr = name; 1215 return 0; 1216 } 1217 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1218 if (standalone) 1219 *standalone = 1; 1220 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1221 if (standalone) 1222 *standalone = 0; 1223 } else { 1224 *badPtr = val; 1225 return 0; 1226 } 1227 while (isSpace(toAscii(enc, ptr, end))) 1228 ptr += enc->minBytesPerChar; 1229 if (ptr != end) { 1230 *badPtr = ptr; 1231 return 0; 1232 } 1233 return 1; 1234 } 1235 1236 static int FASTCALL 1237 checkCharRefNumber(int result) { 1238 switch (result >> 8) { 1239 case 0xD8: 1240 case 0xD9: 1241 case 0xDA: 1242 case 0xDB: 1243 case 0xDC: 1244 case 0xDD: 1245 case 0xDE: 1246 case 0xDF: 1247 return -1; 1248 case 0: 1249 if (latin1_encoding.type[result] == BT_NONXML) 1250 return -1; 1251 break; 1252 case 0xFF: 1253 if (result == 0xFFFE || result == 0xFFFF) 1254 return -1; 1255 break; 1256 } 1257 return result; 1258 } 1259 1260 int FASTCALL 1261 XmlUtf8Encode(int c, char *buf) { 1262 enum { 1263 /* minN is minimum legal resulting value for N byte sequence */ 1264 min2 = 0x80, 1265 min3 = 0x800, 1266 min4 = 0x10000 1267 }; 1268 1269 if (c < 0) 1270 return 0; /* 
LCOV_EXCL_LINE: this case is always eliminated beforehand */ 1271 if (c < min2) { 1272 buf[0] = (char)(c | UTF8_cval1); 1273 return 1; 1274 } 1275 if (c < min3) { 1276 buf[0] = (char)((c >> 6) | UTF8_cval2); 1277 buf[1] = (char)((c & 0x3f) | 0x80); 1278 return 2; 1279 } 1280 if (c < min4) { 1281 buf[0] = (char)((c >> 12) | UTF8_cval3); 1282 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1283 buf[2] = (char)((c & 0x3f) | 0x80); 1284 return 3; 1285 } 1286 if (c < 0x110000) { 1287 buf[0] = (char)((c >> 18) | UTF8_cval4); 1288 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1289 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1290 buf[3] = (char)((c & 0x3f) | 0x80); 1291 return 4; 1292 } 1293 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */ 1294 } 1295 1296 int FASTCALL 1297 XmlUtf16Encode(int charNum, unsigned short *buf) { 1298 if (charNum < 0) 1299 return 0; 1300 if (charNum < 0x10000) { 1301 buf[0] = (unsigned short)charNum; 1302 return 1; 1303 } 1304 if (charNum < 0x110000) { 1305 charNum -= 0x10000; 1306 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1307 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1308 return 2; 1309 } 1310 return 0; 1311 } 1312 1313 struct unknown_encoding { 1314 struct normal_encoding normal; 1315 CONVERTER convert; 1316 void *userData; 1317 unsigned short utf16[256]; 1318 char utf8[256][4]; 1319 }; 1320 1321 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc)) 1322 1323 int 1324 XmlSizeOfUnknownEncoding(void) { 1325 return sizeof(struct unknown_encoding); 1326 } 1327 1328 static int PTRFASTCALL 1329 unknown_isName(const ENCODING *enc, const char *p) { 1330 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1331 int c = uenc->convert(uenc->userData, p); 1332 if (c & ~0xFFFF) 1333 return 0; 1334 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1335 } 1336 1337 static int PTRFASTCALL 1338 unknown_isNmstrt(const ENCODING *enc, const char *p) { 1339 const struct unknown_encoding 
*uenc = AS_UNKNOWN_ENCODING(enc); 1340 int c = uenc->convert(uenc->userData, p); 1341 if (c & ~0xFFFF) 1342 return 0; 1343 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1344 } 1345 1346 static int PTRFASTCALL 1347 unknown_isInvalid(const ENCODING *enc, const char *p) { 1348 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1349 int c = uenc->convert(uenc->userData, p); 1350 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1351 } 1352 1353 static enum XML_Convert_Result PTRCALL 1354 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 1355 char **toP, const char *toLim) { 1356 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1357 char buf[XML_UTF8_ENCODE_MAX]; 1358 for (;;) { 1359 const char *utf8; 1360 int n; 1361 if (*fromP == fromLim) 1362 return XML_CONVERT_COMPLETED; 1363 utf8 = uenc->utf8[(unsigned char)**fromP]; 1364 n = *utf8++; 1365 if (n == 0) { 1366 int c = uenc->convert(uenc->userData, *fromP); 1367 n = XmlUtf8Encode(c, buf); 1368 if (n > toLim - *toP) 1369 return XML_CONVERT_OUTPUT_EXHAUSTED; 1370 utf8 = buf; 1371 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1372 - (BT_LEAD2 - 2)); 1373 } else { 1374 if (n > toLim - *toP) 1375 return XML_CONVERT_OUTPUT_EXHAUSTED; 1376 (*fromP)++; 1377 } 1378 memcpy(*toP, utf8, n); 1379 *toP += n; 1380 } 1381 } 1382 1383 static enum XML_Convert_Result PTRCALL 1384 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 1385 unsigned short **toP, const unsigned short *toLim) { 1386 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1387 while (*fromP < fromLim && *toP < toLim) { 1388 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1389 if (c == 0) { 1390 c = (unsigned short)uenc->convert(uenc->userData, *fromP); 1391 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1392 - (BT_LEAD2 - 2)); 1393 } else 1394 (*fromP)++; 1395 *(*toP)++ = c; 1396 } 1397 1398 if ((*toP == toLim) && 
(*fromP < fromLim)) 1399 return XML_CONVERT_OUTPUT_EXHAUSTED; 1400 else 1401 return XML_CONVERT_COMPLETED; 1402 } 1403 1404 ENCODING * 1405 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, 1406 void *userData) { 1407 int i; 1408 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1409 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding)); 1410 for (i = 0; i < 128; i++) 1411 if (latin1_encoding.type[i] != BT_OTHER 1412 && latin1_encoding.type[i] != BT_NONXML && table[i] != i) 1413 return 0; 1414 for (i = 0; i < 256; i++) { 1415 int c = table[i]; 1416 if (c == -1) { 1417 e->normal.type[i] = BT_MALFORM; 1418 /* This shouldn't really get used. */ 1419 e->utf16[i] = 0xFFFF; 1420 e->utf8[i][0] = 1; 1421 e->utf8[i][1] = 0; 1422 } else if (c < 0) { 1423 if (c < -4) 1424 return 0; 1425 /* Multi-byte sequences need a converter function */ 1426 if (! convert) 1427 return 0; 1428 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1429 e->utf8[i][0] = 0; 1430 e->utf16[i] = 0; 1431 } else if (c < 0x80) { 1432 if (latin1_encoding.type[c] != BT_OTHER 1433 && latin1_encoding.type[c] != BT_NONXML && c != i) 1434 return 0; 1435 e->normal.type[i] = latin1_encoding.type[c]; 1436 e->utf8[i][0] = 1; 1437 e->utf8[i][1] = (char)c; 1438 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1439 } else if (checkCharRefNumber(c) < 0) { 1440 e->normal.type[i] = BT_NONXML; 1441 /* This shouldn't really get used. 
*/ 1442 e->utf16[i] = 0xFFFF; 1443 e->utf8[i][0] = 1; 1444 e->utf8[i][1] = 0; 1445 } else { 1446 if (c > 0xFFFF) 1447 return 0; 1448 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1449 e->normal.type[i] = BT_NMSTRT; 1450 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1451 e->normal.type[i] = BT_NAME; 1452 else 1453 e->normal.type[i] = BT_OTHER; 1454 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1455 e->utf16[i] = (unsigned short)c; 1456 } 1457 } 1458 e->userData = userData; 1459 e->convert = convert; 1460 if (convert) { 1461 e->normal.isName2 = unknown_isName; 1462 e->normal.isName3 = unknown_isName; 1463 e->normal.isName4 = unknown_isName; 1464 e->normal.isNmstrt2 = unknown_isNmstrt; 1465 e->normal.isNmstrt3 = unknown_isNmstrt; 1466 e->normal.isNmstrt4 = unknown_isNmstrt; 1467 e->normal.isInvalid2 = unknown_isInvalid; 1468 e->normal.isInvalid3 = unknown_isInvalid; 1469 e->normal.isInvalid4 = unknown_isInvalid; 1470 } 1471 e->normal.enc.utf8Convert = unknown_toUtf8; 1472 e->normal.enc.utf16Convert = unknown_toUtf16; 1473 return &(e->normal.enc); 1474 } 1475 1476 /* If this enumeration is changed, getEncodingIndex and encodings 1477 must also be changed. 
*/ 1478 enum { 1479 UNKNOWN_ENC = -1, 1480 ISO_8859_1_ENC = 0, 1481 US_ASCII_ENC, 1482 UTF_8_ENC, 1483 UTF_16_ENC, 1484 UTF_16BE_ENC, 1485 UTF_16LE_ENC, 1486 /* must match encodingNames up to here */ 1487 NO_ENC 1488 }; 1489 1490 static const char KW_ISO_8859_1[] 1491 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, 1492 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'}; 1493 static const char KW_US_ASCII[] 1494 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, 1495 ASCII_C, ASCII_I, ASCII_I, '\0'}; 1496 static const char KW_UTF_8[] 1497 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'}; 1498 static const char KW_UTF_16[] 1499 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'}; 1500 static const char KW_UTF_16BE[] 1501 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1502 ASCII_6, ASCII_B, ASCII_E, '\0'}; 1503 static const char KW_UTF_16LE[] 1504 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1505 ASCII_6, ASCII_L, ASCII_E, '\0'}; 1506 1507 static int FASTCALL 1508 getEncodingIndex(const char *name) { 1509 static const char *const encodingNames[] = { 1510 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE, 1511 }; 1512 int i; 1513 if (name == NULL) 1514 return NO_ENC; 1515 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++) 1516 if (streqci(name, encodingNames[i])) 1517 return i; 1518 return UNKNOWN_ENC; 1519 } 1520 1521 /* For binary compatibility, we store the index of the encoding 1522 specified at initialization in the isUtf16 member. 1523 */ 1524 1525 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1526 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1527 1528 /* This is what detects the encoding. 
encodingTable maps from 1529 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1530 the external (protocol) specified encoding; state is 1531 XML_CONTENT_STATE if we're parsing an external text entity, and 1532 XML_PROLOG_STATE otherwise. 1533 */ 1534 1535 static int 1536 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc, 1537 int state, const char *ptr, const char *end, const char **nextTokPtr) { 1538 const ENCODING **encPtr; 1539 1540 if (ptr >= end) 1541 return XML_TOK_NONE; 1542 encPtr = enc->encPtr; 1543 if (ptr + 1 == end) { 1544 /* only a single byte available for auto-detection */ 1545 #ifndef XML_DTD /* FIXME */ 1546 /* a well-formed document entity must have more than one byte */ 1547 if (state != XML_CONTENT_STATE) 1548 return XML_TOK_PARTIAL; 1549 #endif 1550 /* so we're parsing an external text entity... */ 1551 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1552 switch (INIT_ENC_INDEX(enc)) { 1553 case UTF_16_ENC: 1554 case UTF_16LE_ENC: 1555 case UTF_16BE_ENC: 1556 return XML_TOK_PARTIAL; 1557 } 1558 switch ((unsigned char)*ptr) { 1559 case 0xFE: 1560 case 0xFF: 1561 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1562 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1563 break; 1564 /* fall through */ 1565 case 0x00: 1566 case 0x3C: 1567 return XML_TOK_PARTIAL; 1568 } 1569 } else { 1570 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1571 case 0xFEFF: 1572 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1573 break; 1574 *nextTokPtr = ptr + 2; 1575 *encPtr = encodingTable[UTF_16BE_ENC]; 1576 return XML_TOK_BOM; 1577 /* 00 3C is handled in the default case */ 1578 case 0x3C00: 1579 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1580 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1581 && state == XML_CONTENT_STATE) 1582 break; 1583 *encPtr = encodingTable[UTF_16LE_ENC]; 1584 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1585 case 
0xFFFE: 1586 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1587 break; 1588 *nextTokPtr = ptr + 2; 1589 *encPtr = encodingTable[UTF_16LE_ENC]; 1590 return XML_TOK_BOM; 1591 case 0xEFBB: 1592 /* Maybe a UTF-8 BOM (EF BB BF) */ 1593 /* If there's an explicitly specified (external) encoding 1594 of ISO-8859-1 or some flavour of UTF-16 1595 and this is an external text entity, 1596 don't look for the BOM, 1597 because it might be a legal data. 1598 */ 1599 if (state == XML_CONTENT_STATE) { 1600 int e = INIT_ENC_INDEX(enc); 1601 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC 1602 || e == UTF_16_ENC) 1603 break; 1604 } 1605 if (ptr + 2 == end) 1606 return XML_TOK_PARTIAL; 1607 if ((unsigned char)ptr[2] == 0xBF) { 1608 *nextTokPtr = ptr + 3; 1609 *encPtr = encodingTable[UTF_8_ENC]; 1610 return XML_TOK_BOM; 1611 } 1612 break; 1613 default: 1614 if (ptr[0] == '\0') { 1615 /* 0 isn't a legal data character. Furthermore a document 1616 entity can only start with ASCII characters. So the only 1617 way this can fail to be big-endian UTF-16 if it it's an 1618 external parsed general entity that's labelled as 1619 UTF-16LE. 1620 */ 1621 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1622 break; 1623 *encPtr = encodingTable[UTF_16BE_ENC]; 1624 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1625 } else if (ptr[1] == '\0') { 1626 /* We could recover here in the case: 1627 - parsing an external entity 1628 - second byte is 0 1629 - no externally specified encoding 1630 - no encoding declaration 1631 by assuming UTF-16LE. But we don't, because this would mean when 1632 presented just with a single byte, we couldn't reliably determine 1633 whether we needed further bytes. 
1634 */ 1635 if (state == XML_CONTENT_STATE) 1636 break; 1637 *encPtr = encodingTable[UTF_16LE_ENC]; 1638 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1639 } 1640 break; 1641 } 1642 } 1643 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1644 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1645 } 1646 1647 #define NS(x) x 1648 #define ns(x) x 1649 #define XML_TOK_NS_C 1650 #include "xmltok_ns.c" 1651 #undef XML_TOK_NS_C 1652 #undef NS 1653 #undef ns 1654 1655 #ifdef XML_NS 1656 1657 # define NS(x) x##NS 1658 # define ns(x) x##_ns 1659 1660 # define XML_TOK_NS_C 1661 # include "xmltok_ns.c" 1662 # undef XML_TOK_NS_C 1663 1664 # undef NS 1665 # undef ns 1666 1667 ENCODING * 1668 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert, 1669 void *userData) { 1670 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1671 if (enc) 1672 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1673 return enc; 1674 } 1675 1676 #endif /* XML_NS */ 1677