1 /* 2 __ __ _ 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parser 8 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net> 11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net> 12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net> 13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net> 14 Copyright (c) 2005-2009 Steven Solie <steven@solie.ca> 15 Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org> 16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com> 17 Copyright (c) 2016 Don Lewis <truckman@apache.org> 18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk> 19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net> 20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com> 21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com> 22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com> 23 Copyright (c) 2021 Donghee Na <donghee.na@python.org> 24 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com> 25 Copyright (c) 2022 Sean McBride <sean@rogue-research.com> 26 Copyright (c) 2023 Hanno Böck <hanno@gentoo.org> 27 Licensed under the MIT license: 28 29 Permission is hereby granted, free of charge, to any person obtaining 30 a copy of this software and associated documentation files (the 31 "Software"), to deal in the Software without restriction, including 32 without limitation the rights to use, copy, modify, merge, publish, 33 distribute, sublicense, and/or sell copies of the Software, and to permit 34 persons to whom the Software is furnished to do so, subject to the 35 following conditions: 36 37 The above copyright notice and this permission notice shall be included 38 in all copies or substantial portions of the Software. 39 40 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 41 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 42 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 43 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 44 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 45 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 46 USE OR OTHER DEALINGS IN THE SOFTWARE. 47 */ 48 49 #include "expat_config.h" 50 51 #include <stddef.h> 52 #include <string.h> /* memcpy */ 53 #include <stdbool.h> 54 55 #ifdef _WIN32 56 # include "winconfig.h" 57 #endif 58 59 #include "expat_external.h" 60 #include "internal.h" 61 #include "xmltok.h" 62 #include "nametab.h" 63 64 #ifdef XML_DTD 65 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 66 #else 67 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 68 #endif 69 70 #define VTABLE1 \ 71 {PREFIX(prologTok), PREFIX(contentTok), \ 72 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \ 73 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \ 74 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \ 75 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \ 76 PREFIX(updatePosition), PREFIX(isPublicId) 77 78 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 79 80 #define UCS2_GET_NAMING(pages, hi, lo) \ 81 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F))) 82 83 /* A 2 byte UTF-8 representation splits the characters 11 bits between 84 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into 85 pages, 3 bits to add to that index and 5 bits to generate the mask. 86 */ 87 #define UTF8_GET_NAMING2(pages, byte) \ 88 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 89 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \ 90 & (1u << (((byte)[1]) & 0x1F))) 91 92 /* A 3 byte UTF-8 representation splits the characters 16 bits between 93 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index 94 into pages, 3 bits to add to that index and 5 bits to generate the 95 mask. 96 */ 97 #define UTF8_GET_NAMING3(pages, byte) \ 98 (namingBitmap \ 99 [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \ 100 << 3) \ 101 + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \ 102 & (1u << (((byte)[2]) & 0x1F))) 103 104 /* Detection of invalid UTF-8 sequences is based on Table 3.1B 105 of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/ 106 with the additional restriction of not allowing the Unicode 107 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 108 Implementation details: 109 (A & 0x80) == 0 means A < 0x80 110 and 111 (A & 0xC0) == 0xC0 means A > 0xBF 112 */ 113 114 #define UTF8_INVALID2(p) \ 115 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 116 117 #define UTF8_INVALID3(p) \ 118 (((p)[2] & 0x80) == 0 \ 119 || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \ 120 : ((p)[2] & 0xC0) == 0xC0) \ 121 || ((*p) == 0xE0 \ 122 ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 123 : ((p)[1] & 0x80) == 0 \ 124 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 125 126 #define UTF8_INVALID4(p) \ 127 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \ 128 || ((p)[2] & 0xC0) == 0xC0 \ 129 || ((*p) == 0xF0 \ 130 ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 131 : ((p)[1] & 0x80) == 0 \ 132 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 133 134 static int PTRFASTCALL 135 isNever(const ENCODING *enc, const char *p) { 136 UNUSED_P(enc); 137 UNUSED_P(p); 138 return 0; 139 } 140 141 static int PTRFASTCALL 142 utf8_isName2(const ENCODING *enc, const char *p) { 143 UNUSED_P(enc); 144 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); 145 } 146 147 static int PTRFASTCALL 148 utf8_isName3(const ENCODING *enc, const char *p) { 149 UNUSED_P(enc); 150 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); 151 } 152 153 #define utf8_isName4 isNever 154 155 static int PTRFASTCALL 156 utf8_isNmstrt2(const ENCODING *enc, const char *p) { 157 UNUSED_P(enc); 158 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); 159 } 160 161 static int PTRFASTCALL 162 utf8_isNmstrt3(const ENCODING *enc, const char *p) { 163 UNUSED_P(enc); 164 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); 165 } 166 167 #define utf8_isNmstrt4 isNever 168 169 static int PTRFASTCALL 170 utf8_isInvalid2(const ENCODING *enc, const char *p) { 171 UNUSED_P(enc); 172 return UTF8_INVALID2((const unsigned char *)p); 173 } 174 175 static int PTRFASTCALL 176 utf8_isInvalid3(const ENCODING *enc, const char *p) { 177 UNUSED_P(enc); 178 return UTF8_INVALID3((const unsigned char *)p); 179 } 180 181 static int PTRFASTCALL 182 utf8_isInvalid4(const ENCODING *enc, const char *p) { 183 UNUSED_P(enc); 184 return UTF8_INVALID4((const unsigned char *)p); 185 } 186 187 struct normal_encoding { 188 ENCODING enc; 189 unsigned char type[256]; 190 #ifdef XML_MIN_SIZE 191 int(PTRFASTCALL *byteType)(const ENCODING *, const char *); 192 int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *); 193 int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); 194 int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); 195 int(PTRCALL *charMatches)(const ENCODING *, const char *, int); 196 #endif /* XML_MIN_SIZE */ 197 int(PTRFASTCALL *isName2)(const ENCODING *, const char *); 198 int(PTRFASTCALL *isName3)(const ENCODING *, const char *); 199 int(PTRFASTCALL *isName4)(const ENCODING *, const char *); 200 int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); 201 int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); 202 int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); 203 int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); 204 int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); 205 int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); 206 }; 207 208 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc)) 209 210 #ifdef XML_MIN_SIZE 211 212 # define STANDARD_VTABLE(E) \ 213 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches, 214 215 #else 216 217 # define STANDARD_VTABLE(E) /* as nothing */ 218 219 #endif 220 221 #define NORMAL_VTABLE(E) \ 222 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \ 223 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4 224 225 #define NULL_VTABLE \ 226 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \ 227 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \ 228 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL 229 230 static int FASTCALL checkCharRefNumber(int result); 231 232 #include "xmltok_impl.h" 233 #include "ascii.h" 234 235 #ifdef XML_MIN_SIZE 236 # define sb_isNameMin isNever 237 # define sb_isNmstrtMin isNever 238 #endif 239 240 #ifdef XML_MIN_SIZE 241 # define MINBPC(enc) ((enc)->minBytesPerChar) 242 #else 243 /* minimum bytes per character */ 244 # define MINBPC(enc) 1 245 #endif 246 247 #define SB_BYTE_TYPE(enc, p) \ 248 (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 249 250 #ifdef XML_MIN_SIZE 251 static int PTRFASTCALL 252 sb_byteType(const ENCODING *enc, const char *p) { 253 return SB_BYTE_TYPE(enc, p); 254 } 255 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 256 #else 257 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 258 #endif 259 260 #ifdef XML_MIN_SIZE 261 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 262 static int PTRFASTCALL 263 sb_byteToAscii(const ENCODING *enc, const char *p) { 264 UNUSED_P(enc); 265 return *p; 266 } 267 #else 268 # define BYTE_TO_ASCII(enc, p) (*(p)) 269 #endif 270 271 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p)) 272 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p)) 273 #ifdef XML_MIN_SIZE 274 # define IS_INVALID_CHAR(enc, p, n) \ 275 (AS_NORMAL_ENCODING(enc)->isInvalid##n \ 276 && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p)) 277 #else 278 # define IS_INVALID_CHAR(enc, p, n) \ 279 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p)) 280 #endif 281 282 #ifdef XML_MIN_SIZE 283 # define IS_NAME_CHAR_MINBPC(enc, p) \ 284 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 285 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 286 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 287 #else 288 # define IS_NAME_CHAR_MINBPC(enc, p) (0) 289 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 290 #endif 291 292 #ifdef XML_MIN_SIZE 293 # define CHAR_MATCHES(enc, p, c) \ 294 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 295 static int PTRCALL 296 sb_charMatches(const ENCODING *enc, const char *p, int c) { 297 UNUSED_P(enc); 298 return *p == c; 299 } 300 #else 301 /* c is an ASCII character */ 302 # define CHAR_MATCHES(enc, p, c) (*(p) == (c)) 303 #endif 304 305 #define PREFIX(ident) normal_##ident 306 #define XML_TOK_IMPL_C 307 #include "xmltok_impl.c" 308 #undef XML_TOK_IMPL_C 309 310 #undef MINBPC 311 #undef BYTE_TYPE 312 #undef BYTE_TO_ASCII 313 #undef CHAR_MATCHES 314 #undef IS_NAME_CHAR 315 #undef IS_NAME_CHAR_MINBPC 316 #undef IS_NMSTRT_CHAR 317 #undef IS_NMSTRT_CHAR_MINBPC 318 #undef IS_INVALID_CHAR 319 320 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ 321 UTF8_cval1 = 0x00, 322 UTF8_cval2 = 0xc0, 323 UTF8_cval3 = 0xe0, 324 UTF8_cval4 = 0xf0 325 }; 326 327 void 328 _INTERNAL_trim_to_complete_utf8_characters(const char *from, 329 const char **fromLimRef) { 330 const char *fromLim = *fromLimRef; 331 size_t walked = 0; 332 for (; fromLim > from; fromLim--, walked++) { 333 const unsigned char prev = (unsigned char)fromLim[-1]; 334 if ((prev & 0xf8u) 335 == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */ 336 if (walked + 1 >= 4) { 337 fromLim += 4 - 1; 338 break; 339 } else { 340 walked = 0; 341 } 342 } else if ((prev & 0xf0u) 343 == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */ 344 if (walked + 1 >= 3) { 345 fromLim += 3 - 1; 346 break; 347 } else { 348 walked = 0; 349 } 350 } else if ((prev & 0xe0u) 351 == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */ 352 if (walked + 1 >= 2) { 353 fromLim += 2 - 1; 354 break; 355 } else { 356 walked = 0; 357 } 358 } else if ((prev & 0x80u) 359 == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */ 360 break; 361 } 362 } 363 *fromLimRef = fromLim; 364 } 365 366 static enum XML_Convert_Result PTRCALL 367 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 368 char **toP, const char *toLim) { 369 bool input_incomplete = false; 370 bool output_exhausted = false; 371 372 /* Avoid copying partial characters (due to limited space). */ 373 const ptrdiff_t bytesAvailable = fromLim - *fromP; 374 const ptrdiff_t bytesStorable = toLim - *toP; 375 UNUSED_P(enc); 376 if (bytesAvailable > bytesStorable) { 377 fromLim = *fromP + bytesStorable; 378 output_exhausted = true; 379 } 380 381 /* Avoid copying partial characters (from incomplete input). */ 382 { 383 const char *const fromLimBefore = fromLim; 384 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim); 385 if (fromLim < fromLimBefore) { 386 input_incomplete = true; 387 } 388 } 389 390 { 391 const ptrdiff_t bytesToCopy = fromLim - *fromP; 392 memcpy(*toP, *fromP, bytesToCopy); 393 *fromP += bytesToCopy; 394 *toP += bytesToCopy; 395 } 396 397 if (output_exhausted) /* needs to go first */ 398 return XML_CONVERT_OUTPUT_EXHAUSTED; 399 else if (input_incomplete) 400 return XML_CONVERT_INPUT_INCOMPLETE; 401 else 402 return XML_CONVERT_COMPLETED; 403 } 404 405 static enum XML_Convert_Result PTRCALL 406 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 407 unsigned short **toP, const unsigned short *toLim) { 408 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; 409 unsigned short *to = *toP; 410 const char *from = *fromP; 411 while (from < fromLim && to < toLim) { 412 switch (SB_BYTE_TYPE(enc, from)) { 413 case BT_LEAD2: 414 if (fromLim - from < 2) { 415 res = XML_CONVERT_INPUT_INCOMPLETE; 416 goto after; 417 } 418 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); 419 from += 2; 420 break; 421 case BT_LEAD3: 422 if (fromLim - from < 3) { 423 res = XML_CONVERT_INPUT_INCOMPLETE; 424 goto after; 425 } 426 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) 427 | (from[2] & 0x3f)); 428 from += 3; 429 break; 430 case BT_LEAD4: { 431 unsigned long n; 432 if (toLim - to < 2) { 433 res = XML_CONVERT_OUTPUT_EXHAUSTED; 434 goto after; 435 } 436 if (fromLim - from < 4) { 437 res = XML_CONVERT_INPUT_INCOMPLETE; 438 goto after; 439 } 440 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) 441 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); 442 n -= 0x10000; 443 to[0] = (unsigned short)((n >> 10) | 0xD800); 444 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); 445 to += 2; 446 from += 4; 447 } break; 448 default: 449 *to++ = *from++; 450 break; 451 } 452 } 453 if (from < fromLim) 454 res = XML_CONVERT_OUTPUT_EXHAUSTED; 455 after: 456 *fromP = from; 457 *toP = to; 458 return res; 459 } 460 461 #ifdef XML_NS 462 static const struct normal_encoding utf8_encoding_ns 463 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 464 { 465 # include "asciitab.h" 466 # include "utf8tab.h" 467 }, 468 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 469 #endif 470 471 static const struct normal_encoding utf8_encoding 472 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 473 { 474 #define BT_COLON BT_NMSTRT 475 #include "asciitab.h" 476 #undef BT_COLON 477 #include "utf8tab.h" 478 }, 479 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 480 481 #ifdef XML_NS 482 483 static const struct normal_encoding internal_utf8_encoding_ns 484 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 485 { 486 # include "iasciitab.h" 487 # include "utf8tab.h" 488 }, 489 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 490 491 #endif 492 493 static const struct normal_encoding internal_utf8_encoding 494 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0}, 495 { 496 #define BT_COLON BT_NMSTRT 497 #include "iasciitab.h" 498 #undef BT_COLON 499 #include "utf8tab.h" 500 }, 501 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)}; 502 503 static enum XML_Convert_Result PTRCALL 504 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 505 char **toP, const char *toLim) { 506 UNUSED_P(enc); 507 for (;;) { 508 unsigned char c; 509 if (*fromP == fromLim) 510 return XML_CONVERT_COMPLETED; 511 c = (unsigned char)**fromP; 512 if (c & 0x80) { 513 if (toLim - *toP < 2) 514 return XML_CONVERT_OUTPUT_EXHAUSTED; 515 *(*toP)++ = (char)((c >> 6) | UTF8_cval2); 516 *(*toP)++ = (char)((c & 0x3f) | 0x80); 517 (*fromP)++; 518 } else { 519 if (*toP == toLim) 520 return XML_CONVERT_OUTPUT_EXHAUSTED; 521 *(*toP)++ = *(*fromP)++; 522 } 523 } 524 } 525 526 static enum XML_Convert_Result PTRCALL 527 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 528 unsigned short **toP, const unsigned short *toLim) { 529 UNUSED_P(enc); 530 while (*fromP < fromLim && *toP < toLim) 531 *(*toP)++ = (unsigned char)*(*fromP)++; 532 533 if ((*toP == toLim) && (*fromP < fromLim)) 534 return XML_CONVERT_OUTPUT_EXHAUSTED; 535 else 536 return XML_CONVERT_COMPLETED; 537 } 538 539 #ifdef XML_NS 540 541 static const struct normal_encoding latin1_encoding_ns 542 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 543 { 544 # include "asciitab.h" 545 # include "latin1tab.h" 546 }, 547 STANDARD_VTABLE(sb_) NULL_VTABLE}; 548 549 #endif 550 551 static const struct normal_encoding latin1_encoding 552 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0}, 553 { 554 #define BT_COLON BT_NMSTRT 555 #include "asciitab.h" 556 #undef BT_COLON 557 #include "latin1tab.h" 558 }, 559 STANDARD_VTABLE(sb_) NULL_VTABLE}; 560 561 static enum XML_Convert_Result PTRCALL 562 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 563 char **toP, const char *toLim) { 564 UNUSED_P(enc); 565 while (*fromP < fromLim && *toP < toLim) 566 *(*toP)++ = *(*fromP)++; 567 568 if ((*toP == toLim) && (*fromP < fromLim)) 569 return XML_CONVERT_OUTPUT_EXHAUSTED; 570 else 571 return XML_CONVERT_COMPLETED; 572 } 573 574 #ifdef XML_NS 575 576 static const struct normal_encoding ascii_encoding_ns 577 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 578 { 579 # include "asciitab.h" 580 /* BT_NONXML == 0 */ 581 }, 582 STANDARD_VTABLE(sb_) NULL_VTABLE}; 583 584 #endif 585 586 static const struct normal_encoding ascii_encoding 587 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0}, 588 { 589 #define BT_COLON BT_NMSTRT 590 #include "asciitab.h" 591 #undef BT_COLON 592 /* BT_NONXML == 0 */ 593 }, 594 STANDARD_VTABLE(sb_) NULL_VTABLE}; 595 596 static int PTRFASTCALL 597 unicode_byte_type(char hi, char lo) { 598 switch ((unsigned char)hi) { 599 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */ 600 case 0xD8: 601 case 0xD9: 602 case 0xDA: 603 case 0xDB: 604 return BT_LEAD4; 605 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */ 606 case 0xDC: 607 case 0xDD: 608 case 0xDE: 609 case 0xDF: 610 return BT_TRAIL; 611 case 0xFF: 612 switch ((unsigned char)lo) { 613 case 0xFF: /* noncharacter-FFFF */ 614 case 0xFE: /* noncharacter-FFFE */ 615 return BT_NONXML; 616 } 617 break; 618 } 619 return BT_NONASCII; 620 } 621 622 #define DEFINE_UTF16_TO_UTF8(E) \ 623 static enum XML_Convert_Result PTRCALL E##toUtf8( \ 624 const ENCODING *enc, const char **fromP, const char *fromLim, \ 625 char **toP, const char *toLim) { \ 626 const char *from = *fromP; \ 627 UNUSED_P(enc); \ 628 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ 629 for (; from < fromLim; from += 2) { \ 630 int plane; \ 631 unsigned char lo2; \ 632 unsigned char lo = GET_LO(from); \ 633 unsigned char hi = GET_HI(from); \ 634 switch (hi) { \ 635 case 0: \ 636 if (lo < 0x80) { \ 637 if (*toP == toLim) { \ 638 *fromP = from; \ 639 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 640 } \ 641 *(*toP)++ = lo; \ 642 break; \ 643 } \ 644 /* fall through */ \ 645 case 0x1: \ 646 case 0x2: \ 647 case 0x3: \ 648 case 0x4: \ 649 case 0x5: \ 650 case 0x6: \ 651 case 0x7: \ 652 if (toLim - *toP < 2) { \ 653 *fromP = from; \ 654 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 655 } \ 656 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 657 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 658 break; \ 659 default: \ 660 if (toLim - *toP < 3) { \ 661 *fromP = from; \ 662 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 663 } \ 664 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 665 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 666 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 667 *(*toP)++ = ((lo & 0x3f) | 0x80); \ 668 break; \ 669 case 0xD8: \ 670 case 0xD9: \ 671 case 0xDA: \ 672 case 0xDB: \ 673 if (toLim - *toP < 4) { \ 674 *fromP = from; \ 675 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 676 } \ 677 if (fromLim - from < 4) { \ 678 *fromP = from; \ 679 return XML_CONVERT_INPUT_INCOMPLETE; \ 680 } \ 681 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 682 *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \ 683 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 684 from += 2; \ 685 lo2 = GET_LO(from); \ 686 *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \ 687 | (lo2 >> 6) | 0x80); \ 688 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 689 break; \ 690 } \ 691 } \ 692 *fromP = from; \ 693 if (from < fromLim) \ 694 return XML_CONVERT_INPUT_INCOMPLETE; \ 695 else \ 696 return XML_CONVERT_COMPLETED; \ 697 } 698 699 #define DEFINE_UTF16_TO_UTF16(E) \ 700 static enum XML_Convert_Result PTRCALL E##toUtf16( \ 701 const ENCODING *enc, const char **fromP, const char *fromLim, \ 702 unsigned short **toP, const unsigned short *toLim) { \ 703 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ 704 UNUSED_P(enc); \ 705 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ 706 /* Avoid copying first half only of surrogate */ \ 707 if (fromLim - *fromP > ((toLim - *toP) << 1) \ 708 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ 709 fromLim -= 2; \ 710 res = XML_CONVERT_INPUT_INCOMPLETE; \ 711 } \ 712 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ 713 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 714 if ((*toP == toLim) && (*fromP < fromLim)) \ 715 return XML_CONVERT_OUTPUT_EXHAUSTED; \ 716 else \ 717 return res; \ 718 } 719 720 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 721 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 722 723 DEFINE_UTF16_TO_UTF8(little2_) 724 DEFINE_UTF16_TO_UTF16(little2_) 725 726 #undef GET_LO 727 #undef GET_HI 728 729 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 730 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 731 732 DEFINE_UTF16_TO_UTF8(big2_) 733 DEFINE_UTF16_TO_UTF16(big2_) 734 735 #undef GET_LO 736 #undef GET_HI 737 738 #define LITTLE2_BYTE_TYPE(enc, p) \ 739 ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0])) 740 #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1) 741 #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c)) 742 #define LITTLE2_IS_NAME_CHAR_MINBPC(p) \ 743 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 744 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \ 745 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 746 747 #ifdef XML_MIN_SIZE 748 749 static int PTRFASTCALL 750 little2_byteType(const ENCODING *enc, const char *p) { 751 return LITTLE2_BYTE_TYPE(enc, p); 752 } 753 754 static int PTRFASTCALL 755 little2_byteToAscii(const ENCODING *enc, const char *p) { 756 UNUSED_P(enc); 757 return LITTLE2_BYTE_TO_ASCII(p); 758 } 759 760 static int PTRCALL 761 little2_charMatches(const ENCODING *enc, const char *p, int c) { 762 UNUSED_P(enc); 763 return LITTLE2_CHAR_MATCHES(p, c); 764 } 765 766 static int PTRFASTCALL 767 little2_isNameMin(const ENCODING *enc, const char *p) { 768 UNUSED_P(enc); 769 return LITTLE2_IS_NAME_CHAR_MINBPC(p); 770 } 771 772 static int PTRFASTCALL 773 little2_isNmstrtMin(const ENCODING *enc, const char *p) { 774 UNUSED_P(enc); 775 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p); 776 } 777 778 # undef VTABLE 779 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 780 781 #else /* not XML_MIN_SIZE */ 782 783 # undef PREFIX 784 # define PREFIX(ident) little2_##ident 785 # define MINBPC(enc) 2 786 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 787 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 788 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p) 789 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c) 790 # define IS_NAME_CHAR(enc, p, n) 0 791 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p) 792 # define IS_NMSTRT_CHAR(enc, p, n) (0) 793 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) 794 795 # define XML_TOK_IMPL_C 796 # include "xmltok_impl.c" 797 # undef XML_TOK_IMPL_C 798 799 # undef MINBPC 800 # undef BYTE_TYPE 801 # undef BYTE_TO_ASCII 802 # undef CHAR_MATCHES 803 # undef IS_NAME_CHAR 804 # undef IS_NAME_CHAR_MINBPC 805 # undef IS_NMSTRT_CHAR 806 # undef IS_NMSTRT_CHAR_MINBPC 807 # undef IS_INVALID_CHAR 808 809 #endif /* not XML_MIN_SIZE */ 810 811 #ifdef XML_NS 812 813 static const struct normal_encoding little2_encoding_ns 814 = {{VTABLE, 2, 0, 815 # if BYTEORDER == 1234 816 1 817 # else 818 0 819 # endif 820 }, 821 { 822 # include "asciitab.h" 823 # include "latin1tab.h" 824 }, 825 STANDARD_VTABLE(little2_) NULL_VTABLE}; 826 827 #endif 828 829 static const struct normal_encoding little2_encoding 830 = {{VTABLE, 2, 0, 831 #if BYTEORDER == 1234 832 1 833 #else 834 0 835 #endif 836 }, 837 { 838 #define BT_COLON BT_NMSTRT 839 #include "asciitab.h" 840 #undef BT_COLON 841 #include "latin1tab.h" 842 }, 843 STANDARD_VTABLE(little2_) NULL_VTABLE}; 844 845 #if BYTEORDER != 4321 846 847 # ifdef XML_NS 848 849 static const struct normal_encoding internal_little2_encoding_ns 850 = {{VTABLE, 2, 0, 1}, 851 { 852 # include "iasciitab.h" 853 # include "latin1tab.h" 854 }, 855 STANDARD_VTABLE(little2_) NULL_VTABLE}; 856 857 # endif 858 859 static const struct normal_encoding internal_little2_encoding 860 = {{VTABLE, 2, 0, 1}, 861 { 862 # define BT_COLON BT_NMSTRT 863 # include "iasciitab.h" 864 # undef BT_COLON 865 # include "latin1tab.h" 866 }, 867 STANDARD_VTABLE(little2_) NULL_VTABLE}; 868 869 #endif 870 871 #define BIG2_BYTE_TYPE(enc, p) \ 872 ((p)[0] == 0 ? SB_BYTE_TYPE(enc, p + 1) : unicode_byte_type((p)[0], (p)[1])) 873 #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1) 874 #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c)) 875 #define BIG2_IS_NAME_CHAR_MINBPC(p) \ 876 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 877 #define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \ 878 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 879 880 #ifdef XML_MIN_SIZE 881 882 static int PTRFASTCALL 883 big2_byteType(const ENCODING *enc, const char *p) { 884 return BIG2_BYTE_TYPE(enc, p); 885 } 886 887 static int PTRFASTCALL 888 big2_byteToAscii(const ENCODING *enc, const char *p) { 889 UNUSED_P(enc); 890 return BIG2_BYTE_TO_ASCII(p); 891 } 892 893 static int PTRCALL 894 big2_charMatches(const ENCODING *enc, const char *p, int c) { 895 UNUSED_P(enc); 896 return BIG2_CHAR_MATCHES(p, c); 897 } 898 899 static int PTRFASTCALL 900 big2_isNameMin(const ENCODING *enc, const char *p) { 901 UNUSED_P(enc); 902 return BIG2_IS_NAME_CHAR_MINBPC(p); 903 } 904 905 static int PTRFASTCALL 906 big2_isNmstrtMin(const ENCODING *enc, const char *p) { 907 UNUSED_P(enc); 908 return BIG2_IS_NMSTRT_CHAR_MINBPC(p); 909 } 910 911 # undef VTABLE 912 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 913 914 #else /* not XML_MIN_SIZE */ 915 916 # undef PREFIX 917 # define PREFIX(ident) big2_##ident 918 # define MINBPC(enc) 2 919 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 920 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 921 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p) 922 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c) 923 # define IS_NAME_CHAR(enc, p, n) 0 924 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p) 925 # define IS_NMSTRT_CHAR(enc, p, n) (0) 926 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p) 927 928 # define XML_TOK_IMPL_C 929 # include "xmltok_impl.c" 930 # undef XML_TOK_IMPL_C 931 932 # undef MINBPC 933 # undef BYTE_TYPE 934 # undef BYTE_TO_ASCII 935 # undef CHAR_MATCHES 936 # undef IS_NAME_CHAR 937 # undef IS_NAME_CHAR_MINBPC 938 # undef IS_NMSTRT_CHAR 939 # undef IS_NMSTRT_CHAR_MINBPC 940 # undef IS_INVALID_CHAR 941 942 #endif /* not XML_MIN_SIZE */ 943 944 #ifdef XML_NS 945 946 static const struct normal_encoding big2_encoding_ns 947 = {{VTABLE, 2, 0, 948 # if BYTEORDER == 4321 949 1 950 # else 951 0 952 # endif 953 }, 954 { 955 # include "asciitab.h" 956 # include "latin1tab.h" 957 }, 958 STANDARD_VTABLE(big2_) NULL_VTABLE}; 959 960 #endif 961 962 static const struct normal_encoding big2_encoding 963 = {{VTABLE, 2, 0, 964 #if BYTEORDER == 4321 965 1 966 #else 967 0 968 #endif 969 }, 970 { 971 #define BT_COLON BT_NMSTRT 972 #include "asciitab.h" 973 #undef BT_COLON 974 #include "latin1tab.h" 975 }, 976 STANDARD_VTABLE(big2_) NULL_VTABLE}; 977 978 #if BYTEORDER != 1234 979 980 # ifdef XML_NS 981 982 static const struct normal_encoding internal_big2_encoding_ns 983 = {{VTABLE, 2, 0, 1}, 984 { 985 # include "iasciitab.h" 986 # include "latin1tab.h" 987 }, 988 STANDARD_VTABLE(big2_) NULL_VTABLE}; 989 990 # endif 991 992 static const struct normal_encoding internal_big2_encoding 993 = {{VTABLE, 2, 0, 1}, 994 { 995 # define BT_COLON BT_NMSTRT 996 # include "iasciitab.h" 997 # undef BT_COLON 998 # include "latin1tab.h" 999 }, 1000 STANDARD_VTABLE(big2_) NULL_VTABLE}; 1001 1002 #endif 1003 1004 #undef PREFIX 1005 1006 static int FASTCALL 1007 streqci(const char *s1, const char *s2) { 1008 for (;;) { 1009 char c1 = *s1++; 1010 char c2 = *s2++; 1011 if (ASCII_a <= c1 && c1 <= ASCII_z) 1012 c1 += ASCII_A - ASCII_a; 1013 if (ASCII_a <= c2 && c2 <= ASCII_z) 1014 /* The following line will never get executed. streqci() is 1015 * only called from two places, both of which guarantee to put 1016 * upper-case strings into s2. 1017 */ 1018 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */ 1019 if (c1 != c2) 1020 return 0; 1021 if (! c1) 1022 break; 1023 } 1024 return 1; 1025 } 1026 1027 static void PTRCALL 1028 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end, 1029 POSITION *pos) { 1030 UNUSED_P(enc); 1031 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); 1032 } 1033 1034 static int 1035 toAscii(const ENCODING *enc, const char *ptr, const char *end) { 1036 char buf[1]; 1037 char *p = buf; 1038 XmlUtf8Convert(enc, &ptr, end, &p, p + 1); 1039 if (p == buf) 1040 return -1; 1041 else 1042 return buf[0]; 1043 } 1044 1045 static int FASTCALL 1046 isSpace(int c) { 1047 switch (c) { 1048 case 0x20: 1049 case 0xD: 1050 case 0xA: 1051 case 0x9: 1052 return 1; 1053 } 1054 return 0; 1055 } 1056 1057 /* Return 1 if there's just optional white space or there's an S 1058 followed by name=val. 1059 */ 1060 static int 1061 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end, 1062 const char **namePtr, const char **nameEndPtr, 1063 const char **valPtr, const char **nextTokPtr) { 1064 int c; 1065 char open; 1066 if (ptr == end) { 1067 *namePtr = NULL; 1068 return 1; 1069 } 1070 if (! isSpace(toAscii(enc, ptr, end))) { 1071 *nextTokPtr = ptr; 1072 return 0; 1073 } 1074 do { 1075 ptr += enc->minBytesPerChar; 1076 } while (isSpace(toAscii(enc, ptr, end))); 1077 if (ptr == end) { 1078 *namePtr = NULL; 1079 return 1; 1080 } 1081 *namePtr = ptr; 1082 for (;;) { 1083 c = toAscii(enc, ptr, end); 1084 if (c == -1) { 1085 *nextTokPtr = ptr; 1086 return 0; 1087 } 1088 if (c == ASCII_EQUALS) { 1089 *nameEndPtr = ptr; 1090 break; 1091 } 1092 if (isSpace(c)) { 1093 *nameEndPtr = ptr; 1094 do { 1095 ptr += enc->minBytesPerChar; 1096 } while (isSpace(c = toAscii(enc, ptr, end))); 1097 if (c != ASCII_EQUALS) { 1098 *nextTokPtr = ptr; 1099 return 0; 1100 } 1101 break; 1102 } 1103 ptr += enc->minBytesPerChar; 1104 } 1105 if (ptr == *namePtr) { 1106 *nextTokPtr = ptr; 1107 return 0; 1108 } 1109 ptr += enc->minBytesPerChar; 1110 c = toAscii(enc, ptr, end); 1111 while (isSpace(c)) { 1112 ptr += enc->minBytesPerChar; 1113 c = toAscii(enc, ptr, end); 1114 } 1115 if (c != ASCII_QUOT && c != ASCII_APOS) { 1116 *nextTokPtr = ptr; 1117 return 0; 1118 } 1119 open = (char)c; 1120 ptr += enc->minBytesPerChar; 1121 *valPtr = ptr; 1122 for (;; ptr += enc->minBytesPerChar) { 1123 c = toAscii(enc, ptr, end); 1124 if (c == open) 1125 break; 1126 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z) 1127 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD 1128 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) { 1129 *nextTokPtr = ptr; 1130 return 0; 1131 } 1132 } 1133 *nextTokPtr = ptr + enc->minBytesPerChar; 1134 return 1; 1135 } 1136 1137 static const char KW_version[] 1138 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'}; 1139 1140 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, 1141 ASCII_i, ASCII_n, ASCII_g, '\0'}; 1142 1143 static const char KW_standalone[] 1144 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, 1145 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'}; 1146 1147 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'}; 1148 1149 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'}; 1150 1151 static int 1152 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *, 1153 const char *), 1154 int isGeneralTextEntity, const ENCODING *enc, const char *ptr, 1155 const char *end, const char **badPtr, const char **versionPtr, 1156 const char **versionEndPtr, const char **encodingName, 1157 const ENCODING **encoding, int *standalone) { 1158 const char *val = NULL; 1159 const char *name = NULL; 1160 const char *nameEnd = NULL; 1161 ptr += 5 * enc->minBytesPerChar; 1162 end -= 2 * enc->minBytesPerChar; 1163 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) 1164 || ! name) { 1165 *badPtr = ptr; 1166 return 0; 1167 } 1168 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { 1169 if (! isGeneralTextEntity) { 1170 *badPtr = name; 1171 return 0; 1172 } 1173 } else { 1174 if (versionPtr) 1175 *versionPtr = val; 1176 if (versionEndPtr) 1177 *versionEndPtr = ptr; 1178 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1179 *badPtr = ptr; 1180 return 0; 1181 } 1182 if (! name) { 1183 if (isGeneralTextEntity) { 1184 /* a TextDecl must have an EncodingDecl */ 1185 *badPtr = ptr; 1186 return 0; 1187 } 1188 return 1; 1189 } 1190 } 1191 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { 1192 int c = toAscii(enc, val, end); 1193 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) { 1194 *badPtr = val; 1195 return 0; 1196 } 1197 if (encodingName) 1198 *encodingName = val; 1199 if (encoding) 1200 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); 1201 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { 1202 *badPtr = ptr; 1203 return 0; 1204 } 1205 if (! name) 1206 return 1; 1207 } 1208 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) 1209 || isGeneralTextEntity) { 1210 *badPtr = name; 1211 return 0; 1212 } 1213 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { 1214 if (standalone) 1215 *standalone = 1; 1216 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { 1217 if (standalone) 1218 *standalone = 0; 1219 } else { 1220 *badPtr = val; 1221 return 0; 1222 } 1223 while (isSpace(toAscii(enc, ptr, end))) 1224 ptr += enc->minBytesPerChar; 1225 if (ptr != end) { 1226 *badPtr = ptr; 1227 return 0; 1228 } 1229 return 1; 1230 } 1231 1232 static int FASTCALL 1233 checkCharRefNumber(int result) { 1234 switch (result >> 8) { 1235 case 0xD8: 1236 case 0xD9: 1237 case 0xDA: 1238 case 0xDB: 1239 case 0xDC: 1240 case 0xDD: 1241 case 0xDE: 1242 case 0xDF: 1243 return -1; 1244 case 0: 1245 if (latin1_encoding.type[result] == BT_NONXML) 1246 return -1; 1247 break; 1248 case 0xFF: 1249 if (result == 0xFFFE || result == 0xFFFF) 1250 return -1; 1251 break; 1252 } 1253 return result; 1254 } 1255 1256 int FASTCALL 1257 XmlUtf8Encode(int c, char *buf) { 1258 enum { 1259 /* minN is minimum legal resulting value for N byte sequence */ 1260 min2 = 0x80, 1261 min3 = 0x800, 1262 min4 = 0x10000 1263 }; 1264 1265 if (c < 0) 1266 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */ 1267 if (c < min2) { 1268 buf[0] = (char)(c | UTF8_cval1); 1269 return 1; 1270 } 1271 if (c < min3) { 1272 buf[0] = (char)((c >> 6) | UTF8_cval2); 1273 buf[1] = (char)((c & 0x3f) | 0x80); 1274 return 2; 1275 } 1276 if (c < min4) { 1277 buf[0] = (char)((c >> 12) | UTF8_cval3); 1278 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); 1279 buf[2] = (char)((c & 0x3f) | 0x80); 1280 return 3; 1281 } 1282 if (c < 0x110000) { 1283 buf[0] = (char)((c >> 18) | UTF8_cval4); 1284 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); 1285 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); 1286 buf[3] = (char)((c & 0x3f) | 0x80); 1287 return 4; 1288 } 1289 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */ 1290 } 1291 1292 int FASTCALL 1293 XmlUtf16Encode(int charNum, unsigned short *buf) { 1294 if (charNum < 0) 1295 return 0; 1296 if (charNum < 0x10000) { 1297 buf[0] = (unsigned short)charNum; 1298 return 1; 1299 } 1300 if (charNum < 0x110000) { 1301 charNum -= 0x10000; 1302 buf[0] = (unsigned short)((charNum >> 10) + 0xD800); 1303 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); 1304 return 2; 1305 } 1306 return 0; 1307 } 1308 1309 struct unknown_encoding { 1310 struct normal_encoding normal; 1311 CONVERTER convert; 1312 void *userData; 1313 unsigned short utf16[256]; 1314 char utf8[256][4]; 1315 }; 1316 1317 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc)) 1318 1319 int 1320 XmlSizeOfUnknownEncoding(void) { 1321 return sizeof(struct unknown_encoding); 1322 } 1323 1324 static int PTRFASTCALL 1325 unknown_isName(const ENCODING *enc, const char *p) { 1326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1327 int c = uenc->convert(uenc->userData, p); 1328 if (c & ~0xFFFF) 1329 return 0; 1330 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); 1331 } 1332 1333 static int PTRFASTCALL 1334 unknown_isNmstrt(const ENCODING *enc, const char *p) { 1335 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1336 int c = uenc->convert(uenc->userData, p); 1337 if (c & ~0xFFFF) 1338 return 0; 1339 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); 1340 } 1341 1342 static int PTRFASTCALL 1343 unknown_isInvalid(const ENCODING *enc, const char *p) { 1344 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1345 int c = uenc->convert(uenc->userData, p); 1346 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; 1347 } 1348 1349 static enum XML_Convert_Result PTRCALL 1350 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, 1351 char **toP, const char *toLim) { 1352 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1353 char buf[XML_UTF8_ENCODE_MAX]; 1354 for (;;) { 1355 const char *utf8; 1356 int n; 1357 if (*fromP == fromLim) 1358 return XML_CONVERT_COMPLETED; 1359 utf8 = uenc->utf8[(unsigned char)**fromP]; 1360 n = *utf8++; 1361 if (n == 0) { 1362 int c = uenc->convert(uenc->userData, *fromP); 1363 n = XmlUtf8Encode(c, buf); 1364 if (n > toLim - *toP) 1365 return XML_CONVERT_OUTPUT_EXHAUSTED; 1366 utf8 = buf; 1367 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1368 - (BT_LEAD2 - 2)); 1369 } else { 1370 if (n > toLim - *toP) 1371 return XML_CONVERT_OUTPUT_EXHAUSTED; 1372 (*fromP)++; 1373 } 1374 memcpy(*toP, utf8, n); 1375 *toP += n; 1376 } 1377 } 1378 1379 static enum XML_Convert_Result PTRCALL 1380 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, 1381 unsigned short **toP, const unsigned short *toLim) { 1382 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); 1383 while (*fromP < fromLim && *toP < toLim) { 1384 unsigned short c = uenc->utf16[(unsigned char)**fromP]; 1385 if (c == 0) { 1386 c = (unsigned short)uenc->convert(uenc->userData, *fromP); 1387 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] 1388 - (BT_LEAD2 - 2)); 1389 } else 1390 (*fromP)++; 1391 *(*toP)++ = c; 1392 } 1393 1394 if ((*toP == toLim) && (*fromP < fromLim)) 1395 return XML_CONVERT_OUTPUT_EXHAUSTED; 1396 else 1397 return XML_CONVERT_COMPLETED; 1398 } 1399 1400 ENCODING * 1401 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, 1402 void *userData) { 1403 int i; 1404 struct unknown_encoding *e = (struct unknown_encoding *)mem; 1405 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding)); 1406 for (i = 0; i < 128; i++) 1407 if (latin1_encoding.type[i] != BT_OTHER 1408 && latin1_encoding.type[i] != BT_NONXML && table[i] != i) 1409 return 0; 1410 for (i = 0; i < 256; i++) { 1411 int c = table[i]; 1412 if (c == -1) { 1413 e->normal.type[i] = BT_MALFORM; 1414 /* This shouldn't really get used. */ 1415 e->utf16[i] = 0xFFFF; 1416 e->utf8[i][0] = 1; 1417 e->utf8[i][1] = 0; 1418 } else if (c < 0) { 1419 if (c < -4) 1420 return 0; 1421 /* Multi-byte sequences need a converter function */ 1422 if (! convert) 1423 return 0; 1424 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); 1425 e->utf8[i][0] = 0; 1426 e->utf16[i] = 0; 1427 } else if (c < 0x80) { 1428 if (latin1_encoding.type[c] != BT_OTHER 1429 && latin1_encoding.type[c] != BT_NONXML && c != i) 1430 return 0; 1431 e->normal.type[i] = latin1_encoding.type[c]; 1432 e->utf8[i][0] = 1; 1433 e->utf8[i][1] = (char)c; 1434 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); 1435 } else if (checkCharRefNumber(c) < 0) { 1436 e->normal.type[i] = BT_NONXML; 1437 /* This shouldn't really get used. */ 1438 e->utf16[i] = 0xFFFF; 1439 e->utf8[i][0] = 1; 1440 e->utf8[i][1] = 0; 1441 } else { 1442 if (c > 0xFFFF) 1443 return 0; 1444 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) 1445 e->normal.type[i] = BT_NMSTRT; 1446 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) 1447 e->normal.type[i] = BT_NAME; 1448 else 1449 e->normal.type[i] = BT_OTHER; 1450 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); 1451 e->utf16[i] = (unsigned short)c; 1452 } 1453 } 1454 e->userData = userData; 1455 e->convert = convert; 1456 if (convert) { 1457 e->normal.isName2 = unknown_isName; 1458 e->normal.isName3 = unknown_isName; 1459 e->normal.isName4 = unknown_isName; 1460 e->normal.isNmstrt2 = unknown_isNmstrt; 1461 e->normal.isNmstrt3 = unknown_isNmstrt; 1462 e->normal.isNmstrt4 = unknown_isNmstrt; 1463 e->normal.isInvalid2 = unknown_isInvalid; 1464 e->normal.isInvalid3 = unknown_isInvalid; 1465 e->normal.isInvalid4 = unknown_isInvalid; 1466 } 1467 e->normal.enc.utf8Convert = unknown_toUtf8; 1468 e->normal.enc.utf16Convert = unknown_toUtf16; 1469 return &(e->normal.enc); 1470 } 1471 1472 /* If this enumeration is changed, getEncodingIndex and encodings 1473 must also be changed. */ 1474 enum { 1475 UNKNOWN_ENC = -1, 1476 ISO_8859_1_ENC = 0, 1477 US_ASCII_ENC, 1478 UTF_8_ENC, 1479 UTF_16_ENC, 1480 UTF_16BE_ENC, 1481 UTF_16LE_ENC, 1482 /* must match encodingNames up to here */ 1483 NO_ENC 1484 }; 1485 1486 static const char KW_ISO_8859_1[] 1487 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, 1488 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'}; 1489 static const char KW_US_ASCII[] 1490 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, 1491 ASCII_C, ASCII_I, ASCII_I, '\0'}; 1492 static const char KW_UTF_8[] 1493 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'}; 1494 static const char KW_UTF_16[] 1495 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'}; 1496 static const char KW_UTF_16BE[] 1497 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1498 ASCII_6, ASCII_B, ASCII_E, '\0'}; 1499 static const char KW_UTF_16LE[] 1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, 1501 ASCII_6, ASCII_L, ASCII_E, '\0'}; 1502 1503 static int FASTCALL 1504 getEncodingIndex(const char *name) { 1505 static const char *const encodingNames[] = { 1506 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE, 1507 }; 1508 int i; 1509 if (name == NULL) 1510 return NO_ENC; 1511 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++) 1512 if (streqci(name, encodingNames[i])) 1513 return i; 1514 return UNKNOWN_ENC; 1515 } 1516 1517 /* For binary compatibility, we store the index of the encoding 1518 specified at initialization in the isUtf16 member. 1519 */ 1520 1521 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) 1522 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) 1523 1524 /* This is what detects the encoding. encodingTable maps from 1525 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 1526 the external (protocol) specified encoding; state is 1527 XML_CONTENT_STATE if we're parsing an external text entity, and 1528 XML_PROLOG_STATE otherwise. 1529 */ 1530 1531 static int 1532 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc, 1533 int state, const char *ptr, const char *end, const char **nextTokPtr) { 1534 const ENCODING **encPtr; 1535 1536 if (ptr >= end) 1537 return XML_TOK_NONE; 1538 encPtr = enc->encPtr; 1539 if (ptr + 1 == end) { 1540 /* only a single byte available for auto-detection */ 1541 #ifndef XML_DTD /* FIXME */ 1542 /* a well-formed document entity must have more than one byte */ 1543 if (state != XML_CONTENT_STATE) 1544 return XML_TOK_PARTIAL; 1545 #endif 1546 /* so we're parsing an external text entity... */ 1547 /* if UTF-16 was externally specified, then we need at least 2 bytes */ 1548 switch (INIT_ENC_INDEX(enc)) { 1549 case UTF_16_ENC: 1550 case UTF_16LE_ENC: 1551 case UTF_16BE_ENC: 1552 return XML_TOK_PARTIAL; 1553 } 1554 switch ((unsigned char)*ptr) { 1555 case 0xFE: 1556 case 0xFF: 1557 case 0xEF: /* possibly first byte of UTF-8 BOM */ 1558 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1559 break; 1560 /* fall through */ 1561 case 0x00: 1562 case 0x3C: 1563 return XML_TOK_PARTIAL; 1564 } 1565 } else { 1566 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { 1567 case 0xFEFF: 1568 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1569 break; 1570 *nextTokPtr = ptr + 2; 1571 *encPtr = encodingTable[UTF_16BE_ENC]; 1572 return XML_TOK_BOM; 1573 /* 00 3C is handled in the default case */ 1574 case 0x3C00: 1575 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC 1576 || INIT_ENC_INDEX(enc) == UTF_16_ENC) 1577 && state == XML_CONTENT_STATE) 1578 break; 1579 *encPtr = encodingTable[UTF_16LE_ENC]; 1580 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1581 case 0xFFFE: 1582 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) 1583 break; 1584 *nextTokPtr = ptr + 2; 1585 *encPtr = encodingTable[UTF_16LE_ENC]; 1586 return XML_TOK_BOM; 1587 case 0xEFBB: 1588 /* Maybe a UTF-8 BOM (EF BB BF) */ 1589 /* If there's an explicitly specified (external) encoding 1590 of ISO-8859-1 or some flavour of UTF-16 1591 and this is an external text entity, 1592 don't look for the BOM, 1593 because it might be a legal data. 1594 */ 1595 if (state == XML_CONTENT_STATE) { 1596 int e = INIT_ENC_INDEX(enc); 1597 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC 1598 || e == UTF_16_ENC) 1599 break; 1600 } 1601 if (ptr + 2 == end) 1602 return XML_TOK_PARTIAL; 1603 if ((unsigned char)ptr[2] == 0xBF) { 1604 *nextTokPtr = ptr + 3; 1605 *encPtr = encodingTable[UTF_8_ENC]; 1606 return XML_TOK_BOM; 1607 } 1608 break; 1609 default: 1610 if (ptr[0] == '\0') { 1611 /* 0 isn't a legal data character. Furthermore a document 1612 entity can only start with ASCII characters. So the only 1613 way this can fail to be big-endian UTF-16 if it it's an 1614 external parsed general entity that's labelled as 1615 UTF-16LE. 1616 */ 1617 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) 1618 break; 1619 *encPtr = encodingTable[UTF_16BE_ENC]; 1620 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1621 } else if (ptr[1] == '\0') { 1622 /* We could recover here in the case: 1623 - parsing an external entity 1624 - second byte is 0 1625 - no externally specified encoding 1626 - no encoding declaration 1627 by assuming UTF-16LE. But we don't, because this would mean when 1628 presented just with a single byte, we couldn't reliably determine 1629 whether we needed further bytes. 1630 */ 1631 if (state == XML_CONTENT_STATE) 1632 break; 1633 *encPtr = encodingTable[UTF_16LE_ENC]; 1634 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1635 } 1636 break; 1637 } 1638 } 1639 *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; 1640 return XmlTok(*encPtr, state, ptr, end, nextTokPtr); 1641 } 1642 1643 #define NS(x) x 1644 #define ns(x) x 1645 #define XML_TOK_NS_C 1646 #include "xmltok_ns.c" 1647 #undef XML_TOK_NS_C 1648 #undef NS 1649 #undef ns 1650 1651 #ifdef XML_NS 1652 1653 # define NS(x) x##NS 1654 # define ns(x) x##_ns 1655 1656 # define XML_TOK_NS_C 1657 # include "xmltok_ns.c" 1658 # undef XML_TOK_NS_C 1659 1660 # undef NS 1661 # undef ns 1662 1663 ENCODING * 1664 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert, 1665 void *userData) { 1666 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); 1667 if (enc) 1668 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; 1669 return enc; 1670 } 1671 1672 #endif /* XML_NS */ 1673