/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
   Copyright (c) 2004-2009 Karl Waclawek <karl@waclawek.net>
   Copyright (c) 2005-2007 Steven Solie <steven@solie.ca>
   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
   Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
   Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
   Copyright (c) 2020 Joe Orton <jorton@redhat.com>
   Copyright (c) 2020 Kleber Tarcísio <klebertarcisio@yahoo.com.br>
   Copyright (c) 2021 Tim Bray <tbray@textuality.com>
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
40 */ 41 42 #include <expat_config.h> 43 44 #include <assert.h> 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <stddef.h> 48 #include <string.h> 49 #include <math.h> /* for isnan */ 50 #include <errno.h> 51 52 #include "expat.h" 53 #include "codepage.h" 54 #include "internal.h" /* for UNUSED_P only */ 55 #include "xmlfile.h" 56 #include "xmltchar.h" 57 58 #ifdef _MSC_VER 59 # include <crtdbg.h> 60 #endif 61 62 #ifdef XML_UNICODE 63 # include <wchar.h> 64 #endif 65 66 enum ExitCode { 67 XMLWF_EXIT_SUCCESS = 0, 68 XMLWF_EXIT_INTERNAL_ERROR = 1, 69 XMLWF_EXIT_NOT_WELLFORMED = 2, 70 XMLWF_EXIT_OUTPUT_ERROR = 3, 71 XMLWF_EXIT_USAGE_ERROR = 4, 72 }; 73 74 /* Structures for handler user data */ 75 typedef struct NotationList { 76 struct NotationList *next; 77 const XML_Char *notationName; 78 const XML_Char *systemId; 79 const XML_Char *publicId; 80 } NotationList; 81 82 typedef struct xmlwfUserData { 83 FILE *fp; 84 NotationList *notationListHead; 85 const XML_Char *currentDoctypeName; 86 } XmlwfUserData; 87 88 /* This ensures proper sorting. */ 89 90 #define NSSEP T('\001') 91 92 static void XMLCALL 93 characterData(void *userData, const XML_Char *s, int len) { 94 FILE *fp = ((XmlwfUserData *)userData)->fp; 95 for (; len > 0; --len, ++s) { 96 switch (*s) { 97 case T('&'): 98 fputts(T("&"), fp); 99 break; 100 case T('<'): 101 fputts(T("<"), fp); 102 break; 103 case T('>'): 104 fputts(T(">"), fp); 105 break; 106 #ifdef W3C14N 107 case 13: 108 fputts(T("
"), fp); 109 break; 110 #else 111 case T('"'): 112 fputts(T("""), fp); 113 break; 114 case 9: 115 case 10: 116 case 13: 117 ftprintf(fp, T("&#%d;"), *s); 118 break; 119 #endif 120 default: 121 puttc(*s, fp); 122 break; 123 } 124 } 125 } 126 127 static void 128 attributeValue(FILE *fp, const XML_Char *s) { 129 puttc(T('='), fp); 130 puttc(T('"'), fp); 131 assert(s); 132 for (;;) { 133 switch (*s) { 134 case 0: 135 case NSSEP: 136 puttc(T('"'), fp); 137 return; 138 case T('&'): 139 fputts(T("&"), fp); 140 break; 141 case T('<'): 142 fputts(T("<"), fp); 143 break; 144 case T('"'): 145 fputts(T("""), fp); 146 break; 147 #ifdef W3C14N 148 case 9: 149 fputts(T("	"), fp); 150 break; 151 case 10: 152 fputts(T("
"), fp); 153 break; 154 case 13: 155 fputts(T("
"), fp); 156 break; 157 #else 158 case T('>'): 159 fputts(T(">"), fp); 160 break; 161 case 9: 162 case 10: 163 case 13: 164 ftprintf(fp, T("&#%d;"), *s); 165 break; 166 #endif 167 default: 168 puttc(*s, fp); 169 break; 170 } 171 s++; 172 } 173 } 174 175 /* Lexicographically comparing UTF-8 encoded attribute values, 176 is equivalent to lexicographically comparing based on the character number. */ 177 178 static int 179 attcmp(const void *att1, const void *att2) { 180 return tcscmp(*(const XML_Char **)att1, *(const XML_Char **)att2); 181 } 182 183 static void XMLCALL 184 startElement(void *userData, const XML_Char *name, const XML_Char **atts) { 185 int nAtts; 186 const XML_Char **p; 187 FILE *fp = ((XmlwfUserData *)userData)->fp; 188 puttc(T('<'), fp); 189 fputts(name, fp); 190 191 p = atts; 192 while (*p) 193 ++p; 194 nAtts = (int)((p - atts) >> 1); 195 if (nAtts > 1) 196 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp); 197 while (*atts) { 198 puttc(T(' '), fp); 199 fputts(*atts++, fp); 200 attributeValue(fp, *atts); 201 atts++; 202 } 203 puttc(T('>'), fp); 204 } 205 206 static void XMLCALL 207 endElement(void *userData, const XML_Char *name) { 208 FILE *fp = ((XmlwfUserData *)userData)->fp; 209 puttc(T('<'), fp); 210 puttc(T('/'), fp); 211 fputts(name, fp); 212 puttc(T('>'), fp); 213 } 214 215 static int 216 nsattcmp(const void *p1, const void *p2) { 217 const XML_Char *att1 = *(const XML_Char **)p1; 218 const XML_Char *att2 = *(const XML_Char **)p2; 219 int sep1 = (tcsrchr(att1, NSSEP) != 0); 220 int sep2 = (tcsrchr(att1, NSSEP) != 0); 221 if (sep1 != sep2) 222 return sep1 - sep2; 223 return tcscmp(att1, att2); 224 } 225 226 static void XMLCALL 227 startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) { 228 int nAtts; 229 int nsi; 230 const XML_Char **p; 231 FILE *fp = ((XmlwfUserData *)userData)->fp; 232 const XML_Char *sep; 233 puttc(T('<'), fp); 234 235 sep = tcsrchr(name, NSSEP); 236 if (sep) { 237 fputts(T("n1:"), fp); 238 
fputts(sep + 1, fp); 239 fputts(T(" xmlns:n1"), fp); 240 attributeValue(fp, name); 241 nsi = 2; 242 } else { 243 fputts(name, fp); 244 nsi = 1; 245 } 246 247 p = atts; 248 while (*p) 249 ++p; 250 nAtts = (int)((p - atts) >> 1); 251 if (nAtts > 1) 252 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp); 253 while (*atts) { 254 name = *atts++; 255 sep = tcsrchr(name, NSSEP); 256 puttc(T(' '), fp); 257 if (sep) { 258 ftprintf(fp, T("n%d:"), nsi); 259 fputts(sep + 1, fp); 260 } else 261 fputts(name, fp); 262 attributeValue(fp, *atts); 263 if (sep) { 264 ftprintf(fp, T(" xmlns:n%d"), nsi++); 265 attributeValue(fp, name); 266 } 267 atts++; 268 } 269 puttc(T('>'), fp); 270 } 271 272 static void XMLCALL 273 endElementNS(void *userData, const XML_Char *name) { 274 FILE *fp = ((XmlwfUserData *)userData)->fp; 275 const XML_Char *sep; 276 puttc(T('<'), fp); 277 puttc(T('/'), fp); 278 sep = tcsrchr(name, NSSEP); 279 if (sep) { 280 fputts(T("n1:"), fp); 281 fputts(sep + 1, fp); 282 } else 283 fputts(name, fp); 284 puttc(T('>'), fp); 285 } 286 287 #ifndef W3C14N 288 289 static void XMLCALL 290 processingInstruction(void *userData, const XML_Char *target, 291 const XML_Char *data) { 292 FILE *fp = ((XmlwfUserData *)userData)->fp; 293 puttc(T('<'), fp); 294 puttc(T('?'), fp); 295 fputts(target, fp); 296 puttc(T(' '), fp); 297 fputts(data, fp); 298 puttc(T('?'), fp); 299 puttc(T('>'), fp); 300 } 301 302 static XML_Char * 303 xcsdup(const XML_Char *s) { 304 XML_Char *result; 305 int count = 0; 306 int numBytes; 307 308 /* Get the length of the string, including terminator */ 309 while (s[count++] != 0) { 310 /* Do nothing */ 311 } 312 numBytes = count * sizeof(XML_Char); 313 result = malloc(numBytes); 314 if (result == NULL) 315 return NULL; 316 memcpy(result, s, numBytes); 317 return result; 318 } 319 320 static void XMLCALL 321 startDoctypeDecl(void *userData, const XML_Char *doctypeName, 322 const XML_Char *sysid, const XML_Char *publid, 323 int has_internal_subset) { 324 
XmlwfUserData *data = (XmlwfUserData *)userData; 325 UNUSED_P(sysid); 326 UNUSED_P(publid); 327 UNUSED_P(has_internal_subset); 328 data->currentDoctypeName = xcsdup(doctypeName); 329 } 330 331 static void 332 freeNotations(XmlwfUserData *data) { 333 NotationList *notationListHead = data->notationListHead; 334 335 while (notationListHead != NULL) { 336 NotationList *next = notationListHead->next; 337 free((void *)notationListHead->notationName); 338 free((void *)notationListHead->systemId); 339 free((void *)notationListHead->publicId); 340 free(notationListHead); 341 notationListHead = next; 342 } 343 data->notationListHead = NULL; 344 } 345 346 static void 347 cleanupUserData(XmlwfUserData *userData) { 348 free((void *)userData->currentDoctypeName); 349 userData->currentDoctypeName = NULL; 350 freeNotations(userData); 351 } 352 353 static int 354 xcscmp(const XML_Char *xs, const XML_Char *xt) { 355 while (*xs != 0 && *xt != 0) { 356 if (*xs < *xt) 357 return -1; 358 if (*xs > *xt) 359 return 1; 360 xs++; 361 xt++; 362 } 363 if (*xs < *xt) 364 return -1; 365 if (*xs > *xt) 366 return 1; 367 return 0; 368 } 369 370 static int 371 notationCmp(const void *a, const void *b) { 372 const NotationList *const n1 = *(NotationList **)a; 373 const NotationList *const n2 = *(NotationList **)b; 374 375 return xcscmp(n1->notationName, n2->notationName); 376 } 377 378 static void XMLCALL 379 endDoctypeDecl(void *userData) { 380 XmlwfUserData *data = (XmlwfUserData *)userData; 381 NotationList **notations; 382 int notationCount = 0; 383 NotationList *p; 384 int i; 385 386 /* How many notations do we have? 
*/ 387 for (p = data->notationListHead; p != NULL; p = p->next) 388 notationCount++; 389 if (notationCount == 0) { 390 /* Nothing to report */ 391 free((void *)data->currentDoctypeName); 392 data->currentDoctypeName = NULL; 393 return; 394 } 395 396 notations = malloc(notationCount * sizeof(NotationList *)); 397 if (notations == NULL) { 398 fprintf(stderr, "Unable to sort notations"); 399 freeNotations(data); 400 return; 401 } 402 403 for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) { 404 notations[i] = p; 405 } 406 qsort(notations, notationCount, sizeof(NotationList *), notationCmp); 407 408 /* Output the DOCTYPE header */ 409 fputts(T("<!DOCTYPE "), data->fp); 410 fputts(data->currentDoctypeName, data->fp); 411 fputts(T(" [\n"), data->fp); 412 413 /* Now the NOTATIONs */ 414 for (i = 0; i < notationCount; i++) { 415 fputts(T("<!NOTATION "), data->fp); 416 fputts(notations[i]->notationName, data->fp); 417 if (notations[i]->publicId != NULL) { 418 fputts(T(" PUBLIC '"), data->fp); 419 fputts(notations[i]->publicId, data->fp); 420 puttc(T('\''), data->fp); 421 if (notations[i]->systemId != NULL) { 422 puttc(T(' '), data->fp); 423 puttc(T('\''), data->fp); 424 fputts(notations[i]->systemId, data->fp); 425 puttc(T('\''), data->fp); 426 } 427 } else if (notations[i]->systemId != NULL) { 428 fputts(T(" SYSTEM '"), data->fp); 429 fputts(notations[i]->systemId, data->fp); 430 puttc(T('\''), data->fp); 431 } 432 puttc(T('>'), data->fp); 433 puttc(T('\n'), data->fp); 434 } 435 436 /* Finally end the DOCTYPE */ 437 fputts(T("]>\n"), data->fp); 438 439 free(notations); 440 freeNotations(data); 441 free((void *)data->currentDoctypeName); 442 data->currentDoctypeName = NULL; 443 } 444 445 static void XMLCALL 446 notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base, 447 const XML_Char *systemId, const XML_Char *publicId) { 448 XmlwfUserData *data = (XmlwfUserData *)userData; 449 NotationList *entry = 
malloc(sizeof(NotationList)); 450 const char *errorMessage = "Unable to store NOTATION for output\n"; 451 452 UNUSED_P(base); 453 if (entry == NULL) { 454 fputs(errorMessage, stderr); 455 return; /* Nothing we can really do about this */ 456 } 457 entry->notationName = xcsdup(notationName); 458 if (entry->notationName == NULL) { 459 fputs(errorMessage, stderr); 460 free(entry); 461 return; 462 } 463 if (systemId != NULL) { 464 entry->systemId = xcsdup(systemId); 465 if (entry->systemId == NULL) { 466 fputs(errorMessage, stderr); 467 free((void *)entry->notationName); 468 free(entry); 469 return; 470 } 471 } else { 472 entry->systemId = NULL; 473 } 474 if (publicId != NULL) { 475 entry->publicId = xcsdup(publicId); 476 if (entry->publicId == NULL) { 477 fputs(errorMessage, stderr); 478 free((void *)entry->systemId); /* Safe if it's NULL */ 479 free((void *)entry->notationName); 480 free(entry); 481 return; 482 } 483 } else { 484 entry->publicId = NULL; 485 } 486 487 entry->next = data->notationListHead; 488 data->notationListHead = entry; 489 } 490 491 #endif /* not W3C14N */ 492 493 static void XMLCALL 494 defaultCharacterData(void *userData, const XML_Char *s, int len) { 495 UNUSED_P(s); 496 UNUSED_P(len); 497 XML_DefaultCurrent((XML_Parser)userData); 498 } 499 500 static void XMLCALL 501 defaultStartElement(void *userData, const XML_Char *name, 502 const XML_Char **atts) { 503 UNUSED_P(name); 504 UNUSED_P(atts); 505 XML_DefaultCurrent((XML_Parser)userData); 506 } 507 508 static void XMLCALL 509 defaultEndElement(void *userData, const XML_Char *name) { 510 UNUSED_P(name); 511 XML_DefaultCurrent((XML_Parser)userData); 512 } 513 514 static void XMLCALL 515 defaultProcessingInstruction(void *userData, const XML_Char *target, 516 const XML_Char *data) { 517 UNUSED_P(target); 518 UNUSED_P(data); 519 XML_DefaultCurrent((XML_Parser)userData); 520 } 521 522 static void XMLCALL 523 nopCharacterData(void *userData, const XML_Char *s, int len) { 524 UNUSED_P(userData); 525 
UNUSED_P(s); 526 UNUSED_P(len); 527 } 528 529 static void XMLCALL 530 nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 531 UNUSED_P(userData); 532 UNUSED_P(name); 533 UNUSED_P(atts); 534 } 535 536 static void XMLCALL 537 nopEndElement(void *userData, const XML_Char *name) { 538 UNUSED_P(userData); 539 UNUSED_P(name); 540 } 541 542 static void XMLCALL 543 nopProcessingInstruction(void *userData, const XML_Char *target, 544 const XML_Char *data) { 545 UNUSED_P(userData); 546 UNUSED_P(target); 547 UNUSED_P(data); 548 } 549 550 static void XMLCALL 551 markup(void *userData, const XML_Char *s, int len) { 552 FILE *fp = ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp; 553 for (; len > 0; --len, ++s) 554 puttc(*s, fp); 555 } 556 557 static void 558 metaLocation(XML_Parser parser) { 559 const XML_Char *uri = XML_GetBase(parser); 560 FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp; 561 if (uri) 562 ftprintf(fp, T(" uri=\"%s\""), uri); 563 ftprintf(fp, 564 T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"") 565 T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%") 566 T(XML_FMT_INT_MOD) T("u\""), 567 XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser), 568 XML_GetCurrentLineNumber(parser), 569 XML_GetCurrentColumnNumber(parser)); 570 } 571 572 static void 573 metaStartDocument(void *userData) { 574 fputts(T("<document>\n"), 575 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp); 576 } 577 578 static void 579 metaEndDocument(void *userData) { 580 fputts(T("</document>\n"), 581 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp); 582 } 583 584 static void XMLCALL 585 metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 586 XML_Parser parser = (XML_Parser)userData; 587 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 588 FILE *fp = data->fp; 589 const XML_Char **specifiedAttsEnd 590 = atts + XML_GetSpecifiedAttributeCount(parser); 591 
const XML_Char **idAttPtr; 592 int idAttIndex = XML_GetIdAttributeIndex(parser); 593 if (idAttIndex < 0) 594 idAttPtr = 0; 595 else 596 idAttPtr = atts + idAttIndex; 597 598 ftprintf(fp, T("<starttag name=\"%s\""), name); 599 metaLocation(parser); 600 if (*atts) { 601 fputts(T(">\n"), fp); 602 do { 603 ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]); 604 characterData(data, atts[1], (int)tcslen(atts[1])); 605 if (atts >= specifiedAttsEnd) 606 fputts(T("\" defaulted=\"yes\"/>\n"), fp); 607 else if (atts == idAttPtr) 608 fputts(T("\" id=\"yes\"/>\n"), fp); 609 else 610 fputts(T("\"/>\n"), fp); 611 } while (*(atts += 2)); 612 fputts(T("</starttag>\n"), fp); 613 } else 614 fputts(T("/>\n"), fp); 615 } 616 617 static void XMLCALL 618 metaEndElement(void *userData, const XML_Char *name) { 619 XML_Parser parser = (XML_Parser)userData; 620 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 621 FILE *fp = data->fp; 622 ftprintf(fp, T("<endtag name=\"%s\""), name); 623 metaLocation(parser); 624 fputts(T("/>\n"), fp); 625 } 626 627 static void XMLCALL 628 metaProcessingInstruction(void *userData, const XML_Char *target, 629 const XML_Char *data) { 630 XML_Parser parser = (XML_Parser)userData; 631 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser); 632 FILE *fp = usrData->fp; 633 ftprintf(fp, T("<pi target=\"%s\" data=\""), target); 634 characterData(usrData, data, (int)tcslen(data)); 635 puttc(T('"'), fp); 636 metaLocation(parser); 637 fputts(T("/>\n"), fp); 638 } 639 640 static void XMLCALL 641 metaComment(void *userData, const XML_Char *data) { 642 XML_Parser parser = (XML_Parser)userData; 643 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser); 644 FILE *fp = usrData->fp; 645 fputts(T("<comment data=\""), fp); 646 characterData(usrData, data, (int)tcslen(data)); 647 puttc(T('"'), fp); 648 metaLocation(parser); 649 fputts(T("/>\n"), fp); 650 } 651 652 static void XMLCALL 653 metaStartCdataSection(void *userData) { 654 
XML_Parser parser = (XML_Parser)userData; 655 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 656 FILE *fp = data->fp; 657 fputts(T("<startcdata"), fp); 658 metaLocation(parser); 659 fputts(T("/>\n"), fp); 660 } 661 662 static void XMLCALL 663 metaEndCdataSection(void *userData) { 664 XML_Parser parser = (XML_Parser)userData; 665 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 666 FILE *fp = data->fp; 667 fputts(T("<endcdata"), fp); 668 metaLocation(parser); 669 fputts(T("/>\n"), fp); 670 } 671 672 static void XMLCALL 673 metaCharacterData(void *userData, const XML_Char *s, int len) { 674 XML_Parser parser = (XML_Parser)userData; 675 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 676 FILE *fp = data->fp; 677 fputts(T("<chars str=\""), fp); 678 characterData(data, s, len); 679 puttc(T('"'), fp); 680 metaLocation(parser); 681 fputts(T("/>\n"), fp); 682 } 683 684 static void XMLCALL 685 metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName, 686 const XML_Char *sysid, const XML_Char *pubid, 687 int has_internal_subset) { 688 XML_Parser parser = (XML_Parser)userData; 689 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 690 FILE *fp = data->fp; 691 UNUSED_P(sysid); 692 UNUSED_P(pubid); 693 UNUSED_P(has_internal_subset); 694 ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName); 695 metaLocation(parser); 696 fputts(T("/>\n"), fp); 697 } 698 699 static void XMLCALL 700 metaEndDoctypeDecl(void *userData) { 701 XML_Parser parser = (XML_Parser)userData; 702 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 703 FILE *fp = data->fp; 704 fputts(T("<enddoctype"), fp); 705 metaLocation(parser); 706 fputts(T("/>\n"), fp); 707 } 708 709 static void XMLCALL 710 metaNotationDecl(void *userData, const XML_Char *notationName, 711 const XML_Char *base, const XML_Char *systemId, 712 const XML_Char *publicId) { 713 XML_Parser parser = (XML_Parser)userData; 714 XmlwfUserData *data = (XmlwfUserData 
*)XML_GetUserData(parser); 715 FILE *fp = data->fp; 716 UNUSED_P(base); 717 ftprintf(fp, T("<notation name=\"%s\""), notationName); 718 if (publicId) 719 ftprintf(fp, T(" public=\"%s\""), publicId); 720 if (systemId) { 721 fputts(T(" system=\""), fp); 722 characterData(data, systemId, (int)tcslen(systemId)); 723 puttc(T('"'), fp); 724 } 725 metaLocation(parser); 726 fputts(T("/>\n"), fp); 727 } 728 729 static void XMLCALL 730 metaEntityDecl(void *userData, const XML_Char *entityName, int is_param, 731 const XML_Char *value, int value_length, const XML_Char *base, 732 const XML_Char *systemId, const XML_Char *publicId, 733 const XML_Char *notationName) { 734 XML_Parser parser = (XML_Parser)userData; 735 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 736 FILE *fp = data->fp; 737 738 UNUSED_P(is_param); 739 UNUSED_P(base); 740 if (value) { 741 ftprintf(fp, T("<entity name=\"%s\""), entityName); 742 metaLocation(parser); 743 puttc(T('>'), fp); 744 characterData(data, value, value_length); 745 fputts(T("</entity/>\n"), fp); 746 } else if (notationName) { 747 ftprintf(fp, T("<entity name=\"%s\""), entityName); 748 if (publicId) 749 ftprintf(fp, T(" public=\"%s\""), publicId); 750 fputts(T(" system=\""), fp); 751 characterData(data, systemId, (int)tcslen(systemId)); 752 puttc(T('"'), fp); 753 ftprintf(fp, T(" notation=\"%s\""), notationName); 754 metaLocation(parser); 755 fputts(T("/>\n"), fp); 756 } else { 757 ftprintf(fp, T("<entity name=\"%s\""), entityName); 758 if (publicId) 759 ftprintf(fp, T(" public=\"%s\""), publicId); 760 fputts(T(" system=\""), fp); 761 characterData(data, systemId, (int)tcslen(systemId)); 762 puttc(T('"'), fp); 763 metaLocation(parser); 764 fputts(T("/>\n"), fp); 765 } 766 } 767 768 static void XMLCALL 769 metaStartNamespaceDecl(void *userData, const XML_Char *prefix, 770 const XML_Char *uri) { 771 XML_Parser parser = (XML_Parser)userData; 772 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 773 FILE *fp = 
data->fp; 774 fputts(T("<startns"), fp); 775 if (prefix) 776 ftprintf(fp, T(" prefix=\"%s\""), prefix); 777 if (uri) { 778 fputts(T(" ns=\""), fp); 779 characterData(data, uri, (int)tcslen(uri)); 780 fputts(T("\"/>\n"), fp); 781 } else 782 fputts(T("/>\n"), fp); 783 } 784 785 static void XMLCALL 786 metaEndNamespaceDecl(void *userData, const XML_Char *prefix) { 787 XML_Parser parser = (XML_Parser)userData; 788 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 789 FILE *fp = data->fp; 790 if (! prefix) 791 fputts(T("<endns/>\n"), fp); 792 else 793 ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix); 794 } 795 796 static int XMLCALL 797 unknownEncodingConvert(void *data, const char *p) { 798 return codepageConvert(*(int *)data, p); 799 } 800 801 static int XMLCALL 802 unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) { 803 int cp; 804 static const XML_Char prefixL[] = T("windows-"); 805 static const XML_Char prefixU[] = T("WINDOWS-"); 806 int i; 807 808 UNUSED_P(userData); 809 for (i = 0; prefixU[i]; i++) 810 if (name[i] != prefixU[i] && name[i] != prefixL[i]) 811 return 0; 812 813 cp = 0; 814 for (; name[i]; i++) { 815 static const XML_Char digits[] = T("0123456789"); 816 const XML_Char *s = tcschr(digits, name[i]); 817 if (! s) 818 return 0; 819 cp *= 10; 820 cp += (int)(s - digits); 821 if (cp >= 0x10000) 822 return 0; 823 } 824 if (! codepageMap(cp, info->map)) 825 return 0; 826 info->convert = unknownEncodingConvert; 827 /* We could just cast the code page integer to a void *, 828 and avoid the use of release. */ 829 info->release = free; 830 info->data = malloc(sizeof(int)); 831 if (! 
info->data) 832 return 0; 833 *(int *)info->data = cp; 834 return 1; 835 } 836 837 static int XMLCALL 838 notStandalone(void *userData) { 839 UNUSED_P(userData); 840 return 0; 841 } 842 843 static void 844 showVersion(XML_Char *prog) { 845 XML_Char *s = prog; 846 XML_Char ch; 847 const XML_Feature *features = XML_GetFeatureList(); 848 while ((ch = *s) != 0) { 849 if (ch == '/' 850 #if defined(_WIN32) 851 || ch == '\\' 852 #endif 853 ) 854 prog = s + 1; 855 ++s; 856 } 857 ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion()); 858 if (features != NULL && features[0].feature != XML_FEATURE_END) { 859 int i = 1; 860 ftprintf(stdout, T("%s"), features[0].name); 861 if (features[0].value) 862 ftprintf(stdout, T("=%ld"), features[0].value); 863 while (features[i].feature != XML_FEATURE_END) { 864 ftprintf(stdout, T(", %s"), features[i].name); 865 if (features[i].value) 866 ftprintf(stdout, T("=%ld"), features[i].value); 867 ++i; 868 } 869 ftprintf(stdout, T("\n")); 870 } 871 } 872 873 static void 874 usage(const XML_Char *prog, int rc) { 875 ftprintf( 876 stderr, 877 /* Generated with: 878 * $ xmlwf/xmlwf_helpgen.sh 879 * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of 880 * xmlwf/xmlwf_helpgen.sh in here. 
881 */ 882 /* clang-format off */ 883 T("usage:\n") 884 T(" %s [OPTIONS] [FILE ...]\n") 885 T(" %s -h\n") 886 T(" %s -v\n") 887 T("\n") 888 T("xmlwf - Determines if an XML document is well-formed\n") 889 T("\n") 890 T("positional arguments:\n") 891 T(" FILE file to process (default: STDIN)\n") 892 T("\n") 893 T("input control arguments:\n") 894 T(" -s print an error if the document is not [s]tandalone\n") 895 T(" -n enable [n]amespace processing\n") 896 T(" -p enable processing external DTDs and [p]arameter entities\n") 897 T(" -x enable processing of e[x]ternal entities\n") 898 T(" -e ENCODING override any in-document [e]ncoding declaration\n") 899 T(" -w enable support for [W]indows code pages\n") 900 T(" -r disable memory-mapping and use normal file [r]ead IO calls instead\n") 901 T(" -k when processing multiple files, [k]eep processing after first file with error\n") 902 T("\n") 903 T("output control arguments:\n") 904 T(" -d DIRECTORY output [d]estination directory\n") 905 T(" -c write a [c]opy of input XML, not canonical XML\n") 906 T(" -m write [m]eta XML, not canonical XML\n") 907 T(" -t write no XML output for [t]iming of plain parsing\n") 908 T(" -N enable adding doctype and [n]otation declarations\n") 909 T("\n") 910 T("billion laughs attack protection:\n") 911 T(" NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n") 912 T("\n") 913 T(" -a FACTOR set maximum tolerated [a]mplification factor (default: 100.0)\n") 914 T(" -b BYTES set number of output [b]ytes needed to activate (default: 8 MiB)\n") 915 T("\n") 916 T("info arguments:\n") 917 T(" -h show this [h]elp message and exit\n") 918 T(" -v show program's [v]ersion number and exit\n") 919 T("\n") 920 T("exit status:\n") 921 T(" 0 the input files are well-formed and the output (if requested) was written successfully\n") 922 T(" 1 could not allocate data structures, signals a serious problem with execution environment\n") 923 T(" 2 one or more input files 
were not well-formed\n") 924 T(" 3 could not create an output file\n") 925 T(" 4 command-line argument error\n") 926 T("\n") 927 T("xmlwf of libexpat is software libre, licensed under the MIT license.\n") 928 T("Please report bugs at https://github.com/libexpat/libexpat/issues. Thank you!\n") 929 , /* clang-format on */ 930 prog, prog, prog); 931 exit(rc); 932 } 933 934 #if defined(__MINGW32__) && defined(XML_UNICODE) 935 /* Silence warning about missing prototype */ 936 int wmain(int argc, XML_Char **argv); 937 #endif 938 939 #define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j) \ 940 { \ 941 if (argv[i][j + 1] == T('\0')) { \ 942 if (++i == argc) \ 943 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); \ 944 constCharStarTarget = argv[i]; \ 945 } else { \ 946 constCharStarTarget = argv[i] + j + 1; \ 947 } \ 948 i++; \ 949 j = 0; \ 950 } 951 952 int 953 tmain(int argc, XML_Char **argv) { 954 int i, j; 955 const XML_Char *outputDir = NULL; 956 const XML_Char *encoding = NULL; 957 unsigned processFlags = XML_MAP_FILE; 958 int windowsCodePages = 0; 959 int outputType = 0; 960 int useNamespaces = 0; 961 int requireStandalone = 0; 962 int requiresNotations = 0; 963 int continueOnError = 0; 964 965 float attackMaximumAmplification = -1.0f; /* signaling "not set" */ 966 unsigned long long attackThresholdBytes; 967 XML_Bool attackThresholdGiven = XML_FALSE; 968 969 int exitCode = XMLWF_EXIT_SUCCESS; 970 enum XML_ParamEntityParsing paramEntityParsing 971 = XML_PARAM_ENTITY_PARSING_NEVER; 972 int useStdin = 0; 973 XmlwfUserData userData = {NULL, NULL, NULL}; 974 975 #ifdef _MSC_VER 976 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); 977 #endif 978 979 i = 1; 980 j = 0; 981 while (i < argc) { 982 if (j == 0) { 983 if (argv[i][0] != T('-')) 984 break; 985 if (argv[i][1] == T('-') && argv[i][2] == T('\0')) { 986 i++; 987 break; 988 } 989 j++; 990 } 991 switch (argv[i][j]) { 992 case T('r'): 993 processFlags &= ~XML_MAP_FILE; 994 j++; 995 break; 996 case 
T('s'): 997 requireStandalone = 1; 998 j++; 999 break; 1000 case T('n'): 1001 useNamespaces = 1; 1002 j++; 1003 break; 1004 case T('p'): 1005 paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS; 1006 /* fall through */ 1007 case T('x'): 1008 processFlags |= XML_EXTERNAL_ENTITIES; 1009 j++; 1010 break; 1011 case T('w'): 1012 windowsCodePages = 1; 1013 j++; 1014 break; 1015 case T('m'): 1016 outputType = 'm'; 1017 j++; 1018 break; 1019 case T('c'): 1020 outputType = 'c'; 1021 useNamespaces = 0; 1022 j++; 1023 break; 1024 case T('t'): 1025 outputType = 't'; 1026 j++; 1027 break; 1028 case T('N'): 1029 requiresNotations = 1; 1030 j++; 1031 break; 1032 case T('d'): 1033 XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j); 1034 break; 1035 case T('e'): 1036 XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j); 1037 break; 1038 case T('h'): 1039 usage(argv[0], XMLWF_EXIT_SUCCESS); 1040 return 0; 1041 case T('v'): 1042 showVersion(argv[0]); 1043 return 0; 1044 case T('k'): 1045 continueOnError = 1; 1046 j++; 1047 break; 1048 case T('a'): { 1049 const XML_Char *valueText = NULL; 1050 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1051 1052 errno = 0; 1053 XML_Char *afterValueText = (XML_Char *)valueText; 1054 attackMaximumAmplification = tcstof(valueText, &afterValueText); 1055 if ((errno != 0) || (afterValueText[0] != T('\0')) 1056 || isnan(attackMaximumAmplification) 1057 || (attackMaximumAmplification < 1.0f)) { 1058 // This prevents tperror(..) 
from reporting misleading "[..]: Success" 1059 errno = ERANGE; 1060 tperror(T("invalid amplification limit") T( 1061 " (needs a floating point number greater or equal than 1.0)")); 1062 exit(XMLWF_EXIT_USAGE_ERROR); 1063 } 1064 #ifndef XML_DTD 1065 ftprintf(stderr, T("Warning: Given amplification limit ignored") T( 1066 ", xmlwf has been compiled without DTD support.\n")); 1067 #endif 1068 break; 1069 } 1070 case T('b'): { 1071 const XML_Char *valueText = NULL; 1072 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1073 1074 errno = 0; 1075 XML_Char *afterValueText = (XML_Char *)valueText; 1076 attackThresholdBytes = tcstoull(valueText, &afterValueText, 10); 1077 if ((errno != 0) || (afterValueText[0] != T('\0'))) { 1078 // This prevents tperror(..) from reporting misleading "[..]: Success" 1079 errno = ERANGE; 1080 tperror(T("invalid ignore threshold") 1081 T(" (needs an integer from 0 to 2^64-1)")); 1082 exit(XMLWF_EXIT_USAGE_ERROR); 1083 } 1084 attackThresholdGiven = XML_TRUE; 1085 #ifndef XML_DTD 1086 ftprintf(stderr, T("Warning: Given attack threshold ignored") T( 1087 ", xmlwf has been compiled without DTD support.\n")); 1088 #endif 1089 break; 1090 } 1091 case T('\0'): 1092 if (j > 1) { 1093 i++; 1094 j = 0; 1095 break; 1096 } 1097 /* fall through */ 1098 default: 1099 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); 1100 } 1101 } 1102 if (i == argc) { 1103 useStdin = 1; 1104 processFlags &= ~XML_MAP_FILE; 1105 i--; 1106 } 1107 for (; i < argc; i++) { 1108 XML_Char *outName = 0; 1109 int result; 1110 XML_Parser parser; 1111 if (useNamespaces) 1112 parser = XML_ParserCreateNS(encoding, NSSEP); 1113 else 1114 parser = XML_ParserCreate(encoding); 1115 1116 if (! 
parser) { 1117 tperror(T("Could not instantiate parser")); 1118 exit(XMLWF_EXIT_INTERNAL_ERROR); 1119 } 1120 1121 if (attackMaximumAmplification != -1.0f) { 1122 #ifdef XML_DTD 1123 XML_SetBillionLaughsAttackProtectionMaximumAmplification( 1124 parser, attackMaximumAmplification); 1125 #endif 1126 } 1127 if (attackThresholdGiven) { 1128 #ifdef XML_DTD 1129 XML_SetBillionLaughsAttackProtectionActivationThreshold( 1130 parser, attackThresholdBytes); 1131 #else 1132 (void)attackThresholdBytes; // silence -Wunused-but-set-variable 1133 #endif 1134 } 1135 1136 if (requireStandalone) 1137 XML_SetNotStandaloneHandler(parser, notStandalone); 1138 XML_SetParamEntityParsing(parser, paramEntityParsing); 1139 if (outputType == 't') { 1140 /* This is for doing timings; this gives a more realistic estimate of 1141 the parsing time. */ 1142 outputDir = 0; 1143 XML_SetElementHandler(parser, nopStartElement, nopEndElement); 1144 XML_SetCharacterDataHandler(parser, nopCharacterData); 1145 XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction); 1146 } else if (outputDir) { 1147 const XML_Char *delim = T("/"); 1148 const XML_Char *file = useStdin ? T("STDIN") : argv[i]; 1149 if (! useStdin) { 1150 /* Jump after last (back)slash */ 1151 const XML_Char *lastDelim = tcsrchr(file, delim[0]); 1152 if (lastDelim) 1153 file = lastDelim + 1; 1154 #if defined(_WIN32) 1155 else { 1156 const XML_Char *winDelim = T("\\"); 1157 lastDelim = tcsrchr(file, winDelim[0]); 1158 if (lastDelim) { 1159 file = lastDelim + 1; 1160 delim = winDelim; 1161 } 1162 } 1163 #endif 1164 } 1165 outName = (XML_Char *)malloc((tcslen(outputDir) + tcslen(file) + 2) 1166 * sizeof(XML_Char)); 1167 if (! outName) { 1168 tperror(T("Could not allocate memory")); 1169 exit(XMLWF_EXIT_INTERNAL_ERROR); 1170 } 1171 tcscpy(outName, outputDir); 1172 tcscat(outName, delim); 1173 tcscat(outName, file); 1174 userData.fp = tfopen(outName, T("wb")); 1175 if (! 
userData.fp) { 1176 tperror(outName); 1177 exitCode = XMLWF_EXIT_OUTPUT_ERROR; 1178 free(outName); 1179 XML_ParserFree(parser); 1180 if (continueOnError) { 1181 continue; 1182 } else { 1183 break; 1184 } 1185 } 1186 setvbuf(userData.fp, NULL, _IOFBF, 16384); 1187 #ifdef XML_UNICODE 1188 puttc(0xFEFF, userData.fp); 1189 #endif 1190 XML_SetUserData(parser, &userData); 1191 switch (outputType) { 1192 case 'm': 1193 XML_UseParserAsHandlerArg(parser); 1194 XML_SetElementHandler(parser, metaStartElement, metaEndElement); 1195 XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction); 1196 XML_SetCommentHandler(parser, metaComment); 1197 XML_SetCdataSectionHandler(parser, metaStartCdataSection, 1198 metaEndCdataSection); 1199 XML_SetCharacterDataHandler(parser, metaCharacterData); 1200 XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl, 1201 metaEndDoctypeDecl); 1202 XML_SetEntityDeclHandler(parser, metaEntityDecl); 1203 XML_SetNotationDeclHandler(parser, metaNotationDecl); 1204 XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl, 1205 metaEndNamespaceDecl); 1206 metaStartDocument(parser); 1207 break; 1208 case 'c': 1209 XML_UseParserAsHandlerArg(parser); 1210 XML_SetDefaultHandler(parser, markup); 1211 XML_SetElementHandler(parser, defaultStartElement, defaultEndElement); 1212 XML_SetCharacterDataHandler(parser, defaultCharacterData); 1213 XML_SetProcessingInstructionHandler(parser, 1214 defaultProcessingInstruction); 1215 break; 1216 default: 1217 if (useNamespaces) 1218 XML_SetElementHandler(parser, startElementNS, endElementNS); 1219 else 1220 XML_SetElementHandler(parser, startElement, endElement); 1221 XML_SetCharacterDataHandler(parser, characterData); 1222 #ifndef W3C14N 1223 XML_SetProcessingInstructionHandler(parser, processingInstruction); 1224 if (requiresNotations) { 1225 XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl); 1226 XML_SetNotationDeclHandler(parser, notationDecl); 1227 } 1228 #endif /* not W3C14N */ 1229 
break; 1230 } 1231 } 1232 if (windowsCodePages) 1233 XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0); 1234 result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags); 1235 if (outputDir) { 1236 if (outputType == 'm') 1237 metaEndDocument(parser); 1238 fclose(userData.fp); 1239 if (! result) { 1240 tremove(outName); 1241 } 1242 free(outName); 1243 } 1244 XML_ParserFree(parser); 1245 if (! result) { 1246 exitCode = XMLWF_EXIT_NOT_WELLFORMED; 1247 cleanupUserData(&userData); 1248 if (! continueOnError) { 1249 break; 1250 } 1251 } 1252 } 1253 return exitCode; 1254 } 1255