/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
   Copyright (c) 2004-2009 Karl Waclawek <karl@waclawek.net>
   Copyright (c) 2005-2007 Steven Solie <steven@solie.ca>
   Copyright (c) 2016-2023 Sebastian Pipping <sebastian@pipping.org>
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
   Copyright (c) 2020      Joe Orton <jorton@redhat.com>
   Copyright (c) 2020      Kleber Tarcísio <klebertarcisio@yahoo.com.br>
   Copyright (c) 2021      Tim Bray <tbray@textuality.com>
   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
   Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN 38 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 39 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 40 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 41 USE OR OTHER DEALINGS IN THE SOFTWARE. 42 */ 43 44 #include "expat_config.h" 45 46 #include <assert.h> 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <stddef.h> 50 #include <string.h> 51 #include <math.h> /* for isnan */ 52 #include <errno.h> 53 54 #include "expat.h" 55 #include "codepage.h" 56 #include "internal.h" /* for UNUSED_P only */ 57 #include "xmlfile.h" 58 #include "xmltchar.h" 59 60 #ifdef _MSC_VER 61 # include <crtdbg.h> 62 #endif 63 64 #ifdef XML_UNICODE 65 # include <wchar.h> 66 #endif 67 68 enum ExitCode { 69 XMLWF_EXIT_SUCCESS = 0, 70 XMLWF_EXIT_INTERNAL_ERROR = 1, 71 XMLWF_EXIT_NOT_WELLFORMED = 2, 72 XMLWF_EXIT_OUTPUT_ERROR = 3, 73 XMLWF_EXIT_USAGE_ERROR = 4, 74 }; 75 76 /* Structures for handler user data */ 77 typedef struct NotationList { 78 struct NotationList *next; 79 const XML_Char *notationName; 80 const XML_Char *systemId; 81 const XML_Char *publicId; 82 } NotationList; 83 84 typedef struct xmlwfUserData { 85 FILE *fp; 86 NotationList *notationListHead; 87 const XML_Char *currentDoctypeName; 88 } XmlwfUserData; 89 90 /* This ensures proper sorting. */ 91 92 #define NSSEP T('\001') 93 94 static void XMLCALL 95 characterData(void *userData, const XML_Char *s, int len) { 96 FILE *fp = ((XmlwfUserData *)userData)->fp; 97 for (; len > 0; --len, ++s) { 98 switch (*s) { 99 case T('&'): 100 fputts(T("&"), fp); 101 break; 102 case T('<'): 103 fputts(T("<"), fp); 104 break; 105 case T('>'): 106 fputts(T(">"), fp); 107 break; 108 #ifdef W3C14N 109 case 13: 110 fputts(T("
"), fp); 111 break; 112 #else 113 case T('"'): 114 fputts(T("""), fp); 115 break; 116 case 9: 117 case 10: 118 case 13: 119 ftprintf(fp, T("&#%d;"), *s); 120 break; 121 #endif 122 default: 123 puttc(*s, fp); 124 break; 125 } 126 } 127 } 128 129 static void 130 attributeValue(FILE *fp, const XML_Char *s) { 131 puttc(T('='), fp); 132 puttc(T('"'), fp); 133 assert(s); 134 for (;;) { 135 switch (*s) { 136 case 0: 137 case NSSEP: 138 puttc(T('"'), fp); 139 return; 140 case T('&'): 141 fputts(T("&"), fp); 142 break; 143 case T('<'): 144 fputts(T("<"), fp); 145 break; 146 case T('"'): 147 fputts(T("""), fp); 148 break; 149 #ifdef W3C14N 150 case 9: 151 fputts(T("	"), fp); 152 break; 153 case 10: 154 fputts(T("
"), fp); 155 break; 156 case 13: 157 fputts(T("
"), fp); 158 break; 159 #else 160 case T('>'): 161 fputts(T(">"), fp); 162 break; 163 case 9: 164 case 10: 165 case 13: 166 ftprintf(fp, T("&#%d;"), *s); 167 break; 168 #endif 169 default: 170 puttc(*s, fp); 171 break; 172 } 173 s++; 174 } 175 } 176 177 /* Lexicographically comparing UTF-8 encoded attribute values, 178 is equivalent to lexicographically comparing based on the character number. */ 179 180 static int 181 attcmp(const void *att1, const void *att2) { 182 return tcscmp(*(const XML_Char *const *)att1, *(const XML_Char *const *)att2); 183 } 184 185 static void XMLCALL 186 startElement(void *userData, const XML_Char *name, const XML_Char **atts) { 187 int nAtts; 188 const XML_Char **p; 189 FILE *fp = ((XmlwfUserData *)userData)->fp; 190 puttc(T('<'), fp); 191 fputts(name, fp); 192 193 p = atts; 194 while (*p) 195 ++p; 196 nAtts = (int)((p - atts) >> 1); 197 if (nAtts > 1) 198 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp); 199 while (*atts) { 200 puttc(T(' '), fp); 201 fputts(*atts++, fp); 202 attributeValue(fp, *atts); 203 atts++; 204 } 205 puttc(T('>'), fp); 206 } 207 208 static void XMLCALL 209 endElement(void *userData, const XML_Char *name) { 210 FILE *fp = ((XmlwfUserData *)userData)->fp; 211 puttc(T('<'), fp); 212 puttc(T('/'), fp); 213 fputts(name, fp); 214 puttc(T('>'), fp); 215 } 216 217 static int 218 nsattcmp(const void *p1, const void *p2) { 219 const XML_Char *att1 = *(const XML_Char *const *)p1; 220 const XML_Char *att2 = *(const XML_Char *const *)p2; 221 int sep1 = (tcsrchr(att1, NSSEP) != 0); 222 int sep2 = (tcsrchr(att2, NSSEP) != 0); 223 if (sep1 != sep2) 224 return sep1 - sep2; 225 return tcscmp(att1, att2); 226 } 227 228 static void XMLCALL 229 startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) { 230 int nAtts; 231 int nsi; 232 const XML_Char **p; 233 FILE *fp = ((XmlwfUserData *)userData)->fp; 234 const XML_Char *sep; 235 puttc(T('<'), fp); 236 237 sep = tcsrchr(name, NSSEP); 238 if (sep) { 239 
fputts(T("n1:"), fp); 240 fputts(sep + 1, fp); 241 fputts(T(" xmlns:n1"), fp); 242 attributeValue(fp, name); 243 nsi = 2; 244 } else { 245 fputts(name, fp); 246 nsi = 1; 247 } 248 249 p = atts; 250 while (*p) 251 ++p; 252 nAtts = (int)((p - atts) >> 1); 253 if (nAtts > 1) 254 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp); 255 while (*atts) { 256 name = *atts++; 257 sep = tcsrchr(name, NSSEP); 258 puttc(T(' '), fp); 259 if (sep) { 260 ftprintf(fp, T("n%d:"), nsi); 261 fputts(sep + 1, fp); 262 } else 263 fputts(name, fp); 264 attributeValue(fp, *atts); 265 if (sep) { 266 ftprintf(fp, T(" xmlns:n%d"), nsi++); 267 attributeValue(fp, name); 268 } 269 atts++; 270 } 271 puttc(T('>'), fp); 272 } 273 274 static void XMLCALL 275 endElementNS(void *userData, const XML_Char *name) { 276 FILE *fp = ((XmlwfUserData *)userData)->fp; 277 const XML_Char *sep; 278 puttc(T('<'), fp); 279 puttc(T('/'), fp); 280 sep = tcsrchr(name, NSSEP); 281 if (sep) { 282 fputts(T("n1:"), fp); 283 fputts(sep + 1, fp); 284 } else 285 fputts(name, fp); 286 puttc(T('>'), fp); 287 } 288 289 #ifndef W3C14N 290 291 static void XMLCALL 292 processingInstruction(void *userData, const XML_Char *target, 293 const XML_Char *data) { 294 FILE *fp = ((XmlwfUserData *)userData)->fp; 295 puttc(T('<'), fp); 296 puttc(T('?'), fp); 297 fputts(target, fp); 298 puttc(T(' '), fp); 299 fputts(data, fp); 300 puttc(T('?'), fp); 301 puttc(T('>'), fp); 302 } 303 304 static XML_Char * 305 xcsdup(const XML_Char *s) { 306 XML_Char *result; 307 int count = 0; 308 int numBytes; 309 310 /* Get the length of the string, including terminator */ 311 while (s[count++] != 0) { 312 /* Do nothing */ 313 } 314 numBytes = count * sizeof(XML_Char); 315 result = malloc(numBytes); 316 if (result == NULL) 317 return NULL; 318 memcpy(result, s, numBytes); 319 return result; 320 } 321 322 static void XMLCALL 323 startDoctypeDecl(void *userData, const XML_Char *doctypeName, 324 const XML_Char *sysid, const XML_Char *publid, 325 int 
has_internal_subset) { 326 XmlwfUserData *data = (XmlwfUserData *)userData; 327 UNUSED_P(sysid); 328 UNUSED_P(publid); 329 UNUSED_P(has_internal_subset); 330 data->currentDoctypeName = xcsdup(doctypeName); 331 } 332 333 static void 334 freeNotations(XmlwfUserData *data) { 335 NotationList *notationListHead = data->notationListHead; 336 337 while (notationListHead != NULL) { 338 NotationList *next = notationListHead->next; 339 free((void *)notationListHead->notationName); 340 free((void *)notationListHead->systemId); 341 free((void *)notationListHead->publicId); 342 free(notationListHead); 343 notationListHead = next; 344 } 345 data->notationListHead = NULL; 346 } 347 348 static void 349 cleanupUserData(XmlwfUserData *userData) { 350 free((void *)userData->currentDoctypeName); 351 userData->currentDoctypeName = NULL; 352 freeNotations(userData); 353 } 354 355 static int 356 xcscmp(const XML_Char *xs, const XML_Char *xt) { 357 while (*xs != 0 && *xt != 0) { 358 if (*xs < *xt) 359 return -1; 360 if (*xs > *xt) 361 return 1; 362 xs++; 363 xt++; 364 } 365 if (*xs < *xt) 366 return -1; 367 if (*xs > *xt) 368 return 1; 369 return 0; 370 } 371 372 static int 373 notationCmp(const void *a, const void *b) { 374 const NotationList *const n1 = *(const NotationList *const *)a; 375 const NotationList *const n2 = *(const NotationList *const *)b; 376 377 return xcscmp(n1->notationName, n2->notationName); 378 } 379 380 static void XMLCALL 381 endDoctypeDecl(void *userData) { 382 XmlwfUserData *data = (XmlwfUserData *)userData; 383 NotationList **notations; 384 int notationCount = 0; 385 NotationList *p; 386 int i; 387 388 /* How many notations do we have? 
*/ 389 for (p = data->notationListHead; p != NULL; p = p->next) 390 notationCount++; 391 if (notationCount == 0) { 392 /* Nothing to report */ 393 free((void *)data->currentDoctypeName); 394 data->currentDoctypeName = NULL; 395 return; 396 } 397 398 notations = malloc(notationCount * sizeof(NotationList *)); 399 if (notations == NULL) { 400 fprintf(stderr, "Unable to sort notations"); 401 freeNotations(data); 402 return; 403 } 404 405 for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) { 406 notations[i] = p; 407 } 408 qsort(notations, notationCount, sizeof(NotationList *), notationCmp); 409 410 /* Output the DOCTYPE header */ 411 fputts(T("<!DOCTYPE "), data->fp); 412 fputts(data->currentDoctypeName, data->fp); 413 fputts(T(" [\n"), data->fp); 414 415 /* Now the NOTATIONs */ 416 for (i = 0; i < notationCount; i++) { 417 fputts(T("<!NOTATION "), data->fp); 418 fputts(notations[i]->notationName, data->fp); 419 if (notations[i]->publicId != NULL) { 420 fputts(T(" PUBLIC '"), data->fp); 421 fputts(notations[i]->publicId, data->fp); 422 puttc(T('\''), data->fp); 423 if (notations[i]->systemId != NULL) { 424 puttc(T(' '), data->fp); 425 puttc(T('\''), data->fp); 426 fputts(notations[i]->systemId, data->fp); 427 puttc(T('\''), data->fp); 428 } 429 } else if (notations[i]->systemId != NULL) { 430 fputts(T(" SYSTEM '"), data->fp); 431 fputts(notations[i]->systemId, data->fp); 432 puttc(T('\''), data->fp); 433 } 434 puttc(T('>'), data->fp); 435 puttc(T('\n'), data->fp); 436 } 437 438 /* Finally end the DOCTYPE */ 439 fputts(T("]>\n"), data->fp); 440 441 free(notations); 442 freeNotations(data); 443 free((void *)data->currentDoctypeName); 444 data->currentDoctypeName = NULL; 445 } 446 447 static void XMLCALL 448 notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base, 449 const XML_Char *systemId, const XML_Char *publicId) { 450 XmlwfUserData *data = (XmlwfUserData *)userData; 451 NotationList *entry = 
malloc(sizeof(NotationList)); 452 const char *errorMessage = "Unable to store NOTATION for output\n"; 453 454 UNUSED_P(base); 455 if (entry == NULL) { 456 fputs(errorMessage, stderr); 457 return; /* Nothing we can really do about this */ 458 } 459 entry->notationName = xcsdup(notationName); 460 if (entry->notationName == NULL) { 461 fputs(errorMessage, stderr); 462 free(entry); 463 return; 464 } 465 if (systemId != NULL) { 466 entry->systemId = xcsdup(systemId); 467 if (entry->systemId == NULL) { 468 fputs(errorMessage, stderr); 469 free((void *)entry->notationName); 470 free(entry); 471 return; 472 } 473 } else { 474 entry->systemId = NULL; 475 } 476 if (publicId != NULL) { 477 entry->publicId = xcsdup(publicId); 478 if (entry->publicId == NULL) { 479 fputs(errorMessage, stderr); 480 free((void *)entry->systemId); /* Safe if it's NULL */ 481 free((void *)entry->notationName); 482 free(entry); 483 return; 484 } 485 } else { 486 entry->publicId = NULL; 487 } 488 489 entry->next = data->notationListHead; 490 data->notationListHead = entry; 491 } 492 493 #endif /* not W3C14N */ 494 495 static void XMLCALL 496 defaultCharacterData(void *userData, const XML_Char *s, int len) { 497 UNUSED_P(s); 498 UNUSED_P(len); 499 XML_DefaultCurrent((XML_Parser)userData); 500 } 501 502 static void XMLCALL 503 defaultStartElement(void *userData, const XML_Char *name, 504 const XML_Char **atts) { 505 UNUSED_P(name); 506 UNUSED_P(atts); 507 XML_DefaultCurrent((XML_Parser)userData); 508 } 509 510 static void XMLCALL 511 defaultEndElement(void *userData, const XML_Char *name) { 512 UNUSED_P(name); 513 XML_DefaultCurrent((XML_Parser)userData); 514 } 515 516 static void XMLCALL 517 defaultProcessingInstruction(void *userData, const XML_Char *target, 518 const XML_Char *data) { 519 UNUSED_P(target); 520 UNUSED_P(data); 521 XML_DefaultCurrent((XML_Parser)userData); 522 } 523 524 static void XMLCALL 525 nopCharacterData(void *userData, const XML_Char *s, int len) { 526 UNUSED_P(userData); 527 
UNUSED_P(s); 528 UNUSED_P(len); 529 } 530 531 static void XMLCALL 532 nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 533 UNUSED_P(userData); 534 UNUSED_P(name); 535 UNUSED_P(atts); 536 } 537 538 static void XMLCALL 539 nopEndElement(void *userData, const XML_Char *name) { 540 UNUSED_P(userData); 541 UNUSED_P(name); 542 } 543 544 static void XMLCALL 545 nopProcessingInstruction(void *userData, const XML_Char *target, 546 const XML_Char *data) { 547 UNUSED_P(userData); 548 UNUSED_P(target); 549 UNUSED_P(data); 550 } 551 552 static void XMLCALL 553 markup(void *userData, const XML_Char *s, int len) { 554 FILE *fp = ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp; 555 for (; len > 0; --len, ++s) 556 puttc(*s, fp); 557 } 558 559 static void 560 metaLocation(XML_Parser parser) { 561 const XML_Char *uri = XML_GetBase(parser); 562 FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp; 563 if (uri) 564 ftprintf(fp, T(" uri=\"%s\""), uri); 565 ftprintf(fp, 566 T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"") 567 T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%") 568 T(XML_FMT_INT_MOD) T("u\""), 569 XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser), 570 XML_GetCurrentLineNumber(parser), 571 XML_GetCurrentColumnNumber(parser)); 572 } 573 574 static void 575 metaStartDocument(void *userData) { 576 fputts(T("<document>\n"), 577 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp); 578 } 579 580 static void 581 metaEndDocument(void *userData) { 582 fputts(T("</document>\n"), 583 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp); 584 } 585 586 static void XMLCALL 587 metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 588 XML_Parser parser = (XML_Parser)userData; 589 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 590 FILE *fp = data->fp; 591 const XML_Char **specifiedAttsEnd 592 = atts + XML_GetSpecifiedAttributeCount(parser); 593 
const XML_Char **idAttPtr; 594 int idAttIndex = XML_GetIdAttributeIndex(parser); 595 if (idAttIndex < 0) 596 idAttPtr = 0; 597 else 598 idAttPtr = atts + idAttIndex; 599 600 ftprintf(fp, T("<starttag name=\"%s\""), name); 601 metaLocation(parser); 602 if (*atts) { 603 fputts(T(">\n"), fp); 604 do { 605 ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]); 606 characterData(data, atts[1], (int)tcslen(atts[1])); 607 if (atts >= specifiedAttsEnd) 608 fputts(T("\" defaulted=\"yes\"/>\n"), fp); 609 else if (atts == idAttPtr) 610 fputts(T("\" id=\"yes\"/>\n"), fp); 611 else 612 fputts(T("\"/>\n"), fp); 613 } while (*(atts += 2)); 614 fputts(T("</starttag>\n"), fp); 615 } else 616 fputts(T("/>\n"), fp); 617 } 618 619 static void XMLCALL 620 metaEndElement(void *userData, const XML_Char *name) { 621 XML_Parser parser = (XML_Parser)userData; 622 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 623 FILE *fp = data->fp; 624 ftprintf(fp, T("<endtag name=\"%s\""), name); 625 metaLocation(parser); 626 fputts(T("/>\n"), fp); 627 } 628 629 static void XMLCALL 630 metaProcessingInstruction(void *userData, const XML_Char *target, 631 const XML_Char *data) { 632 XML_Parser parser = (XML_Parser)userData; 633 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser); 634 FILE *fp = usrData->fp; 635 ftprintf(fp, T("<pi target=\"%s\" data=\""), target); 636 characterData(usrData, data, (int)tcslen(data)); 637 puttc(T('"'), fp); 638 metaLocation(parser); 639 fputts(T("/>\n"), fp); 640 } 641 642 static void XMLCALL 643 metaComment(void *userData, const XML_Char *data) { 644 XML_Parser parser = (XML_Parser)userData; 645 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser); 646 FILE *fp = usrData->fp; 647 fputts(T("<comment data=\""), fp); 648 characterData(usrData, data, (int)tcslen(data)); 649 puttc(T('"'), fp); 650 metaLocation(parser); 651 fputts(T("/>\n"), fp); 652 } 653 654 static void XMLCALL 655 metaStartCdataSection(void *userData) { 656 
XML_Parser parser = (XML_Parser)userData; 657 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 658 FILE *fp = data->fp; 659 fputts(T("<startcdata"), fp); 660 metaLocation(parser); 661 fputts(T("/>\n"), fp); 662 } 663 664 static void XMLCALL 665 metaEndCdataSection(void *userData) { 666 XML_Parser parser = (XML_Parser)userData; 667 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 668 FILE *fp = data->fp; 669 fputts(T("<endcdata"), fp); 670 metaLocation(parser); 671 fputts(T("/>\n"), fp); 672 } 673 674 static void XMLCALL 675 metaCharacterData(void *userData, const XML_Char *s, int len) { 676 XML_Parser parser = (XML_Parser)userData; 677 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 678 FILE *fp = data->fp; 679 fputts(T("<chars str=\""), fp); 680 characterData(data, s, len); 681 puttc(T('"'), fp); 682 metaLocation(parser); 683 fputts(T("/>\n"), fp); 684 } 685 686 static void XMLCALL 687 metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName, 688 const XML_Char *sysid, const XML_Char *pubid, 689 int has_internal_subset) { 690 XML_Parser parser = (XML_Parser)userData; 691 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 692 FILE *fp = data->fp; 693 UNUSED_P(sysid); 694 UNUSED_P(pubid); 695 UNUSED_P(has_internal_subset); 696 ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName); 697 metaLocation(parser); 698 fputts(T("/>\n"), fp); 699 } 700 701 static void XMLCALL 702 metaEndDoctypeDecl(void *userData) { 703 XML_Parser parser = (XML_Parser)userData; 704 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 705 FILE *fp = data->fp; 706 fputts(T("<enddoctype"), fp); 707 metaLocation(parser); 708 fputts(T("/>\n"), fp); 709 } 710 711 static void XMLCALL 712 metaNotationDecl(void *userData, const XML_Char *notationName, 713 const XML_Char *base, const XML_Char *systemId, 714 const XML_Char *publicId) { 715 XML_Parser parser = (XML_Parser)userData; 716 XmlwfUserData *data = (XmlwfUserData 
*)XML_GetUserData(parser); 717 FILE *fp = data->fp; 718 UNUSED_P(base); 719 ftprintf(fp, T("<notation name=\"%s\""), notationName); 720 if (publicId) 721 ftprintf(fp, T(" public=\"%s\""), publicId); 722 if (systemId) { 723 fputts(T(" system=\""), fp); 724 characterData(data, systemId, (int)tcslen(systemId)); 725 puttc(T('"'), fp); 726 } 727 metaLocation(parser); 728 fputts(T("/>\n"), fp); 729 } 730 731 static void XMLCALL 732 metaEntityDecl(void *userData, const XML_Char *entityName, int is_param, 733 const XML_Char *value, int value_length, const XML_Char *base, 734 const XML_Char *systemId, const XML_Char *publicId, 735 const XML_Char *notationName) { 736 XML_Parser parser = (XML_Parser)userData; 737 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 738 FILE *fp = data->fp; 739 740 UNUSED_P(is_param); 741 UNUSED_P(base); 742 if (value) { 743 ftprintf(fp, T("<entity name=\"%s\""), entityName); 744 metaLocation(parser); 745 puttc(T('>'), fp); 746 characterData(data, value, value_length); 747 fputts(T("</entity/>\n"), fp); 748 } else if (notationName) { 749 ftprintf(fp, T("<entity name=\"%s\""), entityName); 750 if (publicId) 751 ftprintf(fp, T(" public=\"%s\""), publicId); 752 fputts(T(" system=\""), fp); 753 characterData(data, systemId, (int)tcslen(systemId)); 754 puttc(T('"'), fp); 755 ftprintf(fp, T(" notation=\"%s\""), notationName); 756 metaLocation(parser); 757 fputts(T("/>\n"), fp); 758 } else { 759 ftprintf(fp, T("<entity name=\"%s\""), entityName); 760 if (publicId) 761 ftprintf(fp, T(" public=\"%s\""), publicId); 762 fputts(T(" system=\""), fp); 763 characterData(data, systemId, (int)tcslen(systemId)); 764 puttc(T('"'), fp); 765 metaLocation(parser); 766 fputts(T("/>\n"), fp); 767 } 768 } 769 770 static void XMLCALL 771 metaStartNamespaceDecl(void *userData, const XML_Char *prefix, 772 const XML_Char *uri) { 773 XML_Parser parser = (XML_Parser)userData; 774 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 775 FILE *fp = 
data->fp; 776 fputts(T("<startns"), fp); 777 if (prefix) 778 ftprintf(fp, T(" prefix=\"%s\""), prefix); 779 if (uri) { 780 fputts(T(" ns=\""), fp); 781 characterData(data, uri, (int)tcslen(uri)); 782 fputts(T("\"/>\n"), fp); 783 } else 784 fputts(T("/>\n"), fp); 785 } 786 787 static void XMLCALL 788 metaEndNamespaceDecl(void *userData, const XML_Char *prefix) { 789 XML_Parser parser = (XML_Parser)userData; 790 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 791 FILE *fp = data->fp; 792 if (! prefix) 793 fputts(T("<endns/>\n"), fp); 794 else 795 ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix); 796 } 797 798 static int XMLCALL 799 unknownEncodingConvert(void *data, const char *p) { 800 return codepageConvert(*(int *)data, p); 801 } 802 803 static int XMLCALL 804 unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) { 805 int cp; 806 static const XML_Char prefixL[] = T("windows-"); 807 static const XML_Char prefixU[] = T("WINDOWS-"); 808 int i; 809 810 UNUSED_P(userData); 811 for (i = 0; prefixU[i]; i++) 812 if (name[i] != prefixU[i] && name[i] != prefixL[i]) 813 return 0; 814 815 cp = 0; 816 for (; name[i]; i++) { 817 static const XML_Char digits[] = T("0123456789"); 818 const XML_Char *s = tcschr(digits, name[i]); 819 if (! s) 820 return 0; 821 cp *= 10; 822 cp += (int)(s - digits); 823 if (cp >= 0x10000) 824 return 0; 825 } 826 if (! codepageMap(cp, info->map)) 827 return 0; 828 info->convert = unknownEncodingConvert; 829 /* We could just cast the code page integer to a void *, 830 and avoid the use of release. */ 831 info->release = free; 832 info->data = malloc(sizeof(int)); 833 if (! 
info->data) 834 return 0; 835 *(int *)info->data = cp; 836 return 1; 837 } 838 839 static int XMLCALL 840 notStandalone(void *userData) { 841 UNUSED_P(userData); 842 return 0; 843 } 844 845 static void 846 showVersion(XML_Char *prog) { 847 XML_Char *s = prog; 848 XML_Char ch; 849 const XML_Feature *features = XML_GetFeatureList(); 850 while ((ch = *s) != 0) { 851 if (ch == '/' 852 #if defined(_WIN32) 853 || ch == '\\' 854 #endif 855 ) 856 prog = s + 1; 857 ++s; 858 } 859 ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion()); 860 if (features != NULL && features[0].feature != XML_FEATURE_END) { 861 int i = 1; 862 ftprintf(stdout, T("%s"), features[0].name); 863 if (features[0].value) 864 ftprintf(stdout, T("=%ld"), features[0].value); 865 while (features[i].feature != XML_FEATURE_END) { 866 ftprintf(stdout, T(", %s"), features[i].name); 867 if (features[i].value) 868 ftprintf(stdout, T("=%ld"), features[i].value); 869 ++i; 870 } 871 ftprintf(stdout, T("\n")); 872 } 873 } 874 875 #if defined(__GNUC__) 876 __attribute__((noreturn)) 877 #endif 878 static void 879 usage(const XML_Char *prog, int rc) { 880 ftprintf( 881 stderr, 882 /* Generated with: 883 * $ xmlwf/xmlwf_helpgen.sh 884 * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of 885 * xmlwf/xmlwf_helpgen.sh in here. 
886 */ 887 /* clang-format off */ 888 T("usage:\n") 889 T(" %s [OPTIONS] [FILE ...]\n") 890 T(" %s -h|--help\n") 891 T(" %s -v|--version\n") 892 T("\n") 893 T("xmlwf - Determines if an XML document is well-formed\n") 894 T("\n") 895 T("positional arguments:\n") 896 T(" FILE file to process (default: STDIN)\n") 897 T("\n") 898 T("input control arguments:\n") 899 T(" -s print an error if the document is not [s]tandalone\n") 900 T(" -n enable [n]amespace processing\n") 901 T(" -p enable processing of external DTDs and [p]arameter entities\n") 902 T(" -x enable processing of e[x]ternal entities\n") 903 T(" -e ENCODING override any in-document [e]ncoding declaration\n") 904 T(" -w enable support for [W]indows code pages\n") 905 T(" -r disable memory-mapping and use [r]ead calls instead\n") 906 T(" -g BYTES buffer size to request per call pair to XML_[G]etBuffer and read (default: 8 KiB)\n") 907 T(" -k when processing multiple files, [k]eep processing after first file with error\n") 908 T("\n") 909 T("output control arguments:\n") 910 T(" -d DIRECTORY output [d]estination directory\n") 911 T(" -c write a [c]opy of input XML, not canonical XML\n") 912 T(" -m write [m]eta XML, not canonical XML\n") 913 T(" -t write no XML output for [t]iming of plain parsing\n") 914 T(" -N enable adding doctype and [n]otation declarations\n") 915 T("\n") 916 T("billion laughs attack protection:\n") 917 T(" NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n") 918 T("\n") 919 T(" -a FACTOR set maximum tolerated [a]mplification factor (default: 100.0)\n") 920 T(" -b BYTES set number of output [b]ytes needed to activate (default: 8 MiB)\n") 921 T("\n") 922 T("reparse deferral:\n") 923 T(" -q disable reparse deferral, and allow [q]uadratic parse runtime with large tokens\n") 924 T("\n") 925 T("info arguments:\n") 926 T(" -h, --help show this [h]elp message and exit\n") 927 T(" -v, --version show program's [v]ersion number and exit\n") 928 T("\n") 
929 T("exit status:\n") 930 T(" 0 the input files are well-formed and the output (if requested) was written successfully\n") 931 T(" 1 could not allocate data structures, signals a serious problem with execution environment\n") 932 T(" 2 one or more input files were not well-formed\n") 933 T(" 3 could not create an output file\n") 934 T(" 4 command-line argument error\n") 935 T("\n") 936 T("xmlwf of libexpat is software libre, licensed under the MIT license.\n") 937 T("Please report bugs at https://github.com/libexpat/libexpat/issues -- thank you!\n") 938 , /* clang-format on */ 939 prog, prog, prog); 940 exit(rc); 941 } 942 943 #if defined(__MINGW32__) && defined(XML_UNICODE) 944 /* Silence warning about missing prototype */ 945 int wmain(int argc, XML_Char **argv); 946 #endif 947 948 #define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j) \ 949 { \ 950 if (argv[i][j + 1] == T('\0')) { \ 951 if (++i == argc) { \ 952 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); \ 953 /* usage called exit(..), never gets here */ \ 954 } \ 955 constCharStarTarget = argv[i]; \ 956 } else { \ 957 constCharStarTarget = argv[i] + j + 1; \ 958 } \ 959 i++; \ 960 j = 0; \ 961 } 962 963 int 964 tmain(int argc, XML_Char **argv) { 965 int i, j; 966 const XML_Char *outputDir = NULL; 967 const XML_Char *encoding = NULL; 968 unsigned processFlags = XML_MAP_FILE; 969 int windowsCodePages = 0; 970 int outputType = 0; 971 int useNamespaces = 0; 972 int requireStandalone = 0; 973 int requiresNotations = 0; 974 int continueOnError = 0; 975 976 float attackMaximumAmplification = -1.0f; /* signaling "not set" */ 977 unsigned long long attackThresholdBytes = 0; 978 XML_Bool attackThresholdGiven = XML_FALSE; 979 980 XML_Bool disableDeferral = XML_FALSE; 981 982 int exitCode = XMLWF_EXIT_SUCCESS; 983 enum XML_ParamEntityParsing paramEntityParsing 984 = XML_PARAM_ENTITY_PARSING_NEVER; 985 int useStdin = 0; 986 XmlwfUserData userData = {NULL, NULL, NULL}; 987 988 #ifdef _MSC_VER 989 
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); 990 #endif 991 992 i = 1; 993 j = 0; 994 while (i < argc) { 995 if (j == 0) { 996 if (argv[i][0] != T('-')) 997 break; 998 if (argv[i][1] == T('-')) { 999 if (argv[i][2] == T('\0')) { 1000 i++; 1001 break; 1002 } else if (tcscmp(argv[i] + 2, T("help")) == 0) { 1003 usage(argv[0], XMLWF_EXIT_SUCCESS); 1004 // usage called exit(..), never gets here 1005 } else if (tcscmp(argv[i] + 2, T("version")) == 0) { 1006 showVersion(argv[0]); 1007 return XMLWF_EXIT_SUCCESS; 1008 } 1009 } 1010 j++; 1011 } 1012 switch (argv[i][j]) { 1013 case T('r'): 1014 processFlags &= ~XML_MAP_FILE; 1015 j++; 1016 break; 1017 case T('s'): 1018 requireStandalone = 1; 1019 j++; 1020 break; 1021 case T('n'): 1022 useNamespaces = 1; 1023 j++; 1024 break; 1025 case T('p'): 1026 paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS; 1027 /* fall through */ 1028 case T('x'): 1029 processFlags |= XML_EXTERNAL_ENTITIES; 1030 j++; 1031 break; 1032 case T('w'): 1033 windowsCodePages = 1; 1034 j++; 1035 break; 1036 case T('m'): 1037 outputType = 'm'; 1038 j++; 1039 break; 1040 case T('c'): 1041 outputType = 'c'; 1042 useNamespaces = 0; 1043 j++; 1044 break; 1045 case T('t'): 1046 outputType = 't'; 1047 j++; 1048 break; 1049 case T('N'): 1050 requiresNotations = 1; 1051 j++; 1052 break; 1053 case T('d'): 1054 XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j); 1055 break; 1056 case T('e'): 1057 XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j); 1058 break; 1059 case T('h'): 1060 usage(argv[0], XMLWF_EXIT_SUCCESS); 1061 // usage called exit(..), never gets here 1062 case T('v'): 1063 showVersion(argv[0]); 1064 return XMLWF_EXIT_SUCCESS; 1065 case T('g'): { 1066 const XML_Char *valueText = NULL; 1067 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1068 1069 errno = 0; 1070 XML_Char *afterValueText = (XML_Char *)valueText; 1071 const long long read_size_bytes_candidate 1072 = tcstoull(valueText, &afterValueText, 10); 1073 if ((errno != 0) || 
(afterValueText[0] != T('\0')) 1074 || (read_size_bytes_candidate < 1) 1075 || (read_size_bytes_candidate > (INT_MAX / 2 + 1))) { 1076 // This prevents tperror(..) from reporting misleading "[..]: Success" 1077 errno = ERANGE; 1078 tperror(T("invalid buffer size") T( 1079 " (needs an integer from 1 to INT_MAX/2+1 i.e. 1,073,741,824 on most platforms)")); 1080 exit(XMLWF_EXIT_USAGE_ERROR); 1081 } 1082 g_read_size_bytes = (int)read_size_bytes_candidate; 1083 break; 1084 } 1085 case T('k'): 1086 continueOnError = 1; 1087 j++; 1088 break; 1089 case T('a'): { 1090 const XML_Char *valueText = NULL; 1091 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1092 1093 errno = 0; 1094 XML_Char *afterValueText = NULL; 1095 attackMaximumAmplification = tcstof(valueText, &afterValueText); 1096 if ((errno != 0) || (afterValueText[0] != T('\0')) 1097 || isnan(attackMaximumAmplification) 1098 || (attackMaximumAmplification < 1.0f)) { 1099 // This prevents tperror(..) from reporting misleading "[..]: Success" 1100 errno = ERANGE; 1101 tperror(T("invalid amplification limit") T( 1102 " (needs a floating point number greater or equal than 1.0)")); 1103 exit(XMLWF_EXIT_USAGE_ERROR); 1104 } 1105 #if XML_GE == 0 1106 ftprintf(stderr, 1107 T("Warning: Given amplification limit ignored") 1108 T(", xmlwf has been compiled without DTD/GE support.\n")); 1109 #endif 1110 break; 1111 } 1112 case T('b'): { 1113 const XML_Char *valueText = NULL; 1114 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1115 1116 errno = 0; 1117 XML_Char *afterValueText = (XML_Char *)valueText; 1118 attackThresholdBytes = tcstoull(valueText, &afterValueText, 10); 1119 if ((errno != 0) || (afterValueText[0] != T('\0'))) { 1120 // This prevents tperror(..) 
from reporting misleading "[..]: Success" 1121 errno = ERANGE; 1122 tperror(T("invalid ignore threshold") 1123 T(" (needs an integer from 0 to 2^64-1)")); 1124 exit(XMLWF_EXIT_USAGE_ERROR); 1125 } 1126 attackThresholdGiven = XML_TRUE; 1127 #if XML_GE == 0 1128 ftprintf(stderr, 1129 T("Warning: Given attack threshold ignored") 1130 T(", xmlwf has been compiled without DTD/GE support.\n")); 1131 #endif 1132 break; 1133 } 1134 case T('q'): { 1135 disableDeferral = XML_TRUE; 1136 j++; 1137 break; 1138 } 1139 case T('\0'): 1140 if (j > 1) { 1141 i++; 1142 j = 0; 1143 break; 1144 } 1145 /* fall through */ 1146 default: 1147 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); 1148 // usage called exit(..), never gets here 1149 } 1150 } 1151 if (i == argc) { 1152 useStdin = 1; 1153 processFlags &= ~XML_MAP_FILE; 1154 i--; 1155 } 1156 for (; i < argc; i++) { 1157 XML_Char *outName = 0; 1158 int result; 1159 XML_Parser parser; 1160 if (useNamespaces) 1161 parser = XML_ParserCreateNS(encoding, NSSEP); 1162 else 1163 parser = XML_ParserCreate(encoding); 1164 1165 if (! parser) { 1166 tperror(T("Could not instantiate parser")); 1167 exit(XMLWF_EXIT_INTERNAL_ERROR); 1168 } 1169 1170 if (attackMaximumAmplification != -1.0f) { 1171 #if XML_GE == 1 1172 XML_SetBillionLaughsAttackProtectionMaximumAmplification( 1173 parser, attackMaximumAmplification); 1174 #endif 1175 } 1176 if (attackThresholdGiven) { 1177 #if XML_GE == 1 1178 XML_SetBillionLaughsAttackProtectionActivationThreshold( 1179 parser, attackThresholdBytes); 1180 #else 1181 (void)attackThresholdBytes; // silence -Wunused-but-set-variable 1182 #endif 1183 } 1184 1185 if (disableDeferral) { 1186 const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE); 1187 if (! success) { 1188 // This prevents tperror(..) 
from reporting misleading "[..]: Success" 1189 errno = EINVAL; 1190 tperror(T("Failed to disable reparse deferral")); 1191 exit(XMLWF_EXIT_INTERNAL_ERROR); 1192 } 1193 } 1194 1195 if (requireStandalone) 1196 XML_SetNotStandaloneHandler(parser, notStandalone); 1197 XML_SetParamEntityParsing(parser, paramEntityParsing); 1198 if (outputType == 't') { 1199 /* This is for doing timings; this gives a more realistic estimate of 1200 the parsing time. */ 1201 outputDir = 0; 1202 XML_SetElementHandler(parser, nopStartElement, nopEndElement); 1203 XML_SetCharacterDataHandler(parser, nopCharacterData); 1204 XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction); 1205 } else if (outputDir) { 1206 const XML_Char *delim = T("/"); 1207 const XML_Char *file = useStdin ? T("STDIN") : argv[i]; 1208 if (! useStdin) { 1209 /* Jump after last (back)slash */ 1210 const XML_Char *lastDelim = tcsrchr(file, delim[0]); 1211 if (lastDelim) 1212 file = lastDelim + 1; 1213 #if defined(_WIN32) 1214 else { 1215 const XML_Char *winDelim = T("\\"); 1216 lastDelim = tcsrchr(file, winDelim[0]); 1217 if (lastDelim) { 1218 file = lastDelim + 1; 1219 delim = winDelim; 1220 } 1221 } 1222 #endif 1223 } 1224 outName = (XML_Char *)malloc((tcslen(outputDir) + tcslen(file) + 2) 1225 * sizeof(XML_Char)); 1226 if (! outName) { 1227 tperror(T("Could not allocate memory")); 1228 exit(XMLWF_EXIT_INTERNAL_ERROR); 1229 } 1230 tcscpy(outName, outputDir); 1231 tcscat(outName, delim); 1232 tcscat(outName, file); 1233 userData.fp = tfopen(outName, T("wb")); 1234 if (! 
userData.fp) { 1235 tperror(outName); 1236 exitCode = XMLWF_EXIT_OUTPUT_ERROR; 1237 free(outName); 1238 XML_ParserFree(parser); 1239 if (continueOnError) { 1240 continue; 1241 } else { 1242 break; 1243 } 1244 } 1245 setvbuf(userData.fp, NULL, _IOFBF, 16384); 1246 #ifdef XML_UNICODE 1247 puttc(0xFEFF, userData.fp); 1248 #endif 1249 XML_SetUserData(parser, &userData); 1250 switch (outputType) { 1251 case 'm': 1252 XML_UseParserAsHandlerArg(parser); 1253 XML_SetElementHandler(parser, metaStartElement, metaEndElement); 1254 XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction); 1255 XML_SetCommentHandler(parser, metaComment); 1256 XML_SetCdataSectionHandler(parser, metaStartCdataSection, 1257 metaEndCdataSection); 1258 XML_SetCharacterDataHandler(parser, metaCharacterData); 1259 XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl, 1260 metaEndDoctypeDecl); 1261 XML_SetEntityDeclHandler(parser, metaEntityDecl); 1262 XML_SetNotationDeclHandler(parser, metaNotationDecl); 1263 XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl, 1264 metaEndNamespaceDecl); 1265 metaStartDocument(parser); 1266 break; 1267 case 'c': 1268 XML_UseParserAsHandlerArg(parser); 1269 XML_SetDefaultHandler(parser, markup); 1270 XML_SetElementHandler(parser, defaultStartElement, defaultEndElement); 1271 XML_SetCharacterDataHandler(parser, defaultCharacterData); 1272 XML_SetProcessingInstructionHandler(parser, 1273 defaultProcessingInstruction); 1274 break; 1275 default: 1276 if (useNamespaces) 1277 XML_SetElementHandler(parser, startElementNS, endElementNS); 1278 else 1279 XML_SetElementHandler(parser, startElement, endElement); 1280 XML_SetCharacterDataHandler(parser, characterData); 1281 #ifndef W3C14N 1282 XML_SetProcessingInstructionHandler(parser, processingInstruction); 1283 if (requiresNotations) { 1284 XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl); 1285 XML_SetNotationDeclHandler(parser, notationDecl); 1286 } 1287 #endif /* not W3C14N */ 1288 
break; 1289 } 1290 } 1291 if (windowsCodePages) 1292 XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0); 1293 result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags); 1294 if (outputDir) { 1295 if (outputType == 'm') 1296 metaEndDocument(parser); 1297 fclose(userData.fp); 1298 if (! result) { 1299 tremove(outName); 1300 } 1301 free(outName); 1302 } 1303 XML_ParserFree(parser); 1304 if (! result) { 1305 exitCode = XMLWF_EXIT_NOT_WELLFORMED; 1306 cleanupUserData(&userData); 1307 if (! continueOnError) { 1308 break; 1309 } 1310 } 1311 } 1312 return exitCode; 1313 } 1314