1 /* 2 __ __ _ 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parser 8 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net> 11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net> 12 Copyright (c) 2004-2009 Karl Waclawek <karl@waclawek.net> 13 Copyright (c) 2005-2007 Steven Solie <steven@solie.ca> 14 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org> 15 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk> 16 Copyright (c) 2019 David Loffredo <loffredo@steptools.com> 17 Copyright (c) 2020 Joe Orton <jorton@redhat.com> 18 Copyright (c) 2020 Kleber Tarcísio <klebertarcisio@yahoo.com.br> 19 Copyright (c) 2021 Tim Bray <tbray@textuality.com> 20 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com> 21 Licensed under the MIT license: 22 23 Permission is hereby granted, free of charge, to any person obtaining 24 a copy of this software and associated documentation files (the 25 "Software"), to deal in the Software without restriction, including 26 without limitation the rights to use, copy, modify, merge, publish, 27 distribute, sublicense, and/or sell copies of the Software, and to permit 28 persons to whom the Software is furnished to do so, subject to the 29 following conditions: 30 31 The above copyright notice and this permission notice shall be included 32 in all copies or substantial portions of the Software. 33 34 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 37 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 38 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 39 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 40 USE OR OTHER DEALINGS IN THE SOFTWARE. 41 */ 42 43 #include <expat_config.h> 44 45 #include <assert.h> 46 #include <stdio.h> 47 #include <stdlib.h> 48 #include <stddef.h> 49 #include <string.h> 50 #include <math.h> /* for isnan */ 51 #include <errno.h> 52 53 #include "expat.h" 54 #include "codepage.h" 55 #include "internal.h" /* for UNUSED_P only */ 56 #include "xmlfile.h" 57 #include "xmltchar.h" 58 59 #ifdef _MSC_VER 60 # include <crtdbg.h> 61 #endif 62 63 #ifdef XML_UNICODE 64 # include <wchar.h> 65 #endif 66 67 enum ExitCode { 68 XMLWF_EXIT_SUCCESS = 0, 69 XMLWF_EXIT_INTERNAL_ERROR = 1, 70 XMLWF_EXIT_NOT_WELLFORMED = 2, 71 XMLWF_EXIT_OUTPUT_ERROR = 3, 72 XMLWF_EXIT_USAGE_ERROR = 4, 73 }; 74 75 /* Structures for handler user data */ 76 typedef struct NotationList { 77 struct NotationList *next; 78 const XML_Char *notationName; 79 const XML_Char *systemId; 80 const XML_Char *publicId; 81 } NotationList; 82 83 typedef struct xmlwfUserData { 84 FILE *fp; 85 NotationList *notationListHead; 86 const XML_Char *currentDoctypeName; 87 } XmlwfUserData; 88 89 /* This ensures proper sorting. */ 90 91 #define NSSEP T('\001') 92 93 static void XMLCALL 94 characterData(void *userData, const XML_Char *s, int len) { 95 FILE *fp = ((XmlwfUserData *)userData)->fp; 96 for (; len > 0; --len, ++s) { 97 switch (*s) { 98 case T('&'): 99 fputts(T("&"), fp); 100 break; 101 case T('<'): 102 fputts(T("<"), fp); 103 break; 104 case T('>'): 105 fputts(T(">"), fp); 106 break; 107 #ifdef W3C14N 108 case 13: 109 fputts(T("
"), fp); 110 break; 111 #else 112 case T('"'): 113 fputts(T("""), fp); 114 break; 115 case 9: 116 case 10: 117 case 13: 118 ftprintf(fp, T("&#%d;"), *s); 119 break; 120 #endif 121 default: 122 puttc(*s, fp); 123 break; 124 } 125 } 126 } 127 128 static void 129 attributeValue(FILE *fp, const XML_Char *s) { 130 puttc(T('='), fp); 131 puttc(T('"'), fp); 132 assert(s); 133 for (;;) { 134 switch (*s) { 135 case 0: 136 case NSSEP: 137 puttc(T('"'), fp); 138 return; 139 case T('&'): 140 fputts(T("&"), fp); 141 break; 142 case T('<'): 143 fputts(T("<"), fp); 144 break; 145 case T('"'): 146 fputts(T("""), fp); 147 break; 148 #ifdef W3C14N 149 case 9: 150 fputts(T("	"), fp); 151 break; 152 case 10: 153 fputts(T("
"), fp); 154 break; 155 case 13: 156 fputts(T("
"), fp); 157 break; 158 #else 159 case T('>'): 160 fputts(T(">"), fp); 161 break; 162 case 9: 163 case 10: 164 case 13: 165 ftprintf(fp, T("&#%d;"), *s); 166 break; 167 #endif 168 default: 169 puttc(*s, fp); 170 break; 171 } 172 s++; 173 } 174 } 175 176 /* Lexicographically comparing UTF-8 encoded attribute values, 177 is equivalent to lexicographically comparing based on the character number. */ 178 179 static int 180 attcmp(const void *att1, const void *att2) { 181 return tcscmp(*(const XML_Char **)att1, *(const XML_Char **)att2); 182 } 183 184 static void XMLCALL 185 startElement(void *userData, const XML_Char *name, const XML_Char **atts) { 186 int nAtts; 187 const XML_Char **p; 188 FILE *fp = ((XmlwfUserData *)userData)->fp; 189 puttc(T('<'), fp); 190 fputts(name, fp); 191 192 p = atts; 193 while (*p) 194 ++p; 195 nAtts = (int)((p - atts) >> 1); 196 if (nAtts > 1) 197 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp); 198 while (*atts) { 199 puttc(T(' '), fp); 200 fputts(*atts++, fp); 201 attributeValue(fp, *atts); 202 atts++; 203 } 204 puttc(T('>'), fp); 205 } 206 207 static void XMLCALL 208 endElement(void *userData, const XML_Char *name) { 209 FILE *fp = ((XmlwfUserData *)userData)->fp; 210 puttc(T('<'), fp); 211 puttc(T('/'), fp); 212 fputts(name, fp); 213 puttc(T('>'), fp); 214 } 215 216 static int 217 nsattcmp(const void *p1, const void *p2) { 218 const XML_Char *att1 = *(const XML_Char **)p1; 219 const XML_Char *att2 = *(const XML_Char **)p2; 220 int sep1 = (tcsrchr(att1, NSSEP) != 0); 221 int sep2 = (tcsrchr(att2, NSSEP) != 0); 222 if (sep1 != sep2) 223 return sep1 - sep2; 224 return tcscmp(att1, att2); 225 } 226 227 static void XMLCALL 228 startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) { 229 int nAtts; 230 int nsi; 231 const XML_Char **p; 232 FILE *fp = ((XmlwfUserData *)userData)->fp; 233 const XML_Char *sep; 234 puttc(T('<'), fp); 235 236 sep = tcsrchr(name, NSSEP); 237 if (sep) { 238 fputts(T("n1:"), fp); 239 fputts(sep + 1, fp); 240 fputts(T(" xmlns:n1"), fp); 241 attributeValue(fp, name); 242 nsi = 2; 243 } else { 244 fputts(name, fp); 245 nsi = 1; 246 } 247 248 p = atts; 249 while (*p) 250 ++p; 251 nAtts = (int)((p - atts) >> 1); 252 if (nAtts > 1) 253 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp); 254 while (*atts) { 255 name = *atts++; 256 sep = tcsrchr(name, NSSEP); 257 puttc(T(' '), fp); 258 if (sep) { 259 ftprintf(fp, T("n%d:"), nsi); 260 fputts(sep + 1, fp); 261 } else 262 fputts(name, fp); 263 attributeValue(fp, *atts); 264 if (sep) { 265 ftprintf(fp, T(" xmlns:n%d"), nsi++); 266 attributeValue(fp, name); 267 } 268 atts++; 269 } 270 puttc(T('>'), fp); 271 } 272 273 static void XMLCALL 274 endElementNS(void *userData, const XML_Char *name) { 275 FILE *fp = ((XmlwfUserData *)userData)->fp; 276 const XML_Char *sep; 277 puttc(T('<'), fp); 278 puttc(T('/'), fp); 279 sep = tcsrchr(name, NSSEP); 280 if (sep) { 281 fputts(T("n1:"), fp); 282 fputts(sep + 1, fp); 283 } else 284 fputts(name, fp); 285 puttc(T('>'), fp); 286 } 287 288 #ifndef W3C14N 289 290 static void XMLCALL 291 processingInstruction(void *userData, const XML_Char *target, 292 const XML_Char *data) { 293 FILE *fp = ((XmlwfUserData *)userData)->fp; 294 puttc(T('<'), fp); 295 puttc(T('?'), fp); 296 fputts(target, fp); 297 puttc(T(' '), fp); 298 fputts(data, fp); 299 puttc(T('?'), fp); 300 puttc(T('>'), fp); 301 } 302 303 static XML_Char * 304 xcsdup(const XML_Char *s) { 305 XML_Char *result; 306 int count = 0; 307 int numBytes; 308 309 /* Get the length of the string, including terminator */ 310 while (s[count++] != 0) { 311 /* Do nothing */ 312 } 313 numBytes = count * sizeof(XML_Char); 314 result = malloc(numBytes); 315 if (result == NULL) 316 return NULL; 317 memcpy(result, s, numBytes); 318 return result; 319 } 320 321 static void XMLCALL 322 startDoctypeDecl(void *userData, const XML_Char *doctypeName, 323 const XML_Char *sysid, const XML_Char *publid, 324 int has_internal_subset) { 325 XmlwfUserData *data = (XmlwfUserData *)userData; 326 UNUSED_P(sysid); 327 UNUSED_P(publid); 328 UNUSED_P(has_internal_subset); 329 data->currentDoctypeName = xcsdup(doctypeName); 330 } 331 332 static void 333 freeNotations(XmlwfUserData *data) { 334 NotationList *notationListHead = data->notationListHead; 335 336 while (notationListHead != NULL) { 337 NotationList *next = notationListHead->next; 338 free((void *)notationListHead->notationName); 339 free((void *)notationListHead->systemId); 340 free((void *)notationListHead->publicId); 341 free(notationListHead); 342 notationListHead = next; 343 } 344 data->notationListHead = NULL; 345 } 346 347 static void 348 cleanupUserData(XmlwfUserData *userData) { 349 free((void *)userData->currentDoctypeName); 350 userData->currentDoctypeName = NULL; 351 freeNotations(userData); 352 } 353 354 static int 355 xcscmp(const XML_Char *xs, const XML_Char *xt) { 356 while (*xs != 0 && *xt != 0) { 357 if (*xs < *xt) 358 return -1; 359 if (*xs > *xt) 360 return 1; 361 xs++; 362 xt++; 363 } 364 if (*xs < *xt) 365 return -1; 366 if (*xs > *xt) 367 return 1; 368 return 0; 369 } 370 371 static int 372 notationCmp(const void *a, const void *b) { 373 const NotationList *const n1 = *(NotationList **)a; 374 const NotationList *const n2 = *(NotationList **)b; 375 376 return xcscmp(n1->notationName, n2->notationName); 377 } 378 379 static void XMLCALL 380 endDoctypeDecl(void *userData) { 381 XmlwfUserData *data = (XmlwfUserData *)userData; 382 NotationList **notations; 383 int notationCount = 0; 384 NotationList *p; 385 int i; 386 387 /* How many notations do we have? */ 388 for (p = data->notationListHead; p != NULL; p = p->next) 389 notationCount++; 390 if (notationCount == 0) { 391 /* Nothing to report */ 392 free((void *)data->currentDoctypeName); 393 data->currentDoctypeName = NULL; 394 return; 395 } 396 397 notations = malloc(notationCount * sizeof(NotationList *)); 398 if (notations == NULL) { 399 fprintf(stderr, "Unable to sort notations"); 400 freeNotations(data); 401 return; 402 } 403 404 for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) { 405 notations[i] = p; 406 } 407 qsort(notations, notationCount, sizeof(NotationList *), notationCmp); 408 409 /* Output the DOCTYPE header */ 410 fputts(T("<!DOCTYPE "), data->fp); 411 fputts(data->currentDoctypeName, data->fp); 412 fputts(T(" [\n"), data->fp); 413 414 /* Now the NOTATIONs */ 415 for (i = 0; i < notationCount; i++) { 416 fputts(T("<!NOTATION "), data->fp); 417 fputts(notations[i]->notationName, data->fp); 418 if (notations[i]->publicId != NULL) { 419 fputts(T(" PUBLIC '"), data->fp); 420 fputts(notations[i]->publicId, data->fp); 421 puttc(T('\''), data->fp); 422 if (notations[i]->systemId != NULL) { 423 puttc(T(' '), data->fp); 424 puttc(T('\''), data->fp); 425 fputts(notations[i]->systemId, data->fp); 426 puttc(T('\''), data->fp); 427 } 428 } else if (notations[i]->systemId != NULL) { 429 fputts(T(" SYSTEM '"), data->fp); 430 fputts(notations[i]->systemId, data->fp); 431 puttc(T('\''), data->fp); 432 } 433 puttc(T('>'), data->fp); 434 puttc(T('\n'), data->fp); 435 } 436 437 /* Finally end the DOCTYPE */ 438 fputts(T("]>\n"), data->fp); 439 440 free(notations); 441 freeNotations(data); 442 free((void *)data->currentDoctypeName); 443 data->currentDoctypeName = NULL; 444 } 445 446 static void XMLCALL 447 notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base, 448 const XML_Char *systemId, const XML_Char *publicId) { 449 XmlwfUserData *data = (XmlwfUserData *)userData; 450 NotationList *entry = malloc(sizeof(NotationList)); 451 const char *errorMessage = "Unable to store NOTATION for output\n"; 452 453 UNUSED_P(base); 454 if (entry == NULL) { 455 fputs(errorMessage, stderr); 456 return; /* Nothing we can really do about this */ 457 } 458 entry->notationName = xcsdup(notationName); 459 if (entry->notationName == NULL) { 460 fputs(errorMessage, stderr); 461 free(entry); 462 return; 463 } 464 if (systemId != NULL) { 465 entry->systemId = xcsdup(systemId); 466 if (entry->systemId == NULL) { 467 fputs(errorMessage, stderr); 468 free((void *)entry->notationName); 469 free(entry); 470 return; 471 } 472 } else { 473 entry->systemId = NULL; 474 } 475 if (publicId != NULL) { 476 entry->publicId = xcsdup(publicId); 477 if (entry->publicId == NULL) { 478 fputs(errorMessage, stderr); 479 free((void *)entry->systemId); /* Safe if it's NULL */ 480 free((void *)entry->notationName); 481 free(entry); 482 return; 483 } 484 } else { 485 entry->publicId = NULL; 486 } 487 488 entry->next = data->notationListHead; 489 data->notationListHead = entry; 490 } 491 492 #endif /* not W3C14N */ 493 494 static void XMLCALL 495 defaultCharacterData(void *userData, const XML_Char *s, int len) { 496 UNUSED_P(s); 497 UNUSED_P(len); 498 XML_DefaultCurrent((XML_Parser)userData); 499 } 500 501 static void XMLCALL 502 defaultStartElement(void *userData, const XML_Char *name, 503 const XML_Char **atts) { 504 UNUSED_P(name); 505 UNUSED_P(atts); 506 XML_DefaultCurrent((XML_Parser)userData); 507 } 508 509 static void XMLCALL 510 defaultEndElement(void *userData, const XML_Char *name) { 511 UNUSED_P(name); 512 XML_DefaultCurrent((XML_Parser)userData); 513 } 514 515 static void XMLCALL 516 defaultProcessingInstruction(void *userData, const XML_Char *target, 517 const XML_Char *data) { 518 UNUSED_P(target); 519 UNUSED_P(data); 520 XML_DefaultCurrent((XML_Parser)userData); 521 } 522 523 static void XMLCALL 524 nopCharacterData(void *userData, const XML_Char *s, int len) { 525 UNUSED_P(userData); 526 UNUSED_P(s); 527 UNUSED_P(len); 528 } 529 530 static void XMLCALL 531 nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 532 UNUSED_P(userData); 533 UNUSED_P(name); 534 UNUSED_P(atts); 535 } 536 537 static void XMLCALL 538 nopEndElement(void *userData, const XML_Char *name) { 539 UNUSED_P(userData); 540 UNUSED_P(name); 541 } 542 543 static void XMLCALL 544 nopProcessingInstruction(void *userData, const XML_Char *target, 545 const XML_Char *data) { 546 UNUSED_P(userData); 547 UNUSED_P(target); 548 UNUSED_P(data); 549 } 550 551 static void XMLCALL 552 markup(void *userData, const XML_Char *s, int len) { 553 FILE *fp = ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp; 554 for (; len > 0; --len, ++s) 555 puttc(*s, fp); 556 } 557 558 static void 559 metaLocation(XML_Parser parser) { 560 const XML_Char *uri = XML_GetBase(parser); 561 FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp; 562 if (uri) 563 ftprintf(fp, T(" uri=\"%s\""), uri); 564 ftprintf(fp, 565 T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"") 566 T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%") 567 T(XML_FMT_INT_MOD) T("u\""), 568 XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser), 569 XML_GetCurrentLineNumber(parser), 570 XML_GetCurrentColumnNumber(parser)); 571 } 572 573 static void 574 metaStartDocument(void *userData) { 575 fputts(T("<document>\n"), 576 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp); 577 } 578 579 static void 580 metaEndDocument(void *userData) { 581 fputts(T("</document>\n"), 582 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp); 583 } 584 585 static void XMLCALL 586 metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 587 XML_Parser parser = (XML_Parser)userData; 588 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 589 FILE *fp = data->fp; 590 const XML_Char **specifiedAttsEnd 591 = atts + XML_GetSpecifiedAttributeCount(parser); 592 const XML_Char **idAttPtr; 593 int idAttIndex = XML_GetIdAttributeIndex(parser); 594 if (idAttIndex < 0) 595 idAttPtr = 0; 596 else 597 idAttPtr = atts + idAttIndex; 598 599 ftprintf(fp, T("<starttag name=\"%s\""), name); 600 metaLocation(parser); 601 if (*atts) { 602 fputts(T(">\n"), fp); 603 do { 604 ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]); 605 characterData(data, atts[1], (int)tcslen(atts[1])); 606 if (atts >= specifiedAttsEnd) 607 fputts(T("\" defaulted=\"yes\"/>\n"), fp); 608 else if (atts == idAttPtr) 609 fputts(T("\" id=\"yes\"/>\n"), fp); 610 else 611 fputts(T("\"/>\n"), fp); 612 } while (*(atts += 2)); 613 fputts(T("</starttag>\n"), fp); 614 } else 615 fputts(T("/>\n"), fp); 616 } 617 618 static void XMLCALL 619 metaEndElement(void *userData, const XML_Char *name) { 620 XML_Parser parser = (XML_Parser)userData; 621 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 622 FILE *fp = data->fp; 623 ftprintf(fp, T("<endtag name=\"%s\""), name); 624 metaLocation(parser); 625 fputts(T("/>\n"), fp); 626 } 627 628 static void XMLCALL 629 metaProcessingInstruction(void *userData, const XML_Char *target, 630 const XML_Char *data) { 631 XML_Parser parser = (XML_Parser)userData; 632 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser); 633 FILE *fp = usrData->fp; 634 ftprintf(fp, T("<pi target=\"%s\" data=\""), target); 635 characterData(usrData, data, (int)tcslen(data)); 636 puttc(T('"'), fp); 637 metaLocation(parser); 638 fputts(T("/>\n"), fp); 639 } 640 641 static void XMLCALL 642 metaComment(void *userData, const XML_Char *data) { 643 XML_Parser parser = (XML_Parser)userData; 644 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser); 645 FILE *fp = usrData->fp; 646 fputts(T("<comment data=\""), fp); 647 characterData(usrData, data, (int)tcslen(data)); 648 puttc(T('"'), fp); 649 metaLocation(parser); 650 fputts(T("/>\n"), fp); 651 } 652 653 static void XMLCALL 654 metaStartCdataSection(void *userData) { 655 XML_Parser parser = (XML_Parser)userData; 656 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 657 FILE *fp = data->fp; 658 fputts(T("<startcdata"), fp); 659 metaLocation(parser); 660 fputts(T("/>\n"), fp); 661 } 662 663 static void XMLCALL 664 metaEndCdataSection(void *userData) { 665 XML_Parser parser = (XML_Parser)userData; 666 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 667 FILE *fp = data->fp; 668 fputts(T("<endcdata"), fp); 669 metaLocation(parser); 670 fputts(T("/>\n"), fp); 671 } 672 673 static void XMLCALL 674 metaCharacterData(void *userData, const XML_Char *s, int len) { 675 XML_Parser parser = (XML_Parser)userData; 676 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 677 FILE *fp = data->fp; 678 fputts(T("<chars str=\""), fp); 679 characterData(data, s, len); 680 puttc(T('"'), fp); 681 metaLocation(parser); 682 fputts(T("/>\n"), fp); 683 } 684 685 static void XMLCALL 686 metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName, 687 const XML_Char *sysid, const XML_Char *pubid, 688 int has_internal_subset) { 689 XML_Parser parser = (XML_Parser)userData; 690 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 691 FILE *fp = data->fp; 692 UNUSED_P(sysid); 693 UNUSED_P(pubid); 694 UNUSED_P(has_internal_subset); 695 ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName); 696 metaLocation(parser); 697 fputts(T("/>\n"), fp); 698 } 699 700 static void XMLCALL 701 metaEndDoctypeDecl(void *userData) { 702 XML_Parser parser = (XML_Parser)userData; 703 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 704 FILE *fp = data->fp; 705 fputts(T("<enddoctype"), fp); 706 metaLocation(parser); 707 fputts(T("/>\n"), fp); 708 } 709 710 static void XMLCALL 711 metaNotationDecl(void *userData, const XML_Char *notationName, 712 const XML_Char *base, const XML_Char *systemId, 713 const XML_Char *publicId) { 714 XML_Parser parser = (XML_Parser)userData; 715 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 716 FILE *fp = data->fp; 717 UNUSED_P(base); 718 ftprintf(fp, T("<notation name=\"%s\""), notationName); 719 if (publicId) 720 ftprintf(fp, T(" public=\"%s\""), publicId); 721 if (systemId) { 722 fputts(T(" system=\""), fp); 723 characterData(data, systemId, (int)tcslen(systemId)); 724 puttc(T('"'), fp); 725 } 726 metaLocation(parser); 727 fputts(T("/>\n"), fp); 728 } 729 730 static void XMLCALL 731 metaEntityDecl(void *userData, const XML_Char *entityName, int is_param, 732 const XML_Char *value, int value_length, const XML_Char *base, 733 const XML_Char *systemId, const XML_Char *publicId, 734 const XML_Char *notationName) { 735 XML_Parser parser = (XML_Parser)userData; 736 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 737 FILE *fp = data->fp; 738 739 UNUSED_P(is_param); 740 UNUSED_P(base); 741 if (value) { 742 ftprintf(fp, T("<entity name=\"%s\""), entityName); 743 metaLocation(parser); 744 puttc(T('>'), fp); 745 characterData(data, value, value_length); 746 fputts(T("</entity/>\n"), fp); 747 } else if (notationName) { 748 ftprintf(fp, T("<entity name=\"%s\""), entityName); 749 if (publicId) 750 ftprintf(fp, T(" public=\"%s\""), publicId); 751 fputts(T(" system=\""), fp); 752 characterData(data, systemId, (int)tcslen(systemId)); 753 puttc(T('"'), fp); 754 ftprintf(fp, T(" notation=\"%s\""), notationName); 755 metaLocation(parser); 756 fputts(T("/>\n"), fp); 757 } else { 758 ftprintf(fp, T("<entity name=\"%s\""), entityName); 759 if (publicId) 760 ftprintf(fp, T(" public=\"%s\""), publicId); 761 fputts(T(" system=\""), fp); 762 characterData(data, systemId, (int)tcslen(systemId)); 763 puttc(T('"'), fp); 764 metaLocation(parser); 765 fputts(T("/>\n"), fp); 766 } 767 } 768 769 static void XMLCALL 770 metaStartNamespaceDecl(void *userData, const XML_Char *prefix, 771 const XML_Char *uri) { 772 XML_Parser parser = (XML_Parser)userData; 773 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 774 FILE *fp = data->fp; 775 fputts(T("<startns"), fp); 776 if (prefix) 777 ftprintf(fp, T(" prefix=\"%s\""), prefix); 778 if (uri) { 779 fputts(T(" ns=\""), fp); 780 characterData(data, uri, (int)tcslen(uri)); 781 fputts(T("\"/>\n"), fp); 782 } else 783 fputts(T("/>\n"), fp); 784 } 785 786 static void XMLCALL 787 metaEndNamespaceDecl(void *userData, const XML_Char *prefix) { 788 XML_Parser parser = (XML_Parser)userData; 789 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser); 790 FILE *fp = data->fp; 791 if (! prefix) 792 fputts(T("<endns/>\n"), fp); 793 else 794 ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix); 795 } 796 797 static int XMLCALL 798 unknownEncodingConvert(void *data, const char *p) { 799 return codepageConvert(*(int *)data, p); 800 } 801 802 static int XMLCALL 803 unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) { 804 int cp; 805 static const XML_Char prefixL[] = T("windows-"); 806 static const XML_Char prefixU[] = T("WINDOWS-"); 807 int i; 808 809 UNUSED_P(userData); 810 for (i = 0; prefixU[i]; i++) 811 if (name[i] != prefixU[i] && name[i] != prefixL[i]) 812 return 0; 813 814 cp = 0; 815 for (; name[i]; i++) { 816 static const XML_Char digits[] = T("0123456789"); 817 const XML_Char *s = tcschr(digits, name[i]); 818 if (! s) 819 return 0; 820 cp *= 10; 821 cp += (int)(s - digits); 822 if (cp >= 0x10000) 823 return 0; 824 } 825 if (! codepageMap(cp, info->map)) 826 return 0; 827 info->convert = unknownEncodingConvert; 828 /* We could just cast the code page integer to a void *, 829 and avoid the use of release. */ 830 info->release = free; 831 info->data = malloc(sizeof(int)); 832 if (! info->data) 833 return 0; 834 *(int *)info->data = cp; 835 return 1; 836 } 837 838 static int XMLCALL 839 notStandalone(void *userData) { 840 UNUSED_P(userData); 841 return 0; 842 } 843 844 static void 845 showVersion(XML_Char *prog) { 846 XML_Char *s = prog; 847 XML_Char ch; 848 const XML_Feature *features = XML_GetFeatureList(); 849 while ((ch = *s) != 0) { 850 if (ch == '/' 851 #if defined(_WIN32) 852 || ch == '\\' 853 #endif 854 ) 855 prog = s + 1; 856 ++s; 857 } 858 ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion()); 859 if (features != NULL && features[0].feature != XML_FEATURE_END) { 860 int i = 1; 861 ftprintf(stdout, T("%s"), features[0].name); 862 if (features[0].value) 863 ftprintf(stdout, T("=%ld"), features[0].value); 864 while (features[i].feature != XML_FEATURE_END) { 865 ftprintf(stdout, T(", %s"), features[i].name); 866 if (features[i].value) 867 ftprintf(stdout, T("=%ld"), features[i].value); 868 ++i; 869 } 870 ftprintf(stdout, T("\n")); 871 } 872 } 873 874 static void 875 usage(const XML_Char *prog, int rc) { 876 ftprintf( 877 stderr, 878 /* Generated with: 879 * $ xmlwf/xmlwf_helpgen.sh 880 * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of 881 * xmlwf/xmlwf_helpgen.sh in here. 882 */ 883 /* clang-format off */ 884 T("usage:\n") 885 T(" %s [OPTIONS] [FILE ...]\n") 886 T(" %s -h\n") 887 T(" %s -v\n") 888 T("\n") 889 T("xmlwf - Determines if an XML document is well-formed\n") 890 T("\n") 891 T("positional arguments:\n") 892 T(" FILE file to process (default: STDIN)\n") 893 T("\n") 894 T("input control arguments:\n") 895 T(" -s print an error if the document is not [s]tandalone\n") 896 T(" -n enable [n]amespace processing\n") 897 T(" -p enable processing external DTDs and [p]arameter entities\n") 898 T(" -x enable processing of e[x]ternal entities\n") 899 T(" -e ENCODING override any in-document [e]ncoding declaration\n") 900 T(" -w enable support for [W]indows code pages\n") 901 T(" -r disable memory-mapping and use normal file [r]ead IO calls instead\n") 902 T(" -k when processing multiple files, [k]eep processing after first file with error\n") 903 T("\n") 904 T("output control arguments:\n") 905 T(" -d DIRECTORY output [d]estination directory\n") 906 T(" -c write a [c]opy of input XML, not canonical XML\n") 907 T(" -m write [m]eta XML, not canonical XML\n") 908 T(" -t write no XML output for [t]iming of plain parsing\n") 909 T(" -N enable adding doctype and [n]otation declarations\n") 910 T("\n") 911 T("billion laughs attack protection:\n") 912 T(" NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n") 913 T("\n") 914 T(" -a FACTOR set maximum tolerated [a]mplification factor (default: 100.0)\n") 915 T(" -b BYTES set number of output [b]ytes needed to activate (default: 8 MiB)\n") 916 T("\n") 917 T("info arguments:\n") 918 T(" -h show this [h]elp message and exit\n") 919 T(" -v show program's [v]ersion number and exit\n") 920 T("\n") 921 T("exit status:\n") 922 T(" 0 the input files are well-formed and the output (if requested) was written successfully\n") 923 T(" 1 could not allocate data structures, signals a serious problem with execution environment\n") 924 T(" 2 one or more input files were not well-formed\n") 925 T(" 3 could not create an output file\n") 926 T(" 4 command-line argument error\n") 927 T("\n") 928 T("xmlwf of libexpat is software libre, licensed under the MIT license.\n") 929 T("Please report bugs at https://github.com/libexpat/libexpat/issues. Thank you!\n") 930 , /* clang-format on */ 931 prog, prog, prog); 932 exit(rc); 933 } 934 935 #if defined(__MINGW32__) && defined(XML_UNICODE) 936 /* Silence warning about missing prototype */ 937 int wmain(int argc, XML_Char **argv); 938 #endif 939 940 #define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j) \ 941 { \ 942 if (argv[i][j + 1] == T('\0')) { \ 943 if (++i == argc) \ 944 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); \ 945 constCharStarTarget = argv[i]; \ 946 } else { \ 947 constCharStarTarget = argv[i] + j + 1; \ 948 } \ 949 i++; \ 950 j = 0; \ 951 } 952 953 int 954 tmain(int argc, XML_Char **argv) { 955 int i, j; 956 const XML_Char *outputDir = NULL; 957 const XML_Char *encoding = NULL; 958 unsigned processFlags = XML_MAP_FILE; 959 int windowsCodePages = 0; 960 int outputType = 0; 961 int useNamespaces = 0; 962 int requireStandalone = 0; 963 int requiresNotations = 0; 964 int continueOnError = 0; 965 966 float attackMaximumAmplification = -1.0f; /* signaling "not set" */ 967 unsigned long long attackThresholdBytes; 968 XML_Bool attackThresholdGiven = XML_FALSE; 969 970 int exitCode = XMLWF_EXIT_SUCCESS; 971 enum XML_ParamEntityParsing paramEntityParsing 972 = XML_PARAM_ENTITY_PARSING_NEVER; 973 int useStdin = 0; 974 XmlwfUserData userData = {NULL, NULL, NULL}; 975 976 #ifdef _MSC_VER 977 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); 978 #endif 979 980 i = 1; 981 j = 0; 982 while (i < argc) { 983 if (j == 0) { 984 if (argv[i][0] != T('-')) 985 break; 986 if (argv[i][1] == T('-') && argv[i][2] == T('\0')) { 987 i++; 988 break; 989 } 990 j++; 991 } 992 switch (argv[i][j]) { 993 case T('r'): 994 processFlags &= ~XML_MAP_FILE; 995 j++; 996 break; 997 case T('s'): 998 requireStandalone = 1; 999 j++; 1000 break; 1001 case T('n'): 1002 useNamespaces = 1; 1003 j++; 1004 break; 1005 case T('p'): 1006 paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS; 1007 /* fall through */ 1008 case T('x'): 1009 processFlags |= XML_EXTERNAL_ENTITIES; 1010 j++; 1011 break; 1012 case T('w'): 1013 windowsCodePages = 1; 1014 j++; 1015 break; 1016 case T('m'): 1017 outputType = 'm'; 1018 j++; 1019 break; 1020 case T('c'): 1021 outputType = 'c'; 1022 useNamespaces = 0; 1023 j++; 1024 break; 1025 case T('t'): 1026 outputType = 't'; 1027 j++; 1028 break; 1029 case T('N'): 1030 requiresNotations = 1; 1031 j++; 1032 break; 1033 case T('d'): 1034 XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j); 1035 break; 1036 case T('e'): 1037 XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j); 1038 break; 1039 case T('h'): 1040 usage(argv[0], XMLWF_EXIT_SUCCESS); 1041 return 0; 1042 case T('v'): 1043 showVersion(argv[0]); 1044 return 0; 1045 case T('k'): 1046 continueOnError = 1; 1047 j++; 1048 break; 1049 case T('a'): { 1050 const XML_Char *valueText = NULL; 1051 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1052 1053 errno = 0; 1054 XML_Char *afterValueText = (XML_Char *)valueText; 1055 attackMaximumAmplification = tcstof(valueText, &afterValueText); 1056 if ((errno != 0) || (afterValueText[0] != T('\0')) 1057 || isnan(attackMaximumAmplification) 1058 || (attackMaximumAmplification < 1.0f)) { 1059 // This prevents tperror(..) from reporting misleading "[..]: Success" 1060 errno = ERANGE; 1061 tperror(T("invalid amplification limit") T( 1062 " (needs a floating point number greater or equal than 1.0)")); 1063 exit(XMLWF_EXIT_USAGE_ERROR); 1064 } 1065 #ifndef XML_DTD 1066 ftprintf(stderr, T("Warning: Given amplification limit ignored") T( 1067 ", xmlwf has been compiled without DTD support.\n")); 1068 #endif 1069 break; 1070 } 1071 case T('b'): { 1072 const XML_Char *valueText = NULL; 1073 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1074 1075 errno = 0; 1076 XML_Char *afterValueText = (XML_Char *)valueText; 1077 attackThresholdBytes = tcstoull(valueText, &afterValueText, 10); 1078 if ((errno != 0) || (afterValueText[0] != T('\0'))) { 1079 // This prevents tperror(..) from reporting misleading "[..]: Success" 1080 errno = ERANGE; 1081 tperror(T("invalid ignore threshold") 1082 T(" (needs an integer from 0 to 2^64-1)")); 1083 exit(XMLWF_EXIT_USAGE_ERROR); 1084 } 1085 attackThresholdGiven = XML_TRUE; 1086 #ifndef XML_DTD 1087 ftprintf(stderr, T("Warning: Given attack threshold ignored") T( 1088 ", xmlwf has been compiled without DTD support.\n")); 1089 #endif 1090 break; 1091 } 1092 case T('\0'): 1093 if (j > 1) { 1094 i++; 1095 j = 0; 1096 break; 1097 } 1098 /* fall through */ 1099 default: 1100 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); 1101 } 1102 } 1103 if (i == argc) { 1104 useStdin = 1; 1105 processFlags &= ~XML_MAP_FILE; 1106 i--; 1107 } 1108 for (; i < argc; i++) { 1109 XML_Char *outName = 0; 1110 int result; 1111 XML_Parser parser; 1112 if (useNamespaces) 1113 parser = XML_ParserCreateNS(encoding, NSSEP); 1114 else 1115 parser = XML_ParserCreate(encoding); 1116 1117 if (! parser) { 1118 tperror(T("Could not instantiate parser")); 1119 exit(XMLWF_EXIT_INTERNAL_ERROR); 1120 } 1121 1122 if (attackMaximumAmplification != -1.0f) { 1123 #ifdef XML_DTD 1124 XML_SetBillionLaughsAttackProtectionMaximumAmplification( 1125 parser, attackMaximumAmplification); 1126 #endif 1127 } 1128 if (attackThresholdGiven) { 1129 #ifdef XML_DTD 1130 XML_SetBillionLaughsAttackProtectionActivationThreshold( 1131 parser, attackThresholdBytes); 1132 #else 1133 (void)attackThresholdBytes; // silence -Wunused-but-set-variable 1134 #endif 1135 } 1136 1137 if (requireStandalone) 1138 XML_SetNotStandaloneHandler(parser, notStandalone); 1139 XML_SetParamEntityParsing(parser, paramEntityParsing); 1140 if (outputType == 't') { 1141 /* This is for doing timings; this gives a more realistic estimate of 1142 the parsing time. */ 1143 outputDir = 0; 1144 XML_SetElementHandler(parser, nopStartElement, nopEndElement); 1145 XML_SetCharacterDataHandler(parser, nopCharacterData); 1146 XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction); 1147 } else if (outputDir) { 1148 const XML_Char *delim = T("/"); 1149 const XML_Char *file = useStdin ? T("STDIN") : argv[i]; 1150 if (! useStdin) { 1151 /* Jump after last (back)slash */ 1152 const XML_Char *lastDelim = tcsrchr(file, delim[0]); 1153 if (lastDelim) 1154 file = lastDelim + 1; 1155 #if defined(_WIN32) 1156 else { 1157 const XML_Char *winDelim = T("\\"); 1158 lastDelim = tcsrchr(file, winDelim[0]); 1159 if (lastDelim) { 1160 file = lastDelim + 1; 1161 delim = winDelim; 1162 } 1163 } 1164 #endif 1165 } 1166 outName = (XML_Char *)malloc((tcslen(outputDir) + tcslen(file) + 2) 1167 * sizeof(XML_Char)); 1168 if (! outName) { 1169 tperror(T("Could not allocate memory")); 1170 exit(XMLWF_EXIT_INTERNAL_ERROR); 1171 } 1172 tcscpy(outName, outputDir); 1173 tcscat(outName, delim); 1174 tcscat(outName, file); 1175 userData.fp = tfopen(outName, T("wb")); 1176 if (! userData.fp) { 1177 tperror(outName); 1178 exitCode = XMLWF_EXIT_OUTPUT_ERROR; 1179 free(outName); 1180 XML_ParserFree(parser); 1181 if (continueOnError) { 1182 continue; 1183 } else { 1184 break; 1185 } 1186 } 1187 setvbuf(userData.fp, NULL, _IOFBF, 16384); 1188 #ifdef XML_UNICODE 1189 puttc(0xFEFF, userData.fp); 1190 #endif 1191 XML_SetUserData(parser, &userData); 1192 switch (outputType) { 1193 case 'm': 1194 XML_UseParserAsHandlerArg(parser); 1195 XML_SetElementHandler(parser, metaStartElement, metaEndElement); 1196 XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction); 1197 XML_SetCommentHandler(parser, metaComment); 1198 XML_SetCdataSectionHandler(parser, metaStartCdataSection, 1199 metaEndCdataSection); 1200 XML_SetCharacterDataHandler(parser, metaCharacterData); 1201 XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl, 1202 metaEndDoctypeDecl); 1203 XML_SetEntityDeclHandler(parser, metaEntityDecl); 1204 XML_SetNotationDeclHandler(parser, metaNotationDecl); 1205 XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl, 1206 metaEndNamespaceDecl); 1207 metaStartDocument(parser); 1208 break; 1209 case 'c': 1210 XML_UseParserAsHandlerArg(parser); 1211 XML_SetDefaultHandler(parser, markup); 1212 XML_SetElementHandler(parser, defaultStartElement, defaultEndElement); 1213 XML_SetCharacterDataHandler(parser, defaultCharacterData); 1214 XML_SetProcessingInstructionHandler(parser, 1215 defaultProcessingInstruction); 1216 break; 1217 default: 1218 if (useNamespaces) 1219 XML_SetElementHandler(parser, startElementNS, endElementNS); 1220 else 1221 XML_SetElementHandler(parser, startElement, endElement); 1222 XML_SetCharacterDataHandler(parser, characterData); 1223 #ifndef W3C14N 1224 XML_SetProcessingInstructionHandler(parser, processingInstruction); 1225 if (requiresNotations) { 1226 XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl); 1227 XML_SetNotationDeclHandler(parser, notationDecl); 1228 } 1229 #endif /* not W3C14N */ 1230 break; 1231 } 1232 } 1233 if (windowsCodePages) 1234 XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0); 1235 result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags); 1236 if (outputDir) { 1237 if (outputType == 'm') 1238 metaEndDocument(parser); 1239 fclose(userData.fp); 1240 if (! result) { 1241 tremove(outName); 1242 } 1243 free(outName); 1244 } 1245 XML_ParserFree(parser); 1246 if (! result) { 1247 exitCode = XMLWF_EXIT_NOT_WELLFORMED; 1248 cleanupUserData(&userData); 1249 if (! continueOnError) { 1250 break; 1251 } 1252 } 1253 } 1254 return exitCode; 1255 } 1256