xref: /freebsd/contrib/expat/xmlwf/xmlwf.c (revision 3705d679a6344c957cae7a1b6372a8bfb8c44f0e)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2004-2009 Karl Waclawek <karl@waclawek.net>
13    Copyright (c) 2005-2007 Steven Solie <steven@solie.ca>
14    Copyright (c) 2016-2023 Sebastian Pipping <sebastian@pipping.org>
15    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
16    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
17    Copyright (c) 2020      Joe Orton <jorton@redhat.com>
18    Copyright (c) 2020      Kleber Tarcísio <klebertarcisio@yahoo.com.br>
19    Copyright (c) 2021      Tim Bray <tbray@textuality.com>
20    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
21    Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
22    Licensed under the MIT license:
23 
24    Permission is  hereby granted,  free of charge,  to any  person obtaining
25    a  copy  of  this  software   and  associated  documentation  files  (the
26    "Software"),  to  deal in  the  Software  without restriction,  including
27    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
28    distribute, sublicense, and/or sell copies of the Software, and to permit
29    persons  to whom  the Software  is  furnished to  do so,  subject to  the
30    following conditions:
31 
32    The above copyright  notice and this permission notice  shall be included
33    in all copies or substantial portions of the Software.
34 
35    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
36    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
37    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
38    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
39    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
40    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
41    USE OR OTHER DEALINGS IN THE SOFTWARE.
42 */
43 
44 #include "expat_config.h"
45 
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <stddef.h>
50 #include <string.h>
51 #include <math.h> /* for isnan */
52 #include <errno.h>
53 
54 #include "expat.h"
55 #include "codepage.h"
56 #include "internal.h" /* for UNUSED_P only */
57 #include "xmlfile.h"
58 #include "xmltchar.h"
59 
60 #ifdef _MSC_VER
61 #  include <crtdbg.h>
62 #endif
63 
64 #ifdef XML_UNICODE
65 #  include <wchar.h>
66 #endif
67 
/* Process exit statuses used by xmlwf; the meanings below are the ones
   listed in the "exit status" section of the help text printed by usage(). */
enum ExitCode {
  XMLWF_EXIT_SUCCESS = 0,        /* input well-formed, output (if any) written */
  XMLWF_EXIT_INTERNAL_ERROR = 1, /* could not allocate data structures */
  XMLWF_EXIT_NOT_WELLFORMED = 2, /* one or more inputs were not well-formed */
  XMLWF_EXIT_OUTPUT_ERROR = 3,   /* could not create an output file */
  XMLWF_EXIT_USAGE_ERROR = 4,    /* command-line argument error */
};
75 
/* Structures for handler user data */

/* One NOTATION declaration recorded by notationDecl().  Entries form a
   singly linked list; the three strings are heap copies made with xcsdup()
   and released by freeNotations(). */
typedef struct NotationList {
  struct NotationList *next;    /* next entry, or NULL at the end of the list */
  const XML_Char *notationName; /* never NULL for an entry that was stored */
  const XML_Char *systemId;     /* NULL when no system identifier was given */
  const XML_Char *publicId;     /* NULL when no public identifier was given */
} NotationList;
83 
/* State shared by the output-producing handlers in this file. */
typedef struct xmlwfUserData {
  FILE *fp;                       /* stream the handlers write their output to */
  NotationList *notationListHead; /* notations collected by notationDecl() */
  /* Heap copy of the doctype name made by startDoctypeDecl(); NULL when no
     name is currently stored. */
  const XML_Char *currentDoctypeName;
} XmlwfUserData;
89 
/* Character the namespace-aware handlers search for to split a name into
   the part printed as the xmlns value and the local part that follows it.
   Its low value ensures proper sorting. */

#define NSSEP T('\001')
93 
94 static void XMLCALL
95 characterData(void *userData, const XML_Char *s, int len) {
96   FILE *fp = ((XmlwfUserData *)userData)->fp;
97   for (; len > 0; --len, ++s) {
98     switch (*s) {
99     case T('&'):
100       fputts(T("&amp;"), fp);
101       break;
102     case T('<'):
103       fputts(T("&lt;"), fp);
104       break;
105     case T('>'):
106       fputts(T("&gt;"), fp);
107       break;
108 #ifdef W3C14N
109     case 13:
110       fputts(T("&#xD;"), fp);
111       break;
112 #else
113     case T('"'):
114       fputts(T("&quot;"), fp);
115       break;
116     case 9:
117     case 10:
118     case 13:
119       ftprintf(fp, T("&#%d;"), *s);
120       break;
121 #endif
122     default:
123       puttc(*s, fp);
124       break;
125     }
126   }
127 }
128 
129 static void
130 attributeValue(FILE *fp, const XML_Char *s) {
131   puttc(T('='), fp);
132   puttc(T('"'), fp);
133   assert(s);
134   for (;;) {
135     switch (*s) {
136     case 0:
137     case NSSEP:
138       puttc(T('"'), fp);
139       return;
140     case T('&'):
141       fputts(T("&amp;"), fp);
142       break;
143     case T('<'):
144       fputts(T("&lt;"), fp);
145       break;
146     case T('"'):
147       fputts(T("&quot;"), fp);
148       break;
149 #ifdef W3C14N
150     case 9:
151       fputts(T("&#x9;"), fp);
152       break;
153     case 10:
154       fputts(T("&#xA;"), fp);
155       break;
156     case 13:
157       fputts(T("&#xD;"), fp);
158       break;
159 #else
160     case T('>'):
161       fputts(T("&gt;"), fp);
162       break;
163     case 9:
164     case 10:
165     case 13:
166       ftprintf(fp, T("&#%d;"), *s);
167       break;
168 #endif
169     default:
170       puttc(*s, fp);
171       break;
172     }
173     s++;
174   }
175 }
176 
177 /* Lexicographically comparing UTF-8 encoded attribute values,
178 is equivalent to lexicographically comparing based on the character number. */
179 
180 static int
181 attcmp(const void *att1, const void *att2) {
182   return tcscmp(*(const XML_Char *const *)att1, *(const XML_Char *const *)att2);
183 }
184 
185 static void XMLCALL
186 startElement(void *userData, const XML_Char *name, const XML_Char **atts) {
187   int nAtts;
188   const XML_Char **p;
189   FILE *fp = ((XmlwfUserData *)userData)->fp;
190   puttc(T('<'), fp);
191   fputts(name, fp);
192 
193   p = atts;
194   while (*p)
195     ++p;
196   nAtts = (int)((p - atts) >> 1);
197   if (nAtts > 1)
198     qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp);
199   while (*atts) {
200     puttc(T(' '), fp);
201     fputts(*atts++, fp);
202     attributeValue(fp, *atts);
203     atts++;
204   }
205   puttc(T('>'), fp);
206 }
207 
208 static void XMLCALL
209 endElement(void *userData, const XML_Char *name) {
210   FILE *fp = ((XmlwfUserData *)userData)->fp;
211   puttc(T('<'), fp);
212   puttc(T('/'), fp);
213   fputts(name, fp);
214   puttc(T('>'), fp);
215 }
216 
217 static int
218 nsattcmp(const void *p1, const void *p2) {
219   const XML_Char *att1 = *(const XML_Char *const *)p1;
220   const XML_Char *att2 = *(const XML_Char *const *)p2;
221   int sep1 = (tcsrchr(att1, NSSEP) != 0);
222   int sep2 = (tcsrchr(att2, NSSEP) != 0);
223   if (sep1 != sep2)
224     return sep1 - sep2;
225   return tcscmp(att1, att2);
226 }
227 
228 static void XMLCALL
229 startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) {
230   int nAtts;
231   int nsi;
232   const XML_Char **p;
233   FILE *fp = ((XmlwfUserData *)userData)->fp;
234   const XML_Char *sep;
235   puttc(T('<'), fp);
236 
237   sep = tcsrchr(name, NSSEP);
238   if (sep) {
239     fputts(T("n1:"), fp);
240     fputts(sep + 1, fp);
241     fputts(T(" xmlns:n1"), fp);
242     attributeValue(fp, name);
243     nsi = 2;
244   } else {
245     fputts(name, fp);
246     nsi = 1;
247   }
248 
249   p = atts;
250   while (*p)
251     ++p;
252   nAtts = (int)((p - atts) >> 1);
253   if (nAtts > 1)
254     qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp);
255   while (*atts) {
256     name = *atts++;
257     sep = tcsrchr(name, NSSEP);
258     puttc(T(' '), fp);
259     if (sep) {
260       ftprintf(fp, T("n%d:"), nsi);
261       fputts(sep + 1, fp);
262     } else
263       fputts(name, fp);
264     attributeValue(fp, *atts);
265     if (sep) {
266       ftprintf(fp, T(" xmlns:n%d"), nsi++);
267       attributeValue(fp, name);
268     }
269     atts++;
270   }
271   puttc(T('>'), fp);
272 }
273 
274 static void XMLCALL
275 endElementNS(void *userData, const XML_Char *name) {
276   FILE *fp = ((XmlwfUserData *)userData)->fp;
277   const XML_Char *sep;
278   puttc(T('<'), fp);
279   puttc(T('/'), fp);
280   sep = tcsrchr(name, NSSEP);
281   if (sep) {
282     fputts(T("n1:"), fp);
283     fputts(sep + 1, fp);
284   } else
285     fputts(name, fp);
286   puttc(T('>'), fp);
287 }
288 
289 #ifndef W3C14N
290 
291 static void XMLCALL
292 processingInstruction(void *userData, const XML_Char *target,
293                       const XML_Char *data) {
294   FILE *fp = ((XmlwfUserData *)userData)->fp;
295   puttc(T('<'), fp);
296   puttc(T('?'), fp);
297   fputts(target, fp);
298   puttc(T(' '), fp);
299   fputts(data, fp);
300   puttc(T('?'), fp);
301   puttc(T('>'), fp);
302 }
303 
304 static XML_Char *
305 xcsdup(const XML_Char *s) {
306   XML_Char *result;
307   int count = 0;
308   int numBytes;
309 
310   /* Get the length of the string, including terminator */
311   while (s[count++] != 0) {
312     /* Do nothing */
313   }
314   numBytes = count * sizeof(XML_Char);
315   result = malloc(numBytes);
316   if (result == NULL)
317     return NULL;
318   memcpy(result, s, numBytes);
319   return result;
320 }
321 
322 static void XMLCALL
323 startDoctypeDecl(void *userData, const XML_Char *doctypeName,
324                  const XML_Char *sysid, const XML_Char *publid,
325                  int has_internal_subset) {
326   XmlwfUserData *data = (XmlwfUserData *)userData;
327   UNUSED_P(sysid);
328   UNUSED_P(publid);
329   UNUSED_P(has_internal_subset);
330   data->currentDoctypeName = xcsdup(doctypeName);
331 }
332 
333 static void
334 freeNotations(XmlwfUserData *data) {
335   NotationList *notationListHead = data->notationListHead;
336 
337   while (notationListHead != NULL) {
338     NotationList *next = notationListHead->next;
339     free((void *)notationListHead->notationName);
340     free((void *)notationListHead->systemId);
341     free((void *)notationListHead->publicId);
342     free(notationListHead);
343     notationListHead = next;
344   }
345   data->notationListHead = NULL;
346 }
347 
348 static void
349 cleanupUserData(XmlwfUserData *userData) {
350   free((void *)userData->currentDoctypeName);
351   userData->currentDoctypeName = NULL;
352   freeNotations(userData);
353 }
354 
355 static int
356 xcscmp(const XML_Char *xs, const XML_Char *xt) {
357   while (*xs != 0 && *xt != 0) {
358     if (*xs < *xt)
359       return -1;
360     if (*xs > *xt)
361       return 1;
362     xs++;
363     xt++;
364   }
365   if (*xs < *xt)
366     return -1;
367   if (*xs > *xt)
368     return 1;
369   return 0;
370 }
371 
372 static int
373 notationCmp(const void *a, const void *b) {
374   const NotationList *const n1 = *(const NotationList *const *)a;
375   const NotationList *const n2 = *(const NotationList *const *)b;
376 
377   return xcscmp(n1->notationName, n2->notationName);
378 }
379 
380 static void XMLCALL
381 endDoctypeDecl(void *userData) {
382   XmlwfUserData *data = (XmlwfUserData *)userData;
383   NotationList **notations;
384   int notationCount = 0;
385   NotationList *p;
386   int i;
387 
388   /* How many notations do we have? */
389   for (p = data->notationListHead; p != NULL; p = p->next)
390     notationCount++;
391   if (notationCount == 0) {
392     /* Nothing to report */
393     free((void *)data->currentDoctypeName);
394     data->currentDoctypeName = NULL;
395     return;
396   }
397 
398   notations = malloc(notationCount * sizeof(NotationList *));
399   if (notations == NULL) {
400     fprintf(stderr, "Unable to sort notations");
401     freeNotations(data);
402     return;
403   }
404 
405   for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) {
406     notations[i] = p;
407   }
408   qsort(notations, notationCount, sizeof(NotationList *), notationCmp);
409 
410   /* Output the DOCTYPE header */
411   fputts(T("<!DOCTYPE "), data->fp);
412   fputts(data->currentDoctypeName, data->fp);
413   fputts(T(" [\n"), data->fp);
414 
415   /* Now the NOTATIONs */
416   for (i = 0; i < notationCount; i++) {
417     fputts(T("<!NOTATION "), data->fp);
418     fputts(notations[i]->notationName, data->fp);
419     if (notations[i]->publicId != NULL) {
420       fputts(T(" PUBLIC '"), data->fp);
421       fputts(notations[i]->publicId, data->fp);
422       puttc(T('\''), data->fp);
423       if (notations[i]->systemId != NULL) {
424         puttc(T(' '), data->fp);
425         puttc(T('\''), data->fp);
426         fputts(notations[i]->systemId, data->fp);
427         puttc(T('\''), data->fp);
428       }
429     } else if (notations[i]->systemId != NULL) {
430       fputts(T(" SYSTEM '"), data->fp);
431       fputts(notations[i]->systemId, data->fp);
432       puttc(T('\''), data->fp);
433     }
434     puttc(T('>'), data->fp);
435     puttc(T('\n'), data->fp);
436   }
437 
438   /* Finally end the DOCTYPE */
439   fputts(T("]>\n"), data->fp);
440 
441   free(notations);
442   freeNotations(data);
443   free((void *)data->currentDoctypeName);
444   data->currentDoctypeName = NULL;
445 }
446 
447 static void XMLCALL
448 notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base,
449              const XML_Char *systemId, const XML_Char *publicId) {
450   XmlwfUserData *data = (XmlwfUserData *)userData;
451   NotationList *entry = malloc(sizeof(NotationList));
452   const char *errorMessage = "Unable to store NOTATION for output\n";
453 
454   UNUSED_P(base);
455   if (entry == NULL) {
456     fputs(errorMessage, stderr);
457     return; /* Nothing we can really do about this */
458   }
459   entry->notationName = xcsdup(notationName);
460   if (entry->notationName == NULL) {
461     fputs(errorMessage, stderr);
462     free(entry);
463     return;
464   }
465   if (systemId != NULL) {
466     entry->systemId = xcsdup(systemId);
467     if (entry->systemId == NULL) {
468       fputs(errorMessage, stderr);
469       free((void *)entry->notationName);
470       free(entry);
471       return;
472     }
473   } else {
474     entry->systemId = NULL;
475   }
476   if (publicId != NULL) {
477     entry->publicId = xcsdup(publicId);
478     if (entry->publicId == NULL) {
479       fputs(errorMessage, stderr);
480       free((void *)entry->systemId); /* Safe if it's NULL */
481       free((void *)entry->notationName);
482       free(entry);
483       return;
484     }
485   } else {
486     entry->publicId = NULL;
487   }
488 
489   entry->next = data->notationListHead;
490   data->notationListHead = entry;
491 }
492 
493 #endif /* not W3C14N */
494 
495 static void XMLCALL
496 defaultCharacterData(void *userData, const XML_Char *s, int len) {
497   UNUSED_P(s);
498   UNUSED_P(len);
499   XML_DefaultCurrent((XML_Parser)userData);
500 }
501 
502 static void XMLCALL
503 defaultStartElement(void *userData, const XML_Char *name,
504                     const XML_Char **atts) {
505   UNUSED_P(name);
506   UNUSED_P(atts);
507   XML_DefaultCurrent((XML_Parser)userData);
508 }
509 
510 static void XMLCALL
511 defaultEndElement(void *userData, const XML_Char *name) {
512   UNUSED_P(name);
513   XML_DefaultCurrent((XML_Parser)userData);
514 }
515 
516 static void XMLCALL
517 defaultProcessingInstruction(void *userData, const XML_Char *target,
518                              const XML_Char *data) {
519   UNUSED_P(target);
520   UNUSED_P(data);
521   XML_DefaultCurrent((XML_Parser)userData);
522 }
523 
524 static void XMLCALL
525 nopCharacterData(void *userData, const XML_Char *s, int len) {
526   UNUSED_P(userData);
527   UNUSED_P(s);
528   UNUSED_P(len);
529 }
530 
531 static void XMLCALL
532 nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
533   UNUSED_P(userData);
534   UNUSED_P(name);
535   UNUSED_P(atts);
536 }
537 
538 static void XMLCALL
539 nopEndElement(void *userData, const XML_Char *name) {
540   UNUSED_P(userData);
541   UNUSED_P(name);
542 }
543 
544 static void XMLCALL
545 nopProcessingInstruction(void *userData, const XML_Char *target,
546                          const XML_Char *data) {
547   UNUSED_P(userData);
548   UNUSED_P(target);
549   UNUSED_P(data);
550 }
551 
552 static void XMLCALL
553 markup(void *userData, const XML_Char *s, int len) {
554   FILE *fp = ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp;
555   for (; len > 0; --len, ++s)
556     puttc(*s, fp);
557 }
558 
559 static void
560 metaLocation(XML_Parser parser) {
561   const XML_Char *uri = XML_GetBase(parser);
562   FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp;
563   if (uri)
564     ftprintf(fp, T(" uri=\"%s\""), uri);
565   ftprintf(fp,
566            T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"")
567                T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%")
568                    T(XML_FMT_INT_MOD) T("u\""),
569            XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser),
570            XML_GetCurrentLineNumber(parser),
571            XML_GetCurrentColumnNumber(parser));
572 }
573 
574 static void
575 metaStartDocument(void *userData) {
576   fputts(T("<document>\n"),
577          ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
578 }
579 
580 static void
581 metaEndDocument(void *userData) {
582   fputts(T("</document>\n"),
583          ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
584 }
585 
586 static void XMLCALL
587 metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
588   XML_Parser parser = (XML_Parser)userData;
589   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
590   FILE *fp = data->fp;
591   const XML_Char **specifiedAttsEnd
592       = atts + XML_GetSpecifiedAttributeCount(parser);
593   const XML_Char **idAttPtr;
594   int idAttIndex = XML_GetIdAttributeIndex(parser);
595   if (idAttIndex < 0)
596     idAttPtr = 0;
597   else
598     idAttPtr = atts + idAttIndex;
599 
600   ftprintf(fp, T("<starttag name=\"%s\""), name);
601   metaLocation(parser);
602   if (*atts) {
603     fputts(T(">\n"), fp);
604     do {
605       ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]);
606       characterData(data, atts[1], (int)tcslen(atts[1]));
607       if (atts >= specifiedAttsEnd)
608         fputts(T("\" defaulted=\"yes\"/>\n"), fp);
609       else if (atts == idAttPtr)
610         fputts(T("\" id=\"yes\"/>\n"), fp);
611       else
612         fputts(T("\"/>\n"), fp);
613     } while (*(atts += 2));
614     fputts(T("</starttag>\n"), fp);
615   } else
616     fputts(T("/>\n"), fp);
617 }
618 
619 static void XMLCALL
620 metaEndElement(void *userData, const XML_Char *name) {
621   XML_Parser parser = (XML_Parser)userData;
622   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
623   FILE *fp = data->fp;
624   ftprintf(fp, T("<endtag name=\"%s\""), name);
625   metaLocation(parser);
626   fputts(T("/>\n"), fp);
627 }
628 
629 static void XMLCALL
630 metaProcessingInstruction(void *userData, const XML_Char *target,
631                           const XML_Char *data) {
632   XML_Parser parser = (XML_Parser)userData;
633   XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
634   FILE *fp = usrData->fp;
635   ftprintf(fp, T("<pi target=\"%s\" data=\""), target);
636   characterData(usrData, data, (int)tcslen(data));
637   puttc(T('"'), fp);
638   metaLocation(parser);
639   fputts(T("/>\n"), fp);
640 }
641 
642 static void XMLCALL
643 metaComment(void *userData, const XML_Char *data) {
644   XML_Parser parser = (XML_Parser)userData;
645   XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
646   FILE *fp = usrData->fp;
647   fputts(T("<comment data=\""), fp);
648   characterData(usrData, data, (int)tcslen(data));
649   puttc(T('"'), fp);
650   metaLocation(parser);
651   fputts(T("/>\n"), fp);
652 }
653 
654 static void XMLCALL
655 metaStartCdataSection(void *userData) {
656   XML_Parser parser = (XML_Parser)userData;
657   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
658   FILE *fp = data->fp;
659   fputts(T("<startcdata"), fp);
660   metaLocation(parser);
661   fputts(T("/>\n"), fp);
662 }
663 
664 static void XMLCALL
665 metaEndCdataSection(void *userData) {
666   XML_Parser parser = (XML_Parser)userData;
667   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
668   FILE *fp = data->fp;
669   fputts(T("<endcdata"), fp);
670   metaLocation(parser);
671   fputts(T("/>\n"), fp);
672 }
673 
674 static void XMLCALL
675 metaCharacterData(void *userData, const XML_Char *s, int len) {
676   XML_Parser parser = (XML_Parser)userData;
677   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
678   FILE *fp = data->fp;
679   fputts(T("<chars str=\""), fp);
680   characterData(data, s, len);
681   puttc(T('"'), fp);
682   metaLocation(parser);
683   fputts(T("/>\n"), fp);
684 }
685 
686 static void XMLCALL
687 metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName,
688                      const XML_Char *sysid, const XML_Char *pubid,
689                      int has_internal_subset) {
690   XML_Parser parser = (XML_Parser)userData;
691   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
692   FILE *fp = data->fp;
693   UNUSED_P(sysid);
694   UNUSED_P(pubid);
695   UNUSED_P(has_internal_subset);
696   ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName);
697   metaLocation(parser);
698   fputts(T("/>\n"), fp);
699 }
700 
701 static void XMLCALL
702 metaEndDoctypeDecl(void *userData) {
703   XML_Parser parser = (XML_Parser)userData;
704   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
705   FILE *fp = data->fp;
706   fputts(T("<enddoctype"), fp);
707   metaLocation(parser);
708   fputts(T("/>\n"), fp);
709 }
710 
711 static void XMLCALL
712 metaNotationDecl(void *userData, const XML_Char *notationName,
713                  const XML_Char *base, const XML_Char *systemId,
714                  const XML_Char *publicId) {
715   XML_Parser parser = (XML_Parser)userData;
716   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
717   FILE *fp = data->fp;
718   UNUSED_P(base);
719   ftprintf(fp, T("<notation name=\"%s\""), notationName);
720   if (publicId)
721     ftprintf(fp, T(" public=\"%s\""), publicId);
722   if (systemId) {
723     fputts(T(" system=\""), fp);
724     characterData(data, systemId, (int)tcslen(systemId));
725     puttc(T('"'), fp);
726   }
727   metaLocation(parser);
728   fputts(T("/>\n"), fp);
729 }
730 
731 static void XMLCALL
732 metaEntityDecl(void *userData, const XML_Char *entityName, int is_param,
733                const XML_Char *value, int value_length, const XML_Char *base,
734                const XML_Char *systemId, const XML_Char *publicId,
735                const XML_Char *notationName) {
736   XML_Parser parser = (XML_Parser)userData;
737   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
738   FILE *fp = data->fp;
739 
740   UNUSED_P(is_param);
741   UNUSED_P(base);
742   if (value) {
743     ftprintf(fp, T("<entity name=\"%s\""), entityName);
744     metaLocation(parser);
745     puttc(T('>'), fp);
746     characterData(data, value, value_length);
747     fputts(T("</entity/>\n"), fp);
748   } else if (notationName) {
749     ftprintf(fp, T("<entity name=\"%s\""), entityName);
750     if (publicId)
751       ftprintf(fp, T(" public=\"%s\""), publicId);
752     fputts(T(" system=\""), fp);
753     characterData(data, systemId, (int)tcslen(systemId));
754     puttc(T('"'), fp);
755     ftprintf(fp, T(" notation=\"%s\""), notationName);
756     metaLocation(parser);
757     fputts(T("/>\n"), fp);
758   } else {
759     ftprintf(fp, T("<entity name=\"%s\""), entityName);
760     if (publicId)
761       ftprintf(fp, T(" public=\"%s\""), publicId);
762     fputts(T(" system=\""), fp);
763     characterData(data, systemId, (int)tcslen(systemId));
764     puttc(T('"'), fp);
765     metaLocation(parser);
766     fputts(T("/>\n"), fp);
767   }
768 }
769 
770 static void XMLCALL
771 metaStartNamespaceDecl(void *userData, const XML_Char *prefix,
772                        const XML_Char *uri) {
773   XML_Parser parser = (XML_Parser)userData;
774   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
775   FILE *fp = data->fp;
776   fputts(T("<startns"), fp);
777   if (prefix)
778     ftprintf(fp, T(" prefix=\"%s\""), prefix);
779   if (uri) {
780     fputts(T(" ns=\""), fp);
781     characterData(data, uri, (int)tcslen(uri));
782     fputts(T("\"/>\n"), fp);
783   } else
784     fputts(T("/>\n"), fp);
785 }
786 
787 static void XMLCALL
788 metaEndNamespaceDecl(void *userData, const XML_Char *prefix) {
789   XML_Parser parser = (XML_Parser)userData;
790   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
791   FILE *fp = data->fp;
792   if (! prefix)
793     fputts(T("<endns/>\n"), fp);
794   else
795     ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix);
796 }
797 
798 static int XMLCALL
799 unknownEncodingConvert(void *data, const char *p) {
800   return codepageConvert(*(int *)data, p);
801 }
802 
803 static int XMLCALL
804 unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) {
805   int cp;
806   static const XML_Char prefixL[] = T("windows-");
807   static const XML_Char prefixU[] = T("WINDOWS-");
808   int i;
809 
810   UNUSED_P(userData);
811   for (i = 0; prefixU[i]; i++)
812     if (name[i] != prefixU[i] && name[i] != prefixL[i])
813       return 0;
814 
815   cp = 0;
816   for (; name[i]; i++) {
817     static const XML_Char digits[] = T("0123456789");
818     const XML_Char *s = tcschr(digits, name[i]);
819     if (! s)
820       return 0;
821     cp *= 10;
822     cp += (int)(s - digits);
823     if (cp >= 0x10000)
824       return 0;
825   }
826   if (! codepageMap(cp, info->map))
827     return 0;
828   info->convert = unknownEncodingConvert;
829   /* We could just cast the code page integer to a void *,
830   and avoid the use of release. */
831   info->release = free;
832   info->data = malloc(sizeof(int));
833   if (! info->data)
834     return 0;
835   *(int *)info->data = cp;
836   return 1;
837 }
838 
839 static int XMLCALL
840 notStandalone(void *userData) {
841   UNUSED_P(userData);
842   return 0;
843 }
844 
845 static void
846 showVersion(XML_Char *prog) {
847   XML_Char *s = prog;
848   XML_Char ch;
849   const XML_Feature *features = XML_GetFeatureList();
850   while ((ch = *s) != 0) {
851     if (ch == '/'
852 #if defined(_WIN32)
853         || ch == '\\'
854 #endif
855     )
856       prog = s + 1;
857     ++s;
858   }
859   ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion());
860   if (features != NULL && features[0].feature != XML_FEATURE_END) {
861     int i = 1;
862     ftprintf(stdout, T("%s"), features[0].name);
863     if (features[0].value)
864       ftprintf(stdout, T("=%ld"), features[0].value);
865     while (features[i].feature != XML_FEATURE_END) {
866       ftprintf(stdout, T(", %s"), features[i].name);
867       if (features[i].value)
868         ftprintf(stdout, T("=%ld"), features[i].value);
869       ++i;
870     }
871     ftprintf(stdout, T("\n"));
872   }
873 }
874 
/* Print the command-line help text to stderr, with PROG substituted into
   the three usage lines, then terminate the process via exit(RC).  This
   function never returns. */
#if defined(__GNUC__)
__attribute__((noreturn))
#endif
static void
usage(const XML_Char *prog, int rc) {
  ftprintf(
      stderr,
      /* Generated with:
       * $ xmlwf/xmlwf_helpgen.sh
       * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of
       * xmlwf/xmlwf_helpgen.sh in here.
       */
      /* clang-format off */
      T("usage:\n")
      T("  %s [OPTIONS] [FILE ...]\n")
      T("  %s -h|--help\n")
      T("  %s -v|--version\n")
      T("\n")
      T("xmlwf - Determines if an XML document is well-formed\n")
      T("\n")
      T("positional arguments:\n")
      T("  FILE           file to process (default: STDIN)\n")
      T("\n")
      T("input control arguments:\n")
      T("  -s             print an error if the document is not [s]tandalone\n")
      T("  -n             enable [n]amespace processing\n")
      T("  -p             enable processing of external DTDs and [p]arameter entities\n")
      T("  -x             enable processing of e[x]ternal entities\n")
      T("  -e ENCODING    override any in-document [e]ncoding declaration\n")
      T("  -w             enable support for [W]indows code pages\n")
      T("  -r             disable memory-mapping and use [r]ead calls instead\n")
      T("  -g BYTES       buffer size to request per call pair to XML_[G]etBuffer and read (default: 8 KiB)\n")
      T("  -k             when processing multiple files, [k]eep processing after first file with error\n")
      T("\n")
      T("output control arguments:\n")
      T("  -d DIRECTORY   output [d]estination directory\n")
      T("  -c             write a [c]opy of input XML, not canonical XML\n")
      T("  -m             write [m]eta XML, not canonical XML\n")
      T("  -t             write no XML output for [t]iming of plain parsing\n")
      T("  -N             enable adding doctype and [n]otation declarations\n")
      T("\n")
      T("billion laughs attack protection:\n")
      T("  NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n")
      T("\n")
      T("  -a FACTOR      set maximum tolerated [a]mplification factor (default: 100.0)\n")
      T("  -b BYTES       set number of output [b]ytes needed to activate (default: 8 MiB)\n")
      T("\n")
      T("reparse deferral:\n")
      T("  -q             disable reparse deferral, and allow [q]uadratic parse runtime with large tokens\n")
      T("\n")
      T("info arguments:\n")
      T("  -h, --help     show this [h]elp message and exit\n")
      T("  -v, --version  show program's [v]ersion number and exit\n")
      T("\n")
      T("exit status:\n")
      T("  0              the input files are well-formed and the output (if requested) was written successfully\n")
      T("  1              could not allocate data structures, signals a serious problem with execution environment\n")
      T("  2              one or more input files were not well-formed\n")
      T("  3              could not create an output file\n")
      T("  4              command-line argument error\n")
      T("\n")
      T("xmlwf of libexpat is software libre, licensed under the MIT license.\n")
      T("Please report bugs at https://github.com/libexpat/libexpat/issues -- thank you!\n")
      , /* clang-format on */
      prog, prog, prog);
  exit(rc);
}
942 
#if defined(__MINGW32__) && defined(XML_UNICODE)
/* Declare wmain ahead of its definition to silence the warning about a
   missing prototype in this configuration. */
int wmain(int argc, XML_Char **argv);
#endif
947 
/* Fetch the value of the option whose letter is at argv[i][j] and store a
   pointer to it in constCharStarTarget.  When the letter is the last
   character of its argument, the value is the whole next argument (and
   usage() is called with XMLWF_EXIT_USAGE_ERROR if there is none);
   otherwise the value is the rest of the current argument.  Afterwards i
   is advanced past the consumed argument and j is reset to 0.  Both i and
   j are modified, so they must be plain variables. */
#define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j)            \
  {                                                                            \
    if (argv[i][j + 1] == T('\0')) {                                           \
      if (++i == argc) {                                                       \
        usage(argv[0], XMLWF_EXIT_USAGE_ERROR);                                \
        /* usage called exit(..), never gets here */                           \
      }                                                                        \
      constCharStarTarget = argv[i];                                           \
    } else {                                                                   \
      constCharStarTarget = argv[i] + j + 1;                                   \
    }                                                                          \
    i++;                                                                       \
    j = 0;                                                                     \
  }
962 
963 int
964 tmain(int argc, XML_Char **argv) {
965   int i, j;
966   const XML_Char *outputDir = NULL;
967   const XML_Char *encoding = NULL;
968   unsigned processFlags = XML_MAP_FILE;
969   int windowsCodePages = 0;
970   int outputType = 0;
971   int useNamespaces = 0;
972   int requireStandalone = 0;
973   int requiresNotations = 0;
974   int continueOnError = 0;
975 
976   float attackMaximumAmplification = -1.0f; /* signaling "not set" */
977   unsigned long long attackThresholdBytes = 0;
978   XML_Bool attackThresholdGiven = XML_FALSE;
979 
980   XML_Bool disableDeferral = XML_FALSE;
981 
982   int exitCode = XMLWF_EXIT_SUCCESS;
983   enum XML_ParamEntityParsing paramEntityParsing
984       = XML_PARAM_ENTITY_PARSING_NEVER;
985   int useStdin = 0;
986   XmlwfUserData userData = {NULL, NULL, NULL};
987 
988 #ifdef _MSC_VER
989   _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
990 #endif
991 
992   i = 1;
993   j = 0;
994   while (i < argc) {
995     if (j == 0) {
996       if (argv[i][0] != T('-'))
997         break;
998       if (argv[i][1] == T('-')) {
999         if (argv[i][2] == T('\0')) {
1000           i++;
1001           break;
1002         } else if (tcscmp(argv[i] + 2, T("help")) == 0) {
1003           usage(argv[0], XMLWF_EXIT_SUCCESS);
1004           // usage called exit(..), never gets here
1005         } else if (tcscmp(argv[i] + 2, T("version")) == 0) {
1006           showVersion(argv[0]);
1007           return XMLWF_EXIT_SUCCESS;
1008         }
1009       }
1010       j++;
1011     }
1012     switch (argv[i][j]) {
1013     case T('r'):
1014       processFlags &= ~XML_MAP_FILE;
1015       j++;
1016       break;
1017     case T('s'):
1018       requireStandalone = 1;
1019       j++;
1020       break;
1021     case T('n'):
1022       useNamespaces = 1;
1023       j++;
1024       break;
1025     case T('p'):
1026       paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS;
1027       /* fall through */
1028     case T('x'):
1029       processFlags |= XML_EXTERNAL_ENTITIES;
1030       j++;
1031       break;
1032     case T('w'):
1033       windowsCodePages = 1;
1034       j++;
1035       break;
1036     case T('m'):
1037       outputType = 'm';
1038       j++;
1039       break;
1040     case T('c'):
1041       outputType = 'c';
1042       useNamespaces = 0;
1043       j++;
1044       break;
1045     case T('t'):
1046       outputType = 't';
1047       j++;
1048       break;
1049     case T('N'):
1050       requiresNotations = 1;
1051       j++;
1052       break;
1053     case T('d'):
1054       XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j);
1055       break;
1056     case T('e'):
1057       XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j);
1058       break;
1059     case T('h'):
1060       usage(argv[0], XMLWF_EXIT_SUCCESS);
1061       // usage called exit(..), never gets here
1062     case T('v'):
1063       showVersion(argv[0]);
1064       return XMLWF_EXIT_SUCCESS;
1065     case T('g'): {
1066       const XML_Char *valueText = NULL;
1067       XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1068 
1069       errno = 0;
1070       XML_Char *afterValueText = (XML_Char *)valueText;
1071       const long long read_size_bytes_candidate
1072           = tcstoull(valueText, &afterValueText, 10);
1073       if ((errno != 0) || (afterValueText[0] != T('\0'))
1074           || (read_size_bytes_candidate < 1)
1075           || (read_size_bytes_candidate > (INT_MAX / 2 + 1))) {
1076         // This prevents tperror(..) from reporting misleading "[..]: Success"
1077         errno = ERANGE;
1078         tperror(T("invalid buffer size") T(
1079             " (needs an integer from 1 to INT_MAX/2+1 i.e. 1,073,741,824 on most platforms)"));
1080         exit(XMLWF_EXIT_USAGE_ERROR);
1081       }
1082       g_read_size_bytes = (int)read_size_bytes_candidate;
1083       break;
1084     }
1085     case T('k'):
1086       continueOnError = 1;
1087       j++;
1088       break;
1089     case T('a'): {
1090       const XML_Char *valueText = NULL;
1091       XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1092 
1093       errno = 0;
1094       XML_Char *afterValueText = NULL;
1095       attackMaximumAmplification = tcstof(valueText, &afterValueText);
1096       if ((errno != 0) || (afterValueText[0] != T('\0'))
1097           || isnan(attackMaximumAmplification)
1098           || (attackMaximumAmplification < 1.0f)) {
1099         // This prevents tperror(..) from reporting misleading "[..]: Success"
1100         errno = ERANGE;
1101         tperror(T("invalid amplification limit") T(
1102             " (needs a floating point number greater or equal than 1.0)"));
1103         exit(XMLWF_EXIT_USAGE_ERROR);
1104       }
1105 #if XML_GE == 0
1106       ftprintf(stderr,
1107                T("Warning: Given amplification limit ignored")
1108                    T(", xmlwf has been compiled without DTD/GE support.\n"));
1109 #endif
1110       break;
1111     }
1112     case T('b'): {
1113       const XML_Char *valueText = NULL;
1114       XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1115 
1116       errno = 0;
1117       XML_Char *afterValueText = (XML_Char *)valueText;
1118       attackThresholdBytes = tcstoull(valueText, &afterValueText, 10);
1119       if ((errno != 0) || (afterValueText[0] != T('\0'))) {
1120         // This prevents tperror(..) from reporting misleading "[..]: Success"
1121         errno = ERANGE;
1122         tperror(T("invalid ignore threshold")
1123                     T(" (needs an integer from 0 to 2^64-1)"));
1124         exit(XMLWF_EXIT_USAGE_ERROR);
1125       }
1126       attackThresholdGiven = XML_TRUE;
1127 #if XML_GE == 0
1128       ftprintf(stderr,
1129                T("Warning: Given attack threshold ignored")
1130                    T(", xmlwf has been compiled without DTD/GE support.\n"));
1131 #endif
1132       break;
1133     }
1134     case T('q'): {
1135       disableDeferral = XML_TRUE;
1136       j++;
1137       break;
1138     }
1139     case T('\0'):
1140       if (j > 1) {
1141         i++;
1142         j = 0;
1143         break;
1144       }
1145       /* fall through */
1146     default:
1147       usage(argv[0], XMLWF_EXIT_USAGE_ERROR);
1148       // usage called exit(..), never gets here
1149     }
1150   }
1151   if (i == argc) {
1152     useStdin = 1;
1153     processFlags &= ~XML_MAP_FILE;
1154     i--;
1155   }
1156   for (; i < argc; i++) {
1157     XML_Char *outName = 0;
1158     int result;
1159     XML_Parser parser;
1160     if (useNamespaces)
1161       parser = XML_ParserCreateNS(encoding, NSSEP);
1162     else
1163       parser = XML_ParserCreate(encoding);
1164 
1165     if (! parser) {
1166       tperror(T("Could not instantiate parser"));
1167       exit(XMLWF_EXIT_INTERNAL_ERROR);
1168     }
1169 
1170     if (attackMaximumAmplification != -1.0f) {
1171 #if XML_GE == 1
1172       XML_SetBillionLaughsAttackProtectionMaximumAmplification(
1173           parser, attackMaximumAmplification);
1174 #endif
1175     }
1176     if (attackThresholdGiven) {
1177 #if XML_GE == 1
1178       XML_SetBillionLaughsAttackProtectionActivationThreshold(
1179           parser, attackThresholdBytes);
1180 #else
1181       (void)attackThresholdBytes; // silence -Wunused-but-set-variable
1182 #endif
1183     }
1184 
1185     if (disableDeferral) {
1186       const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE);
1187       if (! success) {
1188         // This prevents tperror(..) from reporting misleading "[..]: Success"
1189         errno = EINVAL;
1190         tperror(T("Failed to disable reparse deferral"));
1191         exit(XMLWF_EXIT_INTERNAL_ERROR);
1192       }
1193     }
1194 
1195     if (requireStandalone)
1196       XML_SetNotStandaloneHandler(parser, notStandalone);
1197     XML_SetParamEntityParsing(parser, paramEntityParsing);
1198     if (outputType == 't') {
1199       /* This is for doing timings; this gives a more realistic estimate of
1200          the parsing time. */
1201       outputDir = 0;
1202       XML_SetElementHandler(parser, nopStartElement, nopEndElement);
1203       XML_SetCharacterDataHandler(parser, nopCharacterData);
1204       XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction);
1205     } else if (outputDir) {
1206       const XML_Char *delim = T("/");
1207       const XML_Char *file = useStdin ? T("STDIN") : argv[i];
1208       if (! useStdin) {
1209         /* Jump after last (back)slash */
1210         const XML_Char *lastDelim = tcsrchr(file, delim[0]);
1211         if (lastDelim)
1212           file = lastDelim + 1;
1213 #if defined(_WIN32)
1214         else {
1215           const XML_Char *winDelim = T("\\");
1216           lastDelim = tcsrchr(file, winDelim[0]);
1217           if (lastDelim) {
1218             file = lastDelim + 1;
1219             delim = winDelim;
1220           }
1221         }
1222 #endif
1223       }
1224       outName = (XML_Char *)malloc((tcslen(outputDir) + tcslen(file) + 2)
1225                                    * sizeof(XML_Char));
1226       if (! outName) {
1227         tperror(T("Could not allocate memory"));
1228         exit(XMLWF_EXIT_INTERNAL_ERROR);
1229       }
1230       tcscpy(outName, outputDir);
1231       tcscat(outName, delim);
1232       tcscat(outName, file);
1233       userData.fp = tfopen(outName, T("wb"));
1234       if (! userData.fp) {
1235         tperror(outName);
1236         exitCode = XMLWF_EXIT_OUTPUT_ERROR;
1237         free(outName);
1238         XML_ParserFree(parser);
1239         if (continueOnError) {
1240           continue;
1241         } else {
1242           break;
1243         }
1244       }
1245       setvbuf(userData.fp, NULL, _IOFBF, 16384);
1246 #ifdef XML_UNICODE
1247       puttc(0xFEFF, userData.fp);
1248 #endif
1249       XML_SetUserData(parser, &userData);
1250       switch (outputType) {
1251       case 'm':
1252         XML_UseParserAsHandlerArg(parser);
1253         XML_SetElementHandler(parser, metaStartElement, metaEndElement);
1254         XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction);
1255         XML_SetCommentHandler(parser, metaComment);
1256         XML_SetCdataSectionHandler(parser, metaStartCdataSection,
1257                                    metaEndCdataSection);
1258         XML_SetCharacterDataHandler(parser, metaCharacterData);
1259         XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl,
1260                                   metaEndDoctypeDecl);
1261         XML_SetEntityDeclHandler(parser, metaEntityDecl);
1262         XML_SetNotationDeclHandler(parser, metaNotationDecl);
1263         XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl,
1264                                     metaEndNamespaceDecl);
1265         metaStartDocument(parser);
1266         break;
1267       case 'c':
1268         XML_UseParserAsHandlerArg(parser);
1269         XML_SetDefaultHandler(parser, markup);
1270         XML_SetElementHandler(parser, defaultStartElement, defaultEndElement);
1271         XML_SetCharacterDataHandler(parser, defaultCharacterData);
1272         XML_SetProcessingInstructionHandler(parser,
1273                                             defaultProcessingInstruction);
1274         break;
1275       default:
1276         if (useNamespaces)
1277           XML_SetElementHandler(parser, startElementNS, endElementNS);
1278         else
1279           XML_SetElementHandler(parser, startElement, endElement);
1280         XML_SetCharacterDataHandler(parser, characterData);
1281 #ifndef W3C14N
1282         XML_SetProcessingInstructionHandler(parser, processingInstruction);
1283         if (requiresNotations) {
1284           XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl);
1285           XML_SetNotationDeclHandler(parser, notationDecl);
1286         }
1287 #endif /* not W3C14N */
1288         break;
1289       }
1290     }
1291     if (windowsCodePages)
1292       XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0);
1293     result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags);
1294     if (outputDir) {
1295       if (outputType == 'm')
1296         metaEndDocument(parser);
1297       fclose(userData.fp);
1298       if (! result) {
1299         tremove(outName);
1300       }
1301       free(outName);
1302     }
1303     XML_ParserFree(parser);
1304     if (! result) {
1305       exitCode = XMLWF_EXIT_NOT_WELLFORMED;
1306       cleanupUserData(&userData);
1307       if (! continueOnError) {
1308         break;
1309       }
1310     }
1311   }
1312   return exitCode;
1313 }
1314