xref: /freebsd/contrib/expat/xmlwf/xmlwf.c (revision c7a063741720ef81d4caa4613242579d12f1d605)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2004-2009 Karl Waclawek <karl@waclawek.net>
13    Copyright (c) 2005-2007 Steven Solie <steven@solie.ca>
14    Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
15    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
16    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
17    Copyright (c) 2020      Joe Orton <jorton@redhat.com>
18    Copyright (c) 2020      Kleber Tarcísio <klebertarcisio@yahoo.com.br>
19    Copyright (c) 2021      Tim Bray <tbray@textuality.com>
20    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
21    Licensed under the MIT license:
22 
23    Permission is  hereby granted,  free of charge,  to any  person obtaining
24    a  copy  of  this  software   and  associated  documentation  files  (the
25    "Software"),  to  deal in  the  Software  without restriction,  including
26    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
27    distribute, sublicense, and/or sell copies of the Software, and to permit
28    persons  to whom  the Software  is  furnished to  do so,  subject to  the
29    following conditions:
30 
31    The above copyright  notice and this permission notice  shall be included
32    in all copies or substantial portions of the Software.
33 
34    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
35    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
36    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
37    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
38    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
39    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
40    USE OR OTHER DEALINGS IN THE SOFTWARE.
41 */
42 
43 #include <expat_config.h>
44 
45 #include <assert.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <stddef.h>
49 #include <string.h>
50 #include <math.h> /* for isnan */
51 #include <errno.h>
52 
53 #include "expat.h"
54 #include "codepage.h"
55 #include "internal.h" /* for UNUSED_P only */
56 #include "xmlfile.h"
57 #include "xmltchar.h"
58 
59 #ifdef _MSC_VER
60 #  include <crtdbg.h>
61 #endif
62 
63 #ifdef XML_UNICODE
64 #  include <wchar.h>
65 #endif
66 
/* Process exit statuses used by xmlwf.  The meanings match the "exit status"
   section of the help text printed by usage(). */
enum ExitCode {
  /* Input well-formed and output (if requested) written successfully */
  XMLWF_EXIT_SUCCESS = 0,
  /* Could not allocate data structures */
  XMLWF_EXIT_INTERNAL_ERROR = 1,
  /* One or more input files were not well-formed */
  XMLWF_EXIT_NOT_WELLFORMED = 2,
  /* Could not create an output file */
  XMLWF_EXIT_OUTPUT_ERROR = 3,
  /* Command-line argument error */
  XMLWF_EXIT_USAGE_ERROR = 4,
};
74 
75 /* Structures for handler user data */
/* One collected NOTATION declaration, kept in a singly linked list.
   notationDecl() fills the strings with heap copies and freeNotations()
   releases them. */
typedef struct NotationList {
  struct NotationList *next;    /* next entry, or NULL at the end of the list */
  const XML_Char *notationName; /* copy of the notation name */
  const XML_Char *systemId;     /* copy of the system id, or NULL if none */
  const XML_Char *publicId;     /* copy of the public id, or NULL if none */
} NotationList;
82 
/* State shared by the output handlers in this file. */
typedef struct xmlwfUserData {
  FILE *fp;                       /* stream the handlers write their output to */
  NotationList *notationListHead; /* notations collected by notationDecl() */
  /* Copy of the doctype name made by startDoctypeDecl(); released by
     endDoctypeDecl() or cleanupUserData().  NULL when none is stored. */
  const XML_Char *currentDoctypeName;
} XmlwfUserData;
88 
/* Separator character that the namespace-aware handlers below search for
   (via tcsrchr) to split a name into the part before it and the local part
   after it; attributeValue() also stops writing at this character.
   This ensures proper sorting. */

#define NSSEP T('\001')
92 
93 static void XMLCALL
94 characterData(void *userData, const XML_Char *s, int len) {
95   FILE *fp = ((XmlwfUserData *)userData)->fp;
96   for (; len > 0; --len, ++s) {
97     switch (*s) {
98     case T('&'):
99       fputts(T("&amp;"), fp);
100       break;
101     case T('<'):
102       fputts(T("&lt;"), fp);
103       break;
104     case T('>'):
105       fputts(T("&gt;"), fp);
106       break;
107 #ifdef W3C14N
108     case 13:
109       fputts(T("&#xD;"), fp);
110       break;
111 #else
112     case T('"'):
113       fputts(T("&quot;"), fp);
114       break;
115     case 9:
116     case 10:
117     case 13:
118       ftprintf(fp, T("&#%d;"), *s);
119       break;
120 #endif
121     default:
122       puttc(*s, fp);
123       break;
124     }
125   }
126 }
127 
128 static void
129 attributeValue(FILE *fp, const XML_Char *s) {
130   puttc(T('='), fp);
131   puttc(T('"'), fp);
132   assert(s);
133   for (;;) {
134     switch (*s) {
135     case 0:
136     case NSSEP:
137       puttc(T('"'), fp);
138       return;
139     case T('&'):
140       fputts(T("&amp;"), fp);
141       break;
142     case T('<'):
143       fputts(T("&lt;"), fp);
144       break;
145     case T('"'):
146       fputts(T("&quot;"), fp);
147       break;
148 #ifdef W3C14N
149     case 9:
150       fputts(T("&#x9;"), fp);
151       break;
152     case 10:
153       fputts(T("&#xA;"), fp);
154       break;
155     case 13:
156       fputts(T("&#xD;"), fp);
157       break;
158 #else
159     case T('>'):
160       fputts(T("&gt;"), fp);
161       break;
162     case 9:
163     case 10:
164     case 13:
165       ftprintf(fp, T("&#%d;"), *s);
166       break;
167 #endif
168     default:
169       puttc(*s, fp);
170       break;
171     }
172     s++;
173   }
174 }
175 
176 /* Lexicographically comparing UTF-8 encoded attribute values,
177 is equivalent to lexicographically comparing based on the character number. */
178 
179 static int
180 attcmp(const void *att1, const void *att2) {
181   return tcscmp(*(const XML_Char **)att1, *(const XML_Char **)att2);
182 }
183 
184 static void XMLCALL
185 startElement(void *userData, const XML_Char *name, const XML_Char **atts) {
186   int nAtts;
187   const XML_Char **p;
188   FILE *fp = ((XmlwfUserData *)userData)->fp;
189   puttc(T('<'), fp);
190   fputts(name, fp);
191 
192   p = atts;
193   while (*p)
194     ++p;
195   nAtts = (int)((p - atts) >> 1);
196   if (nAtts > 1)
197     qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp);
198   while (*atts) {
199     puttc(T(' '), fp);
200     fputts(*atts++, fp);
201     attributeValue(fp, *atts);
202     atts++;
203   }
204   puttc(T('>'), fp);
205 }
206 
207 static void XMLCALL
208 endElement(void *userData, const XML_Char *name) {
209   FILE *fp = ((XmlwfUserData *)userData)->fp;
210   puttc(T('<'), fp);
211   puttc(T('/'), fp);
212   fputts(name, fp);
213   puttc(T('>'), fp);
214 }
215 
216 static int
217 nsattcmp(const void *p1, const void *p2) {
218   const XML_Char *att1 = *(const XML_Char **)p1;
219   const XML_Char *att2 = *(const XML_Char **)p2;
220   int sep1 = (tcsrchr(att1, NSSEP) != 0);
221   int sep2 = (tcsrchr(att2, NSSEP) != 0);
222   if (sep1 != sep2)
223     return sep1 - sep2;
224   return tcscmp(att1, att2);
225 }
226 
227 static void XMLCALL
228 startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) {
229   int nAtts;
230   int nsi;
231   const XML_Char **p;
232   FILE *fp = ((XmlwfUserData *)userData)->fp;
233   const XML_Char *sep;
234   puttc(T('<'), fp);
235 
236   sep = tcsrchr(name, NSSEP);
237   if (sep) {
238     fputts(T("n1:"), fp);
239     fputts(sep + 1, fp);
240     fputts(T(" xmlns:n1"), fp);
241     attributeValue(fp, name);
242     nsi = 2;
243   } else {
244     fputts(name, fp);
245     nsi = 1;
246   }
247 
248   p = atts;
249   while (*p)
250     ++p;
251   nAtts = (int)((p - atts) >> 1);
252   if (nAtts > 1)
253     qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp);
254   while (*atts) {
255     name = *atts++;
256     sep = tcsrchr(name, NSSEP);
257     puttc(T(' '), fp);
258     if (sep) {
259       ftprintf(fp, T("n%d:"), nsi);
260       fputts(sep + 1, fp);
261     } else
262       fputts(name, fp);
263     attributeValue(fp, *atts);
264     if (sep) {
265       ftprintf(fp, T(" xmlns:n%d"), nsi++);
266       attributeValue(fp, name);
267     }
268     atts++;
269   }
270   puttc(T('>'), fp);
271 }
272 
273 static void XMLCALL
274 endElementNS(void *userData, const XML_Char *name) {
275   FILE *fp = ((XmlwfUserData *)userData)->fp;
276   const XML_Char *sep;
277   puttc(T('<'), fp);
278   puttc(T('/'), fp);
279   sep = tcsrchr(name, NSSEP);
280   if (sep) {
281     fputts(T("n1:"), fp);
282     fputts(sep + 1, fp);
283   } else
284     fputts(name, fp);
285   puttc(T('>'), fp);
286 }
287 
288 #ifndef W3C14N
289 
290 static void XMLCALL
291 processingInstruction(void *userData, const XML_Char *target,
292                       const XML_Char *data) {
293   FILE *fp = ((XmlwfUserData *)userData)->fp;
294   puttc(T('<'), fp);
295   puttc(T('?'), fp);
296   fputts(target, fp);
297   puttc(T(' '), fp);
298   fputts(data, fp);
299   puttc(T('?'), fp);
300   puttc(T('>'), fp);
301 }
302 
303 static XML_Char *
304 xcsdup(const XML_Char *s) {
305   XML_Char *result;
306   int count = 0;
307   int numBytes;
308 
309   /* Get the length of the string, including terminator */
310   while (s[count++] != 0) {
311     /* Do nothing */
312   }
313   numBytes = count * sizeof(XML_Char);
314   result = malloc(numBytes);
315   if (result == NULL)
316     return NULL;
317   memcpy(result, s, numBytes);
318   return result;
319 }
320 
321 static void XMLCALL
322 startDoctypeDecl(void *userData, const XML_Char *doctypeName,
323                  const XML_Char *sysid, const XML_Char *publid,
324                  int has_internal_subset) {
325   XmlwfUserData *data = (XmlwfUserData *)userData;
326   UNUSED_P(sysid);
327   UNUSED_P(publid);
328   UNUSED_P(has_internal_subset);
329   data->currentDoctypeName = xcsdup(doctypeName);
330 }
331 
332 static void
333 freeNotations(XmlwfUserData *data) {
334   NotationList *notationListHead = data->notationListHead;
335 
336   while (notationListHead != NULL) {
337     NotationList *next = notationListHead->next;
338     free((void *)notationListHead->notationName);
339     free((void *)notationListHead->systemId);
340     free((void *)notationListHead->publicId);
341     free(notationListHead);
342     notationListHead = next;
343   }
344   data->notationListHead = NULL;
345 }
346 
347 static void
348 cleanupUserData(XmlwfUserData *userData) {
349   free((void *)userData->currentDoctypeName);
350   userData->currentDoctypeName = NULL;
351   freeNotations(userData);
352 }
353 
354 static int
355 xcscmp(const XML_Char *xs, const XML_Char *xt) {
356   while (*xs != 0 && *xt != 0) {
357     if (*xs < *xt)
358       return -1;
359     if (*xs > *xt)
360       return 1;
361     xs++;
362     xt++;
363   }
364   if (*xs < *xt)
365     return -1;
366   if (*xs > *xt)
367     return 1;
368   return 0;
369 }
370 
371 static int
372 notationCmp(const void *a, const void *b) {
373   const NotationList *const n1 = *(NotationList **)a;
374   const NotationList *const n2 = *(NotationList **)b;
375 
376   return xcscmp(n1->notationName, n2->notationName);
377 }
378 
379 static void XMLCALL
380 endDoctypeDecl(void *userData) {
381   XmlwfUserData *data = (XmlwfUserData *)userData;
382   NotationList **notations;
383   int notationCount = 0;
384   NotationList *p;
385   int i;
386 
387   /* How many notations do we have? */
388   for (p = data->notationListHead; p != NULL; p = p->next)
389     notationCount++;
390   if (notationCount == 0) {
391     /* Nothing to report */
392     free((void *)data->currentDoctypeName);
393     data->currentDoctypeName = NULL;
394     return;
395   }
396 
397   notations = malloc(notationCount * sizeof(NotationList *));
398   if (notations == NULL) {
399     fprintf(stderr, "Unable to sort notations");
400     freeNotations(data);
401     return;
402   }
403 
404   for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) {
405     notations[i] = p;
406   }
407   qsort(notations, notationCount, sizeof(NotationList *), notationCmp);
408 
409   /* Output the DOCTYPE header */
410   fputts(T("<!DOCTYPE "), data->fp);
411   fputts(data->currentDoctypeName, data->fp);
412   fputts(T(" [\n"), data->fp);
413 
414   /* Now the NOTATIONs */
415   for (i = 0; i < notationCount; i++) {
416     fputts(T("<!NOTATION "), data->fp);
417     fputts(notations[i]->notationName, data->fp);
418     if (notations[i]->publicId != NULL) {
419       fputts(T(" PUBLIC '"), data->fp);
420       fputts(notations[i]->publicId, data->fp);
421       puttc(T('\''), data->fp);
422       if (notations[i]->systemId != NULL) {
423         puttc(T(' '), data->fp);
424         puttc(T('\''), data->fp);
425         fputts(notations[i]->systemId, data->fp);
426         puttc(T('\''), data->fp);
427       }
428     } else if (notations[i]->systemId != NULL) {
429       fputts(T(" SYSTEM '"), data->fp);
430       fputts(notations[i]->systemId, data->fp);
431       puttc(T('\''), data->fp);
432     }
433     puttc(T('>'), data->fp);
434     puttc(T('\n'), data->fp);
435   }
436 
437   /* Finally end the DOCTYPE */
438   fputts(T("]>\n"), data->fp);
439 
440   free(notations);
441   freeNotations(data);
442   free((void *)data->currentDoctypeName);
443   data->currentDoctypeName = NULL;
444 }
445 
446 static void XMLCALL
447 notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base,
448              const XML_Char *systemId, const XML_Char *publicId) {
449   XmlwfUserData *data = (XmlwfUserData *)userData;
450   NotationList *entry = malloc(sizeof(NotationList));
451   const char *errorMessage = "Unable to store NOTATION for output\n";
452 
453   UNUSED_P(base);
454   if (entry == NULL) {
455     fputs(errorMessage, stderr);
456     return; /* Nothing we can really do about this */
457   }
458   entry->notationName = xcsdup(notationName);
459   if (entry->notationName == NULL) {
460     fputs(errorMessage, stderr);
461     free(entry);
462     return;
463   }
464   if (systemId != NULL) {
465     entry->systemId = xcsdup(systemId);
466     if (entry->systemId == NULL) {
467       fputs(errorMessage, stderr);
468       free((void *)entry->notationName);
469       free(entry);
470       return;
471     }
472   } else {
473     entry->systemId = NULL;
474   }
475   if (publicId != NULL) {
476     entry->publicId = xcsdup(publicId);
477     if (entry->publicId == NULL) {
478       fputs(errorMessage, stderr);
479       free((void *)entry->systemId); /* Safe if it's NULL */
480       free((void *)entry->notationName);
481       free(entry);
482       return;
483     }
484   } else {
485     entry->publicId = NULL;
486   }
487 
488   entry->next = data->notationListHead;
489   data->notationListHead = entry;
490 }
491 
492 #endif /* not W3C14N */
493 
494 static void XMLCALL
495 defaultCharacterData(void *userData, const XML_Char *s, int len) {
496   UNUSED_P(s);
497   UNUSED_P(len);
498   XML_DefaultCurrent((XML_Parser)userData);
499 }
500 
501 static void XMLCALL
502 defaultStartElement(void *userData, const XML_Char *name,
503                     const XML_Char **atts) {
504   UNUSED_P(name);
505   UNUSED_P(atts);
506   XML_DefaultCurrent((XML_Parser)userData);
507 }
508 
509 static void XMLCALL
510 defaultEndElement(void *userData, const XML_Char *name) {
511   UNUSED_P(name);
512   XML_DefaultCurrent((XML_Parser)userData);
513 }
514 
515 static void XMLCALL
516 defaultProcessingInstruction(void *userData, const XML_Char *target,
517                              const XML_Char *data) {
518   UNUSED_P(target);
519   UNUSED_P(data);
520   XML_DefaultCurrent((XML_Parser)userData);
521 }
522 
523 static void XMLCALL
524 nopCharacterData(void *userData, const XML_Char *s, int len) {
525   UNUSED_P(userData);
526   UNUSED_P(s);
527   UNUSED_P(len);
528 }
529 
530 static void XMLCALL
531 nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
532   UNUSED_P(userData);
533   UNUSED_P(name);
534   UNUSED_P(atts);
535 }
536 
537 static void XMLCALL
538 nopEndElement(void *userData, const XML_Char *name) {
539   UNUSED_P(userData);
540   UNUSED_P(name);
541 }
542 
543 static void XMLCALL
544 nopProcessingInstruction(void *userData, const XML_Char *target,
545                          const XML_Char *data) {
546   UNUSED_P(userData);
547   UNUSED_P(target);
548   UNUSED_P(data);
549 }
550 
551 static void XMLCALL
552 markup(void *userData, const XML_Char *s, int len) {
553   FILE *fp = ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp;
554   for (; len > 0; --len, ++s)
555     puttc(*s, fp);
556 }
557 
558 static void
559 metaLocation(XML_Parser parser) {
560   const XML_Char *uri = XML_GetBase(parser);
561   FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp;
562   if (uri)
563     ftprintf(fp, T(" uri=\"%s\""), uri);
564   ftprintf(fp,
565            T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"")
566                T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%")
567                    T(XML_FMT_INT_MOD) T("u\""),
568            XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser),
569            XML_GetCurrentLineNumber(parser),
570            XML_GetCurrentColumnNumber(parser));
571 }
572 
573 static void
574 metaStartDocument(void *userData) {
575   fputts(T("<document>\n"),
576          ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
577 }
578 
579 static void
580 metaEndDocument(void *userData) {
581   fputts(T("</document>\n"),
582          ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
583 }
584 
585 static void XMLCALL
586 metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
587   XML_Parser parser = (XML_Parser)userData;
588   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
589   FILE *fp = data->fp;
590   const XML_Char **specifiedAttsEnd
591       = atts + XML_GetSpecifiedAttributeCount(parser);
592   const XML_Char **idAttPtr;
593   int idAttIndex = XML_GetIdAttributeIndex(parser);
594   if (idAttIndex < 0)
595     idAttPtr = 0;
596   else
597     idAttPtr = atts + idAttIndex;
598 
599   ftprintf(fp, T("<starttag name=\"%s\""), name);
600   metaLocation(parser);
601   if (*atts) {
602     fputts(T(">\n"), fp);
603     do {
604       ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]);
605       characterData(data, atts[1], (int)tcslen(atts[1]));
606       if (atts >= specifiedAttsEnd)
607         fputts(T("\" defaulted=\"yes\"/>\n"), fp);
608       else if (atts == idAttPtr)
609         fputts(T("\" id=\"yes\"/>\n"), fp);
610       else
611         fputts(T("\"/>\n"), fp);
612     } while (*(atts += 2));
613     fputts(T("</starttag>\n"), fp);
614   } else
615     fputts(T("/>\n"), fp);
616 }
617 
618 static void XMLCALL
619 metaEndElement(void *userData, const XML_Char *name) {
620   XML_Parser parser = (XML_Parser)userData;
621   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
622   FILE *fp = data->fp;
623   ftprintf(fp, T("<endtag name=\"%s\""), name);
624   metaLocation(parser);
625   fputts(T("/>\n"), fp);
626 }
627 
628 static void XMLCALL
629 metaProcessingInstruction(void *userData, const XML_Char *target,
630                           const XML_Char *data) {
631   XML_Parser parser = (XML_Parser)userData;
632   XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
633   FILE *fp = usrData->fp;
634   ftprintf(fp, T("<pi target=\"%s\" data=\""), target);
635   characterData(usrData, data, (int)tcslen(data));
636   puttc(T('"'), fp);
637   metaLocation(parser);
638   fputts(T("/>\n"), fp);
639 }
640 
641 static void XMLCALL
642 metaComment(void *userData, const XML_Char *data) {
643   XML_Parser parser = (XML_Parser)userData;
644   XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
645   FILE *fp = usrData->fp;
646   fputts(T("<comment data=\""), fp);
647   characterData(usrData, data, (int)tcslen(data));
648   puttc(T('"'), fp);
649   metaLocation(parser);
650   fputts(T("/>\n"), fp);
651 }
652 
653 static void XMLCALL
654 metaStartCdataSection(void *userData) {
655   XML_Parser parser = (XML_Parser)userData;
656   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
657   FILE *fp = data->fp;
658   fputts(T("<startcdata"), fp);
659   metaLocation(parser);
660   fputts(T("/>\n"), fp);
661 }
662 
663 static void XMLCALL
664 metaEndCdataSection(void *userData) {
665   XML_Parser parser = (XML_Parser)userData;
666   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
667   FILE *fp = data->fp;
668   fputts(T("<endcdata"), fp);
669   metaLocation(parser);
670   fputts(T("/>\n"), fp);
671 }
672 
673 static void XMLCALL
674 metaCharacterData(void *userData, const XML_Char *s, int len) {
675   XML_Parser parser = (XML_Parser)userData;
676   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
677   FILE *fp = data->fp;
678   fputts(T("<chars str=\""), fp);
679   characterData(data, s, len);
680   puttc(T('"'), fp);
681   metaLocation(parser);
682   fputts(T("/>\n"), fp);
683 }
684 
685 static void XMLCALL
686 metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName,
687                      const XML_Char *sysid, const XML_Char *pubid,
688                      int has_internal_subset) {
689   XML_Parser parser = (XML_Parser)userData;
690   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
691   FILE *fp = data->fp;
692   UNUSED_P(sysid);
693   UNUSED_P(pubid);
694   UNUSED_P(has_internal_subset);
695   ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName);
696   metaLocation(parser);
697   fputts(T("/>\n"), fp);
698 }
699 
700 static void XMLCALL
701 metaEndDoctypeDecl(void *userData) {
702   XML_Parser parser = (XML_Parser)userData;
703   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
704   FILE *fp = data->fp;
705   fputts(T("<enddoctype"), fp);
706   metaLocation(parser);
707   fputts(T("/>\n"), fp);
708 }
709 
710 static void XMLCALL
711 metaNotationDecl(void *userData, const XML_Char *notationName,
712                  const XML_Char *base, const XML_Char *systemId,
713                  const XML_Char *publicId) {
714   XML_Parser parser = (XML_Parser)userData;
715   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
716   FILE *fp = data->fp;
717   UNUSED_P(base);
718   ftprintf(fp, T("<notation name=\"%s\""), notationName);
719   if (publicId)
720     ftprintf(fp, T(" public=\"%s\""), publicId);
721   if (systemId) {
722     fputts(T(" system=\""), fp);
723     characterData(data, systemId, (int)tcslen(systemId));
724     puttc(T('"'), fp);
725   }
726   metaLocation(parser);
727   fputts(T("/>\n"), fp);
728 }
729 
730 static void XMLCALL
731 metaEntityDecl(void *userData, const XML_Char *entityName, int is_param,
732                const XML_Char *value, int value_length, const XML_Char *base,
733                const XML_Char *systemId, const XML_Char *publicId,
734                const XML_Char *notationName) {
735   XML_Parser parser = (XML_Parser)userData;
736   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
737   FILE *fp = data->fp;
738 
739   UNUSED_P(is_param);
740   UNUSED_P(base);
741   if (value) {
742     ftprintf(fp, T("<entity name=\"%s\""), entityName);
743     metaLocation(parser);
744     puttc(T('>'), fp);
745     characterData(data, value, value_length);
746     fputts(T("</entity/>\n"), fp);
747   } else if (notationName) {
748     ftprintf(fp, T("<entity name=\"%s\""), entityName);
749     if (publicId)
750       ftprintf(fp, T(" public=\"%s\""), publicId);
751     fputts(T(" system=\""), fp);
752     characterData(data, systemId, (int)tcslen(systemId));
753     puttc(T('"'), fp);
754     ftprintf(fp, T(" notation=\"%s\""), notationName);
755     metaLocation(parser);
756     fputts(T("/>\n"), fp);
757   } else {
758     ftprintf(fp, T("<entity name=\"%s\""), entityName);
759     if (publicId)
760       ftprintf(fp, T(" public=\"%s\""), publicId);
761     fputts(T(" system=\""), fp);
762     characterData(data, systemId, (int)tcslen(systemId));
763     puttc(T('"'), fp);
764     metaLocation(parser);
765     fputts(T("/>\n"), fp);
766   }
767 }
768 
769 static void XMLCALL
770 metaStartNamespaceDecl(void *userData, const XML_Char *prefix,
771                        const XML_Char *uri) {
772   XML_Parser parser = (XML_Parser)userData;
773   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
774   FILE *fp = data->fp;
775   fputts(T("<startns"), fp);
776   if (prefix)
777     ftprintf(fp, T(" prefix=\"%s\""), prefix);
778   if (uri) {
779     fputts(T(" ns=\""), fp);
780     characterData(data, uri, (int)tcslen(uri));
781     fputts(T("\"/>\n"), fp);
782   } else
783     fputts(T("/>\n"), fp);
784 }
785 
786 static void XMLCALL
787 metaEndNamespaceDecl(void *userData, const XML_Char *prefix) {
788   XML_Parser parser = (XML_Parser)userData;
789   XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
790   FILE *fp = data->fp;
791   if (! prefix)
792     fputts(T("<endns/>\n"), fp);
793   else
794     ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix);
795 }
796 
797 static int XMLCALL
798 unknownEncodingConvert(void *data, const char *p) {
799   return codepageConvert(*(int *)data, p);
800 }
801 
802 static int XMLCALL
803 unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) {
804   int cp;
805   static const XML_Char prefixL[] = T("windows-");
806   static const XML_Char prefixU[] = T("WINDOWS-");
807   int i;
808 
809   UNUSED_P(userData);
810   for (i = 0; prefixU[i]; i++)
811     if (name[i] != prefixU[i] && name[i] != prefixL[i])
812       return 0;
813 
814   cp = 0;
815   for (; name[i]; i++) {
816     static const XML_Char digits[] = T("0123456789");
817     const XML_Char *s = tcschr(digits, name[i]);
818     if (! s)
819       return 0;
820     cp *= 10;
821     cp += (int)(s - digits);
822     if (cp >= 0x10000)
823       return 0;
824   }
825   if (! codepageMap(cp, info->map))
826     return 0;
827   info->convert = unknownEncodingConvert;
828   /* We could just cast the code page integer to a void *,
829   and avoid the use of release. */
830   info->release = free;
831   info->data = malloc(sizeof(int));
832   if (! info->data)
833     return 0;
834   *(int *)info->data = cp;
835   return 1;
836 }
837 
838 static int XMLCALL
839 notStandalone(void *userData) {
840   UNUSED_P(userData);
841   return 0;
842 }
843 
844 static void
845 showVersion(XML_Char *prog) {
846   XML_Char *s = prog;
847   XML_Char ch;
848   const XML_Feature *features = XML_GetFeatureList();
849   while ((ch = *s) != 0) {
850     if (ch == '/'
851 #if defined(_WIN32)
852         || ch == '\\'
853 #endif
854     )
855       prog = s + 1;
856     ++s;
857   }
858   ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion());
859   if (features != NULL && features[0].feature != XML_FEATURE_END) {
860     int i = 1;
861     ftprintf(stdout, T("%s"), features[0].name);
862     if (features[0].value)
863       ftprintf(stdout, T("=%ld"), features[0].value);
864     while (features[i].feature != XML_FEATURE_END) {
865       ftprintf(stdout, T(", %s"), features[i].name);
866       if (features[i].value)
867         ftprintf(stdout, T("=%ld"), features[i].value);
868       ++i;
869     }
870     ftprintf(stdout, T("\n"));
871   }
872 }
873 
/* Print the help text to stderr, substituting prog for the program name in
   the three synopsis lines, then terminate the process with exit status rc.
   This function does not return. */
static void
usage(const XML_Char *prog, int rc) {
  ftprintf(
      stderr,
      /* Generated with:
       * $ xmlwf/xmlwf_helpgen.sh
       * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of
       * xmlwf/xmlwf_helpgen.sh in here.
       */
      /* clang-format off */
      T("usage:\n")
      T("  %s [OPTIONS] [FILE ...]\n")
      T("  %s -h\n")
      T("  %s -v\n")
      T("\n")
      T("xmlwf - Determines if an XML document is well-formed\n")
      T("\n")
      T("positional arguments:\n")
      T("  FILE          file to process (default: STDIN)\n")
      T("\n")
      T("input control arguments:\n")
      T("  -s            print an error if the document is not [s]tandalone\n")
      T("  -n            enable [n]amespace processing\n")
      T("  -p            enable processing external DTDs and [p]arameter entities\n")
      T("  -x            enable processing of e[x]ternal entities\n")
      T("  -e ENCODING   override any in-document [e]ncoding declaration\n")
      T("  -w            enable support for [W]indows code pages\n")
      T("  -r            disable memory-mapping and use normal file [r]ead IO calls instead\n")
      T("  -k            when processing multiple files, [k]eep processing after first file with error\n")
      T("\n")
      T("output control arguments:\n")
      T("  -d DIRECTORY  output [d]estination directory\n")
      T("  -c            write a [c]opy of input XML, not canonical XML\n")
      T("  -m            write [m]eta XML, not canonical XML\n")
      T("  -t            write no XML output for [t]iming of plain parsing\n")
      T("  -N            enable adding doctype and [n]otation declarations\n")
      T("\n")
      T("billion laughs attack protection:\n")
      T("  NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n")
      T("\n")
      T("  -a FACTOR     set maximum tolerated [a]mplification factor (default: 100.0)\n")
      T("  -b BYTES      set number of output [b]ytes needed to activate (default: 8 MiB)\n")
      T("\n")
      T("info arguments:\n")
      T("  -h            show this [h]elp message and exit\n")
      T("  -v            show program's [v]ersion number and exit\n")
      T("\n")
      T("exit status:\n")
      T("  0             the input files are well-formed and the output (if requested) was written successfully\n")
      T("  1             could not allocate data structures, signals a serious problem with execution environment\n")
      T("  2             one or more input files were not well-formed\n")
      T("  3             could not create an output file\n")
      T("  4             command-line argument error\n")
      T("\n")
      T("xmlwf of libexpat is software libre, licensed under the MIT license.\n")
      T("Please report bugs at https://github.com/libexpat/libexpat/issues.  Thank you!\n")
      , /* clang-format on */
      prog, prog, prog);
  /* The caller chooses the status, so the same text serves both -h
     (success) and argument errors */
  exit(rc);
}
933 }
934 
#if defined(__MINGW32__) && defined(XML_UNICODE)
/* Silence warning about missing prototype: declare wmain ahead of its
   definition for MinGW builds with XML_UNICODE. */
int wmain(int argc, XML_Char **argv);
#endif
939 
/* Fetch the value of the option whose letter is at argv[i][j] and store a
   pointer to it in constCharStarTarget.  If the letter is the last
   character of its argument, the value is the next argument (usage() is
   called with XMLWF_EXIT_USAGE_ERROR when there is none); otherwise the
   value is the rest of the current argument.  Afterwards i indexes the
   argument following the value and j is reset to 0.
   Note: i and j are modified and the arguments are evaluated more than
   once, so pass plain variables.  The expansion is a bare brace block. */
#define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j)            \
  {                                                                            \
    if (argv[i][j + 1] == T('\0')) {                                           \
      if (++i == argc)                                                         \
        usage(argv[0], XMLWF_EXIT_USAGE_ERROR);                                \
      constCharStarTarget = argv[i];                                           \
    } else {                                                                   \
      constCharStarTarget = argv[i] + j + 1;                                   \
    }                                                                          \
    i++;                                                                       \
    j = 0;                                                                     \
  }
952 
953 int
954 tmain(int argc, XML_Char **argv) {
955   int i, j;
956   const XML_Char *outputDir = NULL;
957   const XML_Char *encoding = NULL;
958   unsigned processFlags = XML_MAP_FILE;
959   int windowsCodePages = 0;
960   int outputType = 0;
961   int useNamespaces = 0;
962   int requireStandalone = 0;
963   int requiresNotations = 0;
964   int continueOnError = 0;
965 
966   float attackMaximumAmplification = -1.0f; /* signaling "not set" */
967   unsigned long long attackThresholdBytes;
968   XML_Bool attackThresholdGiven = XML_FALSE;
969 
970   int exitCode = XMLWF_EXIT_SUCCESS;
971   enum XML_ParamEntityParsing paramEntityParsing
972       = XML_PARAM_ENTITY_PARSING_NEVER;
973   int useStdin = 0;
974   XmlwfUserData userData = {NULL, NULL, NULL};
975 
976 #ifdef _MSC_VER
977   _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
978 #endif
979 
980   i = 1;
981   j = 0;
982   while (i < argc) {
983     if (j == 0) {
984       if (argv[i][0] != T('-'))
985         break;
986       if (argv[i][1] == T('-') && argv[i][2] == T('\0')) {
987         i++;
988         break;
989       }
990       j++;
991     }
992     switch (argv[i][j]) {
993     case T('r'):
994       processFlags &= ~XML_MAP_FILE;
995       j++;
996       break;
997     case T('s'):
998       requireStandalone = 1;
999       j++;
1000       break;
1001     case T('n'):
1002       useNamespaces = 1;
1003       j++;
1004       break;
1005     case T('p'):
1006       paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS;
1007       /* fall through */
1008     case T('x'):
1009       processFlags |= XML_EXTERNAL_ENTITIES;
1010       j++;
1011       break;
1012     case T('w'):
1013       windowsCodePages = 1;
1014       j++;
1015       break;
1016     case T('m'):
1017       outputType = 'm';
1018       j++;
1019       break;
1020     case T('c'):
1021       outputType = 'c';
1022       useNamespaces = 0;
1023       j++;
1024       break;
1025     case T('t'):
1026       outputType = 't';
1027       j++;
1028       break;
1029     case T('N'):
1030       requiresNotations = 1;
1031       j++;
1032       break;
1033     case T('d'):
1034       XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j);
1035       break;
1036     case T('e'):
1037       XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j);
1038       break;
1039     case T('h'):
1040       usage(argv[0], XMLWF_EXIT_SUCCESS);
1041       return 0;
1042     case T('v'):
1043       showVersion(argv[0]);
1044       return 0;
1045     case T('k'):
1046       continueOnError = 1;
1047       j++;
1048       break;
1049     case T('a'): {
1050       const XML_Char *valueText = NULL;
1051       XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1052 
1053       errno = 0;
1054       XML_Char *afterValueText = (XML_Char *)valueText;
1055       attackMaximumAmplification = tcstof(valueText, &afterValueText);
1056       if ((errno != 0) || (afterValueText[0] != T('\0'))
1057           || isnan(attackMaximumAmplification)
1058           || (attackMaximumAmplification < 1.0f)) {
1059         // This prevents tperror(..) from reporting misleading "[..]: Success"
1060         errno = ERANGE;
1061         tperror(T("invalid amplification limit") T(
1062             " (needs a floating point number greater or equal than 1.0)"));
1063         exit(XMLWF_EXIT_USAGE_ERROR);
1064       }
1065 #ifndef XML_DTD
1066       ftprintf(stderr, T("Warning: Given amplification limit ignored") T(
1067                            ", xmlwf has been compiled without DTD support.\n"));
1068 #endif
1069       break;
1070     }
1071     case T('b'): {
1072       const XML_Char *valueText = NULL;
1073       XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1074 
1075       errno = 0;
1076       XML_Char *afterValueText = (XML_Char *)valueText;
1077       attackThresholdBytes = tcstoull(valueText, &afterValueText, 10);
1078       if ((errno != 0) || (afterValueText[0] != T('\0'))) {
1079         // This prevents tperror(..) from reporting misleading "[..]: Success"
1080         errno = ERANGE;
1081         tperror(T("invalid ignore threshold")
1082                     T(" (needs an integer from 0 to 2^64-1)"));
1083         exit(XMLWF_EXIT_USAGE_ERROR);
1084       }
1085       attackThresholdGiven = XML_TRUE;
1086 #ifndef XML_DTD
1087       ftprintf(stderr, T("Warning: Given attack threshold ignored") T(
1088                            ", xmlwf has been compiled without DTD support.\n"));
1089 #endif
1090       break;
1091     }
1092     case T('\0'):
1093       if (j > 1) {
1094         i++;
1095         j = 0;
1096         break;
1097       }
1098       /* fall through */
1099     default:
1100       usage(argv[0], XMLWF_EXIT_USAGE_ERROR);
1101     }
1102   }
1103   if (i == argc) {
1104     useStdin = 1;
1105     processFlags &= ~XML_MAP_FILE;
1106     i--;
1107   }
1108   for (; i < argc; i++) {
1109     XML_Char *outName = 0;
1110     int result;
1111     XML_Parser parser;
1112     if (useNamespaces)
1113       parser = XML_ParserCreateNS(encoding, NSSEP);
1114     else
1115       parser = XML_ParserCreate(encoding);
1116 
1117     if (! parser) {
1118       tperror(T("Could not instantiate parser"));
1119       exit(XMLWF_EXIT_INTERNAL_ERROR);
1120     }
1121 
1122     if (attackMaximumAmplification != -1.0f) {
1123 #ifdef XML_DTD
1124       XML_SetBillionLaughsAttackProtectionMaximumAmplification(
1125           parser, attackMaximumAmplification);
1126 #endif
1127     }
1128     if (attackThresholdGiven) {
1129 #ifdef XML_DTD
1130       XML_SetBillionLaughsAttackProtectionActivationThreshold(
1131           parser, attackThresholdBytes);
1132 #else
1133       (void)attackThresholdBytes; // silence -Wunused-but-set-variable
1134 #endif
1135     }
1136 
1137     if (requireStandalone)
1138       XML_SetNotStandaloneHandler(parser, notStandalone);
1139     XML_SetParamEntityParsing(parser, paramEntityParsing);
1140     if (outputType == 't') {
1141       /* This is for doing timings; this gives a more realistic estimate of
1142          the parsing time. */
1143       outputDir = 0;
1144       XML_SetElementHandler(parser, nopStartElement, nopEndElement);
1145       XML_SetCharacterDataHandler(parser, nopCharacterData);
1146       XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction);
1147     } else if (outputDir) {
1148       const XML_Char *delim = T("/");
1149       const XML_Char *file = useStdin ? T("STDIN") : argv[i];
1150       if (! useStdin) {
1151         /* Jump after last (back)slash */
1152         const XML_Char *lastDelim = tcsrchr(file, delim[0]);
1153         if (lastDelim)
1154           file = lastDelim + 1;
1155 #if defined(_WIN32)
1156         else {
1157           const XML_Char *winDelim = T("\\");
1158           lastDelim = tcsrchr(file, winDelim[0]);
1159           if (lastDelim) {
1160             file = lastDelim + 1;
1161             delim = winDelim;
1162           }
1163         }
1164 #endif
1165       }
1166       outName = (XML_Char *)malloc((tcslen(outputDir) + tcslen(file) + 2)
1167                                    * sizeof(XML_Char));
1168       if (! outName) {
1169         tperror(T("Could not allocate memory"));
1170         exit(XMLWF_EXIT_INTERNAL_ERROR);
1171       }
1172       tcscpy(outName, outputDir);
1173       tcscat(outName, delim);
1174       tcscat(outName, file);
1175       userData.fp = tfopen(outName, T("wb"));
1176       if (! userData.fp) {
1177         tperror(outName);
1178         exitCode = XMLWF_EXIT_OUTPUT_ERROR;
1179         free(outName);
1180         XML_ParserFree(parser);
1181         if (continueOnError) {
1182           continue;
1183         } else {
1184           break;
1185         }
1186       }
1187       setvbuf(userData.fp, NULL, _IOFBF, 16384);
1188 #ifdef XML_UNICODE
1189       puttc(0xFEFF, userData.fp);
1190 #endif
1191       XML_SetUserData(parser, &userData);
1192       switch (outputType) {
1193       case 'm':
1194         XML_UseParserAsHandlerArg(parser);
1195         XML_SetElementHandler(parser, metaStartElement, metaEndElement);
1196         XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction);
1197         XML_SetCommentHandler(parser, metaComment);
1198         XML_SetCdataSectionHandler(parser, metaStartCdataSection,
1199                                    metaEndCdataSection);
1200         XML_SetCharacterDataHandler(parser, metaCharacterData);
1201         XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl,
1202                                   metaEndDoctypeDecl);
1203         XML_SetEntityDeclHandler(parser, metaEntityDecl);
1204         XML_SetNotationDeclHandler(parser, metaNotationDecl);
1205         XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl,
1206                                     metaEndNamespaceDecl);
1207         metaStartDocument(parser);
1208         break;
1209       case 'c':
1210         XML_UseParserAsHandlerArg(parser);
1211         XML_SetDefaultHandler(parser, markup);
1212         XML_SetElementHandler(parser, defaultStartElement, defaultEndElement);
1213         XML_SetCharacterDataHandler(parser, defaultCharacterData);
1214         XML_SetProcessingInstructionHandler(parser,
1215                                             defaultProcessingInstruction);
1216         break;
1217       default:
1218         if (useNamespaces)
1219           XML_SetElementHandler(parser, startElementNS, endElementNS);
1220         else
1221           XML_SetElementHandler(parser, startElement, endElement);
1222         XML_SetCharacterDataHandler(parser, characterData);
1223 #ifndef W3C14N
1224         XML_SetProcessingInstructionHandler(parser, processingInstruction);
1225         if (requiresNotations) {
1226           XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl);
1227           XML_SetNotationDeclHandler(parser, notationDecl);
1228         }
1229 #endif /* not W3C14N */
1230         break;
1231       }
1232     }
1233     if (windowsCodePages)
1234       XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0);
1235     result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags);
1236     if (outputDir) {
1237       if (outputType == 'm')
1238         metaEndDocument(parser);
1239       fclose(userData.fp);
1240       if (! result) {
1241         tremove(outName);
1242       }
1243       free(outName);
1244     }
1245     XML_ParserFree(parser);
1246     if (! result) {
1247       exitCode = XMLWF_EXIT_NOT_WELLFORMED;
1248       cleanupUserData(&userData);
1249       if (! continueOnError) {
1250         break;
1251       }
1252     }
1253   }
1254   return exitCode;
1255 }
1256