10a48773fSEric van Gyzen /*
20a48773fSEric van Gyzen __ __ _
30a48773fSEric van Gyzen ___\ \/ /_ __ __ _| |_
40a48773fSEric van Gyzen / _ \\ /| '_ \ / _` | __|
50a48773fSEric van Gyzen | __// \| |_) | (_| | |_
60a48773fSEric van Gyzen \___/_/\_\ .__/ \__,_|\__|
70a48773fSEric van Gyzen |_| XML parser
80a48773fSEric van Gyzen
90a48773fSEric van Gyzen Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10cc68614dSXin LI Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11cc68614dSXin LI Copyright (c) 2002-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12cc68614dSXin LI Copyright (c) 2004-2006 Karl Waclawek <karl@waclawek.net>
13ac69e5d4SEric van Gyzen Copyright (c) 2005-2007 Steven Solie <steven@solie.ca>
14*4543ef51SXin LI Copyright (c) 2016-2023 Sebastian Pipping <sebastian@pipping.org>
15cc68614dSXin LI Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
16cc68614dSXin LI Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
17*4543ef51SXin LI Copyright (c) 2021 Donghee Na <donghee.na@python.org>
180a48773fSEric van Gyzen Licensed under the MIT license:
190a48773fSEric van Gyzen
200a48773fSEric van Gyzen Permission is hereby granted, free of charge, to any person obtaining
210a48773fSEric van Gyzen a copy of this software and associated documentation files (the
220a48773fSEric van Gyzen "Software"), to deal in the Software without restriction, including
230a48773fSEric van Gyzen without limitation the rights to use, copy, modify, merge, publish,
240a48773fSEric van Gyzen distribute, sublicense, and/or sell copies of the Software, and to permit
250a48773fSEric van Gyzen persons to whom the Software is furnished to do so, subject to the
260a48773fSEric van Gyzen following conditions:
270a48773fSEric van Gyzen
280a48773fSEric van Gyzen The above copyright notice and this permission notice shall be included
290a48773fSEric van Gyzen in all copies or substantial portions of the Software.
300a48773fSEric van Gyzen
310a48773fSEric van Gyzen THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
320a48773fSEric van Gyzen EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
330a48773fSEric van Gyzen MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
340a48773fSEric van Gyzen NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
350a48773fSEric van Gyzen DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
360a48773fSEric van Gyzen OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
370a48773fSEric van Gyzen USE OR OTHER DEALINGS IN THE SOFTWARE.
385bb6a25fSPoul-Henning Kamp */
395bb6a25fSPoul-Henning Kamp
40*4543ef51SXin LI #include "expat_config.h"
41cc68614dSXin LI
425bb6a25fSPoul-Henning Kamp #include <stdio.h>
435bb6a25fSPoul-Henning Kamp #include <stdlib.h>
445bb6a25fSPoul-Henning Kamp #include <stddef.h>
455bb6a25fSPoul-Henning Kamp #include <string.h>
465bb6a25fSPoul-Henning Kamp #include <fcntl.h>
47220ed979SColeman Kane
480a48773fSEric van Gyzen #ifdef _WIN32
495bb6a25fSPoul-Henning Kamp # include "winconfig.h"
50cc68614dSXin LI #endif
51220ed979SColeman Kane
525bb6a25fSPoul-Henning Kamp #include "expat.h"
53be8aff81SXin LI #include "internal.h" /* for UNUSED_P only */
545bb6a25fSPoul-Henning Kamp #include "xmlfile.h"
555bb6a25fSPoul-Henning Kamp #include "xmltchar.h"
565bb6a25fSPoul-Henning Kamp #include "filemap.h"
575bb6a25fSPoul-Henning Kamp
580a48773fSEric van Gyzen #if defined(_MSC_VER)
595bb6a25fSPoul-Henning Kamp # include <io.h>
605bb6a25fSPoul-Henning Kamp #endif
615bb6a25fSPoul-Henning Kamp
625bb6a25fSPoul-Henning Kamp #ifdef HAVE_UNISTD_H
635bb6a25fSPoul-Henning Kamp # include <unistd.h>
645bb6a25fSPoul-Henning Kamp #endif
655bb6a25fSPoul-Henning Kamp
665bb6a25fSPoul-Henning Kamp #ifndef O_BINARY
675bb6a25fSPoul-Henning Kamp # ifdef _O_BINARY
685bb6a25fSPoul-Henning Kamp # define O_BINARY _O_BINARY
695bb6a25fSPoul-Henning Kamp # else
705bb6a25fSPoul-Henning Kamp # define O_BINARY 0
715bb6a25fSPoul-Henning Kamp # endif
725bb6a25fSPoul-Henning Kamp #endif
735bb6a25fSPoul-Henning Kamp
/* Size in bytes of each chunk processStream() requests from the parser via
   XML_GetBuffer() and then fills with read().  Defaults to 8 KiB.  Not
   declared static, so it is visible to other translation units
   (NOTE(review): presumably so the driver program can override it --
   confirm against the header that declares it). */
int g_read_size_bytes = 8 * 1024;
755bb6a25fSPoul-Henning Kamp
765bb6a25fSPoul-Henning Kamp typedef struct {
775bb6a25fSPoul-Henning Kamp XML_Parser parser;
785bb6a25fSPoul-Henning Kamp int *retPtr;
795bb6a25fSPoul-Henning Kamp } PROCESS_ARGS;
805bb6a25fSPoul-Henning Kamp
816b2c1e49SXin LI static int processStream(const XML_Char *filename, XML_Parser parser);
820a48773fSEric van Gyzen
835bb6a25fSPoul-Henning Kamp static void
reportError(XML_Parser parser,const XML_Char * filename)846b2c1e49SXin LI reportError(XML_Parser parser, const XML_Char *filename) {
85220ed979SColeman Kane enum XML_Error code = XML_GetErrorCode(parser);
865bb6a25fSPoul-Henning Kamp const XML_Char *message = XML_ErrorString(code);
875bb6a25fSPoul-Henning Kamp if (message)
880a48773fSEric van Gyzen ftprintf(stdout,
896b2c1e49SXin LI T("%s") T(":%") T(XML_FMT_INT_MOD) T("u") T(":%")
906b2c1e49SXin LI T(XML_FMT_INT_MOD) T("u") T(": %s\n"),
916b2c1e49SXin LI filename, XML_GetErrorLineNumber(parser),
926b2c1e49SXin LI XML_GetErrorColumnNumber(parser), message);
935bb6a25fSPoul-Henning Kamp else
945bb6a25fSPoul-Henning Kamp ftprintf(stderr, T("%s: (unknown message %d)\n"), filename, code);
955bb6a25fSPoul-Henning Kamp }
965bb6a25fSPoul-Henning Kamp
97220ed979SColeman Kane /* This implementation will give problems on files larger than INT_MAX. */
985bb6a25fSPoul-Henning Kamp static void
processFile(const void * data,size_t size,const XML_Char * filename,void * args)996b2c1e49SXin LI processFile(const void *data, size_t size, const XML_Char *filename,
1006b2c1e49SXin LI void *args) {
1015bb6a25fSPoul-Henning Kamp XML_Parser parser = ((PROCESS_ARGS *)args)->parser;
1025bb6a25fSPoul-Henning Kamp int *retPtr = ((PROCESS_ARGS *)args)->retPtr;
103220ed979SColeman Kane if (XML_Parse(parser, (const char *)data, (int)size, 1) == XML_STATUS_ERROR) {
1045bb6a25fSPoul-Henning Kamp reportError(parser, filename);
1055bb6a25fSPoul-Henning Kamp *retPtr = 0;
1066b2c1e49SXin LI } else
1075bb6a25fSPoul-Henning Kamp *retPtr = 1;
1085bb6a25fSPoul-Henning Kamp }
1095bb6a25fSPoul-Henning Kamp
1100a48773fSEric van Gyzen #if defined(_WIN32)
1115bb6a25fSPoul-Henning Kamp
1125bb6a25fSPoul-Henning Kamp static int
isAsciiLetter(XML_Char c)1136b2c1e49SXin LI isAsciiLetter(XML_Char c) {
1145bb6a25fSPoul-Henning Kamp return (T('a') <= c && c <= T('z')) || (T('A') <= c && c <= T('Z'));
1155bb6a25fSPoul-Henning Kamp }
1165bb6a25fSPoul-Henning Kamp
1170a48773fSEric van Gyzen #endif /* _WIN32 */
1185bb6a25fSPoul-Henning Kamp
1195bb6a25fSPoul-Henning Kamp static const XML_Char *
resolveSystemId(const XML_Char * base,const XML_Char * systemId,XML_Char ** toFree)1205bb6a25fSPoul-Henning Kamp resolveSystemId(const XML_Char *base, const XML_Char *systemId,
1216b2c1e49SXin LI XML_Char **toFree) {
1225bb6a25fSPoul-Henning Kamp XML_Char *s;
1235bb6a25fSPoul-Henning Kamp *toFree = 0;
1246b2c1e49SXin LI if (! base || *systemId == T('/')
1250a48773fSEric van Gyzen #if defined(_WIN32)
1265bb6a25fSPoul-Henning Kamp || *systemId == T('\\')
1275bb6a25fSPoul-Henning Kamp || (isAsciiLetter(systemId[0]) && systemId[1] == T(':'))
1285bb6a25fSPoul-Henning Kamp #endif
1295bb6a25fSPoul-Henning Kamp )
1305bb6a25fSPoul-Henning Kamp return systemId;
1315bb6a25fSPoul-Henning Kamp *toFree = (XML_Char *)malloc((tcslen(base) + tcslen(systemId) + 2)
1325bb6a25fSPoul-Henning Kamp * sizeof(XML_Char));
1335bb6a25fSPoul-Henning Kamp if (! *toFree)
1345bb6a25fSPoul-Henning Kamp return systemId;
1355bb6a25fSPoul-Henning Kamp tcscpy(*toFree, base);
1365bb6a25fSPoul-Henning Kamp s = *toFree;
1375bb6a25fSPoul-Henning Kamp if (tcsrchr(s, T('/')))
1385bb6a25fSPoul-Henning Kamp s = tcsrchr(s, T('/')) + 1;
1390a48773fSEric van Gyzen #if defined(_WIN32)
1405bb6a25fSPoul-Henning Kamp if (tcsrchr(s, T('\\')))
1415bb6a25fSPoul-Henning Kamp s = tcsrchr(s, T('\\')) + 1;
1425bb6a25fSPoul-Henning Kamp #endif
1435bb6a25fSPoul-Henning Kamp tcscpy(s, systemId);
1445bb6a25fSPoul-Henning Kamp return *toFree;
1455bb6a25fSPoul-Henning Kamp }
1465bb6a25fSPoul-Henning Kamp
1475bb6a25fSPoul-Henning Kamp static int
externalEntityRefFilemap(XML_Parser parser,const XML_Char * context,const XML_Char * base,const XML_Char * systemId,const XML_Char * publicId)1486b2c1e49SXin LI externalEntityRefFilemap(XML_Parser parser, const XML_Char *context,
1496b2c1e49SXin LI const XML_Char *base, const XML_Char *systemId,
1506b2c1e49SXin LI const XML_Char *publicId) {
1515bb6a25fSPoul-Henning Kamp int result;
1525bb6a25fSPoul-Henning Kamp XML_Char *s;
1535bb6a25fSPoul-Henning Kamp const XML_Char *filename;
1545bb6a25fSPoul-Henning Kamp XML_Parser entParser = XML_ExternalEntityParserCreate(parser, context, 0);
1550a48773fSEric van Gyzen int filemapRes;
1565bb6a25fSPoul-Henning Kamp PROCESS_ARGS args;
1576b2c1e49SXin LI UNUSED_P(publicId);
1585bb6a25fSPoul-Henning Kamp args.retPtr = &result;
1595bb6a25fSPoul-Henning Kamp args.parser = entParser;
1605bb6a25fSPoul-Henning Kamp filename = resolveSystemId(base, systemId, &s);
1615bb6a25fSPoul-Henning Kamp XML_SetBase(entParser, filename);
1620a48773fSEric van Gyzen filemapRes = filemap(filename, processFile, &args);
1630a48773fSEric van Gyzen switch (filemapRes) {
1640a48773fSEric van Gyzen case 0:
1655bb6a25fSPoul-Henning Kamp result = 0;
1660a48773fSEric van Gyzen break;
1670a48773fSEric van Gyzen case 2:
1686b2c1e49SXin LI ftprintf(stderr,
1696b2c1e49SXin LI T("%s: file too large for memory-mapping")
1706b2c1e49SXin LI T(", switching to streaming\n"),
1716b2c1e49SXin LI filename);
1720a48773fSEric van Gyzen result = processStream(filename, entParser);
1730a48773fSEric van Gyzen break;
1740a48773fSEric van Gyzen }
1755bb6a25fSPoul-Henning Kamp free(s);
1765bb6a25fSPoul-Henning Kamp XML_ParserFree(entParser);
1775bb6a25fSPoul-Henning Kamp return result;
1785bb6a25fSPoul-Henning Kamp }
1795bb6a25fSPoul-Henning Kamp
1805bb6a25fSPoul-Henning Kamp static int
processStream(const XML_Char * filename,XML_Parser parser)1816b2c1e49SXin LI processStream(const XML_Char *filename, XML_Parser parser) {
182cc68614dSXin LI /* passing NULL for filename means read input from stdin */
1835bb6a25fSPoul-Henning Kamp int fd = 0; /* 0 is the fileno for stdin */
1845bb6a25fSPoul-Henning Kamp
1855bb6a25fSPoul-Henning Kamp if (filename != NULL) {
1865bb6a25fSPoul-Henning Kamp fd = topen(filename, O_BINARY | O_RDONLY);
1875bb6a25fSPoul-Henning Kamp if (fd < 0) {
1885bb6a25fSPoul-Henning Kamp tperror(filename);
1895bb6a25fSPoul-Henning Kamp return 0;
1905bb6a25fSPoul-Henning Kamp }
1915bb6a25fSPoul-Henning Kamp }
1925bb6a25fSPoul-Henning Kamp for (;;) {
1935bb6a25fSPoul-Henning Kamp int nread;
194*4543ef51SXin LI char *buf = (char *)XML_GetBuffer(parser, g_read_size_bytes);
1955bb6a25fSPoul-Henning Kamp if (! buf) {
1965bb6a25fSPoul-Henning Kamp if (filename != NULL)
1975bb6a25fSPoul-Henning Kamp close(fd);
1985bb6a25fSPoul-Henning Kamp ftprintf(stderr, T("%s: out of memory\n"),
1990a48773fSEric van Gyzen filename != NULL ? filename : T("xmlwf"));
2005bb6a25fSPoul-Henning Kamp return 0;
2015bb6a25fSPoul-Henning Kamp }
202*4543ef51SXin LI nread = read(fd, buf, g_read_size_bytes);
2035bb6a25fSPoul-Henning Kamp if (nread < 0) {
2040a48773fSEric van Gyzen tperror(filename != NULL ? filename : T("STDIN"));
2055bb6a25fSPoul-Henning Kamp if (filename != NULL)
2065bb6a25fSPoul-Henning Kamp close(fd);
2075bb6a25fSPoul-Henning Kamp return 0;
2085bb6a25fSPoul-Henning Kamp }
2095bb6a25fSPoul-Henning Kamp if (XML_ParseBuffer(parser, nread, nread == 0) == XML_STATUS_ERROR) {
2100a48773fSEric van Gyzen reportError(parser, filename != NULL ? filename : T("STDIN"));
2115bb6a25fSPoul-Henning Kamp if (filename != NULL)
2125bb6a25fSPoul-Henning Kamp close(fd);
2135bb6a25fSPoul-Henning Kamp return 0;
2145bb6a25fSPoul-Henning Kamp }
2155bb6a25fSPoul-Henning Kamp if (nread == 0) {
2165bb6a25fSPoul-Henning Kamp if (filename != NULL)
2175bb6a25fSPoul-Henning Kamp close(fd);
2186b2c1e49SXin LI break;
2196b2c1e49SXin LI ;
2205bb6a25fSPoul-Henning Kamp }
2215bb6a25fSPoul-Henning Kamp }
2225bb6a25fSPoul-Henning Kamp return 1;
2235bb6a25fSPoul-Henning Kamp }
2245bb6a25fSPoul-Henning Kamp
2255bb6a25fSPoul-Henning Kamp static int
externalEntityRefStream(XML_Parser parser,const XML_Char * context,const XML_Char * base,const XML_Char * systemId,const XML_Char * publicId)2266b2c1e49SXin LI externalEntityRefStream(XML_Parser parser, const XML_Char *context,
2276b2c1e49SXin LI const XML_Char *base, const XML_Char *systemId,
2286b2c1e49SXin LI const XML_Char *publicId) {
2295bb6a25fSPoul-Henning Kamp XML_Char *s;
2305bb6a25fSPoul-Henning Kamp const XML_Char *filename;
2315bb6a25fSPoul-Henning Kamp int ret;
2325bb6a25fSPoul-Henning Kamp XML_Parser entParser = XML_ExternalEntityParserCreate(parser, context, 0);
2336b2c1e49SXin LI UNUSED_P(publicId);
2345bb6a25fSPoul-Henning Kamp filename = resolveSystemId(base, systemId, &s);
2355bb6a25fSPoul-Henning Kamp XML_SetBase(entParser, filename);
2365bb6a25fSPoul-Henning Kamp ret = processStream(filename, entParser);
2375bb6a25fSPoul-Henning Kamp free(s);
2385bb6a25fSPoul-Henning Kamp XML_ParserFree(entParser);
2395bb6a25fSPoul-Henning Kamp return ret;
2405bb6a25fSPoul-Henning Kamp }
2415bb6a25fSPoul-Henning Kamp
2425bb6a25fSPoul-Henning Kamp int
XML_ProcessFile(XML_Parser parser,const XML_Char * filename,unsigned flags)2436b2c1e49SXin LI XML_ProcessFile(XML_Parser parser, const XML_Char *filename, unsigned flags) {
2445bb6a25fSPoul-Henning Kamp int result;
2455bb6a25fSPoul-Henning Kamp
2465bb6a25fSPoul-Henning Kamp if (! XML_SetBase(parser, filename)) {
2475bb6a25fSPoul-Henning Kamp ftprintf(stderr, T("%s: out of memory"), filename);
2485bb6a25fSPoul-Henning Kamp exit(1);
2495bb6a25fSPoul-Henning Kamp }
2505bb6a25fSPoul-Henning Kamp
2515bb6a25fSPoul-Henning Kamp if (flags & XML_EXTERNAL_ENTITIES)
2526b2c1e49SXin LI XML_SetExternalEntityRefHandler(parser, (flags & XML_MAP_FILE)
2535bb6a25fSPoul-Henning Kamp ? externalEntityRefFilemap
2545bb6a25fSPoul-Henning Kamp : externalEntityRefStream);
2555bb6a25fSPoul-Henning Kamp if (flags & XML_MAP_FILE) {
2560a48773fSEric van Gyzen int filemapRes;
2575bb6a25fSPoul-Henning Kamp PROCESS_ARGS args;
2585bb6a25fSPoul-Henning Kamp args.retPtr = &result;
2595bb6a25fSPoul-Henning Kamp args.parser = parser;
2600a48773fSEric van Gyzen filemapRes = filemap(filename, processFile, &args);
2610a48773fSEric van Gyzen switch (filemapRes) {
2620a48773fSEric van Gyzen case 0:
2635bb6a25fSPoul-Henning Kamp result = 0;
2640a48773fSEric van Gyzen break;
2650a48773fSEric van Gyzen case 2:
2666b2c1e49SXin LI ftprintf(stderr,
2676b2c1e49SXin LI T("%s: file too large for memory-mapping")
2686b2c1e49SXin LI T(", switching to streaming\n"),
2696b2c1e49SXin LI filename);
2700a48773fSEric van Gyzen result = processStream(filename, parser);
2710a48773fSEric van Gyzen break;
2720a48773fSEric van Gyzen }
2736b2c1e49SXin LI } else
2745bb6a25fSPoul-Henning Kamp result = processStream(filename, parser);
2755bb6a25fSPoul-Henning Kamp return result;
2765bb6a25fSPoul-Henning Kamp }
277