/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2002      Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
   Copyright (c) 2016-2017 Sebastian Pipping <sebastian@pipping.org>
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include <stdio.h>
#include <string.h>

/* Size in bytes of the buffer handed to getXMLCharset(), including the
   terminating NUL: a stored charset name has at most CHARSET_MAX - 1
   characters. */
#define CHARSET_MAX 41

/*
 * Return a pointer to the start of the next token found at *pp and advance
 * *pp, or return 0 when no token can be produced.
 *
 * Tokens are:
 *   - one of the separator characters ';', '/' or '=' (*pp is left just
 *     after it);
 *   - a double-quoted string (the returned pointer addresses the opening
 *     quote and *pp is left just after the closing quote);
 *   - an atom: a run of other characters ended by whitespace, '(', '"',
 *     a separator or the end of the input (*pp is left on the terminator).
 *
 * Whitespace between tokens is skipped, as are parenthesised comments, which
 * may nest.  A backslash causes the character after it to be stepped over
 * without being interpreted.  0 is returned at the end of the input, inside
 * an unterminated string or comment, for a ')' outside a comment or string,
 * and for a backslash that is the last character of the input.
 */
static const char *
getTok(const char **pp) {
  /* inComment means one level of comment nesting, inComment + 1 means two
     levels, and so on; the order of the enumerators is relied upon. */
  enum { inAtom, inString, init, inComment };
  int state = init;
  const char *tokStart = 0;
  for (;;) {
    switch (**pp) {
    case '\0':
      /* An atom that runs up to the end of the input is still a token.
         *pp stays on the NUL, so the following call reports the end. */
      if (state == inAtom)
        return tokStart;
      return 0;
    case ' ':
    case '\r':
    case '\t':
    case '\n':
      if (state == inAtom)
        return tokStart;
      break;
    case '(':
      if (state == inAtom)
        return tokStart;
      if (state != inString)
        state++; /* open a comment, or nest one level deeper */
      break;
    case ')':
      if (state > init)
        --state; /* close one comment level */
      else if (state != inString)
        return 0; /* unbalanced ')' */
      break;
    case ';':
    case '/':
    case '=':
      if (state == inAtom)
        return tokStart;
      if (state == init)
        return (*pp)++; /* the separator itself is the token */
      break;
    case '\\':
      /* Step over the backslash here; the shared increment at the bottom
         of the loop then steps over the escaped character. */
      ++*pp;
      if (**pp == '\0')
        return 0;
      break;
    case '"':
      switch (state) {
      case inString:
        ++*pp; /* consume the closing quote */
        return tokStart;
      case inAtom:
        return tokStart;
      case init:
        tokStart = *pp;
        state = inString;
        break;
      default:
        /* inside a comment: a quote has no meaning */
        break;
      }
      break;
    default:
      if (state == init) {
        tokStart = *pp;
        state = inAtom;
      }
      break;
    }
    ++*pp;
  }
  /* not reached */
}

/*
 * Compare the token occupying [start, end) with the NUL-terminated key,
 * treating an uppercase ASCII letter in the token as equal to the
 * corresponding lowercase letter in key.  key must be lowercase ASCII.
 *
 * Returns 1 when the token matches the whole of key, 0 otherwise.  A null
 * start (no token) never matches.
 */
static int
matchkey(const char *start, const char *end, const char *key) {
  if (! start)
    return 0;
  for (; start != end; start++, key++) {
    /* The token is longer than key: stop here rather than comparing
       against, and then stepping past, key's terminating NUL. */
    if (*key == '\0')
      return 0;
    if (*start == *key)
      continue;
    /* Fold case only for letters, so that other key characters cannot
       match an unrelated byte. */
    if (*key >= 'a' && *key <= 'z' && *start == 'A' + (*key - 'a'))
      continue;
    return 0;
  }
  return *key == '\0';
}

1145bb6a25fSPoul-Henning Kamp void
getXMLCharset(const char * buf,char * charset)1156b2c1e49SXin LI getXMLCharset(const char *buf, char *charset) {
1165bb6a25fSPoul-Henning Kamp const char *next, *p;
1175bb6a25fSPoul-Henning Kamp
1185bb6a25fSPoul-Henning Kamp charset[0] = '\0';
1195bb6a25fSPoul-Henning Kamp next = buf;
1205bb6a25fSPoul-Henning Kamp p = getTok(&next);
1215bb6a25fSPoul-Henning Kamp if (matchkey(p, next, "text"))
1225bb6a25fSPoul-Henning Kamp strcpy(charset, "us-ascii");
1235bb6a25fSPoul-Henning Kamp else if (! matchkey(p, next, "application"))
1245bb6a25fSPoul-Henning Kamp return;
1255bb6a25fSPoul-Henning Kamp p = getTok(&next);
1265bb6a25fSPoul-Henning Kamp if (! p || *p != '/')
1275bb6a25fSPoul-Henning Kamp return;
1285bb6a25fSPoul-Henning Kamp p = getTok(&next);
1295bb6a25fSPoul-Henning Kamp if (matchkey(p, next, "xml"))
1305bb6a25fSPoul-Henning Kamp isXml = 1;
1315bb6a25fSPoul-Henning Kamp p = getTok(&next);
1325bb6a25fSPoul-Henning Kamp while (p) {
1335bb6a25fSPoul-Henning Kamp if (*p == ';') {
1345bb6a25fSPoul-Henning Kamp p = getTok(&next);
1355bb6a25fSPoul-Henning Kamp if (matchkey(p, next, "charset")) {
1365bb6a25fSPoul-Henning Kamp p = getTok(&next);
1375bb6a25fSPoul-Henning Kamp if (p && *p == '=') {
1385bb6a25fSPoul-Henning Kamp p = getTok(&next);
1395bb6a25fSPoul-Henning Kamp if (p) {
1405bb6a25fSPoul-Henning Kamp char *s = charset;
1415bb6a25fSPoul-Henning Kamp if (*p == '"') {
1425bb6a25fSPoul-Henning Kamp while (++p != next - 1) {
1435bb6a25fSPoul-Henning Kamp if (*p == '\\')
1445bb6a25fSPoul-Henning Kamp ++p;
1455bb6a25fSPoul-Henning Kamp if (s == charset + CHARSET_MAX - 1) {
1465bb6a25fSPoul-Henning Kamp charset[0] = '\0';
1475bb6a25fSPoul-Henning Kamp break;
1485bb6a25fSPoul-Henning Kamp }
1495bb6a25fSPoul-Henning Kamp *s++ = *p;
1505bb6a25fSPoul-Henning Kamp }
1515bb6a25fSPoul-Henning Kamp *s++ = '\0';
1526b2c1e49SXin LI } else {
1535bb6a25fSPoul-Henning Kamp if (next - p > CHARSET_MAX - 1)
1545bb6a25fSPoul-Henning Kamp break;
1555bb6a25fSPoul-Henning Kamp while (p != next)
1565bb6a25fSPoul-Henning Kamp *s++ = *p++;
1575bb6a25fSPoul-Henning Kamp *s = 0;
1585bb6a25fSPoul-Henning Kamp break;
1595bb6a25fSPoul-Henning Kamp }
1605bb6a25fSPoul-Henning Kamp }
1615bb6a25fSPoul-Henning Kamp }
1625bb6a25fSPoul-Henning Kamp }
1636b2c1e49SXin LI } else
1645bb6a25fSPoul-Henning Kamp p = getTok(&next);
1655bb6a25fSPoul-Henning Kamp }
1665bb6a25fSPoul-Henning Kamp }
1675bb6a25fSPoul-Henning Kamp
1685bb6a25fSPoul-Henning Kamp int
main(int argc,char ** argv)1696b2c1e49SXin LI main(int argc, char **argv) {
1705bb6a25fSPoul-Henning Kamp char buf[CHARSET_MAX];
1715bb6a25fSPoul-Henning Kamp getXMLCharset(argv[1], buf);
1725bb6a25fSPoul-Henning Kamp printf("charset = \"%s\"\n", buf);
1735bb6a25fSPoul-Henning Kamp return 0;
1745bb6a25fSPoul-Henning Kamp }
175