xref: /freebsd/contrib/file/src/is_json.c (revision 4f5890a0fb086324a657f3cd7ba1abc57274e0db)
1 /*-
2  * Copyright (c) 2018 Christos Zoulas
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24  * POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 /*
28  * Parse JSON object serialization format (RFC-7159)
29  */
30 
31 #ifndef TEST
32 #include "file.h"
33 
34 #ifndef lint
35 FILE_RCSID("@(#)$File: is_json.c,v 1.20 2022/05/28 00:44:22 christos Exp $")
36 #endif
37 
38 #include "magic.h"
39 #else
40 #include <stdio.h>
41 #include <stddef.h>
42 #endif
43 #include <string.h>
44 
/*
 * Debug tracing.
 *
 * With DEBUG defined, DPRINTF(a, b, c) prints the label a, the byte at b
 * (as hex and as a character) and the bytes from c up to b, indented by
 * lvl columns.  The macro refers to lvl by name, so a variable called lvl
 * must be in scope wherever it is used.
 *
 * Without DEBUG, DPRINTF expands to an empty statement, and
 * __file_debugused tags parameters that only the trace would read as
 * unused.
 */
#ifdef DEBUG
#include <stdio.h>
#define DPRINTF(a, b, c)	\
    printf("%*s%s [%.2x/%c] %.*s\n", (int)lvl, "", (a), *(b), *(b), \
	(int)(b - c), (const char *)(c))
#define __file_debugused
#else
#define DPRINTF(a, b, c)	do { } while (/*CONSTCOND*/0)
#define __file_debugused __attribute__((__unused__))
#endif
55 
/*
 * Indices into the counter array (size_t st[JSON_MAX]) that is threaded
 * through the parser.  json_parse() bumps the slot matching the kind of
 * value it just parsed successfully; json_parse_array() additionally
 * bumps JSON_ARRAYN each time it reaches a closing ']'.
 */
#define JSON_ARRAY	0
#define JSON_CONSTANT	1
#define JSON_NUMBER	2
#define JSON_OBJECT	3
#define JSON_STRING	4
#define JSON_ARRAYN	5
#define JSON_MAX	6	/* number of counters, not a counter itself */
63 
/*
 * if JSON_COUNT != 0:
 *	count all the objects, require that we have the whole data file
 * otherwise:
 *	stop if we find an object or an array
 *
 * JSON_COUNT defaults to 0 unless it is defined before this point.
 *
 * NOTE(review): the early-return code in json_parse() is guarded by
 * "#if JSON_COUNT", so with the default of 0 it is not compiled in.
 * That looks like the opposite of what is described above -- confirm
 * which behaviour is intended.
 */
#ifndef JSON_COUNT
#define JSON_COUNT 0
#endif
73 
/*
 * Forward declaration: json_parse() and the array/object parsers call
 * each other recursively.
 */
static int json_parse(const unsigned char **, const unsigned char *, size_t *,
	size_t);
76 
/*
 * Return 1 if uc is one of the four bytes treated as JSON whitespace
 * (space, line feed, carriage return, horizontal tab), otherwise 0.
 */
static int
json_isspace(const unsigned char uc)
{
	return uc == ' ' || uc == '\n' || uc == '\r' || uc == '\t';
}
90 
/*
 * Return 1 if uc is a decimal digit '0'..'9', otherwise 0.
 * Locale independent, unlike isdigit(3).
 */
static int
json_isdigit(unsigned char uc)
{
	/* The C standard guarantees that '0'..'9' are contiguous. */
	return uc >= '0' && uc <= '9';
}
102 
/*
 * Return 1 if uc is a hexadecimal digit (0-9, a-f, A-F), otherwise 0.
 */
static int
json_isxdigit(unsigned char uc)
{
	if (json_isdigit(uc))
		return 1;
	if (uc >= 'a' && uc <= 'f')
		return 1;
	if (uc >= 'A' && uc <= 'F')
		return 1;
	return 0;
}
116 
/*
 * Return a pointer to the first byte in [uc, ue) that is not JSON
 * whitespace, or ue if only whitespace remains.
 */
static const unsigned char *
json_skip_space(const unsigned char *uc, const unsigned char *ue)
{
	for (; uc < ue; uc++) {
		if (!json_isspace(*uc))
			break;
	}
	return uc;
}
124 
125 static int
126 json_parse_string(const unsigned char **ucp, const unsigned char *ue,
127     size_t lvl __file_debugused)
128 {
129 	const unsigned char *uc = *ucp;
130 	size_t i;
131 
132 	DPRINTF("Parse string: ", uc, *ucp);
133 	while (uc < ue) {
134 		switch (*uc++) {
135 		case '\0':
136 			goto out;
137 		case '\\':
138 			if (uc == ue)
139 				goto out;
140 			switch (*uc++) {
141 			case '\0':
142 				goto out;
143 			case '"':
144 			case '\\':
145 			case '/':
146 			case 'b':
147 			case 'f':
148 			case 'n':
149 			case 'r':
150 			case 't':
151 				continue;
152 			case 'u':
153 				if (ue - uc < 4) {
154 					uc = ue;
155 					goto out;
156 				}
157 				for (i = 0; i < 4; i++)
158 					if (!json_isxdigit(*uc++))
159 						goto out;
160 				continue;
161 			default:
162 				goto out;
163 			}
164 		case '"':
165 			DPRINTF("Good string: ", uc, *ucp);
166 			*ucp = uc;
167 			return 1;
168 		default:
169 			continue;
170 		}
171 	}
172 out:
173 	DPRINTF("Bad string: ", uc, *ucp);
174 	*ucp = uc;
175 	return 0;
176 }
177 
178 static int
179 json_parse_array(const unsigned char **ucp, const unsigned char *ue,
180 	size_t *st, size_t lvl)
181 {
182 	const unsigned char *uc = *ucp;
183 
184 	DPRINTF("Parse array: ", uc, *ucp);
185 	while (uc < ue) {
186 		if (*uc == ']')
187 			goto done;
188 		if (!json_parse(&uc, ue, st, lvl + 1))
189 			goto out;
190 		if (uc == ue)
191 			goto out;
192 		switch (*uc) {
193 		case ',':
194 			uc++;
195 			continue;
196 		case ']':
197 		done:
198 			st[JSON_ARRAYN]++;
199 			DPRINTF("Good array: ", uc, *ucp);
200 			*ucp = uc + 1;
201 			return 1;
202 		default:
203 			goto out;
204 		}
205 	}
206 out:
207 	DPRINTF("Bad array: ", uc,  *ucp);
208 	*ucp = uc;
209 	return 0;
210 }
211 
/*
 * Parse the remainder of an object; the opening '{' has already been
 * consumed by the caller.
 *
 * Each member is a string key, a ':' and a value parsed by json_parse()
 * with lvl + 1; members are separated by ','.  Whitespace is skipped
 * before each key (or the closing brace) and before the ':'.
 *
 * Returns 1 with *ucp just past the closing brace, or 0 with *ucp at
 * the position where parsing stopped.
 */
static int
json_parse_object(const unsigned char **ucp, const unsigned char *ue,
	size_t *st, size_t lvl)
{
	const unsigned char *p = *ucp;
	unsigned char sep;

	DPRINTF("Parse object: ", p, *ucp);
	for (; p < ue;) {
		p = json_skip_space(p, ue);
		if (p == ue)
			break;
		if (*p == '}') {
			p++;
			goto good;
		}

		/* Member name */
		if (*p++ != '"') {
			DPRINTF("not string", p, *ucp);
			break;
		}
		DPRINTF("next field", p, *ucp);
		if (!json_parse_string(&p, ue, lvl)) {
			DPRINTF("not string", p, *ucp);
			break;
		}

		/* Name/value separator */
		p = json_skip_space(p, ue);
		if (p == ue)
			break;
		if (*p++ != ':') {
			DPRINTF("not colon", p, *ucp);
			break;
		}

		/* Member value; json_parse() eats surrounding whitespace */
		if (!json_parse(&p, ue, st, lvl + 1)) {
			DPRINTF("not json", p, *ucp);
			break;
		}
		if (p == ue)
			break;

		sep = *p++;
		if (sep == ',')
			continue;
		if (sep == '}') /* { */
			goto good;
		DPRINTF("not more", p, *ucp);
		/*
		 * Overwritten below; it only changes what the DEBUG trace
		 * for "Bad object" prints.
		 */
		*ucp = p - 1;
		break;
	}
	DPRINTF("Bad object: ", p, *ucp);
	*ucp = p;
	return 0;
good:
	DPRINTF("Good object: ", p, *ucp);
	*ucp = p;
	return 1;
}
267 
268 static int
269 json_parse_number(const unsigned char **ucp, const unsigned char *ue,
270     size_t lvl __file_debugused)
271 {
272 	const unsigned char *uc = *ucp;
273 	int got = 0;
274 
275 	DPRINTF("Parse number: ", uc, *ucp);
276 	if (uc == ue)
277 		return 0;
278 	if (*uc == '-')
279 		uc++;
280 
281 	for (; uc < ue; uc++) {
282 		if (!json_isdigit(*uc))
283 			break;
284 		got = 1;
285 	}
286 	if (uc == ue)
287 		goto out;
288 	if (*uc == '.')
289 		uc++;
290 	for (; uc < ue; uc++) {
291 		if (!json_isdigit(*uc))
292 			break;
293 		got = 1;
294 	}
295 	if (uc == ue)
296 		goto out;
297 	if (got && (*uc == 'e' || *uc == 'E')) {
298 		uc++;
299 		got = 0;
300 		if (uc == ue)
301 			goto out;
302 		if (*uc == '+' || *uc == '-')
303 			uc++;
304 		for (; uc < ue; uc++) {
305 			if (!json_isdigit(*uc))
306 				break;
307 			got = 1;
308 		}
309 	}
310 out:
311 	if (!got)
312 		DPRINTF("Bad number: ", uc, *ucp);
313 	else
314 		DPRINTF("Good number: ", uc, *ucp);
315 	*ucp = uc;
316 	return got;
317 }
318 
319 static int
320 json_parse_const(const unsigned char **ucp, const unsigned char *ue,
321     const char *str, size_t len, size_t lvl __file_debugused)
322 {
323 	const unsigned char *uc = *ucp;
324 
325 	DPRINTF("Parse const: ", uc, *ucp);
326 	for (len--; uc < ue && --len;) {
327 		if (*uc++ == *++str)
328 			continue;
329 	}
330 	if (len)
331 		DPRINTF("Bad const: ", uc, *ucp);
332 	*ucp = uc;
333 	return len == 0;
334 }
335 
336 static int
337 json_parse(const unsigned char **ucp, const unsigned char *ue,
338     size_t *st, size_t lvl)
339 {
340 	const unsigned char *uc;
341 	int rv = 0;
342 	int t;
343 
344 	uc = json_skip_space(*ucp, ue);
345 	if (uc == ue)
346 		goto out;
347 
348 	// Avoid recursion
349 	if (lvl > 500) {
350 		DPRINTF("Too many levels", uc, *ucp);
351 		return 0;
352 	}
353 #if JSON_COUNT
354 	/* bail quickly if not counting */
355 	if (lvl > 1 && (st[JSON_OBJECT] || st[JSON_ARRAYN]))
356 		return 1;
357 #endif
358 
359 	DPRINTF("Parse general: ", uc, *ucp);
360 	switch (*uc++) {
361 	case '"':
362 		rv = json_parse_string(&uc, ue, lvl + 1);
363 		t = JSON_STRING;
364 		break;
365 	case '[':
366 		rv = json_parse_array(&uc, ue, st, lvl + 1);
367 		t = JSON_ARRAY;
368 		break;
369 	case '{': /* '}' */
370 		rv = json_parse_object(&uc, ue, st, lvl + 1);
371 		t = JSON_OBJECT;
372 		break;
373 	case 't':
374 		rv = json_parse_const(&uc, ue, "true", sizeof("true"), lvl + 1);
375 		t = JSON_CONSTANT;
376 		break;
377 	case 'f':
378 		rv = json_parse_const(&uc, ue, "false", sizeof("false"),
379 		    lvl + 1);
380 		t = JSON_CONSTANT;
381 		break;
382 	case 'n':
383 		rv = json_parse_const(&uc, ue, "null", sizeof("null"), lvl + 1);
384 		t = JSON_CONSTANT;
385 		break;
386 	default:
387 		--uc;
388 		rv = json_parse_number(&uc, ue, lvl + 1);
389 		t = JSON_NUMBER;
390 		break;
391 	}
392 	if (rv)
393 		st[t]++;
394 	uc = json_skip_space(uc, ue);
395 out:
396 	DPRINTF("End general: ", uc, *ucp);
397 	*ucp = uc;
398 	if (lvl == 0)
399 		return rv && uc == ue && (st[JSON_ARRAYN] || st[JSON_OBJECT]);
400 	return rv;
401 }
402 
403 #ifndef TEST
404 int
405 file_is_json(struct magic_set *ms, const struct buffer *b)
406 {
407 	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
408 	const unsigned char *ue = uc + b->flen;
409 	size_t st[JSON_MAX];
410 	int mime = ms->flags & MAGIC_MIME;
411 
412 
413 	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
414 		return 0;
415 
416 	memset(st, 0, sizeof(st));
417 
418 	if (!json_parse(&uc, ue, st, 0))
419 		return 0;
420 
421 	if (mime == MAGIC_MIME_ENCODING)
422 		return 1;
423 	if (mime) {
424 		if (file_printf(ms, "application/json") == -1)
425 			return -1;
426 		return 1;
427 	}
428 	if (file_printf(ms, "JSON text data") == -1)
429 		return -1;
430 #if JSON_COUNT
431 #define P(n) st[n], st[n] > 1 ? "s" : ""
432 	if (file_printf(ms, " (%" SIZE_T_FORMAT "u object%s, %" SIZE_T_FORMAT
433 	    "u array%s, %" SIZE_T_FORMAT "u string%s, %" SIZE_T_FORMAT
434 	    "u constant%s, %" SIZE_T_FORMAT "u number%s, %" SIZE_T_FORMAT
435 	    "u >1array%s)",
436 	    P(JSON_OBJECT), P(JSON_ARRAY), P(JSON_STRING), P(JSON_CONSTANT),
437 	    P(JSON_NUMBER), P(JSON_ARRAYN))
438 	    == -1)
439 		return -1;
440 #endif
441 	return 1;
442 }
443 
444 #else
445 
446 #include <sys/types.h>
447 #include <sys/stat.h>
448 #include <stdio.h>
449 #include <fcntl.h>
450 #include <unistd.h>
451 #include <stdlib.h>
452 #include <stdint.h>
453 #include <err.h>
454 
455 int
456 main(int argc, char *argv[])
457 {
458 	int fd, rv;
459 	struct stat st;
460 	unsigned char *p;
461 	size_t stats[JSON_MAX];
462 
463 	if ((fd = open(argv[1], O_RDONLY)) == -1)
464 		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
465 
466 	if (fstat(fd, &st) == -1)
467 		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
468 
469 	if ((p = CAST(char *, malloc(st.st_size))) == NULL)
470 		err(EXIT_FAILURE, "Can't allocate %jd bytes",
471 		    (intmax_t)st.st_size);
472 	if (read(fd, p, st.st_size) != st.st_size)
473 		err(EXIT_FAILURE, "Can't read %jd bytes",
474 		    (intmax_t)st.st_size);
475 	memset(stats, 0, sizeof(stats));
476 	printf("is json %d\n", json_parse((const unsigned char **)&p,
477 	    p + st.st_size, stats, 0));
478 	return 0;
479 }
480 #endif
481