xref: /freebsd/contrib/file/src/is_json.c (revision c07d6445eb89d9dd3950361b065b7bd110e3a043)
1 /*-
2  * Copyright (c) 2018 Christos Zoulas
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24  * POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 /*
28  * Parse JSON object serialization format (RFC-7159)
29  */
30 
31 #ifndef TEST
32 #include "file.h"
33 
34 #ifndef lint
35 FILE_RCSID("@(#)$File: is_json.c,v 1.26 2022/09/13 18:46:07 christos Exp $")
36 #endif
37 
38 #include "magic.h"
39 #else
40 #include <stdio.h>
41 #include <stddef.h>
42 #endif
43 #include <string.h>
44 
/*
 * Debug tracing helpers.
 *
 * With DEBUG defined, DPRINTF(a, b, c) prints one line made of:
 *   - an indent of `lvl' spaces (a variable named `lvl' must be in scope
 *     wherever the macro is expanded),
 *   - the label string a,
 *   - the byte at b, shown in hex and as a character,
 *   - the (b - c) bytes that start at c.
 * Without DEBUG it expands to an empty statement.
 *
 * __file_debugused is attached to the `lvl' parameter of functions that
 * only need it for tracing: it is empty with DEBUG and marks the parameter
 * as unused otherwise.
 */
#ifdef DEBUG
#include <stdio.h>
#define DPRINTF(a, b, c)	\
    printf("%*s%s [%.2x/%c] %.*s\n", (int)lvl, "", (a), *(b), *(b), \
	(int)((b) - (c)), (const char *)(c))
#define __file_debugused
#else
#define DPRINTF(a, b, c)	do { } while (/*CONSTCOND*/0)
#define __file_debugused __attribute__((__unused__))
#endif
55 
/*
 * Indexes into the statistics array that is threaded through the parser.
 * json_parse() bumps the slot matching the kind of value it has just
 * accepted; JSON_ARRAYN is bumped by json_parse_array() each time an array
 * is closed successfully.  JSON_MAX is the number of slots.
 */
enum {
	JSON_ARRAY = 0,
	JSON_CONSTANT = 1,
	JSON_NUMBER = 2,
	JSON_OBJECT = 3,
	JSON_STRING = 4,
	JSON_ARRAYN = 5,
	JSON_MAX = 6
};

/*
 * JSON_COUNT selects the reporting mode:
 *	non-zero: count all the objects; this requires the whole data file
 *	zero:     stop as soon as an object or an array has been found
 * It can be predefined on the compiler command line; the default is 0.
 */
#ifndef JSON_COUNT
#define JSON_COUNT 0
#endif
73 
/*
 * Forward declaration: json_parse() and the array/object parsers call
 * each other recursively.
 */
static int json_parse(const unsigned char **ucp, const unsigned char *ue,
	size_t *st, size_t lvl);
76 
/*
 * Return 1 if uc is one of the four bytes treated as JSON whitespace
 * (space, line feed, carriage return, horizontal tab), 0 otherwise.
 */
static int
json_isspace(const unsigned char uc)
{
	return uc == ' ' || uc == '\n' || uc == '\r' || uc == '\t';
}
90 
/*
 * Return 1 if uc is a decimal digit '0'..'9', 0 otherwise.
 * Independent of the current locale.
 */
static int
json_isdigit(unsigned char uc)
{
	/* C guarantees that '0'..'9' are consecutive in the execution set */
	return uc >= '0' && uc <= '9';
}
102 
/*
 * Return 1 if uc is a hexadecimal digit (0-9, a-f, A-F), 0 otherwise.
 */
static int
json_isxdigit(unsigned char uc)
{
	if (json_isdigit(uc))
		return 1;
	if (uc >= 'a' && uc <= 'f')
		return 1;
	if (uc >= 'A' && uc <= 'F')
		return 1;
	return 0;
}
116 
/*
 * Advance over JSON whitespace.
 * Returns the first position in [uc, ue) that does not hold a whitespace
 * byte, or ue if only whitespace remains.
 */
static const unsigned char *
json_skip_space(const unsigned char *uc, const unsigned char *ue)
{
	for (; uc < ue; uc++) {
		if (!json_isspace(*uc))
			break;
	}
	return uc;
}
124 
125 /*ARGSUSED*/
126 static int
127 json_parse_string(const unsigned char **ucp, const unsigned char *ue,
128     size_t lvl __file_debugused)
129 {
130 	const unsigned char *uc = *ucp;
131 	size_t i;
132 
133 	DPRINTF("Parse string: ", uc, *ucp);
134 	while (uc < ue) {
135 		switch (*uc++) {
136 		case '\0':
137 			goto out;
138 		case '\\':
139 			if (uc == ue)
140 				goto out;
141 			switch (*uc++) {
142 			case '\0':
143 				goto out;
144 			case '"':
145 			case '\\':
146 			case '/':
147 			case 'b':
148 			case 'f':
149 			case 'n':
150 			case 'r':
151 			case 't':
152 				continue;
153 			case 'u':
154 				if (ue - uc < 4) {
155 					uc = ue;
156 					goto out;
157 				}
158 				for (i = 0; i < 4; i++)
159 					if (!json_isxdigit(*uc++))
160 						goto out;
161 				continue;
162 			default:
163 				goto out;
164 			}
165 		case '"':
166 			DPRINTF("Good string: ", uc, *ucp);
167 			*ucp = uc;
168 			return 1;
169 		default:
170 			continue;
171 		}
172 	}
173 out:
174 	DPRINTF("Bad string: ", uc, *ucp);
175 	*ucp = uc;
176 	return 0;
177 }
178 
179 static int
180 json_parse_array(const unsigned char **ucp, const unsigned char *ue,
181 	size_t *st, size_t lvl)
182 {
183 	const unsigned char *uc = *ucp;
184 
185 	DPRINTF("Parse array: ", uc, *ucp);
186 	while (uc < ue) {
187 		uc = json_skip_space(uc, ue);
188 		if (uc == ue)
189 			goto out;
190 		if (*uc == ']')
191 			goto done;
192 		if (!json_parse(&uc, ue, st, lvl + 1))
193 			goto out;
194 		if (uc == ue)
195 			goto out;
196 		switch (*uc) {
197 		case ',':
198 			uc++;
199 			continue;
200 		case ']':
201 		done:
202 			st[JSON_ARRAYN]++;
203 			DPRINTF("Good array: ", uc, *ucp);
204 			*ucp = uc + 1;
205 			return 1;
206 		default:
207 			goto out;
208 		}
209 	}
210 out:
211 	DPRINTF("Bad array: ", uc,  *ucp);
212 	*ucp = uc;
213 	return 0;
214 }
215 
/*
 * Parse the members of an object.  On entry *ucp is expected to point
 * just past the opening '{' (json_parse() consumes it).
 *
 * Every member must be a double-quoted string, a ':' and a value parsed
 * by json_parse(); members are separated by ','.  A '}' is accepted
 * wherever a member name could start, so both "{}" and a trailing comma
 * before '}' are taken as valid.
 *
 * Returns 1 and leaves *ucp just past the closing '}' on success.
 * Returns 0 and leaves *ucp where parsing stopped otherwise.
 */
static int
json_parse_object(const unsigned char **ucp, const unsigned char *ue,
	size_t *st, size_t lvl)
{
	const unsigned char *uc = *ucp;

	DPRINTF("Parse object: ", uc, *ucp);
	while (uc < ue) {
		uc = json_skip_space(uc, ue);
		if (uc == ue)
			goto out;
		if (*uc == '}') {
			uc++;
			goto done;
		}
		/* Member name: opening quote, then the string body */
		if (*uc++ != '"') {
			DPRINTF("not string", uc, *ucp);
			goto out;
		}
		DPRINTF("next field", uc, *ucp);
		if (!json_parse_string(&uc, ue, lvl)) {
			DPRINTF("not string", uc, *ucp);
			goto out;
		}
		uc = json_skip_space(uc, ue);
		if (uc == ue)
			goto out;
		if (*uc++ != ':') {
			DPRINTF("not colon", uc, *ucp);
			goto out;
		}
		/* Member value; json_parse() skips surrounding whitespace */
		if (!json_parse(&uc, ue, st, lvl + 1)) {
			DPRINTF("not json", uc, *ucp);
			goto out;
		}
		if (uc == ue)
			goto out;
		switch (*uc++) {
		case ',':
			continue;
		case '}': /* { */
		done:
			DPRINTF("Good object: ", uc, *ucp);
			*ucp = uc;
			return 1;
		default:
			DPRINTF("not more", uc, *ucp);
			/*
			 * Step back so that the cursor reported to the
			 * caller rests on the unexpected byte.  Storing
			 * through ucp here would be pointless because the
			 * exit path below assigns *ucp from uc.
			 */
			uc--;
			goto out;
		}
	}
out:
	DPRINTF("Bad object: ", uc, *ucp);
	*ucp = uc;
	return 0;
}
271 
272 /*ARGSUSED*/
273 static int
274 json_parse_number(const unsigned char **ucp, const unsigned char *ue,
275     size_t lvl __file_debugused)
276 {
277 	const unsigned char *uc = *ucp;
278 	int got = 0;
279 
280 	DPRINTF("Parse number: ", uc, *ucp);
281 	if (uc == ue)
282 		return 0;
283 	if (*uc == '-')
284 		uc++;
285 
286 	for (; uc < ue; uc++) {
287 		if (!json_isdigit(*uc))
288 			break;
289 		got = 1;
290 	}
291 	if (uc == ue)
292 		goto out;
293 	if (*uc == '.')
294 		uc++;
295 	for (; uc < ue; uc++) {
296 		if (!json_isdigit(*uc))
297 			break;
298 		got = 1;
299 	}
300 	if (uc == ue)
301 		goto out;
302 	if (got && (*uc == 'e' || *uc == 'E')) {
303 		uc++;
304 		got = 0;
305 		if (uc == ue)
306 			goto out;
307 		if (*uc == '+' || *uc == '-')
308 			uc++;
309 		for (; uc < ue; uc++) {
310 			if (!json_isdigit(*uc))
311 				break;
312 			got = 1;
313 		}
314 	}
315 out:
316 	if (!got)
317 		DPRINTF("Bad number: ", uc, *ucp);
318 	else
319 		DPRINTF("Good number: ", uc, *ucp);
320 	*ucp = uc;
321 	return got;
322 }
323 
324 /*ARGSUSED*/
325 static int
326 json_parse_const(const unsigned char **ucp, const unsigned char *ue,
327     const char *str, size_t len, size_t lvl __file_debugused)
328 {
329 	const unsigned char *uc = *ucp;
330 
331 	DPRINTF("Parse const: ", uc, *ucp);
332 	*ucp += --len - 1;
333 	if (*ucp > ue)
334 		*ucp = ue;
335 	for (; uc < ue && --len;) {
336 		if (*uc++ != *++str) {
337 			DPRINTF("Bad const: ", uc, *ucp);
338 			return 0;
339 		}
340 	}
341 	DPRINTF("Good const: ", uc, *ucp);
342 	return 1;
343 }
344 
345 static int
346 json_parse(const unsigned char **ucp, const unsigned char *ue,
347     size_t *st, size_t lvl)
348 {
349 	const unsigned char *uc, *ouc;
350 	int rv = 0;
351 	int t;
352 
353 	ouc = uc = json_skip_space(*ucp, ue);
354 	if (uc == ue)
355 		goto out;
356 
357 	// Avoid recursion
358 	if (lvl > 500) {
359 		DPRINTF("Too many levels", uc, *ucp);
360 		return 0;
361 	}
362 #if JSON_COUNT
363 	/* bail quickly if not counting */
364 	if (lvl > 1 && (st[JSON_OBJECT] || st[JSON_ARRAYN]))
365 		return 1;
366 #endif
367 
368 	DPRINTF("Parse general: ", uc, *ucp);
369 	switch (*uc++) {
370 	case '"':
371 		rv = json_parse_string(&uc, ue, lvl + 1);
372 		t = JSON_STRING;
373 		break;
374 	case '[':
375 		rv = json_parse_array(&uc, ue, st, lvl + 1);
376 		t = JSON_ARRAY;
377 		break;
378 	case '{': /* '}' */
379 		rv = json_parse_object(&uc, ue, st, lvl + 1);
380 		t = JSON_OBJECT;
381 		break;
382 	case 't':
383 		rv = json_parse_const(&uc, ue, "true", sizeof("true"), lvl + 1);
384 		t = JSON_CONSTANT;
385 		break;
386 	case 'f':
387 		rv = json_parse_const(&uc, ue, "false", sizeof("false"),
388 		    lvl + 1);
389 		t = JSON_CONSTANT;
390 		break;
391 	case 'n':
392 		rv = json_parse_const(&uc, ue, "null", sizeof("null"), lvl + 1);
393 		t = JSON_CONSTANT;
394 		break;
395 	default:
396 		--uc;
397 		rv = json_parse_number(&uc, ue, lvl + 1);
398 		t = JSON_NUMBER;
399 		break;
400 	}
401 	if (rv)
402 		st[t]++;
403 	uc = json_skip_space(uc, ue);
404 out:
405 	DPRINTF("End general: ", uc, *ucp);
406 	*ucp = uc;
407 	if (lvl == 0) {
408 		if (!rv)
409 			return 0;
410 		if (uc == ue)
411 			return (st[JSON_ARRAYN] || st[JSON_OBJECT]) ? 1 : 0;
412 		if (*ouc == *uc && json_parse(&uc, ue, st, 1))
413 			return (st[JSON_ARRAYN] || st[JSON_OBJECT]) ? 2 : 0;
414 		else
415 			return 0;
416 	}
417 	return rv;
418 }
419 
420 #ifndef TEST
421 int
422 file_is_json(struct magic_set *ms, const struct buffer *b)
423 {
424 	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
425 	const unsigned char *ue = uc + b->flen;
426 	size_t st[JSON_MAX];
427 	int mime = ms->flags & MAGIC_MIME;
428 	int jt;
429 
430 
431 	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
432 		return 0;
433 
434 	memset(st, 0, sizeof(st));
435 
436 	if ((jt = json_parse(&uc, ue, st, 0)) == 0)
437 		return 0;
438 
439 	if (mime == MAGIC_MIME_ENCODING)
440 		return 1;
441 	if (mime) {
442 		if (file_printf(ms, "application/%s",
443 		    jt == 1 ? "json" : "x-ndjason") == -1)
444 			return -1;
445 		return 1;
446 	}
447 	if (file_printf(ms, "%sJSON text data",
448 	    jt == 1 ? "" : "New Line Delimited ") == -1)
449 		return -1;
450 #if JSON_COUNT
451 #define P(n) st[n], st[n] > 1 ? "s" : ""
452 	if (file_printf(ms, " (%" SIZE_T_FORMAT "u object%s, %" SIZE_T_FORMAT
453 	    "u array%s, %" SIZE_T_FORMAT "u string%s, %" SIZE_T_FORMAT
454 	    "u constant%s, %" SIZE_T_FORMAT "u number%s, %" SIZE_T_FORMAT
455 	    "u >1array%s)",
456 	    P(JSON_OBJECT), P(JSON_ARRAY), P(JSON_STRING), P(JSON_CONSTANT),
457 	    P(JSON_NUMBER), P(JSON_ARRAYN))
458 	    == -1)
459 		return -1;
460 #endif
461 	return 1;
462 }
463 
464 #else
465 
466 #include <sys/types.h>
467 #include <sys/stat.h>
468 #include <stdio.h>
469 #include <fcntl.h>
470 #include <unistd.h>
471 #include <stdlib.h>
472 #include <stdint.h>
473 #include <err.h>
474 
475 int
476 main(int argc, char *argv[])
477 {
478 	int fd, rv;
479 	struct stat st;
480 	unsigned char *p;
481 	size_t stats[JSON_MAX];
482 
483 	if ((fd = open(argv[1], O_RDONLY)) == -1)
484 		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
485 
486 	if (fstat(fd, &st) == -1)
487 		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
488 
489 	if ((p = CAST(char *, malloc(st.st_size))) == NULL)
490 		err(EXIT_FAILURE, "Can't allocate %jd bytes",
491 		    (intmax_t)st.st_size);
492 	if (read(fd, p, st.st_size) != st.st_size)
493 		err(EXIT_FAILURE, "Can't read %jd bytes",
494 		    (intmax_t)st.st_size);
495 	memset(stats, 0, sizeof(stats));
496 	printf("is json %d\n", json_parse((const unsigned char **)&p,
497 	    p + st.st_size, stats, 0));
498 	return 0;
499 }
500 #endif
501