xref: /freebsd/contrib/file/src/is_csv.c (revision d38c30c092828f4882ce13b08d0bd3fd6dc7afb5)
1*d38c30c0SXin LI /*-
2*d38c30c0SXin LI  * Copyright (c) 2019 Christos Zoulas
3*d38c30c0SXin LI  * All rights reserved.
4*d38c30c0SXin LI  *
5*d38c30c0SXin LI  * Redistribution and use in source and binary forms, with or without
6*d38c30c0SXin LI  * modification, are permitted provided that the following conditions
7*d38c30c0SXin LI  * are met:
8*d38c30c0SXin LI  * 1. Redistributions of source code must retain the above copyright
9*d38c30c0SXin LI  *    notice, this list of conditions and the following disclaimer.
10*d38c30c0SXin LI  * 2. Redistributions in binary form must reproduce the above copyright
11*d38c30c0SXin LI  *    notice, this list of conditions and the following disclaimer in the
12*d38c30c0SXin LI  *    documentation and/or other materials provided with the distribution.
13*d38c30c0SXin LI  *
14*d38c30c0SXin LI  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15*d38c30c0SXin LI  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16*d38c30c0SXin LI  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17*d38c30c0SXin LI  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18*d38c30c0SXin LI  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19*d38c30c0SXin LI  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20*d38c30c0SXin LI  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21*d38c30c0SXin LI  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22*d38c30c0SXin LI  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23*d38c30c0SXin LI  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24*d38c30c0SXin LI  * POSSIBILITY OF SUCH DAMAGE.
25*d38c30c0SXin LI  */
26*d38c30c0SXin LI 
27*d38c30c0SXin LI /*
28*d38c30c0SXin LI  * Parse CSV object serialization format (RFC-4180, RFC-7111)
29*d38c30c0SXin LI  */
30*d38c30c0SXin LI 
31*d38c30c0SXin LI #ifndef TEST
32*d38c30c0SXin LI #include "file.h"
33*d38c30c0SXin LI 
34*d38c30c0SXin LI #ifndef lint
35*d38c30c0SXin LI FILE_RCSID("@(#)$File: is_csv.c,v 1.4 2019/06/26 20:31:31 christos Exp $")
36*d38c30c0SXin LI #endif
37*d38c30c0SXin LI 
38*d38c30c0SXin LI #include <string.h>
39*d38c30c0SXin LI #include "magic.h"
40*d38c30c0SXin LI #else
41*d38c30c0SXin LI #include <sys/types.h>
42*d38c30c0SXin LI #endif
43*d38c30c0SXin LI 
44*d38c30c0SXin LI 
/*
 * Debug tracing: when built with -DDEBUG, DPRINTF() forwards to printf();
 * otherwise it expands to nothing.  The DEBUG variant uses __VA_ARGS__
 * directly after a comma, so every call must pass at least one argument
 * after the format string.
 */
#ifdef DEBUG
#include <stdio.h>
#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
#else
#define DPRINTF(fmt, ...)
#endif
51*d38c30c0SXin LI 
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * Lines are delimited by LF only (a preceding CR is simply skipped), so
 * trailing data that does not end in a newline is never counted or
 * checked.
 */
#ifndef CSV_LINES
#define CSV_LINES 10
#endif
63*d38c30c0SXin LI 
64*d38c30c0SXin LI static int csv_parse(const unsigned char *, const unsigned char *);
65*d38c30c0SXin LI 
/*
 * Skip the remainder of a double-quoted field.
 *
 * The caller passes uc pointing just past the opening quote.  A doubled
 * quote ("") inside the field is an escaped quote and does not end it.
 *
 * Returns a pointer to the first byte that follows the closing quote, or
 * ue if the buffer runs out first (unterminated field, or the closing
 * quote is the last byte of the buffer).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	// set while the previous byte was an unpaired quote

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			// A first quote may close the field; a second one
			// right after it turns the pair into an escape.
			pending = !pending;
			continue;
		}
		// Ordinary byte right after a lone quote: the field ended.
		if (pending)
			return p;
	}
	return ue;
}
90*d38c30c0SXin LI 
/*
 * Heuristically decide whether the buffer [uc, ue) looks like CSV.
 *
 * Commas outside double-quoted fields are counted per LF-terminated line.
 * The first line must contain at least one comma and every following line
 * must contain the same number of commas.  When CSV_LINES is non-zero the
 * verdict is returned as soon as that many lines have been seen; if the
 * buffer ends earlier, at least two complete lines are required.  Trailing
 * data that is not terminated by LF is ignored.
 *
 * Returns 1 if the buffer looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	// nf: commas seen on the current line
	// tf: commas on the first line (0 until the first line is complete)
	// nl: number of complete lines seen so far
	size_t nf = 0, tf = 0, nl = 0;

	while (uc < ue) {
		switch (*uc++) {
		case '"':
			// Eat until the matching quote
			uc = eatquote(uc, ue);
			break;
		case ',':
			nf++;
			break;
		case '\n':
			DPRINTF("%zu %zu %zu\n", nl, nf, tf);
			nl++;
#if CSV_LINES
			// Enough lines inspected: decide on this last one.
			if (nl == CSV_LINES)
				return tf != 0 && tf == nf;
#endif
			if (tf == 0) {
				// First time and no fields, give up
				if (nf == 0)
					return 0;
				// First time, set the number of fields
				tf = nf;
			} else if (tf != nf) {
				// Field number mismatch, we are done.
				return 0;
			}
			nf = 0;
			break;
		default:
			break;
		}
	}
	// Ran out of buffer before CSV_LINES lines: a header plus a single
	// record (two consistent lines) is already enough to call it CSV.
	return tf && nl >= 2;
}
131*d38c30c0SXin LI 
132*d38c30c0SXin LI #ifndef TEST
133*d38c30c0SXin LI int
134*d38c30c0SXin LI file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text)
135*d38c30c0SXin LI {
136*d38c30c0SXin LI 	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
137*d38c30c0SXin LI 	const unsigned char *ue = uc + b->flen;
138*d38c30c0SXin LI 	int mime = ms->flags & MAGIC_MIME;
139*d38c30c0SXin LI 
140*d38c30c0SXin LI 	if (!looks_text)
141*d38c30c0SXin LI 		return 0;
142*d38c30c0SXin LI 
143*d38c30c0SXin LI 	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
144*d38c30c0SXin LI 		return 0;
145*d38c30c0SXin LI 
146*d38c30c0SXin LI 	if (!csv_parse(uc, ue))
147*d38c30c0SXin LI 		return 0;
148*d38c30c0SXin LI 
149*d38c30c0SXin LI 	if (mime == MAGIC_MIME_ENCODING)
150*d38c30c0SXin LI 		return 1;
151*d38c30c0SXin LI 
152*d38c30c0SXin LI 	if (mime) {
153*d38c30c0SXin LI 		if (file_printf(ms, "application/csv") == -1)
154*d38c30c0SXin LI 			return -1;
155*d38c30c0SXin LI 		return 1;
156*d38c30c0SXin LI 	}
157*d38c30c0SXin LI 
158*d38c30c0SXin LI 	if (file_printf(ms, "CSV text") == -1)
159*d38c30c0SXin LI 		return -1;
160*d38c30c0SXin LI 
161*d38c30c0SXin LI 	return 1;
162*d38c30c0SXin LI }
163*d38c30c0SXin LI 
164*d38c30c0SXin LI #else
165*d38c30c0SXin LI 
166*d38c30c0SXin LI #include <sys/types.h>
167*d38c30c0SXin LI #include <sys/stat.h>
168*d38c30c0SXin LI #include <stdio.h>
169*d38c30c0SXin LI #include <fcntl.h>
170*d38c30c0SXin LI #include <unistd.h>
171*d38c30c0SXin LI #include <stdlib.h>
172*d38c30c0SXin LI #include <stdint.h>
173*d38c30c0SXin LI #include <err.h>
174*d38c30c0SXin LI 
/*
 * Stand-alone test driver (compiled only with -DTEST): read the file named
 * by argv[1] into memory and print the result of csv_parse() on it.
 */
int
main(int argc, char *argv[])
{
	int fd;
	struct stat st;
	unsigned char *p;
	size_t len;
	ssize_t nr;

	// argv[1] is NULL when no file name was given
	if (argc < 2)
		errx(EXIT_FAILURE, "Usage: is_csv <file>");

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	len = (size_t)st.st_size;

	// malloc(0) may return NULL without it being an error, so always
	// request at least one byte to keep empty files working.
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	nr = read(fd, p, len);
	if (nr == -1)
		err(EXIT_FAILURE, "Can't read %jd bytes",
		    (intmax_t)st.st_size);
	// A short read does not set errno, so report it without strerror.
	if ((size_t)nr != len)
		errx(EXIT_FAILURE, "Short read: got %jd of %jd bytes",
		    (intmax_t)nr, (intmax_t)st.st_size);
	(void)close(fd);

	printf("is csv %d\n", csv_parse(p, p + len));
	free(p);
	return 0;
}
197*d38c30c0SXin LI #endif
198