xref: /freebsd/contrib/file/src/is_csv.c (revision ae316d1d1cffd71ab7751f94e10118777a88e027)
/*-
 * Copyright (c) 2019 Christos Zoulas
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
26d38c30c0SXin LI 
/*
 * Parse CSV object serialization format (RFC-4180, RFC-7111)
 */
30d38c30c0SXin LI 
31d38c30c0SXin LI #ifndef TEST
32d38c30c0SXin LI #include "file.h"
33d38c30c0SXin LI 
34d38c30c0SXin LI #ifndef lint
35*ae316d1dSXin LI FILE_RCSID("@(#)$File: is_csv.c,v 1.15 2024/05/18 15:16:13 christos Exp $")
36d38c30c0SXin LI #endif
37d38c30c0SXin LI 
38d38c30c0SXin LI #include <string.h>
39d38c30c0SXin LI #include "magic.h"
40d38c30c0SXin LI #else
41*ae316d1dSXin LI #define CAST(a, b)	((a)(b))
42d38c30c0SXin LI #include <sys/types.h>
43d38c30c0SXin LI #endif
44d38c30c0SXin LI 
45d38c30c0SXin LI 
/*
 * Debug tracing.  With DEBUG defined DPRINTF() forwards to printf();
 * otherwise it expands to a no-op expression and its arguments are not
 * evaluated.  At least one argument after the format string is required.
 */
#ifdef DEBUG
#include <stdio.h>
#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
#else
#define DPRINTF(fmt, ...) ((void)0)
#endif
52d38c30c0SXin LI 
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up-to the number of lines specified
 *
 * a trailing line that is not terminated by a newline is never counted
 */
#ifndef CSV_LINES
#define CSV_LINES 10
#endif

static int csv_parse(const unsigned char *uc, const unsigned char *ue);
66d38c30c0SXin LI 
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening quote and ue is the end of the buffer.
 * A doubled quote ("") inside the field is an escaped quote and does not
 * terminate it.
 *
 * Returns a pointer to the first byte following the closing quote, or ue
 * if the buffer ends before such a byte is seen.
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	// saw a quote that may close the field

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			// A second quote in a row is an escape, not a close
			pending = !pending;
		} else if (pending) {
			// The previous quote really did close the field
			return p;
		}
	}
	return ue;
}
91d38c30c0SXin LI 
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside quoted fields are counted per newline-terminated line.
 * The count of the first line becomes the reference; a first line without
 * any comma, or a later line with a different count, rejects the buffer.
 * When CSV_LINES is non-zero the scan stops at the CSV_LINES-th newline.
 * A trailing line that has no newline is not checked.
 *
 * Returns non-zero if the buffer looks like CSV (more than one comma per
 * line and, when the end of the buffer is reached, at least two complete
 * lines), 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t nf = 0;	// commas seen on the current line
	size_t tf = 0;	// commas on the first line, 0 until known
	size_t nl = 0;	// complete lines seen so far

	while (uc < ue) {
		switch (*uc++) {
		case '"':
			// Eat until the matching quote
			uc = eatquote(uc, ue);
			break;
		case ',':
			nf++;
			break;
		case '\n':
			DPRINTF("%zu %zu %zu\n", nl, nf, tf);
			nl++;
#if CSV_LINES
			// Enough lines inspected, decide on this one
			if (nl == CSV_LINES)
				return tf > 1 && tf == nf;
#endif
			if (tf == 0) {
				// First time and no fields, give up
				if (nf == 0)
					return 0;
				// First time, set the number of fields
				tf = nf;
			} else if (tf != nf) {
				// Field number mismatch, we are done.
				return 0;
			}
			nf = 0;
			break;
		default:
			break;
		}
	}
	// Ran out of data before reaching the line limit
	return tf > 1 && nl >= 2;
}
131d38c30c0SXin LI 
132d38c30c0SXin LI #ifndef TEST
133d38c30c0SXin LI int
file_is_csv(struct magic_set * ms,const struct buffer * b,int looks_text,const char * code)134898496eeSXin LI file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
135898496eeSXin LI     const char *code)
136d38c30c0SXin LI {
137d38c30c0SXin LI 	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
138d38c30c0SXin LI 	const unsigned char *ue = uc + b->flen;
139d38c30c0SXin LI 	int mime = ms->flags & MAGIC_MIME;
140d38c30c0SXin LI 
141d38c30c0SXin LI 	if (!looks_text)
142d38c30c0SXin LI 		return 0;
143d38c30c0SXin LI 
144d38c30c0SXin LI 	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
145d38c30c0SXin LI 		return 0;
146d38c30c0SXin LI 
147d38c30c0SXin LI 	if (!csv_parse(uc, ue))
148d38c30c0SXin LI 		return 0;
149d38c30c0SXin LI 
150d38c30c0SXin LI 	if (mime == MAGIC_MIME_ENCODING)
151d38c30c0SXin LI 		return 1;
152d38c30c0SXin LI 
153d38c30c0SXin LI 	if (mime) {
15443a5ec4eSXin LI 		if (file_printf(ms, "text/csv") == -1)
155d38c30c0SXin LI 			return -1;
156d38c30c0SXin LI 		return 1;
157d38c30c0SXin LI 	}
158d38c30c0SXin LI 
159898496eeSXin LI 	if (file_printf(ms, "CSV %s%stext", code ? code : "",
160898496eeSXin LI 	    code ? " " : "") == -1)
161d38c30c0SXin LI 		return -1;
162d38c30c0SXin LI 
163d38c30c0SXin LI 	return 1;
164d38c30c0SXin LI }
165d38c30c0SXin LI 
166d38c30c0SXin LI #else
167d38c30c0SXin LI 
168d38c30c0SXin LI #include <sys/types.h>
169d38c30c0SXin LI #include <sys/stat.h>
170d38c30c0SXin LI #include <stdio.h>
171d38c30c0SXin LI #include <fcntl.h>
172d38c30c0SXin LI #include <unistd.h>
173d38c30c0SXin LI #include <stdlib.h>
174d38c30c0SXin LI #include <stdint.h>
175d38c30c0SXin LI #include <err.h>
176d38c30c0SXin LI 
177d38c30c0SXin LI int
main(int argc,char * argv[])178d38c30c0SXin LI main(int argc, char *argv[])
179d38c30c0SXin LI {
180898496eeSXin LI 	int fd;
181d38c30c0SXin LI 	struct stat st;
182d38c30c0SXin LI 	unsigned char *p;
183d38c30c0SXin LI 
184d38c30c0SXin LI 	if ((fd = open(argv[1], O_RDONLY)) == -1)
185d38c30c0SXin LI 		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
186d38c30c0SXin LI 
187d38c30c0SXin LI 	if (fstat(fd, &st) == -1)
188d38c30c0SXin LI 		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
189d38c30c0SXin LI 
190*ae316d1dSXin LI 	if ((p = CAST(unsigned char *, malloc(st.st_size))) == NULL)
191d38c30c0SXin LI 		err(EXIT_FAILURE, "Can't allocate %jd bytes",
192d38c30c0SXin LI 		    (intmax_t)st.st_size);
193d38c30c0SXin LI 	if (read(fd, p, st.st_size) != st.st_size)
194d38c30c0SXin LI 		err(EXIT_FAILURE, "Can't read %jd bytes",
195d38c30c0SXin LI 		    (intmax_t)st.st_size);
196d38c30c0SXin LI 	printf("is csv %d\n", csv_parse(p, p + st.st_size));
197d38c30c0SXin LI 	return 0;
198d38c30c0SXin LI }
199d38c30c0SXin LI #endif
200