1 /*-
2 * Copyright (c) 2019 Christos Zoulas
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 * POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 /*
28 * Parse CSV object serialization format (RFC-4180, RFC-7111)
29 */
30
31 #ifndef TEST
32 #include "file.h"
33
34 #ifndef lint
35 FILE_RCSID("@(#)$File: is_csv.c,v 1.15 2024/05/18 15:16:13 christos Exp $")
36 #endif
37
38 #include <string.h>
39 #include "magic.h"
40 #else
41 #define CAST(a, b) ((a)(b))
42 #include <sys/types.h>
43 #endif
44
45
46 #ifdef DEBUG
47 #include <stdio.h>
48 #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
49 #else
50 #define DPRINTF(fmt, ...)
51 #endif
52
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * the field count of the last line is always ignored if that line
 * does not end in a newline
 */
61 #ifndef CSV_LINES
62 #define CSV_LINES 10
63 #endif
64
65 static int csv_parse(const unsigned char *, const unsigned char *);
66
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening quote and ue is the end of the buffer.
 * Returns a pointer to the first byte following the closing quote, or ue
 * if the buffer ends before such a byte is seen.  Two consecutive quotes
 * are treated as an escaped quote and do not end the field.
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	// set while the previous byte was an unpaired quote

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			// A second quote in a row cancels the first (escape),
			// otherwise this one may be the closing quote.
			pending = !pending;
			continue;
		}
		// An ordinary byte right after an unpaired quote: the quote
		// closed the field and this byte is the first one outside it.
		if (pending)
			return p;
	}
	return ue;
}
91
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas are counted per newline-terminated line; anything between double
 * quotes is skipped by eatquote() and therefore not counted.  The first
 * line fixes the expected number of commas and every following line must
 * match it.  When CSV_LINES is non-zero the verdict is given as soon as
 * that many lines have been seen.
 *
 * Returns 1 if the data looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t commas = 0;	// commas seen on the current line
	size_t expected = 0;	// commas per line, taken from the first line
	size_t lines = 0;	// newline-terminated lines seen so far

	while (uc < ue) {
		const unsigned char ch = *uc++;

		if (ch == '"') {
			// Jump past the quoted section
			uc = eatquote(uc, ue);
			continue;
		}
		if (ch == ',') {
			commas++;
			continue;
		}
		if (ch != '\n')
			continue;

		// End of a line: compare its count with the reference
		DPRINTF("%zu %zu %zu\n", lines, commas, expected);
		lines++;
#if CSV_LINES
		if (lines == CSV_LINES)
			return expected > 1 && expected == commas;
#endif
		if (expected == 0) {
			// First line without any comma cannot be CSV
			if (commas == 0)
				return 0;
			// First line sets the reference count
			expected = commas;
		} else if (expected != commas) {
			// This line disagrees with the first one
			return 0;
		}
		commas = 0;
	}
	return expected > 1 && lines >= 2;
}
131
132 #ifndef TEST
133 int
file_is_csv(struct magic_set * ms,const struct buffer * b,int looks_text,const char * code)134 file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
135 const char *code)
136 {
137 const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
138 const unsigned char *ue = uc + b->flen;
139 int mime = ms->flags & MAGIC_MIME;
140
141 if (!looks_text)
142 return 0;
143
144 if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
145 return 0;
146
147 if (!csv_parse(uc, ue))
148 return 0;
149
150 if (mime == MAGIC_MIME_ENCODING)
151 return 1;
152
153 if (mime) {
154 if (file_printf(ms, "text/csv") == -1)
155 return -1;
156 return 1;
157 }
158
159 if (file_printf(ms, "CSV %s%stext", code ? code : "",
160 code ? " " : "") == -1)
161 return -1;
162
163 return 1;
164 }
165
166 #else
167
168 #include <sys/types.h>
169 #include <sys/stat.h>
170 #include <stdio.h>
171 #include <fcntl.h>
172 #include <unistd.h>
173 #include <stdlib.h>
174 #include <stdint.h>
175 #include <err.h>
176
177 int
main(int argc,char * argv[])178 main(int argc, char *argv[])
179 {
180 int fd;
181 struct stat st;
182 unsigned char *p;
183
184 if ((fd = open(argv[1], O_RDONLY)) == -1)
185 err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
186
187 if (fstat(fd, &st) == -1)
188 err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
189
190 if ((p = CAST(unsigned char *, malloc(st.st_size))) == NULL)
191 err(EXIT_FAILURE, "Can't allocate %jd bytes",
192 (intmax_t)st.st_size);
193 if (read(fd, p, st.st_size) != st.st_size)
194 err(EXIT_FAILURE, "Can't read %jd bytes",
195 (intmax_t)st.st_size);
196 printf("is csv %d\n", csv_parse(p, p + st.st_size));
197 return 0;
198 }
199 #endif
200