1d38c30c0SXin LI /*-
2d38c30c0SXin LI * Copyright (c) 2019 Christos Zoulas
3d38c30c0SXin LI * All rights reserved.
4d38c30c0SXin LI *
5d38c30c0SXin LI * Redistribution and use in source and binary forms, with or without
6d38c30c0SXin LI * modification, are permitted provided that the following conditions
7d38c30c0SXin LI * are met:
8d38c30c0SXin LI * 1. Redistributions of source code must retain the above copyright
9d38c30c0SXin LI * notice, this list of conditions and the following disclaimer.
10d38c30c0SXin LI * 2. Redistributions in binary form must reproduce the above copyright
11d38c30c0SXin LI * notice, this list of conditions and the following disclaimer in the
12d38c30c0SXin LI * documentation and/or other materials provided with the distribution.
13d38c30c0SXin LI *
14d38c30c0SXin LI * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15d38c30c0SXin LI * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16d38c30c0SXin LI * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17d38c30c0SXin LI * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18d38c30c0SXin LI * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19d38c30c0SXin LI * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20d38c30c0SXin LI * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21d38c30c0SXin LI * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22d38c30c0SXin LI * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23d38c30c0SXin LI * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24d38c30c0SXin LI * POSSIBILITY OF SUCH DAMAGE.
25d38c30c0SXin LI */
26d38c30c0SXin LI
27d38c30c0SXin LI /*
28d38c30c0SXin LI * Parse CSV object serialization format (RFC-4180, RFC-7111)
29d38c30c0SXin LI */
30d38c30c0SXin LI
31d38c30c0SXin LI #ifndef TEST
32d38c30c0SXin LI #include "file.h"
33d38c30c0SXin LI
34d38c30c0SXin LI #ifndef lint
35*ae316d1dSXin LI FILE_RCSID("@(#)$File: is_csv.c,v 1.15 2024/05/18 15:16:13 christos Exp $")
36d38c30c0SXin LI #endif
37d38c30c0SXin LI
38d38c30c0SXin LI #include <string.h>
39d38c30c0SXin LI #include "magic.h"
40d38c30c0SXin LI #else
41*ae316d1dSXin LI #define CAST(a, b) ((a)(b))
42d38c30c0SXin LI #include <sys/types.h>
43d38c30c0SXin LI #endif
44d38c30c0SXin LI
45d38c30c0SXin LI
46d38c30c0SXin LI #ifdef DEBUG
47d38c30c0SXin LI #include <stdio.h>
48d38c30c0SXin LI #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
49d38c30c0SXin LI #else
50d38c30c0SXin LI #define DPRINTF(fmt, ...)
51d38c30c0SXin LI #endif
52d38c30c0SXin LI
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up-to the number of lines specified
 *
 * the field count of the last line is always ignored if that line does
 * not end in a newline
 */
61d38c30c0SXin LI #ifndef CSV_LINES
62d38c30c0SXin LI #define CSV_LINES 10
63d38c30c0SXin LI #endif
64d38c30c0SXin LI
65d38c30c0SXin LI static int csv_parse(const unsigned char *, const unsigned char *);
66d38c30c0SXin LI
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening quote and ue is one past the end of the
 * buffer.  A doubled quote ("") inside the field is an escaped quote and
 * does not end it.  Returns a pointer to the first byte after the closing
 * quote, or ue if the buffer ends before such a byte is seen (this
 * includes an unterminated field and a closing quote that is the very
 * last byte).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	// Set while the previous byte was a quote that might close the field
	int quote = 0;

	while (uc < ue) {
		unsigned char c = *uc++;
		if (c != '"') {
			// We already got one, done.
			if (quote) {
				// Step back so the caller sees this byte too
				return --uc;
			}
			continue;
		}
		if (quote) {
			// quote-quote escapes
			quote = 0;
			continue;
		}
		// first quote
		quote = 1;
	}
	return ue;
}
91d38c30c0SXin LI
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside quoted fields are counted per line, and every
 * newline-terminated line must have the same number of commas as the
 * first one.  When CSV_LINES is non-zero, the decision is made as soon as
 * that many lines have been seen.
 *
 * Returns 1 if the buffer looks like CSV, 0 otherwise.  A match requires
 * more than one comma per line; when the end of the buffer is reached it
 * also requires at least two complete lines.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	// nf: commas seen on the current line
	// tf: commas on the first line (0 until that line is complete)
	// nl: newline-terminated lines seen so far
	size_t nf = 0, tf = 0, nl = 0;

	while (uc < ue) {
		switch (*uc++) {
		case '"':
			// Eat until the matching quote
			uc = eatquote(uc, ue);
			break;
		case ',':
			nf++;
			break;
		case '\n':
			DPRINTF("%zu %zu %zu\n", nl, nf, tf);
			nl++;
#if CSV_LINES
			// Enough lines sampled: decide on this line's count
			if (nl == CSV_LINES)
				return tf > 1 && tf == nf;
#endif
			if (tf == 0) {
				// First time and no fields, give up
				if (nf == 0)
					return 0;
				// First time, set the number of fields
				tf = nf;
			} else if (tf != nf) {
				// Field number mismatch, we are done.
				return 0;
			}
			nf = 0;
			break;
		default:
			break;
		}
	}
	// A trailing line without a newline is never compared
	return tf > 1 && nl >= 2;
}
131d38c30c0SXin LI
132d38c30c0SXin LI #ifndef TEST
133d38c30c0SXin LI int
file_is_csv(struct magic_set * ms,const struct buffer * b,int looks_text,const char * code)134898496eeSXin LI file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
135898496eeSXin LI const char *code)
136d38c30c0SXin LI {
137d38c30c0SXin LI const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
138d38c30c0SXin LI const unsigned char *ue = uc + b->flen;
139d38c30c0SXin LI int mime = ms->flags & MAGIC_MIME;
140d38c30c0SXin LI
141d38c30c0SXin LI if (!looks_text)
142d38c30c0SXin LI return 0;
143d38c30c0SXin LI
144d38c30c0SXin LI if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
145d38c30c0SXin LI return 0;
146d38c30c0SXin LI
147d38c30c0SXin LI if (!csv_parse(uc, ue))
148d38c30c0SXin LI return 0;
149d38c30c0SXin LI
150d38c30c0SXin LI if (mime == MAGIC_MIME_ENCODING)
151d38c30c0SXin LI return 1;
152d38c30c0SXin LI
153d38c30c0SXin LI if (mime) {
15443a5ec4eSXin LI if (file_printf(ms, "text/csv") == -1)
155d38c30c0SXin LI return -1;
156d38c30c0SXin LI return 1;
157d38c30c0SXin LI }
158d38c30c0SXin LI
159898496eeSXin LI if (file_printf(ms, "CSV %s%stext", code ? code : "",
160898496eeSXin LI code ? " " : "") == -1)
161d38c30c0SXin LI return -1;
162d38c30c0SXin LI
163d38c30c0SXin LI return 1;
164d38c30c0SXin LI }
165d38c30c0SXin LI
166d38c30c0SXin LI #else
167d38c30c0SXin LI
168d38c30c0SXin LI #include <sys/types.h>
169d38c30c0SXin LI #include <sys/stat.h>
170d38c30c0SXin LI #include <stdio.h>
171d38c30c0SXin LI #include <fcntl.h>
172d38c30c0SXin LI #include <unistd.h>
173d38c30c0SXin LI #include <stdlib.h>
174d38c30c0SXin LI #include <stdint.h>
175d38c30c0SXin LI #include <err.h>
176d38c30c0SXin LI
177d38c30c0SXin LI int
main(int argc,char * argv[])178d38c30c0SXin LI main(int argc, char *argv[])
179d38c30c0SXin LI {
180898496eeSXin LI int fd;
181d38c30c0SXin LI struct stat st;
182d38c30c0SXin LI unsigned char *p;
183d38c30c0SXin LI
184d38c30c0SXin LI if ((fd = open(argv[1], O_RDONLY)) == -1)
185d38c30c0SXin LI err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
186d38c30c0SXin LI
187d38c30c0SXin LI if (fstat(fd, &st) == -1)
188d38c30c0SXin LI err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
189d38c30c0SXin LI
190*ae316d1dSXin LI if ((p = CAST(unsigned char *, malloc(st.st_size))) == NULL)
191d38c30c0SXin LI err(EXIT_FAILURE, "Can't allocate %jd bytes",
192d38c30c0SXin LI (intmax_t)st.st_size);
193d38c30c0SXin LI if (read(fd, p, st.st_size) != st.st_size)
194d38c30c0SXin LI err(EXIT_FAILURE, "Can't read %jd bytes",
195d38c30c0SXin LI (intmax_t)st.st_size);
196d38c30c0SXin LI printf("is csv %d\n", csv_parse(p, p + st.st_size));
197d38c30c0SXin LI return 0;
198d38c30c0SXin LI }
199d38c30c0SXin LI #endif
200