1*d38c30c0SXin LI /*- 2*d38c30c0SXin LI * Copyright (c) 2019 Christos Zoulas 3*d38c30c0SXin LI * All rights reserved. 4*d38c30c0SXin LI * 5*d38c30c0SXin LI * Redistribution and use in source and binary forms, with or without 6*d38c30c0SXin LI * modification, are permitted provided that the following conditions 7*d38c30c0SXin LI * are met: 8*d38c30c0SXin LI * 1. Redistributions of source code must retain the above copyright 9*d38c30c0SXin LI * notice, this list of conditions and the following disclaimer. 10*d38c30c0SXin LI * 2. Redistributions in binary form must reproduce the above copyright 11*d38c30c0SXin LI * notice, this list of conditions and the following disclaimer in the 12*d38c30c0SXin LI * documentation and/or other materials provided with the distribution. 13*d38c30c0SXin LI * 14*d38c30c0SXin LI * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 15*d38c30c0SXin LI * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 16*d38c30c0SXin LI * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17*d38c30c0SXin LI * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 18*d38c30c0SXin LI * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19*d38c30c0SXin LI * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20*d38c30c0SXin LI * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21*d38c30c0SXin LI * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22*d38c30c0SXin LI * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23*d38c30c0SXin LI * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24*d38c30c0SXin LI * POSSIBILITY OF SUCH DAMAGE. 25*d38c30c0SXin LI */ 26*d38c30c0SXin LI 27*d38c30c0SXin LI /* 28*d38c30c0SXin LI * Parse CSV object serialization format (RFC-4180, RFC-7111) 29*d38c30c0SXin LI */ 30*d38c30c0SXin LI 31*d38c30c0SXin LI #ifndef TEST 32*d38c30c0SXin LI #include "file.h" 33*d38c30c0SXin LI 34*d38c30c0SXin LI #ifndef lint 35*d38c30c0SXin LI FILE_RCSID("@(#)$File: is_csv.c,v 1.4 2019/06/26 20:31:31 christos Exp $") 36*d38c30c0SXin LI #endif 37*d38c30c0SXin LI 38*d38c30c0SXin LI #include <string.h> 39*d38c30c0SXin LI #include "magic.h" 40*d38c30c0SXin LI #else 41*d38c30c0SXin LI #include <sys/types.h> 42*d38c30c0SXin LI #endif 43*d38c30c0SXin LI 44*d38c30c0SXin LI 45*d38c30c0SXin LI #ifdef DEBUG 46*d38c30c0SXin LI #include <stdio.h> 47*d38c30c0SXin LI #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__) 48*d38c30c0SXin LI #else 49*d38c30c0SXin LI #define DPRINTF(fmt, ...) 50*d38c30c0SXin LI #endif 51*d38c30c0SXin LI 52*d38c30c0SXin LI /* 53*d38c30c0SXin LI * if CSV_LINES == 0: 54*d38c30c0SXin LI * check all the lines in the buffer 55*d38c30c0SXin LI * otherwise: 56*d38c30c0SXin LI * check only up-to the number of lines specified 57*d38c30c0SXin LI * 58*d38c30c0SXin LI * the last line count is always ignored if it does not end in CRLF 59*d38c30c0SXin LI */ 60*d38c30c0SXin LI #ifndef CSV_LINES 61*d38c30c0SXin LI #define CSV_LINES 10 62*d38c30c0SXin LI #endif 63*d38c30c0SXin LI 64*d38c30c0SXin LI static int csv_parse(const unsigned char *, const unsigned char *); 65*d38c30c0SXin LI 66*d38c30c0SXin LI static const unsigned char * 67*d38c30c0SXin LI eatquote(const unsigned char *uc, const unsigned char *ue) 68*d38c30c0SXin LI { 69*d38c30c0SXin LI int quote = 0; 70*d38c30c0SXin LI 71*d38c30c0SXin LI while (uc < ue) { 72*d38c30c0SXin LI unsigned char c = *uc++; 73*d38c30c0SXin LI if (c != '"') { 74*d38c30c0SXin LI // We already got one, done. 75*d38c30c0SXin LI if (quote) { 76*d38c30c0SXin LI return --uc; 77*d38c30c0SXin LI } 78*d38c30c0SXin LI continue; 79*d38c30c0SXin LI } 80*d38c30c0SXin LI if (quote) { 81*d38c30c0SXin LI // quote-quote escapes 82*d38c30c0SXin LI quote = 0; 83*d38c30c0SXin LI continue; 84*d38c30c0SXin LI } 85*d38c30c0SXin LI // first quote 86*d38c30c0SXin LI quote = 1; 87*d38c30c0SXin LI } 88*d38c30c0SXin LI return ue; 89*d38c30c0SXin LI } 90*d38c30c0SXin LI 91*d38c30c0SXin LI static int 92*d38c30c0SXin LI csv_parse(const unsigned char *uc, const unsigned char *ue) 93*d38c30c0SXin LI { 94*d38c30c0SXin LI size_t nf = 0, tf = 0, nl = 0; 95*d38c30c0SXin LI 96*d38c30c0SXin LI while (uc < ue) { 97*d38c30c0SXin LI unsigned char c; 98*d38c30c0SXin LI switch (c = *uc++) { 99*d38c30c0SXin LI case '"': 100*d38c30c0SXin LI // Eat until the matching quote 101*d38c30c0SXin LI uc = eatquote(uc, ue); 102*d38c30c0SXin LI break; 103*d38c30c0SXin LI case ',': 104*d38c30c0SXin LI nf++; 105*d38c30c0SXin LI break; 106*d38c30c0SXin LI case '\n': 107*d38c30c0SXin LI DPRINTF("%zu %zu %zu\n", nl, nf, tf); 108*d38c30c0SXin LI nl++; 109*d38c30c0SXin LI #if CSV_LINES 110*d38c30c0SXin LI if (nl == CSV_LINES) 111*d38c30c0SXin LI return tf != 0 && tf == nf; 112*d38c30c0SXin LI #endif 113*d38c30c0SXin LI if (tf == 0) { 114*d38c30c0SXin LI // First time and no fields, give up 115*d38c30c0SXin LI if (nf == 0) 116*d38c30c0SXin LI return 0; 117*d38c30c0SXin LI // First time, set the number of fields 118*d38c30c0SXin LI tf = nf; 119*d38c30c0SXin LI } else if (tf != nf) { 120*d38c30c0SXin LI // Field number mismatch, we are done. 121*d38c30c0SXin LI return 0; 122*d38c30c0SXin LI } 123*d38c30c0SXin LI nf = 0; 124*d38c30c0SXin LI break; 125*d38c30c0SXin LI default: 126*d38c30c0SXin LI break; 127*d38c30c0SXin LI } 128*d38c30c0SXin LI } 129*d38c30c0SXin LI return tf && nl > 2; 130*d38c30c0SXin LI } 131*d38c30c0SXin LI 132*d38c30c0SXin LI #ifndef TEST 133*d38c30c0SXin LI int 134*d38c30c0SXin LI file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text) 135*d38c30c0SXin LI { 136*d38c30c0SXin LI const unsigned char *uc = CAST(const unsigned char *, b->fbuf); 137*d38c30c0SXin LI const unsigned char *ue = uc + b->flen; 138*d38c30c0SXin LI int mime = ms->flags & MAGIC_MIME; 139*d38c30c0SXin LI 140*d38c30c0SXin LI if (!looks_text) 141*d38c30c0SXin LI return 0; 142*d38c30c0SXin LI 143*d38c30c0SXin LI if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0) 144*d38c30c0SXin LI return 0; 145*d38c30c0SXin LI 146*d38c30c0SXin LI if (!csv_parse(uc, ue)) 147*d38c30c0SXin LI return 0; 148*d38c30c0SXin LI 149*d38c30c0SXin LI if (mime == MAGIC_MIME_ENCODING) 150*d38c30c0SXin LI return 1; 151*d38c30c0SXin LI 152*d38c30c0SXin LI if (mime) { 153*d38c30c0SXin LI if (file_printf(ms, "application/csv") == -1) 154*d38c30c0SXin LI return -1; 155*d38c30c0SXin LI return 1; 156*d38c30c0SXin LI } 157*d38c30c0SXin LI 158*d38c30c0SXin LI if (file_printf(ms, "CSV text") == -1) 159*d38c30c0SXin LI return -1; 160*d38c30c0SXin LI 161*d38c30c0SXin LI return 1; 162*d38c30c0SXin LI } 163*d38c30c0SXin LI 164*d38c30c0SXin LI #else 165*d38c30c0SXin LI 166*d38c30c0SXin LI #include <sys/types.h> 167*d38c30c0SXin LI #include <sys/stat.h> 168*d38c30c0SXin LI #include <stdio.h> 169*d38c30c0SXin LI #include <fcntl.h> 170*d38c30c0SXin LI #include <unistd.h> 171*d38c30c0SXin LI #include <stdlib.h> 172*d38c30c0SXin LI #include <stdint.h> 173*d38c30c0SXin LI #include <err.h> 174*d38c30c0SXin LI 175*d38c30c0SXin LI int 176*d38c30c0SXin LI main(int argc, char *argv[]) 177*d38c30c0SXin LI { 178*d38c30c0SXin LI int fd, rv; 179*d38c30c0SXin LI struct stat st; 180*d38c30c0SXin LI unsigned char *p; 181*d38c30c0SXin LI 182*d38c30c0SXin LI if ((fd = open(argv[1], O_RDONLY)) == -1) 183*d38c30c0SXin LI err(EXIT_FAILURE, "Can't open `%s'", argv[1]); 184*d38c30c0SXin LI 185*d38c30c0SXin LI if (fstat(fd, &st) == -1) 186*d38c30c0SXin LI err(EXIT_FAILURE, "Can't stat `%s'", argv[1]); 187*d38c30c0SXin LI 188*d38c30c0SXin LI if ((p = malloc(st.st_size)) == NULL) 189*d38c30c0SXin LI err(EXIT_FAILURE, "Can't allocate %jd bytes", 190*d38c30c0SXin LI (intmax_t)st.st_size); 191*d38c30c0SXin LI if (read(fd, p, st.st_size) != st.st_size) 192*d38c30c0SXin LI err(EXIT_FAILURE, "Can't read %jd bytes", 193*d38c30c0SXin LI (intmax_t)st.st_size); 194*d38c30c0SXin LI printf("is csv %d\n", csv_parse(p, p + st.st_size)); 195*d38c30c0SXin LI return 0; 196*d38c30c0SXin LI } 197*d38c30c0SXin LI #endif 198