1d38c30c0SXin LI /*- 2d38c30c0SXin LI * Copyright (c) 2019 Christos Zoulas 3d38c30c0SXin LI * All rights reserved. 4d38c30c0SXin LI * 5d38c30c0SXin LI * Redistribution and use in source and binary forms, with or without 6d38c30c0SXin LI * modification, are permitted provided that the following conditions 7d38c30c0SXin LI * are met: 8d38c30c0SXin LI * 1. Redistributions of source code must retain the above copyright 9d38c30c0SXin LI * notice, this list of conditions and the following disclaimer. 10d38c30c0SXin LI * 2. Redistributions in binary form must reproduce the above copyright 11d38c30c0SXin LI * notice, this list of conditions and the following disclaimer in the 12d38c30c0SXin LI * documentation and/or other materials provided with the distribution. 13d38c30c0SXin LI * 14d38c30c0SXin LI * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 15d38c30c0SXin LI * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 16d38c30c0SXin LI * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17d38c30c0SXin LI * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 18d38c30c0SXin LI * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19d38c30c0SXin LI * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20d38c30c0SXin LI * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21d38c30c0SXin LI * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22d38c30c0SXin LI * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23d38c30c0SXin LI * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24d38c30c0SXin LI * POSSIBILITY OF SUCH DAMAGE. 25d38c30c0SXin LI */ 26d38c30c0SXin LI 27d38c30c0SXin LI /* 28d38c30c0SXin LI * Parse CSV object serialization format (RFC-4180, RFC-7111) 29d38c30c0SXin LI */ 30d38c30c0SXin LI 31d38c30c0SXin LI #ifndef TEST 32d38c30c0SXin LI #include "file.h" 33d38c30c0SXin LI 34d38c30c0SXin LI #ifndef lint 35*43a5ec4eSXin LI FILE_RCSID("@(#)$File: is_csv.c,v 1.6 2020/08/09 16:43:36 christos Exp $") 36d38c30c0SXin LI #endif 37d38c30c0SXin LI 38d38c30c0SXin LI #include <string.h> 39d38c30c0SXin LI #include "magic.h" 40d38c30c0SXin LI #else 41d38c30c0SXin LI #include <sys/types.h> 42d38c30c0SXin LI #endif 43d38c30c0SXin LI 44d38c30c0SXin LI 45d38c30c0SXin LI #ifdef DEBUG 46d38c30c0SXin LI #include <stdio.h> 47d38c30c0SXin LI #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__) 48d38c30c0SXin LI #else 49d38c30c0SXin LI #define DPRINTF(fmt, ...) 50d38c30c0SXin LI #endif 51d38c30c0SXin LI 52d38c30c0SXin LI /* 53d38c30c0SXin LI * if CSV_LINES == 0: 54d38c30c0SXin LI * check all the lines in the buffer 55d38c30c0SXin LI * otherwise: 56d38c30c0SXin LI * check only up-to the number of lines specified 57d38c30c0SXin LI * 58d38c30c0SXin LI * the last line count is always ignored if it does not end in CRLF 59d38c30c0SXin LI */ 60d38c30c0SXin LI #ifndef CSV_LINES 61d38c30c0SXin LI #define CSV_LINES 10 62d38c30c0SXin LI #endif 63d38c30c0SXin LI 64d38c30c0SXin LI static int csv_parse(const unsigned char *, const unsigned char *); 65d38c30c0SXin LI 66d38c30c0SXin LI static const unsigned char * 67d38c30c0SXin LI eatquote(const unsigned char *uc, const unsigned char *ue) 68d38c30c0SXin LI { 69d38c30c0SXin LI int quote = 0; 70d38c30c0SXin LI 71d38c30c0SXin LI while (uc < ue) { 72d38c30c0SXin LI unsigned char c = *uc++; 73d38c30c0SXin LI if (c != '"') { 74d38c30c0SXin LI // We already got one, done. 75d38c30c0SXin LI if (quote) { 76d38c30c0SXin LI return --uc; 77d38c30c0SXin LI } 78d38c30c0SXin LI continue; 79d38c30c0SXin LI } 80d38c30c0SXin LI if (quote) { 81d38c30c0SXin LI // quote-quote escapes 82d38c30c0SXin LI quote = 0; 83d38c30c0SXin LI continue; 84d38c30c0SXin LI } 85d38c30c0SXin LI // first quote 86d38c30c0SXin LI quote = 1; 87d38c30c0SXin LI } 88d38c30c0SXin LI return ue; 89d38c30c0SXin LI } 90d38c30c0SXin LI 91d38c30c0SXin LI static int 92d38c30c0SXin LI csv_parse(const unsigned char *uc, const unsigned char *ue) 93d38c30c0SXin LI { 94d38c30c0SXin LI size_t nf = 0, tf = 0, nl = 0; 95d38c30c0SXin LI 96d38c30c0SXin LI while (uc < ue) { 97*43a5ec4eSXin LI switch (*uc++) { 98d38c30c0SXin LI case '"': 99d38c30c0SXin LI // Eat until the matching quote 100d38c30c0SXin LI uc = eatquote(uc, ue); 101d38c30c0SXin LI break; 102d38c30c0SXin LI case ',': 103d38c30c0SXin LI nf++; 104d38c30c0SXin LI break; 105d38c30c0SXin LI case '\n': 106d38c30c0SXin LI DPRINTF("%zu %zu %zu\n", nl, nf, tf); 107d38c30c0SXin LI nl++; 108d38c30c0SXin LI #if CSV_LINES 109d38c30c0SXin LI if (nl == CSV_LINES) 110d38c30c0SXin LI return tf != 0 && tf == nf; 111d38c30c0SXin LI #endif 112d38c30c0SXin LI if (tf == 0) { 113d38c30c0SXin LI // First time and no fields, give up 114d38c30c0SXin LI if (nf == 0) 115d38c30c0SXin LI return 0; 116d38c30c0SXin LI // First time, set the number of fields 117d38c30c0SXin LI tf = nf; 118d38c30c0SXin LI } else if (tf != nf) { 119d38c30c0SXin LI // Field number mismatch, we are done. 120d38c30c0SXin LI return 0; 121d38c30c0SXin LI } 122d38c30c0SXin LI nf = 0; 123d38c30c0SXin LI break; 124d38c30c0SXin LI default: 125d38c30c0SXin LI break; 126d38c30c0SXin LI } 127d38c30c0SXin LI } 128d38c30c0SXin LI return tf && nl > 2; 129d38c30c0SXin LI } 130d38c30c0SXin LI 131d38c30c0SXin LI #ifndef TEST 132d38c30c0SXin LI int 133d38c30c0SXin LI file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text) 134d38c30c0SXin LI { 135d38c30c0SXin LI const unsigned char *uc = CAST(const unsigned char *, b->fbuf); 136d38c30c0SXin LI const unsigned char *ue = uc + b->flen; 137d38c30c0SXin LI int mime = ms->flags & MAGIC_MIME; 138d38c30c0SXin LI 139d38c30c0SXin LI if (!looks_text) 140d38c30c0SXin LI return 0; 141d38c30c0SXin LI 142d38c30c0SXin LI if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0) 143d38c30c0SXin LI return 0; 144d38c30c0SXin LI 145d38c30c0SXin LI if (!csv_parse(uc, ue)) 146d38c30c0SXin LI return 0; 147d38c30c0SXin LI 148d38c30c0SXin LI if (mime == MAGIC_MIME_ENCODING) 149d38c30c0SXin LI return 1; 150d38c30c0SXin LI 151d38c30c0SXin LI if (mime) { 152*43a5ec4eSXin LI if (file_printf(ms, "text/csv") == -1) 153d38c30c0SXin LI return -1; 154d38c30c0SXin LI return 1; 155d38c30c0SXin LI } 156d38c30c0SXin LI 157d38c30c0SXin LI if (file_printf(ms, "CSV text") == -1) 158d38c30c0SXin LI return -1; 159d38c30c0SXin LI 160d38c30c0SXin LI return 1; 161d38c30c0SXin LI } 162d38c30c0SXin LI 163d38c30c0SXin LI #else 164d38c30c0SXin LI 165d38c30c0SXin LI #include <sys/types.h> 166d38c30c0SXin LI #include <sys/stat.h> 167d38c30c0SXin LI #include <stdio.h> 168d38c30c0SXin LI #include <fcntl.h> 169d38c30c0SXin LI #include <unistd.h> 170d38c30c0SXin LI #include <stdlib.h> 171d38c30c0SXin LI #include <stdint.h> 172d38c30c0SXin LI #include <err.h> 173d38c30c0SXin LI 174d38c30c0SXin LI int 175d38c30c0SXin LI main(int argc, char *argv[]) 176d38c30c0SXin LI { 177d38c30c0SXin LI int fd, rv; 178d38c30c0SXin LI struct stat st; 179d38c30c0SXin LI unsigned char *p; 180d38c30c0SXin LI 181d38c30c0SXin LI if ((fd = open(argv[1], O_RDONLY)) == -1) 182d38c30c0SXin LI err(EXIT_FAILURE, "Can't open `%s'", argv[1]); 183d38c30c0SXin LI 184d38c30c0SXin LI if (fstat(fd, &st) == -1) 185d38c30c0SXin LI err(EXIT_FAILURE, "Can't stat `%s'", argv[1]); 186d38c30c0SXin LI 187d38c30c0SXin LI if ((p = malloc(st.st_size)) == NULL) 188d38c30c0SXin LI err(EXIT_FAILURE, "Can't allocate %jd bytes", 189d38c30c0SXin LI (intmax_t)st.st_size); 190d38c30c0SXin LI if (read(fd, p, st.st_size) != st.st_size) 191d38c30c0SXin LI err(EXIT_FAILURE, "Can't read %jd bytes", 192d38c30c0SXin LI (intmax_t)st.st_size); 193d38c30c0SXin LI printf("is csv %d\n", csv_parse(p, p + st.st_size)); 194d38c30c0SXin LI return 0; 195d38c30c0SXin LI } 196d38c30c0SXin LI #endif 197