1 /*- 2 * Copyright (c) 2019 Christos Zoulas 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 15 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 16 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 18 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 * POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 /* 28 * Parse CSV object serialization format (RFC-4180, RFC-7111) 29 */ 30 31 #ifndef TEST 32 #include "file.h" 33 34 #ifndef lint 35 FILE_RCSID("@(#)$File: is_csv.c,v 1.15 2024/05/18 15:16:13 christos Exp $") 36 #endif 37 38 #include <string.h> 39 #include "magic.h" 40 #else 41 #define CAST(a, b) ((a)(b)) 42 #include <sys/types.h> 43 #endif 44 45 46 #ifdef DEBUG 47 #include <stdio.h> 48 #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__) 49 #else 50 #define DPRINTF(fmt, ...) 51 #endif 52 53 /* 54 * if CSV_LINES == 0: 55 * check all the lines in the buffer 56 * otherwise: 57 * check only up-to the number of lines specified 58 * 59 * the last line count is always ignored if it does not end in CRLF 60 */ 61 #ifndef CSV_LINES 62 #define CSV_LINES 10 63 #endif 64 65 static int csv_parse(const unsigned char *, const unsigned char *); 66 67 static const unsigned char * 68 eatquote(const unsigned char *uc, const unsigned char *ue) 69 { 70 int quote = 0; 71 72 while (uc < ue) { 73 unsigned char c = *uc++; 74 if (c != '"') { 75 // We already got one, done. 76 if (quote) { 77 return --uc; 78 } 79 continue; 80 } 81 if (quote) { 82 // quote-quote escapes 83 quote = 0; 84 continue; 85 } 86 // first quote 87 quote = 1; 88 } 89 return ue; 90 } 91 92 static int 93 csv_parse(const unsigned char *uc, const unsigned char *ue) 94 { 95 size_t nf = 0, tf = 0, nl = 0; 96 97 while (uc < ue) { 98 switch (*uc++) { 99 case '"': 100 // Eat until the matching quote 101 uc = eatquote(uc, ue); 102 break; 103 case ',': 104 nf++; 105 break; 106 case '\n': 107 DPRINTF("%zu %zu %zu\n", nl, nf, tf); 108 nl++; 109 #if CSV_LINES 110 if (nl == CSV_LINES) 111 return tf > 1 && tf == nf; 112 #endif 113 if (tf == 0) { 114 // First time and no fields, give up 115 if (nf == 0) 116 return 0; 117 // First time, set the number of fields 118 tf = nf; 119 } else if (tf != nf) { 120 // Field number mismatch, we are done. 121 return 0; 122 } 123 nf = 0; 124 break; 125 default: 126 break; 127 } 128 } 129 return tf > 1 && nl >= 2; 130 } 131 132 #ifndef TEST 133 int 134 file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text, 135 const char *code) 136 { 137 const unsigned char *uc = CAST(const unsigned char *, b->fbuf); 138 const unsigned char *ue = uc + b->flen; 139 int mime = ms->flags & MAGIC_MIME; 140 141 if (!looks_text) 142 return 0; 143 144 if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0) 145 return 0; 146 147 if (!csv_parse(uc, ue)) 148 return 0; 149 150 if (mime == MAGIC_MIME_ENCODING) 151 return 1; 152 153 if (mime) { 154 if (file_printf(ms, "text/csv") == -1) 155 return -1; 156 return 1; 157 } 158 159 if (file_printf(ms, "CSV %s%stext", code ? code : "", 160 code ? " " : "") == -1) 161 return -1; 162 163 return 1; 164 } 165 166 #else 167 168 #include <sys/types.h> 169 #include <sys/stat.h> 170 #include <stdio.h> 171 #include <fcntl.h> 172 #include <unistd.h> 173 #include <stdlib.h> 174 #include <stdint.h> 175 #include <err.h> 176 177 int 178 main(int argc, char *argv[]) 179 { 180 int fd; 181 struct stat st; 182 unsigned char *p; 183 184 if ((fd = open(argv[1], O_RDONLY)) == -1) 185 err(EXIT_FAILURE, "Can't open `%s'", argv[1]); 186 187 if (fstat(fd, &st) == -1) 188 err(EXIT_FAILURE, "Can't stat `%s'", argv[1]); 189 190 if ((p = CAST(unsigned char *, malloc(st.st_size))) == NULL) 191 err(EXIT_FAILURE, "Can't allocate %jd bytes", 192 (intmax_t)st.st_size); 193 if (read(fd, p, st.st_size) != st.st_size) 194 err(EXIT_FAILURE, "Can't read %jd bytes", 195 (intmax_t)st.st_size); 196 printf("is csv %d\n", csv_parse(p, p + st.st_size)); 197 return 0; 198 } 199 #endif 200