1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2007 AT&T Knowledge Ventures * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Knowledge Ventures * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * determine record format by sampling data in <buf,size> 26 * total is the total file size, <=0 if not available 27 * return r: 28 * -1 could not determine 29 * RECTYPE(r)==REC_fixed fixed length REC_F_SIZE(r) 30 * RECTYPE(r)==REC_delimited variable length delimiter=REC_D_DELIMITER(r) 31 * RECTYPE(r)==REC_variable variable length 32 */ 33 34 #include <recfmt.h> 35 36 typedef struct 37 { 38 unsigned int rep[4 * 1024]; 39 unsigned int hit[UCHAR_MAX + 1]; 40 } Sample_t; 41 42 Recfmt_t 43 recfmt(const void* buf, size_t size, off_t total) 44 { 45 register unsigned char* s; 46 register unsigned char* t; 47 register Sample_t* q; 48 register unsigned int* h; 49 register unsigned int i; 50 unsigned int j; 51 unsigned int k; 52 unsigned int n; 53 unsigned int m; 54 unsigned int x; 55 unsigned long f; 56 unsigned long g; 57 58 static unsigned char terminators[] = { '\n', 0x15, 0x25 }; 59 60 /* 61 * check for V format 62 */ 63 64 s = (unsigned char*)buf; 65 t = s + size; 66 while ((k = (t - s)) >= 4 && !s[2] && !s[3]) 67 { 68 if ((i = (s[0]<<8)|s[1]) > k) 69 break; 70 s += i; 71 } 72 if (!k || size > 2 * k) 73 return REC_V_TYPE(4, 0, 2, 0, 1); 74 s = (unsigned char*)buf; 75 76 /* 77 * check for terminated records 78 */ 79 80 for (i = 0; i < elementsof(terminators); i++) 81 if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n))) 82 { 83 for (j = n - 1; j < size; j += n) 84 if (s[j] != k) 85 { 86 n = 0; 87 break; 88 } 89 if (n) 90 return REC_D_TYPE(terminators[i]); 91 } 92 93 /* 94 * check fixed length record frequencies 95 */ 96 97 if (!(q = newof(0, Sample_t, 1, 0))) 98 return REC_N_TYPE(); 99 x = 0; 100 for (i = 0; i < size; i++) 101 { 102 h = q->hit + s[i]; 103 m = i - *h; 104 *h = i; 105 if (m < elementsof(q->rep)) 106 { 107 if (m > x) 108 x = m; 109 q->rep[m]++; 110 } 111 } 112 n = 0; 113 m = 0; 114 f = ~0; 115 for (i = x; i > 1; i--) 116 { 117 if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n]) 118 { 119 m++; 120 g = 0; 121 for (j = i; j < size - i; j += i) 122 for (k = 0; k < i; k++) 123 if (s[j + k] != s[j + k - i]) 124 g++; 125 g = (((g * 100) / i) * 100) / q->rep[i]; 126 if (g <= f) 127 { 128 f = g; 129 n = i; 130 } 131 } 132 } 133 if (m <= 1 && n <= 2 && total > 1 && total < 256) 134 { 135 n = 0; 136 for (i = 0; i < size; i++) 137 for (j = 0; j < elementsof(terminators); j++) 138 if (s[i] == terminators[j]) 139 n++; 140 n = n ? 0 : total; 141 } 142 free(q); 143 return n ? REC_F_TYPE(n) : REC_N_TYPE(); 144 } 145 146 #if MAIN 147 148 main() 149 { 150 void* s; 151 size_t size; 152 off_t total; 153 154 if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0))) 155 { 156 sfprintf(sfstderr, "read error\n"); 157 return 1; 158 } 159 size = sfvalue(sfstdin); 160 total = sfsize(sfstdin); 161 sfprintf(sfstdout, "%d\n", recfmt(s, size, total)); 162 return 0; 163 } 164 165 #endif 166