1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
23
24 /*
25 * determine record format by sampling data in <buf,size>
26 * total is the total file size, <=0 if not available
27 * return r:
28 * -1 could not determine
29 * RECTYPE(r)==REC_fixed fixed length REC_F_SIZE(r)
30 * RECTYPE(r)==REC_delimited variable length delimiter=REC_D_DELIMITER(r)
31 * RECTYPE(r)==REC_variable variable length
32 */
33
34 #include <recfmt.h>
35
/*
 * scratch counters for the fixed length record check in recfmt()
 */

typedef struct
{
	/* rep[d]: how often a byte value recurred d positions after its previous occurrence */
	unsigned int	rep[4 * 1024];

	/* hit[c]: sample index at which byte value c was last seen */
	unsigned int	hit[UCHAR_MAX + 1];
} Sample_t;
41
42 Recfmt_t
recfmt(const void * buf,size_t size,off_t total)43 recfmt(const void* buf, size_t size, off_t total)
44 {
45 register unsigned char* s;
46 register unsigned char* t;
47 register Sample_t* q;
48 register unsigned int* h;
49 register unsigned int i;
50 unsigned int j;
51 unsigned int k;
52 unsigned int n;
53 unsigned int m;
54 unsigned int x;
55 unsigned long f;
56 unsigned long g;
57
58 static unsigned char terminators[] = { '\n', 0x15, 0x25 };
59
60 /*
61 * check for V format
62 */
63
64 s = (unsigned char*)buf;
65 t = s + size;
66 while ((k = (t - s)) >= 4 && !s[2] && !s[3])
67 {
68 if ((i = (s[0]<<8)|s[1]) > k)
69 break;
70 s += i;
71 }
72 if (!k || size > 2 * k)
73 return REC_V_TYPE(4, 0, 2, 0, 1);
74 s = (unsigned char*)buf;
75
76 /*
77 * check for terminated records
78 */
79
80 for (i = 0; i < elementsof(terminators); i++)
81 if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n)))
82 {
83 for (j = n - 1; j < size; j += n)
84 if (s[j] != k)
85 {
86 n = 0;
87 break;
88 }
89 if (n)
90 return REC_D_TYPE(terminators[i]);
91 }
92
93 /*
94 * check fixed length record frequencies
95 */
96
97 if (!(q = newof(0, Sample_t, 1, 0)))
98 return REC_N_TYPE();
99 x = 0;
100 for (i = 0; i < size; i++)
101 {
102 h = q->hit + s[i];
103 m = i - *h;
104 *h = i;
105 if (m < elementsof(q->rep))
106 {
107 if (m > x)
108 x = m;
109 q->rep[m]++;
110 }
111 }
112 n = 0;
113 m = 0;
114 f = ~0;
115 for (i = x; i > 1; i--)
116 {
117 if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n])
118 {
119 m++;
120 g = 0;
121 for (j = i; j < size - i; j += i)
122 for (k = 0; k < i; k++)
123 if (s[j + k] != s[j + k - i])
124 g++;
125 g = (((g * 100) / i) * 100) / q->rep[i];
126 if (g <= f)
127 {
128 f = g;
129 n = i;
130 }
131 }
132 }
133 if (m <= 1 && n <= 2 && total > 1 && total < 256)
134 {
135 n = 0;
136 for (i = 0; i < size; i++)
137 for (j = 0; j < elementsof(terminators); j++)
138 if (s[i] == terminators[j])
139 n++;
140 n = n ? 0 : total;
141 }
142 free(q);
143 return n ? REC_F_TYPE(n) : REC_N_TYPE();
144 }
145
146 #if MAIN
147
main()148 main()
149 {
150 void* s;
151 size_t size;
152 off_t total;
153
154 if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0)))
155 {
156 sfprintf(sfstderr, "read error\n");
157 return 1;
158 }
159 size = sfvalue(sfstdin);
160 total = sfsize(sfstdin);
161 sfprintf(sfstdout, "%d\n", recfmt(s, size, total));
162 return 0;
163 }
164
165 #endif
166