1da2e3ebdSchin /***********************************************************************
2da2e3ebdSchin * *
3da2e3ebdSchin * This software is part of the ast package *
4*3e14f97fSRoger A. Faulkner * Copyright (c) 1985-2010 AT&T Intellectual Property *
5da2e3ebdSchin * and is licensed under the *
6da2e3ebdSchin * Common Public License, Version 1.0 *
77c2fbfb3SApril Chin * by AT&T Intellectual Property *
8da2e3ebdSchin * *
9da2e3ebdSchin * A copy of the License is available at *
10da2e3ebdSchin * http://www.opensource.org/licenses/cpl1.0.txt *
11da2e3ebdSchin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12da2e3ebdSchin * *
13da2e3ebdSchin * Information and Software Systems Research *
14da2e3ebdSchin * AT&T Research *
15da2e3ebdSchin * Florham Park NJ *
16da2e3ebdSchin * *
17da2e3ebdSchin * Glenn Fowler <gsf@research.att.com> *
18da2e3ebdSchin * David Korn <dgk@research.att.com> *
19da2e3ebdSchin * Phong Vo <kpv@research.att.com> *
20da2e3ebdSchin * *
21da2e3ebdSchin ***********************************************************************/
22da2e3ebdSchin #pragma prototyped
23da2e3ebdSchin
24da2e3ebdSchin /*
25da2e3ebdSchin * determine record format by sampling data in <buf,size>
26da2e3ebdSchin * total is the total file size, <=0 if not available
27da2e3ebdSchin * return r:
28da2e3ebdSchin * -1 could not determine
29da2e3ebdSchin * RECTYPE(r)==REC_fixed fixed length REC_F_SIZE(r)
30da2e3ebdSchin * RECTYPE(r)==REC_delimited variable length delimiter=REC_D_DELIMITER(r)
31da2e3ebdSchin * RECTYPE(r)==REC_variable variable length
32da2e3ebdSchin */
33da2e3ebdSchin
34da2e3ebdSchin #include <recfmt.h>
35da2e3ebdSchin
/*
 * scratch tables for the fixed length record check
 */

typedef struct
{
	unsigned int	rep[4096];		/* rep[d]: times a byte value recurred d offsets after its previous occurrence */
	unsigned int	hit[UCHAR_MAX + 1];	/* hit[c]: sample offset at which byte value c was last seen */
} Sample_t;
41da2e3ebdSchin
42da2e3ebdSchin Recfmt_t
recfmt(const void * buf,size_t size,off_t total)43da2e3ebdSchin recfmt(const void* buf, size_t size, off_t total)
44da2e3ebdSchin {
45da2e3ebdSchin register unsigned char* s;
46da2e3ebdSchin register unsigned char* t;
47da2e3ebdSchin register Sample_t* q;
48da2e3ebdSchin register unsigned int* h;
49da2e3ebdSchin register unsigned int i;
50da2e3ebdSchin unsigned int j;
51da2e3ebdSchin unsigned int k;
52da2e3ebdSchin unsigned int n;
53da2e3ebdSchin unsigned int m;
54da2e3ebdSchin unsigned int x;
55da2e3ebdSchin unsigned long f;
56da2e3ebdSchin unsigned long g;
57da2e3ebdSchin
58da2e3ebdSchin static unsigned char terminators[] = { '\n', 0x15, 0x25 };
59da2e3ebdSchin
60da2e3ebdSchin /*
61da2e3ebdSchin * check for V format
62da2e3ebdSchin */
63da2e3ebdSchin
64da2e3ebdSchin s = (unsigned char*)buf;
65da2e3ebdSchin t = s + size;
66da2e3ebdSchin while ((k = (t - s)) >= 4 && !s[2] && !s[3])
67da2e3ebdSchin {
68da2e3ebdSchin if ((i = (s[0]<<8)|s[1]) > k)
69da2e3ebdSchin break;
70da2e3ebdSchin s += i;
71da2e3ebdSchin }
72da2e3ebdSchin if (!k || size > 2 * k)
73da2e3ebdSchin return REC_V_TYPE(4, 0, 2, 0, 1);
74da2e3ebdSchin s = (unsigned char*)buf;
75da2e3ebdSchin
76da2e3ebdSchin /*
77da2e3ebdSchin * check for terminated records
78da2e3ebdSchin */
79da2e3ebdSchin
80da2e3ebdSchin for (i = 0; i < elementsof(terminators); i++)
81da2e3ebdSchin if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n)))
82da2e3ebdSchin {
83da2e3ebdSchin for (j = n - 1; j < size; j += n)
84da2e3ebdSchin if (s[j] != k)
85da2e3ebdSchin {
86da2e3ebdSchin n = 0;
87da2e3ebdSchin break;
88da2e3ebdSchin }
89da2e3ebdSchin if (n)
90da2e3ebdSchin return REC_D_TYPE(terminators[i]);
91da2e3ebdSchin }
92da2e3ebdSchin
93da2e3ebdSchin /*
94da2e3ebdSchin * check fixed length record frequencies
95da2e3ebdSchin */
96da2e3ebdSchin
97da2e3ebdSchin if (!(q = newof(0, Sample_t, 1, 0)))
98da2e3ebdSchin return REC_N_TYPE();
99da2e3ebdSchin x = 0;
100da2e3ebdSchin for (i = 0; i < size; i++)
101da2e3ebdSchin {
102da2e3ebdSchin h = q->hit + s[i];
103da2e3ebdSchin m = i - *h;
104da2e3ebdSchin *h = i;
105da2e3ebdSchin if (m < elementsof(q->rep))
106da2e3ebdSchin {
107da2e3ebdSchin if (m > x)
108da2e3ebdSchin x = m;
109da2e3ebdSchin q->rep[m]++;
110da2e3ebdSchin }
111da2e3ebdSchin }
112da2e3ebdSchin n = 0;
113da2e3ebdSchin m = 0;
114da2e3ebdSchin f = ~0;
115da2e3ebdSchin for (i = x; i > 1; i--)
116da2e3ebdSchin {
117da2e3ebdSchin if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n])
118da2e3ebdSchin {
119da2e3ebdSchin m++;
120da2e3ebdSchin g = 0;
121da2e3ebdSchin for (j = i; j < size - i; j += i)
122da2e3ebdSchin for (k = 0; k < i; k++)
123da2e3ebdSchin if (s[j + k] != s[j + k - i])
124da2e3ebdSchin g++;
125da2e3ebdSchin g = (((g * 100) / i) * 100) / q->rep[i];
126da2e3ebdSchin if (g <= f)
127da2e3ebdSchin {
128da2e3ebdSchin f = g;
129da2e3ebdSchin n = i;
130da2e3ebdSchin }
131da2e3ebdSchin }
132da2e3ebdSchin }
133da2e3ebdSchin if (m <= 1 && n <= 2 && total > 1 && total < 256)
134da2e3ebdSchin {
135da2e3ebdSchin n = 0;
136da2e3ebdSchin for (i = 0; i < size; i++)
137da2e3ebdSchin for (j = 0; j < elementsof(terminators); j++)
138da2e3ebdSchin if (s[i] == terminators[j])
139da2e3ebdSchin n++;
140da2e3ebdSchin n = n ? 0 : total;
141da2e3ebdSchin }
142da2e3ebdSchin free(q);
143da2e3ebdSchin return n ? REC_F_TYPE(n) : REC_N_TYPE();
144da2e3ebdSchin }
145da2e3ebdSchin
146da2e3ebdSchin #if MAIN
147da2e3ebdSchin
main()148da2e3ebdSchin main()
149da2e3ebdSchin {
150da2e3ebdSchin void* s;
151da2e3ebdSchin size_t size;
152da2e3ebdSchin off_t total;
153da2e3ebdSchin
154da2e3ebdSchin if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0)))
155da2e3ebdSchin {
156da2e3ebdSchin sfprintf(sfstderr, "read error\n");
157da2e3ebdSchin return 1;
158da2e3ebdSchin }
159da2e3ebdSchin size = sfvalue(sfstdin);
160da2e3ebdSchin total = sfsize(sfstdin);
161da2e3ebdSchin sfprintf(sfstdout, "%d\n", recfmt(s, size, total));
162da2e3ebdSchin return 0;
163da2e3ebdSchin }
164da2e3ebdSchin
165da2e3ebdSchin #endif
166