xref: /titanic_44/usr/src/lib/libast/common/misc/recfmt.c (revision 98157a7002f4f2cf7978f3084ca5577f0a1d72b2)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *           Copyright (c) 1985-2007 AT&T Knowledge Ventures            *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                      by AT&T Knowledge Ventures                      *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                   Phong Vo <kpv@research.att.com>                    *
20 *                                                                      *
21 ***********************************************************************/
22 #pragma prototyped
23 
24 /*
25  * determine record format by sampling data in <buf,size>
26  * total is the total file size, <=0 if not available
27  * return r:
28  *	-1				could not determine
29  *	RECTYPE(r)==REC_fixed		fixed length REC_F_SIZE(r)
30  *	RECTYPE(r)==REC_delimited	variable length delimiter=REC_D_DELIMITER(r)
31  *	RECTYPE(r)==REC_variable	variable length
32  */
33 
34 #include <recfmt.h>
35 
/*
 * scratch state for the fixed length record heuristic in recfmt()
 */

typedef struct
{
	/* hit[c]: sample offset at which byte value c was last seen */
	unsigned int	hit[UCHAR_MAX + 1];

	/* rep[d]: number of times a byte value recurred d bytes after its previous sighting */
	unsigned int	rep[4096];
} Sample_t;
41 
42 Recfmt_t
43 recfmt(const void* buf, size_t size, off_t total)
44 {
45 	register unsigned char*		s;
46 	register unsigned char*		t;
47 	register Sample_t*		q;
48 	register unsigned int*		h;
49 	register unsigned int		i;
50 	unsigned int			j;
51 	unsigned int			k;
52 	unsigned int			n;
53 	unsigned int			m;
54 	unsigned int			x;
55 	unsigned long			f;
56 	unsigned long			g;
57 
58 	static unsigned char		terminators[] = { '\n', 0x15, 0x25 };
59 
60 	/*
61 	 * check for V format
62 	 */
63 
64 	s = (unsigned char*)buf;
65 	t = s + size;
66 	while ((k = (t - s)) >= 4 && !s[2] && !s[3])
67 	{
68 		if ((i = (s[0]<<8)|s[1]) > k)
69 			break;
70 		s += i;
71 	}
72 	if (!k || size > 2 * k)
73 		return REC_V_TYPE(4, 0, 2, 0, 1);
74 	s = (unsigned char*)buf;
75 
76 	/*
77 	 * check for terminated records
78 	 */
79 
80 	for (i = 0; i < elementsof(terminators); i++)
81 		if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n)))
82 		{
83 			for (j = n - 1; j < size; j += n)
84 				if (s[j] != k)
85 				{
86 					n = 0;
87 					break;
88 				}
89 			if (n)
90 				return REC_D_TYPE(terminators[i]);
91 		}
92 
93 	/*
94 	 * check fixed length record frequencies
95 	 */
96 
97 	if (!(q = newof(0, Sample_t, 1, 0)))
98 		return REC_N_TYPE();
99 	x = 0;
100 	for (i = 0; i < size; i++)
101 	{
102 		h = q->hit + s[i];
103 		m = i - *h;
104 		*h = i;
105 		if (m < elementsof(q->rep))
106 		{
107 			if (m > x)
108 				x = m;
109 			q->rep[m]++;
110 		}
111 	}
112 	n = 0;
113 	m = 0;
114 	f = ~0;
115 	for (i = x; i > 1; i--)
116 	{
117 		if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n])
118 		{
119 			m++;
120 			g = 0;
121 			for (j = i; j < size - i; j += i)
122 				for (k = 0; k < i; k++)
123 					if (s[j + k] != s[j + k - i])
124 						g++;
125 			g = (((g * 100) / i) * 100) / q->rep[i];
126 			if (g <= f)
127 			{
128 				f = g;
129 				n = i;
130 			}
131 		}
132 	}
133 	if (m <= 1 && n <= 2 && total > 1 && total < 256)
134 	{
135 		n = 0;
136 		for (i = 0; i < size; i++)
137 			for (j = 0; j < elementsof(terminators); j++)
138 				if (s[i] == terminators[j])
139 					n++;
140 		n = n ? 0 : total;
141 	}
142 	free(q);
143 	return n ? REC_F_TYPE(n) : REC_N_TYPE();
144 }
145 
146 #if MAIN
147 
148 main()
149 {
150 	void*	s;
151 	size_t	size;
152 	off_t	total;
153 
154 	if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0)))
155 	{
156 		sfprintf(sfstderr, "read error\n");
157 		return 1;
158 	}
159 	size = sfvalue(sfstdin);
160 	total = sfsize(sfstdin);
161 	sfprintf(sfstdout, "%d\n", recfmt(s, size, total));
162 	return 0;
163 }
164 
165 #endif
166