xref: /illumos-gate/usr/src/cmd/spell/huff.c (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <stdio.h>
33 
34 #define	BYTE 8
35 #define	QW 1		/* width of bas-q digit in bits */
36 
37 /*
38  * this stuff should be local and hidden; it was made
39  * accessible outside for dirty reasons: 20% faster spell
40  */
41 #include "huff.h"
42 struct huff huffcode;
43 
44 /*
45  * Infinite Huffman code
46  *
47  * Let the messages be exponentially distributed with ratio r:
48  * 	P {message k} = r^k*(1-r),	k = 0, 1, ...
49  * Let the messages be coded in base q, and suppose
50  * 	r^n = 1/q
51  * If each decade(base q) contains n codes, then
52  * the messages assigned to each decade will be q times
53  * as probable as the next. Moreover the code for the tail of
54  * the distribution after truncating one decade should look
55  * just like the original, but longer by one leading digit q-1.
56  * 	q(z+n) = z + (q-1)q^w
57  * where z is first code of decade, w is width of code, in shortest
58  * full decade. Examples, base 2:
59  * 	r^1 = 1/2	r^5 = 1/2
60  * 	0		0110
61  * 	10		0111
62  * 	110		1000
63  * 	1110		1001
64  * 	...		1010
65  * 			10110
66  * 	w = 1, z = 0		w = 4, z = 0110
67  * Rewriting slightly
68  * 	(q-1)z + q*n = (q-1)q^w
69  * whence z is a multiple of q and n is a multiple of q-1. Let
70  * 	z = cq, n = d(q-1)
71  * We pick w to be the least integer such that
72  * 	d = n/(q-1) <= q^(w-1)
73  * Then solve for c
74  * 	c = q^(w-1) - d
75  * If c is not zero, the first decade may be preceded by
76  * even shorter (w-1)-digit codes 0, 1, ..., c-1. Thus
77  * the example code with r^5 = 1/2 becomes
78  * 	000
79  * 	001
80  * 	010
81  * 	0110
82  * 	0111
83  * 	1000
84  * 	1001
85  * 	1010
86  * 	10110
87  * 	...
88  * 	110110
89  * 	...
90  * The expected number of base-q digits in a codeword is then
91  *	w - 1 + r^c/(1-r^n)
92  * The present routines require q to be a power of 2
93  */
94 /*
95  * There is a lot of hanky-panky with left justification against
96  * sign instead of simple left justification because
97  * unsigned long is not available
98  */
99 #define	L (BYTE*(sizeof (long))-1)	/* length of signless long */
100 #define	MASK (~((unsigned long)1<<L))	/* mask out sign */
101 
102 /*
103  * decode the prefix of word y (which is left justified against sign)
104  * place mesage number into place pointed to by kp
105  * return length (in bits) of decoded prefix or 0 if code is out of
106  * range
107  */
108 int
109 decode(long y, long *pk)
110 {
111 	int l;
112 	long v;
113 	if (y < cs) {
114 		*pk = y >> (long)(L+QW-w);
115 		return (w-QW);
116 	}
117 	for (l = w, v = v0; y >= qcs;
118 	    y = ((unsigned long)y << QW) & MASK, v += n)
119 		if ((l += QW) > L)
120 			return (0);
121 	*pk = v + (y>>(long)(L-w));
122 	return (l);
123 }
124 
125 /*
126  * encode message k and put result (right justified) into
127  * place pointed to by py.
128  * return length (in bits) of result,
129  * or 0 if code is too long
130  */
131 
132 int
133 encode(long k, long *py)
134 {
135 	int l;
136 	long y;
137 	if (k < c) {
138 		*py = k;
139 		return (w-QW);
140 	}
141 	for (k -= c, y = 1, l = w; k >= n; k -= n, y <<= QW)
142 		if ((l += QW) > L)
143 			return (0);
144 	*py = ((y-1)<<w) + cq + k;
145 	return (l);
146 }
147 
148 
149 /*
150  * Initialization code, given expected value of k
151  * E(k) = r/(1-r) = a
152  * and given base width b
153  * return expected length of coded messages
154  */
155 static struct qlog {
156 	long p;
157 	double u;
158 } z;
159 
160 static struct qlog
161 qlog(double x, double y, long p, double u)	/* find smallest p so x^p<=y */
162 {
163 
164 	if (u/x <= y) {
165 		z.p = 0;
166 		z.u = 1;
167 	} else {
168 		z = qlog(x, y, p+p, u*u);
169 		if (u*z.u/x > y) {
170 			z.p += p;
171 			z.u *= u;
172 		}
173 	}
174 	return (z);
175 }
176 
177 double
178 huff(float a)
179 {
180 	int i, q;
181 	long d, j;
182 	double r = a/(1.0 + a);
183 	double rc, rq;
184 
185 	for (i = 0, q = 1, rq = r; i < QW; i++, q *= 2, rq *= rq)
186 		continue;
187 	rq /= r;	/* rq = r^(q-1) */
188 	(void) qlog(rq, 1./q, 1L, rq);
189 	d = z.p;
190 	n = d*(q-1);
191 	if (n != d * (q - 1))
192 		abort();	/* time to make n long */
193 	for (w = QW, j = 1; j < d; w += QW, j *= q)
194 		continue;
195 	c = j - d;
196 	cq = c*q;
197 	cs = cq<<(L-w);
198 	qcs = (((long)(q-1)<<w) + cq) << (L-QW-w);
199 	v0 = c - cq;
200 	for (i = 0, rc = 1; i < c; i++, rc *= r)	/* rc = r^c */
201 		continue;
202 	return (w + QW*(rc/(1-z.u) - 1));
203 }
204 
205 void
206 whuff(void)
207 {
208 	(void) fwrite((char *) & huffcode, sizeof (huffcode), 1, stdout);
209 }
210 
211 int
212 rhuff(FILE *f)
213 {
214 	return (read(fileno(f), (char *)&huffcode, sizeof (huffcode)) ==
215 	    sizeof (huffcode));
216 }
217