cmd/spell/huff.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

#pragma ident	"%Z%%M%	%I%	%E% SMI"


#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

#define	BYTE 8		/* number of bits assumed per byte */
#define	QW 1		/* width of base-q digit in bits */

/*
 * this stuff should be local and hidden; it was made
 * accessible outside for dirty reasons: 20% faster spell
 */
#include "huff.h"
/*
 * The one instance of the code parameters (struct huff is declared in
 * huff.h).  It is filled in by huff(), dumped by whuff() and loaded by
 * rhuff().
 */
struct huff huffcode;

/*
 * Infinite Huffman code
 *
 * Let the messages be exponentially distributed with ratio r:
 * 	P {message k} = r^k*(1-r),	k = 0, 1, ...
 * Let the messages be coded in base q, and suppose
 * 	r^n = 1/q
 * If each decade(base q) contains n codes, then
 * the messages assigned to each decade will be q times
 * as probable as the next. Moreover the code for the tail of
 * the distribution after truncating one decade should look
 * just like the original, but longer by one leading digit q-1.
 * 	q(z+n) = z + (q-1)q^w
 * where z is first code of decade, w is width of code, in shortest
 * full decade. Examples, base 2:
 * 	r^1 = 1/2	r^5 = 1/2
 * 	0		0110
 * 	10		0111
 * 	110		1000
 * 	1110		1001
 * 	...		1010
 * 			10110
 * 	w = 1, z = 0		w = 4, z = 0110
 * Rewriting slightly
 * 	(q-1)z + q*n = (q-1)q^w
 * whence z is a multiple of q and n is a multiple of q-1. Let
 * 	z = cq, n = d(q-1)
 * We pick w to be the least integer such that
 * 	d = n/(q-1) <= q^(w-1)
 * Then solve for c
 * 	c = q^(w-1) - d
 * If c is not zero, the first decade may be preceded by
 * even shorter (w-1)-digit codes 0, 1, ..., c-1. Thus
 * the example code with r^5 = 1/2 becomes
 * 	000
 * 	001
 * 	010
 * 	0110
 * 	0111
 * 	1000
 * 	1001
 * 	1010
 * 	10110
 * 	...
 * 	110110
 * 	...
 * The expected number of base-q digits in a codeword is then
 *	w - 1 + r^c/(1-r^n)
 * The present routines require q to be a power of 2
 */
/*
 * There is a lot of hanky-panky with left justification against
 * sign instead of simple left justification because
 * unsigned long was not available when this code was written
 * (the unsigned long casts used below are presumably a later addition)
 */
/*
 * L counts the bits of a long other than the topmost one, so bit number L
 * is the sign bit.  MASK has every bit set except bit L; and-ing with it
 * keeps a shifted code word from spilling into the sign.
 */
#define	L (BYTE*(sizeof (long))-1)	/* length of signless long */
#define	MASK (~((unsigned long)1<<L))	/* mask out sign */

/*
 * decode the prefix of word y (which is left justified against sign)
 * place mesage number into place pointed to by kp
 * return length (in bits) of decoded prefix or 0 if code is out of
 * range
 */
int
decode(long y, long *pk)
{
	int l;
	long v;
	if (y < cs) {
		*pk = y >> (long)(L+QW-w);
		return (w-QW);
	}
	for (l = w, v = v0; y >= qcs;
	    y = ((unsigned long)y << QW) & MASK, v += n)
		if ((l += QW) > L)
			return (0);
	*pk = v + (y>>(long)(L-w));
	return (l);
}

/*
 * Encode message number k and store the code word, right justified,
 * in the place pointed to by py.
 *
 * Returns the length (in bits) of the code word, or 0 if the code
 * would be too long (more than L bits); *py is not written then.
 */

int
encode(long k, long *py)
{
	int len;
	long lead;

	/* The first c messages are their own (short) code. */
	if (k < c) {
		*py = k;
		return (w-QW);
	}

	/*
	 * Walk forward one decade (n messages) at a time.  lead is
	 * multiplied by q per decade, so lead-1 ends up as the run of
	 * leading q-1 digits that prefixes the final w-bit field.
	 */
	k -= c;
	lead = 1;
	len = w;
	while (k >= n) {
		len += QW;
		if (len > L)
			return (0);
		k -= n;
		lead <<= QW;
	}

	*py = ((lead-1)<<w) + cq + k;
	return (len);
}


/*
 * Initialization code, given expected value of k
 * E(k) = r/(1-r) = a
 * and given base width b (apparently the compile-time constant QW;
 * huff() itself takes only a)
 * return expected length of coded messages
 */
/*
 * Result of qlog(): an exponent p together with u = x^p.
 * The most recent result is also left in the file-scope variable z,
 * which is where huff() picks it up.
 */
struct qlog {
	long p;
	double u;
};

static struct qlog z;

/*
 * Find the smallest p so that x^p <= y.
 *
 * The caller starts the search with p = 1 and u = x; in general u must
 * equal x^p.  Each level of recursion doubles p (and squares u) until
 * x^(p-1) <= y, then the exponent is assembled on the way back out by
 * adding in every p that can be added while x^(total-1) still exceeds y.
 *
 * Returns the result and also stores it in z.
 */
static struct qlog
qlog(double x, double y, long p, double u)
{
	struct qlog res;

	if (u/x <= y) {
		/* x^(p-1) is already small enough: contribute nothing */
		res.p = 0;
		res.u = 1;
	} else {
		res = qlog(x, y, p+p, u*u);
		if (u*res.u/x > y) {
			/* still too large without this p: include it */
			res.p += p;
			res.u *= u;
		}
	}

	z = res;
	return (res);
}

double
huff(float a)
{
	int i, q;
	long d, j;
	double r = a/(1.0 + a);
	double rc, rq;

	for (i = 0, q = 1, rq = r; i < QW; i++, q *= 2, rq *= rq)
		continue;
	rq /= r;	/* rq = r^(q-1) */
	(void) qlog(rq, 1./q, 1L, rq);
	d = z.p;
	n = d*(q-1);
	if (n != d * (q - 1))
		abort();	/* time to make n long */
	for (w = QW, j = 1; j < d; w += QW, j *= q)
		continue;
	c = j - d;
	cq = c*q;
	cs = cq<<(L-w);
	qcs = (((long)(q-1)<<w) + cq) << (L-QW-w);
	v0 = c - cq;
	for (i = 0, rc = 1; i < c; i++, rc *= r)	/* rc = r^c */
		continue;
	return (w + QW*(rc/(1-z.u) - 1));
}

/*
 * Write the raw contents of huffcode to stdout as a single item.
 * The result of fwrite is deliberately discarded.
 */
void
whuff(void)
{
	(void) fwrite(&huffcode, sizeof (huffcode), 1, stdout);
}

/*
 * Load huffcode from the file descriptor underlying f.  The bytes are
 * fetched with read(), not through the stdio buffer of f.
 *
 * Returns 1 if exactly sizeof (huffcode) bytes were read, 0 otherwise
 * (short read, end of file or error).
 */
int
rhuff(FILE *f)
{
	const size_t want = sizeof (huffcode);

	return (read(fileno(f), &huffcode, want) == want);
}