xref: /freebsd/contrib/xz/src/liblzma/rangecoder/range_encoder.h (revision 73ed8e77a79398eb8e7b600a0b67f286e9e5cd53)
181ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
281ad8388SMartin Matuska //
381ad8388SMartin Matuska /// \file       range_encoder.h
481ad8388SMartin Matuska /// \brief      Range Encoder
581ad8388SMartin Matuska ///
681ad8388SMartin Matuska //  Authors:    Igor Pavlov
781ad8388SMartin Matuska //              Lasse Collin
881ad8388SMartin Matuska //
981ad8388SMartin Matuska //  This file has been put into the public domain.
1081ad8388SMartin Matuska //  You can do whatever you want with this file.
1181ad8388SMartin Matuska //
1281ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
1381ad8388SMartin Matuska 
1481ad8388SMartin Matuska #ifndef LZMA_RANGE_ENCODER_H
1581ad8388SMartin Matuska #define LZMA_RANGE_ENCODER_H
1681ad8388SMartin Matuska 
1781ad8388SMartin Matuska #include "range_common.h"
1881ad8388SMartin Matuska #include "price.h"
1981ad8388SMartin Matuska 
2081ad8388SMartin Matuska 
2181ad8388SMartin Matuska /// Maximum number of symbols that can be put pending into lzma_range_encoder
22*73ed8e77SXin LI /// structure between calls to lzma_rc_encode(). For LZMA, 48+5 is enough
2381ad8388SMartin Matuska /// (match with big distance and length followed by range encoder flush).
24*73ed8e77SXin LI #define RC_SYMBOLS_MAX 53
2581ad8388SMartin Matuska 
2681ad8388SMartin Matuska 
2781ad8388SMartin Matuska typedef struct {
2881ad8388SMartin Matuska 	uint64_t low;
2981ad8388SMartin Matuska 	uint64_t cache_size;
3081ad8388SMartin Matuska 	uint32_t range;
3181ad8388SMartin Matuska 	uint8_t cache;
3281ad8388SMartin Matuska 
33*73ed8e77SXin LI 	/// Number of bytes written out by rc_encode() -> rc_shift_low()
34*73ed8e77SXin LI 	uint64_t out_total;
35*73ed8e77SXin LI 
3681ad8388SMartin Matuska 	/// Number of symbols in the tables
3781ad8388SMartin Matuska 	size_t count;
3881ad8388SMartin Matuska 
3981ad8388SMartin Matuska 	/// rc_encode()'s position in the tables
4081ad8388SMartin Matuska 	size_t pos;
4181ad8388SMartin Matuska 
4281ad8388SMartin Matuska 	/// Symbols to encode
4381ad8388SMartin Matuska 	enum {
4481ad8388SMartin Matuska 		RC_BIT_0,
4581ad8388SMartin Matuska 		RC_BIT_1,
4681ad8388SMartin Matuska 		RC_DIRECT_0,
4781ad8388SMartin Matuska 		RC_DIRECT_1,
4881ad8388SMartin Matuska 		RC_FLUSH,
4981ad8388SMartin Matuska 	} symbols[RC_SYMBOLS_MAX];
5081ad8388SMartin Matuska 
5181ad8388SMartin Matuska 	/// Probabilities associated with RC_BIT_0 or RC_BIT_1
5281ad8388SMartin Matuska 	probability *probs[RC_SYMBOLS_MAX];
5381ad8388SMartin Matuska 
5481ad8388SMartin Matuska } lzma_range_encoder;
5581ad8388SMartin Matuska 
5681ad8388SMartin Matuska 
5781ad8388SMartin Matuska static inline void
5881ad8388SMartin Matuska rc_reset(lzma_range_encoder *rc)
5981ad8388SMartin Matuska {
6081ad8388SMartin Matuska 	rc->low = 0;
6181ad8388SMartin Matuska 	rc->cache_size = 1;
6281ad8388SMartin Matuska 	rc->range = UINT32_MAX;
6381ad8388SMartin Matuska 	rc->cache = 0;
64*73ed8e77SXin LI 	rc->out_total = 0;
6581ad8388SMartin Matuska 	rc->count = 0;
6681ad8388SMartin Matuska 	rc->pos = 0;
6781ad8388SMartin Matuska }
6881ad8388SMartin Matuska 
6981ad8388SMartin Matuska 
7081ad8388SMartin Matuska static inline void
71*73ed8e77SXin LI rc_forget(lzma_range_encoder *rc)
72*73ed8e77SXin LI {
73*73ed8e77SXin LI 	// This must not be called when rc_encode() is partially done.
74*73ed8e77SXin LI 	assert(rc->pos == 0);
75*73ed8e77SXin LI 	rc->count = 0;
76*73ed8e77SXin LI }
77*73ed8e77SXin LI 
78*73ed8e77SXin LI 
79*73ed8e77SXin LI static inline void
8081ad8388SMartin Matuska rc_bit(lzma_range_encoder *rc, probability *prob, uint32_t bit)
8181ad8388SMartin Matuska {
8281ad8388SMartin Matuska 	rc->symbols[rc->count] = bit;
8381ad8388SMartin Matuska 	rc->probs[rc->count] = prob;
8481ad8388SMartin Matuska 	++rc->count;
8581ad8388SMartin Matuska }
8681ad8388SMartin Matuska 
8781ad8388SMartin Matuska 
8881ad8388SMartin Matuska static inline void
8981ad8388SMartin Matuska rc_bittree(lzma_range_encoder *rc, probability *probs,
9081ad8388SMartin Matuska 		uint32_t bit_count, uint32_t symbol)
9181ad8388SMartin Matuska {
9281ad8388SMartin Matuska 	uint32_t model_index = 1;
9381ad8388SMartin Matuska 
9481ad8388SMartin Matuska 	do {
9581ad8388SMartin Matuska 		const uint32_t bit = (symbol >> --bit_count) & 1;
9681ad8388SMartin Matuska 		rc_bit(rc, &probs[model_index], bit);
9781ad8388SMartin Matuska 		model_index = (model_index << 1) + bit;
9881ad8388SMartin Matuska 	} while (bit_count != 0);
9981ad8388SMartin Matuska }
10081ad8388SMartin Matuska 
10181ad8388SMartin Matuska 
10281ad8388SMartin Matuska static inline void
10381ad8388SMartin Matuska rc_bittree_reverse(lzma_range_encoder *rc, probability *probs,
10481ad8388SMartin Matuska 		uint32_t bit_count, uint32_t symbol)
10581ad8388SMartin Matuska {
10681ad8388SMartin Matuska 	uint32_t model_index = 1;
10781ad8388SMartin Matuska 
10881ad8388SMartin Matuska 	do {
10981ad8388SMartin Matuska 		const uint32_t bit = symbol & 1;
11081ad8388SMartin Matuska 		symbol >>= 1;
11181ad8388SMartin Matuska 		rc_bit(rc, &probs[model_index], bit);
11281ad8388SMartin Matuska 		model_index = (model_index << 1) + bit;
11381ad8388SMartin Matuska 	} while (--bit_count != 0);
11481ad8388SMartin Matuska }
11581ad8388SMartin Matuska 
11681ad8388SMartin Matuska 
11781ad8388SMartin Matuska static inline void
11881ad8388SMartin Matuska rc_direct(lzma_range_encoder *rc,
11981ad8388SMartin Matuska 		uint32_t value, uint32_t bit_count)
12081ad8388SMartin Matuska {
12181ad8388SMartin Matuska 	do {
12281ad8388SMartin Matuska 		rc->symbols[rc->count++]
12381ad8388SMartin Matuska 				= RC_DIRECT_0 + ((value >> --bit_count) & 1);
12481ad8388SMartin Matuska 	} while (bit_count != 0);
12581ad8388SMartin Matuska }
12681ad8388SMartin Matuska 
12781ad8388SMartin Matuska 
12881ad8388SMartin Matuska static inline void
12981ad8388SMartin Matuska rc_flush(lzma_range_encoder *rc)
13081ad8388SMartin Matuska {
13181ad8388SMartin Matuska 	for (size_t i = 0; i < 5; ++i)
13281ad8388SMartin Matuska 		rc->symbols[rc->count++] = RC_FLUSH;
13381ad8388SMartin Matuska }
13481ad8388SMartin Matuska 
13581ad8388SMartin Matuska 
13681ad8388SMartin Matuska static inline bool
13781ad8388SMartin Matuska rc_shift_low(lzma_range_encoder *rc,
13881ad8388SMartin Matuska 		uint8_t *out, size_t *out_pos, size_t out_size)
13981ad8388SMartin Matuska {
14081ad8388SMartin Matuska 	if ((uint32_t)(rc->low) < (uint32_t)(0xFF000000)
14181ad8388SMartin Matuska 			|| (uint32_t)(rc->low >> 32) != 0) {
14281ad8388SMartin Matuska 		do {
14381ad8388SMartin Matuska 			if (*out_pos == out_size)
14481ad8388SMartin Matuska 				return true;
14581ad8388SMartin Matuska 
14681ad8388SMartin Matuska 			out[*out_pos] = rc->cache + (uint8_t)(rc->low >> 32);
14781ad8388SMartin Matuska 			++*out_pos;
148*73ed8e77SXin LI 			++rc->out_total;
14981ad8388SMartin Matuska 			rc->cache = 0xFF;
15081ad8388SMartin Matuska 
15181ad8388SMartin Matuska 		} while (--rc->cache_size != 0);
15281ad8388SMartin Matuska 
15381ad8388SMartin Matuska 		rc->cache = (rc->low >> 24) & 0xFF;
15481ad8388SMartin Matuska 	}
15581ad8388SMartin Matuska 
15681ad8388SMartin Matuska 	++rc->cache_size;
15781ad8388SMartin Matuska 	rc->low = (rc->low & 0x00FFFFFF) << RC_SHIFT_BITS;
15881ad8388SMartin Matuska 
15981ad8388SMartin Matuska 	return false;
16081ad8388SMartin Matuska }
16181ad8388SMartin Matuska 
16281ad8388SMartin Matuska 
163*73ed8e77SXin LI // NOTE: The last two arguments are uint64_t instead of size_t because in
164*73ed8e77SXin LI // the dummy version these refer to the size of the whole range-encoded
165*73ed8e77SXin LI // output stream, not just to the currently available output buffer space.
166*73ed8e77SXin LI static inline bool
167*73ed8e77SXin LI rc_shift_low_dummy(uint64_t *low, uint64_t *cache_size, uint8_t *cache,
168*73ed8e77SXin LI 		uint64_t *out_pos, uint64_t out_size)
169*73ed8e77SXin LI {
170*73ed8e77SXin LI 	if ((uint32_t)(*low) < (uint32_t)(0xFF000000)
171*73ed8e77SXin LI 			|| (uint32_t)(*low >> 32) != 0) {
172*73ed8e77SXin LI 		do {
173*73ed8e77SXin LI 			if (*out_pos == out_size)
174*73ed8e77SXin LI 				return true;
175*73ed8e77SXin LI 
176*73ed8e77SXin LI 			++*out_pos;
177*73ed8e77SXin LI 			*cache = 0xFF;
178*73ed8e77SXin LI 
179*73ed8e77SXin LI 		} while (--*cache_size != 0);
180*73ed8e77SXin LI 
181*73ed8e77SXin LI 		*cache = (*low >> 24) & 0xFF;
182*73ed8e77SXin LI 	}
183*73ed8e77SXin LI 
184*73ed8e77SXin LI 	++*cache_size;
185*73ed8e77SXin LI 	*low = (*low & 0x00FFFFFF) << RC_SHIFT_BITS;
186*73ed8e77SXin LI 
187*73ed8e77SXin LI 	return false;
188*73ed8e77SXin LI }
189*73ed8e77SXin LI 
190*73ed8e77SXin LI 
19181ad8388SMartin Matuska static inline bool
19281ad8388SMartin Matuska rc_encode(lzma_range_encoder *rc,
19381ad8388SMartin Matuska 		uint8_t *out, size_t *out_pos, size_t out_size)
19481ad8388SMartin Matuska {
19581ad8388SMartin Matuska 	assert(rc->count <= RC_SYMBOLS_MAX);
19681ad8388SMartin Matuska 
19781ad8388SMartin Matuska 	while (rc->pos < rc->count) {
19881ad8388SMartin Matuska 		// Normalize
19981ad8388SMartin Matuska 		if (rc->range < RC_TOP_VALUE) {
20081ad8388SMartin Matuska 			if (rc_shift_low(rc, out, out_pos, out_size))
20181ad8388SMartin Matuska 				return true;
20281ad8388SMartin Matuska 
20381ad8388SMartin Matuska 			rc->range <<= RC_SHIFT_BITS;
20481ad8388SMartin Matuska 		}
20581ad8388SMartin Matuska 
20681ad8388SMartin Matuska 		// Encode a bit
20781ad8388SMartin Matuska 		switch (rc->symbols[rc->pos]) {
20881ad8388SMartin Matuska 		case RC_BIT_0: {
20981ad8388SMartin Matuska 			probability prob = *rc->probs[rc->pos];
21081ad8388SMartin Matuska 			rc->range = (rc->range >> RC_BIT_MODEL_TOTAL_BITS)
21181ad8388SMartin Matuska 					* prob;
21281ad8388SMartin Matuska 			prob += (RC_BIT_MODEL_TOTAL - prob) >> RC_MOVE_BITS;
21381ad8388SMartin Matuska 			*rc->probs[rc->pos] = prob;
21481ad8388SMartin Matuska 			break;
21581ad8388SMartin Matuska 		}
21681ad8388SMartin Matuska 
21781ad8388SMartin Matuska 		case RC_BIT_1: {
21881ad8388SMartin Matuska 			probability prob = *rc->probs[rc->pos];
21981ad8388SMartin Matuska 			const uint32_t bound = prob * (rc->range
22081ad8388SMartin Matuska 					>> RC_BIT_MODEL_TOTAL_BITS);
22181ad8388SMartin Matuska 			rc->low += bound;
22281ad8388SMartin Matuska 			rc->range -= bound;
22381ad8388SMartin Matuska 			prob -= prob >> RC_MOVE_BITS;
22481ad8388SMartin Matuska 			*rc->probs[rc->pos] = prob;
22581ad8388SMartin Matuska 			break;
22681ad8388SMartin Matuska 		}
22781ad8388SMartin Matuska 
22881ad8388SMartin Matuska 		case RC_DIRECT_0:
22981ad8388SMartin Matuska 			rc->range >>= 1;
23081ad8388SMartin Matuska 			break;
23181ad8388SMartin Matuska 
23281ad8388SMartin Matuska 		case RC_DIRECT_1:
23381ad8388SMartin Matuska 			rc->range >>= 1;
23481ad8388SMartin Matuska 			rc->low += rc->range;
23581ad8388SMartin Matuska 			break;
23681ad8388SMartin Matuska 
23781ad8388SMartin Matuska 		case RC_FLUSH:
23881ad8388SMartin Matuska 			// Prevent further normalizations.
23981ad8388SMartin Matuska 			rc->range = UINT32_MAX;
24081ad8388SMartin Matuska 
24181ad8388SMartin Matuska 			// Flush the last five bytes (see rc_flush()).
24281ad8388SMartin Matuska 			do {
24381ad8388SMartin Matuska 				if (rc_shift_low(rc, out, out_pos, out_size))
24481ad8388SMartin Matuska 					return true;
24581ad8388SMartin Matuska 			} while (++rc->pos < rc->count);
24681ad8388SMartin Matuska 
24781ad8388SMartin Matuska 			// Reset the range encoder so we are ready to continue
24881ad8388SMartin Matuska 			// encoding if we weren't finishing the stream.
24981ad8388SMartin Matuska 			rc_reset(rc);
25081ad8388SMartin Matuska 			return false;
25181ad8388SMartin Matuska 
25281ad8388SMartin Matuska 		default:
25381ad8388SMartin Matuska 			assert(0);
25481ad8388SMartin Matuska 			break;
25581ad8388SMartin Matuska 		}
25681ad8388SMartin Matuska 
25781ad8388SMartin Matuska 		++rc->pos;
25881ad8388SMartin Matuska 	}
25981ad8388SMartin Matuska 
26081ad8388SMartin Matuska 	rc->count = 0;
26181ad8388SMartin Matuska 	rc->pos = 0;
26281ad8388SMartin Matuska 
26381ad8388SMartin Matuska 	return false;
26481ad8388SMartin Matuska }
26581ad8388SMartin Matuska 
26681ad8388SMartin Matuska 
267*73ed8e77SXin LI static inline bool
268*73ed8e77SXin LI rc_encode_dummy(const lzma_range_encoder *rc, uint64_t out_limit)
269*73ed8e77SXin LI {
270*73ed8e77SXin LI 	assert(rc->count <= RC_SYMBOLS_MAX);
271*73ed8e77SXin LI 
272*73ed8e77SXin LI 	uint64_t low = rc->low;
273*73ed8e77SXin LI 	uint64_t cache_size = rc->cache_size;
274*73ed8e77SXin LI 	uint32_t range = rc->range;
275*73ed8e77SXin LI 	uint8_t cache = rc->cache;
276*73ed8e77SXin LI 	uint64_t out_pos = rc->out_total;
277*73ed8e77SXin LI 
278*73ed8e77SXin LI 	size_t pos = rc->pos;
279*73ed8e77SXin LI 
280*73ed8e77SXin LI 	while (true) {
281*73ed8e77SXin LI 		// Normalize
282*73ed8e77SXin LI 		if (range < RC_TOP_VALUE) {
283*73ed8e77SXin LI 			if (rc_shift_low_dummy(&low, &cache_size, &cache,
284*73ed8e77SXin LI 					&out_pos, out_limit))
285*73ed8e77SXin LI 				return true;
286*73ed8e77SXin LI 
287*73ed8e77SXin LI 			range <<= RC_SHIFT_BITS;
288*73ed8e77SXin LI 		}
289*73ed8e77SXin LI 
290*73ed8e77SXin LI 		// This check is here because the normalization above must
291*73ed8e77SXin LI 		// be done before flushing the last bytes.
292*73ed8e77SXin LI 		if (pos == rc->count)
293*73ed8e77SXin LI 			break;
294*73ed8e77SXin LI 
295*73ed8e77SXin LI 		// Encode a bit
296*73ed8e77SXin LI 		switch (rc->symbols[pos]) {
297*73ed8e77SXin LI 		case RC_BIT_0: {
298*73ed8e77SXin LI 			probability prob = *rc->probs[pos];
299*73ed8e77SXin LI 			range = (range >> RC_BIT_MODEL_TOTAL_BITS)
300*73ed8e77SXin LI 					* prob;
301*73ed8e77SXin LI 			break;
302*73ed8e77SXin LI 		}
303*73ed8e77SXin LI 
304*73ed8e77SXin LI 		case RC_BIT_1: {
305*73ed8e77SXin LI 			probability prob = *rc->probs[pos];
306*73ed8e77SXin LI 			const uint32_t bound = prob * (range
307*73ed8e77SXin LI 					>> RC_BIT_MODEL_TOTAL_BITS);
308*73ed8e77SXin LI 			low += bound;
309*73ed8e77SXin LI 			range -= bound;
310*73ed8e77SXin LI 			break;
311*73ed8e77SXin LI 		}
312*73ed8e77SXin LI 
313*73ed8e77SXin LI 		case RC_DIRECT_0:
314*73ed8e77SXin LI 			range >>= 1;
315*73ed8e77SXin LI 			break;
316*73ed8e77SXin LI 
317*73ed8e77SXin LI 		case RC_DIRECT_1:
318*73ed8e77SXin LI 			range >>= 1;
319*73ed8e77SXin LI 			low += range;
320*73ed8e77SXin LI 			break;
321*73ed8e77SXin LI 
322*73ed8e77SXin LI 		case RC_FLUSH:
323*73ed8e77SXin LI 		default:
324*73ed8e77SXin LI 			assert(0);
325*73ed8e77SXin LI 			break;
326*73ed8e77SXin LI 		}
327*73ed8e77SXin LI 
328*73ed8e77SXin LI 		++pos;
329*73ed8e77SXin LI 	}
330*73ed8e77SXin LI 
331*73ed8e77SXin LI 	// Flush the last bytes. This isn't in rc->symbols[] so we do
332*73ed8e77SXin LI 	// it after the above loop to take into account the size of
333*73ed8e77SXin LI 	// the flushing that will be done at the end of the stream.
334*73ed8e77SXin LI 	for (pos = 0; pos < 5; ++pos) {
335*73ed8e77SXin LI 		if (rc_shift_low_dummy(&low, &cache_size,
336*73ed8e77SXin LI 				&cache, &out_pos, out_limit))
337*73ed8e77SXin LI 			return true;
338*73ed8e77SXin LI 	}
339*73ed8e77SXin LI 
340*73ed8e77SXin LI 	return false;
341*73ed8e77SXin LI }
342*73ed8e77SXin LI 
343*73ed8e77SXin LI 
34481ad8388SMartin Matuska static inline uint64_t
34581ad8388SMartin Matuska rc_pending(const lzma_range_encoder *rc)
34681ad8388SMartin Matuska {
34781ad8388SMartin Matuska 	return rc->cache_size + 5 - 1;
34881ad8388SMartin Matuska }
34981ad8388SMartin Matuska 
35081ad8388SMartin Matuska #endif
351