xref: /freebsd/contrib/xz/src/liblzma/rangecoder/range_encoder.h (revision 3b35e7ee8de9b0260149a2b77e87a2b9c7a36244)
1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
2*3b35e7eeSXin LI 
381ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
481ad8388SMartin Matuska //
581ad8388SMartin Matuska /// \file       range_encoder.h
681ad8388SMartin Matuska /// \brief      Range Encoder
781ad8388SMartin Matuska ///
881ad8388SMartin Matuska //  Authors:    Igor Pavlov
981ad8388SMartin Matuska //              Lasse Collin
1081ad8388SMartin Matuska //
1181ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
1281ad8388SMartin Matuska 
1381ad8388SMartin Matuska #ifndef LZMA_RANGE_ENCODER_H
1481ad8388SMartin Matuska #define LZMA_RANGE_ENCODER_H
1581ad8388SMartin Matuska 
1681ad8388SMartin Matuska #include "range_common.h"
1781ad8388SMartin Matuska #include "price.h"
1881ad8388SMartin Matuska 
1981ad8388SMartin Matuska 
2081ad8388SMartin Matuska /// Maximum number of symbols that can be put pending into lzma_range_encoder
2173ed8e77SXin LI /// structure between calls to lzma_rc_encode(). For LZMA, 48+5 is enough
2281ad8388SMartin Matuska /// (match with big distance and length followed by range encoder flush).
2373ed8e77SXin LI #define RC_SYMBOLS_MAX 53
2481ad8388SMartin Matuska 
2581ad8388SMartin Matuska 
2681ad8388SMartin Matuska typedef struct {
2781ad8388SMartin Matuska 	uint64_t low;
2881ad8388SMartin Matuska 	uint64_t cache_size;
2981ad8388SMartin Matuska 	uint32_t range;
3081ad8388SMartin Matuska 	uint8_t cache;
3181ad8388SMartin Matuska 
3273ed8e77SXin LI 	/// Number of bytes written out by rc_encode() -> rc_shift_low()
3373ed8e77SXin LI 	uint64_t out_total;
3473ed8e77SXin LI 
3581ad8388SMartin Matuska 	/// Number of symbols in the tables
3681ad8388SMartin Matuska 	size_t count;
3781ad8388SMartin Matuska 
3881ad8388SMartin Matuska 	/// rc_encode()'s position in the tables
3981ad8388SMartin Matuska 	size_t pos;
4081ad8388SMartin Matuska 
4181ad8388SMartin Matuska 	/// Symbols to encode
4281ad8388SMartin Matuska 	enum {
4381ad8388SMartin Matuska 		RC_BIT_0,
4481ad8388SMartin Matuska 		RC_BIT_1,
4581ad8388SMartin Matuska 		RC_DIRECT_0,
4681ad8388SMartin Matuska 		RC_DIRECT_1,
4781ad8388SMartin Matuska 		RC_FLUSH,
4881ad8388SMartin Matuska 	} symbols[RC_SYMBOLS_MAX];
4981ad8388SMartin Matuska 
5081ad8388SMartin Matuska 	/// Probabilities associated with RC_BIT_0 or RC_BIT_1
5181ad8388SMartin Matuska 	probability *probs[RC_SYMBOLS_MAX];
5281ad8388SMartin Matuska 
5381ad8388SMartin Matuska } lzma_range_encoder;
5481ad8388SMartin Matuska 
5581ad8388SMartin Matuska 
5681ad8388SMartin Matuska static inline void
5781ad8388SMartin Matuska rc_reset(lzma_range_encoder *rc)
5881ad8388SMartin Matuska {
5981ad8388SMartin Matuska 	rc->low = 0;
6081ad8388SMartin Matuska 	rc->cache_size = 1;
6181ad8388SMartin Matuska 	rc->range = UINT32_MAX;
6281ad8388SMartin Matuska 	rc->cache = 0;
6373ed8e77SXin LI 	rc->out_total = 0;
6481ad8388SMartin Matuska 	rc->count = 0;
6581ad8388SMartin Matuska 	rc->pos = 0;
6681ad8388SMartin Matuska }
6781ad8388SMartin Matuska 
6881ad8388SMartin Matuska 
6981ad8388SMartin Matuska static inline void
7073ed8e77SXin LI rc_forget(lzma_range_encoder *rc)
7173ed8e77SXin LI {
7273ed8e77SXin LI 	// This must not be called when rc_encode() is partially done.
7373ed8e77SXin LI 	assert(rc->pos == 0);
7473ed8e77SXin LI 	rc->count = 0;
7573ed8e77SXin LI }
7673ed8e77SXin LI 
7773ed8e77SXin LI 
7873ed8e77SXin LI static inline void
7981ad8388SMartin Matuska rc_bit(lzma_range_encoder *rc, probability *prob, uint32_t bit)
8081ad8388SMartin Matuska {
8181ad8388SMartin Matuska 	rc->symbols[rc->count] = bit;
8281ad8388SMartin Matuska 	rc->probs[rc->count] = prob;
8381ad8388SMartin Matuska 	++rc->count;
8481ad8388SMartin Matuska }
8581ad8388SMartin Matuska 
8681ad8388SMartin Matuska 
8781ad8388SMartin Matuska static inline void
8881ad8388SMartin Matuska rc_bittree(lzma_range_encoder *rc, probability *probs,
8981ad8388SMartin Matuska 		uint32_t bit_count, uint32_t symbol)
9081ad8388SMartin Matuska {
9181ad8388SMartin Matuska 	uint32_t model_index = 1;
9281ad8388SMartin Matuska 
9381ad8388SMartin Matuska 	do {
9481ad8388SMartin Matuska 		const uint32_t bit = (symbol >> --bit_count) & 1;
9581ad8388SMartin Matuska 		rc_bit(rc, &probs[model_index], bit);
9681ad8388SMartin Matuska 		model_index = (model_index << 1) + bit;
9781ad8388SMartin Matuska 	} while (bit_count != 0);
9881ad8388SMartin Matuska }
9981ad8388SMartin Matuska 
10081ad8388SMartin Matuska 
10181ad8388SMartin Matuska static inline void
10281ad8388SMartin Matuska rc_bittree_reverse(lzma_range_encoder *rc, probability *probs,
10381ad8388SMartin Matuska 		uint32_t bit_count, uint32_t symbol)
10481ad8388SMartin Matuska {
10581ad8388SMartin Matuska 	uint32_t model_index = 1;
10681ad8388SMartin Matuska 
10781ad8388SMartin Matuska 	do {
10881ad8388SMartin Matuska 		const uint32_t bit = symbol & 1;
10981ad8388SMartin Matuska 		symbol >>= 1;
11081ad8388SMartin Matuska 		rc_bit(rc, &probs[model_index], bit);
11181ad8388SMartin Matuska 		model_index = (model_index << 1) + bit;
11281ad8388SMartin Matuska 	} while (--bit_count != 0);
11381ad8388SMartin Matuska }
11481ad8388SMartin Matuska 
11581ad8388SMartin Matuska 
11681ad8388SMartin Matuska static inline void
11781ad8388SMartin Matuska rc_direct(lzma_range_encoder *rc,
11881ad8388SMartin Matuska 		uint32_t value, uint32_t bit_count)
11981ad8388SMartin Matuska {
12081ad8388SMartin Matuska 	do {
12181ad8388SMartin Matuska 		rc->symbols[rc->count++]
12281ad8388SMartin Matuska 				= RC_DIRECT_0 + ((value >> --bit_count) & 1);
12381ad8388SMartin Matuska 	} while (bit_count != 0);
12481ad8388SMartin Matuska }
12581ad8388SMartin Matuska 
12681ad8388SMartin Matuska 
12781ad8388SMartin Matuska static inline void
12881ad8388SMartin Matuska rc_flush(lzma_range_encoder *rc)
12981ad8388SMartin Matuska {
13081ad8388SMartin Matuska 	for (size_t i = 0; i < 5; ++i)
13181ad8388SMartin Matuska 		rc->symbols[rc->count++] = RC_FLUSH;
13281ad8388SMartin Matuska }
13381ad8388SMartin Matuska 
13481ad8388SMartin Matuska 
13581ad8388SMartin Matuska static inline bool
13681ad8388SMartin Matuska rc_shift_low(lzma_range_encoder *rc,
13781ad8388SMartin Matuska 		uint8_t *out, size_t *out_pos, size_t out_size)
13881ad8388SMartin Matuska {
13981ad8388SMartin Matuska 	if ((uint32_t)(rc->low) < (uint32_t)(0xFF000000)
14081ad8388SMartin Matuska 			|| (uint32_t)(rc->low >> 32) != 0) {
14181ad8388SMartin Matuska 		do {
14281ad8388SMartin Matuska 			if (*out_pos == out_size)
14381ad8388SMartin Matuska 				return true;
14481ad8388SMartin Matuska 
14581ad8388SMartin Matuska 			out[*out_pos] = rc->cache + (uint8_t)(rc->low >> 32);
14681ad8388SMartin Matuska 			++*out_pos;
14773ed8e77SXin LI 			++rc->out_total;
14881ad8388SMartin Matuska 			rc->cache = 0xFF;
14981ad8388SMartin Matuska 
15081ad8388SMartin Matuska 		} while (--rc->cache_size != 0);
15181ad8388SMartin Matuska 
15281ad8388SMartin Matuska 		rc->cache = (rc->low >> 24) & 0xFF;
15381ad8388SMartin Matuska 	}
15481ad8388SMartin Matuska 
15581ad8388SMartin Matuska 	++rc->cache_size;
15681ad8388SMartin Matuska 	rc->low = (rc->low & 0x00FFFFFF) << RC_SHIFT_BITS;
15781ad8388SMartin Matuska 
15881ad8388SMartin Matuska 	return false;
15981ad8388SMartin Matuska }
16081ad8388SMartin Matuska 
16181ad8388SMartin Matuska 
16273ed8e77SXin LI // NOTE: The last two arguments are uint64_t instead of size_t because in
16373ed8e77SXin LI // the dummy version these refer to the size of the whole range-encoded
16473ed8e77SXin LI // output stream, not just to the currently available output buffer space.
16573ed8e77SXin LI static inline bool
16673ed8e77SXin LI rc_shift_low_dummy(uint64_t *low, uint64_t *cache_size, uint8_t *cache,
16773ed8e77SXin LI 		uint64_t *out_pos, uint64_t out_size)
16873ed8e77SXin LI {
16973ed8e77SXin LI 	if ((uint32_t)(*low) < (uint32_t)(0xFF000000)
17073ed8e77SXin LI 			|| (uint32_t)(*low >> 32) != 0) {
17173ed8e77SXin LI 		do {
17273ed8e77SXin LI 			if (*out_pos == out_size)
17373ed8e77SXin LI 				return true;
17473ed8e77SXin LI 
17573ed8e77SXin LI 			++*out_pos;
17673ed8e77SXin LI 			*cache = 0xFF;
17773ed8e77SXin LI 
17873ed8e77SXin LI 		} while (--*cache_size != 0);
17973ed8e77SXin LI 
18073ed8e77SXin LI 		*cache = (*low >> 24) & 0xFF;
18173ed8e77SXin LI 	}
18273ed8e77SXin LI 
18373ed8e77SXin LI 	++*cache_size;
18473ed8e77SXin LI 	*low = (*low & 0x00FFFFFF) << RC_SHIFT_BITS;
18573ed8e77SXin LI 
18673ed8e77SXin LI 	return false;
18773ed8e77SXin LI }
18873ed8e77SXin LI 
18973ed8e77SXin LI 
19081ad8388SMartin Matuska static inline bool
19181ad8388SMartin Matuska rc_encode(lzma_range_encoder *rc,
19281ad8388SMartin Matuska 		uint8_t *out, size_t *out_pos, size_t out_size)
19381ad8388SMartin Matuska {
19481ad8388SMartin Matuska 	assert(rc->count <= RC_SYMBOLS_MAX);
19581ad8388SMartin Matuska 
19681ad8388SMartin Matuska 	while (rc->pos < rc->count) {
19781ad8388SMartin Matuska 		// Normalize
19881ad8388SMartin Matuska 		if (rc->range < RC_TOP_VALUE) {
19981ad8388SMartin Matuska 			if (rc_shift_low(rc, out, out_pos, out_size))
20081ad8388SMartin Matuska 				return true;
20181ad8388SMartin Matuska 
20281ad8388SMartin Matuska 			rc->range <<= RC_SHIFT_BITS;
20381ad8388SMartin Matuska 		}
20481ad8388SMartin Matuska 
20581ad8388SMartin Matuska 		// Encode a bit
20681ad8388SMartin Matuska 		switch (rc->symbols[rc->pos]) {
20781ad8388SMartin Matuska 		case RC_BIT_0: {
20881ad8388SMartin Matuska 			probability prob = *rc->probs[rc->pos];
20981ad8388SMartin Matuska 			rc->range = (rc->range >> RC_BIT_MODEL_TOTAL_BITS)
21081ad8388SMartin Matuska 					* prob;
21181ad8388SMartin Matuska 			prob += (RC_BIT_MODEL_TOTAL - prob) >> RC_MOVE_BITS;
21281ad8388SMartin Matuska 			*rc->probs[rc->pos] = prob;
21381ad8388SMartin Matuska 			break;
21481ad8388SMartin Matuska 		}
21581ad8388SMartin Matuska 
21681ad8388SMartin Matuska 		case RC_BIT_1: {
21781ad8388SMartin Matuska 			probability prob = *rc->probs[rc->pos];
21881ad8388SMartin Matuska 			const uint32_t bound = prob * (rc->range
21981ad8388SMartin Matuska 					>> RC_BIT_MODEL_TOTAL_BITS);
22081ad8388SMartin Matuska 			rc->low += bound;
22181ad8388SMartin Matuska 			rc->range -= bound;
22281ad8388SMartin Matuska 			prob -= prob >> RC_MOVE_BITS;
22381ad8388SMartin Matuska 			*rc->probs[rc->pos] = prob;
22481ad8388SMartin Matuska 			break;
22581ad8388SMartin Matuska 		}
22681ad8388SMartin Matuska 
22781ad8388SMartin Matuska 		case RC_DIRECT_0:
22881ad8388SMartin Matuska 			rc->range >>= 1;
22981ad8388SMartin Matuska 			break;
23081ad8388SMartin Matuska 
23181ad8388SMartin Matuska 		case RC_DIRECT_1:
23281ad8388SMartin Matuska 			rc->range >>= 1;
23381ad8388SMartin Matuska 			rc->low += rc->range;
23481ad8388SMartin Matuska 			break;
23581ad8388SMartin Matuska 
23681ad8388SMartin Matuska 		case RC_FLUSH:
23781ad8388SMartin Matuska 			// Prevent further normalizations.
23881ad8388SMartin Matuska 			rc->range = UINT32_MAX;
23981ad8388SMartin Matuska 
24081ad8388SMartin Matuska 			// Flush the last five bytes (see rc_flush()).
24181ad8388SMartin Matuska 			do {
24281ad8388SMartin Matuska 				if (rc_shift_low(rc, out, out_pos, out_size))
24381ad8388SMartin Matuska 					return true;
24481ad8388SMartin Matuska 			} while (++rc->pos < rc->count);
24581ad8388SMartin Matuska 
24681ad8388SMartin Matuska 			// Reset the range encoder so we are ready to continue
24781ad8388SMartin Matuska 			// encoding if we weren't finishing the stream.
24881ad8388SMartin Matuska 			rc_reset(rc);
24981ad8388SMartin Matuska 			return false;
25081ad8388SMartin Matuska 
25181ad8388SMartin Matuska 		default:
25281ad8388SMartin Matuska 			assert(0);
25381ad8388SMartin Matuska 			break;
25481ad8388SMartin Matuska 		}
25581ad8388SMartin Matuska 
25681ad8388SMartin Matuska 		++rc->pos;
25781ad8388SMartin Matuska 	}
25881ad8388SMartin Matuska 
25981ad8388SMartin Matuska 	rc->count = 0;
26081ad8388SMartin Matuska 	rc->pos = 0;
26181ad8388SMartin Matuska 
26281ad8388SMartin Matuska 	return false;
26381ad8388SMartin Matuska }
26481ad8388SMartin Matuska 
26581ad8388SMartin Matuska 
26673ed8e77SXin LI static inline bool
26773ed8e77SXin LI rc_encode_dummy(const lzma_range_encoder *rc, uint64_t out_limit)
26873ed8e77SXin LI {
26973ed8e77SXin LI 	assert(rc->count <= RC_SYMBOLS_MAX);
27073ed8e77SXin LI 
27173ed8e77SXin LI 	uint64_t low = rc->low;
27273ed8e77SXin LI 	uint64_t cache_size = rc->cache_size;
27373ed8e77SXin LI 	uint32_t range = rc->range;
27473ed8e77SXin LI 	uint8_t cache = rc->cache;
27573ed8e77SXin LI 	uint64_t out_pos = rc->out_total;
27673ed8e77SXin LI 
27773ed8e77SXin LI 	size_t pos = rc->pos;
27873ed8e77SXin LI 
27973ed8e77SXin LI 	while (true) {
28073ed8e77SXin LI 		// Normalize
28173ed8e77SXin LI 		if (range < RC_TOP_VALUE) {
28273ed8e77SXin LI 			if (rc_shift_low_dummy(&low, &cache_size, &cache,
28373ed8e77SXin LI 					&out_pos, out_limit))
28473ed8e77SXin LI 				return true;
28573ed8e77SXin LI 
28673ed8e77SXin LI 			range <<= RC_SHIFT_BITS;
28773ed8e77SXin LI 		}
28873ed8e77SXin LI 
28973ed8e77SXin LI 		// This check is here because the normalization above must
29073ed8e77SXin LI 		// be done before flushing the last bytes.
29173ed8e77SXin LI 		if (pos == rc->count)
29273ed8e77SXin LI 			break;
29373ed8e77SXin LI 
29473ed8e77SXin LI 		// Encode a bit
29573ed8e77SXin LI 		switch (rc->symbols[pos]) {
29673ed8e77SXin LI 		case RC_BIT_0: {
29773ed8e77SXin LI 			probability prob = *rc->probs[pos];
29873ed8e77SXin LI 			range = (range >> RC_BIT_MODEL_TOTAL_BITS)
29973ed8e77SXin LI 					* prob;
30073ed8e77SXin LI 			break;
30173ed8e77SXin LI 		}
30273ed8e77SXin LI 
30373ed8e77SXin LI 		case RC_BIT_1: {
30473ed8e77SXin LI 			probability prob = *rc->probs[pos];
30573ed8e77SXin LI 			const uint32_t bound = prob * (range
30673ed8e77SXin LI 					>> RC_BIT_MODEL_TOTAL_BITS);
30773ed8e77SXin LI 			low += bound;
30873ed8e77SXin LI 			range -= bound;
30973ed8e77SXin LI 			break;
31073ed8e77SXin LI 		}
31173ed8e77SXin LI 
31273ed8e77SXin LI 		case RC_DIRECT_0:
31373ed8e77SXin LI 			range >>= 1;
31473ed8e77SXin LI 			break;
31573ed8e77SXin LI 
31673ed8e77SXin LI 		case RC_DIRECT_1:
31773ed8e77SXin LI 			range >>= 1;
31873ed8e77SXin LI 			low += range;
31973ed8e77SXin LI 			break;
32073ed8e77SXin LI 
32173ed8e77SXin LI 		case RC_FLUSH:
32273ed8e77SXin LI 		default:
32373ed8e77SXin LI 			assert(0);
32473ed8e77SXin LI 			break;
32573ed8e77SXin LI 		}
32673ed8e77SXin LI 
32773ed8e77SXin LI 		++pos;
32873ed8e77SXin LI 	}
32973ed8e77SXin LI 
33073ed8e77SXin LI 	// Flush the last bytes. This isn't in rc->symbols[] so we do
33173ed8e77SXin LI 	// it after the above loop to take into account the size of
33273ed8e77SXin LI 	// the flushing that will be done at the end of the stream.
33373ed8e77SXin LI 	for (pos = 0; pos < 5; ++pos) {
33473ed8e77SXin LI 		if (rc_shift_low_dummy(&low, &cache_size,
33573ed8e77SXin LI 				&cache, &out_pos, out_limit))
33673ed8e77SXin LI 			return true;
33773ed8e77SXin LI 	}
33873ed8e77SXin LI 
33973ed8e77SXin LI 	return false;
34073ed8e77SXin LI }
34173ed8e77SXin LI 
34273ed8e77SXin LI 
34381ad8388SMartin Matuska static inline uint64_t
34481ad8388SMartin Matuska rc_pending(const lzma_range_encoder *rc)
34581ad8388SMartin Matuska {
34681ad8388SMartin Matuska 	return rc->cache_size + 5 - 1;
34781ad8388SMartin Matuska }
34881ad8388SMartin Matuska 
34981ad8388SMartin Matuska #endif
350