xref: /freebsd/contrib/xz/src/liblzma/lz/lz_decoder.h (revision 3b35e7ee8de9b0260149a2b77e87a2b9c7a36244)
1*3b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
2*3b35e7eeSXin LI 
381ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
481ad8388SMartin Matuska //
581ad8388SMartin Matuska /// \file       lz_decoder.h
681ad8388SMartin Matuska /// \brief      LZ out window
781ad8388SMartin Matuska ///
881ad8388SMartin Matuska //  Authors:    Igor Pavlov
981ad8388SMartin Matuska //              Lasse Collin
1081ad8388SMartin Matuska //
1181ad8388SMartin Matuska ///////////////////////////////////////////////////////////////////////////////
1281ad8388SMartin Matuska 
1381ad8388SMartin Matuska #ifndef LZMA_LZ_DECODER_H
1481ad8388SMartin Matuska #define LZMA_LZ_DECODER_H
1581ad8388SMartin Matuska 
1681ad8388SMartin Matuska #include "common.h"
1781ad8388SMartin Matuska 
1881ad8388SMartin Matuska 
19*3b35e7eeSXin LI /// Maximum length of a match rounded up to a nice power of 2 which is
20*3b35e7eeSXin LI /// a good size for aligned memcpy(). The allocated dictionary buffer will
21*3b35e7eeSXin LI /// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
22*3b35e7eeSXin LI ///
23*3b35e7eeSXin LI /// (1) Every time the decoder reaches the end of the dictionary buffer,
24*3b35e7eeSXin LI ///     the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
25*3b35e7eeSXin LI ///     This way dict_repeat() will only need to copy from one place,
26*3b35e7eeSXin LI ///     never from both the end and beginning of the buffer.
27*3b35e7eeSXin LI ///
28*3b35e7eeSXin LI /// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between
29*3b35e7eeSXin LI ///     the oldest byte still in the dictionary and the current write
30*3b35e7eeSXin LI ///     position. This way dict_repeat(dict, dict->size - 1, &len)
31*3b35e7eeSXin LI ///     won't need memmove() as the copying cannot overlap.
32*3b35e7eeSXin LI ///
33*3b35e7eeSXin LI /// Note that memcpy() still cannot be used if distance < len.
34*3b35e7eeSXin LI ///
35*3b35e7eeSXin LI /// LZMA's longest match length is 273 so pick a multiple of 16 above that.
36*3b35e7eeSXin LI #define LZ_DICT_REPEAT_MAX 288
37*3b35e7eeSXin LI 
38*3b35e7eeSXin LI 
3981ad8388SMartin Matuska typedef struct {
40*3b35e7eeSXin LI 	/// Pointer to the dictionary buffer.
4181ad8388SMartin Matuska 	uint8_t *buf;
4281ad8388SMartin Matuska 
4381ad8388SMartin Matuska 	/// Write position in dictionary. The next byte will be written to
4481ad8388SMartin Matuska 	/// buf[pos].
4581ad8388SMartin Matuska 	size_t pos;
4681ad8388SMartin Matuska 
4781ad8388SMartin Matuska 	/// Indicates how full the dictionary is. This is used by
4881ad8388SMartin Matuska 	/// dict_is_distance_valid() to detect corrupt files that would
4981ad8388SMartin Matuska 	/// read beyond the beginning of the dictionary.
5081ad8388SMartin Matuska 	size_t full;
5181ad8388SMartin Matuska 
5281ad8388SMartin Matuska 	/// Write limit
5381ad8388SMartin Matuska 	size_t limit;
5481ad8388SMartin Matuska 
55*3b35e7eeSXin LI 	/// Allocated size of buf. This is 2 * LZ_DICT_REPEAT_MAX bytes
56*3b35e7eeSXin LI 	/// larger than the actual dictionary size. This is enforced by
57*3b35e7eeSXin LI 	/// how the value for "full" is set; it can be at most
58*3b35e7eeSXin LI 	/// "size - 2 * LZ_DICT_REPEAT_MAX".
5981ad8388SMartin Matuska 	size_t size;
6081ad8388SMartin Matuska 
61*3b35e7eeSXin LI 	/// True once the dictionary has become full and the writing position
62*3b35e7eeSXin LI 	/// has been wrapped in decode_buffer() in lz_decoder.c.
63*3b35e7eeSXin LI 	bool has_wrapped;
64*3b35e7eeSXin LI 
6581ad8388SMartin Matuska 	/// True when dictionary should be reset before decoding more data.
6681ad8388SMartin Matuska 	bool need_reset;
6781ad8388SMartin Matuska 
6881ad8388SMartin Matuska } lzma_dict;
6981ad8388SMartin Matuska 
7081ad8388SMartin Matuska 
7181ad8388SMartin Matuska typedef struct {
7281ad8388SMartin Matuska 	size_t dict_size;
7381ad8388SMartin Matuska 	const uint8_t *preset_dict;
7481ad8388SMartin Matuska 	size_t preset_dict_size;
7581ad8388SMartin Matuska } lzma_lz_options;
7681ad8388SMartin Matuska 
7781ad8388SMartin Matuska 
7881ad8388SMartin Matuska typedef struct {
7981ad8388SMartin Matuska 	/// Data specific to the LZ-based decoder
801456f0f9SXin LI 	void *coder;
8181ad8388SMartin Matuska 
8281ad8388SMartin Matuska 	/// Function to decode from in[] to *dict
831456f0f9SXin LI 	lzma_ret (*code)(void *coder,
8481ad8388SMartin Matuska 			lzma_dict *restrict dict, const uint8_t *restrict in,
8581ad8388SMartin Matuska 			size_t *restrict in_pos, size_t in_size);
8681ad8388SMartin Matuska 
871456f0f9SXin LI 	void (*reset)(void *coder, const void *options);
8881ad8388SMartin Matuska 
899e6bbe47SXin LI 	/// Set the uncompressed size. If uncompressed_size == LZMA_VLI_UNKNOWN
909e6bbe47SXin LI 	/// then allow_eopm will always be true.
919e6bbe47SXin LI 	void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size,
929e6bbe47SXin LI 			bool allow_eopm);
9381ad8388SMartin Matuska 
9481ad8388SMartin Matuska 	/// Free allocated resources
951456f0f9SXin LI 	void (*end)(void *coder, const lzma_allocator *allocator);
9681ad8388SMartin Matuska 
9781ad8388SMartin Matuska } lzma_lz_decoder;
9881ad8388SMartin Matuska 
9981ad8388SMartin Matuska 
10081ad8388SMartin Matuska #define LZMA_LZ_DECODER_INIT \
10181ad8388SMartin Matuska 	(lzma_lz_decoder){ \
10281ad8388SMartin Matuska 		.coder = NULL, \
10381ad8388SMartin Matuska 		.code = NULL, \
10481ad8388SMartin Matuska 		.reset = NULL, \
10581ad8388SMartin Matuska 		.set_uncompressed = NULL, \
10681ad8388SMartin Matuska 		.end = NULL, \
10781ad8388SMartin Matuska 	}
10881ad8388SMartin Matuska 
10981ad8388SMartin Matuska 
11081ad8388SMartin Matuska extern lzma_ret lzma_lz_decoder_init(lzma_next_coder *next,
11153200025SRui Paulo 		const lzma_allocator *allocator,
11253200025SRui Paulo 		const lzma_filter_info *filters,
11381ad8388SMartin Matuska 		lzma_ret (*lz_init)(lzma_lz_decoder *lz,
11473ed8e77SXin LI 			const lzma_allocator *allocator,
11573ed8e77SXin LI 			lzma_vli id, const void *options,
11681ad8388SMartin Matuska 			lzma_lz_options *lz_options));
11781ad8388SMartin Matuska 
11881ad8388SMartin Matuska extern uint64_t lzma_lz_decoder_memusage(size_t dictionary_size);
11981ad8388SMartin Matuska 
12081ad8388SMartin Matuska 
12181ad8388SMartin Matuska //////////////////////
12281ad8388SMartin Matuska // Inline functions //
12381ad8388SMartin Matuska //////////////////////
12481ad8388SMartin Matuska 
12581ad8388SMartin Matuska /// Get a byte from the history buffer.
12681ad8388SMartin Matuska static inline uint8_t
12781ad8388SMartin Matuska dict_get(const lzma_dict *const dict, const uint32_t distance)
12881ad8388SMartin Matuska {
12981ad8388SMartin Matuska 	return dict->buf[dict->pos - distance - 1
130*3b35e7eeSXin LI 			+ (distance < dict->pos
131*3b35e7eeSXin LI 				? 0 : dict->size - LZ_DICT_REPEAT_MAX)];
132*3b35e7eeSXin LI }
133*3b35e7eeSXin LI 
134*3b35e7eeSXin LI 
135*3b35e7eeSXin LI /// Optimized version of dict_get(dict, 0)
136*3b35e7eeSXin LI static inline uint8_t
137*3b35e7eeSXin LI dict_get0(const lzma_dict *const dict)
138*3b35e7eeSXin LI {
139*3b35e7eeSXin LI 	return dict->buf[dict->pos - 1];
14081ad8388SMartin Matuska }
14181ad8388SMartin Matuska 
14281ad8388SMartin Matuska 
14381ad8388SMartin Matuska /// Test if dictionary is empty.
14481ad8388SMartin Matuska static inline bool
14581ad8388SMartin Matuska dict_is_empty(const lzma_dict *const dict)
14681ad8388SMartin Matuska {
14781ad8388SMartin Matuska 	return dict->full == 0;
14881ad8388SMartin Matuska }
14981ad8388SMartin Matuska 
15081ad8388SMartin Matuska 
15181ad8388SMartin Matuska /// Validate the match distance
15281ad8388SMartin Matuska static inline bool
15381ad8388SMartin Matuska dict_is_distance_valid(const lzma_dict *const dict, const size_t distance)
15481ad8388SMartin Matuska {
15581ad8388SMartin Matuska 	return dict->full > distance;
15681ad8388SMartin Matuska }
15781ad8388SMartin Matuska 
15881ad8388SMartin Matuska 
15981ad8388SMartin Matuska /// Repeat *len bytes at distance.
16081ad8388SMartin Matuska static inline bool
16181ad8388SMartin Matuska dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
16281ad8388SMartin Matuska {
16381ad8388SMartin Matuska 	// Don't write past the end of the dictionary.
16481ad8388SMartin Matuska 	const size_t dict_avail = dict->limit - dict->pos;
165e0f0e66dSMartin Matuska 	uint32_t left = my_min(dict_avail, *len);
16681ad8388SMartin Matuska 	*len -= left;
16781ad8388SMartin Matuska 
168*3b35e7eeSXin LI 	size_t back = dict->pos - distance - 1;
169*3b35e7eeSXin LI 	if (distance >= dict->pos)
170*3b35e7eeSXin LI 		back += dict->size - LZ_DICT_REPEAT_MAX;
171*3b35e7eeSXin LI 
17281ad8388SMartin Matuska 	// Repeat a block of data from the history. Because memcpy() is faster
17381ad8388SMartin Matuska 	// than copying byte by byte in a loop, the copying process gets split
174*3b35e7eeSXin LI 	// into two cases.
17581ad8388SMartin Matuska 	if (distance < left) {
17681ad8388SMartin Matuska 		// Source and target areas overlap, thus we can't use
17781ad8388SMartin Matuska 		// memcpy() nor even memmove() safely.
17881ad8388SMartin Matuska 		do {
179*3b35e7eeSXin LI 			dict->buf[dict->pos++] = dict->buf[back++];
18081ad8388SMartin Matuska 		} while (--left > 0);
1812f9cd13dSXin LI 	} else {
182*3b35e7eeSXin LI 		memcpy(dict->buf + dict->pos, dict->buf + back, left);
1832f9cd13dSXin LI 		dict->pos += left;
1842f9cd13dSXin LI 	}
18581ad8388SMartin Matuska 
18681ad8388SMartin Matuska 	// Update how full the dictionary is.
187*3b35e7eeSXin LI 	if (!dict->has_wrapped)
188*3b35e7eeSXin LI 		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
18981ad8388SMartin Matuska 
190*3b35e7eeSXin LI 	return *len != 0;
191*3b35e7eeSXin LI }
192*3b35e7eeSXin LI 
193*3b35e7eeSXin LI 
194*3b35e7eeSXin LI static inline void
195*3b35e7eeSXin LI dict_put(lzma_dict *dict, uint8_t byte)
196*3b35e7eeSXin LI {
197*3b35e7eeSXin LI 	dict->buf[dict->pos++] = byte;
198*3b35e7eeSXin LI 
199*3b35e7eeSXin LI 	if (!dict->has_wrapped)
200*3b35e7eeSXin LI 		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
20181ad8388SMartin Matuska }
20281ad8388SMartin Matuska 
20381ad8388SMartin Matuska 
20481ad8388SMartin Matuska /// Puts one byte into the dictionary. Returns true if the dictionary was
20581ad8388SMartin Matuska /// already full and the byte couldn't be added.
20681ad8388SMartin Matuska static inline bool
207*3b35e7eeSXin LI dict_put_safe(lzma_dict *dict, uint8_t byte)
20881ad8388SMartin Matuska {
20981ad8388SMartin Matuska 	if (unlikely(dict->pos == dict->limit))
21081ad8388SMartin Matuska 		return true;
21181ad8388SMartin Matuska 
212*3b35e7eeSXin LI 	dict_put(dict, byte);
21381ad8388SMartin Matuska 	return false;
21481ad8388SMartin Matuska }
21581ad8388SMartin Matuska 
21681ad8388SMartin Matuska 
21781ad8388SMartin Matuska /// Copies arbitrary amount of data into the dictionary.
21881ad8388SMartin Matuska static inline void
21981ad8388SMartin Matuska dict_write(lzma_dict *restrict dict, const uint8_t *restrict in,
22081ad8388SMartin Matuska 		size_t *restrict in_pos, size_t in_size,
22181ad8388SMartin Matuska 		size_t *restrict left)
22281ad8388SMartin Matuska {
22381ad8388SMartin Matuska 	// NOTE: If we are being given more data than the size of the
22481ad8388SMartin Matuska 	// dictionary, it could be possible to optimize the LZ decoder
22581ad8388SMartin Matuska 	// so that not everything needs to go through the dictionary.
22681ad8388SMartin Matuska 	// This shouldn't be very common thing in practice though, and
22781ad8388SMartin Matuska 	// the slowdown of one extra memcpy() isn't bad compared to how
22881ad8388SMartin Matuska 	// much time it would have taken if the data were compressed.
22981ad8388SMartin Matuska 
23081ad8388SMartin Matuska 	if (in_size - *in_pos > *left)
23181ad8388SMartin Matuska 		in_size = *in_pos + *left;
23281ad8388SMartin Matuska 
23381ad8388SMartin Matuska 	*left -= lzma_bufcpy(in, in_pos, in_size,
23481ad8388SMartin Matuska 			dict->buf, &dict->pos, dict->limit);
23581ad8388SMartin Matuska 
236*3b35e7eeSXin LI 	if (!dict->has_wrapped)
237*3b35e7eeSXin LI 		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
23881ad8388SMartin Matuska 
23981ad8388SMartin Matuska 	return;
24081ad8388SMartin Matuska }
24181ad8388SMartin Matuska 
24281ad8388SMartin Matuska 
24381ad8388SMartin Matuska static inline void
24481ad8388SMartin Matuska dict_reset(lzma_dict *dict)
24581ad8388SMartin Matuska {
24681ad8388SMartin Matuska 	dict->need_reset = true;
24781ad8388SMartin Matuska 	return;
24881ad8388SMartin Matuska }
24981ad8388SMartin Matuska 
25081ad8388SMartin Matuska #endif
251