xref: /freebsd/contrib/xz/src/liblzma/lz/lz_decoder.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 // SPDX-License-Identifier: 0BSD
2 
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file       lz_decoder.c
6 /// \brief      LZ out window
7 ///
8 //  Authors:    Igor Pavlov
9 //              Lasse Collin
10 //
11 ///////////////////////////////////////////////////////////////////////////////
12 
13 // liblzma supports multiple LZ77-based filters. The LZ part is shared
14 // between these filters. The LZ code takes care of dictionary handling
15 // and passing the data between filters in the chain. The filter-specific
16 // part decodes from the input buffer to the dictionary.
17 
18 
19 #include "lz_decoder.h"
20 
21 
22 typedef struct {
23 	/// Dictionary (history buffer)
24 	lzma_dict dict;
25 
26 	/// The actual LZ-based decoder e.g. LZMA
27 	lzma_lz_decoder lz;
28 
29 	/// Next filter in the chain, if any. Note that LZMA and LZMA2 are
30 	/// only allowed as the last filter, but the long-range filter in
31 	/// future can be in the middle of the chain.
32 	lzma_next_coder next;
33 
34 	/// True if the next filter in the chain has returned LZMA_STREAM_END.
35 	bool next_finished;
36 
37 	/// True if the LZ decoder (e.g. LZMA) has detected end of payload
38 	/// marker. This may become true before next_finished becomes true.
39 	bool this_finished;
40 
41 	/// Temporary buffer needed when the LZ-based filter is not the last
42 	/// filter in the chain. The output of the next filter is first
43 	/// decoded into buffer[], which is then used as input for the actual
44 	/// LZ-based decoder.
45 	struct {
46 		size_t pos;
47 		size_t size;
48 		uint8_t buffer[LZMA_BUFFER_SIZE];
49 	} temp;
50 } lzma_coder;
51 
52 
53 static void
54 lz_decoder_reset(lzma_coder *coder)
55 {
56 	coder->dict.pos = 2 * LZ_DICT_REPEAT_MAX;
57 	coder->dict.full = 0;
58 	coder->dict.buf[2 * LZ_DICT_REPEAT_MAX - 1] = '\0';
59 	coder->dict.has_wrapped = false;
60 	coder->dict.need_reset = false;
61 	return;
62 }
63 
64 
65 static lzma_ret
66 decode_buffer(lzma_coder *coder,
67 		const uint8_t *restrict in, size_t *restrict in_pos,
68 		size_t in_size, uint8_t *restrict out,
69 		size_t *restrict out_pos, size_t out_size)
70 {
71 	while (true) {
72 		// Wrap the dictionary if needed.
73 		if (coder->dict.pos == coder->dict.size) {
74 			// See the comment of #define LZ_DICT_REPEAT_MAX.
75 			coder->dict.pos = LZ_DICT_REPEAT_MAX;
76 			coder->dict.has_wrapped = true;
77 			memcpy(coder->dict.buf, coder->dict.buf
78 						+ coder->dict.size
79 						- LZ_DICT_REPEAT_MAX,
80 					LZ_DICT_REPEAT_MAX);
81 		}
82 
83 		// Store the current dictionary position. It is needed to know
84 		// where to start copying to the out[] buffer.
85 		const size_t dict_start = coder->dict.pos;
86 
87 		// Calculate how much we allow coder->lz.code() to decode.
88 		// It must not decode past the end of the dictionary
89 		// buffer, and we don't want it to decode more than is
90 		// actually needed to fill the out[] buffer.
91 		coder->dict.limit = coder->dict.pos
92 				+ my_min(out_size - *out_pos,
93 					coder->dict.size - coder->dict.pos);
94 
95 		// Call the coder->lz.code() to do the actual decoding.
96 		const lzma_ret ret = coder->lz.code(
97 				coder->lz.coder, &coder->dict,
98 				in, in_pos, in_size);
99 
100 		// Copy the decoded data from the dictionary to the out[]
101 		// buffer. Do it conditionally because out can be NULL
102 		// (in which case copy_size is always 0). Calling memcpy()
103 		// with a null-pointer is undefined even if the third
104 		// argument is 0.
105 		const size_t copy_size = coder->dict.pos - dict_start;
106 		assert(copy_size <= out_size - *out_pos);
107 
108 		if (copy_size > 0)
109 			memcpy(out + *out_pos, coder->dict.buf + dict_start,
110 					copy_size);
111 
112 		*out_pos += copy_size;
113 
114 		// Reset the dictionary if so requested by coder->lz.code().
115 		if (coder->dict.need_reset) {
116 			lz_decoder_reset(coder);
117 
118 			// Since we reset dictionary, we don't check if
119 			// dictionary became full.
120 			if (ret != LZMA_OK || *out_pos == out_size)
121 				return ret;
122 		} else {
123 			// Return if everything got decoded or an error
124 			// occurred, or if there's no more data to decode.
125 			//
126 			// Note that detecting if there's something to decode
127 			// is done by looking if dictionary become full
128 			// instead of looking if *in_pos == in_size. This
129 			// is because it is possible that all the input was
130 			// consumed already but some data is pending to be
131 			// written to the dictionary.
132 			if (ret != LZMA_OK || *out_pos == out_size
133 					|| coder->dict.pos < coder->dict.size)
134 				return ret;
135 		}
136 	}
137 }
138 
139 
140 static lzma_ret
141 lz_decode(void *coder_ptr, const lzma_allocator *allocator,
142 		const uint8_t *restrict in, size_t *restrict in_pos,
143 		size_t in_size, uint8_t *restrict out,
144 		size_t *restrict out_pos, size_t out_size,
145 		lzma_action action)
146 {
147 	lzma_coder *coder = coder_ptr;
148 
149 	if (coder->next.code == NULL)
150 		return decode_buffer(coder, in, in_pos, in_size,
151 				out, out_pos, out_size);
152 
153 	// We aren't the last coder in the chain, we need to decode
154 	// our input to a temporary buffer.
155 	while (*out_pos < out_size) {
156 		// Fill the temporary buffer if it is empty.
157 		if (!coder->next_finished
158 				&& coder->temp.pos == coder->temp.size) {
159 			coder->temp.pos = 0;
160 			coder->temp.size = 0;
161 
162 			const lzma_ret ret = coder->next.code(
163 					coder->next.coder,
164 					allocator, in, in_pos, in_size,
165 					coder->temp.buffer, &coder->temp.size,
166 					LZMA_BUFFER_SIZE, action);
167 
168 			if (ret == LZMA_STREAM_END)
169 				coder->next_finished = true;
170 			else if (ret != LZMA_OK || coder->temp.size == 0)
171 				return ret;
172 		}
173 
174 		if (coder->this_finished) {
175 			if (coder->temp.size != 0)
176 				return LZMA_DATA_ERROR;
177 
178 			if (coder->next_finished)
179 				return LZMA_STREAM_END;
180 
181 			return LZMA_OK;
182 		}
183 
184 		const lzma_ret ret = decode_buffer(coder, coder->temp.buffer,
185 				&coder->temp.pos, coder->temp.size,
186 				out, out_pos, out_size);
187 
188 		if (ret == LZMA_STREAM_END)
189 			coder->this_finished = true;
190 		else if (ret != LZMA_OK)
191 			return ret;
192 		else if (coder->next_finished && *out_pos < out_size)
193 			return LZMA_DATA_ERROR;
194 	}
195 
196 	return LZMA_OK;
197 }
198 
199 
200 static void
201 lz_decoder_end(void *coder_ptr, const lzma_allocator *allocator)
202 {
203 	lzma_coder *coder = coder_ptr;
204 
205 	lzma_next_end(&coder->next, allocator);
206 	lzma_free(coder->dict.buf, allocator);
207 
208 	if (coder->lz.end != NULL)
209 		coder->lz.end(coder->lz.coder, allocator);
210 	else
211 		lzma_free(coder->lz.coder, allocator);
212 
213 	lzma_free(coder, allocator);
214 	return;
215 }
216 
217 
218 extern lzma_ret
219 lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
220 		const lzma_filter_info *filters,
221 		lzma_ret (*lz_init)(lzma_lz_decoder *lz,
222 			const lzma_allocator *allocator,
223 			lzma_vli id, const void *options,
224 			lzma_lz_options *lz_options))
225 {
226 	// Allocate the base structure if it isn't already allocated.
227 	lzma_coder *coder = next->coder;
228 	if (coder == NULL) {
229 		coder = lzma_alloc(sizeof(lzma_coder), allocator);
230 		if (coder == NULL)
231 			return LZMA_MEM_ERROR;
232 
233 		next->coder = coder;
234 		next->code = &lz_decode;
235 		next->end = &lz_decoder_end;
236 
237 		coder->dict.buf = NULL;
238 		coder->dict.size = 0;
239 		coder->lz = LZMA_LZ_DECODER_INIT;
240 		coder->next = LZMA_NEXT_CODER_INIT;
241 	}
242 
243 	// Allocate and initialize the LZ-based decoder. It will also give
244 	// us the dictionary size.
245 	lzma_lz_options lz_options;
246 	return_if_error(lz_init(&coder->lz, allocator,
247 			filters[0].id, filters[0].options, &lz_options));
248 
249 	// If the dictionary size is very small, increase it to 4096 bytes.
250 	// This is to prevent constant wrapping of the dictionary, which
251 	// would slow things down. The downside is that since we don't check
252 	// separately for the real dictionary size, we may happily accept
253 	// corrupt files.
254 	if (lz_options.dict_size < 4096)
255 		lz_options.dict_size = 4096;
256 
257 	// Make dictionary size a multiple of 16. Some LZ-based decoders like
258 	// LZMA use the lowest bits lzma_dict.pos to know the alignment of the
259 	// data. Aligned buffer is also good when memcpying from the
260 	// dictionary to the output buffer, since applications are
261 	// recommended to give aligned buffers to liblzma.
262 	//
263 	// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
264 	// needed for alloc_size.
265 	//
266 	// Avoid integer overflow.
267 	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
268 		return LZMA_MEM_ERROR;
269 
270 	lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
271 
272 	// Reserve extra space as explained in the comment
273 	// of #define LZ_DICT_REPEAT_MAX.
274 	const size_t alloc_size
275 			= lz_options.dict_size + 2 * LZ_DICT_REPEAT_MAX;
276 
277 	// Allocate and initialize the dictionary.
278 	if (coder->dict.size != alloc_size) {
279 		lzma_free(coder->dict.buf, allocator);
280 		coder->dict.buf = lzma_alloc(alloc_size, allocator);
281 		if (coder->dict.buf == NULL)
282 			return LZMA_MEM_ERROR;
283 
284 		// NOTE: Yes, alloc_size, not lz_options.dict_size. The way
285 		// coder->dict.full is updated will take care that we will
286 		// still reject distances larger than lz_options.dict_size.
287 		coder->dict.size = alloc_size;
288 	}
289 
290 	lz_decoder_reset(next->coder);
291 
292 	// Use the preset dictionary if it was given to us.
293 	if (lz_options.preset_dict != NULL
294 			&& lz_options.preset_dict_size > 0) {
295 		// If the preset dictionary is bigger than the actual
296 		// dictionary, copy only the tail.
297 		const size_t copy_size = my_min(lz_options.preset_dict_size,
298 				lz_options.dict_size);
299 		const size_t offset = lz_options.preset_dict_size - copy_size;
300 		memcpy(coder->dict.buf + coder->dict.pos,
301 				lz_options.preset_dict + offset,
302 				copy_size);
303 
304 		// dict.pos isn't zero after lz_decoder_reset().
305 		coder->dict.pos += copy_size;
306 		coder->dict.full = copy_size;
307 	}
308 
309 	// Miscellaneous initializations
310 	coder->next_finished = false;
311 	coder->this_finished = false;
312 	coder->temp.pos = 0;
313 	coder->temp.size = 0;
314 
315 	// Initialize the next filter in the chain, if any.
316 	return lzma_next_filter_init(&coder->next, allocator, filters + 1);
317 }
318 
319 
320 extern uint64_t
321 lzma_lz_decoder_memusage(size_t dictionary_size)
322 {
323 	return sizeof(lzma_coder) + (uint64_t)(dictionary_size);
324 }
325