xref: /freebsd/contrib/xz/src/xz/coder.c (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
1 // SPDX-License-Identifier: 0BSD
2 
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file       coder.c
6 /// \brief      Compresses or uncompresses a file
7 //
8 //  Authors:    Lasse Collin
9 //              Jia Tan
10 //
11 ///////////////////////////////////////////////////////////////////////////////
12 
13 #include "private.h"
14 #include "tuklib_integer.h"
15 
16 
/// Result codes returned by coder_init().
enum coder_init_ret {
	/// A normal liblzma-based coder was initialized.
	CODER_INIT_NORMAL = 0,

	/// Passthru mode should be used instead of a liblzma coder.
	CODER_INIT_PASSTHRU = 1,

	/// An error occurred during initialization.
	CODER_INIT_ERROR = 2,
};
23 
24 
25 enum operation_mode opt_mode = MODE_COMPRESS;
26 enum format_type opt_format = FORMAT_AUTO;
27 bool opt_auto_adjust = true;
28 bool opt_single_stream = false;
29 uint64_t opt_block_size = 0;
30 block_list_entry *opt_block_list = NULL;
31 uint64_t block_list_largest;
32 uint32_t block_list_chain_mask;
33 
34 /// Stream used to communicate with liblzma
35 static lzma_stream strm = LZMA_STREAM_INIT;
36 
37 /// Maximum number of filter chains. The first filter chain is the default,
38 /// and 9 other filter chains can be specified with --filtersX.
39 #define NUM_FILTER_CHAIN_MAX 10
40 
41 /// The default filter chain is in chains[0]. It is used for encoding
42 /// in all supported formats and also for decdoing raw streams. The other
43 /// filter chains are set by --filtersX to support changing filters with
44 /// the --block-list option.
45 static lzma_filter chains[NUM_FILTER_CHAIN_MAX][LZMA_FILTERS_MAX + 1];
46 
47 /// Bitmask indicating which filter chains are actually used when encoding
48 /// in the .xz format. This is needed since the filter chains specified using
49 /// --filtersX (or the default filter chain) might in reality be unneeded
50 /// if they are never used in --block-list. When --block-list isn't
51 /// specified, only the default filter chain is used, thus the initial
52 /// value of this variable is 1U << 0 (the number of the default chain is 0).
53 static uint32_t chains_used_mask = 1U << 0;
54 
55 /// Input and output buffers
56 static io_buf in_buf;
57 static io_buf out_buf;
58 
59 /// Number of filters in the default filter chain. Zero indicates that
60 /// we are using a preset.
61 static uint32_t filters_count = 0;
62 
63 /// Number of the preset (0-9)
64 static uint32_t preset_number = LZMA_PRESET_DEFAULT;
65 
66 /// True if the current default filter chain was set using the --filters
67 /// option. The filter chain is reset if a preset option (like -9) or an
68 /// old-style filter option (like --lzma2) is used after a --filters option.
69 static bool string_to_filter_used = false;
70 
71 /// Integrity check type
72 static lzma_check check;
73 
74 /// This becomes false if the --check=CHECK option is used.
75 static bool check_default = true;
76 
77 /// Indicates if unconsumed input is allowed to remain after
78 /// decoding has successfully finished. This is set for each file
79 /// in coder_init().
80 static bool allow_trailing_input;
81 
82 #ifdef MYTHREAD_ENABLED
83 static lzma_mt mt_options = {
84 	.flags = 0,
85 	.timeout = 300,
86 };
87 #endif
88 
89 
90 extern void
coder_set_check(lzma_check new_check)91 coder_set_check(lzma_check new_check)
92 {
93 	check = new_check;
94 	check_default = false;
95 	return;
96 }
97 
98 
99 static void
forget_filter_chain(void)100 forget_filter_chain(void)
101 {
102 	// Setting a preset or using --filters makes us forget
103 	// the earlier custom filter chain (if any).
104 	if (filters_count > 0) {
105 		lzma_filters_free(chains[0], NULL);
106 		filters_count = 0;
107 	}
108 
109 	string_to_filter_used = false;
110 	return;
111 }
112 
113 
114 extern void
coder_set_preset(uint32_t new_preset)115 coder_set_preset(uint32_t new_preset)
116 {
117 	preset_number &= ~LZMA_PRESET_LEVEL_MASK;
118 	preset_number |= new_preset;
119 	forget_filter_chain();
120 	return;
121 }
122 
123 
124 extern void
coder_set_extreme(void)125 coder_set_extreme(void)
126 {
127 	preset_number |= LZMA_PRESET_EXTREME;
128 	forget_filter_chain();
129 	return;
130 }
131 
132 
133 extern void
coder_add_filter(lzma_vli id,void * options)134 coder_add_filter(lzma_vli id, void *options)
135 {
136 	if (filters_count == LZMA_FILTERS_MAX)
137 		message_fatal(_("Maximum number of filters is four"));
138 
139 	if (string_to_filter_used)
140 		forget_filter_chain();
141 
142 	chains[0][filters_count].id = id;
143 	chains[0][filters_count].options = options;
144 
145 	// Terminate the filter chain with LZMA_VLI_UNKNOWN to simplify
146 	// implementation of forget_filter_chain().
147 	chains[0][++filters_count].id = LZMA_VLI_UNKNOWN;
148 
149 	// Setting a custom filter chain makes us forget the preset options.
150 	// This makes a difference if one specifies e.g. "xz -9 --lzma2 -e"
151 	// where the custom filter chain resets the preset level back to
152 	// the default 6, making the example equivalent to "xz -6e".
153 	preset_number = LZMA_PRESET_DEFAULT;
154 
155 	return;
156 }
157 
158 
159 static void
str_to_filters(const char * str,uint32_t index,uint32_t flags)160 str_to_filters(const char *str, uint32_t index, uint32_t flags)
161 {
162 	int error_pos;
163 	const char *err = lzma_str_to_filters(str, &error_pos,
164 			chains[index], flags, NULL);
165 
166 	if (err != NULL) {
167 		char filter_num[2] = "";
168 		if (index > 0)
169 			filter_num[0] = '0' + index;
170 
171 		// liblzma doesn't translate the error messages but
172 		// the messages are included in xz's translations.
173 		message(V_ERROR, _("Error in --filters%s=FILTERS option:"),
174 				filter_num);
175 		message(V_ERROR, "%s", str);
176 		message(V_ERROR, "%*s^", error_pos, "");
177 		message_fatal("%s", _(err));
178 	}
179 }
180 
181 
182 extern void
coder_add_filters_from_str(const char * filter_str)183 coder_add_filters_from_str(const char *filter_str)
184 {
185 	// Forget presets and previously defined filter chain. See
186 	// coder_add_filter() above for why preset_number must be reset too.
187 	forget_filter_chain();
188 	preset_number = LZMA_PRESET_DEFAULT;
189 
190 	string_to_filter_used = true;
191 
192 	// Include LZMA_STR_ALL_FILTERS so this can be used with --format=raw.
193 	str_to_filters(filter_str, 0, LZMA_STR_ALL_FILTERS);
194 
195 	// Set the filters_count to be the number of filters converted from
196 	// the string.
197 	for (filters_count = 0; chains[0][filters_count].id
198 			!= LZMA_VLI_UNKNOWN;
199 			++filters_count) ;
200 
201 	assert(filters_count > 0);
202 	return;
203 }
204 
205 
206 extern void
coder_add_block_filters(const char * str,size_t slot)207 coder_add_block_filters(const char *str, size_t slot)
208 {
209 	// Free old filters first, if they were previously allocated.
210 	if (chains_used_mask & (1U << slot))
211 		lzma_filters_free(chains[slot], NULL);
212 
213 	str_to_filters(str, slot, 0);
214 
215 	chains_used_mask |= 1U << slot;
216 }
217 
218 
219 tuklib_attr_noreturn
220 static void
memlimit_too_small(uint64_t memory_usage)221 memlimit_too_small(uint64_t memory_usage)
222 {
223 	message(V_ERROR, _("Memory usage limit is too low for the given "
224 			"filter setup."));
225 	message_mem_needed(V_ERROR, memory_usage);
226 	tuklib_exit(E_ERROR, E_ERROR, false);
227 }
228 
229 
#ifdef HAVE_ENCODERS
/// \brief      Calculate the memory usage of every filter chain in use.
///
/// Only the chains whose bit is set in chains_used_mask are considered.
///
/// \param      chains_memusages    If non-NULL, the encoder or decoder
///                                 memusage of each used chain is written
///                                 to the matching element of this array.
///                                 Elements of unused chains aren't touched.
/// \param      mt                  If non-NULL, the memory usage of the
///                                 multithreaded encoder is calculated.
/// \param      encode              Selects between encoder (true) and
///                                 decoder (false) memory usage. Must be
///                                 true when mt != NULL.
///
/// \return     The highest memory usage among the used filter chains.
static uint64_t
get_chains_memusage(uint64_t *chains_memusages, const lzma_mt *mt, bool encode)
{
#ifdef MYTHREAD_ENABLED
	// The "filters" member has to be changed for each chain, so work
	// on a private copy of the multithreading options.
	lzma_mt mt_local;
	if (mt != NULL)
		mt_local = *mt;
#else
	(void)mt;
#endif

	uint64_t highest = 0;

	for (uint32_t i = 0; i < ARRAY_SIZE(chains); i++) {
		if ((chains_used_mask & (1U << i)) == 0)
			continue;

		// UINT64_MAX remains if no suitable calculation is available.
		uint64_t usage = UINT64_MAX;
		bool calculated = false;

#ifdef MYTHREAD_ENABLED
		if (mt != NULL) {
			assert(encode);
			mt_local.filters = chains[i];
			usage = lzma_stream_encoder_mt_memusage(&mt_local);
			calculated = true;
		}
#endif

		if (!calculated) {
			if (encode) {
				usage = lzma_raw_encoder_memusage(chains[i]);
			}
#ifdef HAVE_DECODERS
			else {
				usage = lzma_raw_decoder_memusage(chains[i]);
			}
#endif
		}

		if (chains_memusages != NULL)
			chains_memusages[i] = usage;

		if (highest < usage)
			highest = usage;
	}

	return highest;
}
#endif
289 
290 
291 extern void
coder_set_compression_settings(void)292 coder_set_compression_settings(void)
293 {
294 #ifdef HAVE_LZIP_DECODER
295 	// .lz compression isn't supported.
296 	assert(opt_format != FORMAT_LZIP);
297 #endif
298 
299 	// The default check type is CRC64, but fallback to CRC32
300 	// if CRC64 isn't supported by the copy of liblzma we are
301 	// using. CRC32 is always supported.
302 	if (check_default) {
303 		check = LZMA_CHECK_CRC64;
304 		if (!lzma_check_is_supported(check))
305 			check = LZMA_CHECK_CRC32;
306 	}
307 
308 #ifdef HAVE_ENCODERS
309 	if (opt_block_list != NULL) {
310 		// args.c ensures these.
311 		assert(opt_mode == MODE_COMPRESS);
312 		assert(opt_format == FORMAT_XZ);
313 
314 		// Find out if block_list_chain_mask has a bit set that
315 		// isn't set in chains_used_mask.
316 		const uint32_t missing_chains_mask
317 				= (block_list_chain_mask ^ chains_used_mask)
318 				& block_list_chain_mask;
319 
320 		// If a filter chain was specified in --block-list but no
321 		// matching --filtersX option was used, exit with an error.
322 		if (missing_chains_mask != 0) {
323 			// Get the number of the first missing filter chain
324 			// and show it in the error message.
325 			const unsigned first_missing
326 				= (unsigned)ctz32(missing_chains_mask);
327 
328 			message_fatal(_("filter chain %u used by "
329 				"--block-list but not specified "
330 				"with --filters%u="),
331 				first_missing, first_missing);
332 		}
333 
334 		// Omit the unused filter chains from mask of used chains.
335 		//
336 		// (FIXME? When built with debugging, coder_free() will free()
337 		// the filter chains (except the default chain) which makes
338 		// Valgrind show fewer reachable allocations. But coder_free()
339 		// uses this mask to determine which chains to free. Thus it
340 		// won't free the ones that are cleared here from the mask.
341 		// In practice this doesn't matter.)
342 		chains_used_mask &= block_list_chain_mask;
343 	} else {
344 		// Reset filters used mask in case --block-list is not
345 		// used, but --filtersX is used.
346 		chains_used_mask = 1U << 0;
347 	}
348 #endif
349 
350 	// Options for LZMA1 or LZMA2 in case we are using a preset.
351 	static lzma_options_lzma opt_lzma;
352 
353 	// The first filter in the chains[] array is for the default
354 	// filter chain.
355 	lzma_filter *default_filters = chains[0];
356 
357 	if (filters_count == 0 && chains_used_mask & 1) {
358 		// We are using a preset. This is not a good idea in raw mode
359 		// except when playing around with things. Different versions
360 		// of this software may use different options in presets, and
361 		// thus make uncompressing the raw data difficult.
362 		if (opt_format == FORMAT_RAW) {
363 			// The message is shown only if warnings are allowed
364 			// but the exit status isn't changed.
365 			message(V_WARNING, _("Using a preset in raw mode "
366 					"is discouraged."));
367 			message(V_WARNING, _("The exact options of the "
368 					"presets may vary between software "
369 					"versions."));
370 		}
371 
372 		// Get the preset for LZMA1 or LZMA2.
373 		if (lzma_lzma_preset(&opt_lzma, preset_number))
374 			message_bug();
375 
376 		// Use LZMA2 except with --format=lzma we use LZMA1.
377 		default_filters[0].id = opt_format == FORMAT_LZMA
378 				? LZMA_FILTER_LZMA1 : LZMA_FILTER_LZMA2;
379 		default_filters[0].options = &opt_lzma;
380 
381 		filters_count = 1;
382 
383 		// Terminate the filter options array.
384 		default_filters[1].id = LZMA_VLI_UNKNOWN;
385 	}
386 
387 	// If we are using the .lzma format, allow exactly one filter
388 	// which has to be LZMA1. There is no need to check if the default
389 	// filter chain is being used since it can only be disabled if
390 	// --block-list is used, which is incompatible with FORMAT_LZMA.
391 	if (opt_format == FORMAT_LZMA && (filters_count != 1
392 			|| default_filters[0].id != LZMA_FILTER_LZMA1))
393 		message_fatal(_("The .lzma format supports only "
394 				"the LZMA1 filter"));
395 
396 	// If we are using the .xz format, make sure that there is no LZMA1
397 	// filter to prevent LZMA_PROG_ERROR. With the chains from --filtersX
398 	// we have already ensured this by calling lzma_str_to_filters()
399 	// without setting the flags that would allow non-.xz filters.
400 	if (opt_format == FORMAT_XZ && chains_used_mask & 1)
401 		for (size_t i = 0; i < filters_count; ++i)
402 			if (default_filters[i].id == LZMA_FILTER_LZMA1)
403 				message_fatal(_("LZMA1 cannot be used "
404 						"with the .xz format"));
405 
406 	if (chains_used_mask & 1) {
407 		// Print the selected default filter chain.
408 		message_filters_show(V_DEBUG, default_filters);
409 	}
410 
411 	// The --flush-timeout option requires LZMA_SYNC_FLUSH support
412 	// from the filter chain. Currently the threaded encoder doesn't
413 	// support LZMA_SYNC_FLUSH so single-threaded mode must be used.
414 	if (opt_mode == MODE_COMPRESS && opt_flush_timeout != 0) {
415 		for (unsigned i = 0; i < ARRAY_SIZE(chains); ++i) {
416 			if (!(chains_used_mask & (1U << i)))
417 				continue;
418 
419 			const lzma_filter *fc = chains[i];
420 			for (size_t j = 0; fc[j].id != LZMA_VLI_UNKNOWN; j++) {
421 				switch (fc[j].id) {
422 				case LZMA_FILTER_LZMA2:
423 				case LZMA_FILTER_DELTA:
424 					break;
425 
426 				default:
427 					message_fatal(_("Filter chain %u is "
428 							"incompatible with "
429 							"--flush-timeout"),
430 							i);
431 				}
432 			}
433 		}
434 
435 		if (hardware_threads_is_mt()) {
436 			message(V_WARNING, _("Switching to single-threaded "
437 					"mode due to --flush-timeout"));
438 			hardware_threads_set(1);
439 		}
440 	}
441 
442 	// Get memory limit and the memory usage of the used filter chains.
443 	// Note that if --format=raw was used, we can be decompressing
444 	// using the default filter chain.
445 	//
446 	// If multithreaded .xz compression is done, the memory limit
447 	// will be replaced.
448 	uint64_t memory_limit = hardware_memlimit_get(opt_mode);
449 	uint64_t memory_usage = UINT64_MAX;
450 
451 #ifdef HAVE_ENCODERS
452 	// Memory usage for each encoder filter chain (default
453 	// or --filtersX). The encoder options may need to be
454 	// scaled down depending on the memory usage limit.
455 	uint64_t encoder_memusages[ARRAY_SIZE(chains)];
456 #endif
457 
458 	if (opt_mode == MODE_COMPRESS) {
459 #ifdef HAVE_ENCODERS
460 #	ifdef MYTHREAD_ENABLED
461 		if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) {
462 			memory_limit = hardware_memlimit_mtenc_get();
463 			mt_options.threads = hardware_threads_get();
464 
465 			uint64_t block_size = opt_block_size;
466 
467 			// If opt_block_size is not set, find the maximum
468 			// recommended Block size based on the filter chains
469 			if (block_size == 0) {
470 				for (unsigned i = 0; i < ARRAY_SIZE(chains);
471 						i++) {
472 					if (!(chains_used_mask & (1U << i)))
473 						continue;
474 
475 					uint64_t size = lzma_mt_block_size(
476 							chains[i]);
477 
478 					// If this returns an error, then one
479 					// of the filter chains in use is
480 					// invalid, so there is no point in
481 					// progressing further.
482 					if (size == UINT64_MAX)
483 						message_fatal(_("Unsupported "
484 							"options in filter "
485 							"chain %u"), i);
486 
487 					if (size > block_size)
488 						block_size = size;
489 				}
490 
491 				// If --block-list was used and our current
492 				// Block size exceeds the largest size
493 				// in --block-list, reduce the Block size of
494 				// the multithreaded encoder. The extra size
495 				// would only be a waste of RAM. With a
496 				// smaller Block size we might even be able
497 				// to use more threads in some cases.
498 				if (block_list_largest > 0 && block_size
499 						> block_list_largest)
500 					block_size = block_list_largest;
501 			}
502 
503 			mt_options.block_size = block_size;
504 			mt_options.check = check;
505 
506 			memory_usage = get_chains_memusage(encoder_memusages,
507 						&mt_options, true);
508 			if (memory_usage != UINT64_MAX)
509 				message(V_DEBUG, _("Using up to %" PRIu32
510 						" threads."),
511 						mt_options.threads);
512 		} else
513 #	endif
514 		{
515 			memory_usage = get_chains_memusage(encoder_memusages,
516 					NULL, true);
517 		}
518 #endif
519 	} else {
520 #ifdef HAVE_DECODERS
521 		memory_usage = lzma_raw_decoder_memusage(default_filters);
522 #endif
523 	}
524 
525 	if (memory_usage == UINT64_MAX)
526 		message_fatal(_("Unsupported filter chain or filter options"));
527 
528 	// Print memory usage info before possible dictionary
529 	// size auto-adjusting.
530 	//
531 	// NOTE: If only encoder support was built, we cannot show
532 	// what the decoder memory usage will be.
533 	message_mem_needed(V_DEBUG, memory_usage);
534 
535 #if defined(HAVE_ENCODERS) && defined(HAVE_DECODERS)
536 	if (opt_mode == MODE_COMPRESS && message_verbosity_get() >= V_DEBUG) {
537 		const uint64_t decmem = get_chains_memusage(NULL, NULL, false);
538 		if (decmem != UINT64_MAX)
539 			message(V_DEBUG, _("Decompression will need "
540 					"%s MiB of memory."), uint64_to_str(
541 						round_up_to_mib(decmem), 0));
542 	}
543 #endif
544 
545 	if (memory_usage <= memory_limit)
546 		return;
547 
548 	// With --format=raw settings are never adjusted to meet
549 	// the memory usage limit.
550 	if (opt_format == FORMAT_RAW)
551 		memlimit_too_small(memory_usage);
552 
553 	assert(opt_mode == MODE_COMPRESS);
554 
555 #ifdef HAVE_ENCODERS
556 #	ifdef MYTHREAD_ENABLED
557 	if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) {
558 		// Try to reduce the number of threads before
559 		// adjusting the compression settings down.
560 		while (mt_options.threads > 1) {
561 			// Reduce the number of threads by one and check
562 			// the memory usage.
563 			--mt_options.threads;
564 			memory_usage = get_chains_memusage(encoder_memusages,
565 					&mt_options, true);
566 			if (memory_usage == UINT64_MAX)
567 				message_bug();
568 
569 			if (memory_usage <= memory_limit) {
570 				// The memory usage is now low enough.
571 				//
572 				// Since 5.6.1: This is only shown at
573 				// V_DEBUG instead of V_WARNING because
574 				// changing the number of threads doesn't
575 				// affect the output. On some systems this
576 				// message would be too common now that
577 				// multithreaded compression is the default.
578 				message(V_DEBUG, _("Reduced the number of "
579 					"threads from %s to %s to not exceed "
580 					"the memory usage limit of %s MiB"),
581 					uint64_to_str(
582 						hardware_threads_get(), 0),
583 					uint64_to_str(mt_options.threads, 1),
584 					uint64_to_str(round_up_to_mib(
585 						memory_limit), 2));
586 				return;
587 			}
588 		}
589 
590 		// If the memory usage limit is only a soft limit (automatic
591 		// number of threads and no --memlimit-compress), the limit
592 		// is only used to reduce the number of threads and once at
593 		// just one thread, the limit is completely ignored. This
594 		// way -T0 won't use insane amount of memory but at the same
595 		// time the soft limit will never make xz fail and never make
596 		// xz change settings that would affect the compressed output.
597 		//
598 		// Since 5.6.1: Like above, this is now shown at V_DEBUG
599 		// instead of V_WARNING.
600 		if (hardware_memlimit_mtenc_is_default()) {
601 			message(V_DEBUG, _("Reduced the number of threads "
602 				"from %s to one. The automatic memory usage "
603 				"limit of %s MiB is still being exceeded. "
604 				"%s MiB of memory is required. "
605 				"Continuing anyway."),
606 				uint64_to_str(hardware_threads_get(), 0),
607 				uint64_to_str(
608 					round_up_to_mib(memory_limit), 1),
609 				uint64_to_str(
610 					round_up_to_mib(memory_usage), 2));
611 			return;
612 		}
613 
614 		// If --no-adjust was used, we cannot drop to single-threaded
615 		// mode since it produces different compressed output.
616 		//
617 		// NOTE: In xz 5.2.x, --no-adjust also prevented reducing
618 		// the number of threads. This changed in 5.3.3alpha.
619 		if (!opt_auto_adjust)
620 			memlimit_too_small(memory_usage);
621 
622 		// Switch to single-threaded mode. It uses
623 		// less memory than using one thread in
624 		// the multithreaded mode but the output
625 		// is also different.
626 		hardware_threads_set(1);
627 		memory_usage = get_chains_memusage(encoder_memusages,
628 				NULL, true);
629 		message(V_WARNING, _("Switching to single-threaded mode "
630 			"to not exceed the memory usage limit of %s MiB"),
631 			uint64_to_str(round_up_to_mib(memory_limit), 0));
632 	}
633 #	endif
634 
635 	if (memory_usage <= memory_limit)
636 		return;
637 
638 	// Don't adjust LZMA2 or LZMA1 dictionary size if --no-adjust
639 	// was specified as that would change the compressed output.
640 	if (!opt_auto_adjust)
641 		memlimit_too_small(memory_usage);
642 
643 	// Adjust each filter chain that is exceeding the memory usage limit.
644 	for (unsigned i = 0; i < ARRAY_SIZE(chains); i++) {
645 		// Skip unused chains.
646 		if (!(chains_used_mask & (1U << i)))
647 			continue;
648 
649 		// Skip chains that already meet the memory usage limit.
650 		if (encoder_memusages[i] <=  memory_limit)
651 			continue;
652 
653 		// Look for the last filter if it is LZMA2 or LZMA1, so we
654 		// can make it use less RAM. We cannot adjust other filters.
655 		unsigned j = 0;
656 		while (chains[i][j].id != LZMA_FILTER_LZMA2
657 				&& chains[i][j].id != LZMA_FILTER_LZMA1) {
658 			// NOTE: This displays the too high limit of this
659 			// particular filter chain. If multiple chains are
660 			// specified and another one would need more then
661 			// this message could be confusing. As long as LZMA2
662 			// is the only memory hungry filter in .xz this
663 			// doesn't matter at all in practice.
664 			//
665 			// FIXME? However, it's sort of odd still if we had
666 			// switched from multithreaded mode to single-threaded
667 			// mode because single-threaded produces different
668 			// output. So the messages could perhaps be clearer.
669 			// Another case of this is a few lines below.
670 			if (chains[i][j].id == LZMA_VLI_UNKNOWN)
671 				memlimit_too_small(encoder_memusages[i]);
672 
673 			++j;
674 		}
675 
676 		// Decrease the dictionary size until we meet the memory
677 		// usage limit. First round down to full mebibytes.
678 		lzma_options_lzma *opt = chains[i][j].options;
679 		const uint32_t orig_dict_size = opt->dict_size;
680 		opt->dict_size &= ~((UINT32_C(1) << 20) - 1);
681 
682 		while (true) {
683 			// If it is below 1 MiB, auto-adjusting failed.
684 			//
685 			// FIXME? See the FIXME a few lines above.
686 			if (opt->dict_size < (UINT32_C(1) << 20))
687 				memlimit_too_small(encoder_memusages[i]);
688 
689 			encoder_memusages[i]
690 				= lzma_raw_encoder_memusage(chains[i]);
691 			if (encoder_memusages[i] == UINT64_MAX)
692 				message_bug();
693 
694 			// Accept it if it is low enough.
695 			if (encoder_memusages[i] <= memory_limit)
696 				break;
697 
698 			// Otherwise adjust it 1 MiB down and try again.
699 			opt->dict_size -= UINT32_C(1) << 20;
700 		}
701 
702 		// Tell the user that we decreased the dictionary size.
703 		// The message is slightly different between the default
704 		// filter chain (0) or and chains from --filtersX.
705 		const char lzma_num = chains[i][j].id == LZMA_FILTER_LZMA2
706 					? '2' : '1';
707 		const char *from_size = uint64_to_str(orig_dict_size >> 20, 0);
708 		const char *to_size = uint64_to_str(opt->dict_size >> 20, 1);
709 		const char *limit_size = uint64_to_str(round_up_to_mib(
710 					memory_limit), 2);
711 		if (i == 0)
712 			message(V_WARNING, _("Adjusted LZMA%c dictionary "
713 				"size from %s MiB to %s MiB to not exceed the "
714 				"memory usage limit of %s MiB"),
715 				lzma_num, from_size, to_size, limit_size);
716 		else
717 			message(V_WARNING, _("Adjusted LZMA%c dictionary size "
718 				"for --filters%u from %s MiB to %s MiB to not "
719 				"exceed the memory usage limit of %s MiB"),
720 				lzma_num, i, from_size, to_size, limit_size);
721 	}
722 #endif
723 
724 	return;
725 }
726 
727 
728 #ifdef HAVE_DECODERS
729 /// Return true if the data in in_buf seems to be in the .xz format.
730 static bool
is_format_xz(void)731 is_format_xz(void)
732 {
733 	// Specify the magic as hex to be compatible with EBCDIC systems.
734 	static const uint8_t magic[6] = { 0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00 };
735 	return strm.avail_in >= sizeof(magic)
736 			&& memcmp(in_buf.u8, magic, sizeof(magic)) == 0;
737 }
738 
739 
740 /// Return true if the data in in_buf seems to be in the .lzma format.
741 static bool
is_format_lzma(void)742 is_format_lzma(void)
743 {
744 	// The .lzma header is 13 bytes.
745 	if (strm.avail_in < 13)
746 		return false;
747 
748 	// Decode the LZMA1 properties.
749 	lzma_filter filter = { .id = LZMA_FILTER_LZMA1 };
750 	if (lzma_properties_decode(&filter, NULL, in_buf.u8, 5) != LZMA_OK)
751 		return false;
752 
753 	// A hack to ditch tons of false positives: We allow only dictionary
754 	// sizes that are 2^n or 2^n + 2^(n-1) or UINT32_MAX. LZMA_Alone
755 	// created only files with 2^n, but accepts any dictionary size.
756 	// If someone complains, this will be reconsidered.
757 	lzma_options_lzma *opt = filter.options;
758 	const uint32_t dict_size = opt->dict_size;
759 	free(opt);
760 
761 	if (dict_size != UINT32_MAX) {
762 		uint32_t d = dict_size - 1;
763 		d |= d >> 2;
764 		d |= d >> 3;
765 		d |= d >> 4;
766 		d |= d >> 8;
767 		d |= d >> 16;
768 		++d;
769 		if (d != dict_size || dict_size == 0)
770 			return false;
771 	}
772 
773 	// Another hack to ditch false positives: Assume that if the
774 	// uncompressed size is known, it must be less than 256 GiB.
775 	// Again, if someone complains, this will be reconsidered.
776 	uint64_t uncompressed_size = 0;
777 	for (size_t i = 0; i < 8; ++i)
778 		uncompressed_size |= (uint64_t)(in_buf.u8[5 + i]) << (i * 8);
779 
780 	if (uncompressed_size != UINT64_MAX
781 			&& uncompressed_size > (UINT64_C(1) << 38))
782 		return false;
783 
784 	return true;
785 }
786 
787 
#ifdef HAVE_LZIP_DECODER
/// Check if in_buf appears to hold the beginning of a .lz file, that is,
/// if enough input is available and it starts with the .lz magic bytes.
static bool
is_format_lzip(void)
{
	static const uint8_t lzip_magic[] = { 0x4C, 0x5A, 0x49, 0x50 };

	if (strm.avail_in < sizeof(lzip_magic))
		return false;

	return memcmp(in_buf.u8, lzip_magic, sizeof(lzip_magic)) == 0;
}
#endif
798 #endif
799 
800 
801 /// Detect the input file type (for now, this done only when decompressing),
802 /// and initialize an appropriate coder. Return value indicates if a normal
803 /// liblzma-based coder was initialized (CODER_INIT_NORMAL), if passthru
804 /// mode should be used (CODER_INIT_PASSTHRU), or if an error occurred
805 /// (CODER_INIT_ERROR).
806 static enum coder_init_ret
coder_init(file_pair * pair)807 coder_init(file_pair *pair)
808 {
809 	lzma_ret ret = LZMA_PROG_ERROR;
810 
811 	// In most cases if there is input left when coding finishes,
812 	// something has gone wrong. Exceptions are --single-stream
813 	// and decoding .lz files which can contain trailing non-.lz data.
814 	// These will be handled later in this function.
815 	allow_trailing_input = false;
816 
817 	// Set the first filter chain. If the --block-list option is not
818 	// used then use the default filter chain (chains[0]).
819 	// Otherwise, use first filter chain from the block list.
820 	lzma_filter *active_filters = opt_block_list == NULL
821 			? chains[0]
822 			: chains[opt_block_list[0].chain_num];
823 
824 	if (opt_mode == MODE_COMPRESS) {
825 #ifdef HAVE_ENCODERS
826 		switch (opt_format) {
827 		case FORMAT_AUTO:
828 			// args.c ensures this.
829 			assert(0);
830 			break;
831 
832 		case FORMAT_XZ:
833 #	ifdef MYTHREAD_ENABLED
834 			mt_options.filters = active_filters;
835 			if (hardware_threads_is_mt())
836 				ret = lzma_stream_encoder_mt(
837 						&strm, &mt_options);
838 			else
839 #	endif
840 				ret = lzma_stream_encoder(
841 						&strm, active_filters, check);
842 			break;
843 
844 		case FORMAT_LZMA:
845 			ret = lzma_alone_encoder(&strm,
846 					active_filters[0].options);
847 			break;
848 
849 #	ifdef HAVE_LZIP_DECODER
850 		case FORMAT_LZIP:
851 			// args.c should disallow this.
852 			assert(0);
853 			ret = LZMA_PROG_ERROR;
854 			break;
855 #	endif
856 
857 		case FORMAT_RAW:
858 			ret = lzma_raw_encoder(&strm, active_filters);
859 			break;
860 		}
861 #endif
862 	} else {
863 #ifdef HAVE_DECODERS
864 		uint32_t flags = 0;
865 
866 		// It seems silly to warn about unsupported check if the
867 		// check won't be verified anyway due to --ignore-check.
868 		if (opt_ignore_check)
869 			flags |= LZMA_IGNORE_CHECK;
870 		else
871 			flags |= LZMA_TELL_UNSUPPORTED_CHECK;
872 
873 		if (opt_single_stream)
874 			allow_trailing_input = true;
875 		else
876 			flags |= LZMA_CONCATENATED;
877 
878 		// We abuse FORMAT_AUTO to indicate unknown file format,
879 		// for which we may consider passthru mode.
880 		enum format_type init_format = FORMAT_AUTO;
881 
882 		switch (opt_format) {
883 		case FORMAT_AUTO:
884 			// .lz is checked before .lzma since .lzma detection
885 			// is more complicated (no magic bytes).
886 			if (is_format_xz())
887 				init_format = FORMAT_XZ;
888 #	ifdef HAVE_LZIP_DECODER
889 			else if (is_format_lzip())
890 				init_format = FORMAT_LZIP;
891 #	endif
892 			else if (is_format_lzma())
893 				init_format = FORMAT_LZMA;
894 			break;
895 
896 		case FORMAT_XZ:
897 			if (is_format_xz())
898 				init_format = FORMAT_XZ;
899 			break;
900 
901 		case FORMAT_LZMA:
902 			if (is_format_lzma())
903 				init_format = FORMAT_LZMA;
904 			break;
905 
906 #	ifdef HAVE_LZIP_DECODER
907 		case FORMAT_LZIP:
908 			if (is_format_lzip())
909 				init_format = FORMAT_LZIP;
910 			break;
911 #	endif
912 
913 		case FORMAT_RAW:
914 			init_format = FORMAT_RAW;
915 			break;
916 		}
917 
918 		switch (init_format) {
919 		case FORMAT_AUTO:
920 			// Unknown file format. If --decompress --stdout
921 			// --force have been given, then we copy the input
922 			// as is to stdout. Checking for MODE_DECOMPRESS
923 			// is needed, because we don't want to do use
924 			// passthru mode with --test.
925 			if (opt_mode == MODE_DECOMPRESS
926 					&& opt_stdout && opt_force) {
927 				// These are needed for progress info.
928 				strm.total_in = 0;
929 				strm.total_out = 0;
930 				return CODER_INIT_PASSTHRU;
931 			}
932 
933 			ret = LZMA_FORMAT_ERROR;
934 			break;
935 
936 		case FORMAT_XZ:
937 #	ifdef MYTHREAD_ENABLED
938 			mt_options.flags = flags;
939 
940 			mt_options.threads = hardware_threads_get();
941 			mt_options.memlimit_stop
942 				= hardware_memlimit_get(MODE_DECOMPRESS);
943 
944 			// If single-threaded mode was requested, set the
945 			// memlimit for threading to zero. This forces the
946 			// decoder to use single-threaded mode which matches
947 			// the behavior of lzma_stream_decoder().
948 			//
949 			// Otherwise use the limit for threaded decompression
950 			// which has a sane default (users are still free to
951 			// make it insanely high though).
952 			mt_options.memlimit_threading
953 					= mt_options.threads == 1
954 					? 0 : hardware_memlimit_mtdec_get();
955 
956 			ret = lzma_stream_decoder_mt(&strm, &mt_options);
957 #	else
958 			ret = lzma_stream_decoder(&strm,
959 					hardware_memlimit_get(
960 						MODE_DECOMPRESS), flags);
961 #	endif
962 			break;
963 
964 		case FORMAT_LZMA:
965 			ret = lzma_alone_decoder(&strm,
966 					hardware_memlimit_get(
967 						MODE_DECOMPRESS));
968 			break;
969 
970 #	ifdef HAVE_LZIP_DECODER
971 		case FORMAT_LZIP:
972 			allow_trailing_input = true;
973 			ret = lzma_lzip_decoder(&strm,
974 					hardware_memlimit_get(
975 						MODE_DECOMPRESS), flags);
976 			break;
977 #	endif
978 
979 		case FORMAT_RAW:
980 			// Memory usage has already been checked in
981 			// coder_set_compression_settings().
982 			ret = lzma_raw_decoder(&strm, active_filters);
983 			break;
984 		}
985 
986 		// Try to decode the headers. This will catch too low
987 		// memory usage limit in case it happens in the first
988 		// Block of the first Stream, which is where it very
989 		// probably will happen if it is going to happen.
990 		//
991 		// This will also catch unsupported check type which
992 		// we treat as a warning only. If there are empty
993 		// concatenated Streams with unsupported check type then
994 		// the message can be shown more than once here. The loop
995 		// is used in case there is first a warning about
996 		// unsupported check type and then the first Block
997 		// would exceed the memlimit.
998 		if (ret == LZMA_OK && init_format != FORMAT_RAW) {
999 			strm.next_out = NULL;
1000 			strm.avail_out = 0;
1001 			while ((ret = lzma_code(&strm, LZMA_RUN))
1002 					== LZMA_UNSUPPORTED_CHECK)
1003 				message_warning(_("%s: %s"),
1004 					tuklib_mask_nonprint(pair->src_name),
1005 					message_strm(ret));
1006 
1007 			// With --single-stream lzma_code won't wait for
1008 			// LZMA_FINISH and thus it can return LZMA_STREAM_END
1009 			// if the file has no uncompressed data inside.
1010 			// So treat LZMA_STREAM_END as LZMA_OK here.
1011 			// When lzma_code() is called again in coder_normal()
1012 			// it will return LZMA_STREAM_END again.
1013 			if (ret == LZMA_STREAM_END)
1014 				ret = LZMA_OK;
1015 		}
1016 #endif
1017 	}
1018 
1019 	if (ret != LZMA_OK) {
1020 		message_error(_("%s: %s"),
1021 				tuklib_mask_nonprint(pair->src_name),
1022 				message_strm(ret));
1023 		if (ret == LZMA_MEMLIMIT_ERROR)
1024 			message_mem_needed(V_ERROR, lzma_memusage(&strm));
1025 
1026 		return CODER_INIT_ERROR;
1027 	}
1028 
1029 	return CODER_INIT_NORMAL;
1030 }
1031 
1032 
1033 #ifdef HAVE_ENCODERS
1034 /// Resolve conflicts between opt_block_size and opt_block_list in single
1035 /// threaded mode. We want to default to opt_block_list, except when it is
1036 /// larger than opt_block_size. If this is the case for the current Block
1037 /// at *list_pos, then we break into smaller Blocks. Otherwise advance
1038 /// to the next Block in opt_block_list, and break apart if needed.
1039 static void
split_block(uint64_t * block_remaining,uint64_t * next_block_remaining,size_t * list_pos)1040 split_block(uint64_t *block_remaining,
1041 	    uint64_t *next_block_remaining,
1042 	    size_t *list_pos)
1043 {
1044 	if (*next_block_remaining > 0) {
1045 		// The Block at *list_pos has previously been split up.
1046 		assert(!hardware_threads_is_mt());
1047 		assert(opt_block_size > 0);
1048 		assert(opt_block_list != NULL);
1049 
1050 		if (*next_block_remaining > opt_block_size) {
1051 			// We have to split the current Block at *list_pos
1052 			// into another opt_block_size length Block.
1053 			*block_remaining = opt_block_size;
1054 		} else {
1055 			// This is the last remaining split Block for the
1056 			// Block at *list_pos.
1057 			*block_remaining = *next_block_remaining;
1058 		}
1059 
1060 		*next_block_remaining -= *block_remaining;
1061 
1062 	} else {
1063 		// The Block at *list_pos has been finished. Go to the next
1064 		// entry in the list. If the end of the list has been
1065 		// reached, reuse the size and filters of the last Block.
1066 		if (opt_block_list[*list_pos + 1].size != 0) {
1067 			++*list_pos;
1068 
1069 			// Update the filters if needed.
1070 			if (opt_block_list[*list_pos - 1].chain_num
1071 				!= opt_block_list[*list_pos].chain_num) {
1072 				const unsigned chain_num
1073 					= opt_block_list[*list_pos].chain_num;
1074 				const lzma_filter *next = chains[chain_num];
1075 				const lzma_ret ret = lzma_filters_update(
1076 						&strm, next);
1077 
1078 				if (ret != LZMA_OK) {
1079 					// This message is only possible if
1080 					// the filter chain has unsupported
1081 					// options since the filter chain is
1082 					// validated using
1083 					// lzma_raw_encoder_memusage() or
1084 					// lzma_stream_encoder_mt_memusage().
1085 					// Some options are not validated until
1086 					// the encoders are initialized.
1087 					message_fatal(
1088 						_("Error changing to "
1089 						"filter chain %u: %s"),
1090 						chain_num,
1091 						message_strm(ret));
1092 				}
1093 			}
1094 		}
1095 
1096 		*block_remaining = opt_block_list[*list_pos].size;
1097 
1098 		// If in single-threaded mode, split up the Block if needed.
1099 		// This is not needed in multi-threaded mode because liblzma
1100 		// will do this due to how threaded encoding works.
1101 		if (!hardware_threads_is_mt() && opt_block_size > 0
1102 				&& *block_remaining > opt_block_size) {
1103 			*next_block_remaining
1104 					= *block_remaining - opt_block_size;
1105 			*block_remaining = opt_block_size;
1106 		}
1107 	}
1108 }
1109 #endif
1110 
1111 
1112 static bool
coder_write_output(file_pair * pair)1113 coder_write_output(file_pair *pair)
1114 {
1115 	if (opt_mode != MODE_TEST) {
1116 		if (io_write(pair, &out_buf, IO_BUFFER_SIZE - strm.avail_out))
1117 			return true;
1118 	}
1119 
1120 	strm.next_out = out_buf.u8;
1121 	strm.avail_out = IO_BUFFER_SIZE;
1122 	return false;
1123 }
1124 
1125 
1126 /// Compress or decompress using liblzma.
1127 static bool
coder_normal(file_pair * pair)1128 coder_normal(file_pair *pair)
1129 {
1130 	// Encoder needs to know when we have given all the input to it.
1131 	// The decoders need to know it too when we are using
1132 	// LZMA_CONCATENATED. We need to check for src_eof here, because
1133 	// the first input chunk has been already read if decompressing,
1134 	// and that may have been the only chunk we will read.
1135 	lzma_action action = pair->src_eof ? LZMA_FINISH : LZMA_RUN;
1136 
1137 	lzma_ret ret;
1138 
1139 	// Assume that something goes wrong.
1140 	bool success = false;
1141 
1142 #ifdef HAVE_ENCODERS
1143 	// block_remaining indicates how many input bytes to encode before
1144 	// finishing the current .xz Block. The Block size is set with
1145 	// --block-size=SIZE and --block-list. They have an effect only when
1146 	// compressing to the .xz format. If block_remaining == UINT64_MAX,
1147 	// only a single block is created.
1148 	uint64_t block_remaining = UINT64_MAX;
1149 
1150 	// next_block_remaining for when we are in single-threaded mode and
1151 	// the Block in --block-list is larger than the --block-size=SIZE.
1152 	uint64_t next_block_remaining = 0;
1153 
1154 	// Position in opt_block_list. Unused if --block-list wasn't used.
1155 	size_t list_pos = 0;
1156 
1157 	// Handle --block-size for single-threaded mode and the first step
1158 	// of --block-list.
1159 	if (opt_mode == MODE_COMPRESS && opt_format == FORMAT_XZ) {
1160 		// --block-size doesn't do anything here in threaded mode,
1161 		// because the threaded encoder will take care of splitting
1162 		// to fixed-sized Blocks.
1163 		if (!hardware_threads_is_mt() && opt_block_size > 0)
1164 			block_remaining = opt_block_size;
1165 
1166 		// If --block-list was used, start with the first size.
1167 		//
1168 		// For threaded case, --block-size specifies how big Blocks
1169 		// the encoder needs to be prepared to create at maximum
1170 		// and --block-list will simultaneously cause new Blocks
1171 		// to be started at specified intervals. To keep things
1172 		// logical, the same is done in single-threaded mode. The
1173 		// output is still not identical because in single-threaded
1174 		// mode the size info isn't written into Block Headers.
1175 		if (opt_block_list != NULL) {
1176 			if (block_remaining < opt_block_list[list_pos].size) {
1177 				assert(!hardware_threads_is_mt());
1178 				next_block_remaining =
1179 						opt_block_list[list_pos].size
1180 						- block_remaining;
1181 			} else {
1182 				block_remaining =
1183 						opt_block_list[list_pos].size;
1184 			}
1185 		}
1186 	}
1187 #endif
1188 
1189 	strm.next_out = out_buf.u8;
1190 	strm.avail_out = IO_BUFFER_SIZE;
1191 
1192 	while (!user_abort) {
1193 		// Fill the input buffer if it is empty and we aren't
1194 		// flushing or finishing.
1195 		if (strm.avail_in == 0 && action == LZMA_RUN) {
1196 			strm.next_in = in_buf.u8;
1197 #ifdef HAVE_ENCODERS
1198 			const size_t read_size = my_min(block_remaining,
1199 					IO_BUFFER_SIZE);
1200 #else
1201 			const size_t read_size = IO_BUFFER_SIZE;
1202 #endif
1203 			strm.avail_in = io_read(pair, &in_buf, read_size);
1204 
1205 			if (strm.avail_in == SIZE_MAX)
1206 				break;
1207 
1208 			if (pair->src_eof) {
1209 				action = LZMA_FINISH;
1210 			}
1211 #ifdef HAVE_ENCODERS
1212 			else if (block_remaining != UINT64_MAX) {
1213 				// Start a new Block after every
1214 				// opt_block_size bytes of input.
1215 				block_remaining -= strm.avail_in;
1216 				if (block_remaining == 0)
1217 					action = LZMA_FULL_BARRIER;
1218 			}
1219 
1220 			if (action == LZMA_RUN && pair->flush_needed)
1221 				action = LZMA_SYNC_FLUSH;
1222 #endif
1223 		}
1224 
1225 		// Let liblzma do the actual work.
1226 		ret = lzma_code(&strm, action);
1227 
1228 		// Write out if the output buffer became full.
1229 		if (strm.avail_out == 0) {
1230 			if (coder_write_output(pair))
1231 				break;
1232 		}
1233 
1234 #ifdef HAVE_ENCODERS
1235 		if (ret == LZMA_STREAM_END && (action == LZMA_SYNC_FLUSH
1236 				|| action == LZMA_FULL_BARRIER)) {
1237 			if (action == LZMA_SYNC_FLUSH) {
1238 				// Flushing completed. Write the pending data
1239 				// out immediately so that the reading side
1240 				// can decompress everything compressed so far.
1241 				if (coder_write_output(pair))
1242 					break;
1243 
1244 				// Mark that we haven't seen any new input
1245 				// since the previous flush.
1246 				pair->src_has_seen_input = false;
1247 				pair->flush_needed = false;
1248 			} else {
1249 				// Start a new Block after LZMA_FULL_BARRIER.
1250 				if (opt_block_list == NULL) {
1251 					assert(!hardware_threads_is_mt());
1252 					assert(opt_block_size > 0);
1253 					block_remaining = opt_block_size;
1254 				} else {
1255 					split_block(&block_remaining,
1256 							&next_block_remaining,
1257 							&list_pos);
1258 				}
1259 			}
1260 
1261 			// Start a new Block after LZMA_FULL_FLUSH or continue
1262 			// the same block after LZMA_SYNC_FLUSH.
1263 			action = LZMA_RUN;
1264 		} else
1265 #endif
1266 		if (ret != LZMA_OK) {
1267 			// Determine if the return value indicates that we
1268 			// won't continue coding. LZMA_NO_CHECK would be
1269 			// here too if LZMA_TELL_ANY_CHECK was used.
1270 			const bool stop = ret != LZMA_UNSUPPORTED_CHECK;
1271 
1272 			if (stop) {
1273 				// Write the remaining bytes even if something
1274 				// went wrong, because that way the user gets
1275 				// as much data as possible, which can be good
1276 				// when trying to get at least some useful
1277 				// data out of damaged files.
1278 				if (coder_write_output(pair))
1279 					break;
1280 			}
1281 
1282 			if (ret == LZMA_STREAM_END) {
1283 				if (allow_trailing_input) {
1284 					io_fix_src_pos(pair, strm.avail_in);
1285 					success = true;
1286 					break;
1287 				}
1288 
1289 				// Check that there is no trailing garbage.
1290 				// This is needed for LZMA_Alone and raw
1291 				// streams. This is *not* done with .lz files
1292 				// as that format specifically requires
1293 				// allowing trailing garbage.
1294 				if (strm.avail_in == 0 && !pair->src_eof) {
1295 					// Try reading one more byte.
1296 					// Hopefully we don't get any more
1297 					// input, and thus pair->src_eof
1298 					// becomes true.
1299 					strm.avail_in = io_read(
1300 							pair, &in_buf, 1);
1301 					if (strm.avail_in == SIZE_MAX)
1302 						break;
1303 
1304 					assert(strm.avail_in == 0
1305 							|| strm.avail_in == 1);
1306 				}
1307 
1308 				if (strm.avail_in == 0) {
1309 					assert(pair->src_eof);
1310 					success = true;
1311 					break;
1312 				}
1313 
1314 				// We hadn't reached the end of the file.
1315 				ret = LZMA_DATA_ERROR;
1316 				assert(stop);
1317 			}
1318 
1319 			// If we get here and stop is true, something went
1320 			// wrong and we print an error. Otherwise it's just
1321 			// a warning and coding can continue.
1322 			if (stop) {
1323 				message_error(_("%s: %s"),
1324 					tuklib_mask_nonprint(pair->src_name),
1325 					message_strm(ret));
1326 			} else {
1327 				message_warning(_("%s: %s"),
1328 					tuklib_mask_nonprint(pair->src_name),
1329 					message_strm(ret));
1330 
1331 				// When compressing, all possible errors set
1332 				// stop to true.
1333 				assert(opt_mode != MODE_COMPRESS);
1334 			}
1335 
1336 			if (ret == LZMA_MEMLIMIT_ERROR) {
1337 				// Display how much memory it would have
1338 				// actually needed.
1339 				message_mem_needed(V_ERROR,
1340 						lzma_memusage(&strm));
1341 			}
1342 
1343 			if (stop)
1344 				break;
1345 		}
1346 
1347 		// Show progress information under certain conditions.
1348 		message_progress_update();
1349 	}
1350 
1351 	return success;
1352 }
1353 
1354 
1355 /// Copy from input file to output file without processing the data in any
1356 /// way. This is used only when trying to decompress unrecognized files
1357 /// with --decompress --stdout --force, so the output is always stdout.
1358 static bool
coder_passthru(file_pair * pair)1359 coder_passthru(file_pair *pair)
1360 {
1361 	while (strm.avail_in != 0) {
1362 		if (user_abort)
1363 			return false;
1364 
1365 		if (io_write(pair, &in_buf, strm.avail_in))
1366 			return false;
1367 
1368 		strm.total_in += strm.avail_in;
1369 		strm.total_out = strm.total_in;
1370 		message_progress_update();
1371 
1372 		strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE);
1373 		if (strm.avail_in == SIZE_MAX)
1374 			return false;
1375 	}
1376 
1377 	return true;
1378 }
1379 
1380 
1381 extern void
coder_run(const char * filename)1382 coder_run(const char *filename)
1383 {
1384 	// Set and possibly print the filename for the progress message.
1385 	message_filename(filename);
1386 
1387 	// Try to open the input file.
1388 	file_pair *pair = io_open_src(filename);
1389 	if (pair == NULL)
1390 		return;
1391 
1392 	// Assume that something goes wrong.
1393 	bool success = false;
1394 
1395 	if (opt_mode == MODE_COMPRESS) {
1396 		strm.next_in = NULL;
1397 		strm.avail_in = 0;
1398 	} else {
1399 		// Read the first chunk of input data. This is needed
1400 		// to detect the input file type.
1401 		strm.next_in = in_buf.u8;
1402 		strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE);
1403 	}
1404 
1405 	if (strm.avail_in != SIZE_MAX) {
1406 		// Initialize the coder. This will detect the file format
1407 		// and, in decompression or testing mode, check the memory
1408 		// usage of the first Block too. This way we don't try to
1409 		// open the destination file if we see that coding wouldn't
1410 		// work at all anyway. This also avoids deleting the old
1411 		// "target" file if --force was used.
1412 		const enum coder_init_ret init_ret = coder_init(pair);
1413 
1414 		if (init_ret != CODER_INIT_ERROR && !user_abort) {
1415 			// Don't open the destination file when --test
1416 			// is used.
1417 			if (opt_mode == MODE_TEST || !io_open_dest(pair)) {
1418 				// Remember the current time. It is needed
1419 				// for progress indicator.
1420 				mytime_set_start_time();
1421 
1422 				// Initialize the progress indicator.
1423 				//
1424 				// NOTE: When reading from stdin, fstat()
1425 				// isn't called on it and thus src_st.st_size
1426 				// is zero. If stdin pointed to a regular
1427 				// file, it would still be possible to know
1428 				// the file size but then we would also need
1429 				// to take into account the current reading
1430 				// position since with stdin it isn't
1431 				// necessarily at the beginning of the file.
1432 				const bool is_passthru = init_ret
1433 						== CODER_INIT_PASSTHRU;
1434 				const uint64_t in_size
1435 					= pair->src_st.st_size <= 0
1436 					? 0 : (uint64_t)(pair->src_st.st_size);
1437 				message_progress_start(&strm,
1438 						is_passthru, in_size);
1439 
1440 				// Do the actual coding or passthru.
1441 				if (is_passthru)
1442 					success = coder_passthru(pair);
1443 				else
1444 					success = coder_normal(pair);
1445 
1446 				message_progress_end(success);
1447 			}
1448 		}
1449 	}
1450 
1451 	// Close the file pair. It needs to know if coding was successful to
1452 	// know if the source or target file should be unlinked.
1453 	io_close(pair, success);
1454 
1455 	return;
1456 }
1457 
1458 
1459 #ifndef NDEBUG
1460 extern void
coder_free(void)1461 coder_free(void)
1462 {
1463 	// Free starting from the second filter chain since the default
1464 	// filter chain may have its options set from a static variable
1465 	// in coder_set_compression_settings(). Since this is only run in
1466 	// debug mode and will be freed when the process ends anyway, we
1467 	// don't worry about freeing it.
1468 	for (uint32_t i = 1; i < ARRAY_SIZE(chains); i++) {
1469 		if (chains_used_mask & (1U << i))
1470 			lzma_filters_free(chains[i], NULL);
1471 	}
1472 
1473 	lzma_end(&strm);
1474 	return;
1475 }
1476 #endif
1477