xref: /freebsd/usr.bin/gzip/unxz.c (revision 036d2e814bf0f5d88ffb4b24c159320894541757)
1 /*	$NetBSD: unxz.c,v 1.8 2018/10/06 16:36:45 martin Exp $	*/
2 
3 /*-
4  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
5  *
6  * Copyright (c) 2011 The NetBSD Foundation, Inc.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to The NetBSD Foundation
10  * by Christos Zoulas.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include <stdarg.h>
37 #include <errno.h>
38 #include <stdio.h>
39 #include <unistd.h>
40 #include <lzma.h>
41 
42 static off_t
43 unxz(int i, int o, char *pre, size_t prelen, off_t *bytes_in)
44 {
45 	lzma_stream strm = LZMA_STREAM_INIT;
46 	static const int flags = LZMA_TELL_UNSUPPORTED_CHECK|LZMA_CONCATENATED;
47 	lzma_ret ret;
48 	lzma_action action = LZMA_RUN;
49 	off_t bytes_out, bp;
50 	uint8_t ibuf[BUFSIZ];
51 	uint8_t obuf[BUFSIZ];
52 
53 	if (bytes_in == NULL)
54 		bytes_in = &bp;
55 
56 	strm.next_in = ibuf;
57 	memcpy(ibuf, pre, prelen);
58 	strm.avail_in = read(i, ibuf + prelen, sizeof(ibuf) - prelen);
59 	if (strm.avail_in == (size_t)-1)
60 		maybe_err("read failed");
61 	infile_newdata(strm.avail_in);
62 	strm.avail_in += prelen;
63 	*bytes_in = strm.avail_in;
64 
65 	if ((ret = lzma_stream_decoder(&strm, UINT64_MAX, flags)) != LZMA_OK)
66 		maybe_errx("Can't initialize decoder (%d)", ret);
67 
68 	strm.next_out = NULL;
69 	strm.avail_out = 0;
70 	if ((ret = lzma_code(&strm, LZMA_RUN)) != LZMA_OK)
71 		maybe_errx("Can't read headers (%d)", ret);
72 
73 	bytes_out = 0;
74 	strm.next_out = obuf;
75 	strm.avail_out = sizeof(obuf);
76 
77 	for (;;) {
78 		check_siginfo();
79 		if (strm.avail_in == 0) {
80 			strm.next_in = ibuf;
81 			strm.avail_in = read(i, ibuf, sizeof(ibuf));
82 			switch (strm.avail_in) {
83 			case (size_t)-1:
84 				maybe_err("read failed");
85 				/*NOTREACHED*/
86 			case 0:
87 				action = LZMA_FINISH;
88 				break;
89 			default:
90 				infile_newdata(strm.avail_in);
91 				*bytes_in += strm.avail_in;
92 				break;
93 			}
94 		}
95 
96 		ret = lzma_code(&strm, action);
97 
98 		// Write and check write error before checking decoder error.
99 		// This way as much data as possible gets written to output
100 		// even if decoder detected an error.
101 		if (strm.avail_out == 0 || ret != LZMA_OK) {
102 			const size_t write_size = sizeof(obuf) - strm.avail_out;
103 
104 			if (write(o, obuf, write_size) != (ssize_t)write_size)
105 				maybe_err("write failed");
106 
107 			strm.next_out = obuf;
108 			strm.avail_out = sizeof(obuf);
109 			bytes_out += write_size;
110 		}
111 
112 		if (ret != LZMA_OK) {
113 			if (ret == LZMA_STREAM_END) {
114 				// Check that there's no trailing garbage.
115 				if (strm.avail_in != 0 || read(i, ibuf, 1))
116 					ret = LZMA_DATA_ERROR;
117 				else {
118 					lzma_end(&strm);
119 					return bytes_out;
120 				}
121 			}
122 
123 			const char *msg;
124 			switch (ret) {
125 			case LZMA_MEM_ERROR:
126 				msg = strerror(ENOMEM);
127 				break;
128 
129 			case LZMA_FORMAT_ERROR:
130 				msg = "File format not recognized";
131 				break;
132 
133 			case LZMA_OPTIONS_ERROR:
134 				// FIXME: Better message?
135 				msg = "Unsupported compression options";
136 				break;
137 
138 			case LZMA_DATA_ERROR:
139 				msg = "File is corrupt";
140 				break;
141 
142 			case LZMA_BUF_ERROR:
143 				msg = "Unexpected end of input";
144 				break;
145 
146 			case LZMA_MEMLIMIT_ERROR:
147 				msg = "Reached memory limit";
148 				break;
149 
150 			default:
151 				maybe_errx("Unknown error (%d)", ret);
152 				break;
153 			}
154 			maybe_errx("%s", msg);
155 
156 		}
157 	}
158 }
159 
160 #include <stdbool.h>
161 
162 /*
163  * Copied various bits and pieces from xz support code or brute force
164  * replacements.
165  */
166 
/* Smaller of two values.  Each argument may be evaluated twice. */
#define	my_min(A, B)	((A) < (B) ? (A) : (B))

/*
 * Size of the I/O buffer in bytes.  A stdio BUFSIZ of 1024 or less is
 * considered too small and is replaced by 8192; otherwise BUFSIZ is used,
 * rounded down so that the size stays a multiple of 8 (sizeof(uint64_t)).
 */
#if BUFSIZ > 1024
#	define IO_BUFFER_SIZE (BUFSIZ & ~7U)
#else
#	define IO_BUFFER_SIZE 8192
#endif

/*
 * I/O buffer that can be viewed as bytes, 32-bit words or 64-bit words.
 * Declaring it as a union guarantees that the byte array is aligned well
 * enough for the wider views (parse_indexes() inspects it through u32).
 */
typedef union {
	uint8_t u8[IO_BUFFER_SIZE];
	uint32_t u32[IO_BUFFER_SIZE / sizeof(uint32_t)];
	uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)];
} io_buf;
184 
185 
186 static bool
187 io_pread(int fd, io_buf *buf, size_t size, off_t pos)
188 {
189 	// Using lseek() and read() is more portable than pread() and
190 	// for us it is as good as real pread().
191 	if (lseek(fd, pos, SEEK_SET) != pos) {
192 		return true;
193 	}
194 
195 	const size_t amount = read(fd, buf, size);
196 	if (amount == SIZE_MAX)
197 		return true;
198 
199 	if (amount != size) {
200 		return true;
201 	}
202 
203 	return false;
204 }
205 
206 /*
207  * Most of the following is copied (mostly verbatim) from the xz
208  * distribution, from file src/xz/list.c
209  */
210 
211 ///////////////////////////////////////////////////////////////////////////////
212 //
213 /// \file       list.c
214 /// \brief      Listing information about .xz files
215 //
216 //  Author:     Lasse Collin
217 //
218 //  This file has been put into the public domain.
219 //  You can do whatever you want with this file.
220 //
221 ///////////////////////////////////////////////////////////////////////////////
222 
223 
224 /// Information about a .xz file
225 typedef struct {
226 	/// Combined Index of all Streams in the file
227 	lzma_index *idx;
228 
229 	/// Total amount of Stream Padding
230 	uint64_t stream_padding;
231 
232 	/// Highest memory usage so far
233 	uint64_t memusage_max;
234 
235 	/// True if all Blocks so far have Compressed Size and
236 	/// Uncompressed Size fields
237 	bool all_have_sizes;
238 
239 	/// Oldest XZ Utils version that will decompress the file
240 	uint32_t min_version;
241 
242 } xz_file_info;
243 
244 #define XZ_FILE_INFO_INIT { NULL, 0, 0, true, 50000002 }
245 
246 
247 /// \brief      Parse the Index(es) from the given .xz file
248 ///
249 /// \param      xfi     Pointer to structure where the decoded information
250 ///                     is stored.
251 /// \param      pair    Input file
252 ///
253 /// \return     On success, false is returned. On error, true is returned.
254 ///
255 // TODO: This function is pretty big. liblzma should have a function that
256 // takes a callback function to parse the Index(es) from a .xz file to make
257 // it easy for applications.
258 static bool
259 parse_indexes(xz_file_info *xfi, int src_fd)
260 {
261 	struct stat st;
262 
263 	fstat(src_fd, &st);
264 	if (st.st_size <= 0) {
265 		return true;
266 	}
267 
268 	if (st.st_size < 2 * LZMA_STREAM_HEADER_SIZE) {
269 		return true;
270 	}
271 
272 	io_buf buf;
273 	lzma_stream_flags header_flags;
274 	lzma_stream_flags footer_flags;
275 	lzma_ret ret;
276 
277 	// lzma_stream for the Index decoder
278 	lzma_stream strm = LZMA_STREAM_INIT;
279 
280 	// All Indexes decoded so far
281 	lzma_index *combined_index = NULL;
282 
283 	// The Index currently being decoded
284 	lzma_index *this_index = NULL;
285 
286 	// Current position in the file. We parse the file backwards so
287 	// initialize it to point to the end of the file.
288 	off_t pos = st.st_size;
289 
290 	// Each loop iteration decodes one Index.
291 	do {
292 		// Check that there is enough data left to contain at least
293 		// the Stream Header and Stream Footer. This check cannot
294 		// fail in the first pass of this loop.
295 		if (pos < 2 * LZMA_STREAM_HEADER_SIZE) {
296 			goto error;
297 		}
298 
299 		pos -= LZMA_STREAM_HEADER_SIZE;
300 		lzma_vli stream_padding = 0;
301 
302 		// Locate the Stream Footer. There may be Stream Padding which
303 		// we must skip when reading backwards.
304 		while (true) {
305 			if (pos < LZMA_STREAM_HEADER_SIZE) {
306 				goto error;
307 			}
308 
309 			if (io_pread(src_fd, &buf,
310 					LZMA_STREAM_HEADER_SIZE, pos))
311 				goto error;
312 
313 			// Stream Padding is always a multiple of four bytes.
314 			int i = 2;
315 			if (buf.u32[i] != 0)
316 				break;
317 
318 			// To avoid calling io_pread() for every four bytes
319 			// of Stream Padding, take advantage that we read
320 			// 12 bytes (LZMA_STREAM_HEADER_SIZE) already and
321 			// check them too before calling io_pread() again.
322 			do {
323 				stream_padding += 4;
324 				pos -= 4;
325 				--i;
326 			} while (i >= 0 && buf.u32[i] == 0);
327 		}
328 
329 		// Decode the Stream Footer.
330 		ret = lzma_stream_footer_decode(&footer_flags, buf.u8);
331 		if (ret != LZMA_OK) {
332 			goto error;
333 		}
334 
335 		// Check that the Stream Footer doesn't specify something
336 		// that we don't support. This can only happen if the xz
337 		// version is older than liblzma and liblzma supports
338 		// something new.
339 		//
340 		// It is enough to check Stream Footer. Stream Header must
341 		// match when it is compared against Stream Footer with
342 		// lzma_stream_flags_compare().
343 		if (footer_flags.version != 0) {
344 			goto error;
345 		}
346 
347 		// Check that the size of the Index field looks sane.
348 		lzma_vli index_size = footer_flags.backward_size;
349 		if ((lzma_vli)(pos) < index_size + LZMA_STREAM_HEADER_SIZE) {
350 			goto error;
351 		}
352 
353 		// Set pos to the beginning of the Index.
354 		pos -= index_size;
355 
356 		// Decode the Index.
357 		ret = lzma_index_decoder(&strm, &this_index, UINT64_MAX);
358 		if (ret != LZMA_OK) {
359 			goto error;
360 		}
361 
362 		do {
363 			// Don't give the decoder more input than the
364 			// Index size.
365 			strm.avail_in = my_min(IO_BUFFER_SIZE, index_size);
366 			if (io_pread(src_fd, &buf, strm.avail_in, pos))
367 				goto error;
368 
369 			pos += strm.avail_in;
370 			index_size -= strm.avail_in;
371 
372 			strm.next_in = buf.u8;
373 			ret = lzma_code(&strm, LZMA_RUN);
374 
375 		} while (ret == LZMA_OK);
376 
377 		// If the decoding seems to be successful, check also that
378 		// the Index decoder consumed as much input as indicated
379 		// by the Backward Size field.
380 		if (ret == LZMA_STREAM_END)
381 			if (index_size != 0 || strm.avail_in != 0)
382 				ret = LZMA_DATA_ERROR;
383 
384 		if (ret != LZMA_STREAM_END) {
385 			// LZMA_BUFFER_ERROR means that the Index decoder
386 			// would have liked more input than what the Index
387 			// size should be according to Stream Footer.
388 			// The message for LZMA_DATA_ERROR makes more
389 			// sense in that case.
390 			if (ret == LZMA_BUF_ERROR)
391 				ret = LZMA_DATA_ERROR;
392 
393 			goto error;
394 		}
395 
396 		// Decode the Stream Header and check that its Stream Flags
397 		// match the Stream Footer.
398 		pos -= footer_flags.backward_size + LZMA_STREAM_HEADER_SIZE;
399 		if ((lzma_vli)(pos) < lzma_index_total_size(this_index)) {
400 			goto error;
401 		}
402 
403 		pos -= lzma_index_total_size(this_index);
404 		if (io_pread(src_fd, &buf, LZMA_STREAM_HEADER_SIZE, pos))
405 			goto error;
406 
407 		ret = lzma_stream_header_decode(&header_flags, buf.u8);
408 		if (ret != LZMA_OK) {
409 			goto error;
410 		}
411 
412 		ret = lzma_stream_flags_compare(&header_flags, &footer_flags);
413 		if (ret != LZMA_OK) {
414 			goto error;
415 		}
416 
417 		// Store the decoded Stream Flags into this_index. This is
418 		// needed so that we can print which Check is used in each
419 		// Stream.
420 		ret = lzma_index_stream_flags(this_index, &footer_flags);
421 		if (ret != LZMA_OK)
422 			goto error;
423 
424 		// Store also the size of the Stream Padding field. It is
425 		// needed to show the offsets of the Streams correctly.
426 		ret = lzma_index_stream_padding(this_index, stream_padding);
427 		if (ret != LZMA_OK)
428 			goto error;
429 
430 		if (combined_index != NULL) {
431 			// Append the earlier decoded Indexes
432 			// after this_index.
433 			ret = lzma_index_cat(
434 					this_index, combined_index, NULL);
435 			if (ret != LZMA_OK) {
436 				goto error;
437 			}
438 		}
439 
440 		combined_index = this_index;
441 		this_index = NULL;
442 
443 		xfi->stream_padding += stream_padding;
444 
445 	} while (pos > 0);
446 
447 	lzma_end(&strm);
448 
449 	// All OK. Make combined_index available to the caller.
450 	xfi->idx = combined_index;
451 	return false;
452 
453 error:
454 	// Something went wrong, free the allocated memory.
455 	lzma_end(&strm);
456 	lzma_index_end(combined_index, NULL);
457 	lzma_index_end(this_index, NULL);
458 	return true;
459 }
460 
/***************** end of copy from list.c *************************/
462 
463 /*
464  * Small wrapper to extract total length of a file
465  */
466 off_t
467 unxz_len(int fd)
468 {
469 	xz_file_info xfi = XZ_FILE_INFO_INIT;
470 	if (!parse_indexes(&xfi, fd)) {
471 		off_t res = lzma_index_uncompressed_size(xfi.idx);
472 		lzma_index_end(xfi.idx, NULL);
473 		return res;
474 	}
475 	return 0;
476 }
477 
478