xref: /freebsd/usr.bin/gzip/unlz.c (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 /*	$NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2018 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Christos Zoulas.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*  Lzd - Educational decompressor for the lzip format
33     Copyright (C) 2013-2018 Antonio Diaz Diaz.
34 
35     This program is free software. Redistribution and use in source and
36     binary forms, with or without modification, are permitted provided
37     that the following conditions are met:
38 
39     1. Redistributions of source code must retain the above copyright
40     notice, this list of conditions and the following disclaimer.
41 
42     2. Redistributions in binary form must reproduce the above copyright
43     notice, this list of conditions and the following disclaimer in the
44     documentation and/or other materials provided with the distribution.
45 
46     This program is distributed in the hope that it will be useful,
47     but WITHOUT ANY WARRANTY; without even the implied warranty of
48     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
49 */
50 
51 #include <sys/param.h>
52 #include <stdio.h>
53 #include <string.h>
54 #include <stdlib.h>
55 #include <stdint.h>
56 #include <stdbool.h>
57 #include <errno.h>
58 #include <unistd.h>
59 
60 #define LZ_STATES		12
61 
62 #define LITERAL_CONTEXT_BITS	3
63 #define POS_STATE_BITS		2
64 #define POS_STATES		(1 << POS_STATE_BITS)
65 #define POS_STATE_MASK 		(POS_STATES - 1)
66 
67 #define STATES			4
68 #define DIS_SLOT_BITS		6
69 
70 #define DIS_MODEL_START		4
71 #define DIS_MODEL_END		14
72 
73 #define MODELED_DISTANCES	(1 << (DIS_MODEL_END / 2))
74 #define DIS_ALIGN_BITS		4
75 #define DIS_ALIGN_SIZE		(1 << DIS_ALIGN_BITS)
76 
77 #define LOW_BITS		3
78 #define MID_BITS		3
79 #define HIGH_BITS		8
80 
81 #define LOW_SYMBOLS		(1 << LOW_BITS)
82 #define MID_SYMBOLS		(1 << MID_BITS)
83 #define HIGH_SYMBOLS		(1 << HIGH_BITS)
84 
85 #define MAX_SYMBOLS 		(LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS)
86 
87 #define MIN_MATCH_LEN		2
88 
89 #define BIT_MODEL_MOVE_BITS	5
90 #define BIT_MODEL_TOTAL_BITS 	11
91 #define BIT_MODEL_TOTAL 	(1 << BIT_MODEL_TOTAL_BITS)
92 #define BIT_MODEL_INIT		(BIT_MODEL_TOTAL / 2)
93 
94 static const int lz_st_next[] = {
95 	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5,
96 };
97 
98 static bool
99 lz_st_is_char(int st) {
100 	return st < 7;
101 }
102 
103 static int
104 lz_st_get_char(int st) {
105 	return lz_st_next[st];
106 }
107 
108 static int
109 lz_st_get_match(int st) {
110 	return st < 7 ? 7 : 10;
111 }
112 
113 static int
114 lz_st_get_rep(int st) {
115 	return st < 7 ? 8 : 11;
116 }
117 
118 static int
119 lz_st_get_short_rep(int st) {
120 	return st < 7 ? 9 : 11;
121 }
122 
123 struct lz_len_model {
124 	int choice1;
125 	int choice2;
126 	int bm_low[POS_STATES][LOW_SYMBOLS];
127 	int bm_mid[POS_STATES][MID_SYMBOLS];
128 	int bm_high[HIGH_SYMBOLS];
129 };
130 
131 static uint32_t lz_crc[256];
132 
133 static void
134 lz_crc_init(void)
135 {
136 	for (unsigned i = 0; i < nitems(lz_crc); i++) {
137 		unsigned c = i;
138 		for (unsigned j = 0; j < 8; j++) {
139 			if (c & 1)
140 				c = 0xEDB88320U ^ (c >> 1);
141 			else
142 				c >>= 1;
143 		}
144 		lz_crc[i] = c;
145       }
146 }
147 
148 static void
149 lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len)
150 {
151 	for (size_t i = 0; i < len; i++)
152 		*crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8);
153 }
154 
155 struct lz_range_decoder {
156 	FILE *fp;
157 	uint32_t code;
158 	uint32_t range;
159 };
160 
161 static int
162 lz_rd_create(struct lz_range_decoder *rd, FILE *fp)
163 {
164 	rd->fp = fp;
165 	rd->code = 0;
166 	rd->range = ~0;
167 	for (int i = 0; i < 5; i++)
168 		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
169 	return ferror(rd->fp) ? -1 : 0;
170 }
171 
172 static unsigned
173 lz_rd_decode(struct lz_range_decoder *rd, int num_bits)
174 {
175 	unsigned symbol = 0;
176 
177 	for (int i = num_bits; i > 0; i--) {
178 		rd->range >>= 1;
179 		symbol <<= 1;
180 		if (rd->code >= rd->range) {
181 			rd->code -= rd->range;
182 			symbol |= 1;
183 		}
184 		if (rd->range <= 0x00FFFFFFU) {
185 			rd->range <<= 8;
186 			rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
187 		}
188 	}
189 
190 	return symbol;
191 }
192 
193 static unsigned
194 lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm)
195 {
196 	unsigned symbol;
197 	const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm;
198 
199 	if(rd->code < bound) {
200 		rd->range = bound;
201 		*bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS;
202 		symbol = 0;
203 	}
204 	else {
205 		rd->range -= bound;
206 		rd->code -= bound;
207 		*bm -= *bm >> BIT_MODEL_MOVE_BITS;
208 		symbol = 1;
209 	}
210 
211 	if (rd->range <= 0x00FFFFFFU) {
212 		rd->range <<= 8;
213 		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
214 	}
215 	return symbol;
216 }
217 
218 static unsigned
219 lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits)
220 {
221 	unsigned symbol = 1;
222 
223 	for (int i = 0; i < num_bits; i++)
224 		symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]);
225 
226 	return symbol - (1 << num_bits);
227 }
228 
229 static unsigned
230 lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits)
231 {
232 	unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits);
233 	unsigned reversed_symbol = 0;
234 
235 	for (int i = 0; i < num_bits; i++) {
236 		reversed_symbol = (reversed_symbol << 1) | (symbol & 1);
237 		symbol >>= 1;
238 	}
239 
240 	return reversed_symbol;
241 }
242 
243 static unsigned
244 lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte)
245 {
246 	unsigned symbol = 1;
247 
248 	for (int i = 7; i >= 0; i--) {
249 		const unsigned match_bit = (match_byte >> i) & 1;
250 		const unsigned bit = lz_rd_decode_bit(rd,
251 		    &bm[symbol + (match_bit << 8) + 0x100]);
252 		symbol = (symbol << 1) | bit;
253 		if (match_bit != bit) {
254 			while (symbol < 0x100) {
255 				symbol = (symbol << 1) |
256 				    lz_rd_decode_bit(rd, &bm[symbol]);
257 			}
258 			break;
259 		}
260 	}
261 	return symbol & 0xFF;
262 }
263 
264 static unsigned
265 lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm,
266     int pos_state)
267 {
268 	if (lz_rd_decode_bit(rd, &lm->choice1) == 0)
269 		return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS);
270 
271 	if (lz_rd_decode_bit(rd, &lm->choice2) == 0) {
272 		return LOW_SYMBOLS +
273 		    lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS);
274 	}
275 
276 	return LOW_SYMBOLS + MID_SYMBOLS +
277            lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS);
278 }
279 
280 struct lz_decoder {
281 	FILE *fin, *fout;
282 	off_t pos, ppos, spos, dict_size;
283 	bool wrapped;
284 	uint32_t crc;
285 	uint8_t *obuf;
286 	struct lz_range_decoder rdec;
287 };
288 
289 static int
290 lz_flush(struct lz_decoder *lz)
291 {
292 	off_t offs = lz->pos - lz->spos;
293 	if (offs <= 0)
294 		return -1;
295 
296 	size_t size = (size_t)offs;
297 	lz_crc_update(&lz->crc, lz->obuf + lz->spos, size);
298 	if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size)
299 		return -1;
300 
301 	lz->wrapped = lz->pos >= lz->dict_size;
302 	if (lz->wrapped) {
303 		lz->ppos += lz->pos;
304 		lz->pos = 0;
305 	}
306 	lz->spos = lz->pos;
307 	return 0;
308 }
309 
310 static void
311 lz_destroy(struct lz_decoder *lz)
312 {
313 	if (lz->fin)
314 		fclose(lz->fin);
315 	if (lz->fout)
316 		fclose(lz->fout);
317 	free(lz->obuf);
318 }
319 
320 static int
321 lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size)
322 {
323 	memset(lz, 0, sizeof(*lz));
324 
325 	lz->fin = fdopen(dup(fin), "r");
326 	if (lz->fin == NULL)
327 		goto out;
328 
329 	lz->fout = fdopen(dup(fdout), "w");
330 	if (lz->fout == NULL)
331 		goto out;
332 
333 	lz->pos = lz->ppos = lz->spos = 0;
334 	lz->crc = ~0;
335 	lz->dict_size = dict_size;
336 	lz->wrapped = false;
337 
338 	lz->obuf = malloc(dict_size);
339 	if (lz->obuf == NULL)
340 		goto out;
341 
342 	if (lz_rd_create(&lz->rdec, lz->fin) == -1)
343 		goto out;
344 	return 0;
345 out:
346 	lz_destroy(lz);
347 	return -1;
348 }
349 
350 static uint8_t
351 lz_peek(const struct lz_decoder *lz, unsigned ahead)
352 {
353 	off_t diff = lz->pos - ahead - 1;
354 
355 	if (diff >= 0)
356 		return lz->obuf[diff];
357 
358 	if (lz->wrapped)
359 		return lz->obuf[lz->dict_size + diff];
360 
361 	return 0;
362 }
363 
364 static void
365 lz_put(struct lz_decoder *lz, uint8_t b)
366 {
367 	lz->obuf[lz->pos++] = b;
368 	if (lz->dict_size == lz->pos)
369 		lz_flush(lz);
370 }
371 
372 static off_t
373 lz_get_data_position(const struct lz_decoder *lz)
374 {
375 	return lz->ppos + lz->pos;
376 }
377 
378 static unsigned
379 lz_get_crc(const struct lz_decoder *lz)
380 {
381 	return lz->crc ^ 0xffffffffU;
382 }
383 
384 static void
385 lz_bm_init(int *a, size_t l)
386 {
387 	for (size_t i = 0; i < l; i++)
388 		a[i] = BIT_MODEL_INIT;
389 }
390 
391 #define LZ_BM_INIT(a)	lz_bm_init(a, nitems(a))
392 #define LZ_BM_INIT2(a)	do { \
393 	size_t l = nitems(a[0]); \
394 	for (size_t i = 0; i < nitems(a); i++) \
395 		lz_bm_init(a[i], l); \
396 } while (/*CONSTCOND*/0)
397 
398 #define LZ_MODEL_INIT(a) do { \
399 	a.choice1 = BIT_MODEL_INIT; \
400 	a.choice2 = BIT_MODEL_INIT; \
401 	LZ_BM_INIT2(a.bm_low); \
402 	LZ_BM_INIT2(a.bm_mid); \
403 	LZ_BM_INIT(a.bm_high); \
404 } while (/*CONSTCOND*/0)
405 
406 static bool
407 lz_decode_member(struct lz_decoder *lz)
408 {
409 	int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300];
410 	int bm_match[LZ_STATES][POS_STATES];
411 	int bm_rep[4][LZ_STATES];
412 	int bm_len[LZ_STATES][POS_STATES];
413 	int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS];
414 	int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1];
415 	int bm_align[DIS_ALIGN_SIZE];
416 
417 	LZ_BM_INIT2(bm_literal);
418 	LZ_BM_INIT2(bm_match);
419 	LZ_BM_INIT2(bm_rep);
420 	LZ_BM_INIT2(bm_len);
421 	LZ_BM_INIT2(bm_dis_slot);
422 	LZ_BM_INIT(bm_dis);
423 	LZ_BM_INIT(bm_align);
424 
425 	struct lz_len_model match_len_model;
426 	struct lz_len_model rep_len_model;
427 
428 	LZ_MODEL_INIT(match_len_model);
429 	LZ_MODEL_INIT(rep_len_model);
430 
431 	struct lz_range_decoder *rd = &lz->rdec;
432 	unsigned rep[4] = { 0 };
433 
434 
435 	int state = 0;
436 
437 	while (!feof(lz->fin) && !ferror(lz->fin)) {
438 		const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK;
439 		// bit 1
440 		if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) {
441 			const uint8_t prev_byte = lz_peek(lz, 0);
442 			const int literal_state =
443 			    prev_byte >> (8 - LITERAL_CONTEXT_BITS);
444 			int *bm = bm_literal[literal_state];
445 			if (lz_st_is_char(state))
446 				lz_put(lz, lz_rd_decode_tree(rd, bm, 8));
447 			else {
448 				int peek = lz_peek(lz, rep[0]);
449 				lz_put(lz, lz_rd_decode_matched(rd, bm, peek));
450 			}
451 			state = lz_st_get_char(state);
452 			continue;
453 		}
454 		int len;
455 		// bit 2
456 		if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) {
457 			// bit 3
458 			if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) {
459 				// bit 4
460 				if (lz_rd_decode_bit(rd,
461 				    &bm_len[state][pos_state]) == 0)
462 				{
463 					state = lz_st_get_short_rep(state);
464 					lz_put(lz, lz_peek(lz, rep[0]));
465 					continue;
466 				}
467 			} else {
468 				unsigned distance;
469 				// bit 4
470 				if (lz_rd_decode_bit(rd, &bm_rep[2][state])
471 				    == 0)
472 					distance = rep[1];
473 				else {
474 					// bit 5
475 					if (lz_rd_decode_bit(rd,
476 					    &bm_rep[3][state]) == 0)
477 						distance = rep[2];
478 					else {
479 						distance = rep[3];
480 						rep[3] = rep[2];
481 					}
482 					rep[2] = rep[1];
483 				}
484 				rep[1] = rep[0];
485 				rep[0] = distance;
486 			}
487 			state = lz_st_get_rep(state);
488 			len = MIN_MATCH_LEN +
489 			    lz_rd_decode_len(rd, &rep_len_model, pos_state);
490 		} else {
491 			rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0];
492 			len = MIN_MATCH_LEN +
493 			    lz_rd_decode_len(rd, &match_len_model, pos_state);
494 			const int len_state =
495 			    MIN(len - MIN_MATCH_LEN, STATES - 1);
496 			rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state],
497 			    DIS_SLOT_BITS);
498 			if (rep[0] >= DIS_MODEL_START) {
499 				const unsigned dis_slot = rep[0];
500 				const int direct_bits = (dis_slot >> 1) - 1;
501 			        rep[0] = (2 | (dis_slot & 1)) << direct_bits;
502 				if (dis_slot < DIS_MODEL_END)
503 					rep[0] += lz_rd_decode_tree_reversed(rd,
504 					    &bm_dis[rep[0] - dis_slot],
505                                             direct_bits);
506 				else {
507 					rep[0] += lz_rd_decode(rd, direct_bits
508 					    - DIS_ALIGN_BITS) << DIS_ALIGN_BITS;
509 					rep[0] += lz_rd_decode_tree_reversed(rd,
510 					    bm_align, DIS_ALIGN_BITS);
511 					if (rep[0] == 0xFFFFFFFFU) {
512 						lz_flush(lz);
513 						return len == MIN_MATCH_LEN;
514 					}
515 				}
516 			}
517 			state = lz_st_get_match(state);
518 			if (rep[0] >= lz->dict_size ||
519 			    (rep[0] >= lz->pos && !lz->wrapped)) {
520 				lz_flush(lz);
521 				return false;
522 			}
523 		}
524 		for (int i = 0; i < len; i++)
525 			lz_put(lz, lz_peek(lz, rep[0]));
526     	}
527 	lz_flush(lz);
528 	return false;
529 }
530 
531 /*
532  * 0-3	CRC32 of the uncompressed data
533  * 4-11 size of the uncompressed data
534  * 12-19 member size including header and trailer
535  */
536 #define TRAILER_SIZE 20
537 
538 
539 static off_t
540 lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize)
541 {
542 	struct lz_decoder lz;
543 	off_t rv = -1;
544 
545 	if (lz_create(&lz, fin, fdout, dict_size) == -1)
546 		return -1;
547 
548 	if (!lz_decode_member(&lz))
549 		goto out;
550 
551 	uint8_t trailer[TRAILER_SIZE];
552 
553 	for(size_t i = 0; i < nitems(trailer); i++)
554 		trailer[i] = (uint8_t)getc(lz.fin);
555 
556 	unsigned crc = 0;
557 	for (int i = 3; i >= 0; --i) {
558 		crc <<= 8;
559 		crc += trailer[i];
560 	}
561 
562 	int64_t data_size = 0;
563 	for (int i = 11; i >= 4; --i) {
564 		data_size <<= 8;
565 		data_size += trailer[i];
566 	}
567 
568 	if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz))
569 		goto out;
570 
571 	rv = 0;
572 	for (int i = 19; i >= 12; --i) {
573 		rv <<= 8;
574 		rv += trailer[i];
575 	}
576 	if (insize)
577 		*insize = rv;
578 #if 0
579 	/* Does not work with pipes */
580 	rv = ftello(lz.fout);
581 #else
582 	rv = data_size;
583 #endif
584 out:
585 	lz_destroy(&lz);
586 	return rv;
587 }
588 
589 
590 /*
591  * 0-3 magic
592  * 4 version
593  * 5 coded dict_size
594  */
595 #define HDR_SIZE 6
596 #define MIN_DICTIONARY_SIZE (1 << 12)
597 #define MAX_DICTIONARY_SIZE (1 << 29)
598 
599 static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 };
600 
601 static unsigned
602 lz_get_dict_size(unsigned char c)
603 {
604 	unsigned dict_size = 1 << (c & 0x1f);
605 	dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7);
606 	if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE)
607 		return 0;
608 	return dict_size;
609 }
610 
611 static off_t
612 unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in)
613 {
614 	if (lz_crc[0] == 0)
615 		lz_crc_init();
616 
617 	char header[HDR_SIZE];
618 
619 	if (pre && prelen)
620 		memcpy(header, pre, prelen);
621 
622 	ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen);
623 	switch (nr) {
624 	case -1:
625 		return -1;
626 	case 0:
627 		return prelen ? -1 : 0;
628 	default:
629 		if ((size_t)nr != sizeof(header) - prelen)
630 			return -1;
631 		break;
632 	}
633 
634 	if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0)
635 		return -1;
636 
637 	unsigned dict_size = lz_get_dict_size(header[5]);
638 	if (dict_size == 0)
639 		return -1;
640 
641 	return lz_decode(fin, fout, dict_size, bytes_in);
642 }
643