xref: /freebsd/usr.bin/gzip/unlz.c (revision e12ff891366cf94db4bfe4c2c810b26a5531053d)
1 /*	$NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2018 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Christos Zoulas.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  *
31  * $FreeBSD$
32  */
33 
34 /*  Lzd - Educational decompressor for the lzip format
35     Copyright (C) 2013-2018 Antonio Diaz Diaz.
36 
37     This program is free software. Redistribution and use in source and
38     binary forms, with or without modification, are permitted provided
39     that the following conditions are met:
40 
41     1. Redistributions of source code must retain the above copyright
42     notice, this list of conditions and the following disclaimer.
43 
44     2. Redistributions in binary form must reproduce the above copyright
45     notice, this list of conditions and the following disclaimer in the
46     documentation and/or other materials provided with the distribution.
47 
48     This program is distributed in the hope that it will be useful,
49     but WITHOUT ANY WARRANTY; without even the implied warranty of
50     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
51 */
52 
53 #include <sys/param.h>
54 #include <stdio.h>
55 #include <string.h>
56 #include <stdlib.h>
57 #include <stdint.h>
58 #include <stdbool.h>
59 #include <errno.h>
60 #include <unistd.h>
61 
62 #define LZ_STATES		12
63 
64 #define LITERAL_CONTEXT_BITS	3
65 #define POS_STATE_BITS		2
66 #define POS_STATES		(1 << POS_STATE_BITS)
67 #define POS_STATE_MASK 		(POS_STATES - 1)
68 
69 #define STATES			4
70 #define DIS_SLOT_BITS		6
71 
72 #define DIS_MODEL_START		4
73 #define DIS_MODEL_END		14
74 
75 #define MODELED_DISTANCES	(1 << (DIS_MODEL_END / 2))
76 #define DIS_ALIGN_BITS		4
77 #define DIS_ALIGN_SIZE		(1 << DIS_ALIGN_BITS)
78 
79 #define LOW_BITS		3
80 #define MID_BITS		3
81 #define HIGH_BITS		8
82 
83 #define LOW_SYMBOLS		(1 << LOW_BITS)
84 #define MID_SYMBOLS		(1 << MID_BITS)
85 #define HIGH_SYMBOLS		(1 << HIGH_BITS)
86 
87 #define MAX_SYMBOLS 		(LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS)
88 
89 #define MIN_MATCH_LEN		2
90 
91 #define BIT_MODEL_MOVE_BITS	5
92 #define BIT_MODEL_TOTAL_BITS 	11
93 #define BIT_MODEL_TOTAL 	(1 << BIT_MODEL_TOTAL_BITS)
94 #define BIT_MODEL_INIT		(BIT_MODEL_TOTAL / 2)
95 
96 static const int lz_st_next[] = {
97 	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5,
98 };
99 
100 static bool
101 lz_st_is_char(int st) {
102 	return st < 7;
103 }
104 
105 static int
106 lz_st_get_char(int st) {
107 	return lz_st_next[st];
108 }
109 
110 static int
111 lz_st_get_match(int st) {
112 	return st < 7 ? 7 : 10;
113 }
114 
115 static int
116 lz_st_get_rep(int st) {
117 	return st < 7 ? 8 : 11;
118 }
119 
120 static int
121 lz_st_get_short_rep(int st) {
122 	return st < 7 ? 9 : 11;
123 }
124 
125 struct lz_len_model {
126 	int choice1;
127 	int choice2;
128 	int bm_low[POS_STATES][LOW_SYMBOLS];
129 	int bm_mid[POS_STATES][MID_SYMBOLS];
130 	int bm_high[HIGH_SYMBOLS];
131 };
132 
133 static uint32_t lz_crc[256];
134 
135 static void
136 lz_crc_init(void)
137 {
138 	for (unsigned i = 0; i < nitems(lz_crc); i++) {
139 		unsigned c = i;
140 		for (unsigned j = 0; j < 8; j++) {
141 			if (c & 1)
142 				c = 0xEDB88320U ^ (c >> 1);
143 			else
144 				c >>= 1;
145 		}
146 		lz_crc[i] = c;
147       }
148 }
149 
150 static void
151 lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len)
152 {
153 	for (size_t i = 0; i < len; i++)
154 		*crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8);
155 }
156 
157 struct lz_range_decoder {
158 	FILE *fp;
159 	uint32_t code;
160 	uint32_t range;
161 };
162 
163 static int
164 lz_rd_create(struct lz_range_decoder *rd, FILE *fp)
165 {
166 	rd->fp = fp;
167 	rd->code = 0;
168 	rd->range = ~0;
169 	for (int i = 0; i < 5; i++)
170 		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
171 	return ferror(rd->fp) ? -1 : 0;
172 }
173 
174 static unsigned
175 lz_rd_decode(struct lz_range_decoder *rd, int num_bits)
176 {
177 	unsigned symbol = 0;
178 
179 	for (int i = num_bits; i > 0; i--) {
180 		rd->range >>= 1;
181 		symbol <<= 1;
182 		if (rd->code >= rd->range) {
183 			rd->code -= rd->range;
184 			symbol |= 1;
185 		}
186 		if (rd->range <= 0x00FFFFFFU) {
187 			rd->range <<= 8;
188 			rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
189 		}
190 	}
191 
192 	return symbol;
193 }
194 
195 static unsigned
196 lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm)
197 {
198 	unsigned symbol;
199 	const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm;
200 
201 	if(rd->code < bound) {
202 		rd->range = bound;
203 		*bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS;
204 		symbol = 0;
205 	}
206 	else {
207 		rd->range -= bound;
208 		rd->code -= bound;
209 		*bm -= *bm >> BIT_MODEL_MOVE_BITS;
210 		symbol = 1;
211 	}
212 
213 	if (rd->range <= 0x00FFFFFFU) {
214 		rd->range <<= 8;
215 		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
216 	}
217 	return symbol;
218 }
219 
220 static unsigned
221 lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits)
222 {
223 	unsigned symbol = 1;
224 
225 	for (int i = 0; i < num_bits; i++)
226 		symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]);
227 
228 	return symbol - (1 << num_bits);
229 }
230 
231 static unsigned
232 lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits)
233 {
234 	unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits);
235 	unsigned reversed_symbol = 0;
236 
237 	for (int i = 0; i < num_bits; i++) {
238 		reversed_symbol = (reversed_symbol << 1) | (symbol & 1);
239 		symbol >>= 1;
240 	}
241 
242 	return reversed_symbol;
243 }
244 
245 static unsigned
246 lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte)
247 {
248 	unsigned symbol = 1;
249 
250 	for (int i = 7; i >= 0; i--) {
251 		const unsigned match_bit = (match_byte >> i) & 1;
252 		const unsigned bit = lz_rd_decode_bit(rd,
253 		    &bm[symbol + (match_bit << 8) + 0x100]);
254 		symbol = (symbol << 1) | bit;
255 		if (match_bit != bit) {
256 			while (symbol < 0x100) {
257 				symbol = (symbol << 1) |
258 				    lz_rd_decode_bit(rd, &bm[symbol]);
259 			}
260 			break;
261 		}
262 	}
263 	return symbol & 0xFF;
264 }
265 
266 static unsigned
267 lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm,
268     int pos_state)
269 {
270 	if (lz_rd_decode_bit(rd, &lm->choice1) == 0)
271 		return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS);
272 
273 	if (lz_rd_decode_bit(rd, &lm->choice2) == 0) {
274 		return LOW_SYMBOLS +
275 		    lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS);
276 	}
277 
278 	return LOW_SYMBOLS + MID_SYMBOLS +
279            lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS);
280 }
281 
282 struct lz_decoder {
283 	FILE *fin, *fout;
284 	off_t pos, ppos, spos, dict_size;
285 	bool wrapped;
286 	uint32_t crc;
287 	uint8_t *obuf;
288 	struct lz_range_decoder rdec;
289 };
290 
291 static int
292 lz_flush(struct lz_decoder *lz)
293 {
294 	off_t offs = lz->pos - lz->spos;
295 	if (offs <= 0)
296 		return -1;
297 
298 	size_t size = (size_t)offs;
299 	lz_crc_update(&lz->crc, lz->obuf + lz->spos, size);
300 	if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size)
301 		return -1;
302 
303 	lz->wrapped = lz->pos >= lz->dict_size;
304 	if (lz->wrapped) {
305 		lz->ppos += lz->pos;
306 		lz->pos = 0;
307 	}
308 	lz->spos = lz->pos;
309 	return 0;
310 }
311 
312 static void
313 lz_destroy(struct lz_decoder *lz)
314 {
315 	if (lz->fin)
316 		fclose(lz->fin);
317 	if (lz->fout)
318 		fclose(lz->fout);
319 	free(lz->obuf);
320 }
321 
322 static int
323 lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size)
324 {
325 	memset(lz, 0, sizeof(*lz));
326 
327 	lz->fin = fdopen(dup(fin), "r");
328 	if (lz->fin == NULL)
329 		goto out;
330 
331 	lz->fout = fdopen(dup(fdout), "w");
332 	if (lz->fout == NULL)
333 		goto out;
334 
335 	lz->pos = lz->ppos = lz->spos = 0;
336 	lz->crc = ~0;
337 	lz->dict_size = dict_size;
338 	lz->wrapped = false;
339 
340 	lz->obuf = malloc(dict_size);
341 	if (lz->obuf == NULL)
342 		goto out;
343 
344 	if (lz_rd_create(&lz->rdec, lz->fin) == -1)
345 		goto out;
346 	return 0;
347 out:
348 	lz_destroy(lz);
349 	return -1;
350 }
351 
352 static uint8_t
353 lz_peek(const struct lz_decoder *lz, unsigned ahead)
354 {
355 	off_t diff = lz->pos - ahead - 1;
356 
357 	if (diff >= 0)
358 		return lz->obuf[diff];
359 
360 	if (lz->wrapped)
361 		return lz->obuf[lz->dict_size + diff];
362 
363 	return 0;
364 }
365 
366 static void
367 lz_put(struct lz_decoder *lz, uint8_t b)
368 {
369 	lz->obuf[lz->pos++] = b;
370 	if (lz->dict_size == lz->pos)
371 		lz_flush(lz);
372 }
373 
374 static off_t
375 lz_get_data_position(const struct lz_decoder *lz)
376 {
377 	return lz->ppos + lz->pos;
378 }
379 
380 static unsigned
381 lz_get_crc(const struct lz_decoder *lz)
382 {
383 	return lz->crc ^ 0xffffffffU;
384 }
385 
386 static void
387 lz_bm_init(int *a, size_t l)
388 {
389 	for (size_t i = 0; i < l; i++)
390 		a[i] = BIT_MODEL_INIT;
391 }
392 
393 #define LZ_BM_INIT(a)	lz_bm_init(a, nitems(a))
394 #define LZ_BM_INIT2(a)	do { \
395 	size_t l = nitems(a[0]); \
396 	for (size_t i = 0; i < nitems(a); i++) \
397 		lz_bm_init(a[i], l); \
398 } while (/*CONSTCOND*/0)
399 
400 #define LZ_MODEL_INIT(a) do { \
401 	a.choice1 = BIT_MODEL_INIT; \
402 	a.choice2 = BIT_MODEL_INIT; \
403 	LZ_BM_INIT2(a.bm_low); \
404 	LZ_BM_INIT2(a.bm_mid); \
405 	LZ_BM_INIT(a.bm_high); \
406 } while (/*CONSTCOND*/0)
407 
408 static bool
409 lz_decode_member(struct lz_decoder *lz)
410 {
411 	int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300];
412 	int bm_match[LZ_STATES][POS_STATES];
413 	int bm_rep[4][LZ_STATES];
414 	int bm_len[LZ_STATES][POS_STATES];
415 	int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS];
416 	int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1];
417 	int bm_align[DIS_ALIGN_SIZE];
418 
419 	LZ_BM_INIT2(bm_literal);
420 	LZ_BM_INIT2(bm_match);
421 	LZ_BM_INIT2(bm_rep);
422 	LZ_BM_INIT2(bm_len);
423 	LZ_BM_INIT2(bm_dis_slot);
424 	LZ_BM_INIT(bm_dis);
425 	LZ_BM_INIT(bm_align);
426 
427 	struct lz_len_model match_len_model;
428 	struct lz_len_model rep_len_model;
429 
430 	LZ_MODEL_INIT(match_len_model);
431 	LZ_MODEL_INIT(rep_len_model);
432 
433 	struct lz_range_decoder *rd = &lz->rdec;
434 	unsigned rep[4] = { 0 };
435 
436 
437 	int state = 0;
438 
439 	while (!feof(lz->fin) && !ferror(lz->fin)) {
440 		const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK;
441 		// bit 1
442 		if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) {
443 			const uint8_t prev_byte = lz_peek(lz, 0);
444 			const int literal_state =
445 			    prev_byte >> (8 - LITERAL_CONTEXT_BITS);
446 			int *bm = bm_literal[literal_state];
447 			if (lz_st_is_char(state))
448 				lz_put(lz, lz_rd_decode_tree(rd, bm, 8));
449 			else {
450 				int peek = lz_peek(lz, rep[0]);
451 				lz_put(lz, lz_rd_decode_matched(rd, bm, peek));
452 			}
453 			state = lz_st_get_char(state);
454 			continue;
455 		}
456 		int len;
457 		// bit 2
458 		if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) {
459 			// bit 3
460 			if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) {
461 				// bit 4
462 				if (lz_rd_decode_bit(rd,
463 				    &bm_len[state][pos_state]) == 0)
464 				{
465 					state = lz_st_get_short_rep(state);
466 					lz_put(lz, lz_peek(lz, rep[0]));
467 					continue;
468 				}
469 			} else {
470 				unsigned distance;
471 				// bit 4
472 				if (lz_rd_decode_bit(rd, &bm_rep[2][state])
473 				    == 0)
474 					distance = rep[1];
475 				else {
476 					// bit 5
477 					if (lz_rd_decode_bit(rd,
478 					    &bm_rep[3][state]) == 0)
479 						distance = rep[2];
480 					else {
481 						distance = rep[3];
482 						rep[3] = rep[2];
483 					}
484 					rep[2] = rep[1];
485 				}
486 				rep[1] = rep[0];
487 				rep[0] = distance;
488 			}
489 			state = lz_st_get_rep(state);
490 			len = MIN_MATCH_LEN +
491 			    lz_rd_decode_len(rd, &rep_len_model, pos_state);
492 		} else {
493 			rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0];
494 			len = MIN_MATCH_LEN +
495 			    lz_rd_decode_len(rd, &match_len_model, pos_state);
496 			const int len_state =
497 			    MIN(len - MIN_MATCH_LEN, STATES - 1);
498 			rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state],
499 			    DIS_SLOT_BITS);
500 			if (rep[0] >= DIS_MODEL_START) {
501 				const unsigned dis_slot = rep[0];
502 				const int direct_bits = (dis_slot >> 1) - 1;
503 			        rep[0] = (2 | (dis_slot & 1)) << direct_bits;
504 				if (dis_slot < DIS_MODEL_END)
505 					rep[0] += lz_rd_decode_tree_reversed(rd,
506 					    &bm_dis[rep[0] - dis_slot],
507                                             direct_bits);
508 				else {
509 					rep[0] += lz_rd_decode(rd, direct_bits
510 					    - DIS_ALIGN_BITS) << DIS_ALIGN_BITS;
511 					rep[0] += lz_rd_decode_tree_reversed(rd,
512 					    bm_align, DIS_ALIGN_BITS);
513 					if (rep[0] == 0xFFFFFFFFU) {
514 						lz_flush(lz);
515 						return len == MIN_MATCH_LEN;
516 					}
517 				}
518 			}
519 			state = lz_st_get_match(state);
520 			if (rep[0] >= lz->dict_size ||
521 			    (rep[0] >= lz->pos && !lz->wrapped)) {
522 				lz_flush(lz);
523 				return false;
524 			}
525 		}
526 		for (int i = 0; i < len; i++)
527 			lz_put(lz, lz_peek(lz, rep[0]));
528     	}
529 	lz_flush(lz);
530 	return false;
531 }
532 
533 /*
534  * 0-3	CRC32 of the uncompressed data
535  * 4-11 size of the uncompressed data
536  * 12-19 member size including header and trailer
537  */
538 #define TRAILER_SIZE 20
539 
540 
541 static off_t
542 lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize)
543 {
544 	struct lz_decoder lz;
545 	off_t rv = -1;
546 
547 	if (lz_create(&lz, fin, fdout, dict_size) == -1)
548 		return -1;
549 
550 	if (!lz_decode_member(&lz))
551 		goto out;
552 
553 	uint8_t trailer[TRAILER_SIZE];
554 
555 	for(size_t i = 0; i < nitems(trailer); i++)
556 		trailer[i] = (uint8_t)getc(lz.fin);
557 
558 	unsigned crc = 0;
559 	for (int i = 3; i >= 0; --i) {
560 		crc <<= 8;
561 		crc += trailer[i];
562 	}
563 
564 	int64_t data_size = 0;
565 	for (int i = 11; i >= 4; --i) {
566 		data_size <<= 8;
567 		data_size += trailer[i];
568 	}
569 
570 	if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz))
571 		goto out;
572 
573 	rv = 0;
574 	for (int i = 19; i >= 12; --i) {
575 		rv <<= 8;
576 		rv += trailer[i];
577 	}
578 	if (insize)
579 		*insize = rv;
580 #if 0
581 	/* Does not work with pipes */
582 	rv = ftello(lz.fout);
583 #else
584 	rv = data_size;
585 #endif
586 out:
587 	lz_destroy(&lz);
588 	return rv;
589 }
590 
591 
592 /*
593  * 0-3 magic
594  * 4 version
595  * 5 coded dict_size
596  */
597 #define HDR_SIZE 6
598 #define MIN_DICTIONARY_SIZE (1 << 12)
599 #define MAX_DICTIONARY_SIZE (1 << 29)
600 
601 static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 };
602 
603 static unsigned
604 lz_get_dict_size(unsigned char c)
605 {
606 	unsigned dict_size = 1 << (c & 0x1f);
607 	dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7);
608 	if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE)
609 		return 0;
610 	return dict_size;
611 }
612 
613 static off_t
614 unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in)
615 {
616 	if (lz_crc[0] == 0)
617 		lz_crc_init();
618 
619 	char header[HDR_SIZE];
620 
621 	if (prelen > sizeof(header))
622 		return -1;
623 	if (pre && prelen)
624 		memcpy(header, pre, prelen);
625 
626 	ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen);
627 	switch (nr) {
628 	case -1:
629 		return -1;
630 	case 0:
631 		return prelen ? -1 : 0;
632 	default:
633 		if ((size_t)nr != sizeof(header) - prelen)
634 			return -1;
635 		break;
636 	}
637 
638 	if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0)
639 		return -1;
640 
641 	unsigned dict_size = lz_get_dict_size(header[5]);
642 	if (dict_size == 0)
643 		return -1;
644 
645 	return lz_decode(fin, fout, dict_size, bytes_in);
646 }
647