1 /* $NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $ */
2
3 /*-
4 * Copyright (c) 2018 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Christos Zoulas.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /* Lzd - Educational decompressor for the lzip format
33 Copyright (C) 2013-2018 Antonio Diaz Diaz.
34
35 This program is free software. Redistribution and use in source and
36 binary forms, with or without modification, are permitted provided
37 that the following conditions are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 This program is distributed in the hope that it will be useful,
47 but WITHOUT ANY WARRANTY; without even the implied warranty of
48 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
49 */
50
51 #include <sys/param.h>
52 #include <stdio.h>
53 #include <string.h>
54 #include <stdlib.h>
55 #include <stdint.h>
56 #include <stdbool.h>
57 #include <errno.h>
58 #include <unistd.h>
59
60 #define LZ_STATES 12
61
62 #define LITERAL_CONTEXT_BITS 3
63 #define POS_STATE_BITS 2
64 #define POS_STATES (1 << POS_STATE_BITS)
65 #define POS_STATE_MASK (POS_STATES - 1)
66
67 #define STATES 4
68 #define DIS_SLOT_BITS 6
69
70 #define DIS_MODEL_START 4
71 #define DIS_MODEL_END 14
72
73 #define MODELED_DISTANCES (1 << (DIS_MODEL_END / 2))
74 #define DIS_ALIGN_BITS 4
75 #define DIS_ALIGN_SIZE (1 << DIS_ALIGN_BITS)
76
77 #define LOW_BITS 3
78 #define MID_BITS 3
79 #define HIGH_BITS 8
80
81 #define LOW_SYMBOLS (1 << LOW_BITS)
82 #define MID_SYMBOLS (1 << MID_BITS)
83 #define HIGH_SYMBOLS (1 << HIGH_BITS)
84
85 #define MAX_SYMBOLS (LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS)
86
87 #define MIN_MATCH_LEN 2
88
89 #define BIT_MODEL_MOVE_BITS 5
90 #define BIT_MODEL_TOTAL_BITS 11
91 #define BIT_MODEL_TOTAL (1 << BIT_MODEL_TOTAL_BITS)
92 #define BIT_MODEL_INIT (BIT_MODEL_TOTAL / 2)
93
94 static const int lz_st_next[] = {
95 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5,
96 };
97
98 static bool
lz_st_is_char(int st)99 lz_st_is_char(int st) {
100 return st < 7;
101 }
102
103 static int
lz_st_get_char(int st)104 lz_st_get_char(int st) {
105 return lz_st_next[st];
106 }
107
108 static int
lz_st_get_match(int st)109 lz_st_get_match(int st) {
110 return st < 7 ? 7 : 10;
111 }
112
113 static int
lz_st_get_rep(int st)114 lz_st_get_rep(int st) {
115 return st < 7 ? 8 : 11;
116 }
117
118 static int
lz_st_get_short_rep(int st)119 lz_st_get_short_rep(int st) {
120 return st < 7 ? 9 : 11;
121 }
122
123 struct lz_len_model {
124 int choice1;
125 int choice2;
126 int bm_low[POS_STATES][LOW_SYMBOLS];
127 int bm_mid[POS_STATES][MID_SYMBOLS];
128 int bm_high[HIGH_SYMBOLS];
129 };
130
131 static uint32_t lz_crc[256];
132
133 static void
lz_crc_init(void)134 lz_crc_init(void)
135 {
136 for (unsigned i = 0; i < nitems(lz_crc); i++) {
137 unsigned c = i;
138 for (unsigned j = 0; j < 8; j++) {
139 if (c & 1)
140 c = 0xEDB88320U ^ (c >> 1);
141 else
142 c >>= 1;
143 }
144 lz_crc[i] = c;
145 }
146 }
147
148 static void
lz_crc_update(uint32_t * crc,const uint8_t * buf,size_t len)149 lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len)
150 {
151 for (size_t i = 0; i < len; i++)
152 *crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8);
153 }
154
155 struct lz_range_decoder {
156 FILE *fp;
157 uint32_t code;
158 uint32_t range;
159 };
160
161 static int
lz_rd_create(struct lz_range_decoder * rd,FILE * fp)162 lz_rd_create(struct lz_range_decoder *rd, FILE *fp)
163 {
164 rd->fp = fp;
165 rd->code = 0;
166 rd->range = ~0;
167 for (int i = 0; i < 5; i++)
168 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
169 return ferror(rd->fp) ? -1 : 0;
170 }
171
172 static unsigned
lz_rd_decode(struct lz_range_decoder * rd,int num_bits)173 lz_rd_decode(struct lz_range_decoder *rd, int num_bits)
174 {
175 unsigned symbol = 0;
176
177 for (int i = num_bits; i > 0; i--) {
178 rd->range >>= 1;
179 symbol <<= 1;
180 if (rd->code >= rd->range) {
181 rd->code -= rd->range;
182 symbol |= 1;
183 }
184 if (rd->range <= 0x00FFFFFFU) {
185 rd->range <<= 8;
186 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
187 }
188 }
189
190 return symbol;
191 }
192
193 static unsigned
lz_rd_decode_bit(struct lz_range_decoder * rd,int * bm)194 lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm)
195 {
196 unsigned symbol;
197 const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm;
198
199 if(rd->code < bound) {
200 rd->range = bound;
201 *bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS;
202 symbol = 0;
203 }
204 else {
205 rd->range -= bound;
206 rd->code -= bound;
207 *bm -= *bm >> BIT_MODEL_MOVE_BITS;
208 symbol = 1;
209 }
210
211 if (rd->range <= 0x00FFFFFFU) {
212 rd->range <<= 8;
213 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
214 }
215 return symbol;
216 }
217
218 static unsigned
lz_rd_decode_tree(struct lz_range_decoder * rd,int * bm,int num_bits)219 lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits)
220 {
221 unsigned symbol = 1;
222
223 for (int i = 0; i < num_bits; i++)
224 symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]);
225
226 return symbol - (1 << num_bits);
227 }
228
229 static unsigned
lz_rd_decode_tree_reversed(struct lz_range_decoder * rd,int * bm,int num_bits)230 lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits)
231 {
232 unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits);
233 unsigned reversed_symbol = 0;
234
235 for (int i = 0; i < num_bits; i++) {
236 reversed_symbol = (reversed_symbol << 1) | (symbol & 1);
237 symbol >>= 1;
238 }
239
240 return reversed_symbol;
241 }
242
243 static unsigned
lz_rd_decode_matched(struct lz_range_decoder * rd,int * bm,int match_byte)244 lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte)
245 {
246 unsigned symbol = 1;
247
248 for (int i = 7; i >= 0; i--) {
249 const unsigned match_bit = (match_byte >> i) & 1;
250 const unsigned bit = lz_rd_decode_bit(rd,
251 &bm[symbol + (match_bit << 8) + 0x100]);
252 symbol = (symbol << 1) | bit;
253 if (match_bit != bit) {
254 while (symbol < 0x100) {
255 symbol = (symbol << 1) |
256 lz_rd_decode_bit(rd, &bm[symbol]);
257 }
258 break;
259 }
260 }
261 return symbol & 0xFF;
262 }
263
264 static unsigned
lz_rd_decode_len(struct lz_range_decoder * rd,struct lz_len_model * lm,int pos_state)265 lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm,
266 int pos_state)
267 {
268 if (lz_rd_decode_bit(rd, &lm->choice1) == 0)
269 return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS);
270
271 if (lz_rd_decode_bit(rd, &lm->choice2) == 0) {
272 return LOW_SYMBOLS +
273 lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS);
274 }
275
276 return LOW_SYMBOLS + MID_SYMBOLS +
277 lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS);
278 }
279
280 struct lz_decoder {
281 FILE *fin, *fout;
282 off_t pos, ppos, spos, dict_size;
283 bool wrapped;
284 uint32_t crc;
285 uint8_t *obuf;
286 struct lz_range_decoder rdec;
287 };
288
289 static int
lz_flush(struct lz_decoder * lz)290 lz_flush(struct lz_decoder *lz)
291 {
292 off_t offs = lz->pos - lz->spos;
293 if (offs <= 0)
294 return -1;
295
296 size_t size = (size_t)offs;
297 lz_crc_update(&lz->crc, lz->obuf + lz->spos, size);
298 if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size)
299 return -1;
300
301 lz->wrapped = lz->pos >= lz->dict_size;
302 if (lz->wrapped) {
303 lz->ppos += lz->pos;
304 lz->pos = 0;
305 }
306 lz->spos = lz->pos;
307 return 0;
308 }
309
310 static void
lz_destroy(struct lz_decoder * lz)311 lz_destroy(struct lz_decoder *lz)
312 {
313 if (lz->fin)
314 fclose(lz->fin);
315 if (lz->fout)
316 fclose(lz->fout);
317 free(lz->obuf);
318 }
319
320 static int
lz_create(struct lz_decoder * lz,int fin,int fdout,int dict_size)321 lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size)
322 {
323 memset(lz, 0, sizeof(*lz));
324
325 lz->fin = fdopen(dup(fin), "r");
326 if (lz->fin == NULL)
327 goto out;
328
329 lz->fout = fdopen(dup(fdout), "w");
330 if (lz->fout == NULL)
331 goto out;
332
333 lz->pos = lz->ppos = lz->spos = 0;
334 lz->crc = ~0;
335 lz->dict_size = dict_size;
336 lz->wrapped = false;
337
338 lz->obuf = malloc(dict_size);
339 if (lz->obuf == NULL)
340 goto out;
341
342 if (lz_rd_create(&lz->rdec, lz->fin) == -1)
343 goto out;
344 return 0;
345 out:
346 lz_destroy(lz);
347 return -1;
348 }
349
350 static uint8_t
lz_peek(const struct lz_decoder * lz,unsigned ahead)351 lz_peek(const struct lz_decoder *lz, unsigned ahead)
352 {
353 off_t diff = lz->pos - ahead - 1;
354
355 if (diff >= 0)
356 return lz->obuf[diff];
357
358 if (lz->wrapped)
359 return lz->obuf[lz->dict_size + diff];
360
361 return 0;
362 }
363
364 static void
lz_put(struct lz_decoder * lz,uint8_t b)365 lz_put(struct lz_decoder *lz, uint8_t b)
366 {
367 lz->obuf[lz->pos++] = b;
368 if (lz->dict_size == lz->pos)
369 lz_flush(lz);
370 }
371
372 static off_t
lz_get_data_position(const struct lz_decoder * lz)373 lz_get_data_position(const struct lz_decoder *lz)
374 {
375 return lz->ppos + lz->pos;
376 }
377
378 static unsigned
lz_get_crc(const struct lz_decoder * lz)379 lz_get_crc(const struct lz_decoder *lz)
380 {
381 return lz->crc ^ 0xffffffffU;
382 }
383
384 static void
lz_bm_init(int * a,size_t l)385 lz_bm_init(int *a, size_t l)
386 {
387 for (size_t i = 0; i < l; i++)
388 a[i] = BIT_MODEL_INIT;
389 }
390
391 #define LZ_BM_INIT(a) lz_bm_init(a, nitems(a))
392 #define LZ_BM_INIT2(a) do { \
393 size_t l = nitems(a[0]); \
394 for (size_t i = 0; i < nitems(a); i++) \
395 lz_bm_init(a[i], l); \
396 } while (/*CONSTCOND*/0)
397
398 #define LZ_MODEL_INIT(a) do { \
399 a.choice1 = BIT_MODEL_INIT; \
400 a.choice2 = BIT_MODEL_INIT; \
401 LZ_BM_INIT2(a.bm_low); \
402 LZ_BM_INIT2(a.bm_mid); \
403 LZ_BM_INIT(a.bm_high); \
404 } while (/*CONSTCOND*/0)
405
406 static bool
lz_decode_member(struct lz_decoder * lz)407 lz_decode_member(struct lz_decoder *lz)
408 {
409 int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300];
410 int bm_match[LZ_STATES][POS_STATES];
411 int bm_rep[4][LZ_STATES];
412 int bm_len[LZ_STATES][POS_STATES];
413 int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS];
414 int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1];
415 int bm_align[DIS_ALIGN_SIZE];
416
417 LZ_BM_INIT2(bm_literal);
418 LZ_BM_INIT2(bm_match);
419 LZ_BM_INIT2(bm_rep);
420 LZ_BM_INIT2(bm_len);
421 LZ_BM_INIT2(bm_dis_slot);
422 LZ_BM_INIT(bm_dis);
423 LZ_BM_INIT(bm_align);
424
425 struct lz_len_model match_len_model;
426 struct lz_len_model rep_len_model;
427
428 LZ_MODEL_INIT(match_len_model);
429 LZ_MODEL_INIT(rep_len_model);
430
431 struct lz_range_decoder *rd = &lz->rdec;
432 unsigned rep[4] = { 0 };
433
434
435 int state = 0;
436
437 while (!feof(lz->fin) && !ferror(lz->fin)) {
438 const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK;
439 // bit 1
440 if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) {
441 const uint8_t prev_byte = lz_peek(lz, 0);
442 const int literal_state =
443 prev_byte >> (8 - LITERAL_CONTEXT_BITS);
444 int *bm = bm_literal[literal_state];
445 if (lz_st_is_char(state))
446 lz_put(lz, lz_rd_decode_tree(rd, bm, 8));
447 else {
448 int peek = lz_peek(lz, rep[0]);
449 lz_put(lz, lz_rd_decode_matched(rd, bm, peek));
450 }
451 state = lz_st_get_char(state);
452 continue;
453 }
454 int len;
455 // bit 2
456 if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) {
457 // bit 3
458 if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) {
459 // bit 4
460 if (lz_rd_decode_bit(rd,
461 &bm_len[state][pos_state]) == 0)
462 {
463 state = lz_st_get_short_rep(state);
464 lz_put(lz, lz_peek(lz, rep[0]));
465 continue;
466 }
467 } else {
468 unsigned distance;
469 // bit 4
470 if (lz_rd_decode_bit(rd, &bm_rep[2][state])
471 == 0)
472 distance = rep[1];
473 else {
474 // bit 5
475 if (lz_rd_decode_bit(rd,
476 &bm_rep[3][state]) == 0)
477 distance = rep[2];
478 else {
479 distance = rep[3];
480 rep[3] = rep[2];
481 }
482 rep[2] = rep[1];
483 }
484 rep[1] = rep[0];
485 rep[0] = distance;
486 }
487 state = lz_st_get_rep(state);
488 len = MIN_MATCH_LEN +
489 lz_rd_decode_len(rd, &rep_len_model, pos_state);
490 } else {
491 rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0];
492 len = MIN_MATCH_LEN +
493 lz_rd_decode_len(rd, &match_len_model, pos_state);
494 const int len_state =
495 MIN(len - MIN_MATCH_LEN, STATES - 1);
496 rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state],
497 DIS_SLOT_BITS);
498 if (rep[0] >= DIS_MODEL_START) {
499 const unsigned dis_slot = rep[0];
500 const int direct_bits = (dis_slot >> 1) - 1;
501 rep[0] = (2 | (dis_slot & 1)) << direct_bits;
502 if (dis_slot < DIS_MODEL_END)
503 rep[0] += lz_rd_decode_tree_reversed(rd,
504 &bm_dis[rep[0] - dis_slot],
505 direct_bits);
506 else {
507 rep[0] += lz_rd_decode(rd, direct_bits
508 - DIS_ALIGN_BITS) << DIS_ALIGN_BITS;
509 rep[0] += lz_rd_decode_tree_reversed(rd,
510 bm_align, DIS_ALIGN_BITS);
511 if (rep[0] == 0xFFFFFFFFU) {
512 lz_flush(lz);
513 return len == MIN_MATCH_LEN;
514 }
515 }
516 }
517 state = lz_st_get_match(state);
518 if (rep[0] >= lz->dict_size ||
519 (rep[0] >= lz->pos && !lz->wrapped)) {
520 lz_flush(lz);
521 return false;
522 }
523 }
524 for (int i = 0; i < len; i++)
525 lz_put(lz, lz_peek(lz, rep[0]));
526 }
527 lz_flush(lz);
528 return false;
529 }
530
531 /*
532 * 0-3 CRC32 of the uncompressed data
533 * 4-11 size of the uncompressed data
534 * 12-19 member size including header and trailer
535 */
536 #define TRAILER_SIZE 20
537
538
539 static off_t
lz_decode(int fin,int fdout,unsigned dict_size,off_t * insize)540 lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize)
541 {
542 struct lz_decoder lz;
543 off_t rv = -1;
544
545 if (lz_create(&lz, fin, fdout, dict_size) == -1)
546 return -1;
547
548 if (!lz_decode_member(&lz))
549 goto out;
550
551 uint8_t trailer[TRAILER_SIZE];
552
553 for(size_t i = 0; i < nitems(trailer); i++)
554 trailer[i] = (uint8_t)getc(lz.fin);
555
556 unsigned crc = 0;
557 for (int i = 3; i >= 0; --i) {
558 crc <<= 8;
559 crc += trailer[i];
560 }
561
562 int64_t data_size = 0;
563 for (int i = 11; i >= 4; --i) {
564 data_size <<= 8;
565 data_size += trailer[i];
566 }
567
568 if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz))
569 goto out;
570
571 rv = 0;
572 for (int i = 19; i >= 12; --i) {
573 rv <<= 8;
574 rv += trailer[i];
575 }
576 if (insize)
577 *insize = rv;
578 #if 0
579 /* Does not work with pipes */
580 rv = ftello(lz.fout);
581 #else
582 rv = data_size;
583 #endif
584 out:
585 lz_destroy(&lz);
586 return rv;
587 }
588
589
590 /*
591 * 0-3 magic
592 * 4 version
593 * 5 coded dict_size
594 */
595 #define HDR_SIZE 6
596 #define MIN_DICTIONARY_SIZE (1 << 12)
597 #define MAX_DICTIONARY_SIZE (1 << 29)
598
599 static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 };
600
601 static unsigned
lz_get_dict_size(unsigned char c)602 lz_get_dict_size(unsigned char c)
603 {
604 unsigned dict_size = 1 << (c & 0x1f);
605 dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7);
606 if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE)
607 return 0;
608 return dict_size;
609 }
610
611 static off_t
unlz(int fin,int fout,char * pre,size_t prelen,off_t * bytes_in)612 unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in)
613 {
614 if (lz_crc[0] == 0)
615 lz_crc_init();
616
617 char header[HDR_SIZE];
618
619 if (pre && prelen)
620 memcpy(header, pre, prelen);
621
622 ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen);
623 switch (nr) {
624 case -1:
625 return -1;
626 case 0:
627 return prelen ? -1 : 0;
628 default:
629 if ((size_t)nr != sizeof(header) - prelen)
630 return -1;
631 break;
632 }
633
634 if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0)
635 return -1;
636
637 unsigned dict_size = lz_get_dict_size(header[5]);
638 if (dict_size == 0)
639 return -1;
640
641 return lz_decode(fin, fout, dict_size, bytes_in);
642 }
643