1 /* $NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2018 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Christos Zoulas. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 * 31 * $FreeBSD$ 32 */ 33 34 /* Lzd - Educational decompressor for the lzip format 35 Copyright (C) 2013-2018 Antonio Diaz Diaz. 36 37 This program is free software. Redistribution and use in source and 38 binary forms, with or without modification, are permitted provided 39 that the following conditions are met: 40 41 1. Redistributions of source code must retain the above copyright 42 notice, this list of conditions and the following disclaimer. 43 44 2. Redistributions in binary form must reproduce the above copyright 45 notice, this list of conditions and the following disclaimer in the 46 documentation and/or other materials provided with the distribution. 47 48 This program is distributed in the hope that it will be useful, 49 but WITHOUT ANY WARRANTY; without even the implied warranty of 50 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 51 */ 52 53 #include <sys/param.h> 54 #include <stdio.h> 55 #include <string.h> 56 #include <stdlib.h> 57 #include <stdint.h> 58 #include <stdbool.h> 59 #include <errno.h> 60 #include <unistd.h> 61 62 #define LZ_STATES 12 63 64 #define LITERAL_CONTEXT_BITS 3 65 #define POS_STATE_BITS 2 66 #define POS_STATES (1 << POS_STATE_BITS) 67 #define POS_STATE_MASK (POS_STATES - 1) 68 69 #define STATES 4 70 #define DIS_SLOT_BITS 6 71 72 #define DIS_MODEL_START 4 73 #define DIS_MODEL_END 14 74 75 #define MODELED_DISTANCES (1 << (DIS_MODEL_END / 2)) 76 #define DIS_ALIGN_BITS 4 77 #define DIS_ALIGN_SIZE (1 << DIS_ALIGN_BITS) 78 79 #define LOW_BITS 3 80 #define MID_BITS 3 81 #define HIGH_BITS 8 82 83 #define LOW_SYMBOLS (1 << LOW_BITS) 84 #define MID_SYMBOLS (1 << MID_BITS) 85 #define HIGH_SYMBOLS (1 << HIGH_BITS) 86 87 #define MAX_SYMBOLS (LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS) 88 89 #define MIN_MATCH_LEN 2 90 91 #define BIT_MODEL_MOVE_BITS 5 92 #define BIT_MODEL_TOTAL_BITS 11 93 #define BIT_MODEL_TOTAL (1 << BIT_MODEL_TOTAL_BITS) 94 #define BIT_MODEL_INIT (BIT_MODEL_TOTAL / 2) 95 96 static const int lz_st_next[] = { 97 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5, 98 }; 99 100 static bool 101 lz_st_is_char(int st) { 102 return st < 7; 103 } 104 105 static int 106 lz_st_get_char(int st) { 107 return lz_st_next[st]; 108 } 109 110 static int 111 lz_st_get_match(int st) { 112 return st < 7 ? 7 : 10; 113 } 114 115 static int 116 lz_st_get_rep(int st) { 117 return st < 7 ? 8 : 11; 118 } 119 120 static int 121 lz_st_get_short_rep(int st) { 122 return st < 7 ? 9 : 11; 123 } 124 125 struct lz_len_model { 126 int choice1; 127 int choice2; 128 int bm_low[POS_STATES][LOW_SYMBOLS]; 129 int bm_mid[POS_STATES][MID_SYMBOLS]; 130 int bm_high[HIGH_SYMBOLS]; 131 }; 132 133 static uint32_t lz_crc[256]; 134 135 static void 136 lz_crc_init(void) 137 { 138 for (unsigned i = 0; i < nitems(lz_crc); i++) { 139 unsigned c = i; 140 for (unsigned j = 0; j < 8; j++) { 141 if (c & 1) 142 c = 0xEDB88320U ^ (c >> 1); 143 else 144 c >>= 1; 145 } 146 lz_crc[i] = c; 147 } 148 } 149 150 static void 151 lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len) 152 { 153 for (size_t i = 0; i < len; i++) 154 *crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8); 155 } 156 157 struct lz_range_decoder { 158 FILE *fp; 159 uint32_t code; 160 uint32_t range; 161 }; 162 163 static int 164 lz_rd_create(struct lz_range_decoder *rd, FILE *fp) 165 { 166 rd->fp = fp; 167 rd->code = 0; 168 rd->range = ~0; 169 for (int i = 0; i < 5; i++) 170 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp); 171 return ferror(rd->fp) ? -1 : 0; 172 } 173 174 static unsigned 175 lz_rd_decode(struct lz_range_decoder *rd, int num_bits) 176 { 177 unsigned symbol = 0; 178 179 for (int i = num_bits; i > 0; i--) { 180 rd->range >>= 1; 181 symbol <<= 1; 182 if (rd->code >= rd->range) { 183 rd->code -= rd->range; 184 symbol |= 1; 185 } 186 if (rd->range <= 0x00FFFFFFU) { 187 rd->range <<= 8; 188 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp); 189 } 190 } 191 192 return symbol; 193 } 194 195 static unsigned 196 lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm) 197 { 198 unsigned symbol; 199 const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm; 200 201 if(rd->code < bound) { 202 rd->range = bound; 203 *bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS; 204 symbol = 0; 205 } 206 else { 207 rd->range -= bound; 208 rd->code -= bound; 209 *bm -= *bm >> BIT_MODEL_MOVE_BITS; 210 symbol = 1; 211 } 212 213 if (rd->range <= 0x00FFFFFFU) { 214 rd->range <<= 8; 215 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp); 216 } 217 return symbol; 218 } 219 220 static unsigned 221 lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits) 222 { 223 unsigned symbol = 1; 224 225 for (int i = 0; i < num_bits; i++) 226 symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]); 227 228 return symbol - (1 << num_bits); 229 } 230 231 static unsigned 232 lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits) 233 { 234 unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits); 235 unsigned reversed_symbol = 0; 236 237 for (int i = 0; i < num_bits; i++) { 238 reversed_symbol = (reversed_symbol << 1) | (symbol & 1); 239 symbol >>= 1; 240 } 241 242 return reversed_symbol; 243 } 244 245 static unsigned 246 lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte) 247 { 248 unsigned symbol = 1; 249 250 for (int i = 7; i >= 0; i--) { 251 const unsigned match_bit = (match_byte >> i) & 1; 252 const unsigned bit = lz_rd_decode_bit(rd, 253 &bm[symbol + (match_bit << 8) + 0x100]); 254 symbol = (symbol << 1) | bit; 255 if (match_bit != bit) { 256 while (symbol < 0x100) { 257 symbol = (symbol << 1) | 258 lz_rd_decode_bit(rd, &bm[symbol]); 259 } 260 break; 261 } 262 } 263 return symbol & 0xFF; 264 } 265 266 static unsigned 267 lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm, 268 int pos_state) 269 { 270 if (lz_rd_decode_bit(rd, &lm->choice1) == 0) 271 return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS); 272 273 if (lz_rd_decode_bit(rd, &lm->choice2) == 0) { 274 return LOW_SYMBOLS + 275 lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS); 276 } 277 278 return LOW_SYMBOLS + MID_SYMBOLS + 279 lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS); 280 } 281 282 struct lz_decoder { 283 FILE *fin, *fout; 284 off_t pos, ppos, spos, dict_size; 285 bool wrapped; 286 uint32_t crc; 287 uint8_t *obuf; 288 struct lz_range_decoder rdec; 289 }; 290 291 static int 292 lz_flush(struct lz_decoder *lz) 293 { 294 off_t offs = lz->pos - lz->spos; 295 if (offs <= 0) 296 return -1; 297 298 size_t size = (size_t)offs; 299 lz_crc_update(&lz->crc, lz->obuf + lz->spos, size); 300 if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size) 301 return -1; 302 303 lz->wrapped = lz->pos >= lz->dict_size; 304 if (lz->wrapped) { 305 lz->ppos += lz->pos; 306 lz->pos = 0; 307 } 308 lz->spos = lz->pos; 309 return 0; 310 } 311 312 static void 313 lz_destroy(struct lz_decoder *lz) 314 { 315 if (lz->fin) 316 fclose(lz->fin); 317 if (lz->fout) 318 fclose(lz->fout); 319 free(lz->obuf); 320 } 321 322 static int 323 lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size) 324 { 325 memset(lz, 0, sizeof(*lz)); 326 327 lz->fin = fdopen(dup(fin), "r"); 328 if (lz->fin == NULL) 329 goto out; 330 331 lz->fout = fdopen(dup(fdout), "w"); 332 if (lz->fout == NULL) 333 goto out; 334 335 lz->pos = lz->ppos = lz->spos = 0; 336 lz->crc = ~0; 337 lz->dict_size = dict_size; 338 lz->wrapped = false; 339 340 lz->obuf = malloc(dict_size); 341 if (lz->obuf == NULL) 342 goto out; 343 344 if (lz_rd_create(&lz->rdec, lz->fin) == -1) 345 goto out; 346 return 0; 347 out: 348 lz_destroy(lz); 349 return -1; 350 } 351 352 static uint8_t 353 lz_peek(const struct lz_decoder *lz, unsigned ahead) 354 { 355 off_t diff = lz->pos - ahead - 1; 356 357 if (diff >= 0) 358 return lz->obuf[diff]; 359 360 if (lz->wrapped) 361 return lz->obuf[lz->dict_size + diff]; 362 363 return 0; 364 } 365 366 static void 367 lz_put(struct lz_decoder *lz, uint8_t b) 368 { 369 lz->obuf[lz->pos++] = b; 370 if (lz->dict_size == lz->pos) 371 lz_flush(lz); 372 } 373 374 static off_t 375 lz_get_data_position(const struct lz_decoder *lz) 376 { 377 return lz->ppos + lz->pos; 378 } 379 380 static unsigned 381 lz_get_crc(const struct lz_decoder *lz) 382 { 383 return lz->crc ^ 0xffffffffU; 384 } 385 386 static void 387 lz_bm_init(int *a, size_t l) 388 { 389 for (size_t i = 0; i < l; i++) 390 a[i] = BIT_MODEL_INIT; 391 } 392 393 #define LZ_BM_INIT(a) lz_bm_init(a, nitems(a)) 394 #define LZ_BM_INIT2(a) do { \ 395 size_t l = nitems(a[0]); \ 396 for (size_t i = 0; i < nitems(a); i++) \ 397 lz_bm_init(a[i], l); \ 398 } while (/*CONSTCOND*/0) 399 400 #define LZ_MODEL_INIT(a) do { \ 401 a.choice1 = BIT_MODEL_INIT; \ 402 a.choice2 = BIT_MODEL_INIT; \ 403 LZ_BM_INIT2(a.bm_low); \ 404 LZ_BM_INIT2(a.bm_mid); \ 405 LZ_BM_INIT(a.bm_high); \ 406 } while (/*CONSTCOND*/0) 407 408 static bool 409 lz_decode_member(struct lz_decoder *lz) 410 { 411 int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300]; 412 int bm_match[LZ_STATES][POS_STATES]; 413 int bm_rep[4][LZ_STATES]; 414 int bm_len[LZ_STATES][POS_STATES]; 415 int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS]; 416 int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1]; 417 int bm_align[DIS_ALIGN_SIZE]; 418 419 LZ_BM_INIT2(bm_literal); 420 LZ_BM_INIT2(bm_match); 421 LZ_BM_INIT2(bm_rep); 422 LZ_BM_INIT2(bm_len); 423 LZ_BM_INIT2(bm_dis_slot); 424 LZ_BM_INIT(bm_dis); 425 LZ_BM_INIT(bm_align); 426 427 struct lz_len_model match_len_model; 428 struct lz_len_model rep_len_model; 429 430 LZ_MODEL_INIT(match_len_model); 431 LZ_MODEL_INIT(rep_len_model); 432 433 struct lz_range_decoder *rd = &lz->rdec; 434 unsigned rep[4] = { 0 }; 435 436 437 int state = 0; 438 439 while (!feof(lz->fin) && !ferror(lz->fin)) { 440 const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK; 441 // bit 1 442 if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) { 443 const uint8_t prev_byte = lz_peek(lz, 0); 444 const int literal_state = 445 prev_byte >> (8 - LITERAL_CONTEXT_BITS); 446 int *bm = bm_literal[literal_state]; 447 if (lz_st_is_char(state)) 448 lz_put(lz, lz_rd_decode_tree(rd, bm, 8)); 449 else { 450 int peek = lz_peek(lz, rep[0]); 451 lz_put(lz, lz_rd_decode_matched(rd, bm, peek)); 452 } 453 state = lz_st_get_char(state); 454 continue; 455 } 456 int len; 457 // bit 2 458 if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) { 459 // bit 3 460 if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) { 461 // bit 4 462 if (lz_rd_decode_bit(rd, 463 &bm_len[state][pos_state]) == 0) 464 { 465 state = lz_st_get_short_rep(state); 466 lz_put(lz, lz_peek(lz, rep[0])); 467 continue; 468 } 469 } else { 470 unsigned distance; 471 // bit 4 472 if (lz_rd_decode_bit(rd, &bm_rep[2][state]) 473 == 0) 474 distance = rep[1]; 475 else { 476 // bit 5 477 if (lz_rd_decode_bit(rd, 478 &bm_rep[3][state]) == 0) 479 distance = rep[2]; 480 else { 481 distance = rep[3]; 482 rep[3] = rep[2]; 483 } 484 rep[2] = rep[1]; 485 } 486 rep[1] = rep[0]; 487 rep[0] = distance; 488 } 489 state = lz_st_get_rep(state); 490 len = MIN_MATCH_LEN + 491 lz_rd_decode_len(rd, &rep_len_model, pos_state); 492 } else { 493 rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0]; 494 len = MIN_MATCH_LEN + 495 lz_rd_decode_len(rd, &match_len_model, pos_state); 496 const int len_state = 497 MIN(len - MIN_MATCH_LEN, STATES - 1); 498 rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state], 499 DIS_SLOT_BITS); 500 if (rep[0] >= DIS_MODEL_START) { 501 const unsigned dis_slot = rep[0]; 502 const int direct_bits = (dis_slot >> 1) - 1; 503 rep[0] = (2 | (dis_slot & 1)) << direct_bits; 504 if (dis_slot < DIS_MODEL_END) 505 rep[0] += lz_rd_decode_tree_reversed(rd, 506 &bm_dis[rep[0] - dis_slot], 507 direct_bits); 508 else { 509 rep[0] += lz_rd_decode(rd, direct_bits 510 - DIS_ALIGN_BITS) << DIS_ALIGN_BITS; 511 rep[0] += lz_rd_decode_tree_reversed(rd, 512 bm_align, DIS_ALIGN_BITS); 513 if (rep[0] == 0xFFFFFFFFU) { 514 lz_flush(lz); 515 return len == MIN_MATCH_LEN; 516 } 517 } 518 } 519 state = lz_st_get_match(state); 520 if (rep[0] >= lz->dict_size || 521 (rep[0] >= lz->pos && !lz->wrapped)) { 522 lz_flush(lz); 523 return false; 524 } 525 } 526 for (int i = 0; i < len; i++) 527 lz_put(lz, lz_peek(lz, rep[0])); 528 } 529 lz_flush(lz); 530 return false; 531 } 532 533 /* 534 * 0-3 CRC32 of the uncompressed data 535 * 4-11 size of the uncompressed data 536 * 12-19 member size including header and trailer 537 */ 538 #define TRAILER_SIZE 20 539 540 541 static off_t 542 lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize) 543 { 544 struct lz_decoder lz; 545 off_t rv = -1; 546 547 if (lz_create(&lz, fin, fdout, dict_size) == -1) 548 return -1; 549 550 if (!lz_decode_member(&lz)) 551 goto out; 552 553 uint8_t trailer[TRAILER_SIZE]; 554 555 for(size_t i = 0; i < nitems(trailer); i++) 556 trailer[i] = (uint8_t)getc(lz.fin); 557 558 unsigned crc = 0; 559 for (int i = 3; i >= 0; --i) { 560 crc <<= 8; 561 crc += trailer[i]; 562 } 563 564 int64_t data_size = 0; 565 for (int i = 11; i >= 4; --i) { 566 data_size <<= 8; 567 data_size += trailer[i]; 568 } 569 570 if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz)) 571 goto out; 572 573 rv = 0; 574 for (int i = 19; i >= 12; --i) { 575 rv <<= 8; 576 rv += trailer[i]; 577 } 578 if (insize) 579 *insize = rv; 580 #if 0 581 /* Does not work with pipes */ 582 rv = ftello(lz.fout); 583 #else 584 rv = data_size; 585 #endif 586 out: 587 lz_destroy(&lz); 588 return rv; 589 } 590 591 592 /* 593 * 0-3 magic 594 * 4 version 595 * 5 coded dict_size 596 */ 597 #define HDR_SIZE 6 598 #define MIN_DICTIONARY_SIZE (1 << 12) 599 #define MAX_DICTIONARY_SIZE (1 << 29) 600 601 static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 }; 602 603 static unsigned 604 lz_get_dict_size(unsigned char c) 605 { 606 unsigned dict_size = 1 << (c & 0x1f); 607 dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7); 608 if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE) 609 return 0; 610 return dict_size; 611 } 612 613 static off_t 614 unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in) 615 { 616 if (lz_crc[0] == 0) 617 lz_crc_init(); 618 619 char header[HDR_SIZE]; 620 621 if (pre && prelen) 622 memcpy(header, pre, prelen); 623 624 ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen); 625 switch (nr) { 626 case -1: 627 return -1; 628 case 0: 629 return prelen ? -1 : 0; 630 default: 631 if ((size_t)nr != sizeof(header) - prelen) 632 return -1; 633 break; 634 } 635 636 if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0) 637 return -1; 638 639 unsigned dict_size = lz_get_dict_size(header[5]); 640 if (dict_size == 0) 641 return -1; 642 643 return lz_decode(fin, fout, dict_size, bytes_in); 644 } 645