1 /* $NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2018 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Christos Zoulas. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* Lzd - Educational decompressor for the lzip format 33 Copyright (C) 2013-2018 Antonio Diaz Diaz. 34 35 This program is free software. Redistribution and use in source and 36 binary forms, with or without modification, are permitted provided 37 that the following conditions are met: 38 39 1. Redistributions of source code must retain the above copyright 40 notice, this list of conditions and the following disclaimer. 41 42 2. Redistributions in binary form must reproduce the above copyright 43 notice, this list of conditions and the following disclaimer in the 44 documentation and/or other materials provided with the distribution. 45 46 This program is distributed in the hope that it will be useful, 47 but WITHOUT ANY WARRANTY; without even the implied warranty of 48 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 49 */ 50 51 #include <sys/param.h> 52 #include <stdio.h> 53 #include <string.h> 54 #include <stdlib.h> 55 #include <stdint.h> 56 #include <stdbool.h> 57 #include <errno.h> 58 #include <unistd.h> 59 60 #define LZ_STATES 12 61 62 #define LITERAL_CONTEXT_BITS 3 63 #define POS_STATE_BITS 2 64 #define POS_STATES (1 << POS_STATE_BITS) 65 #define POS_STATE_MASK (POS_STATES - 1) 66 67 #define STATES 4 68 #define DIS_SLOT_BITS 6 69 70 #define DIS_MODEL_START 4 71 #define DIS_MODEL_END 14 72 73 #define MODELED_DISTANCES (1 << (DIS_MODEL_END / 2)) 74 #define DIS_ALIGN_BITS 4 75 #define DIS_ALIGN_SIZE (1 << DIS_ALIGN_BITS) 76 77 #define LOW_BITS 3 78 #define MID_BITS 3 79 #define HIGH_BITS 8 80 81 #define LOW_SYMBOLS (1 << LOW_BITS) 82 #define MID_SYMBOLS (1 << MID_BITS) 83 #define HIGH_SYMBOLS (1 << HIGH_BITS) 84 85 #define MAX_SYMBOLS (LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS) 86 87 #define MIN_MATCH_LEN 2 88 89 #define BIT_MODEL_MOVE_BITS 5 90 #define BIT_MODEL_TOTAL_BITS 11 91 #define BIT_MODEL_TOTAL (1 << BIT_MODEL_TOTAL_BITS) 92 #define BIT_MODEL_INIT (BIT_MODEL_TOTAL / 2) 93 94 static const int lz_st_next[] = { 95 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5, 96 }; 97 98 static bool 99 lz_st_is_char(int st) { 100 return st < 7; 101 } 102 103 static int 104 lz_st_get_char(int st) { 105 return lz_st_next[st]; 106 } 107 108 static int 109 lz_st_get_match(int st) { 110 return st < 7 ? 7 : 10; 111 } 112 113 static int 114 lz_st_get_rep(int st) { 115 return st < 7 ? 8 : 11; 116 } 117 118 static int 119 lz_st_get_short_rep(int st) { 120 return st < 7 ? 9 : 11; 121 } 122 123 struct lz_len_model { 124 int choice1; 125 int choice2; 126 int bm_low[POS_STATES][LOW_SYMBOLS]; 127 int bm_mid[POS_STATES][MID_SYMBOLS]; 128 int bm_high[HIGH_SYMBOLS]; 129 }; 130 131 static uint32_t lz_crc[256]; 132 133 static void 134 lz_crc_init(void) 135 { 136 for (unsigned i = 0; i < nitems(lz_crc); i++) { 137 unsigned c = i; 138 for (unsigned j = 0; j < 8; j++) { 139 if (c & 1) 140 c = 0xEDB88320U ^ (c >> 1); 141 else 142 c >>= 1; 143 } 144 lz_crc[i] = c; 145 } 146 } 147 148 static void 149 lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len) 150 { 151 for (size_t i = 0; i < len; i++) 152 *crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8); 153 } 154 155 struct lz_range_decoder { 156 FILE *fp; 157 uint32_t code; 158 uint32_t range; 159 }; 160 161 static int 162 lz_rd_create(struct lz_range_decoder *rd, FILE *fp) 163 { 164 rd->fp = fp; 165 rd->code = 0; 166 rd->range = ~0; 167 for (int i = 0; i < 5; i++) 168 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp); 169 return ferror(rd->fp) ? -1 : 0; 170 } 171 172 static unsigned 173 lz_rd_decode(struct lz_range_decoder *rd, int num_bits) 174 { 175 unsigned symbol = 0; 176 177 for (int i = num_bits; i > 0; i--) { 178 rd->range >>= 1; 179 symbol <<= 1; 180 if (rd->code >= rd->range) { 181 rd->code -= rd->range; 182 symbol |= 1; 183 } 184 if (rd->range <= 0x00FFFFFFU) { 185 rd->range <<= 8; 186 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp); 187 } 188 } 189 190 return symbol; 191 } 192 193 static unsigned 194 lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm) 195 { 196 unsigned symbol; 197 const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm; 198 199 if(rd->code < bound) { 200 rd->range = bound; 201 *bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS; 202 symbol = 0; 203 } 204 else { 205 rd->range -= bound; 206 rd->code -= bound; 207 *bm -= *bm >> BIT_MODEL_MOVE_BITS; 208 symbol = 1; 209 } 210 211 if (rd->range <= 0x00FFFFFFU) { 212 rd->range <<= 8; 213 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp); 214 } 215 return symbol; 216 } 217 218 static unsigned 219 lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits) 220 { 221 unsigned symbol = 1; 222 223 for (int i = 0; i < num_bits; i++) 224 symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]); 225 226 return symbol - (1 << num_bits); 227 } 228 229 static unsigned 230 lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits) 231 { 232 unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits); 233 unsigned reversed_symbol = 0; 234 235 for (int i = 0; i < num_bits; i++) { 236 reversed_symbol = (reversed_symbol << 1) | (symbol & 1); 237 symbol >>= 1; 238 } 239 240 return reversed_symbol; 241 } 242 243 static unsigned 244 lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte) 245 { 246 unsigned symbol = 1; 247 248 for (int i = 7; i >= 0; i--) { 249 const unsigned match_bit = (match_byte >> i) & 1; 250 const unsigned bit = lz_rd_decode_bit(rd, 251 &bm[symbol + (match_bit << 8) + 0x100]); 252 symbol = (symbol << 1) | bit; 253 if (match_bit != bit) { 254 while (symbol < 0x100) { 255 symbol = (symbol << 1) | 256 lz_rd_decode_bit(rd, &bm[symbol]); 257 } 258 break; 259 } 260 } 261 return symbol & 0xFF; 262 } 263 264 static unsigned 265 lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm, 266 int pos_state) 267 { 268 if (lz_rd_decode_bit(rd, &lm->choice1) == 0) 269 return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS); 270 271 if (lz_rd_decode_bit(rd, &lm->choice2) == 0) { 272 return LOW_SYMBOLS + 273 lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS); 274 } 275 276 return LOW_SYMBOLS + MID_SYMBOLS + 277 lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS); 278 } 279 280 struct lz_decoder { 281 FILE *fin, *fout; 282 off_t pos, ppos, spos, dict_size; 283 bool wrapped; 284 uint32_t crc; 285 uint8_t *obuf; 286 struct lz_range_decoder rdec; 287 }; 288 289 static int 290 lz_flush(struct lz_decoder *lz) 291 { 292 off_t offs = lz->pos - lz->spos; 293 if (offs <= 0) 294 return -1; 295 296 size_t size = (size_t)offs; 297 lz_crc_update(&lz->crc, lz->obuf + lz->spos, size); 298 if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size) 299 return -1; 300 301 lz->wrapped = lz->pos >= lz->dict_size; 302 if (lz->wrapped) { 303 lz->ppos += lz->pos; 304 lz->pos = 0; 305 } 306 lz->spos = lz->pos; 307 return 0; 308 } 309 310 static void 311 lz_destroy(struct lz_decoder *lz) 312 { 313 if (lz->fin) 314 fclose(lz->fin); 315 if (lz->fout) 316 fclose(lz->fout); 317 free(lz->obuf); 318 } 319 320 static int 321 lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size) 322 { 323 memset(lz, 0, sizeof(*lz)); 324 325 lz->fin = fdopen(dup(fin), "r"); 326 if (lz->fin == NULL) 327 goto out; 328 329 lz->fout = fdopen(dup(fdout), "w"); 330 if (lz->fout == NULL) 331 goto out; 332 333 lz->pos = lz->ppos = lz->spos = 0; 334 lz->crc = ~0; 335 lz->dict_size = dict_size; 336 lz->wrapped = false; 337 338 lz->obuf = malloc(dict_size); 339 if (lz->obuf == NULL) 340 goto out; 341 342 if (lz_rd_create(&lz->rdec, lz->fin) == -1) 343 goto out; 344 return 0; 345 out: 346 lz_destroy(lz); 347 return -1; 348 } 349 350 static uint8_t 351 lz_peek(const struct lz_decoder *lz, unsigned ahead) 352 { 353 off_t diff = lz->pos - ahead - 1; 354 355 if (diff >= 0) 356 return lz->obuf[diff]; 357 358 if (lz->wrapped) 359 return lz->obuf[lz->dict_size + diff]; 360 361 return 0; 362 } 363 364 static void 365 lz_put(struct lz_decoder *lz, uint8_t b) 366 { 367 lz->obuf[lz->pos++] = b; 368 if (lz->dict_size == lz->pos) 369 lz_flush(lz); 370 } 371 372 static off_t 373 lz_get_data_position(const struct lz_decoder *lz) 374 { 375 return lz->ppos + lz->pos; 376 } 377 378 static unsigned 379 lz_get_crc(const struct lz_decoder *lz) 380 { 381 return lz->crc ^ 0xffffffffU; 382 } 383 384 static void 385 lz_bm_init(int *a, size_t l) 386 { 387 for (size_t i = 0; i < l; i++) 388 a[i] = BIT_MODEL_INIT; 389 } 390 391 #define LZ_BM_INIT(a) lz_bm_init(a, nitems(a)) 392 #define LZ_BM_INIT2(a) do { \ 393 size_t l = nitems(a[0]); \ 394 for (size_t i = 0; i < nitems(a); i++) \ 395 lz_bm_init(a[i], l); \ 396 } while (/*CONSTCOND*/0) 397 398 #define LZ_MODEL_INIT(a) do { \ 399 a.choice1 = BIT_MODEL_INIT; \ 400 a.choice2 = BIT_MODEL_INIT; \ 401 LZ_BM_INIT2(a.bm_low); \ 402 LZ_BM_INIT2(a.bm_mid); \ 403 LZ_BM_INIT(a.bm_high); \ 404 } while (/*CONSTCOND*/0) 405 406 static bool 407 lz_decode_member(struct lz_decoder *lz) 408 { 409 int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300]; 410 int bm_match[LZ_STATES][POS_STATES]; 411 int bm_rep[4][LZ_STATES]; 412 int bm_len[LZ_STATES][POS_STATES]; 413 int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS]; 414 int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1]; 415 int bm_align[DIS_ALIGN_SIZE]; 416 417 LZ_BM_INIT2(bm_literal); 418 LZ_BM_INIT2(bm_match); 419 LZ_BM_INIT2(bm_rep); 420 LZ_BM_INIT2(bm_len); 421 LZ_BM_INIT2(bm_dis_slot); 422 LZ_BM_INIT(bm_dis); 423 LZ_BM_INIT(bm_align); 424 425 struct lz_len_model match_len_model; 426 struct lz_len_model rep_len_model; 427 428 LZ_MODEL_INIT(match_len_model); 429 LZ_MODEL_INIT(rep_len_model); 430 431 struct lz_range_decoder *rd = &lz->rdec; 432 unsigned rep[4] = { 0 }; 433 434 435 int state = 0; 436 437 while (!feof(lz->fin) && !ferror(lz->fin)) { 438 const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK; 439 // bit 1 440 if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) { 441 const uint8_t prev_byte = lz_peek(lz, 0); 442 const int literal_state = 443 prev_byte >> (8 - LITERAL_CONTEXT_BITS); 444 int *bm = bm_literal[literal_state]; 445 if (lz_st_is_char(state)) 446 lz_put(lz, lz_rd_decode_tree(rd, bm, 8)); 447 else { 448 int peek = lz_peek(lz, rep[0]); 449 lz_put(lz, lz_rd_decode_matched(rd, bm, peek)); 450 } 451 state = lz_st_get_char(state); 452 continue; 453 } 454 int len; 455 // bit 2 456 if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) { 457 // bit 3 458 if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) { 459 // bit 4 460 if (lz_rd_decode_bit(rd, 461 &bm_len[state][pos_state]) == 0) 462 { 463 state = lz_st_get_short_rep(state); 464 lz_put(lz, lz_peek(lz, rep[0])); 465 continue; 466 } 467 } else { 468 unsigned distance; 469 // bit 4 470 if (lz_rd_decode_bit(rd, &bm_rep[2][state]) 471 == 0) 472 distance = rep[1]; 473 else { 474 // bit 5 475 if (lz_rd_decode_bit(rd, 476 &bm_rep[3][state]) == 0) 477 distance = rep[2]; 478 else { 479 distance = rep[3]; 480 rep[3] = rep[2]; 481 } 482 rep[2] = rep[1]; 483 } 484 rep[1] = rep[0]; 485 rep[0] = distance; 486 } 487 state = lz_st_get_rep(state); 488 len = MIN_MATCH_LEN + 489 lz_rd_decode_len(rd, &rep_len_model, pos_state); 490 } else { 491 rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0]; 492 len = MIN_MATCH_LEN + 493 lz_rd_decode_len(rd, &match_len_model, pos_state); 494 const int len_state = 495 MIN(len - MIN_MATCH_LEN, STATES - 1); 496 rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state], 497 DIS_SLOT_BITS); 498 if (rep[0] >= DIS_MODEL_START) { 499 const unsigned dis_slot = rep[0]; 500 const int direct_bits = (dis_slot >> 1) - 1; 501 rep[0] = (2 | (dis_slot & 1)) << direct_bits; 502 if (dis_slot < DIS_MODEL_END) 503 rep[0] += lz_rd_decode_tree_reversed(rd, 504 &bm_dis[rep[0] - dis_slot], 505 direct_bits); 506 else { 507 rep[0] += lz_rd_decode(rd, direct_bits 508 - DIS_ALIGN_BITS) << DIS_ALIGN_BITS; 509 rep[0] += lz_rd_decode_tree_reversed(rd, 510 bm_align, DIS_ALIGN_BITS); 511 if (rep[0] == 0xFFFFFFFFU) { 512 lz_flush(lz); 513 return len == MIN_MATCH_LEN; 514 } 515 } 516 } 517 state = lz_st_get_match(state); 518 if (rep[0] >= lz->dict_size || 519 (rep[0] >= lz->pos && !lz->wrapped)) { 520 lz_flush(lz); 521 return false; 522 } 523 } 524 for (int i = 0; i < len; i++) 525 lz_put(lz, lz_peek(lz, rep[0])); 526 } 527 lz_flush(lz); 528 return false; 529 } 530 531 /* 532 * 0-3 CRC32 of the uncompressed data 533 * 4-11 size of the uncompressed data 534 * 12-19 member size including header and trailer 535 */ 536 #define TRAILER_SIZE 20 537 538 539 static off_t 540 lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize) 541 { 542 struct lz_decoder lz; 543 off_t rv = -1; 544 545 if (lz_create(&lz, fin, fdout, dict_size) == -1) 546 return -1; 547 548 if (!lz_decode_member(&lz)) 549 goto out; 550 551 uint8_t trailer[TRAILER_SIZE]; 552 553 for(size_t i = 0; i < nitems(trailer); i++) 554 trailer[i] = (uint8_t)getc(lz.fin); 555 556 unsigned crc = 0; 557 for (int i = 3; i >= 0; --i) { 558 crc <<= 8; 559 crc += trailer[i]; 560 } 561 562 int64_t data_size = 0; 563 for (int i = 11; i >= 4; --i) { 564 data_size <<= 8; 565 data_size += trailer[i]; 566 } 567 568 if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz)) 569 goto out; 570 571 rv = 0; 572 for (int i = 19; i >= 12; --i) { 573 rv <<= 8; 574 rv += trailer[i]; 575 } 576 if (insize) 577 *insize = rv; 578 #if 0 579 /* Does not work with pipes */ 580 rv = ftello(lz.fout); 581 #else 582 rv = data_size; 583 #endif 584 out: 585 lz_destroy(&lz); 586 return rv; 587 } 588 589 590 /* 591 * 0-3 magic 592 * 4 version 593 * 5 coded dict_size 594 */ 595 #define HDR_SIZE 6 596 #define MIN_DICTIONARY_SIZE (1 << 12) 597 #define MAX_DICTIONARY_SIZE (1 << 29) 598 599 static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 }; 600 601 static unsigned 602 lz_get_dict_size(unsigned char c) 603 { 604 unsigned dict_size = 1 << (c & 0x1f); 605 dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7); 606 if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE) 607 return 0; 608 return dict_size; 609 } 610 611 static off_t 612 unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in) 613 { 614 if (lz_crc[0] == 0) 615 lz_crc_init(); 616 617 char header[HDR_SIZE]; 618 619 if (pre && prelen) 620 memcpy(header, pre, prelen); 621 622 ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen); 623 switch (nr) { 624 case -1: 625 return -1; 626 case 0: 627 return prelen ? -1 : 0; 628 default: 629 if ((size_t)nr != sizeof(header) - prelen) 630 return -1; 631 break; 632 } 633 634 if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0) 635 return -1; 636 637 unsigned dict_size = lz_get_dict_size(header[5]); 638 if (dict_size == 0) 639 return -1; 640 641 return lz_decode(fin, fout, dict_size, bytes_in); 642 } 643