1 // SPDX-License-Identifier: 0BSD 2 3 /* 4 * Branch/Call/Jump (BCJ) filter decoders 5 * 6 * Authors: Lasse Collin <lasse.collin@tukaani.org> 7 * Igor Pavlov <https://7-zip.org/> 8 */ 9 10 #include "xz_private.h" 11 12 /* 13 * The rest of the file is inside this ifdef. It makes things a little more 14 * convenient when building without support for any BCJ filters. 15 */ 16 #ifdef XZ_DEC_BCJ 17 18 struct xz_dec_bcj { 19 /* Type of the BCJ filter being used */ 20 enum { 21 BCJ_X86 = 4, /* x86 or x86-64 */ 22 BCJ_POWERPC = 5, /* Big endian only */ 23 BCJ_IA64 = 6, /* Big or little endian */ 24 BCJ_ARM = 7, /* Little endian only */ 25 BCJ_ARMTHUMB = 8, /* Little endian only */ 26 BCJ_SPARC = 9, /* Big or little endian */ 27 BCJ_ARM64 = 10, /* AArch64 */ 28 BCJ_RISCV = 11 /* RV32GQC_Zfh, RV64GQC_Zfh */ 29 } type; 30 31 /* 32 * Return value of the next filter in the chain. We need to preserve 33 * this information across calls, because we must not call the next 34 * filter anymore once it has returned XZ_STREAM_END. 35 */ 36 enum xz_ret ret; 37 38 /* True if we are operating in single-call mode. */ 39 bool single_call; 40 41 /* 42 * Absolute position relative to the beginning of the uncompressed 43 * data (in a single .xz Block). We care only about the lowest 32 44 * bits so this doesn't need to be uint64_t even with big files. 45 */ 46 uint32_t pos; 47 48 /* x86 filter state */ 49 uint32_t x86_prev_mask; 50 51 /* Temporary space to hold the variables from struct xz_buf */ 52 uint8_t *out; 53 size_t out_pos; 54 size_t out_size; 55 56 struct { 57 /* Amount of already filtered data in the beginning of buf */ 58 size_t filtered; 59 60 /* Total amount of data currently stored in buf */ 61 size_t size; 62 63 /* 64 * Buffer to hold a mix of filtered and unfiltered data. This 65 * needs to be big enough to hold Alignment + 2 * Look-ahead: 66 * 67 * Type Alignment Look-ahead 68 * x86 1 4 69 * PowerPC 4 0 70 * IA-64 16 0 71 * ARM 4 0 72 * ARM-Thumb 2 2 73 * SPARC 4 0 74 */ 75 uint8_t buf[16]; 76 } temp; 77 }; 78 79 #ifdef XZ_DEC_X86 80 /* 81 * This is used to test the most significant byte of a memory address 82 * in an x86 instruction. 83 */ 84 static inline int bcj_x86_test_msbyte(uint8_t b) 85 { 86 return b == 0x00 || b == 0xFF; 87 } 88 89 static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 90 { 91 static const bool mask_to_allowed_status[8] 92 = { true, true, true, false, true, false, false, false }; 93 94 static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; 95 96 size_t i; 97 size_t prev_pos = (size_t)-1; 98 uint32_t prev_mask = s->x86_prev_mask; 99 uint32_t src; 100 uint32_t dest; 101 uint32_t j; 102 uint8_t b; 103 104 if (size <= 4) 105 return 0; 106 107 size -= 4; 108 for (i = 0; i < size; ++i) { 109 if ((buf[i] & 0xFE) != 0xE8) 110 continue; 111 112 prev_pos = i - prev_pos; 113 if (prev_pos > 3) { 114 prev_mask = 0; 115 } else { 116 prev_mask = (prev_mask << (prev_pos - 1)) & 7; 117 if (prev_mask != 0) { 118 b = buf[i + 4 - mask_to_bit_num[prev_mask]]; 119 if (!mask_to_allowed_status[prev_mask] 120 || bcj_x86_test_msbyte(b)) { 121 prev_pos = i; 122 prev_mask = (prev_mask << 1) | 1; 123 continue; 124 } 125 } 126 } 127 128 prev_pos = i; 129 130 if (bcj_x86_test_msbyte(buf[i + 4])) { 131 src = get_unaligned_le32(buf + i + 1); 132 while (true) { 133 dest = src - (s->pos + (uint32_t)i + 5); 134 if (prev_mask == 0) 135 break; 136 137 j = mask_to_bit_num[prev_mask] * 8; 138 b = (uint8_t)(dest >> (24 - j)); 139 if (!bcj_x86_test_msbyte(b)) 140 break; 141 142 src = dest ^ (((uint32_t)1 << (32 - j)) - 1); 143 } 144 145 dest &= 0x01FFFFFF; 146 dest |= (uint32_t)0 - (dest & 0x01000000); 147 put_unaligned_le32(dest, buf + i + 1); 148 i += 4; 149 } else { 150 prev_mask = (prev_mask << 1) | 1; 151 } 152 } 153 154 prev_pos = i - prev_pos; 155 s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); 156 return i; 157 } 158 #endif 159 160 #ifdef XZ_DEC_POWERPC 161 static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 162 { 163 size_t i; 164 uint32_t instr; 165 166 size &= ~(size_t)3; 167 168 for (i = 0; i < size; i += 4) { 169 instr = get_unaligned_be32(buf + i); 170 if ((instr & 0xFC000003) == 0x48000001) { 171 instr &= 0x03FFFFFC; 172 instr -= s->pos + (uint32_t)i; 173 instr &= 0x03FFFFFC; 174 instr |= 0x48000001; 175 put_unaligned_be32(instr, buf + i); 176 } 177 } 178 179 return i; 180 } 181 #endif 182 183 #ifdef XZ_DEC_IA64 184 static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 185 { 186 static const uint8_t branch_table[32] = { 187 0, 0, 0, 0, 0, 0, 0, 0, 188 0, 0, 0, 0, 0, 0, 0, 0, 189 4, 4, 6, 6, 0, 0, 7, 7, 190 4, 4, 0, 0, 4, 4, 0, 0 191 }; 192 193 /* 194 * The local variables take a little bit stack space, but it's less 195 * than what LZMA2 decoder takes, so it doesn't make sense to reduce 196 * stack usage here without doing that for the LZMA2 decoder too. 197 */ 198 199 /* Loop counters */ 200 size_t i; 201 size_t j; 202 203 /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ 204 uint32_t slot; 205 206 /* Bitwise offset of the instruction indicated by slot */ 207 uint32_t bit_pos; 208 209 /* bit_pos split into byte and bit parts */ 210 uint32_t byte_pos; 211 uint32_t bit_res; 212 213 /* Address part of an instruction */ 214 uint32_t addr; 215 216 /* Mask used to detect which instructions to convert */ 217 uint32_t mask; 218 219 /* 41-bit instruction stored somewhere in the lowest 48 bits */ 220 uint64_t instr; 221 222 /* Instruction normalized with bit_res for easier manipulation */ 223 uint64_t norm; 224 225 size &= ~(size_t)15; 226 227 for (i = 0; i < size; i += 16) { 228 mask = branch_table[buf[i] & 0x1F]; 229 for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { 230 if (((mask >> slot) & 1) == 0) 231 continue; 232 233 byte_pos = bit_pos >> 3; 234 bit_res = bit_pos & 7; 235 instr = 0; 236 for (j = 0; j < 6; ++j) 237 instr |= (uint64_t)(buf[i + j + byte_pos]) 238 << (8 * j); 239 240 norm = instr >> bit_res; 241 242 if (((norm >> 37) & 0x0F) == 0x05 243 && ((norm >> 9) & 0x07) == 0) { 244 addr = (norm >> 13) & 0x0FFFFF; 245 addr |= ((uint32_t)(norm >> 36) & 1) << 20; 246 addr <<= 4; 247 addr -= s->pos + (uint32_t)i; 248 addr >>= 4; 249 250 norm &= ~((uint64_t)0x8FFFFF << 13); 251 norm |= (uint64_t)(addr & 0x0FFFFF) << 13; 252 norm |= (uint64_t)(addr & 0x100000) 253 << (36 - 20); 254 255 instr &= (1 << bit_res) - 1; 256 instr |= norm << bit_res; 257 258 for (j = 0; j < 6; j++) 259 buf[i + j + byte_pos] 260 = (uint8_t)(instr >> (8 * j)); 261 } 262 } 263 } 264 265 return i; 266 } 267 #endif 268 269 #ifdef XZ_DEC_ARM 270 static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 271 { 272 size_t i; 273 uint32_t addr; 274 275 size &= ~(size_t)3; 276 277 for (i = 0; i < size; i += 4) { 278 if (buf[i + 3] == 0xEB) { 279 addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) 280 | ((uint32_t)buf[i + 2] << 16); 281 addr <<= 2; 282 addr -= s->pos + (uint32_t)i + 8; 283 addr >>= 2; 284 buf[i] = (uint8_t)addr; 285 buf[i + 1] = (uint8_t)(addr >> 8); 286 buf[i + 2] = (uint8_t)(addr >> 16); 287 } 288 } 289 290 return i; 291 } 292 #endif 293 294 #ifdef XZ_DEC_ARMTHUMB 295 static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 296 { 297 size_t i; 298 uint32_t addr; 299 300 if (size < 4) 301 return 0; 302 303 size -= 4; 304 305 for (i = 0; i <= size; i += 2) { 306 if ((buf[i + 1] & 0xF8) == 0xF0 307 && (buf[i + 3] & 0xF8) == 0xF8) { 308 addr = (((uint32_t)buf[i + 1] & 0x07) << 19) 309 | ((uint32_t)buf[i] << 11) 310 | (((uint32_t)buf[i + 3] & 0x07) << 8) 311 | (uint32_t)buf[i + 2]; 312 addr <<= 1; 313 addr -= s->pos + (uint32_t)i + 4; 314 addr >>= 1; 315 buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); 316 buf[i] = (uint8_t)(addr >> 11); 317 buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); 318 buf[i + 2] = (uint8_t)addr; 319 i += 2; 320 } 321 } 322 323 return i; 324 } 325 #endif 326 327 #ifdef XZ_DEC_SPARC 328 static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 329 { 330 size_t i; 331 uint32_t instr; 332 333 size &= ~(size_t)3; 334 335 for (i = 0; i < size; i += 4) { 336 instr = get_unaligned_be32(buf + i); 337 if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { 338 instr <<= 2; 339 instr -= s->pos + (uint32_t)i; 340 instr >>= 2; 341 instr = ((uint32_t)0x40000000 - (instr & 0x400000)) 342 | 0x40000000 | (instr & 0x3FFFFF); 343 put_unaligned_be32(instr, buf + i); 344 } 345 } 346 347 return i; 348 } 349 #endif 350 351 #ifdef XZ_DEC_ARM64 352 static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 353 { 354 size_t i; 355 uint32_t instr; 356 uint32_t addr; 357 358 size &= ~(size_t)3; 359 360 for (i = 0; i < size; i += 4) { 361 instr = get_unaligned_le32(buf + i); 362 363 if ((instr >> 26) == 0x25) { 364 /* BL instruction */ 365 addr = instr - ((s->pos + (uint32_t)i) >> 2); 366 instr = 0x94000000 | (addr & 0x03FFFFFF); 367 put_unaligned_le32(instr, buf + i); 368 369 } else if ((instr & 0x9F000000) == 0x90000000) { 370 /* ADRP instruction */ 371 addr = ((instr >> 29) & 3) | ((instr >> 3) & 0x1FFFFC); 372 373 /* Only convert values in the range +/-512 MiB. */ 374 if ((addr + 0x020000) & 0x1C0000) 375 continue; 376 377 addr -= (s->pos + (uint32_t)i) >> 12; 378 379 instr &= 0x9000001F; 380 instr |= (addr & 3) << 29; 381 instr |= (addr & 0x03FFFC) << 3; 382 instr |= (0U - (addr & 0x020000)) & 0xE00000; 383 384 put_unaligned_le32(instr, buf + i); 385 } 386 } 387 388 return i; 389 } 390 #endif 391 392 #ifdef XZ_DEC_RISCV 393 static size_t bcj_riscv(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 394 { 395 size_t i; 396 uint32_t b1; 397 uint32_t b2; 398 uint32_t b3; 399 uint32_t instr; 400 uint32_t instr2; 401 uint32_t instr2_rs1; 402 uint32_t addr; 403 404 if (size < 8) 405 return 0; 406 407 size -= 8; 408 409 for (i = 0; i <= size; i += 2) { 410 instr = buf[i]; 411 412 if (instr == 0xEF) { 413 /* JAL */ 414 b1 = buf[i + 1]; 415 if ((b1 & 0x0D) != 0) 416 continue; 417 418 b2 = buf[i + 2]; 419 b3 = buf[i + 3]; 420 421 addr = ((b1 & 0xF0) << 13) | (b2 << 9) | (b3 << 1); 422 addr -= s->pos + (uint32_t)i; 423 424 buf[i + 1] = (uint8_t)((b1 & 0x0F) 425 | ((addr >> 8) & 0xF0)); 426 427 buf[i + 2] = (uint8_t)(((addr >> 16) & 0x0F) 428 | ((addr >> 7) & 0x10) 429 | ((addr << 4) & 0xE0)); 430 431 buf[i + 3] = (uint8_t)(((addr >> 4) & 0x7F) 432 | ((addr >> 13) & 0x80)); 433 434 i += 4 - 2; 435 436 } else if ((instr & 0x7F) == 0x17) { 437 /* AUIPC */ 438 instr |= (uint32_t)buf[i + 1] << 8; 439 instr |= (uint32_t)buf[i + 2] << 16; 440 instr |= (uint32_t)buf[i + 3] << 24; 441 442 if (instr & 0xE80) { 443 /* AUIPC's rd doesn't equal x0 or x2. */ 444 instr2 = get_unaligned_le32(buf + i + 4); 445 446 if (((instr << 8) ^ (instr2 - 3)) & 0xF8003) { 447 i += 6 - 2; 448 continue; 449 } 450 451 addr = (instr & 0xFFFFF000) + (instr2 >> 20); 452 453 instr = 0x17 | (2 << 7) | (instr2 << 12); 454 instr2 = addr; 455 } else { 456 /* AUIPC's rd equals x0 or x2. */ 457 instr2_rs1 = instr >> 27; 458 459 if ((uint32_t)((instr - 0x3117) << 18) 460 >= (instr2_rs1 & 0x1D)) { 461 i += 4 - 2; 462 continue; 463 } 464 465 addr = get_unaligned_be32(buf + i + 4); 466 addr -= s->pos + (uint32_t)i; 467 468 instr2 = (instr >> 12) | (addr << 20); 469 470 instr = 0x17 | (instr2_rs1 << 7) 471 | ((addr + 0x800) & 0xFFFFF000); 472 } 473 474 put_unaligned_le32(instr, buf + i); 475 put_unaligned_le32(instr2, buf + i + 4); 476 477 i += 8 - 2; 478 } 479 } 480 481 return i; 482 } 483 #endif 484 485 /* 486 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount 487 * of data that got filtered. 488 * 489 * NOTE: This is implemented as a switch statement to avoid using function 490 * pointers, which could be problematic in the kernel boot code, which must 491 * avoid pointers to static data (at least on x86). 492 */ 493 static void bcj_apply(struct xz_dec_bcj *s, 494 uint8_t *buf, size_t *pos, size_t size) 495 { 496 size_t filtered; 497 498 buf += *pos; 499 size -= *pos; 500 501 switch (s->type) { 502 #ifdef XZ_DEC_X86 503 case BCJ_X86: 504 filtered = bcj_x86(s, buf, size); 505 break; 506 #endif 507 #ifdef XZ_DEC_POWERPC 508 case BCJ_POWERPC: 509 filtered = bcj_powerpc(s, buf, size); 510 break; 511 #endif 512 #ifdef XZ_DEC_IA64 513 case BCJ_IA64: 514 filtered = bcj_ia64(s, buf, size); 515 break; 516 #endif 517 #ifdef XZ_DEC_ARM 518 case BCJ_ARM: 519 filtered = bcj_arm(s, buf, size); 520 break; 521 #endif 522 #ifdef XZ_DEC_ARMTHUMB 523 case BCJ_ARMTHUMB: 524 filtered = bcj_armthumb(s, buf, size); 525 break; 526 #endif 527 #ifdef XZ_DEC_SPARC 528 case BCJ_SPARC: 529 filtered = bcj_sparc(s, buf, size); 530 break; 531 #endif 532 #ifdef XZ_DEC_ARM64 533 case BCJ_ARM64: 534 filtered = bcj_arm64(s, buf, size); 535 break; 536 #endif 537 #ifdef XZ_DEC_RISCV 538 case BCJ_RISCV: 539 filtered = bcj_riscv(s, buf, size); 540 break; 541 #endif 542 default: 543 /* Never reached but silence compiler warnings. */ 544 filtered = 0; 545 break; 546 } 547 548 *pos += filtered; 549 s->pos += filtered; 550 } 551 552 /* 553 * Flush pending filtered data from temp to the output buffer. 554 * Move the remaining mixture of possibly filtered and unfiltered 555 * data to the beginning of temp. 556 */ 557 static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) 558 { 559 size_t copy_size; 560 561 copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); 562 memcpy(b->out + b->out_pos, s->temp.buf, copy_size); 563 b->out_pos += copy_size; 564 565 s->temp.filtered -= copy_size; 566 s->temp.size -= copy_size; 567 memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); 568 } 569 570 /* 571 * The BCJ filter functions are primitive in sense that they process the 572 * data in chunks of 1-16 bytes. To hide this issue, this function does 573 * some buffering. 574 */ 575 enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2, 576 struct xz_buf *b) 577 { 578 size_t out_start; 579 580 /* 581 * Flush pending already filtered data to the output buffer. Return 582 * immediately if we couldn't flush everything, or if the next 583 * filter in the chain had already returned XZ_STREAM_END. 584 */ 585 if (s->temp.filtered > 0) { 586 bcj_flush(s, b); 587 if (s->temp.filtered > 0) 588 return XZ_OK; 589 590 if (s->ret == XZ_STREAM_END) 591 return XZ_STREAM_END; 592 } 593 594 /* 595 * If we have more output space than what is currently pending in 596 * temp, copy the unfiltered data from temp to the output buffer 597 * and try to fill the output buffer by decoding more data from the 598 * next filter in the chain. Apply the BCJ filter on the new data 599 * in the output buffer. If everything cannot be filtered, copy it 600 * to temp and rewind the output buffer position accordingly. 601 * 602 * This needs to be always run when temp.size == 0 to handle a special 603 * case where the output buffer is full and the next filter has no 604 * more output coming but hasn't returned XZ_STREAM_END yet. 605 */ 606 if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) { 607 out_start = b->out_pos; 608 memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); 609 b->out_pos += s->temp.size; 610 611 s->ret = xz_dec_lzma2_run(lzma2, b); 612 if (s->ret != XZ_STREAM_END 613 && (s->ret != XZ_OK || s->single_call)) 614 return s->ret; 615 616 bcj_apply(s, b->out, &out_start, b->out_pos); 617 618 /* 619 * As an exception, if the next filter returned XZ_STREAM_END, 620 * we can do that too, since the last few bytes that remain 621 * unfiltered are meant to remain unfiltered. 622 */ 623 if (s->ret == XZ_STREAM_END) 624 return XZ_STREAM_END; 625 626 s->temp.size = b->out_pos - out_start; 627 b->out_pos -= s->temp.size; 628 memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); 629 630 /* 631 * If there wasn't enough input to the next filter to fill 632 * the output buffer with unfiltered data, there's no point 633 * to try decoding more data to temp. 634 */ 635 if (b->out_pos + s->temp.size < b->out_size) 636 return XZ_OK; 637 } 638 639 /* 640 * We have unfiltered data in temp. If the output buffer isn't full 641 * yet, try to fill the temp buffer by decoding more data from the 642 * next filter. Apply the BCJ filter on temp. Then we hopefully can 643 * fill the actual output buffer by copying filtered data from temp. 644 * A mix of filtered and unfiltered data may be left in temp; it will 645 * be taken care on the next call to this function. 646 */ 647 if (b->out_pos < b->out_size) { 648 /* Make b->out{,_pos,_size} temporarily point to s->temp. */ 649 s->out = b->out; 650 s->out_pos = b->out_pos; 651 s->out_size = b->out_size; 652 b->out = s->temp.buf; 653 b->out_pos = s->temp.size; 654 b->out_size = sizeof(s->temp.buf); 655 656 s->ret = xz_dec_lzma2_run(lzma2, b); 657 658 s->temp.size = b->out_pos; 659 b->out = s->out; 660 b->out_pos = s->out_pos; 661 b->out_size = s->out_size; 662 663 if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) 664 return s->ret; 665 666 bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); 667 668 /* 669 * If the next filter returned XZ_STREAM_END, we mark that 670 * everything is filtered, since the last unfiltered bytes 671 * of the stream are meant to be left as is. 672 */ 673 if (s->ret == XZ_STREAM_END) 674 s->temp.filtered = s->temp.size; 675 676 bcj_flush(s, b); 677 if (s->temp.filtered > 0) 678 return XZ_OK; 679 } 680 681 return s->ret; 682 } 683 684 struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) 685 { 686 struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL); 687 if (s != NULL) 688 s->single_call = single_call; 689 690 return s; 691 } 692 693 enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) 694 { 695 switch (id) { 696 #ifdef XZ_DEC_X86 697 case BCJ_X86: 698 #endif 699 #ifdef XZ_DEC_POWERPC 700 case BCJ_POWERPC: 701 #endif 702 #ifdef XZ_DEC_IA64 703 case BCJ_IA64: 704 #endif 705 #ifdef XZ_DEC_ARM 706 case BCJ_ARM: 707 #endif 708 #ifdef XZ_DEC_ARMTHUMB 709 case BCJ_ARMTHUMB: 710 #endif 711 #ifdef XZ_DEC_SPARC 712 case BCJ_SPARC: 713 #endif 714 #ifdef XZ_DEC_ARM64 715 case BCJ_ARM64: 716 #endif 717 #ifdef XZ_DEC_RISCV 718 case BCJ_RISCV: 719 #endif 720 break; 721 722 default: 723 /* Unsupported Filter ID */ 724 return XZ_OPTIONS_ERROR; 725 } 726 727 s->type = id; 728 s->ret = XZ_OK; 729 s->pos = 0; 730 s->x86_prev_mask = 0; 731 s->temp.filtered = 0; 732 s->temp.size = 0; 733 734 return XZ_OK; 735 } 736 737 #endif 738