// SPDX-License-Identifier: 0BSD

/*
 * Branch/Call/Jump (BCJ) filter decoders
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <https://7-zip.org/>
 */

#include "xz_private.h"

/*
 * The rest of the file is inside this ifdef. It makes things a little more
 * convenient when building without support for any BCJ filters.
 */
#ifdef XZ_DEC_BCJ

struct xz_dec_bcj {
	/* Type of the BCJ filter being used */
	enum {
		BCJ_X86 = 4,        /* x86 or x86-64 */
		BCJ_POWERPC = 5,    /* Big endian only */
		BCJ_ARM = 7,        /* Little endian only */
		BCJ_ARMTHUMB = 8,   /* Little endian only */
		BCJ_SPARC = 9,      /* Big or little endian */
		BCJ_ARM64 = 10,     /* AArch64 */
		BCJ_RISCV = 11      /* RV32GQC_Zfh, RV64GQC_Zfh */
	} type;

	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;

	/* x86 filter state */
	uint32_t x86_prev_mask;

	/* Temporary space to hold the variables from struct xz_buf */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type       Alignment   Look-ahead
		 * x86              1           4
		 * PowerPC          4           0
		 * IA-64           16           0
		 * ARM              4           0
		 * ARM-Thumb        2           2
		 * SPARC            4           0
		 */
		uint8_t buf[16];
	} temp;
};

#ifdef XZ_DEC_X86
/*
 * This is used to test the most significant byte of a memory address
 * in an x86 instruction.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	return b == 0x00 || b == 0xFF;
}
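/*
 * Convert the absolute addresses that the x86 BCJ encoder stored in E8
 * (CALL rel32) and E9 (JMP rel32) instructions back into the original
 * relative displacements. Only values whose most significant byte is
 * 0x00 or 0xFF are converted; prev_mask remembers which of the most
 * recent bytes looked like E8/E9 opcodes so that byte sequences that
 * are unlikely to be real instructions can be skipped.
 *
 * A hand-worked example (illustrative, not from the original sources):
 * if s->pos + i + 5 == 0x1000 and the four bytes after the opcode hold
 * the absolute value 0x00001234, then, assuming prev_mask is zero at
 * that point, the loop below rewrites them to 0x00000234, which is the
 * relative displacement the encoder originally saw.
 */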
static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;
	size_t prev_pos = (size_t)-1;
	uint32_t prev_mask = s->x86_prev_mask;
	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			prev_mask = 0;
		} else {
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			while (true) {
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}

			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
#endif

#ifdef XZ_DEC_POWERPC
static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	size &= ~(size_t)3;

	for (i = 0; i < size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr & 0xFC000003) == 0x48000001) {
			instr &= 0x03FFFFFC;
			instr -= s->pos + (uint32_t)i;
			instr &= 0x03FFFFFC;
			instr |= 0x48000001;
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_ARM
static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	size &= ~(size_t)3;

	for (i = 0; i < size; i += 4) {
		if (buf[i + 3] == 0xEB) {
			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
					| ((uint32_t)buf[i + 2] << 16);
			addr <<= 2;
			addr -= s->pos + (uint32_t)i + 8;
			addr >>= 2;
			buf[i] = (uint8_t)addr;
			buf[i + 1] = (uint8_t)(addr >> 8);
			buf[i + 2] = (uint8_t)(addr >> 16);
		}
	}

	return i;
}
#endif
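/*
 * In Thumb mode a BL instruction is a pair of 16-bit halves: the first
 * half starts with the bits 11110 (hence the 0xF0 check on buf[i + 1])
 * and the second with 11111 (the 0xF8 check on buf[i + 3]). Together
 * they carry a 22-bit halfword offset, and the PC bias is 4 bytes,
 * which is where the "+ 4" in the address calculation below comes from.
 * After a match the loop advances 4 bytes in total so that the second
 * half of the pair isn't scanned again.
 */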
#ifdef XZ_DEC_ARMTHUMB
static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	if (size < 4)
		return 0;

	size -= 4;

	for (i = 0; i <= size; i += 2) {
		if ((buf[i + 1] & 0xF8) == 0xF0
				&& (buf[i + 3] & 0xF8) == 0xF8) {
			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
					| ((uint32_t)buf[i] << 11)
					| (((uint32_t)buf[i + 3] & 0x07) << 8)
					| (uint32_t)buf[i + 2];
			addr <<= 1;
			addr -= s->pos + (uint32_t)i + 4;
			addr >>= 1;
			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
			buf[i] = (uint8_t)(addr >> 11);
			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
			buf[i + 2] = (uint8_t)addr;
			i += 2;
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_SPARC
static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	size &= ~(size_t)3;

	for (i = 0; i < size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
			instr <<= 2;
			instr -= s->pos + (uint32_t)i;
			instr >>= 2;
			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
					| 0x40000000 | (instr & 0x3FFFFF);
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_ARM64
static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;
	uint32_t addr;

	size &= ~(size_t)3;

	for (i = 0; i < size; i += 4) {
		instr = get_unaligned_le32(buf + i);

		if ((instr >> 26) == 0x25) {
			/* BL instruction */
			addr = instr - ((s->pos + (uint32_t)i) >> 2);
			instr = 0x94000000 | (addr & 0x03FFFFFF);
			put_unaligned_le32(instr, buf + i);

		} else if ((instr & 0x9F000000) == 0x90000000) {
			/* ADRP instruction */
			addr = ((instr >> 29) & 3) | ((instr >> 3) & 0x1FFFFC);

			/* Only convert values in the range +/-512 MiB. */
			if ((addr + 0x020000) & 0x1C0000)
				continue;

			addr -= (s->pos + (uint32_t)i) >> 12;

			instr &= 0x9000001F;
			instr |= (addr & 3) << 29;
			instr |= (addr & 0x03FFFC) << 3;
			instr |= (0U - (addr & 0x020000)) & 0xE00000;

			put_unaligned_le32(instr, buf + i);
		}
	}

	return i;
}
#endif
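/*
 * The RISC-V filter recognizes two patterns. JAL instructions whose rd
 * is one of the standard link registers x1 (ra) or x5 (t0) get their
 * 20-bit immediate converted; that is what the 0xEF byte test and the
 * (b1 & 0x0D) check below select. AUIPC supplies the upper bits of a
 * 32-bit PC-relative address, so it is handled together with the
 * following instruction as a pair: converted pairs are stored as an
 * AUIPC whose rd is x0 or x2 (values real code essentially never pairs
 * with AUIPC, since x0 is hardwired zero and x2 is the stack pointer)
 * followed by the 32-bit absolute address, which keeps the transform
 * unambiguous and thus reversible.
 */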
#ifdef XZ_DEC_RISCV
static size_t bcj_riscv(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t b1;
	uint32_t b2;
	uint32_t b3;
	uint32_t instr;
	uint32_t instr2;
	uint32_t instr2_rs1;
	uint32_t addr;

	if (size < 8)
		return 0;

	size -= 8;

	for (i = 0; i <= size; i += 2) {
		instr = buf[i];

		if (instr == 0xEF) {
			/* JAL */
			b1 = buf[i + 1];
			if ((b1 & 0x0D) != 0)
				continue;

			b2 = buf[i + 2];
			b3 = buf[i + 3];

			addr = ((b1 & 0xF0) << 13) | (b2 << 9) | (b3 << 1);
			addr -= s->pos + (uint32_t)i;

			buf[i + 1] = (uint8_t)((b1 & 0x0F)
					| ((addr >> 8) & 0xF0));

			buf[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
					| ((addr >> 7) & 0x10)
					| ((addr << 4) & 0xE0));

			buf[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
					| ((addr >> 13) & 0x80));

			i += 4 - 2;

		} else if ((instr & 0x7F) == 0x17) {
			/* AUIPC */
			instr |= (uint32_t)buf[i + 1] << 8;
			instr |= (uint32_t)buf[i + 2] << 16;
			instr |= (uint32_t)buf[i + 3] << 24;

			if (instr & 0xE80) {
				/* AUIPC's rd doesn't equal x0 or x2. */
				instr2 = get_unaligned_le32(buf + i + 4);

				if (((instr << 8) ^ (instr2 - 3)) & 0xF8003) {
					i += 6 - 2;
					continue;
				}

				addr = (instr & 0xFFFFF000) + (instr2 >> 20);

				instr = 0x17 | (2 << 7) | (instr2 << 12);
				instr2 = addr;
			} else {
				/* AUIPC's rd equals x0 or x2. */
				instr2_rs1 = instr >> 27;

				if ((uint32_t)((instr - 0x3117) << 18)
						>= (instr2_rs1 & 0x1D)) {
					i += 4 - 2;
					continue;
				}

				addr = get_unaligned_be32(buf + i + 4);
				addr -= s->pos + (uint32_t)i;

				instr2 = (instr >> 12) | (addr << 20);

				instr = 0x17 | (instr2_rs1 << 7)
						| ((addr + 0x800) & 0xFFFFF000);
			}

			put_unaligned_le32(instr, buf + i);
			put_unaligned_le32(instr2, buf + i + 4);

			i += 8 - 2;
		}
	}

	return i;
}
#endif

/*
 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
 * of data that got filtered.
 *
 * NOTE: This is implemented as a switch statement to avoid using function
 * pointers, which could be problematic in the kernel boot code, which must
 * avoid pointers to static data (at least on x86).
 */
static void bcj_apply(struct xz_dec_bcj *s,
		      uint8_t *buf, size_t *pos, size_t size)
{
	size_t filtered;

	buf += *pos;
	size -= *pos;

	switch (s->type) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
		filtered = bcj_x86(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
		filtered = bcj_powerpc(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
		filtered = bcj_arm(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
		filtered = bcj_armthumb(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
		filtered = bcj_sparc(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARM64
	case BCJ_ARM64:
		filtered = bcj_arm64(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_RISCV
	case BCJ_RISCV:
		filtered = bcj_riscv(s, buf, size);
		break;
#endif
	default:
		/* Never reached but silence compiler warnings. */
		filtered = 0;
		break;
	}

	*pos += filtered;
	s->pos += filtered;
}

/*
 * Flush pending filtered data from temp to the output buffer.
 * Move the remaining mixture of possibly filtered and unfiltered
 * data to the beginning of temp.
 */
static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
{
	size_t copy_size;

	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
	b->out_pos += copy_size;

	s->temp.filtered -= copy_size;
	s->temp.size -= copy_size;
	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
}
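/*
 * Note on the contract between the pieces above and below: each bcj_*
 * function filters as much of the buffer as it can and returns the
 * number of bytes it fully processed. The unprocessed tail, at most
 * the per-filter alignment plus look-ahead bytes, must be presented
 * to the filter again once more data follows it; temp is the staging
 * area that makes this possible across calls.
 */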
/*
 * The BCJ filter functions are primitive in the sense that they process
 * the data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 */
enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2,
			   struct xz_buf *b)
{
	size_t out_start;

	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}

	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If everything cannot be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This needs to be always run when temp.size == 0 to handle a special
	 * case where the output buffer is full and the next filter has no
	 * more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can return XZ_STREAM_END too, since the last few bytes
		 * that remain unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * in trying to decode more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}

	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care of on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}
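/*
 * Illustrative usage sketch (not built; the real wiring lives in the
 * stream decoder, xz_dec_stream.c): a caller allocates the BCJ state,
 * resets it with the Filter ID parsed from the Block Header, and then
 * calls xz_dec_bcj_run() in place of xz_dec_lzma2_run():
 *
 *	struct xz_dec_bcj *bcj = xz_dec_bcj_create(false);
 *
 *	if (bcj == NULL)
 *		return XZ_MEM_ERROR;
 *
 *	if (xz_dec_bcj_reset(bcj, filter_id) != XZ_OK)
 *		return XZ_OPTIONS_ERROR;
 *
 *	ret = xz_dec_bcj_run(bcj, lzma2, b);
 *
 * Here filter_id, lzma2, and b are placeholders for the caller's own
 * stream-decoding state. The object is plain kmalloc'd memory and is
 * released with kfree() when decoding ends.
 */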
struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
{
	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s != NULL)
		s->single_call = single_call;

	return s;
}

enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
{
	switch (id) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
#endif
#ifdef XZ_DEC_ARM64
	case BCJ_ARM64:
#endif
#ifdef XZ_DEC_RISCV
	case BCJ_RISCV:
#endif
		break;

	default:
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	}

	s->type = id;
	s->ret = XZ_OK;
	s->pos = 0;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;
	s->temp.size = 0;

	return XZ_OK;
}

#endif