// SPDX-License-Identifier: 0BSD

/*
 * Branch/Call/Jump (BCJ) filter decoders
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <https://7-zip.org/>
 */

#include "xz_private.h"

/*
 * The rest of the file is inside this ifdef. It makes things a little more
 * convenient when building without support for any BCJ filters.
 */
#ifdef XZ_DEC_BCJ

struct xz_dec_bcj {
	/* Type of the BCJ filter being used */
	enum {
		BCJ_X86 = 4,		/* x86 or x86-64 */
		BCJ_POWERPC = 5,	/* Big endian only */
		BCJ_IA64 = 6,		/* Big or little endian */
		BCJ_ARM = 7,		/* Little endian only */
		BCJ_ARMTHUMB = 8,	/* Little endian only */
		BCJ_SPARC = 9		/* Big or little endian */
	} type;

	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;

	/* x86 filter state (carried across calls to bcj_x86()) */
	uint32_t x86_prev_mask;

	/* Temporary space to hold the variables from struct xz_buf */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type		Alignment	Look-ahead
		 * x86		    1		    4
		 * PowerPC	    4		    0
		 * IA-64	   16		    0
		 * ARM		    4		    0
		 * ARM-Thumb	    2		    2
		 * SPARC	    4		    0
		 */
		uint8_t buf[16];
	} temp;
};

#ifdef XZ_DEC_X86
/*
 * This is used to test the most significant byte of a memory address
 * in an x86 instruction.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	return b == 0x00 || b == 0xFF;
}

/*
 * x86 BCJ filter (decoder direction): scans for 0xE8/0xE9 opcode bytes
 * (matched via (buf[i] & 0xFE) == 0xE8) and converts the 32-bit little
 * endian operand that follows back from absolute to relative form by
 * subtracting the current stream position. prev_mask/prev_pos track the
 * recently seen opcode bytes so that overlapping false matches are
 * skipped; the mask is preserved in s->x86_prev_mask across calls.
 * Returns the number of bytes fully processed (size - 4 look-ahead
 * bytes remain unprocessed for the next call).
 */
static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;
	size_t prev_pos = (size_t)-1;
	uint32_t prev_mask = s->x86_prev_mask;
	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	/* Need at least opcode byte + 4 operand bytes to do anything. */
	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		/* Distance from the previous 0xE8/0xE9 match. */
		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			prev_mask = 0;
		} else {
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					/* Looks like a false match; skip it. */
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			while (true) {
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}

			/* Sign-extend from bit 24 over the top byte. */
			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	/* Save the match-distance state for the next call. */
	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
#endif

#ifdef XZ_DEC_POWERPC
/*
 * PowerPC (big endian) BCJ filter: converts branch-and-link instructions
 * (top six bits 0b010010 with AA = 0 and LK = 1, i.e.
 * (instr & 0xFC000003) == 0x48000001) by subtracting the current position
 * from the 24-bit LI address field (bits 0x03FFFFFC). Processes whole
 * 4-byte words only; returns the number of bytes processed.
 */
static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	for (i = 0; i + 4 <= size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr & 0xFC000003) == 0x48000001) {
			instr &= 0x03FFFFFC;
			instr -= s->pos + (uint32_t)i;
			instr &= 0x03FFFFFC;
			instr |= 0x48000001;
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_IA64
/*
 * IA-64 BCJ filter: processes 16-byte instruction bundles. The low five
 * bits of the first byte select the bundle template; branch_table maps
 * the template to a bitmask of the 41-bit instruction slots that hold a
 * convertible branch. Matching slots get the current position subtracted
 * from their 21-bit address field. Returns the number of bytes processed
 * (a multiple of 16).
 */
static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	static const uint8_t branch_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		4, 4, 6, 6, 0, 0, 7, 7,
		4, 4, 0, 0, 4, 4, 0, 0
	};

	/*
	 * The local variables take a little bit stack space, but it's less
	 * than what LZMA2 decoder takes, so it doesn't make sense to reduce
	 * stack usage here without doing that for the LZMA2 decoder too.
	 */

	/* Loop counters */
	size_t i;
	size_t j;

	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
	uint32_t slot;

	/* Bitwise offset of the instruction indicated by slot */
	uint32_t bit_pos;

	/* bit_pos split into byte and bit parts */
	uint32_t byte_pos;
	uint32_t bit_res;

	/* Address part of an instruction */
	uint32_t addr;

	/* Mask used to detect which instructions to convert */
	uint32_t mask;

	/* 41-bit instruction stored somewhere in the lowest 48 bits */
	uint64_t instr;

	/* Instruction normalized with bit_res for easier manipulation */
	uint64_t norm;

	for (i = 0; i + 16 <= size; i += 16) {
		mask = branch_table[buf[i] & 0x1F];
		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
			if (((mask >> slot) & 1) == 0)
				continue;

			byte_pos = bit_pos >> 3;
			bit_res = bit_pos & 7;
			instr = 0;
			for (j = 0; j < 6; ++j)
				instr |= (uint64_t)(buf[i + j + byte_pos])
						<< (8 * j);

			norm = instr >> bit_res;

			if (((norm >> 37) & 0x0F) == 0x05
					&& ((norm >> 9) & 0x07) == 0) {
				addr = (norm >> 13) & 0x0FFFFF;
				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
				addr <<= 4;
				addr -= s->pos + (uint32_t)i;
				addr >>= 4;

				/* Write the adjusted 21-bit address back. */
				norm &= ~((uint64_t)0x8FFFFF << 13);
				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
				norm |= (uint64_t)(addr & 0x100000)
						<< (36 - 20);

				instr &= (1 << bit_res) - 1;
				instr |= norm << bit_res;

				for (j = 0; j < 6; j++)
					buf[i + j + byte_pos]
						= (uint8_t)(instr >> (8 * j));
			}
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_ARM
/*
 * ARM (little endian) BCJ filter: converts instructions whose high byte
 * is 0xEB (BL) by subtracting the current position (plus the 8-byte
 * pipeline offset visible in the code below) from the 24-bit address
 * field stored in the three low bytes. Processes whole 4-byte words;
 * returns the number of bytes processed.
 */
static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	for (i = 0; i + 4 <= size; i += 4) {
		if (buf[i + 3] == 0xEB) {
			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
					| ((uint32_t)buf[i + 2] << 16);
			addr <<= 2;
			addr -= s->pos + (uint32_t)i + 8;
			addr >>= 2;
			buf[i] = (uint8_t)addr;
			buf[i + 1] = (uint8_t)(addr >> 8);
			buf[i + 2] = (uint8_t)(addr >> 16);
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_ARMTHUMB
/*
 * ARM-Thumb (little endian) BCJ filter: matches the two-halfword BL
 * encoding (first halfword high bits 0xF0, second 0xF8), reassembles the
 * 22-bit address from the two 11-bit halves, subtracts the current
 * position (plus 4), and stores the halves back. On a match the loop
 * advances an extra two bytes so the second halfword is not rescanned.
 */
static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	for (i = 0; i + 4 <= size; i += 2) {
		if ((buf[i + 1] & 0xF8) == 0xF0
				&& (buf[i + 3] & 0xF8) == 0xF8) {
			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
					| ((uint32_t)buf[i] << 11)
					| (((uint32_t)buf[i + 3] & 0x07) << 8)
					| (uint32_t)buf[i + 2];
			addr <<= 1;
			addr -= s->pos + (uint32_t)i + 4;
			addr >>= 1;
			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
			buf[i] = (uint8_t)(addr >> 11);
			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
			buf[i + 2] = (uint8_t)addr;
			i += 2;
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_SPARC
/*
 * SPARC BCJ filter: matches call instructions where the top ten bits are
 * 0x100 or 0x1FF (instr >> 22), subtracts the current position from the
 * 30-bit word displacement, and rebuilds the instruction with the
 * displacement sign-extended into bit 22. Processes whole 4-byte words.
 */
static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	for (i = 0; i + 4 <= size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
			instr <<= 2;
			instr -= s->pos + (uint32_t)i;
			instr >>= 2;
			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
					| 0x40000000 | (instr & 0x3FFFFF);
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif

/*
 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
 * of data that got filtered.
 *
 * NOTE: This is implemented as a switch statement to avoid using function
 * pointers, which could be problematic in the kernel boot code, which must
 * avoid pointers to static data (at least on x86).
 */
static void bcj_apply(struct xz_dec_bcj *s,
		      uint8_t *buf, size_t *pos, size_t size)
{
	size_t filtered;

	buf += *pos;
	size -= *pos;

	switch (s->type) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
		filtered = bcj_x86(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
		filtered = bcj_powerpc(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
		filtered = bcj_ia64(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
		filtered = bcj_arm(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
		filtered = bcj_armthumb(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
		filtered = bcj_sparc(s, buf, size);
		break;
#endif
	default:
		/* Never reached but silence compiler warnings. */
		filtered = 0;
		break;
	}

	*pos += filtered;
	s->pos += filtered;
}

/*
 * Flush pending filtered data from temp to the output buffer.
 * Move the remaining mixture of possibly filtered and unfiltered
 * data to the beginning of temp.
 */
static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
{
	size_t copy_size;

	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
	b->out_pos += copy_size;

	s->temp.filtered -= copy_size;
	s->temp.size -= copy_size;
	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
}

/*
 * The BCJ filter functions are primitive in sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 */
XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
				     struct xz_dec_lzma2 *lzma2,
				     struct xz_buf *b)
{
	size_t out_start;

	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}

	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If everything cannot be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This needs to be always run when temp.size == 0 to handle a special
	 * case where the output buffer is full and the next filter has no
	 * more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can do that too, since the last few bytes that remain
		 * unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * to try decoding more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}

	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}

/*
 * Allocate a BCJ decoder state. Returns NULL if kmalloc() fails; the
 * caller must check. The state must be initialized with
 * xz_dec_bcj_reset() before use.
 */
XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
{
	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s != NULL)
		s->single_call = single_call;

	return s;
}

/*
 * Validate the .xz Filter ID and reset the decoder state for a new Block.
 * Returns XZ_OPTIONS_ERROR for Filter IDs that are unknown or compiled
 * out, XZ_OK otherwise.
 */
XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
{
	switch (id) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
#endif
		break;

	default:
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	}

	s->type = id;
	s->ret = XZ_OK;
	s->pos = 0;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;
	s->temp.size = 0;

	return XZ_OK;
}

#endif