1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file riscv.c 6 /// \brief Filter for 32-bit/64-bit little/big endian RISC-V binaries 7 /// 8 /// This converts program counter relative addresses in function calls 9 /// (JAL, AUIPC+JALR), address calculation of functions and global 10 /// variables (AUIPC+ADDI), loads (AUIPC+load), and stores (AUIPC+store). 11 /// 12 /// For AUIPC+inst2 pairs, the paired instruction checking is fairly relaxed. 13 /// The paired instruction opcode must only have its lowest two bits set, 14 /// meaning it will convert any paired instruction that is not a 16-bit 15 /// compressed instruction. This was shown to be enough to keep the number 16 /// of false matches low while improving code size and speed. 17 // 18 // Authors: Lasse Collin 19 // Jia Tan 20 // 21 // Special thanks: 22 // 23 // - Chien Wong <m@xv97.com> provided a few early versions of RISC-V 24 // filter variants along with test files and benchmark results. 25 // 26 // - Igor Pavlov helped a lot in the filter design, getting it both 27 // faster and smaller. The implementation here is still independently 28 // written, not based on LZMA SDK. 29 // 30 /////////////////////////////////////////////////////////////////////////////// 31 32 /* 33 34 RISC-V filtering 35 ================ 36 37 RV32I and RV64I, possibly combined with extensions C, Zfh, F, D, 38 and Q, are identical enough that the same filter works for both. 39 40 The instruction encoding is always little endian, even on systems 41 with big endian data access. Thus the same filter works for both 42 endiannesses. 43 44 The following instructions have program counter relative 45 (pc-relative) behavior: 46 47 JAL 48 --- 49 50 JAL is used for function calls (including tail calls) and 51 unconditional jumps within functions. Jumps within functions 52 aren't useful to filter because the absolute addresses often 53 appear only once or at most a few times. Tail calls and jumps 54 within functions look the same to a simple filter so neither 55 are filtered, that is, JAL x0 is ignored (the ABI name of the 56 register x0 is "zero"). 57 58 Almost all calls store the return address to register x1 (ra) 59 or x5 (t0). To reduce false matches when the filter is applied 60 to non-code data, only the JAL instructions that use x1 or x5 61 are converted. JAL has pc-relative range of +/-1 MiB so longer 62 calls and jumps need another method (AUIPC+JALR). 63 64 C.J and C.JAL 65 ------------- 66 67 C.J and C.JAL have pc-relative range of +/-2 KiB. 68 69 C.J is for tail calls and jumps within functions and isn't 70 filtered for the reasons mentioned for JAL x0. 71 72 C.JAL is an RV32C-only instruction. Its encoding overlaps with 73 RV64C-only C.ADDIW which is a common instruction. So if filtering 74 C.JAL was useful (it wasn't tested) then a separate filter would 75 be needed for RV32 and RV64. Also, false positives would be a 76 significant problem when the filter is applied to non-code data 77 because C.JAL needs only five bits to match. Thus, this filter 78 doesn't modify C.JAL instructions. 79 80 BEQ, BNE, BLT, BGE, BLTU, BGEU, C.BEQZ, and C.BNEZ 81 -------------------------------------------------- 82 83 These are conditional branches with pc-relative range 84 of +/-4 KiB (+/-256 B for C.*). The absolute addresses often 85 appear only once and very short distances are the most common, 86 so filtering these instructions would make compression worse. 87 88 AUIPC with rd != x0 89 ------------------- 90 91 AUIPC is paired with a second instruction (inst2) to do 92 pc-relative jumps, calls, loads, stores, and for taking 93 an address of a symbol. AUIPC has a 20-bit immediate and 94 the possible inst2 choices have a 12-bit immediate. 95 96 AUIPC stores pc + 20-bit signed immediate to a register. 97 The immediate encodes a multiple of 4 KiB so AUIPC itself 98 has a pc-relative range of +/-2 GiB. AUIPC does *NOT* set 99 the lowest 12 bits of the result to zero! This means that 100 the 12-bit immediate in inst2 cannot just include the lowest 101 12 bits of the absolute address as is; the immediate has to 102 compensate for the lowest 12 bits that AUIPC copies from the 103 program counter. This means that a good filter has to convert 104 not only AUIPC but also the paired inst2. 105 106 A strict filter would focus on filtering the following 107 AUIPC+inst2 pairs: 108 109 - AUIPC+JALR: Function calls, including tail calls. 110 111 - AUIPC+ADDI: Calculating the address of a function 112 or a global variable. 113 114 - AUIPC+load/store from the base instruction sets 115 (RV32I, RV64I) or from the floating point extensions 116 Zfh, F, D, and Q: 117 * RV32I: LB, LH, LW, LBU, LHU, SB, SH, SW 118 * RV64I has also: LD, LWU, SD 119 * Zfh: FLH, FSH 120 * F: FLW, FSW 121 * D: FLD, FSD 122 * Q: FLQ, FSQ 123 124 NOTE: AUIPC+inst2 can only be a pair if AUIPC's rd specifies 125 the same register as inst2's rs1. 126 127 Instead of strictly accepting only the above instructions as inst2, 128 this filter uses a much simpler condition: the lowest two bits of 129 inst2 must be set, that is, inst2 must not be a 16-bit compressed 130 instruction. So this will accept all 32-bit and possible future 131 extended instructions as a pair to AUIPC if the bits in AUIPC's 132 rd [11:7] match the bits [19:15] in inst2 (the bits that I-type and 133 S-type instructions use for rs1). Testing showed that this relaxed 134 condition for inst2 did not consistently or significantly affect 135 compression ratio but it reduced code size and improved speed. 136 137 Additionally, the paired instruction is always treated as an I-type 138 instruction. The S-type instructions used by stores (SB, SH, SW, 139 etc.) place the lowest 5 bits of the immediate in a different 140 location than I-type instructions. AUIPC+store pairs are less 141 common than other pairs, and testing showed that the extra 142 code required to handle S-type instructions was not worth the 143 compression ratio gained. 144 145 AUIPC+inst2 don't necessarily appear sequentially next to each 146 other although very often they do. Especially AUIPC+JALR are 147 sequential as that may allow instruction fusion in processors 148 (and perhaps help branch prediction as a fused AUIPC+JALR is 149 a direct branch while JALR alone is an indirect branch). 150 151 Clang 16 can generate code where AUIPC+inst2 is split: 152 153 - AUIPC is outside a loop and inst2 (load/store) is inside 154 the loop. This way the AUIPC instruction needs to be 155 executed only once. 156 157 - Load-modify-store may have AUIPC for the load and the same 158 AUIPC-result is used for the store too. This may get combined 159 with AUIPC being outside the loop. 160 161 - AUIPC is before a conditional branch and inst2 is hundreds 162 of bytes away at the branch target. 163 164 - Inner and outer pair: 165 166 auipc a1,0x2f 167 auipc a2,0x3d 168 ld a2,-500(a2) 169 addi a1,a1,-233 170 171 - Many split pairs with an untaken conditional branch between: 172 173 auipc s9,0x1613 # Pair 1 174 auipc s4,0x1613 # Pair 2 175 auipc s6,0x1613 # Pair 3 176 auipc s10,0x1613 # Pair 4 177 beqz a5,a3baae 178 ld a0,0(a6) 179 ld a6,246(s9) # Pair 1 180 ld a1,250(s4) # Pair 2 181 ld a3,254(s6) # Pair 3 182 ld a4,258(s10) # Pair 4 183 184 It's not possible to find all split pairs in a filter like this. 185 At least in 2024, simple sequential pairs are 99 % of AUIPC uses 186 so filtering only such pairs gives good results and makes the 187 filter simpler. However, it's possible that future compilers will 188 produce different code where sequential pairs aren't as common. 189 190 This filter doesn't convert AUIPC instructions alone because: 191 192 (1) The conversion would be off-by-one (or off-by-4096) half the 193 time because the lowest 12 bits from inst2 (inst2_imm12) 194 aren't known. We only know that the absolute address is 195 pc + AUIPC_imm20 + [-2048, +2047] but there is no way to 196 know the exact 4096-byte multiple (or 4096 * n + 2048): 197 there are always two possibilities because AUIPC copies 198 the 12 lowest bits from pc instead of zeroing them. 199 200 NOTE: The sign-extension of inst2_imm12 adds a tiny bit 201 of extra complexity to AUIPC math in general but it's not 202 the reason for this problem. The sign-extension only changes 203 the relative position of the pc-relative 4096-byte window. 204 205 (2) Matching AUIPC instruction alone requires only seven bits. 206 When the filter is applied to non-code data, that leads 207 to many false positives which make compression worse. 208 As long as most AUIPC+inst2 pairs appear as two consecutive 209 instructions, converting only such pairs gives better results. 210 211 In assembly, AUIPC+inst2 tend to look like this: 212 213 # Call: 214 auipc ra, 0x12345 215 jalr ra, -42(ra) 216 217 # Tail call: 218 auipc t1, 0x12345 219 jalr zero, -42(t1) 220 221 # Getting the absolute address: 222 auipc a0, 0x12345 223 addi a0, a0, -42 224 225 # rd of inst2 isn't necessarily the same as rs1 even 226 # in cases where there is no reason to preserve rs1. 227 auipc a0, 0x12345 228 addi a1, a0, -42 229 230 As of 2024, 16-bit instructions from the C extension don't 231 appear as inst2. The RISC-V psABI doesn't list AUIPC+C.* as 232 a linker relaxation type explicitly but it's not disallowed 233 either. Usefulness is limited as most of the time the lowest 234 12 bits won't fit in a C instruction. This filter doesn't 235 support AUIPC+C.* combinations because this makes the filter 236 simpler, there are no test files, and it hopefully will never 237 be needed anyway. 238 239 (Compare AUIPC to ARM64 where ADRP does set the lowest 12 bits 240 to zero. The paired instruction has the lowest 12 bits of the 241 absolute address as is in a zero-extended immediate. Thus the 242 ARM64 filter doesn't need to care about the instructions that 243 are paired with ADRP. An off-by-4096 issue can still occur if 244 the code section isn't aligned with the filter's start offset. 245 It's not a problem with standalone ELF files but Windows PE 246 files need start_offset=3072 for best results. Also, a .tar 247 stores files with 512-byte alignment so most of the time it 248 won't be the best for ARM64.) 249 250 AUIPC with rd == x0 251 ------------------- 252 253 AUIPC instructions with rd=x0 are reserved for HINTs in the base 254 instruction set. Such AUIPC instructions are never filtered. 255 256 As of January 2024, it seems likely that AUIPC with rd=x0 will 257 be used for landing pads (pseudoinstruction LPAD). LPAD is used 258 to mark valid targets for indirect jumps (for JALR), for example, 259 beginnings of functions. The 20-bit immediate in LPAD instruction 260 is a label, not a pc-relative address. Thus it would be 261 counterproductive to convert AUIPC instructions with rd=x0. 262 263 Often the next instruction after LPAD won't have rs1=x0 and thus 264 the filtering would be skipped for that reason alone. However, 265 it's not good to rely on this. For example, consider a function 266 that begins like this: 267 268 int foo(int i) 269 { 270 if (i <= 234) { 271 ... 272 } 273 274 A compiler may generate something like this: 275 276 lpad 0x54321 277 li a5, 234 278 bgt a0, a5, .L2 279 280 Converting the pseudoinstructions to raw instructions: 281 282 auipc x0, 0x54321 283 addi x15, x0, 234 284 blt x15, x10, .L2 285 286 In this case the filter would undesirably convert the AUIPC+ADDI 287 pair if the filter didn't explicitly skip AUIPC instructions 288 that have rd=x0. 289 290 */ 291 292 293 #include "simple_private.h" 294 295 296 // This checks two conditions at once: 297 // - AUIPC rd == inst2 rs1. 298 // - inst2 opcode has the lowest two bits set. 299 // 300 // The 8 bit left shift aligns the rd of AUIPC with the rs1 of inst2. 301 // By XORing the registers, any non-zero value in those bits indicates the 302 // registers are not equal and thus not an AUIPC pair. Subtracting 3 from 303 // inst2 will zero out the first two opcode bits only when they are set. 304 // The mask tests if any of the register or opcode bits are set (and thus 305 // not an AUIPC pair). 306 // 307 // Alternative expression: (((((auipc) << 8) ^ (inst2)) & 0xF8003) != 3) 308 #define NOT_AUIPC_PAIR(auipc, inst2) \ 309 ((((auipc) << 8) ^ ((inst2) - 3)) & 0xF8003) 310 311 // This macro checks multiple conditions: 312 // (1) AUIPC rd [11:7] == x2 (special rd value). 313 // (2) AUIPC bits 12 and 13 set (the lowest two opcode bits of packed inst2). 314 // (3) inst2_rs1 doesn't equal x0 or x2 because the opposite 315 // conversion is only done when 316 // auipc_rd != x0 && 317 // auipc_rd != x2 && 318 // auipc_rd == inst2_rs1. 319 // 320 // The left-hand side takes care of (1) and (2). 321 // (a) The lowest 7 bits are already known to be AUIPC so subtracting 0x17 322 // makes those bits zeros. 323 // (b) If AUIPC rd equals x2, subtracting 0x100 makes bits [11:7] zeros. 324 // If rd doesn't equal x2, then there will be at least one non-zero bit 325 // and the next step (c) is irrelevant. 326 // (c) If the lowest two opcode bits of the packed inst2 are set in [13:12], 327 // then subtracting 0x3000 will make those bits zeros. Otherwise there 328 // will be at least one non-zero bit. 329 // 330 // The shift by 18 removes the high bits from the final '>=' comparison and 331 // ensures that any non-zero result will be larger than any possible result 332 // from the right-hand side of the comparison. The cast ensures that the 333 // left-hand side didn't get promoted to a larger type than uint32_t. 334 // 335 // On the right-hand side, inst2_rs1 & 0x1D will be non-zero as long as 336 // inst2_rs1 is not x0 or x2. 337 // 338 // The final '>=' comparison will make the expression true if: 339 // - The subtraction caused any bits to be set (special AUIPC rd value not 340 // used or inst2 opcode bits not set). (non-zero >= non-zero or 0) 341 // - The subtraction did not cause any bits to be set but inst2_rs1 was 342 // x0 or x2. (0 >= 0) 343 #define NOT_SPECIAL_AUIPC(auipc, inst2_rs1) \ 344 ((uint32_t)(((auipc) - 0x3117) << 18) >= ((inst2_rs1) & 0x1D)) 345 346 347 // The encode and decode functions are split for this filter because of the 348 // AUIPC+inst2 filtering. This filter design allows a decoder-only 349 // implementation to be smaller than alternative designs. 350 351 #ifdef HAVE_ENCODER_RISCV 352 static size_t 353 riscv_encode(void *simple lzma_attribute((__unused__)), 354 uint32_t now_pos, 355 bool is_encoder lzma_attribute((__unused__)), 356 uint8_t *buffer, size_t size) 357 { 358 // Avoid using i + 8 <= size in the loop condition. 359 // 360 // NOTE: If there is a JAL in the last six bytes of the stream, it 361 // won't be converted. This is intentional to keep the code simpler. 362 if (size < 8) 363 return 0; 364 365 size -= 8; 366 367 size_t i; 368 369 // The loop is advanced by 2 bytes every iteration since the 370 // instruction stream may include 16-bit instructions (C extension). 371 for (i = 0; i <= size; i += 2) { 372 uint32_t inst = buffer[i]; 373 374 if (inst == 0xEF) { 375 // JAL 376 const uint32_t b1 = buffer[i + 1]; 377 378 // Only filter rd=x1(ra) and rd=x5(t0). 379 if ((b1 & 0x0D) != 0) 380 continue; 381 382 // The 20-bit immediate is in four pieces. 383 // The encoder stores it in big endian form 384 // since it improves compression slightly. 385 const uint32_t b2 = buffer[i + 2]; 386 const uint32_t b3 = buffer[i + 3]; 387 const uint32_t pc = now_pos + (uint32_t)i; 388 389 // The following chart shows the highest three bytes of JAL, focusing on 390 // the 20-bit immediate field [31:12]. The first row of numbers is the 391 // bit position in a 32-bit little endian instruction. The second row of 392 // numbers shows the order of the immediate field in a J-type instruction. 393 // The last row is the bit number in each byte. 394 // 395 // To determine the amount to shift each bit, subtract the value in 396 // the last row from the value in the second last row. If the number 397 // is positive, shift left. If negative, shift right. 398 // 399 // For example, at the rightmost side of the chart, the bit 4 in b1 is 400 // the bit 12 of the address. Thus that bit needs to be shifted left 401 // by 12 - 4 = 8 bits to put it in the right place in the addr variable. 402 // 403 // NOTE: The immediate of a J-type instruction holds bits [20:1] of 404 // the address. The bit [0] is always 0 and not part of the immediate. 405 // 406 // | b3 | b2 | b1 | 407 // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x | 408 // | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x | 409 // | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x | 410 411 uint32_t addr = ((b1 & 0xF0) << 8) 412 | ((b2 & 0x0F) << 16) 413 | ((b2 & 0x10) << 7) 414 | ((b2 & 0xE0) >> 4) 415 | ((b3 & 0x7F) << 4) 416 | ((b3 & 0x80) << 13); 417 418 addr += pc; 419 420 buffer[i + 1] = (uint8_t)((b1 & 0x0F) 421 | ((addr >> 13) & 0xF0)); 422 423 buffer[i + 2] = (uint8_t)(addr >> 9); 424 buffer[i + 3] = (uint8_t)(addr >> 1); 425 426 // The "-2" is included because the for-loop will 427 // always increment by 2. In this case, we want to 428 // skip an extra 2 bytes since we used 4 bytes 429 // of input. 430 i += 4 - 2; 431 432 } else if ((inst & 0x7F) == 0x17) { 433 // AUIPC 434 inst |= (uint32_t)buffer[i + 1] << 8; 435 inst |= (uint32_t)buffer[i + 2] << 16; 436 inst |= (uint32_t)buffer[i + 3] << 24; 437 438 // Branch based on AUIPC's rd. The bitmask test does 439 // the same thing as this: 440 // 441 // const uint32_t auipc_rd = (inst >> 7) & 0x1F; 442 // if (auipc_rd != 0 && auipc_rd != 2) { 443 if (inst & 0xE80) { 444 // AUIPC's rd doesn't equal x0 or x2. 445 446 // Check if AUIPC+inst2 are a pair. 447 uint32_t inst2 = read32le(buffer + i + 4); 448 449 if (NOT_AUIPC_PAIR(inst, inst2)) { 450 // The NOT_AUIPC_PAIR macro allows 451 // a false AUIPC+AUIPC pair if the 452 // bits [19:15] (where rs1 would be) 453 // in the second AUIPC match the rd 454 // of the first AUIPC. 455 // 456 // We must skip enough forward so 457 // that the first two bytes of the 458 // second AUIPC cannot get converted. 459 // Such a conversion could make the 460 // current pair become a valid pair 461 // which would desync the decoder. 462 // 463 // Skipping six bytes is enough even 464 // though the above condition looks 465 // at the lowest four bits of the 466 // buffer[i + 6] too. This is safe 467 // because this filter never changes 468 // those bits if a conversion at 469 // that position is done. 470 i += 6 - 2; 471 continue; 472 } 473 474 // Convert AUIPC+inst2 to a special format: 475 // 476 // - The lowest 7 bits [6:0] retain the 477 // AUIPC opcode. 478 // 479 // - The rd [11:7] is set to x2(sp). x2 is 480 // used as the stack pointer so AUIPC with 481 // rd=x2 should be very rare in real-world 482 // executables. 483 // 484 // - The remaining 20 bits [31:12] (that 485 // normally hold the pc-relative immediate) 486 // are used to store the lowest 20 bits of 487 // inst2. That is, the 12-bit immediate of 488 // inst2 is not included. 489 // 490 // - The location of the original inst2 is 491 // used to store the 32-bit absolute 492 // address in big endian format. Compared 493 // to the 20+12-bit split encoding, this 494 // results in a longer uninterrupted 495 // sequence of identical common bytes 496 // when the same address is referred 497 // with different instruction pairs 498 // (like AUIPC+LD vs. AUIPC+ADDI) or 499 // when the occurrences of the same 500 // pair use different registers. When 501 // referring to adjacent memory locations 502 // (like function calls that go via the 503 // ELF PLT), in big endian order only the 504 // last 1-2 bytes differ; in little endian 505 // the differing 1-2 bytes would be in the 506 // middle of the 8-byte sequence. 507 // 508 // When reversing the transformation, the 509 // original rd of AUIPC can be restored 510 // from inst2's rs1 as they are required to 511 // be the same. 512 513 // Arithmetic right shift makes sign extension 514 // trivial but (1) it's implementation-defined 515 // behavior (C99/C11/C23 6.5.7-p5) and so is 516 // (2) casting unsigned to signed (6.3.1.3-p3). 517 // 518 // One can check for (1) with 519 // 520 // if ((-1 >> 1) == -1) ... 521 // 522 // but (2) has to be checked from the 523 // compiler docs. GCC promises that (1) 524 // and (2) behave in the common expected 525 // way and thus 526 // 527 // addr += (uint32_t)( 528 // (int32_t)inst2 >> 20); 529 // 530 // does the same as the code below. But since 531 // the 100 % portable way is only a few bytes 532 // bigger code and there is no real speed 533 // difference, let's just use that, especially 534 // since the decoder doesn't need this at all. 535 uint32_t addr = inst & 0xFFFFF000; 536 addr += (inst2 >> 20) 537 - ((inst2 >> 19) & 0x1000); 538 539 addr += now_pos + (uint32_t)i; 540 541 // Construct the first 32 bits: 542 // [6:0] AUIPC opcode 543 // [11:7] Special AUIPC rd = x2 544 // [31:12] The lowest 20 bits of inst2 545 inst = 0x17 | (2 << 7) | (inst2 << 12); 546 547 write32le(buffer + i, inst); 548 549 // The second 32 bits store the absolute 550 // address in big endian order. 551 write32be(buffer + i + 4, addr); 552 } else { 553 // AUIPC's rd equals x0 or x2. 554 // 555 // x0 indicates a landing pad (LPAD). 556 // It's always skipped. 557 // 558 // AUIPC with rd == x2 is used for the special 559 // format as explained above. When the input 560 // contains a byte sequence that matches the 561 // special format, "fake" decoding must be 562 // done to keep the filter bijective (that 563 // is, safe to apply on arbitrary data). 564 // 565 // See the "x0 or x2" section in riscv_decode() 566 // for how the "real" decoding is done. The 567 // "fake" decoding is a simplified version 568 // of "real" decoding with the following 569 // differences (these reduce code size of 570 // the decoder): 571 // (1) The lowest 12 bits aren't sign-extended. 572 // (2) No address conversion is done. 573 // (3) Big endian format isn't used (the fake 574 // address is in little endian order). 575 576 // Check if inst matches the special format. 577 const uint32_t fake_rs1 = inst >> 27; 578 579 if (NOT_SPECIAL_AUIPC(inst, fake_rs1)) { 580 i += 4 - 2; 581 continue; 582 } 583 584 const uint32_t fake_addr = 585 read32le(buffer + i + 4); 586 587 // Construct the second 32 bits: 588 // [19:0] Upper 20 bits from AUIPC 589 // [31:20] The lowest 12 bits of fake_addr 590 const uint32_t fake_inst2 = (inst >> 12) 591 | (fake_addr << 20); 592 593 // Construct new first 32 bits from: 594 // [6:0] AUIPC opcode 595 // [11:7] Fake AUIPC rd = fake_rs1 596 // [31:12] The highest 20 bits of fake_addr 597 inst = 0x17 | (fake_rs1 << 7) 598 | (fake_addr & 0xFFFFF000); 599 600 write32le(buffer + i, inst); 601 write32le(buffer + i + 4, fake_inst2); 602 } 603 604 i += 8 - 2; 605 } 606 } 607 608 return i; 609 } 610 611 612 extern lzma_ret 613 lzma_simple_riscv_encoder_init(lzma_next_coder *next, 614 const lzma_allocator *allocator, 615 const lzma_filter_info *filters) 616 { 617 return lzma_simple_coder_init(next, allocator, filters, 618 &riscv_encode, 0, 8, 2, true); 619 } 620 #endif 621 622 623 #ifdef HAVE_DECODER_RISCV 624 static size_t 625 riscv_decode(void *simple lzma_attribute((__unused__)), 626 uint32_t now_pos, 627 bool is_encoder lzma_attribute((__unused__)), 628 uint8_t *buffer, size_t size) 629 { 630 if (size < 8) 631 return 0; 632 633 size -= 8; 634 635 size_t i; 636 for (i = 0; i <= size; i += 2) { 637 uint32_t inst = buffer[i]; 638 639 if (inst == 0xEF) { 640 // JAL 641 const uint32_t b1 = buffer[i + 1]; 642 643 // Only filter rd=x1(ra) and rd=x5(t0). 644 if ((b1 & 0x0D) != 0) 645 continue; 646 647 const uint32_t b2 = buffer[i + 2]; 648 const uint32_t b3 = buffer[i + 3]; 649 const uint32_t pc = now_pos + (uint32_t)i; 650 651 // | b3 | b2 | b1 | 652 // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x | 653 // | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x | 654 // | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x | 655 656 uint32_t addr = ((b1 & 0xF0) << 13) 657 | (b2 << 9) | (b3 << 1); 658 659 addr -= pc; 660 661 buffer[i + 1] = (uint8_t)((b1 & 0x0F) 662 | ((addr >> 8) & 0xF0)); 663 664 buffer[i + 2] = (uint8_t)(((addr >> 16) & 0x0F) 665 | ((addr >> 7) & 0x10) 666 | ((addr << 4) & 0xE0)); 667 668 buffer[i + 3] = (uint8_t)(((addr >> 4) & 0x7F) 669 | ((addr >> 13) & 0x80)); 670 671 i += 4 - 2; 672 673 } else if ((inst & 0x7F) == 0x17) { 674 // AUIPC 675 uint32_t inst2; 676 677 inst |= (uint32_t)buffer[i + 1] << 8; 678 inst |= (uint32_t)buffer[i + 2] << 16; 679 inst |= (uint32_t)buffer[i + 3] << 24; 680 681 if (inst & 0xE80) { 682 // AUIPC's rd doesn't equal x0 or x2. 683 684 // Check if it is a "fake" AUIPC+inst2 pair. 685 inst2 = read32le(buffer + i + 4); 686 687 if (NOT_AUIPC_PAIR(inst, inst2)) { 688 i += 6 - 2; 689 continue; 690 } 691 692 // Decode (or more like re-encode) the "fake" 693 // pair. The "fake" format doesn't do 694 // sign-extension, address conversion, or 695 // use big endian. (The use of little endian 696 // allows sharing the write32le() calls in 697 // the decoder to reduce code size when 698 // unaligned access isn't supported.) 699 uint32_t addr = inst & 0xFFFFF000; 700 addr += inst2 >> 20; 701 702 inst = 0x17 | (2 << 7) | (inst2 << 12); 703 inst2 = addr; 704 } else { 705 // AUIPC's rd equals x0 or x2. 706 707 // Check if inst matches the special format 708 // used by the encoder. 709 const uint32_t inst2_rs1 = inst >> 27; 710 711 if (NOT_SPECIAL_AUIPC(inst, inst2_rs1)) { 712 i += 4 - 2; 713 continue; 714 } 715 716 // Decode the "real" pair. 717 uint32_t addr = read32be(buffer + i + 4); 718 719 addr -= now_pos + (uint32_t)i; 720 721 // The second instruction: 722 // - Get the lowest 20 bits from inst. 723 // - Add the lowest 12 bits of the address 724 // as the immediate field. 725 inst2 = (inst >> 12) | (addr << 20); 726 727 // AUIPC: 728 // - rd is the same as inst2_rs1. 729 // - The sign extension of the lowest 12 bits 730 // must be taken into account. 731 inst = 0x17 | (inst2_rs1 << 7) 732 | ((addr + 0x800) & 0xFFFFF000); 733 } 734 735 // Both decoder branches write in little endian order. 736 write32le(buffer + i, inst); 737 write32le(buffer + i + 4, inst2); 738 739 i += 8 - 2; 740 } 741 } 742 743 return i; 744 } 745 746 747 extern lzma_ret 748 lzma_simple_riscv_decoder_init(lzma_next_coder *next, 749 const lzma_allocator *allocator, 750 const lzma_filter_info *filters) 751 { 752 return lzma_simple_coder_init(next, allocator, filters, 753 &riscv_decode, 0, 8, 2, false); 754 } 755 #endif 756