xref: /freebsd/contrib/xz/src/liblzma/simple/riscv.c (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
13b35e7eeSXin LI // SPDX-License-Identifier: 0BSD
23b35e7eeSXin LI 
33b35e7eeSXin LI ///////////////////////////////////////////////////////////////////////////////
43b35e7eeSXin LI //
53b35e7eeSXin LI /// \file       riscv.c
63b35e7eeSXin LI /// \brief      Filter for 32-bit/64-bit little/big endian RISC-V binaries
73b35e7eeSXin LI ///
83b35e7eeSXin LI /// This converts program counter relative addresses in function calls
93b35e7eeSXin LI /// (JAL, AUIPC+JALR), address calculation of functions and global
103b35e7eeSXin LI /// variables (AUIPC+ADDI), loads (AUIPC+load), and stores (AUIPC+store).
113b35e7eeSXin LI ///
/// For AUIPC+inst2 pairs, the paired instruction checking is fairly relaxed.
/// The only requirement for the opcode of the paired instruction is that
/// its lowest two bits are set, meaning that any paired instruction that
/// is not a 16-bit compressed instruction will be converted. This was shown
/// to be enough to keep the number of false matches low while improving
/// code size and speed.
173b35e7eeSXin LI //
183b35e7eeSXin LI //  Authors:    Lasse Collin
193b35e7eeSXin LI //              Jia Tan
203b35e7eeSXin LI //
213b35e7eeSXin LI //  Special thanks:
223b35e7eeSXin LI //
233b35e7eeSXin LI //    - Chien Wong <m@xv97.com> provided a few early versions of RISC-V
243b35e7eeSXin LI //      filter variants along with test files and benchmark results.
253b35e7eeSXin LI //
263b35e7eeSXin LI //    - Igor Pavlov helped a lot in the filter design, getting it both
273b35e7eeSXin LI //      faster and smaller. The implementation here is still independently
283b35e7eeSXin LI //      written, not based on LZMA SDK.
293b35e7eeSXin LI //
303b35e7eeSXin LI ///////////////////////////////////////////////////////////////////////////////
313b35e7eeSXin LI 
323b35e7eeSXin LI /*
333b35e7eeSXin LI 
343b35e7eeSXin LI RISC-V filtering
353b35e7eeSXin LI ================
363b35e7eeSXin LI 
373b35e7eeSXin LI     RV32I and RV64I, possibly combined with extensions C, Zfh, F, D,
383b35e7eeSXin LI     and Q, are identical enough that the same filter works for both.
393b35e7eeSXin LI 
403b35e7eeSXin LI     The instruction encoding is always little endian, even on systems
413b35e7eeSXin LI     with big endian data access. Thus the same filter works for both
423b35e7eeSXin LI     endiannesses.
433b35e7eeSXin LI 
443b35e7eeSXin LI     The following instructions have program counter relative
453b35e7eeSXin LI     (pc-relative) behavior:
463b35e7eeSXin LI 
473b35e7eeSXin LI JAL
483b35e7eeSXin LI ---
493b35e7eeSXin LI 
503b35e7eeSXin LI     JAL is used for function calls (including tail calls) and
513b35e7eeSXin LI     unconditional jumps within functions. Jumps within functions
523b35e7eeSXin LI     aren't useful to filter because the absolute addresses often
533b35e7eeSXin LI     appear only once or at most a few times. Tail calls and jumps
543b35e7eeSXin LI     within functions look the same to a simple filter so neither
553b35e7eeSXin LI     are filtered, that is, JAL x0 is ignored (the ABI name of the
563b35e7eeSXin LI     register x0 is "zero").
573b35e7eeSXin LI 
583b35e7eeSXin LI     Almost all calls store the return address to register x1 (ra)
593b35e7eeSXin LI     or x5 (t0). To reduce false matches when the filter is applied
603b35e7eeSXin LI     to non-code data, only the JAL instructions that use x1 or x5
613b35e7eeSXin LI     are converted. JAL has pc-relative range of +/-1 MiB so longer
623b35e7eeSXin LI     calls and jumps need another method (AUIPC+JALR).
633b35e7eeSXin LI 
643b35e7eeSXin LI C.J and C.JAL
653b35e7eeSXin LI -------------
663b35e7eeSXin LI 
673b35e7eeSXin LI     C.J and C.JAL have pc-relative range of +/-2 KiB.
683b35e7eeSXin LI 
693b35e7eeSXin LI     C.J is for tail calls and jumps within functions and isn't
703b35e7eeSXin LI     filtered for the reasons mentioned for JAL x0.
713b35e7eeSXin LI 
723b35e7eeSXin LI     C.JAL is an RV32C-only instruction. Its encoding overlaps with
733b35e7eeSXin LI     RV64C-only C.ADDIW which is a common instruction. So if filtering
743b35e7eeSXin LI     C.JAL was useful (it wasn't tested) then a separate filter would
753b35e7eeSXin LI     be needed for RV32 and RV64. Also, false positives would be a
763b35e7eeSXin LI     significant problem when the filter is applied to non-code data
773b35e7eeSXin LI     because C.JAL needs only five bits to match. Thus, this filter
783b35e7eeSXin LI     doesn't modify C.JAL instructions.
793b35e7eeSXin LI 
803b35e7eeSXin LI BEQ, BNE, BLT, BGE, BLTU, BGEU, C.BEQZ, and C.BNEZ
813b35e7eeSXin LI --------------------------------------------------
823b35e7eeSXin LI 
833b35e7eeSXin LI     These are conditional branches with pc-relative range
843b35e7eeSXin LI     of +/-4 KiB (+/-256 B for C.*). The absolute addresses often
853b35e7eeSXin LI     appear only once and very short distances are the most common,
863b35e7eeSXin LI     so filtering these instructions would make compression worse.
873b35e7eeSXin LI 
883b35e7eeSXin LI AUIPC with rd != x0
893b35e7eeSXin LI -------------------
903b35e7eeSXin LI 
913b35e7eeSXin LI     AUIPC is paired with a second instruction (inst2) to do
923b35e7eeSXin LI     pc-relative jumps, calls, loads, stores, and for taking
933b35e7eeSXin LI     an address of a symbol. AUIPC has a 20-bit immediate and
943b35e7eeSXin LI     the possible inst2 choices have a 12-bit immediate.
953b35e7eeSXin LI 
963b35e7eeSXin LI     AUIPC stores pc + 20-bit signed immediate to a register.
973b35e7eeSXin LI     The immediate encodes a multiple of 4 KiB so AUIPC itself
983b35e7eeSXin LI     has a pc-relative range of +/-2 GiB. AUIPC does *NOT* set
993b35e7eeSXin LI     the lowest 12 bits of the result to zero! This means that
1003b35e7eeSXin LI     the 12-bit immediate in inst2 cannot just include the lowest
1013b35e7eeSXin LI     12 bits of the absolute address as is; the immediate has to
1023b35e7eeSXin LI     compensate for the lowest 12 bits that AUIPC copies from the
1033b35e7eeSXin LI     program counter. This means that a good filter has to convert
1043b35e7eeSXin LI     not only AUIPC but also the paired inst2.
1053b35e7eeSXin LI 
1063b35e7eeSXin LI     A strict filter would focus on filtering the following
1073b35e7eeSXin LI     AUIPC+inst2 pairs:
1083b35e7eeSXin LI 
1093b35e7eeSXin LI       - AUIPC+JALR: Function calls, including tail calls.
1103b35e7eeSXin LI 
1113b35e7eeSXin LI       - AUIPC+ADDI: Calculating the address of a function
1123b35e7eeSXin LI         or a global variable.
1133b35e7eeSXin LI 
1143b35e7eeSXin LI       - AUIPC+load/store from the base instruction sets
1153b35e7eeSXin LI         (RV32I, RV64I) or from the floating point extensions
1163b35e7eeSXin LI         Zfh, F, D, and Q:
1173b35e7eeSXin LI           * RV32I: LB, LH, LW, LBU, LHU, SB, SH, SW
1183b35e7eeSXin LI           * RV64I has also: LD, LWU, SD
1193b35e7eeSXin LI           * Zfh: FLH, FSH
1203b35e7eeSXin LI           * F: FLW, FSW
1213b35e7eeSXin LI           * D: FLD, FSD
1223b35e7eeSXin LI           * Q: FLQ, FSQ
1233b35e7eeSXin LI 
1243b35e7eeSXin LI     NOTE: AUIPC+inst2 can only be a pair if AUIPC's rd specifies
1253b35e7eeSXin LI     the same register as inst2's rs1.
1263b35e7eeSXin LI 
1273b35e7eeSXin LI     Instead of strictly accepting only the above instructions as inst2,
1283b35e7eeSXin LI     this filter uses a much simpler condition: the lowest two bits of
1293b35e7eeSXin LI     inst2 must be set, that is, inst2 must not be a 16-bit compressed
1303b35e7eeSXin LI     instruction. So this will accept all 32-bit and possible future
1313b35e7eeSXin LI     extended instructions as a pair to AUIPC if the bits in AUIPC's
1323b35e7eeSXin LI     rd [11:7] match the bits [19:15] in inst2 (the bits that I-type and
1333b35e7eeSXin LI     S-type instructions use for rs1). Testing showed that this relaxed
1343b35e7eeSXin LI     condition for inst2 did not consistently or significantly affect
1353b35e7eeSXin LI     compression ratio but it reduced code size and improved speed.
1363b35e7eeSXin LI 
1373b35e7eeSXin LI     Additionally, the paired instruction is always treated as an I-type
1383b35e7eeSXin LI     instruction. The S-type instructions used by stores (SB, SH, SW,
1393b35e7eeSXin LI     etc.) place the lowest 5 bits of the immediate in a different
1403b35e7eeSXin LI     location than I-type instructions. AUIPC+store pairs are less
1413b35e7eeSXin LI     common than other pairs, and testing showed that the extra
1423b35e7eeSXin LI     code required to handle S-type instructions was not worth the
1433b35e7eeSXin LI     compression ratio gained.
1443b35e7eeSXin LI 
1453b35e7eeSXin LI     AUIPC+inst2 don't necessarily appear sequentially next to each
1463b35e7eeSXin LI     other although very often they do. Especially AUIPC+JALR are
1473b35e7eeSXin LI     sequential as that may allow instruction fusion in processors
1483b35e7eeSXin LI     (and perhaps help branch prediction as a fused AUIPC+JALR is
1493b35e7eeSXin LI     a direct branch while JALR alone is an indirect branch).
1503b35e7eeSXin LI 
1513b35e7eeSXin LI     Clang 16 can generate code where AUIPC+inst2 is split:
1523b35e7eeSXin LI 
1533b35e7eeSXin LI       - AUIPC is outside a loop and inst2 (load/store) is inside
1543b35e7eeSXin LI         the loop. This way the AUIPC instruction needs to be
1553b35e7eeSXin LI         executed only once.
1563b35e7eeSXin LI 
1573b35e7eeSXin LI       - Load-modify-store may have AUIPC for the load and the same
1583b35e7eeSXin LI         AUIPC-result is used for the store too. This may get combined
1593b35e7eeSXin LI         with AUIPC being outside the loop.
1603b35e7eeSXin LI 
1613b35e7eeSXin LI       - AUIPC is before a conditional branch and inst2 is hundreds
1623b35e7eeSXin LI         of bytes away at the branch target.
1633b35e7eeSXin LI 
1643b35e7eeSXin LI       - Inner and outer pair:
1653b35e7eeSXin LI 
1663b35e7eeSXin LI             auipc   a1,0x2f
1673b35e7eeSXin LI             auipc   a2,0x3d
1683b35e7eeSXin LI             ld      a2,-500(a2)
1693b35e7eeSXin LI             addi    a1,a1,-233
1703b35e7eeSXin LI 
1713b35e7eeSXin LI       - Many split pairs with an untaken conditional branch between:
1723b35e7eeSXin LI 
1733b35e7eeSXin LI             auipc   s9,0x1613   # Pair 1
1743b35e7eeSXin LI             auipc   s4,0x1613   # Pair 2
1753b35e7eeSXin LI             auipc   s6,0x1613   # Pair 3
1763b35e7eeSXin LI             auipc   s10,0x1613  # Pair 4
1773b35e7eeSXin LI             beqz    a5,a3baae
1783b35e7eeSXin LI             ld      a0,0(a6)
1793b35e7eeSXin LI             ld      a6,246(s9)  # Pair 1
1803b35e7eeSXin LI             ld      a1,250(s4)  # Pair 2
1813b35e7eeSXin LI             ld      a3,254(s6)  # Pair 3
1823b35e7eeSXin LI             ld      a4,258(s10) # Pair 4
1833b35e7eeSXin LI 
1843b35e7eeSXin LI     It's not possible to find all split pairs in a filter like this.
1853b35e7eeSXin LI     At least in 2024, simple sequential pairs are 99 % of AUIPC uses
1863b35e7eeSXin LI     so filtering only such pairs gives good results and makes the
1873b35e7eeSXin LI     filter simpler. However, it's possible that future compilers will
1883b35e7eeSXin LI     produce different code where sequential pairs aren't as common.
1893b35e7eeSXin LI 
1903b35e7eeSXin LI     This filter doesn't convert AUIPC instructions alone because:
1913b35e7eeSXin LI 
1923b35e7eeSXin LI     (1) The conversion would be off-by-one (or off-by-4096) half the
1933b35e7eeSXin LI         time because the lowest 12 bits from inst2 (inst2_imm12)
1943b35e7eeSXin LI         aren't known. We only know that the absolute address is
1953b35e7eeSXin LI         pc + AUIPC_imm20 + [-2048, +2047] but there is no way to
1963b35e7eeSXin LI         know the exact 4096-byte multiple (or 4096 * n + 2048):
1973b35e7eeSXin LI         there are always two possibilities because AUIPC copies
1983b35e7eeSXin LI         the 12 lowest bits from pc instead of zeroing them.
1993b35e7eeSXin LI 
2003b35e7eeSXin LI         NOTE: The sign-extension of inst2_imm12 adds a tiny bit
2013b35e7eeSXin LI         of extra complexity to AUIPC math in general but it's not
2023b35e7eeSXin LI         the reason for this problem. The sign-extension only changes
2033b35e7eeSXin LI         the relative position of the pc-relative 4096-byte window.
2043b35e7eeSXin LI 
2053b35e7eeSXin LI     (2) Matching AUIPC instruction alone requires only seven bits.
2063b35e7eeSXin LI         When the filter is applied to non-code data, that leads
2073b35e7eeSXin LI         to many false positives which make compression worse.
2083b35e7eeSXin LI         As long as most AUIPC+inst2 pairs appear as two consecutive
2093b35e7eeSXin LI         instructions, converting only such pairs gives better results.
2103b35e7eeSXin LI 
2113b35e7eeSXin LI     In assembly, AUIPC+inst2 tend to look like this:
2123b35e7eeSXin LI 
2133b35e7eeSXin LI         # Call:
2143b35e7eeSXin LI         auipc   ra, 0x12345
2153b35e7eeSXin LI         jalr    ra, -42(ra)
2163b35e7eeSXin LI 
2173b35e7eeSXin LI         # Tail call:
2183b35e7eeSXin LI         auipc   t1, 0x12345
2193b35e7eeSXin LI         jalr    zero, -42(t1)
2203b35e7eeSXin LI 
2213b35e7eeSXin LI         # Getting the absolute address:
2223b35e7eeSXin LI         auipc   a0, 0x12345
2233b35e7eeSXin LI         addi    a0, a0, -42
2243b35e7eeSXin LI 
2253b35e7eeSXin LI         # rd of inst2 isn't necessarily the same as rs1 even
2263b35e7eeSXin LI         # in cases where there is no reason to preserve rs1.
2273b35e7eeSXin LI         auipc   a0, 0x12345
2283b35e7eeSXin LI         addi    a1, a0, -42
2293b35e7eeSXin LI 
2303b35e7eeSXin LI     As of 2024, 16-bit instructions from the C extension don't
2313b35e7eeSXin LI     appear as inst2. The RISC-V psABI doesn't list AUIPC+C.* as
2323b35e7eeSXin LI     a linker relaxation type explicitly but it's not disallowed
2333b35e7eeSXin LI     either. Usefulness is limited as most of the time the lowest
2343b35e7eeSXin LI     12 bits won't fit in a C instruction. This filter doesn't
2353b35e7eeSXin LI     support AUIPC+C.* combinations because this makes the filter
2363b35e7eeSXin LI     simpler, there are no test files, and it hopefully will never
2373b35e7eeSXin LI     be needed anyway.
2383b35e7eeSXin LI 
2393b35e7eeSXin LI     (Compare AUIPC to ARM64 where ADRP does set the lowest 12 bits
2403b35e7eeSXin LI     to zero. The paired instruction has the lowest 12 bits of the
2413b35e7eeSXin LI     absolute address as is in a zero-extended immediate. Thus the
2423b35e7eeSXin LI     ARM64 filter doesn't need to care about the instructions that
2433b35e7eeSXin LI     are paired with ADRP. An off-by-4096 issue can still occur if
2443b35e7eeSXin LI     the code section isn't aligned with the filter's start offset.
2453b35e7eeSXin LI     It's not a problem with standalone ELF files but Windows PE
2463b35e7eeSXin LI     files need start_offset=3072 for best results. Also, a .tar
2473b35e7eeSXin LI     stores files with 512-byte alignment so most of the time it
2483b35e7eeSXin LI     won't be the best for ARM64.)
2493b35e7eeSXin LI 
2503b35e7eeSXin LI AUIPC with rd == x0
2513b35e7eeSXin LI -------------------
2523b35e7eeSXin LI 
2533b35e7eeSXin LI     AUIPC instructions with rd=x0 are reserved for HINTs in the base
2543b35e7eeSXin LI     instruction set. Such AUIPC instructions are never filtered.
2553b35e7eeSXin LI 
2563b35e7eeSXin LI     As of January 2024, it seems likely that AUIPC with rd=x0 will
2573b35e7eeSXin LI     be used for landing pads (pseudoinstruction LPAD). LPAD is used
2583b35e7eeSXin LI     to mark valid targets for indirect jumps (for JALR), for example,
2593b35e7eeSXin LI     beginnings of functions. The 20-bit immediate in LPAD instruction
2603b35e7eeSXin LI     is a label, not a pc-relative address. Thus it would be
2613b35e7eeSXin LI     counterproductive to convert AUIPC instructions with rd=x0.
2623b35e7eeSXin LI 
2633b35e7eeSXin LI     Often the next instruction after LPAD won't have rs1=x0 and thus
2643b35e7eeSXin LI     the filtering would be skipped for that reason alone. However,
2653b35e7eeSXin LI     it's not good to rely on this. For example, consider a function
2663b35e7eeSXin LI     that begins like this:
2673b35e7eeSXin LI 
2683b35e7eeSXin LI         int foo(int i)
2693b35e7eeSXin LI         {
2703b35e7eeSXin LI             if (i <= 234) {
2713b35e7eeSXin LI                 ...
2723b35e7eeSXin LI             }
2733b35e7eeSXin LI 
2743b35e7eeSXin LI     A compiler may generate something like this:
2753b35e7eeSXin LI 
2763b35e7eeSXin LI         lpad    0x54321
2773b35e7eeSXin LI         li      a5, 234
2783b35e7eeSXin LI         bgt     a0, a5, .L2
2793b35e7eeSXin LI 
2803b35e7eeSXin LI     Converting the pseudoinstructions to raw instructions:
2813b35e7eeSXin LI 
2823b35e7eeSXin LI         auipc   x0, 0x54321
2833b35e7eeSXin LI         addi    x15, x0, 234
2843b35e7eeSXin LI         blt     x15, x10, .L2
2853b35e7eeSXin LI 
2863b35e7eeSXin LI     In this case the filter would undesirably convert the AUIPC+ADDI
2873b35e7eeSXin LI     pair if the filter didn't explicitly skip AUIPC instructions
2883b35e7eeSXin LI     that have rd=x0.
2893b35e7eeSXin LI 
2903b35e7eeSXin LI */
2913b35e7eeSXin LI 
2923b35e7eeSXin LI 
2933b35e7eeSXin LI #include "simple_private.h"
2943b35e7eeSXin LI 
2953b35e7eeSXin LI 
// NOT_AUIPC_PAIR(auipc, inst2) evaluates to non-zero if the two 32-bit
// instruction words do NOT form an AUIPC+inst2 pair and to zero if they do.
// The caller must already know that the lowest seven bits of auipc hold
// the AUIPC opcode; that isn't checked here.
//
// This checks two conditions at once:
//    - AUIPC rd == inst2 rs1.
//    - inst2 opcode has the lowest two bits set (that is, inst2 is not
//      a 16-bit compressed instruction).
//
// The 8-bit left shift moves the rd of AUIPC [11:7] to the position of the
// rs1 of inst2 [19:15]. After XORing the two values, any non-zero bit in
// [19:15] indicates that the registers are not equal and thus the
// instructions are not an AUIPC pair.
//
// Subtracting 3 from inst2 makes the lowest two opcode bits zero only when
// both of them were set; with any other value in those two bits at least
// one of them is non-zero after the subtraction. When both bits were set,
// the subtraction doesn't borrow and thus the rs1 bits of inst2 are left
// intact. The lowest two bits of (auipc << 8) are always zero so the XOR
// doesn't disturb the opcode test.
//
// The mask 0xF8003 selects the bits [19:15] and [1:0]. If any of those
// register or opcode bits is set, it's not an AUIPC pair.
//
// Alternative expression: (((((auipc) << 8) ^ (inst2)) & 0xF8003) != 3)
#define NOT_AUIPC_PAIR(auipc, inst2) \
	((((auipc) << 8) ^ ((inst2) - 3)) & 0xF8003)
3103b35e7eeSXin LI 
// NOT_SPECIAL_AUIPC(auipc, inst2_rs1) evaluates to true (1) if auipc is NOT
// in the special format that the encoder produces for a converted
// AUIPC+inst2 pair, and to false (0) if it is. auipc is the 32-bit
// instruction word whose lowest seven bits are already known to be the
// AUIPC opcode. inst2_rs1 is the 5-bit register number that would become
// the rs1 of the reconstructed inst2.
//
// This macro checks multiple conditions:
//   (1) AUIPC rd [11:7] == x2 (special rd value).
//   (2) AUIPC bits 12 and 13 set (the lowest two opcode bits of packed inst2).
//   (3) inst2_rs1 doesn't equal x0 or x2 because the opposite
//       conversion is only done when
//       auipc_rd != x0 &&
//       auipc_rd != x2 &&
//       auipc_rd == inst2_rs1.
//
// The left-hand side takes care of (1) and (2). The subtracted constant is
// 0x3117 = 0x3000 + 0x100 + 0x17:
//   (a) The lowest 7 bits are already known to be AUIPC so subtracting 0x17
//       makes those bits zeros.
//   (b) If AUIPC rd equals x2, subtracting 0x100 makes bits [11:7] zeros.
//       If rd doesn't equal x2, then there will be at least one non-zero bit
//       and the next step (c) is irrelevant.
//   (c) If the lowest two opcode bits of the packed inst2 are set in [13:12],
//       then subtracting 0x3000 will make those bits zeros. Otherwise there
//       will be at least one non-zero bit.
//
// The shift by 18 keeps only the bits [13:0] of the difference (moved to
// [31:18]) and removes the higher bits from the final '>=' comparison. It
// also ensures that any non-zero result (at least 1 << 18) will be larger
// than any possible result from the right-hand side of the comparison (at
// most 0x1D). The cast ensures that the left-hand side didn't get promoted
// to a larger type than uint32_t.
//
// On the right-hand side, inst2_rs1 & 0x1D will be non-zero as long as
// inst2_rs1 is not x0 or x2.
//
// The final '>=' comparison will make the expression true if:
//   - The subtraction caused any bits to be set (special AUIPC rd value not
//     used or inst2 opcode bits not set). (non-zero >= non-zero or 0)
//   - The subtraction did not cause any bits to be set but inst2_rs1 was
//     x0 or x2. (0 >= 0)
#define NOT_SPECIAL_AUIPC(auipc, inst2_rs1) \
	((uint32_t)(((auipc) - 0x3117) << 18) >= ((inst2_rs1) & 0x1D))
3453b35e7eeSXin LI 
3463b35e7eeSXin LI 
3473b35e7eeSXin LI // The encode and decode functions are split for this filter because of the
3483b35e7eeSXin LI // AUIPC+inst2 filtering. This filter design allows a decoder-only
3493b35e7eeSXin LI // implementation to be smaller than alternative designs.
3503b35e7eeSXin LI 
3513b35e7eeSXin LI #ifdef HAVE_ENCODER_RISCV
3523b35e7eeSXin LI static size_t
riscv_encode(void * simple lzma_attribute ((__unused__)),uint32_t now_pos,bool is_encoder lzma_attribute ((__unused__)),uint8_t * buffer,size_t size)3533b35e7eeSXin LI riscv_encode(void *simple lzma_attribute((__unused__)),
3543b35e7eeSXin LI 		uint32_t now_pos,
3553b35e7eeSXin LI 		bool is_encoder lzma_attribute((__unused__)),
3563b35e7eeSXin LI 		uint8_t *buffer, size_t size)
3573b35e7eeSXin LI {
3583b35e7eeSXin LI 	// Avoid using i + 8 <= size in the loop condition.
3593b35e7eeSXin LI 	//
3603b35e7eeSXin LI 	// NOTE: If there is a JAL in the last six bytes of the stream, it
3613b35e7eeSXin LI 	// won't be converted. This is intentional to keep the code simpler.
3623b35e7eeSXin LI 	if (size < 8)
3633b35e7eeSXin LI 		return 0;
3643b35e7eeSXin LI 
3653b35e7eeSXin LI 	size -= 8;
3663b35e7eeSXin LI 
3673b35e7eeSXin LI 	size_t i;
3683b35e7eeSXin LI 
3693b35e7eeSXin LI 	// The loop is advanced by 2 bytes every iteration since the
3703b35e7eeSXin LI 	// instruction stream may include 16-bit instructions (C extension).
3713b35e7eeSXin LI 	for (i = 0; i <= size; i += 2) {
3723b35e7eeSXin LI 		uint32_t inst = buffer[i];
3733b35e7eeSXin LI 
3743b35e7eeSXin LI 		if (inst == 0xEF) {
3753b35e7eeSXin LI 			// JAL
3763b35e7eeSXin LI 			const uint32_t b1 = buffer[i + 1];
3773b35e7eeSXin LI 
3783b35e7eeSXin LI 			// Only filter rd=x1(ra) and rd=x5(t0).
3793b35e7eeSXin LI 			if ((b1 & 0x0D) != 0)
3803b35e7eeSXin LI 				continue;
3813b35e7eeSXin LI 
3823b35e7eeSXin LI 			// The 20-bit immediate is in four pieces.
3833b35e7eeSXin LI 			// The encoder stores it in big endian form
3843b35e7eeSXin LI 			// since it improves compression slightly.
3853b35e7eeSXin LI 			const uint32_t b2 = buffer[i + 2];
3863b35e7eeSXin LI 			const uint32_t b3 = buffer[i + 3];
3873b35e7eeSXin LI 			const uint32_t pc = now_pos + (uint32_t)i;
3883b35e7eeSXin LI 
3893b35e7eeSXin LI // The following chart shows the highest three bytes of JAL, focusing on
3903b35e7eeSXin LI // the 20-bit immediate field [31:12]. The first row of numbers is the
3913b35e7eeSXin LI // bit position in a 32-bit little endian instruction. The second row of
3923b35e7eeSXin LI // numbers shows the order of the immediate field in a J-type instruction.
3933b35e7eeSXin LI // The last row is the bit number in each byte.
3943b35e7eeSXin LI //
3953b35e7eeSXin LI // To determine the amount to shift each bit, subtract the value in
3963b35e7eeSXin LI // the last row from the value in the second last row. If the number
3973b35e7eeSXin LI // is positive, shift left. If negative, shift right.
3983b35e7eeSXin LI //
3993b35e7eeSXin LI // For example, at the rightmost side of the chart, the bit 4 in b1 is
4003b35e7eeSXin LI // the bit 12 of the address. Thus that bit needs to be shifted left
4013b35e7eeSXin LI // by 12 - 4 = 8 bits to put it in the right place in the addr variable.
4023b35e7eeSXin LI //
4033b35e7eeSXin LI // NOTE: The immediate of a J-type instruction holds bits [20:1] of
4043b35e7eeSXin LI // the address. The bit [0] is always 0 and not part of the immediate.
4053b35e7eeSXin LI //
4063b35e7eeSXin LI // |          b3             |          b2             |          b1         |
4073b35e7eeSXin LI // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
4083b35e7eeSXin LI // | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |
4093b35e7eeSXin LI // |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |
4103b35e7eeSXin LI 
4113b35e7eeSXin LI 			uint32_t addr = ((b1 & 0xF0) << 8)
4123b35e7eeSXin LI 					| ((b2 & 0x0F) << 16)
4133b35e7eeSXin LI 					| ((b2 & 0x10) << 7)
4143b35e7eeSXin LI 					| ((b2 & 0xE0) >> 4)
4153b35e7eeSXin LI 					| ((b3 & 0x7F) << 4)
4163b35e7eeSXin LI 					| ((b3 & 0x80) << 13);
4173b35e7eeSXin LI 
4183b35e7eeSXin LI 			addr += pc;
4193b35e7eeSXin LI 
4203b35e7eeSXin LI 			buffer[i + 1] = (uint8_t)((b1 & 0x0F)
4213b35e7eeSXin LI 					| ((addr >> 13) & 0xF0));
4223b35e7eeSXin LI 
4233b35e7eeSXin LI 			buffer[i + 2] = (uint8_t)(addr >> 9);
4243b35e7eeSXin LI 			buffer[i + 3] = (uint8_t)(addr >> 1);
4253b35e7eeSXin LI 
4263b35e7eeSXin LI 			// The "-2" is included because the for-loop will
4273b35e7eeSXin LI 			// always increment by 2. In this case, we want to
4283b35e7eeSXin LI 			// skip an extra 2 bytes since we used 4 bytes
4293b35e7eeSXin LI 			// of input.
4303b35e7eeSXin LI 			i += 4 - 2;
4313b35e7eeSXin LI 
4323b35e7eeSXin LI 		} else if ((inst & 0x7F) == 0x17) {
4333b35e7eeSXin LI 			// AUIPC
4343b35e7eeSXin LI 			inst |= (uint32_t)buffer[i + 1] << 8;
4353b35e7eeSXin LI 			inst |= (uint32_t)buffer[i + 2] << 16;
4363b35e7eeSXin LI 			inst |= (uint32_t)buffer[i + 3] << 24;
4373b35e7eeSXin LI 
4383b35e7eeSXin LI 			// Branch based on AUIPC's rd. The bitmask test does
4393b35e7eeSXin LI 			// the same thing as this:
4403b35e7eeSXin LI 			//
4413b35e7eeSXin LI 			//     const uint32_t auipc_rd = (inst >> 7) & 0x1F;
4423b35e7eeSXin LI 			//     if (auipc_rd != 0 && auipc_rd != 2) {
4433b35e7eeSXin LI  			if (inst & 0xE80) {
4443b35e7eeSXin LI 				// AUIPC's rd doesn't equal x0 or x2.
4453b35e7eeSXin LI 
4463b35e7eeSXin LI 				// Check if AUIPC+inst2 are a pair.
4473b35e7eeSXin LI 				uint32_t inst2 = read32le(buffer + i + 4);
4483b35e7eeSXin LI 
4493b35e7eeSXin LI 				if (NOT_AUIPC_PAIR(inst, inst2)) {
4503b35e7eeSXin LI 					// The NOT_AUIPC_PAIR macro allows
4513b35e7eeSXin LI 					// a false AUIPC+AUIPC pair if the
4523b35e7eeSXin LI 					// bits [19:15] (where rs1 would be)
4533b35e7eeSXin LI 					// in the second AUIPC match the rd
4543b35e7eeSXin LI 					// of the first AUIPC.
4553b35e7eeSXin LI 					//
4563b35e7eeSXin LI 					// We must skip enough forward so
4573b35e7eeSXin LI 					// that the first two bytes of the
4583b35e7eeSXin LI 					// second AUIPC cannot get converted.
4593b35e7eeSXin LI 					// Such a conversion could make the
4603b35e7eeSXin LI 					// current pair become a valid pair
4613b35e7eeSXin LI 					// which would desync the decoder.
4623b35e7eeSXin LI 					//
4633b35e7eeSXin LI 					// Skipping six bytes is enough even
4643b35e7eeSXin LI 					// though the above condition looks
4653b35e7eeSXin LI 					// at the lowest four bits of the
4663b35e7eeSXin LI 					// buffer[i + 6] too. This is safe
4673b35e7eeSXin LI 					// because this filter never changes
4683b35e7eeSXin LI 					// those bits if a conversion at
4693b35e7eeSXin LI 					// that position is done.
4703b35e7eeSXin LI 					i += 6 - 2;
4713b35e7eeSXin LI 					continue;
4723b35e7eeSXin LI 				}
4733b35e7eeSXin LI 
4743b35e7eeSXin LI 				// Convert AUIPC+inst2 to a special format:
4753b35e7eeSXin LI 				//
4763b35e7eeSXin LI 				//   - The lowest 7 bits [6:0] retain the
4773b35e7eeSXin LI 				//     AUIPC opcode.
4783b35e7eeSXin LI 				//
4793b35e7eeSXin LI 				//   - The rd [11:7] is set to x2(sp). x2 is
4803b35e7eeSXin LI 				//     used as the stack pointer so AUIPC with
4813b35e7eeSXin LI 				//     rd=x2 should be very rare in real-world
4823b35e7eeSXin LI 				//     executables.
4833b35e7eeSXin LI 				//
4843b35e7eeSXin LI 				//   - The remaining 20 bits [31:12] (that
4853b35e7eeSXin LI 				//     normally hold the pc-relative immediate)
4863b35e7eeSXin LI 				//     are used to store the lowest 20 bits of
4873b35e7eeSXin LI 				//     inst2. That is, the 12-bit immediate of
4883b35e7eeSXin LI 				//     inst2 is not included.
4893b35e7eeSXin LI 				//
4903b35e7eeSXin LI 				//   - The location of the original inst2 is
4913b35e7eeSXin LI 				//     used to store the 32-bit absolute
4923b35e7eeSXin LI 				//     address in big endian format. Compared
4933b35e7eeSXin LI 				//     to the 20+12-bit split encoding, this
4943b35e7eeSXin LI 				//     results in a longer uninterrupted
4953b35e7eeSXin LI 				//     sequence of identical common bytes
4963b35e7eeSXin LI 				//     when the same address is referred
4973b35e7eeSXin LI 				//     with different instruction pairs
4983b35e7eeSXin LI 				//     (like AUIPC+LD vs. AUIPC+ADDI) or
4993b35e7eeSXin LI 				//     when the occurrences of the same
5003b35e7eeSXin LI 				//     pair use different registers. When
5013b35e7eeSXin LI 				//     referring to adjacent memory locations
5023b35e7eeSXin LI 				//     (like function calls that go via the
5033b35e7eeSXin LI 				//     ELF PLT), in big endian order only the
5043b35e7eeSXin LI 				//     last 1-2 bytes differ; in little endian
5053b35e7eeSXin LI 				//     the differing 1-2 bytes would be in the
5063b35e7eeSXin LI 				//     middle of the 8-byte sequence.
5073b35e7eeSXin LI 				//
5083b35e7eeSXin LI 				// When reversing the transformation, the
5093b35e7eeSXin LI 				// original rd of AUIPC can be restored
5103b35e7eeSXin LI 				// from inst2's rs1 as they are required to
5113b35e7eeSXin LI 				// be the same.
5123b35e7eeSXin LI 
5133b35e7eeSXin LI 				// Arithmetic right shift makes sign extension
5143b35e7eeSXin LI 				// trivial but (1) it's implementation-defined
5153b35e7eeSXin LI 				// behavior (C99/C11/C23 6.5.7-p5) and so is
5163b35e7eeSXin LI 				// (2) casting unsigned to signed (6.3.1.3-p3).
5173b35e7eeSXin LI 				//
5183b35e7eeSXin LI 				// One can check for (1) with
5193b35e7eeSXin LI 				//
5203b35e7eeSXin LI 				//     if ((-1 >> 1) == -1) ...
5213b35e7eeSXin LI 				//
5223b35e7eeSXin LI 				// but (2) has to be checked from the
5233b35e7eeSXin LI 				// compiler docs. GCC promises that (1)
5243b35e7eeSXin LI 				// and (2) behave in the common expected
5253b35e7eeSXin LI 				// way and thus
5263b35e7eeSXin LI 				//
5273b35e7eeSXin LI 				//     addr += (uint32_t)(
5283b35e7eeSXin LI 				//             (int32_t)inst2 >> 20);
5293b35e7eeSXin LI 				//
5303b35e7eeSXin LI 				// does the same as the code below. But since
5313b35e7eeSXin LI 				// the 100 % portable way is only a few bytes
5323b35e7eeSXin LI 				// bigger code and there is no real speed
5333b35e7eeSXin LI 				// difference, let's just use that, especially
5343b35e7eeSXin LI 				// since the decoder doesn't need this at all.
5353b35e7eeSXin LI 				uint32_t addr = inst & 0xFFFFF000;
5363b35e7eeSXin LI 				addr += (inst2 >> 20)
5373b35e7eeSXin LI 						- ((inst2 >> 19) & 0x1000);
5383b35e7eeSXin LI 
5393b35e7eeSXin LI 				addr += now_pos + (uint32_t)i;
5403b35e7eeSXin LI 
5413b35e7eeSXin LI 				// Construct the first 32 bits:
5423b35e7eeSXin LI 				//   [6:0]    AUIPC opcode
5433b35e7eeSXin LI 				//   [11:7]   Special AUIPC rd = x2
5443b35e7eeSXin LI 				//   [31:12]  The lowest 20 bits of inst2
5453b35e7eeSXin LI 				inst = 0x17 | (2 << 7) | (inst2 << 12);
5463b35e7eeSXin LI 
5473b35e7eeSXin LI 				write32le(buffer + i, inst);
5483b35e7eeSXin LI 
5493b35e7eeSXin LI 				// The second 32 bits store the absolute
5503b35e7eeSXin LI 				// address in big endian order.
5513b35e7eeSXin LI 				write32be(buffer + i + 4, addr);
5523b35e7eeSXin LI 			} else {
5533b35e7eeSXin LI 				// AUIPC's rd equals x0 or x2.
5543b35e7eeSXin LI 				//
5553b35e7eeSXin LI 				// x0 indicates a landing pad (LPAD).
5563b35e7eeSXin LI 				// It's always skipped.
5573b35e7eeSXin LI 				//
5583b35e7eeSXin LI 				// AUIPC with rd == x2 is used for the special
5593b35e7eeSXin LI 				// format as explained above. When the input
5603b35e7eeSXin LI 				// contains a byte sequence that matches the
5613b35e7eeSXin LI 				// special format, "fake" decoding must be
5623b35e7eeSXin LI 				// done to keep the filter bijective (that
5633b35e7eeSXin LI 				// is, safe to apply on arbitrary data).
5643b35e7eeSXin LI 				//
5653b35e7eeSXin LI 				// See the "x0 or x2" section in riscv_decode()
5663b35e7eeSXin LI 				// for how the "real" decoding is done. The
5673b35e7eeSXin LI 				// "fake" decoding is a simplified version
5683b35e7eeSXin LI 				// of "real" decoding with the following
5693b35e7eeSXin LI 				// differences (these reduce code size of
5703b35e7eeSXin LI 				// the decoder):
5713b35e7eeSXin LI 				// (1) The lowest 12 bits aren't sign-extended.
5723b35e7eeSXin LI 				// (2) No address conversion is done.
5733b35e7eeSXin LI 				// (3) Big endian format isn't used (the fake
5743b35e7eeSXin LI 				//     address is in little endian order).
5753b35e7eeSXin LI 
5763b35e7eeSXin LI 				// Check if inst matches the special format.
5773b35e7eeSXin LI 				const uint32_t fake_rs1 = inst >> 27;
5783b35e7eeSXin LI 
5793b35e7eeSXin LI 				if (NOT_SPECIAL_AUIPC(inst, fake_rs1)) {
5803b35e7eeSXin LI 					i += 4 - 2;
5813b35e7eeSXin LI 					continue;
5823b35e7eeSXin LI 				}
5833b35e7eeSXin LI 
5843b35e7eeSXin LI 				const uint32_t fake_addr =
5853b35e7eeSXin LI 						read32le(buffer + i + 4);
5863b35e7eeSXin LI 
5873b35e7eeSXin LI 				// Construct the second 32 bits:
5883b35e7eeSXin LI 				//   [19:0]   Upper 20 bits from AUIPC
5893b35e7eeSXin LI 				//   [31:20]  The lowest 12 bits of fake_addr
5903b35e7eeSXin LI 				const uint32_t fake_inst2 = (inst >> 12)
5913b35e7eeSXin LI 						| (fake_addr << 20);
5923b35e7eeSXin LI 
5933b35e7eeSXin LI 				// Construct new first 32 bits from:
5943b35e7eeSXin LI 				//   [6:0]   AUIPC opcode
5953b35e7eeSXin LI 				//   [11:7]  Fake AUIPC rd = fake_rs1
5963b35e7eeSXin LI 				//   [31:12] The highest 20 bits of fake_addr
5973b35e7eeSXin LI 				inst = 0x17 | (fake_rs1 << 7)
5983b35e7eeSXin LI 					| (fake_addr & 0xFFFFF000);
5993b35e7eeSXin LI 
6003b35e7eeSXin LI 				write32le(buffer + i, inst);
6013b35e7eeSXin LI 				write32le(buffer + i + 4, fake_inst2);
6023b35e7eeSXin LI 			}
6033b35e7eeSXin LI 
6043b35e7eeSXin LI 			i += 8 - 2;
6053b35e7eeSXin LI 		}
6063b35e7eeSXin LI 	}
6073b35e7eeSXin LI 
6083b35e7eeSXin LI 	return i;
6093b35e7eeSXin LI }
6103b35e7eeSXin LI 
6113b35e7eeSXin LI 
6123b35e7eeSXin LI extern lzma_ret
lzma_simple_riscv_encoder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters)6133b35e7eeSXin LI lzma_simple_riscv_encoder_init(lzma_next_coder *next,
6143b35e7eeSXin LI 		const lzma_allocator *allocator,
6153b35e7eeSXin LI 		const lzma_filter_info *filters)
6163b35e7eeSXin LI {
6173b35e7eeSXin LI 	return lzma_simple_coder_init(next, allocator, filters,
6183b35e7eeSXin LI 			&riscv_encode, 0, 8, 2, true);
6193b35e7eeSXin LI }
620*128836d3SXin LI 
621*128836d3SXin LI 
622*128836d3SXin LI extern LZMA_API(size_t)
lzma_bcj_riscv_encode(uint32_t start_offset,uint8_t * buf,size_t size)623*128836d3SXin LI lzma_bcj_riscv_encode(uint32_t start_offset, uint8_t *buf, size_t size)
624*128836d3SXin LI {
625*128836d3SXin LI 	// start_offset must be a multiple of two.
626*128836d3SXin LI 	start_offset &= ~UINT32_C(1);
627*128836d3SXin LI 	return riscv_encode(NULL, start_offset, true, buf, size);
628*128836d3SXin LI }
6293b35e7eeSXin LI #endif
6303b35e7eeSXin LI 
6313b35e7eeSXin LI 
6323b35e7eeSXin LI #ifdef HAVE_DECODER_RISCV
6333b35e7eeSXin LI static size_t
riscv_decode(void * simple lzma_attribute ((__unused__)),uint32_t now_pos,bool is_encoder lzma_attribute ((__unused__)),uint8_t * buffer,size_t size)6343b35e7eeSXin LI riscv_decode(void *simple lzma_attribute((__unused__)),
6353b35e7eeSXin LI 		uint32_t now_pos,
6363b35e7eeSXin LI 		bool is_encoder lzma_attribute((__unused__)),
6373b35e7eeSXin LI 		uint8_t *buffer, size_t size)
6383b35e7eeSXin LI {
6393b35e7eeSXin LI 	if (size < 8)
6403b35e7eeSXin LI 		return 0;
6413b35e7eeSXin LI 
6423b35e7eeSXin LI 	size -= 8;
6433b35e7eeSXin LI 
6443b35e7eeSXin LI 	size_t i;
6453b35e7eeSXin LI 	for (i = 0; i <= size; i += 2) {
6463b35e7eeSXin LI 		uint32_t inst = buffer[i];
6473b35e7eeSXin LI 
6483b35e7eeSXin LI 		if (inst == 0xEF) {
6493b35e7eeSXin LI 			// JAL
6503b35e7eeSXin LI 			const uint32_t b1 = buffer[i + 1];
6513b35e7eeSXin LI 
6523b35e7eeSXin LI 			// Only filter rd=x1(ra) and rd=x5(t0).
6533b35e7eeSXin LI 			if ((b1 & 0x0D) != 0)
6543b35e7eeSXin LI 				continue;
6553b35e7eeSXin LI 
6563b35e7eeSXin LI 			const uint32_t b2 = buffer[i + 2];
6573b35e7eeSXin LI 			const uint32_t b3 = buffer[i + 3];
6583b35e7eeSXin LI 			const uint32_t pc = now_pos + (uint32_t)i;
6593b35e7eeSXin LI 
6603b35e7eeSXin LI // |          b3             |          b2             |          b1         |
6613b35e7eeSXin LI // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
6623b35e7eeSXin LI // | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |
6633b35e7eeSXin LI // |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |
6643b35e7eeSXin LI 
6653b35e7eeSXin LI 			uint32_t addr = ((b1 & 0xF0) << 13)
6663b35e7eeSXin LI 					| (b2 << 9) | (b3 << 1);
6673b35e7eeSXin LI 
6683b35e7eeSXin LI 			addr -= pc;
6693b35e7eeSXin LI 
6703b35e7eeSXin LI 			buffer[i + 1] = (uint8_t)((b1 & 0x0F)
6713b35e7eeSXin LI 					| ((addr >> 8) & 0xF0));
6723b35e7eeSXin LI 
6733b35e7eeSXin LI 			buffer[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
6743b35e7eeSXin LI 					| ((addr >> 7) & 0x10)
6753b35e7eeSXin LI 					| ((addr << 4) & 0xE0));
6763b35e7eeSXin LI 
6773b35e7eeSXin LI 			buffer[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
6783b35e7eeSXin LI 					| ((addr >> 13) & 0x80));
6793b35e7eeSXin LI 
6803b35e7eeSXin LI 			i += 4 - 2;
6813b35e7eeSXin LI 
6823b35e7eeSXin LI 		} else if ((inst & 0x7F) == 0x17) {
6833b35e7eeSXin LI 			// AUIPC
6843b35e7eeSXin LI 			uint32_t inst2;
6853b35e7eeSXin LI 
6863b35e7eeSXin LI 			inst |= (uint32_t)buffer[i + 1] << 8;
6873b35e7eeSXin LI 			inst |= (uint32_t)buffer[i + 2] << 16;
6883b35e7eeSXin LI 			inst |= (uint32_t)buffer[i + 3] << 24;
6893b35e7eeSXin LI 
6903b35e7eeSXin LI 			if (inst & 0xE80) {
6913b35e7eeSXin LI 				// AUIPC's rd doesn't equal x0 or x2.
6923b35e7eeSXin LI 
6933b35e7eeSXin LI 				// Check if it is a "fake" AUIPC+inst2 pair.
6943b35e7eeSXin LI 				inst2 = read32le(buffer + i + 4);
6953b35e7eeSXin LI 
6963b35e7eeSXin LI 				if (NOT_AUIPC_PAIR(inst, inst2)) {
6973b35e7eeSXin LI 					i += 6 - 2;
6983b35e7eeSXin LI 					continue;
6993b35e7eeSXin LI 				}
7003b35e7eeSXin LI 
7013b35e7eeSXin LI 				// Decode (or more like re-encode) the "fake"
7023b35e7eeSXin LI 				// pair. The "fake" format doesn't do
7033b35e7eeSXin LI 				// sign-extension, address conversion, or
7043b35e7eeSXin LI 				// use big endian. (The use of little endian
7053b35e7eeSXin LI 				// allows sharing the write32le() calls in
7063b35e7eeSXin LI 				// the decoder to reduce code size when
7073b35e7eeSXin LI 				// unaligned access isn't supported.)
7083b35e7eeSXin LI 				uint32_t addr = inst & 0xFFFFF000;
7093b35e7eeSXin LI 				addr += inst2 >> 20;
7103b35e7eeSXin LI 
7113b35e7eeSXin LI 				inst = 0x17 | (2 << 7) | (inst2 << 12);
7123b35e7eeSXin LI 				inst2 = addr;
7133b35e7eeSXin LI 			} else {
7143b35e7eeSXin LI 				// AUIPC's rd equals x0 or x2.
7153b35e7eeSXin LI 
7163b35e7eeSXin LI 				// Check if inst matches the special format
7173b35e7eeSXin LI 				// used by the encoder.
7183b35e7eeSXin LI 				const uint32_t inst2_rs1 = inst >> 27;
7193b35e7eeSXin LI 
7203b35e7eeSXin LI 				if (NOT_SPECIAL_AUIPC(inst, inst2_rs1)) {
7213b35e7eeSXin LI 					i += 4 - 2;
7223b35e7eeSXin LI 					continue;
7233b35e7eeSXin LI 				}
7243b35e7eeSXin LI 
7253b35e7eeSXin LI 				// Decode the "real" pair.
7263b35e7eeSXin LI 				uint32_t addr = read32be(buffer + i + 4);
7273b35e7eeSXin LI 
7283b35e7eeSXin LI 				addr -= now_pos + (uint32_t)i;
7293b35e7eeSXin LI 
7303b35e7eeSXin LI 				// The second instruction:
7313b35e7eeSXin LI 				//   - Get the lowest 20 bits from inst.
7323b35e7eeSXin LI 				//   - Add the lowest 12 bits of the address
7333b35e7eeSXin LI 				//     as the immediate field.
7343b35e7eeSXin LI 				inst2 = (inst >> 12) | (addr << 20);
7353b35e7eeSXin LI 
7363b35e7eeSXin LI 				// AUIPC:
7373b35e7eeSXin LI 				//   - rd is the same as inst2_rs1.
7383b35e7eeSXin LI 				//   - The sign extension of the lowest 12 bits
7393b35e7eeSXin LI 				//     must be taken into account.
7403b35e7eeSXin LI 				inst = 0x17 | (inst2_rs1 << 7)
7413b35e7eeSXin LI 					| ((addr + 0x800) & 0xFFFFF000);
7423b35e7eeSXin LI 			}
7433b35e7eeSXin LI 
7443b35e7eeSXin LI 			// Both decoder branches write in little endian order.
7453b35e7eeSXin LI 			write32le(buffer + i, inst);
7463b35e7eeSXin LI 			write32le(buffer + i + 4, inst2);
7473b35e7eeSXin LI 
7483b35e7eeSXin LI 			i += 8 - 2;
7493b35e7eeSXin LI 		}
7503b35e7eeSXin LI 	}
7513b35e7eeSXin LI 
7523b35e7eeSXin LI 	return i;
7533b35e7eeSXin LI }
7543b35e7eeSXin LI 
7553b35e7eeSXin LI 
7563b35e7eeSXin LI extern lzma_ret
lzma_simple_riscv_decoder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters)7573b35e7eeSXin LI lzma_simple_riscv_decoder_init(lzma_next_coder *next,
7583b35e7eeSXin LI 		const lzma_allocator *allocator,
7593b35e7eeSXin LI 		const lzma_filter_info *filters)
7603b35e7eeSXin LI {
7613b35e7eeSXin LI 	return lzma_simple_coder_init(next, allocator, filters,
7623b35e7eeSXin LI 			&riscv_decode, 0, 8, 2, false);
7633b35e7eeSXin LI }
764*128836d3SXin LI 
765*128836d3SXin LI 
766*128836d3SXin LI extern LZMA_API(size_t)
lzma_bcj_riscv_decode(uint32_t start_offset,uint8_t * buf,size_t size)767*128836d3SXin LI lzma_bcj_riscv_decode(uint32_t start_offset, uint8_t *buf, size_t size)
768*128836d3SXin LI {
769*128836d3SXin LI 	// start_offset must be a multiple of two.
770*128836d3SXin LI 	start_offset &= ~UINT32_C(1);
771*128836d3SXin LI 	return riscv_decode(NULL, start_offset, false, buf, size);
772*128836d3SXin LI }
7733b35e7eeSXin LI #endif
774