liblzma/simple/riscv.c

3b35e7eeSXin LI// SPDX-License-Identifier: 0BSD
3b35e7eeSXin LI
3b35e7eeSXin LI///////////////////////////////////////////////////////////////////////////////
3b35e7eeSXin LI//
3b35e7eeSXin LI/// \file       riscv.c
3b35e7eeSXin LI/// \brief      Filter for 32-bit/64-bit little/big endian RISC-V binaries
3b35e7eeSXin LI///
3b35e7eeSXin LI/// This converts program counter relative addresses in function calls
3b35e7eeSXin LI/// (JAL, AUIPC+JALR), address calculation of functions and global
3b35e7eeSXin LI/// variables (AUIPC+ADDI), loads (AUIPC+load), and stores (AUIPC+store).
3b35e7eeSXin LI///
/// For AUIPC+inst2 pairs, the paired instruction checking is fairly relaxed.
/// The only requirement on the opcode of the paired instruction is that its
/// lowest two bits are set, meaning the filter will convert any paired
/// instruction that is not a 16-bit compressed instruction. This was shown
/// to be enough to keep the number of false matches low while improving
/// code size and speed.
3b35e7eeSXin LI//
3b35e7eeSXin LI//  Authors:    Lasse Collin
3b35e7eeSXin LI//              Jia Tan
3b35e7eeSXin LI//
3b35e7eeSXin LI//  Special thanks:
3b35e7eeSXin LI//
3b35e7eeSXin LI//    - Chien Wong <m@xv97.com> provided a few early versions of RISC-V
3b35e7eeSXin LI//      filter variants along with test files and benchmark results.
3b35e7eeSXin LI//
3b35e7eeSXin LI//    - Igor Pavlov helped a lot in the filter design, getting it both
3b35e7eeSXin LI//      faster and smaller. The implementation here is still independently
3b35e7eeSXin LI//      written, not based on LZMA SDK.
3b35e7eeSXin LI//
3b35e7eeSXin LI///////////////////////////////////////////////////////////////////////////////
3b35e7eeSXin LI
3b35e7eeSXin LI/*
3b35e7eeSXin LI
3b35e7eeSXin LIRISC-V filtering
3b35e7eeSXin LI================
3b35e7eeSXin LI
3b35e7eeSXin LI    RV32I and RV64I, possibly combined with extensions C, Zfh, F, D,
3b35e7eeSXin LI    and Q, are identical enough that the same filter works for both.
3b35e7eeSXin LI
3b35e7eeSXin LI    The instruction encoding is always little endian, even on systems
3b35e7eeSXin LI    with big endian data access. Thus the same filter works for both
3b35e7eeSXin LI    endiannesses.
3b35e7eeSXin LI
3b35e7eeSXin LI    The following instructions have program counter relative
3b35e7eeSXin LI    (pc-relative) behavior:
3b35e7eeSXin LI
3b35e7eeSXin LIJAL
3b35e7eeSXin LI---
3b35e7eeSXin LI
3b35e7eeSXin LI    JAL is used for function calls (including tail calls) and
3b35e7eeSXin LI    unconditional jumps within functions. Jumps within functions
3b35e7eeSXin LI    aren't useful to filter because the absolute addresses often
3b35e7eeSXin LI    appear only once or at most a few times. Tail calls and jumps
3b35e7eeSXin LI    within functions look the same to a simple filter so neither
3b35e7eeSXin LI    are filtered, that is, JAL x0 is ignored (the ABI name of the
3b35e7eeSXin LI    register x0 is "zero").
3b35e7eeSXin LI
3b35e7eeSXin LI    Almost all calls store the return address to register x1 (ra)
3b35e7eeSXin LI    or x5 (t0). To reduce false matches when the filter is applied
3b35e7eeSXin LI    to non-code data, only the JAL instructions that use x1 or x5
3b35e7eeSXin LI    are converted. JAL has pc-relative range of +/-1 MiB so longer
3b35e7eeSXin LI    calls and jumps need another method (AUIPC+JALR).
3b35e7eeSXin LI
3b35e7eeSXin LIC.J and C.JAL
3b35e7eeSXin LI-------------
3b35e7eeSXin LI
3b35e7eeSXin LI    C.J and C.JAL have pc-relative range of +/-2 KiB.
3b35e7eeSXin LI
3b35e7eeSXin LI    C.J is for tail calls and jumps within functions and isn't
3b35e7eeSXin LI    filtered for the reasons mentioned for JAL x0.
3b35e7eeSXin LI
3b35e7eeSXin LI    C.JAL is an RV32C-only instruction. Its encoding overlaps with
3b35e7eeSXin LI    RV64C-only C.ADDIW which is a common instruction. So if filtering
3b35e7eeSXin LI    C.JAL was useful (it wasn't tested) then a separate filter would
3b35e7eeSXin LI    be needed for RV32 and RV64. Also, false positives would be a
3b35e7eeSXin LI    significant problem when the filter is applied to non-code data
3b35e7eeSXin LI    because C.JAL needs only five bits to match. Thus, this filter
3b35e7eeSXin LI    doesn't modify C.JAL instructions.
3b35e7eeSXin LI
3b35e7eeSXin LIBEQ, BNE, BLT, BGE, BLTU, BGEU, C.BEQZ, and C.BNEZ
3b35e7eeSXin LI--------------------------------------------------
3b35e7eeSXin LI
3b35e7eeSXin LI    These are conditional branches with pc-relative range
3b35e7eeSXin LI    of +/-4 KiB (+/-256 B for C.*). The absolute addresses often
3b35e7eeSXin LI    appear only once and very short distances are the most common,
3b35e7eeSXin LI    so filtering these instructions would make compression worse.
3b35e7eeSXin LI
3b35e7eeSXin LIAUIPC with rd != x0
3b35e7eeSXin LI-------------------
3b35e7eeSXin LI
3b35e7eeSXin LI    AUIPC is paired with a second instruction (inst2) to do
3b35e7eeSXin LI    pc-relative jumps, calls, loads, stores, and for taking
3b35e7eeSXin LI    an address of a symbol. AUIPC has a 20-bit immediate and
3b35e7eeSXin LI    the possible inst2 choices have a 12-bit immediate.
3b35e7eeSXin LI
3b35e7eeSXin LI    AUIPC stores pc + 20-bit signed immediate to a register.
3b35e7eeSXin LI    The immediate encodes a multiple of 4 KiB so AUIPC itself
3b35e7eeSXin LI    has a pc-relative range of +/-2 GiB. AUIPC does *NOT* set
3b35e7eeSXin LI    the lowest 12 bits of the result to zero! This means that
3b35e7eeSXin LI    the 12-bit immediate in inst2 cannot just include the lowest
3b35e7eeSXin LI    12 bits of the absolute address as is; the immediate has to
3b35e7eeSXin LI    compensate for the lowest 12 bits that AUIPC copies from the
3b35e7eeSXin LI    program counter. This means that a good filter has to convert
3b35e7eeSXin LI    not only AUIPC but also the paired inst2.
3b35e7eeSXin LI
3b35e7eeSXin LI    A strict filter would focus on filtering the following
3b35e7eeSXin LI    AUIPC+inst2 pairs:
3b35e7eeSXin LI
3b35e7eeSXin LI      - AUIPC+JALR: Function calls, including tail calls.
3b35e7eeSXin LI
3b35e7eeSXin LI      - AUIPC+ADDI: Calculating the address of a function
3b35e7eeSXin LI        or a global variable.
3b35e7eeSXin LI
3b35e7eeSXin LI      - AUIPC+load/store from the base instruction sets
3b35e7eeSXin LI        (RV32I, RV64I) or from the floating point extensions
3b35e7eeSXin LI        Zfh, F, D, and Q:
3b35e7eeSXin LI          * RV32I: LB, LH, LW, LBU, LHU, SB, SH, SW
3b35e7eeSXin LI          * RV64I has also: LD, LWU, SD
3b35e7eeSXin LI          * Zfh: FLH, FSH
3b35e7eeSXin LI          * F: FLW, FSW
3b35e7eeSXin LI          * D: FLD, FSD
3b35e7eeSXin LI          * Q: FLQ, FSQ
3b35e7eeSXin LI
3b35e7eeSXin LI    NOTE: AUIPC+inst2 can only be a pair if AUIPC's rd specifies
3b35e7eeSXin LI    the same register as inst2's rs1.
3b35e7eeSXin LI
3b35e7eeSXin LI    Instead of strictly accepting only the above instructions as inst2,
3b35e7eeSXin LI    this filter uses a much simpler condition: the lowest two bits of
3b35e7eeSXin LI    inst2 must be set, that is, inst2 must not be a 16-bit compressed
3b35e7eeSXin LI    instruction. So this will accept all 32-bit and possible future
3b35e7eeSXin LI    extended instructions as a pair to AUIPC if the bits in AUIPC's
3b35e7eeSXin LI    rd [11:7] match the bits [19:15] in inst2 (the bits that I-type and
3b35e7eeSXin LI    S-type instructions use for rs1). Testing showed that this relaxed
3b35e7eeSXin LI    condition for inst2 did not consistently or significantly affect
3b35e7eeSXin LI    compression ratio but it reduced code size and improved speed.
3b35e7eeSXin LI
3b35e7eeSXin LI    Additionally, the paired instruction is always treated as an I-type
3b35e7eeSXin LI    instruction. The S-type instructions used by stores (SB, SH, SW,
3b35e7eeSXin LI    etc.) place the lowest 5 bits of the immediate in a different
3b35e7eeSXin LI    location than I-type instructions. AUIPC+store pairs are less
3b35e7eeSXin LI    common than other pairs, and testing showed that the extra
3b35e7eeSXin LI    code required to handle S-type instructions was not worth the
3b35e7eeSXin LI    compression ratio gained.
3b35e7eeSXin LI
3b35e7eeSXin LI    AUIPC+inst2 don't necessarily appear sequentially next to each
3b35e7eeSXin LI    other although very often they do. Especially AUIPC+JALR are
3b35e7eeSXin LI    sequential as that may allow instruction fusion in processors
3b35e7eeSXin LI    (and perhaps help branch prediction as a fused AUIPC+JALR is
3b35e7eeSXin LI    a direct branch while JALR alone is an indirect branch).
3b35e7eeSXin LI
3b35e7eeSXin LI    Clang 16 can generate code where AUIPC+inst2 is split:
3b35e7eeSXin LI
3b35e7eeSXin LI      - AUIPC is outside a loop and inst2 (load/store) is inside
3b35e7eeSXin LI        the loop. This way the AUIPC instruction needs to be
3b35e7eeSXin LI        executed only once.
3b35e7eeSXin LI
3b35e7eeSXin LI      - Load-modify-store may have AUIPC for the load and the same
3b35e7eeSXin LI        AUIPC-result is used for the store too. This may get combined
3b35e7eeSXin LI        with AUIPC being outside the loop.
3b35e7eeSXin LI
3b35e7eeSXin LI      - AUIPC is before a conditional branch and inst2 is hundreds
3b35e7eeSXin LI        of bytes away at the branch target.
3b35e7eeSXin LI
3b35e7eeSXin LI      - Inner and outer pair:
3b35e7eeSXin LI
3b35e7eeSXin LI            auipc   a1,0x2f
3b35e7eeSXin LI            auipc   a2,0x3d
3b35e7eeSXin LI            ld      a2,-500(a2)
3b35e7eeSXin LI            addi    a1,a1,-233
3b35e7eeSXin LI
3b35e7eeSXin LI      - Many split pairs with an untaken conditional branch between:
3b35e7eeSXin LI
3b35e7eeSXin LI            auipc   s9,0x1613   # Pair 1
3b35e7eeSXin LI            auipc   s4,0x1613   # Pair 2
3b35e7eeSXin LI            auipc   s6,0x1613   # Pair 3
3b35e7eeSXin LI            auipc   s10,0x1613  # Pair 4
3b35e7eeSXin LI            beqz    a5,a3baae
3b35e7eeSXin LI            ld      a0,0(a6)
3b35e7eeSXin LI            ld      a6,246(s9)  # Pair 1
3b35e7eeSXin LI            ld      a1,250(s4)  # Pair 2
3b35e7eeSXin LI            ld      a3,254(s6)  # Pair 3
3b35e7eeSXin LI            ld      a4,258(s10) # Pair 4
3b35e7eeSXin LI
3b35e7eeSXin LI    It's not possible to find all split pairs in a filter like this.
3b35e7eeSXin LI    At least in 2024, simple sequential pairs are 99 % of AUIPC uses
3b35e7eeSXin LI    so filtering only such pairs gives good results and makes the
3b35e7eeSXin LI    filter simpler. However, it's possible that future compilers will
3b35e7eeSXin LI    produce different code where sequential pairs aren't as common.
3b35e7eeSXin LI
3b35e7eeSXin LI    This filter doesn't convert AUIPC instructions alone because:
3b35e7eeSXin LI
3b35e7eeSXin LI    (1) The conversion would be off-by-one (or off-by-4096) half the
3b35e7eeSXin LI        time because the lowest 12 bits from inst2 (inst2_imm12)
3b35e7eeSXin LI        aren't known. We only know that the absolute address is
3b35e7eeSXin LI        pc + AUIPC_imm20 + [-2048, +2047] but there is no way to
3b35e7eeSXin LI        know the exact 4096-byte multiple (or 4096 * n + 2048):
3b35e7eeSXin LI        there are always two possibilities because AUIPC copies
3b35e7eeSXin LI        the 12 lowest bits from pc instead of zeroing them.
3b35e7eeSXin LI
3b35e7eeSXin LI        NOTE: The sign-extension of inst2_imm12 adds a tiny bit
3b35e7eeSXin LI        of extra complexity to AUIPC math in general but it's not
3b35e7eeSXin LI        the reason for this problem. The sign-extension only changes
3b35e7eeSXin LI        the relative position of the pc-relative 4096-byte window.
3b35e7eeSXin LI
3b35e7eeSXin LI    (2) Matching AUIPC instruction alone requires only seven bits.
3b35e7eeSXin LI        When the filter is applied to non-code data, that leads
3b35e7eeSXin LI        to many false positives which make compression worse.
3b35e7eeSXin LI        As long as most AUIPC+inst2 pairs appear as two consecutive
3b35e7eeSXin LI        instructions, converting only such pairs gives better results.
3b35e7eeSXin LI
3b35e7eeSXin LI    In assembly, AUIPC+inst2 tend to look like this:
3b35e7eeSXin LI
3b35e7eeSXin LI        # Call:
3b35e7eeSXin LI        auipc   ra, 0x12345
3b35e7eeSXin LI        jalr    ra, -42(ra)
3b35e7eeSXin LI
3b35e7eeSXin LI        # Tail call:
3b35e7eeSXin LI        auipc   t1, 0x12345
3b35e7eeSXin LI        jalr    zero, -42(t1)
3b35e7eeSXin LI
3b35e7eeSXin LI        # Getting the absolute address:
3b35e7eeSXin LI        auipc   a0, 0x12345
3b35e7eeSXin LI        addi    a0, a0, -42
3b35e7eeSXin LI
3b35e7eeSXin LI        # rd of inst2 isn't necessarily the same as rs1 even
3b35e7eeSXin LI        # in cases where there is no reason to preserve rs1.
3b35e7eeSXin LI        auipc   a0, 0x12345
3b35e7eeSXin LI        addi    a1, a0, -42
3b35e7eeSXin LI
3b35e7eeSXin LI    As of 2024, 16-bit instructions from the C extension don't
3b35e7eeSXin LI    appear as inst2. The RISC-V psABI doesn't list AUIPC+C.* as
3b35e7eeSXin LI    a linker relaxation type explicitly but it's not disallowed
3b35e7eeSXin LI    either. Usefulness is limited as most of the time the lowest
3b35e7eeSXin LI    12 bits won't fit in a C instruction. This filter doesn't
3b35e7eeSXin LI    support AUIPC+C.* combinations because this makes the filter
3b35e7eeSXin LI    simpler, there are no test files, and it hopefully will never
3b35e7eeSXin LI    be needed anyway.
3b35e7eeSXin LI
3b35e7eeSXin LI    (Compare AUIPC to ARM64 where ADRP does set the lowest 12 bits
3b35e7eeSXin LI    to zero. The paired instruction has the lowest 12 bits of the
3b35e7eeSXin LI    absolute address as is in a zero-extended immediate. Thus the
3b35e7eeSXin LI    ARM64 filter doesn't need to care about the instructions that
3b35e7eeSXin LI    are paired with ADRP. An off-by-4096 issue can still occur if
3b35e7eeSXin LI    the code section isn't aligned with the filter's start offset.
3b35e7eeSXin LI    It's not a problem with standalone ELF files but Windows PE
3b35e7eeSXin LI    files need start_offset=3072 for best results. Also, a .tar
3b35e7eeSXin LI    stores files with 512-byte alignment so most of the time it
3b35e7eeSXin LI    won't be the best for ARM64.)
3b35e7eeSXin LI
3b35e7eeSXin LIAUIPC with rd == x0
3b35e7eeSXin LI-------------------
3b35e7eeSXin LI
3b35e7eeSXin LI    AUIPC instructions with rd=x0 are reserved for HINTs in the base
3b35e7eeSXin LI    instruction set. Such AUIPC instructions are never filtered.
3b35e7eeSXin LI
3b35e7eeSXin LI    As of January 2024, it seems likely that AUIPC with rd=x0 will
3b35e7eeSXin LI    be used for landing pads (pseudoinstruction LPAD). LPAD is used
3b35e7eeSXin LI    to mark valid targets for indirect jumps (for JALR), for example,
3b35e7eeSXin LI    beginnings of functions. The 20-bit immediate in LPAD instruction
3b35e7eeSXin LI    is a label, not a pc-relative address. Thus it would be
3b35e7eeSXin LI    counterproductive to convert AUIPC instructions with rd=x0.
3b35e7eeSXin LI
3b35e7eeSXin LI    Often the next instruction after LPAD won't have rs1=x0 and thus
3b35e7eeSXin LI    the filtering would be skipped for that reason alone. However,
3b35e7eeSXin LI    it's not good to rely on this. For example, consider a function
3b35e7eeSXin LI    that begins like this:
3b35e7eeSXin LI
3b35e7eeSXin LI        int foo(int i)
3b35e7eeSXin LI        {
3b35e7eeSXin LI            if (i <= 234) {
3b35e7eeSXin LI                ...
3b35e7eeSXin LI            }
3b35e7eeSXin LI
3b35e7eeSXin LI    A compiler may generate something like this:
3b35e7eeSXin LI
3b35e7eeSXin LI        lpad    0x54321
3b35e7eeSXin LI        li      a5, 234
3b35e7eeSXin LI        bgt     a0, a5, .L2
3b35e7eeSXin LI
3b35e7eeSXin LI    Converting the pseudoinstructions to raw instructions:
3b35e7eeSXin LI
3b35e7eeSXin LI        auipc   x0, 0x54321
3b35e7eeSXin LI        addi    x15, x0, 234
3b35e7eeSXin LI        blt     x15, x10, .L2
3b35e7eeSXin LI
3b35e7eeSXin LI    In this case the filter would undesirably convert the AUIPC+ADDI
3b35e7eeSXin LI    pair if the filter didn't explicitly skip AUIPC instructions
3b35e7eeSXin LI    that have rd=x0.
3b35e7eeSXin LI
3b35e7eeSXin LI*/
3b35e7eeSXin LI
3b35e7eeSXin LI
3b35e7eeSXin LI#include "simple_private.h"
3b35e7eeSXin LI
3b35e7eeSXin LI
// NOT_AUIPC_PAIR(auipc, inst2) evaluates to a non-zero value when the two
// 32-bit words cannot be treated as an AUIPC+inst2 pair. Two tests are
// folded into a single masked value:
//
//   - Register test: AUIPC's rd lives in bits [11:7] and inst2's rs1 in
//     bits [19:15], so shifting AUIPC left by 8 lines the two fields up.
//     After the XOR, any non-zero bit in [19:15] means that the registers
//     differ.
//
//   - Opcode test: (inst2 - 3) has zeros in bits [1:0] only when both of
//     those bits were set in inst2, that is, when inst2 isn't a 16-bit
//     compressed instruction. In that case the subtraction doesn't borrow
//     from the higher bits, so the rs1 field is still intact for the
//     register test. The shifted AUIPC always has zeros in bits [1:0] and
//     thus doesn't disturb this test.
//
// The mask 0xF8003 keeps exactly the bits [19:15] and [1:0]; the words
// are a pair only if all of those bits end up as zeros.
//
// An expression with the same truth value:
//     (((((auipc) << 8) ^ (inst2)) & 0xF8003) != 3)
#define NOT_AUIPC_PAIR(auipc, inst2) \
	(0xF8003 & (((inst2) - 3) ^ ((auipc) << 8)))
3b35e7eeSXin LI
// NOT_SPECIAL_AUIPC(auipc, inst2_rs1) is true (1) unless the word is an
// AUIPC in the special packed format. The caller has already verified
// that the lowest seven bits hold the AUIPC opcode (0x17). The special
// format requires all of the following:
//   (1) AUIPC rd [11:7] == x2 (the special rd value).
//   (2) Bits 12 and 13 are set (they hold the lowest two opcode bits of
//       the packed inst2).
//   (3) inst2_rs1 is neither x0 nor x2, because the opposite conversion
//       is only done when
//           auipc_rd != x0 &&
//           auipc_rd != x2 &&
//           auipc_rd == inst2_rs1.
//
// Right-hand side of the comparison, covering (1) and (2):
//   (a) 0x3117 == 0x17 | (2 << 7) | (3 << 12). Subtracting it turns the
//       bits [13:0] into zeros exactly when the opcode is AUIPC, rd is
//       x2, and the bits [13:12] are both set. With any other rd or
//       other bits [13:12], at least one of the bits [13:0] is non-zero.
//   (b) The left shift by 18 throws away the bits above [13:0] and moves
//       the remaining 14 bits to the top of the word, so a non-zero
//       result is always greater than anything the left-hand side can
//       produce.
//   (c) The cast keeps the shifted value from being wider than uint32_t
//       in case the subtraction was promoted to a larger type.
//
// Left-hand side, covering (3): for a 5-bit register number,
// inst2_rs1 & 0x1D is zero only for x0 and x2.
//
// The '<=' comparison therefore yields true when:
//   - (1) or (2) fails: the right-hand side is non-zero and large, or
//   - (1) and (2) hold but inst2_rs1 is x0 or x2: 0 <= 0.
// It yields false only when all three conditions are met.
#define NOT_SPECIAL_AUIPC(auipc, inst2_rs1) \
	(((inst2_rs1) & 0x1D) <= (uint32_t)(((auipc) - 0x3117) << 18))
3b35e7eeSXin LI
3b35e7eeSXin LI
3b35e7eeSXin LI// The encode and decode functions are split for this filter because of the
3b35e7eeSXin LI// AUIPC+inst2 filtering. This filter design allows a decoder-only
3b35e7eeSXin LI// implementation to be smaller than alternative designs.
3b35e7eeSXin LI
3b35e7eeSXin LI#ifdef HAVE_ENCODER_RISCV
3b35e7eeSXin LIstatic size_t
3b35e7eeSXin LIriscv_encode(void *simple lzma_attribute((__unused__)),
3b35e7eeSXin LI		uint32_t now_pos,
3b35e7eeSXin LI		bool is_encoder lzma_attribute((__unused__)),
3b35e7eeSXin LI		uint8_t *buffer, size_t size)
3b35e7eeSXin LI{
3b35e7eeSXin LI	// Avoid using i + 8 <= size in the loop condition.
3b35e7eeSXin LI	//
3b35e7eeSXin LI	// NOTE: If there is a JAL in the last six bytes of the stream, it
3b35e7eeSXin LI	// won't be converted. This is intentional to keep the code simpler.
3b35e7eeSXin LI	if (size < 8)
3b35e7eeSXin LI		return 0;
3b35e7eeSXin LI
3b35e7eeSXin LI	size -= 8;
3b35e7eeSXin LI
3b35e7eeSXin LI	size_t i;
3b35e7eeSXin LI
3b35e7eeSXin LI	// The loop is advanced by 2 bytes every iteration since the
3b35e7eeSXin LI	// instruction stream may include 16-bit instructions (C extension).
3b35e7eeSXin LI	for (i = 0; i <= size; i += 2) {
3b35e7eeSXin LI		uint32_t inst = buffer[i];
3b35e7eeSXin LI
3b35e7eeSXin LI		if (inst == 0xEF) {
3b35e7eeSXin LI			// JAL
3b35e7eeSXin LI			const uint32_t b1 = buffer[i + 1];
3b35e7eeSXin LI
3b35e7eeSXin LI			// Only filter rd=x1(ra) and rd=x5(t0).
3b35e7eeSXin LI			if ((b1 & 0x0D) != 0)
3b35e7eeSXin LI				continue;
3b35e7eeSXin LI
3b35e7eeSXin LI			// The 20-bit immediate is in four pieces.
3b35e7eeSXin LI			// The encoder stores it in big endian form
3b35e7eeSXin LI			// since it improves compression slightly.
3b35e7eeSXin LI			const uint32_t b2 = buffer[i + 2];
3b35e7eeSXin LI			const uint32_t b3 = buffer[i + 3];
3b35e7eeSXin LI			const uint32_t pc = now_pos + (uint32_t)i;
3b35e7eeSXin LI
3b35e7eeSXin LI// The following chart shows the highest three bytes of JAL, focusing on
3b35e7eeSXin LI// the 20-bit immediate field [31:12]. The first row of numbers is the
3b35e7eeSXin LI// bit position in a 32-bit little endian instruction. The second row of
3b35e7eeSXin LI// numbers shows the order of the immediate field in a J-type instruction.
3b35e7eeSXin LI// The last row is the bit number in each byte.
3b35e7eeSXin LI//
3b35e7eeSXin LI// To determine the amount to shift each bit, subtract the value in
3b35e7eeSXin LI// the last row from the value in the second last row. If the number
3b35e7eeSXin LI// is positive, shift left. If negative, shift right.
3b35e7eeSXin LI//
3b35e7eeSXin LI// For example, at the rightmost side of the chart, the bit 4 in b1 is
3b35e7eeSXin LI// the bit 12 of the address. Thus that bit needs to be shifted left
3b35e7eeSXin LI// by 12 - 4 = 8 bits to put it in the right place in the addr variable.
3b35e7eeSXin LI//
3b35e7eeSXin LI// NOTE: The immediate of a J-type instruction holds bits [20:1] of
3b35e7eeSXin LI// the address. The bit [0] is always 0 and not part of the immediate.
3b35e7eeSXin LI//
3b35e7eeSXin LI// |          b3             |          b2             |          b1         |
3b35e7eeSXin LI// | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
3b35e7eeSXin LI// | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |
3b35e7eeSXin LI// |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |
3b35e7eeSXin LI
3b35e7eeSXin LI			uint32_t addr = ((b1 & 0xF0) << 8)
3b35e7eeSXin LI					| ((b2 & 0x0F) << 16)
3b35e7eeSXin LI					| ((b2 & 0x10) << 7)
3b35e7eeSXin LI					| ((b2 & 0xE0) >> 4)
3b35e7eeSXin LI					| ((b3 & 0x7F) << 4)
3b35e7eeSXin LI					| ((b3 & 0x80) << 13);
3b35e7eeSXin LI
3b35e7eeSXin LI			addr += pc;
3b35e7eeSXin LI
3b35e7eeSXin LI			buffer[i + 1] = (uint8_t)((b1 & 0x0F)
3b35e7eeSXin LI					| ((addr >> 13) & 0xF0));
3b35e7eeSXin LI
3b35e7eeSXin LI			buffer[i + 2] = (uint8_t)(addr >> 9);
3b35e7eeSXin LI			buffer[i + 3] = (uint8_t)(addr >> 1);
3b35e7eeSXin LI
3b35e7eeSXin LI			// The "-2" is included because the for-loop will
3b35e7eeSXin LI			// always increment by 2. In this case, we want to
3b35e7eeSXin LI			// skip an extra 2 bytes since we used 4 bytes
3b35e7eeSXin LI			// of input.
3b35e7eeSXin LI			i += 4 - 2;
3b35e7eeSXin LI
3b35e7eeSXin LI		} else if ((inst & 0x7F) == 0x17) {
3b35e7eeSXin LI			// AUIPC
3b35e7eeSXin LI			inst |= (uint32_t)buffer[i + 1] << 8;
3b35e7eeSXin LI			inst |= (uint32_t)buffer[i + 2] << 16;
3b35e7eeSXin LI			inst |= (uint32_t)buffer[i + 3] << 24;
3b35e7eeSXin LI
3b35e7eeSXin LI			// Branch based on AUIPC's rd. The bitmask test does
3b35e7eeSXin LI			// the same thing as this:
3b35e7eeSXin LI			//
3b35e7eeSXin LI			//     const uint32_t auipc_rd = (inst >> 7) & 0x1F;
3b35e7eeSXin LI			//     if (auipc_rd != 0 && auipc_rd != 2) {
3b35e7eeSXin LI 			if (inst & 0xE80) {
3b35e7eeSXin LI				// AUIPC's rd doesn't equal x0 or x2.
3b35e7eeSXin LI
3b35e7eeSXin LI				// Check if AUIPC+inst2 are a pair.
3b35e7eeSXin LI				uint32_t inst2 = read32le(buffer + i + 4);
3b35e7eeSXin LI
3b35e7eeSXin LI				if (NOT_AUIPC_PAIR(inst, inst2)) {
3b35e7eeSXin LI					// The NOT_AUIPC_PAIR macro allows
3b35e7eeSXin LI					// a false AUIPC+AUIPC pair if the
3b35e7eeSXin LI					// bits [19:15] (where rs1 would be)
3b35e7eeSXin LI					// in the second AUIPC match the rd
3b35e7eeSXin LI					// of the first AUIPC.
3b35e7eeSXin LI					//
3b35e7eeSXin LI					// We must skip enough forward so
3b35e7eeSXin LI					// that the first two bytes of the
3b35e7eeSXin LI					// second AUIPC cannot get converted.
3b35e7eeSXin LI					// Such a conversion could make the
3b35e7eeSXin LI					// current pair become a valid pair
3b35e7eeSXin LI					// which would desync the decoder.
3b35e7eeSXin LI					//
3b35e7eeSXin LI					// Skipping six bytes is enough even
3b35e7eeSXin LI					// though the above condition looks
3b35e7eeSXin LI					// at the lowest four bits of the
3b35e7eeSXin LI					// buffer[i + 6] too. This is safe
3b35e7eeSXin LI					// because this filter never changes
3b35e7eeSXin LI					// those bits if a conversion at
3b35e7eeSXin LI					// that position is done.
3b35e7eeSXin LI					i += 6 - 2;
3b35e7eeSXin LI					continue;
3b35e7eeSXin LI				}
3b35e7eeSXin LI
3b35e7eeSXin LI				// Convert AUIPC+inst2 to a special format:
3b35e7eeSXin LI				//
3b35e7eeSXin LI				//   - The lowest 7 bits [6:0] retain the
3b35e7eeSXin LI				//     AUIPC opcode.
3b35e7eeSXin LI				//
3b35e7eeSXin LI				//   - The rd [11:7] is set to x2(sp). x2 is
3b35e7eeSXin LI				//     used as the stack pointer so AUIPC with
3b35e7eeSXin LI				//     rd=x2 should be very rare in real-world
3b35e7eeSXin LI				//     executables.
3b35e7eeSXin LI				//
3b35e7eeSXin LI				//   - The remaining 20 bits [31:12] (that
3b35e7eeSXin LI				//     normally hold the pc-relative immediate)
3b35e7eeSXin LI				//     are used to store the lowest 20 bits of
3b35e7eeSXin LI				//     inst2. That is, the 12-bit immediate of
3b35e7eeSXin LI				//     inst2 is not included.
3b35e7eeSXin LI				//
3b35e7eeSXin LI				//   - The location of the original inst2 is
3b35e7eeSXin LI				//     used to store the 32-bit absolute
3b35e7eeSXin LI				//     address in big endian format. Compared
3b35e7eeSXin LI				//     to the 20+12-bit split encoding, this
3b35e7eeSXin LI				//     results in a longer uninterrupted
3b35e7eeSXin LI				//     sequence of identical common bytes
3b35e7eeSXin LI				//     when the same address is referred
3b35e7eeSXin LI				//     with different instruction pairs
3b35e7eeSXin LI				//     (like AUIPC+LD vs. AUIPC+ADDI) or
3b35e7eeSXin LI				//     when the occurrences of the same
3b35e7eeSXin LI				//     pair use different registers. When
3b35e7eeSXin LI				//     referring to adjacent memory locations
3b35e7eeSXin LI				//     (like function calls that go via the
3b35e7eeSXin LI				//     ELF PLT), in big endian order only the
3b35e7eeSXin LI				//     last 1-2 bytes differ; in little endian
3b35e7eeSXin LI				//     the differing 1-2 bytes would be in the
3b35e7eeSXin LI				//     middle of the 8-byte sequence.
3b35e7eeSXin LI				//
3b35e7eeSXin LI				// When reversing the transformation, the
3b35e7eeSXin LI				// original rd of AUIPC can be restored
3b35e7eeSXin LI				// from inst2's rs1 as they are required to
3b35e7eeSXin LI				// be the same.
3b35e7eeSXin LI
3b35e7eeSXin LI				// Arithmetic right shift makes sign extension
3b35e7eeSXin LI				// trivial but (1) it's implementation-defined
3b35e7eeSXin LI				// behavior (C99/C11/C23 6.5.7-p5) and so is
3b35e7eeSXin LI				// (2) casting unsigned to signed (6.3.1.3-p3).
3b35e7eeSXin LI				//
3b35e7eeSXin LI				// One can check for (1) with
3b35e7eeSXin LI				//
3b35e7eeSXin LI				//     if ((-1 >> 1) == -1) ...
3b35e7eeSXin LI				//
3b35e7eeSXin LI				// but (2) has to be checked from the
3b35e7eeSXin LI				// compiler docs. GCC promises that (1)
3b35e7eeSXin LI				// and (2) behave in the common expected
3b35e7eeSXin LI				// way and thus
3b35e7eeSXin LI				//
3b35e7eeSXin LI				//     addr += (uint32_t)(
3b35e7eeSXin LI				//             (int32_t)inst2 >> 20);
3b35e7eeSXin LI				//
3b35e7eeSXin LI				// does the same as the code below. But since
3b35e7eeSXin LI				// the 100 % portable way is only a few bytes
3b35e7eeSXin LI				// bigger code and there is no real speed
3b35e7eeSXin LI				// difference, let's just use that, especially
3b35e7eeSXin LI				// since the decoder doesn't need this at all.
3b35e7eeSXin LI				uint32_t addr = inst & 0xFFFFF000;
3b35e7eeSXin LI				addr += (inst2 >> 20)
3b35e7eeSXin LI						- ((inst2 >> 19) & 0x1000);
3b35e7eeSXin LI
3b35e7eeSXin LI				addr += now_pos + (uint32_t)i;
3b35e7eeSXin LI
3b35e7eeSXin LI				// Construct the first 32 bits:
3b35e7eeSXin LI				//   [6:0]    AUIPC opcode
3b35e7eeSXin LI				//   [11:7]   Special AUIPC rd = x2
3b35e7eeSXin LI				//   [31:12]  The lowest 20 bits of inst2
3b35e7eeSXin LI				inst = 0x17 | (2 << 7) | (inst2 << 12);
3b35e7eeSXin LI
3b35e7eeSXin LI				write32le(buffer + i, inst);
3b35e7eeSXin LI
3b35e7eeSXin LI				// The second 32 bits store the absolute
3b35e7eeSXin LI				// address in big endian order.
3b35e7eeSXin LI				write32be(buffer + i + 4, addr);
3b35e7eeSXin LI			} else {
3b35e7eeSXin LI				// AUIPC's rd equals x0 or x2.
3b35e7eeSXin LI				//
3b35e7eeSXin LI				// x0 indicates a landing pad (LPAD).
3b35e7eeSXin LI				// It's always skipped.
3b35e7eeSXin LI				//
3b35e7eeSXin LI				// AUIPC with rd == x2 is used for the special
3b35e7eeSXin LI				// format as explained above. When the input
3b35e7eeSXin LI				// contains a byte sequence that matches the
3b35e7eeSXin LI				// special format, "fake" decoding must be
3b35e7eeSXin LI				// done to keep the filter bijective (that
3b35e7eeSXin LI				// is, safe to apply on arbitrary data).
3b35e7eeSXin LI				//
3b35e7eeSXin LI				// See the "x0 or x2" section in riscv_decode()
3b35e7eeSXin LI				// for how the "real" decoding is done. The
3b35e7eeSXin LI				// "fake" decoding is a simplified version
3b35e7eeSXin LI				// of "real" decoding with the following
3b35e7eeSXin LI				// differences (these reduce code size of
3b35e7eeSXin LI				// the decoder):
3b35e7eeSXin LI				// (1) The lowest 12 bits aren't sign-extended.
3b35e7eeSXin LI				// (2) No address conversion is done.
3b35e7eeSXin LI				// (3) Big endian format isn't used (the fake
3b35e7eeSXin LI				//     address is in little endian order).
3b35e7eeSXin LI
3b35e7eeSXin LI				// Check if inst matches the special format.
3b35e7eeSXin LI				const uint32_t fake_rs1 = inst >> 27;
3b35e7eeSXin LI
3b35e7eeSXin LI				if (NOT_SPECIAL_AUIPC(inst, fake_rs1)) {
3b35e7eeSXin LI					i += 4 - 2;
3b35e7eeSXin LI					continue;
3b35e7eeSXin LI				}
3b35e7eeSXin LI
3b35e7eeSXin LI				const uint32_t fake_addr =
3b35e7eeSXin LI						read32le(buffer + i + 4);
3b35e7eeSXin LI
3b35e7eeSXin LI				// Construct the second 32 bits:
3b35e7eeSXin LI				//   [19:0]   Upper 20 bits from AUIPC
3b35e7eeSXin LI				//   [31:20]  The lowest 12 bits of fake_addr
3b35e7eeSXin LI				const uint32_t fake_inst2 = (inst >> 12)
3b35e7eeSXin LI						| (fake_addr << 20);
3b35e7eeSXin LI
3b35e7eeSXin LI				// Construct new first 32 bits from:
3b35e7eeSXin LI				//   [6:0]   AUIPC opcode
3b35e7eeSXin LI				//   [11:7]  Fake AUIPC rd = fake_rs1
3b35e7eeSXin LI				//   [31:12] The highest 20 bits of fake_addr
3b35e7eeSXin LI				inst = 0x17 | (fake_rs1 << 7)
3b35e7eeSXin LI					| (fake_addr & 0xFFFFF000);
3b35e7eeSXin LI
3b35e7eeSXin LI				write32le(buffer + i, inst);
3b35e7eeSXin LI				write32le(buffer + i + 4, fake_inst2);
3b35e7eeSXin LI			}
3b35e7eeSXin LI
3b35e7eeSXin LI			i += 8 - 2;
3b35e7eeSXin LI		}
3b35e7eeSXin LI	}
3b35e7eeSXin LI
3b35e7eeSXin LI	return i;
3b35e7eeSXin LI}
3b35e7eeSXin LI
3b35e7eeSXin LI
3b35e7eeSXin LIextern lzma_ret
3b35e7eeSXin LIlzma_simple_riscv_encoder_init(lzma_next_coder *next,
3b35e7eeSXin LI		const lzma_allocator *allocator,
3b35e7eeSXin LI		const lzma_filter_info *filters)
3b35e7eeSXin LI{
3b35e7eeSXin LI	return lzma_simple_coder_init(next, allocator, filters,
3b35e7eeSXin LI			&riscv_encode, 0, 8, 2, true);
3b35e7eeSXin LI}
*128836d3SXin LI
*128836d3SXin LI
*128836d3SXin LIextern LZMA_API(size_t)
*128836d3SXin LIlzma_bcj_riscv_encode(uint32_t start_offset, uint8_t *buf, size_t size)
*128836d3SXin LI{
*128836d3SXin LI	// start_offset must be a multiple of two.
*128836d3SXin LI	start_offset &= ~UINT32_C(1);
*128836d3SXin LI	return riscv_encode(NULL, start_offset, true, buf, size);
*128836d3SXin LI}
3b35e7eeSXin LI#endif
3b35e7eeSXin LI
3b35e7eeSXin LI
3b35e7eeSXin LI#ifdef HAVE_DECODER_RISCV
3b35e7eeSXin LIstatic size_t
3b35e7eeSXin LIriscv_decode(void *simple lzma_attribute((__unused__)),
3b35e7eeSXin LI		uint32_t now_pos,
3b35e7eeSXin LI		bool is_encoder lzma_attribute((__unused__)),
3b35e7eeSXin LI		uint8_t *buffer, size_t size)
3b35e7eeSXin LI{
3b35e7eeSXin LI	if (size < 8)
3b35e7eeSXin LI		return 0;
3b35e7eeSXin LI
3b35e7eeSXin LI	size -= 8;
3b35e7eeSXin LI
3b35e7eeSXin LI	size_t i;
3b35e7eeSXin LI	for (i = 0; i <= size; i += 2) {
3b35e7eeSXin LI		uint32_t inst = buffer[i];
3b35e7eeSXin LI
3b35e7eeSXin LI		if (inst == 0xEF) {
3b35e7eeSXin LI			// JAL
3b35e7eeSXin LI			const uint32_t b1 = buffer[i + 1];
3b35e7eeSXin LI
3b35e7eeSXin LI			// Only filter rd=x1(ra) and rd=x5(t0).
3b35e7eeSXin LI			if ((b1 & 0x0D) != 0)
3b35e7eeSXin LI				continue;
3b35e7eeSXin LI
3b35e7eeSXin LI			const uint32_t b2 = buffer[i + 2];
3b35e7eeSXin LI			const uint32_t b3 = buffer[i + 3];
3b35e7eeSXin LI			const uint32_t pc = now_pos + (uint32_t)i;
3b35e7eeSXin LI
3b35e7eeSXin LI// |          b3             |          b2             |          b1         |
3b35e7eeSXin LI// | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
3b35e7eeSXin LI// | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |
3b35e7eeSXin LI// |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |
3b35e7eeSXin LI
3b35e7eeSXin LI			uint32_t addr = ((b1 & 0xF0) << 13)
3b35e7eeSXin LI					| (b2 << 9) | (b3 << 1);
3b35e7eeSXin LI
3b35e7eeSXin LI			addr -= pc;
3b35e7eeSXin LI
3b35e7eeSXin LI			buffer[i + 1] = (uint8_t)((b1 & 0x0F)
3b35e7eeSXin LI					| ((addr >> 8) & 0xF0));
3b35e7eeSXin LI
3b35e7eeSXin LI			buffer[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
3b35e7eeSXin LI					| ((addr >> 7) & 0x10)
3b35e7eeSXin LI					| ((addr << 4) & 0xE0));
3b35e7eeSXin LI
3b35e7eeSXin LI			buffer[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
3b35e7eeSXin LI					| ((addr >> 13) & 0x80));
3b35e7eeSXin LI
3b35e7eeSXin LI			i += 4 - 2;
3b35e7eeSXin LI
3b35e7eeSXin LI		} else if ((inst & 0x7F) == 0x17) {
3b35e7eeSXin LI			// AUIPC
3b35e7eeSXin LI			uint32_t inst2;
3b35e7eeSXin LI
3b35e7eeSXin LI			inst |= (uint32_t)buffer[i + 1] << 8;
3b35e7eeSXin LI			inst |= (uint32_t)buffer[i + 2] << 16;
3b35e7eeSXin LI			inst |= (uint32_t)buffer[i + 3] << 24;
3b35e7eeSXin LI
3b35e7eeSXin LI			if (inst & 0xE80) {
3b35e7eeSXin LI				// AUIPC's rd doesn't equal x0 or x2.
3b35e7eeSXin LI
3b35e7eeSXin LI				// Check if it is a "fake" AUIPC+inst2 pair.
3b35e7eeSXin LI				inst2 = read32le(buffer + i + 4);
3b35e7eeSXin LI
3b35e7eeSXin LI				if (NOT_AUIPC_PAIR(inst, inst2)) {
3b35e7eeSXin LI					i += 6 - 2;
3b35e7eeSXin LI					continue;
3b35e7eeSXin LI				}
3b35e7eeSXin LI
3b35e7eeSXin LI				// Decode (or more like re-encode) the "fake"
3b35e7eeSXin LI				// pair. The "fake" format doesn't do
3b35e7eeSXin LI				// sign-extension, address conversion, or
3b35e7eeSXin LI				// use big endian. (The use of little endian
3b35e7eeSXin LI				// allows sharing the write32le() calls in
3b35e7eeSXin LI				// the decoder to reduce code size when
3b35e7eeSXin LI				// unaligned access isn't supported.)
3b35e7eeSXin LI				uint32_t addr = inst & 0xFFFFF000;
3b35e7eeSXin LI				addr += inst2 >> 20;
3b35e7eeSXin LI
3b35e7eeSXin LI				inst = 0x17 | (2 << 7) | (inst2 << 12);
3b35e7eeSXin LI				inst2 = addr;
3b35e7eeSXin LI			} else {
3b35e7eeSXin LI				// AUIPC's rd equals x0 or x2.
3b35e7eeSXin LI
3b35e7eeSXin LI				// Check if inst matches the special format
3b35e7eeSXin LI				// used by the encoder.
3b35e7eeSXin LI				const uint32_t inst2_rs1 = inst >> 27;
3b35e7eeSXin LI
3b35e7eeSXin LI				if (NOT_SPECIAL_AUIPC(inst, inst2_rs1)) {
3b35e7eeSXin LI					i += 4 - 2;
3b35e7eeSXin LI					continue;
3b35e7eeSXin LI				}
3b35e7eeSXin LI
3b35e7eeSXin LI				// Decode the "real" pair.
3b35e7eeSXin LI				uint32_t addr = read32be(buffer + i + 4);
3b35e7eeSXin LI
3b35e7eeSXin LI				addr -= now_pos + (uint32_t)i;
3b35e7eeSXin LI
3b35e7eeSXin LI				// The second instruction:
3b35e7eeSXin LI				//   - Get the lowest 20 bits from inst.
3b35e7eeSXin LI				//   - Add the lowest 12 bits of the address
3b35e7eeSXin LI				//     as the immediate field.
3b35e7eeSXin LI				inst2 = (inst >> 12) | (addr << 20);
3b35e7eeSXin LI
3b35e7eeSXin LI				// AUIPC:
3b35e7eeSXin LI				//   - rd is the same as inst2_rs1.
3b35e7eeSXin LI				//   - The sign extension of the lowest 12 bits
3b35e7eeSXin LI				//     must be taken into account.
3b35e7eeSXin LI				inst = 0x17 | (inst2_rs1 << 7)
3b35e7eeSXin LI					| ((addr + 0x800) & 0xFFFFF000);
3b35e7eeSXin LI			}
3b35e7eeSXin LI
3b35e7eeSXin LI			// Both decoder branches write in little endian order.
3b35e7eeSXin LI			write32le(buffer + i, inst);
3b35e7eeSXin LI			write32le(buffer + i + 4, inst2);
3b35e7eeSXin LI
3b35e7eeSXin LI			i += 8 - 2;
3b35e7eeSXin LI		}
3b35e7eeSXin LI	}
3b35e7eeSXin LI
3b35e7eeSXin LI	return i;
3b35e7eeSXin LI}
3b35e7eeSXin LI
3b35e7eeSXin LI
3b35e7eeSXin LIextern lzma_ret
3b35e7eeSXin LIlzma_simple_riscv_decoder_init(lzma_next_coder *next,
3b35e7eeSXin LI		const lzma_allocator *allocator,
3b35e7eeSXin LI		const lzma_filter_info *filters)
3b35e7eeSXin LI{
3b35e7eeSXin LI	return lzma_simple_coder_init(next, allocator, filters,
3b35e7eeSXin LI			&riscv_decode, 0, 8, 2, false);
3b35e7eeSXin LI}
*128836d3SXin LI
*128836d3SXin LI
*128836d3SXin LIextern LZMA_API(size_t)
*128836d3SXin LIlzma_bcj_riscv_decode(uint32_t start_offset, uint8_t *buf, size_t size)
*128836d3SXin LI{
*128836d3SXin LI	// start_offset must be a multiple of two.
*128836d3SXin LI	start_offset &= ~UINT32_C(1);
*128836d3SXin LI	return riscv_decode(NULL, start_offset, false, buf, size);
*128836d3SXin LI}
3b35e7eeSXin LI#endif