riscv.c - OpenGrok cross reference for /freebsd/contrib/xz/src/liblzma/simple/riscv.c

Lines Matching +full:1 +full:- +full:bit +full:- +full:only
1 // SPDX-License-Identifier: 0BSD
6 /// \brief      Filter for 32-bit/64-bit little/big endian RISC-V binaries
13 /// The paired instruction opcode must only have its lowest two bits set,
14 /// meaning it will convert any paired instruction that is not a 16-bit
23 //    - Chien Wong <m@xv97.com> provided a few early versions of RISC-V
26 //    - Igor Pavlov helped a lot in the filter design, getting it both
34 RISC-V filtering
45     (pc-relative) behavior:
48 ---
53     appear only once or at most a few times. Tail calls and jumps
60     to non-code data, only the JAL instructions that use x1 or x5
61     are converted. JAL has pc-relative range of +/-1 MiB so longer
65 -------------
67     C.J and C.JAL have pc-relative range of +/-2 KiB.
72     C.JAL is an RV32C-only instruction. Its encoding overlaps with
73     RV64C-only C.ADDIW which is a common instruction. So if filtering
76     significant problem when the filter is applied to non-code data
77     because C.JAL needs only five bits to match. Thus, this filter
81 --------------------------------------------------
83     These are conditional branches with pc-relative range
84     of +/-4 KiB (+/-256 B for C.*). The absolute addresses often
85     appear only once and very short distances are the most common,
89 -------------------
92     pc-relative jumps, calls, loads, stores, and for taking
93     an address of a symbol. AUIPC has a 20-bit immediate and
94     the possible inst2 choices have a 12-bit immediate.
96     AUIPC stores pc + 20-bit signed immediate to a register.
98     has a pc-relative range of +/-2 GiB. AUIPC does *NOT* set
100     the 12-bit immediate in inst2 cannot just include the lowest
104     not only AUIPC but also the paired inst2.
109       - AUIPC+JALR: Function calls, including tail calls.
111       - AUIPC+ADDI: Calculating the address of a function
114       - AUIPC+load/store from the base instruction sets
124     NOTE: AUIPC+inst2 can only be a pair if AUIPC's rd specifies
127     Instead of strictly accepting only the above instructions as inst2,
129     inst2 must be set, that is, inst2 must not be a 16-bit compressed
130     instruction. So this will accept all 32-bit and possible future
132     rd [11:7] match the bits [19:15] in inst2 (the bits that I-type and
133     S-type instructions use for rs1). Testing showed that this relaxed
137     Additionally, the paired instruction is always treated as an I-type
138     instruction. The S-type instructions used by stores (SB, SH, SW,
140     location than I-type instructions. AUIPC+store pairs are less
142     code required to handle S-type instructions was not worth the
153       - AUIPC is outside a loop and inst2 (load/store) is inside
155         executed only once.
157       - Load-modify-store may have AUIPC for the load and the same
158         AUIPC-result is used for the store too. This may get combined
161       - AUIPC is before a conditional branch and inst2 is hundreds
164       - Inner and outer pair:
168             ld      a2,-500(a2)
169             addi    a1,a1,-233
171       - Many split pairs with an untaken conditional branch between:
173             auipc   s9,0x1613   # Pair 1
179             ld      a6,246(s9)  # Pair 1
186     so filtering only such pairs gives good results and makes the
192     (1) The conversion would be off-by-one (or off-by-4096) half the
194         aren't known. We only know that the absolute address is
195         pc + AUIPC_imm20 + [-2048, +2047] but there is no way to
196         know the exact 4096-byte multiple (or 4096 * n + 2048):
200         NOTE: The sign-extension of inst2_imm12 adds a tiny bit
202         the reason for this problem. The sign-extension only changes
203         the relative position of the pc-relative 4096-byte window.
205     (2) Matching AUIPC instruction alone requires only seven bits.
206         When the filter is applied to non-code data, that leads
209         instructions, converting only such pairs gives better results.
215         jalr    ra, -42(ra)
219         jalr    zero, -42(t1)
223         addi    a0, a0, -42
228         addi    a1, a0, -42
230     As of 2024, 16-bit instructions from the C extension don't
231     appear as inst2. The RISC-V psABI doesn't list AUIPC+C.* as
241     absolute address as is in a zero-extended immediate. Thus the
243     are paired with ADRP. An off-by-4096 issue can still occur if
247     stores files with 512-byte alignment so most of the time it
251 -------------------
259     beginnings of functions. The 20-bit immediate in LPAD instruction
260     is a label, not a pc-relative address. Thus it would be
297 //    - AUIPC rd == inst2 rs1.
298 //    - inst2 opcode has the lowest two bits set.
300 // The 8 bit left shift aligns the rd of AUIPC with the rs1 of inst2.
301 // By XORing the registers, any non-zero value in those bits indicates the
303 // inst2 will zero out the first two opcode bits only when they are set.
309 	((((auipc) << 8) ^ ((inst2) - 3)) & 0xF8003)
312 //   (1) AUIPC rd [11:7] == x2 (special rd value).
315 //       conversion is only done when
320 // The left-hand side takes care of (1) and (2).
324 //       If rd doesn't equal x2, then there will be at least one non-zero bit
328 //       will be at least one non-zero bit.
331 // ensures that any non-zero result will be larger than any possible result
332 // from the right-hand side of the comparison. The cast ensures that the
333 // left-hand side didn't get promoted to a larger type than uint32_t.
335 // On the right-hand side, inst2_rs1 & 0x1D will be non-zero as long as
339 //   - The subtraction caused any bits to be set (special AUIPC rd value not
340 //     used or inst2 opcode bits not set). (non-zero >= non-zero or 0)
341 //   - The subtraction did not cause any bits to be set but inst2_rs1 was
344 	((uint32_t)(((auipc) - 0x3117) << 18) >= ((inst2_rs1) & 0x1D))
348 // AUIPC+inst2 filtering. This filter design allows a decoder-only
365 	size -= 8;  in riscv_encode()
370 	// instruction stream may include 16-bit instructions (C extension).  in riscv_encode()
376 			const uint32_t b1 = buffer[i + 1];  in riscv_encode()
378 			// Only filter rd=x1(ra) and rd=x5(t0).  in riscv_encode()
382 			// The 20-bit immediate is in four pieces.  in riscv_encode()
390 // the 20-bit immediate field [31:12]. The first row of numbers is the  in riscv_encode()
391 // bit position in a 32-bit little endian instruction. The second row of  in riscv_encode()
392 // numbers shows the order of the immediate field in a J-type instruction.  in riscv_encode()
393 // The last row is the bit number in each byte.  in riscv_encode()
395 // To determine the amount to shift each bit, subtract the value in  in riscv_encode()
399 // For example, at the rightmost side of the chart, the bit 4 in b1 is  in riscv_encode()
400 // the bit 12 of the address. Thus that bit needs to be shifted left  in riscv_encode()
401 // by 12 - 4 = 8 bits to put it in the right place in the addr variable.  in riscv_encode()
403 // NOTE: The immediate of a J-type instruction holds bits [20:1] of  in riscv_encode()
404 // the address. The bit [0] is always 0 and not part of the immediate.  in riscv_encode()
408 // | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |  in riscv_encode()
409 // |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |  in riscv_encode()
420 			buffer[i + 1] = (uint8_t)((b1 & 0x0F)  in riscv_encode()
424 			buffer[i + 3] = (uint8_t)(addr >> 1);  in riscv_encode()
426 			// The "-2" is included because the for-loop will  in riscv_encode()
430 			i += 4 - 2;  in riscv_encode()
434 			inst |= (uint32_t)buffer[i + 1] << 8;  in riscv_encode()
470 					i += 6 - 2;  in riscv_encode()
476 				//   - The lowest 7 bits [6:0] retain the  in riscv_encode()
479 				//   - The rd [11:7] is set to x2(sp). x2 is  in riscv_encode()
481 				//     rd=x2 should be very rare in real-world  in riscv_encode()
484 				//   - The remaining 20 bits [31:12] (that  in riscv_encode()
485 				//     normally hold the pc-relative immediate)  in riscv_encode()
487 				//     inst2. That is, the 12-bit immediate of  in riscv_encode()
490 				//   - The location of the original inst2 is  in riscv_encode()
491 				//     used to store the 32-bit absolute  in riscv_encode()
493 				//     to the 20+12-bit split encoding, this  in riscv_encode()
503 				//     ELF PLT), in big endian order only the  in riscv_encode()
504 				//     last 1-2 bytes differ; in little endian  in riscv_encode()
505 				//     the differing 1-2 bytes would be in the  in riscv_encode()
506 				//     middle of the 8-byte sequence.  in riscv_encode()
514 				// trivial but (1) it's implementation-defined  in riscv_encode()
515 				// behavior (C99/C11/C23 6.5.7-p5) and so is  in riscv_encode()
516 				// (2) casting unsigned to signed (6.3.1.3-p3).  in riscv_encode()
518 				// One can check for (1) with  in riscv_encode()
520 				//     if ((-1 >> 1) == -1) ...  in riscv_encode()
523 				// compiler docs. GCC promises that (1)  in riscv_encode()
531 				// the 100 % portable way is only a few bytes  in riscv_encode()
537 						- ((inst2 >> 19) & 0x1000);  in riscv_encode()
571 				// (1) The lowest 12 bits aren't sign-extended.  in riscv_encode()
580 					i += 4 - 2;  in riscv_encode()
604 			i += 8 - 2;  in riscv_encode()
626 	start_offset &= ~UINT32_C(1);  in lzma_bcj_riscv_encode()
642 	size -= 8;  in riscv_decode()
650 			const uint32_t b1 = buffer[i + 1];  in riscv_decode()
652 			// Only filter rd=x1(ra) and rd=x5(t0).  in riscv_decode()
662 // | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |  in riscv_decode()
663 // |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |  in riscv_decode()
666 					| (b2 << 9) | (b3 << 1);  in riscv_decode()
668 			addr -= pc;  in riscv_decode()
670 			buffer[i + 1] = (uint8_t)((b1 & 0x0F)  in riscv_decode()
680 			i += 4 - 2;  in riscv_decode()
686 			inst |= (uint32_t)buffer[i + 1] << 8;  in riscv_decode()
697 					i += 6 - 2;  in riscv_decode()
701 				// Decode (or more like re-encode) the "fake"  in riscv_decode()
703 				// sign-extension, address conversion, or  in riscv_decode()
721 					i += 4 - 2;  in riscv_decode()
728 				addr -= now_pos + (uint32_t)i;  in riscv_decode()
731 				//   - Get the lowest 20 bits from inst.  in riscv_decode()
732 				//   - Add the lowest 12 bits of the address  in riscv_decode()
737 				//   - rd is the same as inst2_rs1.  in riscv_decode()
738 				//   - The sign extension of the lowest 12 bits  in riscv_decode()
748 			i += 8 - 2;  in riscv_decode()
770 	start_offset &= ~UINT32_C(1);  in lzma_bcj_riscv_decode()