// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       arm64.c
/// \brief      Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//              Jia Tan
//              Igor Pavlov
//
///////////////////////////////////////////////////////////////////////////////

#include "simple_private.h"

26 static size_t
arm64_code(void * simple lzma_attribute ((__unused__)),uint32_t now_pos,bool is_encoder,uint8_t * buffer,size_t size)27 arm64_code(void *simple lzma_attribute((__unused__)),
28 		uint32_t now_pos, bool is_encoder,
29 		uint8_t *buffer, size_t size)
30 {
31 	size &= ~(size_t)3;
32 
33 	size_t i;
34 
35 	// Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
36 	// with auto-vectorization that is enabled by default with -O2.
37 	// Such vectorization bloat happens with -O2 when targeting ARM64 too
38 	// but performance hasn't been tested.
39 #ifdef __clang__
40 #	pragma clang loop vectorize(disable)
41 #endif
42 	for (i = 0; i < size; i += 4) {
43 		uint32_t pc = (uint32_t)(now_pos + i);
44 		uint32_t instr = read32le(buffer + i);
45 
46 		if ((instr >> 26) == 0x25) {
47 			// BL instruction:
48 			// The full 26-bit immediate is converted.
49 			// The range is +/-128 MiB.
50 			//
51 			// Using the full range helps quite a lot with
52 			// big executables. Smaller range would reduce false
53 			// positives in non-code sections of the input though
54 			// so this is a compromise that slightly favors big
55 			// files. With the full range, only six bits of the 32
56 			// need to match to trigger a conversion.
57 			const uint32_t src = instr;
58 			instr = 0x94000000;
59 
60 			pc >>= 2;
61 			if (!is_encoder)
62 				pc = 0U - pc;
63 
64 			instr |= (src + pc) & 0x03FFFFFF;
65 			write32le(buffer + i, instr);
66 
67 		} else if ((instr & 0x9F000000) == 0x90000000) {
68 			// ADRP instruction:
69 			// Only values in the range +/-512 MiB are converted.
70 			//
71 			// Using less than the full +/-4 GiB range reduces
72 			// false positives on non-code sections of the input
73 			// while being excellent for executables up to 512 MiB.
74 			// The positive effect of ADRP conversion is smaller
75 			// than that of BL but it also doesn't hurt so much in
76 			// non-code sections of input because, with +/-512 MiB
77 			// range, nine bits of 32 need to match to trigger a
78 			// conversion (two 10-bit match choices = 9 bits).
79 			const uint32_t src = ((instr >> 29) & 3)
80 					| ((instr >> 3) & 0x001FFFFC);
81 
82 			// With the addition only one branch is needed to
83 			// check the +/- range. This is usually false when
84 			// processing ARM64 code so branch prediction will
85 			// handle it well in terms of performance.
86 			//
87 			//if ((src & 0x001E0000) != 0
88 			// && (src & 0x001E0000) != 0x001E0000)
89 			if ((src + 0x00020000) & 0x001C0000)
90 				continue;
91 
92 			instr &= 0x9000001F;
93 
94 			pc >>= 12;
95 			if (!is_encoder)
96 				pc = 0U - pc;
97 
98 			const uint32_t dest = src + pc;
99 			instr |= (dest & 3) << 29;
100 			instr |= (dest & 0x0003FFFC) << 3;
101 			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
102 			write32le(buffer + i, instr);
103 		}
104 	}
105 
106 	return i;
107 }


110 static lzma_ret
arm64_coder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters,bool is_encoder)111 arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
112 		const lzma_filter_info *filters, bool is_encoder)
113 {
114 	return lzma_simple_coder_init(next, allocator, filters,
115 			&arm64_code, 0, 4, 4, is_encoder);
116 }


#ifdef HAVE_ENCODER_ARM64
120 extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters)121 lzma_simple_arm64_encoder_init(lzma_next_coder *next,
122 		const lzma_allocator *allocator,
123 		const lzma_filter_info *filters)
124 {
125 	return arm64_coder_init(next, allocator, filters, true);
126 }


129 extern LZMA_API(size_t)
lzma_bcj_arm64_encode(uint32_t start_offset,uint8_t * buf,size_t size)130 lzma_bcj_arm64_encode(uint32_t start_offset, uint8_t *buf, size_t size)
131 {
132 	// start_offset must be a multiple of four.
133 	start_offset &= ~UINT32_C(3);
134 	return arm64_code(NULL, start_offset, true, buf, size);
135 }
#endif


#ifdef HAVE_DECODER_ARM64
140 extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters)141 lzma_simple_arm64_decoder_init(lzma_next_coder *next,
142 		const lzma_allocator *allocator,
143 		const lzma_filter_info *filters)
144 {
145 	return arm64_coder_init(next, allocator, filters, false);
146 }


149 extern LZMA_API(size_t)
lzma_bcj_arm64_decode(uint32_t start_offset,uint8_t * buf,size_t size)150 lzma_bcj_arm64_decode(uint32_t start_offset, uint8_t *buf, size_t size)
151 {
152 	// start_offset must be a multiple of four.
153 	start_offset &= ~UINT32_C(3);
154 	return arm64_code(NULL, start_offset, false, buf, size);
155 }
#endif