xref: /freebsd/contrib/xz/src/liblzma/simple/arm64.c (revision aa1a8ff2d6dbc51ef058f46f3db5a8bb77967145)
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 /// \file       arm64.c
4 /// \brief      Filter for ARM64 binaries
5 ///
6 /// This converts ARM64 relative addresses in the BL and ADRP immediates
7 /// to absolute values to increase redundancy of ARM64 code.
8 ///
9 /// Converting B or ADR instructions was also tested but it's not useful.
10 /// A majority of the jumps for the B instruction are very small (+/- 0xFF).
11 /// These are typical for loops and if-statements. Encoding them to their
12 /// absolute address reduces redundancy since many of the small relative
13 /// jump values are repeated, but very few of the absolute addresses are.
14 //
15 //  Authors:    Lasse Collin
16 //              Jia Tan
17 //              Igor Pavlov
18 //
19 //  This file has been put into the public domain.
20 //  You can do whatever you want with this file.
21 //
22 ///////////////////////////////////////////////////////////////////////////////
23 
24 #include "simple_private.h"
25 
26 
27 static size_t
28 arm64_code(void *simple lzma_attribute((__unused__)),
29 		uint32_t now_pos, bool is_encoder,
30 		uint8_t *buffer, size_t size)
31 {
32 	size_t i;
33 
34 	// Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
35 	// with auto-vectorization that is enabled by default with -O2.
36 	// Such vectorization bloat happens with -O2 when targeting ARM64 too
37 	// but performance hasn't been tested.
38 #ifdef __clang__
39 #	pragma clang loop vectorize(disable)
40 #endif
41 	for (i = 0; i + 4 <= size; i += 4) {
42 		uint32_t pc = (uint32_t)(now_pos + i);
43 		uint32_t instr = read32le(buffer + i);
44 
45 		if ((instr >> 26) == 0x25) {
46 			// BL instruction:
47 			// The full 26-bit immediate is converted.
48 			// The range is +/-128 MiB.
49 			//
50 			// Using the full range is helps quite a lot with
51 			// big executables. Smaller range would reduce false
52 			// positives in non-code sections of the input though
53 			// so this is a compromise that slightly favors big
54 			// files. With the full range only six bits of the 32
55 			// need to match to trigger a conversion.
56 			const uint32_t src = instr;
57 			instr = 0x94000000;
58 
59 			pc >>= 2;
60 			if (!is_encoder)
61 				pc = 0U - pc;
62 
63 			instr |= (src + pc) & 0x03FFFFFF;
64 			write32le(buffer + i, instr);
65 
66 		} else if ((instr & 0x9F000000) == 0x90000000) {
67 			// ADRP instruction:
68 			// Only values in the range +/-512 MiB are converted.
69 			//
70 			// Using less than the full +/-4 GiB range reduces
71 			// false positives on non-code sections of the input
72 			// while being excellent for executables up to 512 MiB.
73 			// The positive effect of ADRP conversion is smaller
74 			// than that of BL but it also doesn't hurt so much in
75 			// non-code sections of input because, with +/-512 MiB
76 			// range, nine bits of 32 need to match to trigger a
77 			// conversion (two 10-bit match choices = 9 bits).
78 			const uint32_t src = ((instr >> 29) & 3)
79 					| ((instr >> 3) & 0x001FFFFC);
80 
81 			// With the addition only one branch is needed to
82 			// check the +/- range. This is usually false when
83 			// processing ARM64 code so branch prediction will
84 			// handle it well in terms of performance.
85 			//
86 			//if ((src & 0x001E0000) != 0
87 			// && (src & 0x001E0000) != 0x001E0000)
88 			if ((src + 0x00020000) & 0x001C0000)
89 				continue;
90 
91 			instr &= 0x9000001F;
92 
93 			pc >>= 12;
94 			if (!is_encoder)
95 				pc = 0U - pc;
96 
97 			const uint32_t dest = src + pc;
98 			instr |= (dest & 3) << 29;
99 			instr |= (dest & 0x0003FFFC) << 3;
100 			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
101 			write32le(buffer + i, instr);
102 		}
103 	}
104 
105 	return i;
106 }
107 
108 
109 static lzma_ret
110 arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
111 		const lzma_filter_info *filters, bool is_encoder)
112 {
113 	return lzma_simple_coder_init(next, allocator, filters,
114 			&arm64_code, 0, 4, 4, is_encoder);
115 }
116 
117 
#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize the ARM64 filter for encoding
///
/// Forwards to arm64_coder_init() with the encoder direction selected.
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool encoding = true;

	return arm64_coder_init(next, allocator, filters, encoding);
}
#endif
127 
128 
#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize the ARM64 filter for decoding
///
/// Forwards to arm64_coder_init() with the decoder direction selected.
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool encoding = false;

	return arm64_coder_init(next, allocator, filters, encoding);
}
#endif
138