xref: /freebsd/contrib/xz/src/liblzma/simple/arm64.c (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       arm64.c
/// \brief      Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//              Jia Tan
//              Igor Pavlov
//
///////////////////////////////////////////////////////////////////////////////
2273ed8e77SXin LI 
2373ed8e77SXin LI #include "simple_private.h"
2473ed8e77SXin LI 
2573ed8e77SXin LI 
2673ed8e77SXin LI static size_t
arm64_code(void * simple lzma_attribute ((__unused__)),uint32_t now_pos,bool is_encoder,uint8_t * buffer,size_t size)2773ed8e77SXin LI arm64_code(void *simple lzma_attribute((__unused__)),
2873ed8e77SXin LI 		uint32_t now_pos, bool is_encoder,
2973ed8e77SXin LI 		uint8_t *buffer, size_t size)
3073ed8e77SXin LI {
31*128836d3SXin LI 	size &= ~(size_t)3;
32*128836d3SXin LI 
3373ed8e77SXin LI 	size_t i;
3473ed8e77SXin LI 
3573ed8e77SXin LI 	// Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
3673ed8e77SXin LI 	// with auto-vectorization that is enabled by default with -O2.
3773ed8e77SXin LI 	// Such vectorization bloat happens with -O2 when targeting ARM64 too
3873ed8e77SXin LI 	// but performance hasn't been tested.
3973ed8e77SXin LI #ifdef __clang__
4073ed8e77SXin LI #	pragma clang loop vectorize(disable)
4173ed8e77SXin LI #endif
42*128836d3SXin LI 	for (i = 0; i < size; i += 4) {
4373ed8e77SXin LI 		uint32_t pc = (uint32_t)(now_pos + i);
4473ed8e77SXin LI 		uint32_t instr = read32le(buffer + i);
4573ed8e77SXin LI 
4673ed8e77SXin LI 		if ((instr >> 26) == 0x25) {
4773ed8e77SXin LI 			// BL instruction:
4873ed8e77SXin LI 			// The full 26-bit immediate is converted.
4973ed8e77SXin LI 			// The range is +/-128 MiB.
5073ed8e77SXin LI 			//
5126743408SXin LI 			// Using the full range helps quite a lot with
5273ed8e77SXin LI 			// big executables. Smaller range would reduce false
5373ed8e77SXin LI 			// positives in non-code sections of the input though
5473ed8e77SXin LI 			// so this is a compromise that slightly favors big
5526743408SXin LI 			// files. With the full range, only six bits of the 32
5673ed8e77SXin LI 			// need to match to trigger a conversion.
5773ed8e77SXin LI 			const uint32_t src = instr;
5873ed8e77SXin LI 			instr = 0x94000000;
5973ed8e77SXin LI 
6073ed8e77SXin LI 			pc >>= 2;
6173ed8e77SXin LI 			if (!is_encoder)
6273ed8e77SXin LI 				pc = 0U - pc;
6373ed8e77SXin LI 
6473ed8e77SXin LI 			instr |= (src + pc) & 0x03FFFFFF;
6573ed8e77SXin LI 			write32le(buffer + i, instr);
6673ed8e77SXin LI 
6773ed8e77SXin LI 		} else if ((instr & 0x9F000000) == 0x90000000) {
6873ed8e77SXin LI 			// ADRP instruction:
6973ed8e77SXin LI 			// Only values in the range +/-512 MiB are converted.
7073ed8e77SXin LI 			//
7173ed8e77SXin LI 			// Using less than the full +/-4 GiB range reduces
7273ed8e77SXin LI 			// false positives on non-code sections of the input
7373ed8e77SXin LI 			// while being excellent for executables up to 512 MiB.
7473ed8e77SXin LI 			// The positive effect of ADRP conversion is smaller
7573ed8e77SXin LI 			// than that of BL but it also doesn't hurt so much in
7673ed8e77SXin LI 			// non-code sections of input because, with +/-512 MiB
7773ed8e77SXin LI 			// range, nine bits of 32 need to match to trigger a
7873ed8e77SXin LI 			// conversion (two 10-bit match choices = 9 bits).
7973ed8e77SXin LI 			const uint32_t src = ((instr >> 29) & 3)
8073ed8e77SXin LI 					| ((instr >> 3) & 0x001FFFFC);
8173ed8e77SXin LI 
8273ed8e77SXin LI 			// With the addition only one branch is needed to
8373ed8e77SXin LI 			// check the +/- range. This is usually false when
8473ed8e77SXin LI 			// processing ARM64 code so branch prediction will
8573ed8e77SXin LI 			// handle it well in terms of performance.
8673ed8e77SXin LI 			//
8773ed8e77SXin LI 			//if ((src & 0x001E0000) != 0
8873ed8e77SXin LI 			// && (src & 0x001E0000) != 0x001E0000)
8973ed8e77SXin LI 			if ((src + 0x00020000) & 0x001C0000)
9073ed8e77SXin LI 				continue;
9173ed8e77SXin LI 
9273ed8e77SXin LI 			instr &= 0x9000001F;
9373ed8e77SXin LI 
9473ed8e77SXin LI 			pc >>= 12;
9573ed8e77SXin LI 			if (!is_encoder)
9673ed8e77SXin LI 				pc = 0U - pc;
9773ed8e77SXin LI 
9873ed8e77SXin LI 			const uint32_t dest = src + pc;
9973ed8e77SXin LI 			instr |= (dest & 3) << 29;
10073ed8e77SXin LI 			instr |= (dest & 0x0003FFFC) << 3;
10173ed8e77SXin LI 			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
10273ed8e77SXin LI 			write32le(buffer + i, instr);
10373ed8e77SXin LI 		}
10473ed8e77SXin LI 	}
10573ed8e77SXin LI 
10673ed8e77SXin LI 	return i;
10773ed8e77SXin LI }
10873ed8e77SXin LI 
10973ed8e77SXin LI 
11073ed8e77SXin LI static lzma_ret
arm64_coder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters,bool is_encoder)11173ed8e77SXin LI arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
11273ed8e77SXin LI 		const lzma_filter_info *filters, bool is_encoder)
11373ed8e77SXin LI {
11473ed8e77SXin LI 	return lzma_simple_coder_init(next, allocator, filters,
11573ed8e77SXin LI 			&arm64_code, 0, 4, 4, is_encoder);
11673ed8e77SXin LI }
11773ed8e77SXin LI 
11873ed8e77SXin LI 
11973ed8e77SXin LI #ifdef HAVE_ENCODER_ARM64
12073ed8e77SXin LI extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters)12173ed8e77SXin LI lzma_simple_arm64_encoder_init(lzma_next_coder *next,
12273ed8e77SXin LI 		const lzma_allocator *allocator,
12373ed8e77SXin LI 		const lzma_filter_info *filters)
12473ed8e77SXin LI {
12573ed8e77SXin LI 	return arm64_coder_init(next, allocator, filters, true);
12673ed8e77SXin LI }
127*128836d3SXin LI 
128*128836d3SXin LI 
129*128836d3SXin LI extern LZMA_API(size_t)
lzma_bcj_arm64_encode(uint32_t start_offset,uint8_t * buf,size_t size)130*128836d3SXin LI lzma_bcj_arm64_encode(uint32_t start_offset, uint8_t *buf, size_t size)
131*128836d3SXin LI {
132*128836d3SXin LI 	// start_offset must be a multiple of four.
133*128836d3SXin LI 	start_offset &= ~UINT32_C(3);
134*128836d3SXin LI 	return arm64_code(NULL, start_offset, true, buf, size);
135*128836d3SXin LI }
13673ed8e77SXin LI #endif
13773ed8e77SXin LI 
13873ed8e77SXin LI 
13973ed8e77SXin LI #ifdef HAVE_DECODER_ARM64
14073ed8e77SXin LI extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder * next,const lzma_allocator * allocator,const lzma_filter_info * filters)14173ed8e77SXin LI lzma_simple_arm64_decoder_init(lzma_next_coder *next,
14273ed8e77SXin LI 		const lzma_allocator *allocator,
14373ed8e77SXin LI 		const lzma_filter_info *filters)
14473ed8e77SXin LI {
14573ed8e77SXin LI 	return arm64_coder_init(next, allocator, filters, false);
14673ed8e77SXin LI }
147*128836d3SXin LI 
148*128836d3SXin LI 
149*128836d3SXin LI extern LZMA_API(size_t)
lzma_bcj_arm64_decode(uint32_t start_offset,uint8_t * buf,size_t size)150*128836d3SXin LI lzma_bcj_arm64_decode(uint32_t start_offset, uint8_t *buf, size_t size)
151*128836d3SXin LI {
152*128836d3SXin LI 	// start_offset must be a multiple of four.
153*128836d3SXin LI 	start_offset &= ~UINT32_C(3);
154*128836d3SXin LI 	return arm64_code(NULL, start_offset, false, buf, size);
155*128836d3SXin LI }
15673ed8e77SXin LI #endif
157