1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file arm64.c 6 /// \brief Filter for ARM64 binaries 7 /// 8 /// This converts ARM64 relative addresses in the BL and ADRP immediates 9 /// to absolute values to increase redundancy of ARM64 code. 10 /// 11 /// Converting B or ADR instructions was also tested but it's not useful. 12 /// A majority of the jumps for the B instruction are very small (+/- 0xFF). 13 /// These are typical for loops and if-statements. Encoding them to their 14 /// absolute address reduces redundancy since many of the small relative 15 /// jump values are repeated, but very few of the absolute addresses are. 16 // 17 // Authors: Lasse Collin 18 // Jia Tan 19 // Igor Pavlov 20 // 21 /////////////////////////////////////////////////////////////////////////////// 22 23 #include "simple_private.h" 24 25 26 static size_t 27 arm64_code(void *simple lzma_attribute((__unused__)), 28 uint32_t now_pos, bool is_encoder, 29 uint8_t *buffer, size_t size) 30 { 31 size &= ~(size_t)3; 32 33 size_t i; 34 35 // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower 36 // with auto-vectorization that is enabled by default with -O2. 37 // Such vectorization bloat happens with -O2 when targeting ARM64 too 38 // but performance hasn't been tested. 39 #ifdef __clang__ 40 # pragma clang loop vectorize(disable) 41 #endif 42 for (i = 0; i < size; i += 4) { 43 uint32_t pc = (uint32_t)(now_pos + i); 44 uint32_t instr = read32le(buffer + i); 45 46 if ((instr >> 26) == 0x25) { 47 // BL instruction: 48 // The full 26-bit immediate is converted. 49 // The range is +/-128 MiB. 50 // 51 // Using the full range helps quite a lot with 52 // big executables. Smaller range would reduce false 53 // positives in non-code sections of the input though 54 // so this is a compromise that slightly favors big 55 // files. With the full range, only six bits of the 32 56 // need to match to trigger a conversion. 57 const uint32_t src = instr; 58 instr = 0x94000000; 59 60 pc >>= 2; 61 if (!is_encoder) 62 pc = 0U - pc; 63 64 instr |= (src + pc) & 0x03FFFFFF; 65 write32le(buffer + i, instr); 66 67 } else if ((instr & 0x9F000000) == 0x90000000) { 68 // ADRP instruction: 69 // Only values in the range +/-512 MiB are converted. 70 // 71 // Using less than the full +/-4 GiB range reduces 72 // false positives on non-code sections of the input 73 // while being excellent for executables up to 512 MiB. 74 // The positive effect of ADRP conversion is smaller 75 // than that of BL but it also doesn't hurt so much in 76 // non-code sections of input because, with +/-512 MiB 77 // range, nine bits of 32 need to match to trigger a 78 // conversion (two 10-bit match choices = 9 bits). 79 const uint32_t src = ((instr >> 29) & 3) 80 | ((instr >> 3) & 0x001FFFFC); 81 82 // With the addition only one branch is needed to 83 // check the +/- range. This is usually false when 84 // processing ARM64 code so branch prediction will 85 // handle it well in terms of performance. 86 // 87 //if ((src & 0x001E0000) != 0 88 // && (src & 0x001E0000) != 0x001E0000) 89 if ((src + 0x00020000) & 0x001C0000) 90 continue; 91 92 instr &= 0x9000001F; 93 94 pc >>= 12; 95 if (!is_encoder) 96 pc = 0U - pc; 97 98 const uint32_t dest = src + pc; 99 instr |= (dest & 3) << 29; 100 instr |= (dest & 0x0003FFFC) << 3; 101 instr |= (0U - (dest & 0x00020000)) & 0x00E00000; 102 write32le(buffer + i, instr); 103 } 104 } 105 106 return i; 107 } 108 109 110 static lzma_ret 111 arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator, 112 const lzma_filter_info *filters, bool is_encoder) 113 { 114 return lzma_simple_coder_init(next, allocator, filters, 115 &arm64_code, 0, 4, 4, is_encoder); 116 } 117 118 119 #ifdef HAVE_ENCODER_ARM64 120 extern lzma_ret 121 lzma_simple_arm64_encoder_init(lzma_next_coder *next, 122 const lzma_allocator *allocator, 123 const lzma_filter_info *filters) 124 { 125 return arm64_coder_init(next, allocator, filters, true); 126 } 127 128 129 extern LZMA_API(size_t) 130 lzma_bcj_arm64_encode(uint32_t start_offset, uint8_t *buf, size_t size) 131 { 132 // start_offset must be a multiple of four. 133 start_offset &= ~UINT32_C(3); 134 return arm64_code(NULL, start_offset, true, buf, size); 135 } 136 #endif 137 138 139 #ifdef HAVE_DECODER_ARM64 140 extern lzma_ret 141 lzma_simple_arm64_decoder_init(lzma_next_coder *next, 142 const lzma_allocator *allocator, 143 const lzma_filter_info *filters) 144 { 145 return arm64_coder_init(next, allocator, filters, false); 146 } 147 148 149 extern LZMA_API(size_t) 150 lzma_bcj_arm64_decode(uint32_t start_offset, uint8_t *buf, size_t size) 151 { 152 // start_offset must be a multiple of four. 153 start_offset &= ~UINT32_C(3); 154 return arm64_code(NULL, start_offset, false, buf, size); 155 } 156 #endif 157