1 /////////////////////////////////////////////////////////////////////////////// 2 // 3 /// \file arm64.c 4 /// \brief Filter for ARM64 binaries 5 /// 6 /// This converts ARM64 relative addresses in the BL and ADRP immediates 7 /// to absolute values to increase redundancy of ARM64 code. 8 /// 9 /// Converting B or ADR instructions was also tested but it's not useful. 10 /// A majority of the jumps for the B instruction are very small (+/- 0xFF). 11 /// These are typical for loops and if-statements. Encoding them to their 12 /// absolute address reduces redundancy since many of the small relative 13 /// jump values are repeated, but very few of the absolute addresses are. 14 // 15 // Authors: Lasse Collin 16 // Jia Tan 17 // Igor Pavlov 18 // 19 // This file has been put into the public domain. 20 // You can do whatever you want with this file. 21 // 22 /////////////////////////////////////////////////////////////////////////////// 23 24 #include "simple_private.h" 25 26 27 static size_t 28 arm64_code(void *simple lzma_attribute((__unused__)), 29 uint32_t now_pos, bool is_encoder, 30 uint8_t *buffer, size_t size) 31 { 32 size_t i; 33 34 // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower 35 // with auto-vectorization that is enabled by default with -O2. 36 // Such vectorization bloat happens with -O2 when targeting ARM64 too 37 // but performance hasn't been tested. 38 #ifdef __clang__ 39 # pragma clang loop vectorize(disable) 40 #endif 41 for (i = 0; i + 4 <= size; i += 4) { 42 uint32_t pc = (uint32_t)(now_pos + i); 43 uint32_t instr = read32le(buffer + i); 44 45 if ((instr >> 26) == 0x25) { 46 // BL instruction: 47 // The full 26-bit immediate is converted. 48 // The range is +/-128 MiB. 49 // 50 // Using the full range is helps quite a lot with 51 // big executables. Smaller range would reduce false 52 // positives in non-code sections of the input though 53 // so this is a compromise that slightly favors big 54 // files. With the full range only six bits of the 32 55 // need to match to trigger a conversion. 56 const uint32_t src = instr; 57 instr = 0x94000000; 58 59 pc >>= 2; 60 if (!is_encoder) 61 pc = 0U - pc; 62 63 instr |= (src + pc) & 0x03FFFFFF; 64 write32le(buffer + i, instr); 65 66 } else if ((instr & 0x9F000000) == 0x90000000) { 67 // ADRP instruction: 68 // Only values in the range +/-512 MiB are converted. 69 // 70 // Using less than the full +/-4 GiB range reduces 71 // false positives on non-code sections of the input 72 // while being excellent for executables up to 512 MiB. 73 // The positive effect of ADRP conversion is smaller 74 // than that of BL but it also doesn't hurt so much in 75 // non-code sections of input because, with +/-512 MiB 76 // range, nine bits of 32 need to match to trigger a 77 // conversion (two 10-bit match choices = 9 bits). 78 const uint32_t src = ((instr >> 29) & 3) 79 | ((instr >> 3) & 0x001FFFFC); 80 81 // With the addition only one branch is needed to 82 // check the +/- range. This is usually false when 83 // processing ARM64 code so branch prediction will 84 // handle it well in terms of performance. 85 // 86 //if ((src & 0x001E0000) != 0 87 // && (src & 0x001E0000) != 0x001E0000) 88 if ((src + 0x00020000) & 0x001C0000) 89 continue; 90 91 instr &= 0x9000001F; 92 93 pc >>= 12; 94 if (!is_encoder) 95 pc = 0U - pc; 96 97 const uint32_t dest = src + pc; 98 instr |= (dest & 3) << 29; 99 instr |= (dest & 0x0003FFFC) << 3; 100 instr |= (0U - (dest & 0x00020000)) & 0x00E00000; 101 write32le(buffer + i, instr); 102 } 103 } 104 105 return i; 106 } 107 108 109 static lzma_ret 110 arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator, 111 const lzma_filter_info *filters, bool is_encoder) 112 { 113 return lzma_simple_coder_init(next, allocator, filters, 114 &arm64_code, 0, 4, 4, is_encoder); 115 } 116 117 118 #ifdef HAVE_ENCODER_ARM64 119 extern lzma_ret 120 lzma_simple_arm64_encoder_init(lzma_next_coder *next, 121 const lzma_allocator *allocator, 122 const lzma_filter_info *filters) 123 { 124 return arm64_coder_init(next, allocator, filters, true); 125 } 126 #endif 127 128 129 #ifdef HAVE_DECODER_ARM64 130 extern lzma_ret 131 lzma_simple_arm64_decoder_init(lzma_next_coder *next, 132 const lzma_allocator *allocator, 133 const lzma_filter_info *filters) 134 { 135 return arm64_coder_init(next, allocator, filters, false); 136 } 137 #endif 138