1 /////////////////////////////////////////////////////////////////////////////// 2 // 3 /// \file arm64.c 4 /// \brief Filter for ARM64 binaries 5 /// 6 /// This converts ARM64 relative addresses in the BL and ADRP immediates 7 /// to absolute values to increase redundancy of ARM64 code. 8 /// 9 /// Converting B or ADR instructions was also tested but it's not useful. 10 /// A majority of the jumps for the B instruction are very small (+/- 0xFF). 11 /// These are typical for loops and if-statements. Encoding them to their 12 /// absolute address reduces redundancy since many of the small relative 13 /// jump values are repeated, but very few of the absolute addresses are. 14 // 15 // Authors: Lasse Collin 16 // Jia Tan 17 // 18 // This file has been put into the public domain. 19 // You can do whatever you want with this file. 20 // 21 /////////////////////////////////////////////////////////////////////////////// 22 23 #include "simple_private.h" 24 25 26 static size_t 27 arm64_code(void *simple lzma_attribute((__unused__)), 28 uint32_t now_pos, bool is_encoder, 29 uint8_t *buffer, size_t size) 30 { 31 size_t i; 32 33 // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower 34 // with auto-vectorization that is enabled by default with -O2. 35 // Such vectorization bloat happens with -O2 when targeting ARM64 too 36 // but performance hasn't been tested. 37 #ifdef __clang__ 38 # pragma clang loop vectorize(disable) 39 #endif 40 for (i = 0; i + 4 <= size; i += 4) { 41 uint32_t pc = (uint32_t)(now_pos + i); 42 uint32_t instr = read32le(buffer + i); 43 44 if ((instr >> 26) == 0x25) { 45 // BL instruction: 46 // The full 26-bit immediate is converted. 47 // The range is +/-128 MiB. 48 // 49 // Using the full range is helps quite a lot with 50 // big executables. Smaller range would reduce false 51 // positives in non-code sections of the input though 52 // so this is a compromise that slightly favors big 53 // files. With the full range only six bits of the 32 54 // need to match to trigger a conversion. 55 const uint32_t src = instr; 56 instr = 0x94000000; 57 58 pc >>= 2; 59 if (!is_encoder) 60 pc = 0U - pc; 61 62 instr |= (src + pc) & 0x03FFFFFF; 63 write32le(buffer + i, instr); 64 65 } else if ((instr & 0x9F000000) == 0x90000000) { 66 // ADRP instruction: 67 // Only values in the range +/-512 MiB are converted. 68 // 69 // Using less than the full +/-4 GiB range reduces 70 // false positives on non-code sections of the input 71 // while being excellent for executables up to 512 MiB. 72 // The positive effect of ADRP conversion is smaller 73 // than that of BL but it also doesn't hurt so much in 74 // non-code sections of input because, with +/-512 MiB 75 // range, nine bits of 32 need to match to trigger a 76 // conversion (two 10-bit match choices = 9 bits). 77 const uint32_t src = ((instr >> 29) & 3) 78 | ((instr >> 3) & 0x001FFFFC); 79 80 // With the addition only one branch is needed to 81 // check the +/- range. This is usually false when 82 // processing ARM64 code so branch prediction will 83 // handle it well in terms of performance. 84 // 85 //if ((src & 0x001E0000) != 0 86 // && (src & 0x001E0000) != 0x001E0000) 87 if ((src + 0x00020000) & 0x001C0000) 88 continue; 89 90 instr &= 0x9000001F; 91 92 pc >>= 12; 93 if (!is_encoder) 94 pc = 0U - pc; 95 96 const uint32_t dest = src + pc; 97 instr |= (dest & 3) << 29; 98 instr |= (dest & 0x0003FFFC) << 3; 99 instr |= (0U - (dest & 0x00020000)) & 0x00E00000; 100 write32le(buffer + i, instr); 101 } 102 } 103 104 return i; 105 } 106 107 108 static lzma_ret 109 arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator, 110 const lzma_filter_info *filters, bool is_encoder) 111 { 112 return lzma_simple_coder_init(next, allocator, filters, 113 &arm64_code, 0, 4, 4, is_encoder); 114 } 115 116 117 #ifdef HAVE_ENCODER_ARM64 118 extern lzma_ret 119 lzma_simple_arm64_encoder_init(lzma_next_coder *next, 120 const lzma_allocator *allocator, 121 const lzma_filter_info *filters) 122 { 123 return arm64_coder_init(next, allocator, filters, true); 124 } 125 #endif 126 127 128 #ifdef HAVE_DECODER_ARM64 129 extern lzma_ret 130 lzma_simple_arm64_decoder_init(lzma_next_coder *next, 131 const lzma_allocator *allocator, 132 const lzma_filter_info *filters) 133 { 134 return arm64_coder_init(next, allocator, filters, false); 135 } 136 #endif 137