1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file crc32_arm64.h 6 /// \brief CRC32 calculation with ARM64 optimization 7 // 8 // Authors: Chenxi Mao 9 // Jia Tan 10 // Lasse Collin 11 // 12 /////////////////////////////////////////////////////////////////////////////// 13 14 #ifndef LZMA_CRC32_ARM64_H 15 #define LZMA_CRC32_ARM64_H 16 17 // MSVC always has the CRC intrinsics available when building for ARM64 18 // there is no need to include any header files. 19 #ifndef _MSC_VER 20 # include <arm_acle.h> 21 #endif 22 23 // If both versions are going to be built, we need runtime detection 24 // to check if the instructions are supported. 25 #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) 26 # if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) 27 # include <sys/auxv.h> 28 # elif defined(_WIN32) 29 # include <processthreadsapi.h> 30 # elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) 31 # include <sys/sysctl.h> 32 # endif 33 #endif 34 35 // Some EDG-based compilers support ARM64 and define __GNUC__ 36 // (such as Nvidia's nvcc), but do not support function attributes. 37 // 38 // NOTE: Build systems check for this too, keep them in sync with this. 39 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) 40 # define crc_attr_target __attribute__((__target__("+crc"))) 41 #else 42 # define crc_attr_target 43 #endif 44 45 46 crc_attr_target 47 static uint32_t 48 crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) 49 { 50 crc = ~crc; 51 52 if (size >= 8) { 53 // Align the input buffer because this was shown to be 54 // significantly faster than unaligned accesses. 55 const size_t align = (0 - (uintptr_t)buf) & 7; 56 57 if (align & 1) 58 crc = __crc32b(crc, *buf++); 59 60 if (align & 2) { 61 crc = __crc32h(crc, aligned_read16le(buf)); 62 buf += 2; 63 } 64 65 if (align & 4) { 66 crc = __crc32w(crc, aligned_read32le(buf)); 67 buf += 4; 68 } 69 70 size -= align; 71 72 // Process 8 bytes at a time. The end point is determined by 73 // ignoring the least significant three bits of size to 74 // ensure we do not process past the bounds of the buffer. 75 // This guarantees that limit is a multiple of 8 and is 76 // strictly less than size. 77 for (const uint8_t *limit = buf + (size & ~(size_t)7); 78 buf < limit; buf += 8) 79 crc = __crc32d(crc, aligned_read64le(buf)); 80 81 size &= 7; 82 } 83 84 // Process the remaining bytes that are not 8 byte aligned. 85 if (size & 4) { 86 crc = __crc32w(crc, aligned_read32le(buf)); 87 buf += 4; 88 } 89 90 if (size & 2) { 91 crc = __crc32h(crc, aligned_read16le(buf)); 92 buf += 2; 93 } 94 95 if (size & 1) 96 crc = __crc32b(crc, *buf); 97 98 return ~crc; 99 } 100 101 102 #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) 103 static inline bool 104 is_arch_extension_supported(void) 105 { 106 #if defined(HAVE_GETAUXVAL) 107 return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; 108 109 #elif defined(HAVE_ELF_AUX_INFO) 110 unsigned long feature_flags; 111 112 if (elf_aux_info(AT_HWCAP, &feature_flags, sizeof(feature_flags)) != 0) 113 return false; 114 115 return (feature_flags & HWCAP_CRC32) != 0; 116 117 #elif defined(_WIN32) 118 return IsProcessorFeaturePresent( 119 PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); 120 121 #elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) 122 int has_crc32 = 0; 123 size_t size = sizeof(has_crc32); 124 125 // The sysctlbyname() function requires a string identifier for the 126 // CPU feature it tests. The Apple documentation lists the string 127 // "hw.optional.armv8_crc32", which can be found here: 128 // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619 129 if (sysctlbyname("hw.optional.armv8_crc32", &has_crc32, 130 &size, NULL, 0) != 0) 131 return false; 132 133 return has_crc32; 134 135 #else 136 // If a runtime detection method cannot be found, then this must 137 // be a compile time error. The checks in crc_common.h should ensure 138 // a runtime detection method is always found if this function is 139 // built. It would be possible to just return false here, but this 140 // is inefficient for binary size and runtime since only the generic 141 // method could ever be used. 142 # error Runtime detection method unavailable. 143 #endif 144 } 145 #endif 146 147 #endif // LZMA_CRC32_ARM64_H 148