1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file crc32_arm64.h 6 /// \brief CRC32 calculation with ARM64 optimization 7 // 8 // Authors: Chenxi Mao 9 // Jia Tan 10 // Hans Jansen 11 // 12 /////////////////////////////////////////////////////////////////////////////// 13 14 15 #ifndef LZMA_CRC32_ARM64_H 16 #define LZMA_CRC32_ARM64_H 17 18 // MSVC always has the CRC intrinsics available when building for ARM64 19 // there is no need to include any header files. 20 #ifndef _MSC_VER 21 # include <arm_acle.h> 22 #endif 23 24 #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) 25 # if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) 26 # include <sys/auxv.h> 27 # elif defined(_WIN32) 28 # include <processthreadsapi.h> 29 # elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) 30 # include <sys/sysctl.h> 31 # endif 32 #endif 33 34 // Some EDG-based compilers support ARM64 and define __GNUC__ 35 // (such as Nvidia's nvcc), but do not support function attributes. 36 // 37 // NOTE: Build systems check for this too, keep them in sync with this. 38 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) 39 # define crc_attr_target \ 40 __attribute__((__target__("+crc"))) 41 #else 42 # define crc_attr_target 43 #endif 44 45 46 crc_attr_target 47 static uint32_t 48 crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) 49 { 50 crc = ~crc; 51 52 // Align the input buffer because this was shown to be 53 // significantly faster than unaligned accesses. 54 const size_t align_amount = my_min(size, (8 - (uintptr_t)buf) & 7); 55 56 for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf) 57 crc = __crc32b(crc, *buf); 58 59 size -= align_amount; 60 61 // Process 8 bytes at a time. The end point is determined by 62 // ignoring the least significant three bits of size to ensure 63 // we do not process past the bounds of the buffer. This guarantees 64 // that limit is a multiple of 8 and is strictly less than size. 65 for (const uint8_t *limit = buf + (size & ~((size_t)7)); 66 buf < limit; buf += 8) 67 crc = __crc32d(crc, aligned_read64le(buf)); 68 69 // Process the remaining bytes that are not 8 byte aligned. 70 for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf) 71 crc = __crc32b(crc, *buf); 72 73 return ~crc; 74 } 75 76 77 #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) 78 static inline bool 79 is_arch_extension_supported(void) 80 { 81 #if defined(HAVE_GETAUXVAL) 82 return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; 83 84 #elif defined(HAVE_ELF_AUX_INFO) 85 unsigned long feature_flags; 86 87 elf_aux_info(AT_HWCAP, &feature_flags, sizeof(feature_flags)); 88 return feature_flags & HWCAP_CRC32 != 0; 89 90 #elif defined(_WIN32) 91 return IsProcessorFeaturePresent( 92 PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); 93 94 #elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) 95 int has_crc32 = 0; 96 size_t size = sizeof(has_crc32); 97 98 // The sysctlbyname() function requires a string identifier for the 99 // CPU feature it tests. The Apple documentation lists the string 100 // "hw.optional.armv8_crc32", which can be found here: 101 // (https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619) 102 int err = sysctlbyname("hw.optional.armv8_crc32", &has_crc32, 103 &size, NULL, 0); 104 105 return !err && has_crc32; 106 107 #else 108 // If a runtime detection method cannot be found, then this must 109 // be a compile time error. The checks in crc_common.h should ensure 110 // a runtime detection method is always found if this function is 111 // built. It would be possible to just return false here, but this 112 // is inefficient for binary size and runtime since only the generic 113 // method could ever be used. 114 # error Runtime detection method unavailable. 115 #endif 116 } 117 #endif 118 119 #endif // LZMA_CRC32_ARM64_H 120