1 #include <stdbool.h> 2 #include <stddef.h> 3 #include <stdint.h> 4 5 #include "blake3_impl.h" 6 7 #if defined(IS_X86) 8 #if defined(_MSC_VER) 9 #include <intrin.h> 10 #elif defined(__GNUC__) 11 #include <immintrin.h> 12 #else 13 #error "Unimplemented!" 14 #endif 15 #endif 16 17 #define MAYBE_UNUSED(x) (void)((x)) 18 19 #if defined(IS_X86) 20 static uint64_t xgetbv(void) { 21 #if defined(_MSC_VER) 22 return _xgetbv(0); 23 #else 24 uint32_t eax = 0, edx = 0; 25 __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); 26 return ((uint64_t)edx << 32) | eax; 27 #endif 28 } 29 30 static void cpuid(uint32_t out[4], uint32_t id) { 31 #if defined(_MSC_VER) 32 __cpuid((int *)out, id); 33 #elif defined(__i386__) || defined(_M_IX86) 34 __asm__ __volatile__("movl %%ebx, %1\n" 35 "cpuid\n" 36 "xchgl %1, %%ebx\n" 37 : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 38 : "a"(id)); 39 #else 40 __asm__ __volatile__("cpuid\n" 41 : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 42 : "a"(id)); 43 #endif 44 } 45 46 static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { 47 #if defined(_MSC_VER) 48 __cpuidex((int *)out, id, sid); 49 #elif defined(__i386__) || defined(_M_IX86) 50 __asm__ __volatile__("movl %%ebx, %1\n" 51 "cpuid\n" 52 "xchgl %1, %%ebx\n" 53 : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 54 : "a"(id), "c"(sid)); 55 #else 56 __asm__ __volatile__("cpuid\n" 57 : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 58 : "a"(id), "c"(sid)); 59 #endif 60 } 61 62 #endif 63 64 enum cpu_feature { 65 SSE2 = 1 << 0, 66 SSSE3 = 1 << 1, 67 SSE41 = 1 << 2, 68 AVX = 1 << 3, 69 AVX2 = 1 << 4, 70 AVX512F = 1 << 5, 71 AVX512VL = 1 << 6, 72 /* ... 
*/ 73 UNDEFINED = 1 << 30 74 }; 75 76 #if !defined(BLAKE3_TESTING) 77 static /* Allow the variable to be controlled manually for testing */ 78 #endif 79 enum cpu_feature g_cpu_features = UNDEFINED; 80 81 LLVM_ATTRIBUTE_USED 82 #if !defined(BLAKE3_TESTING) 83 static 84 #endif 85 enum cpu_feature 86 get_cpu_features(void) { 87 88 if (g_cpu_features != UNDEFINED) { 89 return g_cpu_features; 90 } else { 91 #if defined(IS_X86) 92 uint32_t regs[4] = {0}; 93 uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; 94 (void)edx; 95 enum cpu_feature features = 0; 96 cpuid(regs, 0); 97 const int max_id = *eax; 98 cpuid(regs, 1); 99 #if defined(__amd64__) || defined(_M_X64) 100 features |= SSE2; 101 #else 102 if (*edx & (1UL << 26)) 103 features |= SSE2; 104 #endif 105 if (*ecx & (1UL << 0)) 106 features |= SSSE3; 107 if (*ecx & (1UL << 19)) 108 features |= SSE41; 109 110 if (*ecx & (1UL << 27)) { // OSXSAVE 111 const uint64_t mask = xgetbv(); 112 if ((mask & 6) == 6) { // SSE and AVX states 113 if (*ecx & (1UL << 28)) 114 features |= AVX; 115 if (max_id >= 7) { 116 cpuidex(regs, 7, 0); 117 if (*ebx & (1UL << 5)) 118 features |= AVX2; 119 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm 120 if (*ebx & (1UL << 31)) 121 features |= AVX512VL; 122 if (*ebx & (1UL << 16)) 123 features |= AVX512F; 124 } 125 } 126 } 127 } 128 g_cpu_features = features; 129 return features; 130 #else 131 /* How to detect NEON? 
*/ 132 return 0; 133 #endif 134 } 135 } 136 137 void blake3_compress_in_place(uint32_t cv[8], 138 const uint8_t block[BLAKE3_BLOCK_LEN], 139 uint8_t block_len, uint64_t counter, 140 uint8_t flags) { 141 #if defined(IS_X86) 142 const enum cpu_feature features = get_cpu_features(); 143 MAYBE_UNUSED(features); 144 #if !defined(BLAKE3_NO_AVX512) 145 if (features & AVX512VL) { 146 blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); 147 return; 148 } 149 #endif 150 #if !defined(BLAKE3_NO_SSE41) 151 if (features & SSE41) { 152 blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); 153 return; 154 } 155 #endif 156 #if !defined(BLAKE3_NO_SSE2) 157 if (features & SSE2) { 158 blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); 159 return; 160 } 161 #endif 162 #endif 163 blake3_compress_in_place_portable(cv, block, block_len, counter, flags); 164 } 165 166 void blake3_compress_xof(const uint32_t cv[8], 167 const uint8_t block[BLAKE3_BLOCK_LEN], 168 uint8_t block_len, uint64_t counter, uint8_t flags, 169 uint8_t out[64]) { 170 #if defined(IS_X86) 171 const enum cpu_feature features = get_cpu_features(); 172 MAYBE_UNUSED(features); 173 #if !defined(BLAKE3_NO_AVX512) 174 if (features & AVX512VL) { 175 blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); 176 return; 177 } 178 #endif 179 #if !defined(BLAKE3_NO_SSE41) 180 if (features & SSE41) { 181 blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); 182 return; 183 } 184 #endif 185 #if !defined(BLAKE3_NO_SSE2) 186 if (features & SSE2) { 187 blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); 188 return; 189 } 190 #endif 191 #endif 192 blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); 193 } 194 195 void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, 196 size_t blocks, const uint32_t key[8], uint64_t counter, 197 bool increment_counter, uint8_t flags, 198 uint8_t flags_start, uint8_t 
flags_end, uint8_t *out) { 199 #if defined(IS_X86) 200 const enum cpu_feature features = get_cpu_features(); 201 MAYBE_UNUSED(features); 202 #if !defined(BLAKE3_NO_AVX512) 203 if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { 204 blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, 205 increment_counter, flags, flags_start, flags_end, 206 out); 207 return; 208 } 209 #endif 210 #if !defined(BLAKE3_NO_AVX2) 211 if (features & AVX2) { 212 blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, 213 increment_counter, flags, flags_start, flags_end, 214 out); 215 return; 216 } 217 #endif 218 #if !defined(BLAKE3_NO_SSE41) 219 if (features & SSE41) { 220 blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 221 increment_counter, flags, flags_start, flags_end, 222 out); 223 return; 224 } 225 #endif 226 #if !defined(BLAKE3_NO_SSE2) 227 if (features & SSE2) { 228 blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, 229 increment_counter, flags, flags_start, flags_end, 230 out); 231 return; 232 } 233 #endif 234 #endif 235 236 #if BLAKE3_USE_NEON == 1 237 blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, 238 increment_counter, flags, flags_start, flags_end, out); 239 return; 240 #endif 241 242 blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, 243 increment_counter, flags, flags_start, flags_end, 244 out); 245 } 246 247 // The dynamically detected SIMD degree of the current platform. 
// Returns 16 when both AVX512F and AVX512VL are detected, 8 for AVX2, 4 for
// SSE4.1 or SSE2, 4 for a build with BLAKE3_USE_NEON == 1, and 1 otherwise.
// Each x86 tier can be compiled out with its BLAKE3_NO_* macro.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();
  MAYBE_UNUSED(detected);
#if !defined(BLAKE3_NO_AVX512)
  // The widest tier needs both AVX-512 flags, not just one of them.
  const int avx512_pair = AVX512F | AVX512VL;
  if ((detected & avx512_pair) == avx512_pair) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((detected & AVX2) != 0) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    return 4;
  }
#endif
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#endif
  // No SIMD tier selected.
  return 1;
}