1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "blake3_impl.h"
6
7 #if defined(IS_X86)
8 #if defined(_MSC_VER)
9 #include <intrin.h>
10 #elif defined(__GNUC__)
11 #include <immintrin.h>
12 #else
13 #error "Unimplemented!"
14 #endif
15 #endif
16
/* Evaluates `x` and discards the result, so that a variable which is only
 * used under some preprocessor configurations does not trigger an
 * unused-variable warning. */
#define MAYBE_UNUSED(x) ((void)(x))
18
19 #if defined(IS_X86)
/* Executes XGETBV with index 0 and returns the 64-bit result.
 *
 * The instruction delivers the value in EDX:EAX; the two halves are merged
 * into a single uint64_t. get_cpu_features() only calls this after CPUID
 * reports OSXSAVE. */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  /* ECX selects the register to read (0); the result comes back in EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t upper = (uint64_t)hi << 32;
  return upper | lo;
#endif
}
29
/* Runs CPUID for leaf `id` and stores EAX, EBX, ECX, EDX into out[0..3],
 * in that order. */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* On 32-bit x86 EBX is not named as an output: it is saved into a scratch
   * register before CPUID and swapped back afterwards, which leaves the
   * CPUID value in the scratch register and EBX as it was. Presumably this
   * is because EBX may be reserved (e.g. as the PIC base register). */
  __asm__ __volatile__("movl %%ebx, %[b]\n"
                       "cpuid\n"
                       "xchgl %[b], %%ebx\n"
                       : "=a"(a), [b] "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
45
/* Runs CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and stores
 * EAX, EBX, ECX, EDX into out[0..3], in that order. */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* On 32-bit x86 EBX is not named as an output: it is saved into a scratch
   * register before CPUID and swapped back afterwards, which leaves the
   * CPUID value in the scratch register and EBX as it was. Presumably this
   * is because EBX may be reserved (e.g. as the PIC base register). */
  __asm__ __volatile__("movl %%ebx, %[b]\n"
                       "cpuid\n"
                       "xchgl %[b], %%ebx\n"
                       : "=a"(a), [b] "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
61
62 #endif
63
/* Bit flags describing the SIMD instruction sets available at run time.
 * Values are OR-ed together into a single mask by get_cpu_features(). */
enum cpu_feature {
  SSE2 = 0x01,     /* bit 0 */
  SSSE3 = 0x02,    /* bit 1 */
  SSE41 = 0x04,    /* bit 2 */
  AVX = 0x08,      /* bit 3 */
  AVX2 = 0x10,     /* bit 4 */
  AVX512F = 0x20,  /* bit 5 */
  AVX512VL = 0x40, /* bit 6 */
  /* ... */
  /* Sentinel (bit 30): detection has not been performed yet. */
  UNDEFINED = 0x40000000
};
75
76 #if !defined(BLAKE3_TESTING)
77 static /* Allow the variable to be controlled manually for testing */
78 #endif
79 enum cpu_feature g_cpu_features = UNDEFINED;
80
81 LLVM_ATTRIBUTE_USED
82 #if !defined(BLAKE3_TESTING)
83 static
84 #endif
85 enum cpu_feature
get_cpu_features(void)86 get_cpu_features(void) {
87
88 if (g_cpu_features != UNDEFINED) {
89 return g_cpu_features;
90 } else {
91 #if defined(IS_X86)
92 uint32_t regs[4] = {0};
93 uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
94 (void)edx;
95 enum cpu_feature features = 0;
96 cpuid(regs, 0);
97 const int max_id = *eax;
98 cpuid(regs, 1);
99 #if defined(__amd64__) || defined(_M_X64)
100 features |= SSE2;
101 #else
102 if (*edx & (1UL << 26))
103 features |= SSE2;
104 #endif
105 if (*ecx & (1UL << 0))
106 features |= SSSE3;
107 if (*ecx & (1UL << 19))
108 features |= SSE41;
109
110 if (*ecx & (1UL << 27)) { // OSXSAVE
111 const uint64_t mask = xgetbv();
112 if ((mask & 6) == 6) { // SSE and AVX states
113 if (*ecx & (1UL << 28))
114 features |= AVX;
115 if (max_id >= 7) {
116 cpuidex(regs, 7, 0);
117 if (*ebx & (1UL << 5))
118 features |= AVX2;
119 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
120 if (*ebx & (1UL << 31))
121 features |= AVX512VL;
122 if (*ebx & (1UL << 16))
123 features |= AVX512F;
124 }
125 }
126 }
127 }
128 g_cpu_features = features;
129 return features;
130 #else
131 /* How to detect NEON? */
132 return 0;
133 #endif
134 }
135 }
136
/* Forwards an in-place compression call to an implementation chosen at run
 * time.
 *
 * On x86 builds the candidates are tried in the order AVX-512, SSE4.1, SSE2;
 * the first one that is both compiled in (its BLAKE3_NO_* macro is not
 * defined) and reported by get_cpu_features() handles the call. In every
 * other case the portable implementation is used. All arguments are passed
 * through unchanged. */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif
  /* No SIMD implementation was selected. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
165
/* Forwards an XOF compression call to an implementation chosen at run time.
 *
 * On x86 builds the candidates are tried in the order AVX-512, SSE4.1, SSE2;
 * the first one that is both compiled in (its BLAKE3_NO_* macro is not
 * defined) and reported by get_cpu_features() handles the call. In every
 * other case the portable implementation is used. All arguments, including
 * the 64-byte `out` buffer, are passed through unchanged. */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif
  /* No SIMD implementation was selected. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
194
/* Forwards a multi-input hashing call to an implementation chosen at run
 * time.
 *
 * On x86 builds the candidates are tried in the order AVX-512 (requires both
 * AVX512F and AVX512VL), AVX2, SSE4.1, SSE2; the first one that is both
 * compiled in (its BLAKE3_NO_* macro is not defined) and reported by
 * get_cpu_features() handles the call. If none is taken, the NEON
 * implementation is used when BLAKE3_USE_NEON == 1, otherwise the portable
 * one. All arguments are passed through unchanged. */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  const bool has_avx512 = (cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0;
  if (has_avx512) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
246
// The dynamically detected SIMD degree of the current platform.
//
// On x86 builds the result follows the same preference order used by
// blake3_hash_many(): 16 for AVX-512 (AVX512F and AVX512VL both present),
// 8 for AVX2, 4 for SSE4.1 or SSE2. Without a usable x86 feature the result
// is 4 when BLAKE3_USE_NEON == 1 and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  const bool has_avx512 = (cpu & AVX512F) != 0 && (cpu & AVX512VL) != 0;
  if (has_avx512) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
278