1*81ad6265SDimitry Andric #include <stdbool.h>
2*81ad6265SDimitry Andric #include <stddef.h>
3*81ad6265SDimitry Andric #include <stdint.h>
4*81ad6265SDimitry Andric
5*81ad6265SDimitry Andric #include "blake3_impl.h"
6*81ad6265SDimitry Andric
7*81ad6265SDimitry Andric #if defined(IS_X86)
8*81ad6265SDimitry Andric #if defined(_MSC_VER)
9*81ad6265SDimitry Andric #include <intrin.h>
10*81ad6265SDimitry Andric #elif defined(__GNUC__)
11*81ad6265SDimitry Andric #include <immintrin.h>
12*81ad6265SDimitry Andric #else
13*81ad6265SDimitry Andric #error "Unimplemented!"
14*81ad6265SDimitry Andric #endif
15*81ad6265SDimitry Andric #endif
16*81ad6265SDimitry Andric
/* Evaluates `x` and discards the result, so that a variable which is only
 * used under some preprocessor configurations does not trigger an
 * unused-variable warning. The whole expansion is parenthesised so it behaves
 * as a single expression wherever it is used. */
#define MAYBE_UNUSED(x) ((void)(x))
18*81ad6265SDimitry Andric
19*81ad6265SDimitry Andric #if defined(IS_X86)
/*
 * Reads extended control register 0 with the XGETBV instruction and returns
 * it as a 64-bit value (EDX in the high half, EAX in the low half).
 *
 * The instruction is issued unconditionally here; the caller is responsible
 * for checking that it is available (get_cpu_features() only calls this after
 * seeing the OSXSAVE bit in CPUID).
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  /* ECX = 0 selects the register index; the result comes back in EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
29*81ad6265SDimitry Andric
/*
 * Executes CPUID for leaf `id` and stores the resulting EAX, EBX, ECX and EDX
 * in out[0], out[1], out[2] and out[3] respectively.
 *
 * On the inline-asm paths ECX is not initialised before the instruction, so
 * this helper is only suitable for leaves that do not take a sub-leaf; use
 * cpuidex() otherwise.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  /* On 32-bit x86, EBX is saved into a scratch register before CPUID and
   * swapped back afterwards, so EBX holds its original value when the asm
   * ends and the CPUID result is delivered through the scratch register.
   * NOTE(review): presumably because EBX can be reserved by the compiler
   * (e.g. as the PIC base register) on this target -- confirm. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}
45*81ad6265SDimitry Andric
/*
 * Executes CPUID for leaf `id`, sub-leaf `sid` (passed in ECX), and stores the
 * resulting EAX, EBX, ECX and EDX in out[0], out[1], out[2] and out[3]
 * respectively.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  /* Same EBX save/restore dance as in cpuid(): EBX is copied to a scratch
   * register, CPUID runs, and the two are swapped back so EBX is unchanged
   * when the asm ends. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}
61*81ad6265SDimitry Andric
62*81ad6265SDimitry Andric #endif
63*81ad6265SDimitry Andric
/*
 * Bit flags describing the SIMD instruction-set extensions detected at
 * runtime. Values are OR-ed together into a single mask, so every enumerator
 * is a distinct power of two.
 */
enum cpu_feature {
  SSE2 = 1 << 0,
  SSSE3 = 1 << 1,
  SSE41 = 1 << 2,
  AVX = 1 << 3,
  AVX2 = 1 << 4,
  AVX512F = 1 << 5,
  AVX512VL = 1 << 6,
  /* ... */
  /* Sentinel: the initial value of g_cpu_features, meaning "detection has not
   * run yet". It is never combined with the feature bits above. */
  UNDEFINED = 1 << 30
};
75*81ad6265SDimitry Andric
76*81ad6265SDimitry Andric #if !defined(BLAKE3_TESTING)
77*81ad6265SDimitry Andric static /* Allow the variable to be controlled manually for testing */
78*81ad6265SDimitry Andric #endif
79*81ad6265SDimitry Andric enum cpu_feature g_cpu_features = UNDEFINED;
80*81ad6265SDimitry Andric
81*81ad6265SDimitry Andric LLVM_ATTRIBUTE_USED
82*81ad6265SDimitry Andric #if !defined(BLAKE3_TESTING)
83*81ad6265SDimitry Andric static
84*81ad6265SDimitry Andric #endif
85*81ad6265SDimitry Andric enum cpu_feature
get_cpu_features(void)86*81ad6265SDimitry Andric get_cpu_features(void) {
87*81ad6265SDimitry Andric
88*81ad6265SDimitry Andric if (g_cpu_features != UNDEFINED) {
89*81ad6265SDimitry Andric return g_cpu_features;
90*81ad6265SDimitry Andric } else {
91*81ad6265SDimitry Andric #if defined(IS_X86)
92*81ad6265SDimitry Andric uint32_t regs[4] = {0};
93*81ad6265SDimitry Andric uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
94*81ad6265SDimitry Andric (void)edx;
95*81ad6265SDimitry Andric enum cpu_feature features = 0;
96*81ad6265SDimitry Andric cpuid(regs, 0);
97*81ad6265SDimitry Andric const int max_id = *eax;
98*81ad6265SDimitry Andric cpuid(regs, 1);
99*81ad6265SDimitry Andric #if defined(__amd64__) || defined(_M_X64)
100*81ad6265SDimitry Andric features |= SSE2;
101*81ad6265SDimitry Andric #else
102*81ad6265SDimitry Andric if (*edx & (1UL << 26))
103*81ad6265SDimitry Andric features |= SSE2;
104*81ad6265SDimitry Andric #endif
105*81ad6265SDimitry Andric if (*ecx & (1UL << 0))
106*81ad6265SDimitry Andric features |= SSSE3;
107*81ad6265SDimitry Andric if (*ecx & (1UL << 19))
108*81ad6265SDimitry Andric features |= SSE41;
109*81ad6265SDimitry Andric
110*81ad6265SDimitry Andric if (*ecx & (1UL << 27)) { // OSXSAVE
111*81ad6265SDimitry Andric const uint64_t mask = xgetbv();
112*81ad6265SDimitry Andric if ((mask & 6) == 6) { // SSE and AVX states
113*81ad6265SDimitry Andric if (*ecx & (1UL << 28))
114*81ad6265SDimitry Andric features |= AVX;
115*81ad6265SDimitry Andric if (max_id >= 7) {
116*81ad6265SDimitry Andric cpuidex(regs, 7, 0);
117*81ad6265SDimitry Andric if (*ebx & (1UL << 5))
118*81ad6265SDimitry Andric features |= AVX2;
119*81ad6265SDimitry Andric if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
120*81ad6265SDimitry Andric if (*ebx & (1UL << 31))
121*81ad6265SDimitry Andric features |= AVX512VL;
122*81ad6265SDimitry Andric if (*ebx & (1UL << 16))
123*81ad6265SDimitry Andric features |= AVX512F;
124*81ad6265SDimitry Andric }
125*81ad6265SDimitry Andric }
126*81ad6265SDimitry Andric }
127*81ad6265SDimitry Andric }
128*81ad6265SDimitry Andric g_cpu_features = features;
129*81ad6265SDimitry Andric return features;
130*81ad6265SDimitry Andric #else
131*81ad6265SDimitry Andric /* How to detect NEON? */
132*81ad6265SDimitry Andric return 0;
133*81ad6265SDimitry Andric #endif
134*81ad6265SDimitry Andric }
135*81ad6265SDimitry Andric }
136*81ad6265SDimitry Andric
/*
 * Runtime dispatcher for the in-place compression function.
 *
 * On x86 the detected CPU features select the first available implementation
 * in this order: AVX-512, SSE4.1, SSE2. Each candidate can be compiled out
 * with the matching BLAKE3_NO_* macro. When none applies (or on non-x86
 * builds) the portable implementation is used. All arguments are forwarded
 * unchanged to the selected implementation.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  /* Every check below may be compiled out, leaving `features` unused. */
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
165*81ad6265SDimitry Andric
/*
 * Runtime dispatcher for the compression function that writes its result to
 * the 64-byte buffer `out` instead of updating `cv`.
 *
 * On x86 the detected CPU features select the first available implementation
 * in this order: AVX-512, SSE4.1, SSE2. Each candidate can be compiled out
 * with the matching BLAKE3_NO_* macro. When none applies (or on non-x86
 * builds) the portable implementation is used. All arguments are forwarded
 * unchanged to the selected implementation.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  /* Every check below may be compiled out, leaving `features` unused. */
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
194*81ad6265SDimitry Andric
/*
 * Runtime dispatcher for the multi-input hashing routine.
 *
 * On x86 the detected CPU features select the first available implementation
 * in this order: AVX-512 (requires both AVX512F and AVX512VL), AVX2, SSE4.1,
 * SSE2. Each candidate can be compiled out with the matching BLAKE3_NO_*
 * macro. If no x86 implementation is taken, the NEON implementation is used
 * when BLAKE3_USE_NEON is 1, and the portable implementation otherwise. All
 * arguments are forwarded unchanged to the selected implementation.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  /* Every check below may be compiled out, leaving `features` unused. */
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

  /* Exactly one of the two fallbacks is compiled in, so no code is left
   * unreachable after the NEON call. */
#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
246*81ad6265SDimitry Andric
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when AVX-512 (both AVX512F and AVX512VL) is usable, 8 for AVX2,
// and 4 for SSE4.1 or SSE2, honouring the BLAKE3_NO_* opt-out macros. If no
// x86 implementation applies, returns 4 when BLAKE3_USE_NEON is 1 and 1
// otherwise. The selection order mirrors blake3_hash_many().
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  /* Every check below may be compiled out, leaving `features` unused. */
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    return 4;
  }
#endif
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
278