/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}
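
/*
 * Usage sketch (explanatory addition, not part of this file): callers
 * obtain the function pointer at runtime and fall back to a portable
 * implementation when pclmulqdq is unavailable; br_ghash_ctmul is the
 * constant-time software GHASH declared in bearssl_hash.h.
 *
 *   br_ghash gh;
 *
 *   gh = br_ghash_pclmul_get();
 *   if (gh == 0) {
 *       gh = &br_ghash_ctmul;
 *   }
 *   gh(y, h, data, len);
 */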

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *   rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be mutualised.
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
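
/*
 * A small-scale illustration of that reversal property (explanatory
 * addition with hypothetical helper names, not used by the code below).
 * With an 8-bit carryless multiply producing a 15-bit result:
 *
 *   static unsigned
 *   clmul8(unsigned x, unsigned y)
 *   {
 *       unsigned z;
 *       int i;
 *
 *       z = 0;
 *       for (i = 0; i < 8; i ++) {
 *           z ^= (x << i) & -((y >> i) & 1);
 *       }
 *       return z;
 *   }
 *
 * then, with rev8() reversing 8 bits and rev15() reversing 15 bits,
 * one can check exhaustively that:
 *
 *   clmul8(rev8(x), rev8(y)) == rev15(clmul8(x, y))
 *
 * The 128-bit case relied upon here is the same identity, one size up.
 */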

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)
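
/*
 * Why the XOR-of-halves values exist (explanatory addition): the
 * 128x128 -> 256 carryless multiplications below use a Karatsuba-like
 * decomposition with three pclmulqdq instead of four. With x = x1:x0
 * and y = y1:y0 (64-bit halves), and all sums being XORs:
 *
 *   x*y = (x1*y1)*2^128
 *       + ((x1*y1) + (x0*y0) + (x0+x1)*(y0+y1))*2^64
 *       + (x0*y0)
 *
 * BK()/PBK() precompute the (x0+x1) operands for the middle term.
 */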

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
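
/*
 * Why those shift counts (explanatory addition): the GHASH modulus is
 * the pentanomial X^128 + X^7 + X^2 + X + 1. In the bit-reversed
 * representation used here, reduction folds the two extra words back:
 * a word w is XORed as w ^ (w >> 1) ^ (w >> 2) ^ (w >> 7) into the
 * word two positions to its left, and as (w << 63) ^ (w << 62) ^
 * (w << 57) into the word just left of it. The shift counts {1, 2, 7}
 * and {64-1, 64-2, 64-7} come from the exponents 1, 2 and 7 of the
 * modulus; the macro applies this first to x3, then to the updated x2.
 */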

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)
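
/*
 * Squaring needs only two pclmulqdq (explanatory addition): in
 * GF(2)[X], (a1*2^64 + a0)^2 = (a1^2)*2^128 + (a0^2), since the cross
 * terms cancel in characteristic 2. The Karatsuba middle term is thus
 * zero, and SQUARE_F128 merely shuffles the two partial products into
 * the four 64-bit words before shifting and reducing.
 */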

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}
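
	/*
	 * Worked example (explanatory addition): for len = 150, num4 = 2,
	 * so 128 bytes are processed four blocks at a time from buf1; the
	 * remaining 22 bytes give num1 = 2, and tmp receives those 22
	 * bytes followed by 10 zeros, so the tail is handled as two full
	 * 16-byte blocks (zero-padding matches the GHASH convention).
	 */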

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y and h.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);
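
		/*
		 * Aggregated processing (explanatory addition): each
		 * iteration below computes, with a single shift and
		 * reduction:
		 *
		 *   y <- ((y ^ a0)*h^4) ^ (a1*h^3) ^ (a2*h^2) ^ (a3*h)
		 *
		 * which is equivalent to four chained y <- (y ^ a_i)*h
		 * steps, but amortises the reduction cost over four
		 * blocks.
		 */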
		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

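	/*
	 * Remaining blocks are processed one at a time (explanatory
	 * addition): this is the GHASH definition itself,
	 * y <- (y ^ a_i)*h, using the same three-pclmulqdq Karatsuba
	 * multiply, shift and reduction as above.
	 */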
	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif