/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS 1
#include "inner.h"

/*
 * This code contains the AES key schedule implementation using the
 * AES-NI opcodes.
 */

#if BR_AES_X86NI

/* see inner.h */
int
br_aes_x86ni_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *   19   SSE4.1 (used for _mm_insert_epi32(), for AES-CTR)
	 *   25   AES-NI
	 */
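	/* 0x02080000 == (1 << 19) | (1 << 25). */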
	return br_cpuid(0, 0, 0x02080000, 0);
}

BR_TARGETS_X86_UP

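/*
 * AES-128 key expansion step. k is the previous round key; k2 is the
 * output of _mm_aeskeygenassist_si128() on that key. The three
 * shift/XOR rounds compute the running XOR of the four key words, and
 * the 0xFF shuffle broadcasts RotWord(SubWord(w3)) XOR rcon, so the
 * returned value is the next round key.
 */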
BR_TARGET("sse2,aes")
static inline __m128i
expand_step128(__m128i k, __m128i k2)
{
	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
	k2 = _mm_shuffle_epi32(k2, 0xFF);
	return _mm_xor_si128(k, k2);
}

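/*
 * AES-192 key expansion step. On entry, t1 holds key words 0..3 of the
 * current six-word block and the low half of t3 holds words 4..5; t2 is
 * the _mm_aeskeygenassist_si128() output computed on t3. The 0x55
 * shuffle broadcasts RotWord(SubWord(w5)) XOR rcon; the next six words
 * are then derived into t1 and the low half of t3.
 */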
BR_TARGET("sse2,aes")
static inline void
expand_step192(__m128i *t1, __m128i *t2, __m128i *t3)
{
	__m128i t4;

	*t2 = _mm_shuffle_epi32(*t2, 0x55);
	t4 = _mm_slli_si128(*t1, 0x4);
	*t1 = _mm_xor_si128(*t1, t4);
	t4 = _mm_slli_si128(t4, 0x4);
	*t1 = _mm_xor_si128(*t1, t4);
	t4 = _mm_slli_si128(t4, 0x4);
	*t1 = _mm_xor_si128(*t1, t4);
	*t1 = _mm_xor_si128(*t1, *t2);
	*t2 = _mm_shuffle_epi32(*t1, 0xFF);
	t4 = _mm_slli_si128(*t3, 0x4);
	*t3 = _mm_xor_si128(*t3, t4);
	*t3 = _mm_xor_si128(*t3, *t2);
}

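/*
 * AES-256 key expansion, first half. t1 holds the previous
 * even-numbered round key; t2 is the _mm_aeskeygenassist_si128() output
 * computed on the previous odd-numbered round key. The 0xFF shuffle
 * broadcasts RotWord(SubWord(last word)) XOR rcon; the next
 * even-numbered round key is left in t1.
 */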
BR_TARGET("sse2,aes")
static inline void
expand_step256_1(__m128i *t1, __m128i *t2)
{
	__m128i t4;

	*t2 = _mm_shuffle_epi32(*t2, 0xFF);
	t4 = _mm_slli_si128(*t1, 0x4);
	*t1 = _mm_xor_si128(*t1, t4);
	t4 = _mm_slli_si128(t4, 0x4);
	*t1 = _mm_xor_si128(*t1, t4);
	t4 = _mm_slli_si128(t4, 0x4);
	*t1 = _mm_xor_si128(*t1, t4);
	*t1 = _mm_xor_si128(*t1, *t2);
}

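/*
 * AES-256 key expansion, second half. For key words at positions
 * 4 mod 8, the schedule applies SubWord() without RotWord() and without
 * a round constant: hence _mm_aeskeygenassist_si128() is called with
 * rcon 0 and the 0xAA shuffle picks the SubWord() lane. The next
 * odd-numbered round key is computed from t1 (the even key just
 * produced) and t3 (the previous odd key), and left in t3.
 */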
BR_TARGET("sse2,aes")
static inline void
expand_step256_2(__m128i *t1, __m128i *t3)
{
	__m128i t2, t4;

	t4 = _mm_aeskeygenassist_si128(*t1, 0x0);
	t2 = _mm_shuffle_epi32(t4, 0xAA);
	t4 = _mm_slli_si128(*t3, 0x4);
	*t3 = _mm_xor_si128(*t3, t4);
	t4 = _mm_slli_si128(t4, 0x4);
	*t3 = _mm_xor_si128(*t3, t4);
	t4 = _mm_slli_si128(t4, 0x4);
	*t3 = _mm_xor_si128(*t3, t4);
	*t3 = _mm_xor_si128(*t3, t2);
}

/*
 * Perform key schedule for AES, encryption direction. Subkeys are written
 * in sk[], and the number of rounds is returned. Key length MUST be 16,
 * 24 or 32 bytes.
 */
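/*
 * Note: num_rounds + 1 subkeys of 16 bytes each are produced (11, 13 or
 * 15 subkeys for 128-, 192- and 256-bit keys, respectively), so the
 * caller must provide room for up to 15 entries in sk[].
 */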
BR_TARGET("sse2,aes")
static unsigned
x86ni_keysched(__m128i *sk, const void *key, size_t len)
{
	const unsigned char *kb;

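/*
 * KEXP128: expand round key i for AES-128 from the previous round key
 * held in k, using round constant rcon, and store it in sk[i].
 */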
#define KEXP128(k, i, rcon) do { \
		k = expand_step128(k, _mm_aeskeygenassist_si128(k, rcon)); \
		sk[i] = k; \
	} while (0)

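/*
 * KEXP192: run the AES-192 expansion step twice (with round constants
 * rcon1 and rcon2), producing subkeys i, i+1 and i+2. Since each step
 * yields six 32-bit words spread over t1 and the low half of t3,
 * _mm_shuffle_pd() is used to repack the 64-bit halves into contiguous
 * 16-byte subkeys.
 */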
#define KEXP192(i, rcon1, rcon2) do { \
		sk[(i) + 0] = t1; \
		sk[(i) + 1] = t3; \
		t2 = _mm_aeskeygenassist_si128(t3, rcon1); \
		expand_step192(&t1, &t2, &t3); \
		sk[(i) + 1] = _mm_castpd_si128(_mm_shuffle_pd( \
			_mm_castsi128_pd(sk[(i) + 1]), \
			_mm_castsi128_pd(t1), 0)); \
		sk[(i) + 2] = _mm_castpd_si128(_mm_shuffle_pd( \
			_mm_castsi128_pd(t1), \
			_mm_castsi128_pd(t3), 1)); \
		t2 = _mm_aeskeygenassist_si128(t3, rcon2); \
		expand_step192(&t1, &t2, &t3); \
	} while (0)

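/*
 * KEXP256: produce subkeys i and i+1 for AES-256. The odd-numbered
 * round key currently in t3 is stored as sk[i]; the next even-numbered
 * round key is derived with round constant rcon and stored as sk[i+1];
 * the following odd-numbered key is left in t3 for the next invocation.
 */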
#define KEXP256(i, rcon) do { \
		sk[(i) + 0] = t3; \
		t2 = _mm_aeskeygenassist_si128(t3, rcon); \
		expand_step256_1(&t1, &t2); \
		sk[(i) + 1] = t1; \
		expand_step256_2(&t1, &t3); \
	} while (0)

	kb = key;
	switch (len) {
		__m128i t1, t2, t3;

	case 16:
		t1 = _mm_loadu_si128((const void *)kb);
		sk[0] = t1;
		KEXP128(t1, 1, 0x01);
		KEXP128(t1, 2, 0x02);
		KEXP128(t1, 3, 0x04);
		KEXP128(t1, 4, 0x08);
		KEXP128(t1, 5, 0x10);
		KEXP128(t1, 6, 0x20);
		KEXP128(t1, 7, 0x40);
		KEXP128(t1, 8, 0x80);
		KEXP128(t1, 9, 0x1B);
		KEXP128(t1, 10, 0x36);
		return 10;

	case 24:
		t1 = _mm_loadu_si128((const void *)kb);
		t3 = _mm_loadu_si128((const void *)(kb + 8));
		t3 = _mm_shuffle_epi32(t3, 0x4E);
		KEXP192(0, 0x01, 0x02);
		KEXP192(3, 0x04, 0x08);
		KEXP192(6, 0x10, 0x20);
		KEXP192(9, 0x40, 0x80);
		sk[12] = t1;
		return 12;

	case 32:
		t1 = _mm_loadu_si128((const void *)kb);
		t3 = _mm_loadu_si128((const void *)(kb + 16));
		sk[0] = t1;
		KEXP256( 1, 0x01);
		KEXP256( 3, 0x02);
		KEXP256( 5, 0x04);
		KEXP256( 7, 0x08);
		KEXP256( 9, 0x10);
		KEXP256(11, 0x20);
		sk[13] = t3;
		t2 = _mm_aeskeygenassist_si128(t3, 0x40);
		expand_step256_1(&t1, &t2);
		sk[14] = t1;
		return 14;

	default:
		return 0;
	}

#undef KEXP128
#undef KEXP192
#undef KEXP256
}

/* see inner.h */
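/*
 * Encryption key schedule: the num_rounds + 1 expanded subkeys are
 * copied as-is into skni (16 bytes per subkey).
 */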
BR_TARGET("sse2,aes")
unsigned
br_aes_x86ni_keysched_enc(unsigned char *skni, const void *key, size_t len)
{
	__m128i sk[15];
	unsigned num_rounds;

	num_rounds = x86ni_keysched(sk, key, len);
	memcpy(skni, sk, (num_rounds + 1) << 4);
	return num_rounds;
}

/* see inner.h */
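/*
 * Decryption key schedule (equivalent inverse cipher): subkeys are
 * stored in reverse order, and InvMixColumns (_mm_aesimc_si128()) is
 * applied to every subkey except the first and last, as required by
 * the AESDEC/AESDECLAST-based decryption rounds.
 */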
BR_TARGET("sse2,aes")
unsigned
br_aes_x86ni_keysched_dec(unsigned char *skni, const void *key, size_t len)
{
	__m128i sk[15];
	unsigned u, num_rounds;

	num_rounds = x86ni_keysched(sk, key, len);
	_mm_storeu_si128((void *)skni, sk[num_rounds]);
	for (u = 1; u < num_rounds; u ++) {
		_mm_storeu_si128((void *)(skni + (u << 4)),
			_mm_aesimc_si128(sk[num_rounds - u]));
	}
	_mm_storeu_si128((void *)(skni + (num_rounds << 4)), sk[0]);
	return num_rounds;
}

BR_TARGETS_X86_DOWN

#endif