1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #define BR_ENABLE_INTRINSICS 1
26 #include "inner.h"
27
28 #if BR_AES_X86NI
29
30 /* see bearssl_block.h */
31 const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)32 br_aes_x86ni_ctrcbc_get_vtable(void)
33 {
34 return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
35 }
36
37 /* see bearssl_block.h */
38 void
br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys * ctx,const void * key,size_t len)39 br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
40 const void *key, size_t len)
41 {
42 ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
43 ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
44 }
45
46 BR_TARGETS_X86_UP
47
48 /* see bearssl_block.h */
49 BR_TARGET("sse2,sse4.1,aes")
50 void
br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys * ctx,void * ctr,void * data,size_t len)51 br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
52 void *ctr, void *data, size_t len)
53 {
54 unsigned char *buf;
55 unsigned num_rounds;
56 __m128i sk[15];
57 __m128i ivx0, ivx1, ivx2, ivx3;
58 __m128i erev, zero, one, four, notthree;
59 unsigned u;
60
61 buf = data;
62 num_rounds = ctx->num_rounds;
63 for (u = 0; u <= num_rounds; u ++) {
64 sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
65 }
66
67 /*
68 * Some SSE2 constants.
69 */
70 erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
71 8, 9, 10, 11, 12, 13, 14, 15);
72 zero = _mm_setzero_si128();
73 one = _mm_set_epi64x(0, 1);
74 four = _mm_set_epi64x(0, 4);
75 notthree = _mm_sub_epi64(zero, four);
76
77 /*
78 * Decode the counter in big-endian and pre-increment the other
79 * three counters.
80 */
81 ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
82 ivx1 = _mm_add_epi64(ivx0, one);
83 ivx1 = _mm_sub_epi64(ivx1,
84 _mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
85 ivx2 = _mm_add_epi64(ivx1, one);
86 ivx2 = _mm_sub_epi64(ivx2,
87 _mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
88 ivx3 = _mm_add_epi64(ivx2, one);
89 ivx3 = _mm_sub_epi64(ivx3,
90 _mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
91 while (len > 0) {
92 __m128i x0, x1, x2, x3;
93
94 /*
95 * Load counter values; we need to byteswap them because
96 * the specification says that they use big-endian.
97 */
98 x0 = _mm_shuffle_epi8(ivx0, erev);
99 x1 = _mm_shuffle_epi8(ivx1, erev);
100 x2 = _mm_shuffle_epi8(ivx2, erev);
101 x3 = _mm_shuffle_epi8(ivx3, erev);
102
103 x0 = _mm_xor_si128(x0, sk[0]);
104 x1 = _mm_xor_si128(x1, sk[0]);
105 x2 = _mm_xor_si128(x2, sk[0]);
106 x3 = _mm_xor_si128(x3, sk[0]);
107 x0 = _mm_aesenc_si128(x0, sk[1]);
108 x1 = _mm_aesenc_si128(x1, sk[1]);
109 x2 = _mm_aesenc_si128(x2, sk[1]);
110 x3 = _mm_aesenc_si128(x3, sk[1]);
111 x0 = _mm_aesenc_si128(x0, sk[2]);
112 x1 = _mm_aesenc_si128(x1, sk[2]);
113 x2 = _mm_aesenc_si128(x2, sk[2]);
114 x3 = _mm_aesenc_si128(x3, sk[2]);
115 x0 = _mm_aesenc_si128(x0, sk[3]);
116 x1 = _mm_aesenc_si128(x1, sk[3]);
117 x2 = _mm_aesenc_si128(x2, sk[3]);
118 x3 = _mm_aesenc_si128(x3, sk[3]);
119 x0 = _mm_aesenc_si128(x0, sk[4]);
120 x1 = _mm_aesenc_si128(x1, sk[4]);
121 x2 = _mm_aesenc_si128(x2, sk[4]);
122 x3 = _mm_aesenc_si128(x3, sk[4]);
123 x0 = _mm_aesenc_si128(x0, sk[5]);
124 x1 = _mm_aesenc_si128(x1, sk[5]);
125 x2 = _mm_aesenc_si128(x2, sk[5]);
126 x3 = _mm_aesenc_si128(x3, sk[5]);
127 x0 = _mm_aesenc_si128(x0, sk[6]);
128 x1 = _mm_aesenc_si128(x1, sk[6]);
129 x2 = _mm_aesenc_si128(x2, sk[6]);
130 x3 = _mm_aesenc_si128(x3, sk[6]);
131 x0 = _mm_aesenc_si128(x0, sk[7]);
132 x1 = _mm_aesenc_si128(x1, sk[7]);
133 x2 = _mm_aesenc_si128(x2, sk[7]);
134 x3 = _mm_aesenc_si128(x3, sk[7]);
135 x0 = _mm_aesenc_si128(x0, sk[8]);
136 x1 = _mm_aesenc_si128(x1, sk[8]);
137 x2 = _mm_aesenc_si128(x2, sk[8]);
138 x3 = _mm_aesenc_si128(x3, sk[8]);
139 x0 = _mm_aesenc_si128(x0, sk[9]);
140 x1 = _mm_aesenc_si128(x1, sk[9]);
141 x2 = _mm_aesenc_si128(x2, sk[9]);
142 x3 = _mm_aesenc_si128(x3, sk[9]);
143 if (num_rounds == 10) {
144 x0 = _mm_aesenclast_si128(x0, sk[10]);
145 x1 = _mm_aesenclast_si128(x1, sk[10]);
146 x2 = _mm_aesenclast_si128(x2, sk[10]);
147 x3 = _mm_aesenclast_si128(x3, sk[10]);
148 } else if (num_rounds == 12) {
149 x0 = _mm_aesenc_si128(x0, sk[10]);
150 x1 = _mm_aesenc_si128(x1, sk[10]);
151 x2 = _mm_aesenc_si128(x2, sk[10]);
152 x3 = _mm_aesenc_si128(x3, sk[10]);
153 x0 = _mm_aesenc_si128(x0, sk[11]);
154 x1 = _mm_aesenc_si128(x1, sk[11]);
155 x2 = _mm_aesenc_si128(x2, sk[11]);
156 x3 = _mm_aesenc_si128(x3, sk[11]);
157 x0 = _mm_aesenclast_si128(x0, sk[12]);
158 x1 = _mm_aesenclast_si128(x1, sk[12]);
159 x2 = _mm_aesenclast_si128(x2, sk[12]);
160 x3 = _mm_aesenclast_si128(x3, sk[12]);
161 } else {
162 x0 = _mm_aesenc_si128(x0, sk[10]);
163 x1 = _mm_aesenc_si128(x1, sk[10]);
164 x2 = _mm_aesenc_si128(x2, sk[10]);
165 x3 = _mm_aesenc_si128(x3, sk[10]);
166 x0 = _mm_aesenc_si128(x0, sk[11]);
167 x1 = _mm_aesenc_si128(x1, sk[11]);
168 x2 = _mm_aesenc_si128(x2, sk[11]);
169 x3 = _mm_aesenc_si128(x3, sk[11]);
170 x0 = _mm_aesenc_si128(x0, sk[12]);
171 x1 = _mm_aesenc_si128(x1, sk[12]);
172 x2 = _mm_aesenc_si128(x2, sk[12]);
173 x3 = _mm_aesenc_si128(x3, sk[12]);
174 x0 = _mm_aesenc_si128(x0, sk[13]);
175 x1 = _mm_aesenc_si128(x1, sk[13]);
176 x2 = _mm_aesenc_si128(x2, sk[13]);
177 x3 = _mm_aesenc_si128(x3, sk[13]);
178 x0 = _mm_aesenclast_si128(x0, sk[14]);
179 x1 = _mm_aesenclast_si128(x1, sk[14]);
180 x2 = _mm_aesenclast_si128(x2, sk[14]);
181 x3 = _mm_aesenclast_si128(x3, sk[14]);
182 }
183 if (len >= 64) {
184 x0 = _mm_xor_si128(x0,
185 _mm_loadu_si128((void *)(buf + 0)));
186 x1 = _mm_xor_si128(x1,
187 _mm_loadu_si128((void *)(buf + 16)));
188 x2 = _mm_xor_si128(x2,
189 _mm_loadu_si128((void *)(buf + 32)));
190 x3 = _mm_xor_si128(x3,
191 _mm_loadu_si128((void *)(buf + 48)));
192 _mm_storeu_si128((void *)(buf + 0), x0);
193 _mm_storeu_si128((void *)(buf + 16), x1);
194 _mm_storeu_si128((void *)(buf + 32), x2);
195 _mm_storeu_si128((void *)(buf + 48), x3);
196 buf += 64;
197 len -= 64;
198 } else {
199 unsigned char tmp[64];
200
201 _mm_storeu_si128((void *)(tmp + 0), x0);
202 _mm_storeu_si128((void *)(tmp + 16), x1);
203 _mm_storeu_si128((void *)(tmp + 32), x2);
204 _mm_storeu_si128((void *)(tmp + 48), x3);
205 for (u = 0; u < len; u ++) {
206 buf[u] ^= tmp[u];
207 }
208 switch (len) {
209 case 16:
210 ivx0 = ivx1;
211 break;
212 case 32:
213 ivx0 = ivx2;
214 break;
215 case 48:
216 ivx0 = ivx3;
217 break;
218 }
219 break;
220 }
221
222 /*
223 * Add 4 to each counter value. For carry propagation
224 * into the upper 64-bit words, we would need to compare
225 * the results with 4, but SSE2+ has only _signed_
226 * comparisons. Instead, we mask out the low two bits,
227 * and check whether the remaining bits are zero.
228 */
229 ivx0 = _mm_add_epi64(ivx0, four);
230 ivx1 = _mm_add_epi64(ivx1, four);
231 ivx2 = _mm_add_epi64(ivx2, four);
232 ivx3 = _mm_add_epi64(ivx3, four);
233 ivx0 = _mm_sub_epi64(ivx0,
234 _mm_slli_si128(_mm_cmpeq_epi64(
235 _mm_and_si128(ivx0, notthree), zero), 8));
236 ivx1 = _mm_sub_epi64(ivx1,
237 _mm_slli_si128(_mm_cmpeq_epi64(
238 _mm_and_si128(ivx1, notthree), zero), 8));
239 ivx2 = _mm_sub_epi64(ivx2,
240 _mm_slli_si128(_mm_cmpeq_epi64(
241 _mm_and_si128(ivx2, notthree), zero), 8));
242 ivx3 = _mm_sub_epi64(ivx3,
243 _mm_slli_si128(_mm_cmpeq_epi64(
244 _mm_and_si128(ivx3, notthree), zero), 8));
245 }
246
247 /*
248 * Write back new counter value. The loop took care to put the
249 * right counter value in ivx0.
250 */
251 _mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
252 }
253
254 /* see bearssl_block.h */
255 BR_TARGET("sse2,sse4.1,aes")
256 void
br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys * ctx,void * cbcmac,const void * data,size_t len)257 br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
258 void *cbcmac, const void *data, size_t len)
259 {
260 const unsigned char *buf;
261 unsigned num_rounds;
262 __m128i sk[15], ivx;
263 unsigned u;
264
265 buf = data;
266 ivx = _mm_loadu_si128(cbcmac);
267 num_rounds = ctx->num_rounds;
268 for (u = 0; u <= num_rounds; u ++) {
269 sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
270 }
271 while (len > 0) {
272 __m128i x;
273
274 x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
275 x = _mm_xor_si128(x, sk[0]);
276 x = _mm_aesenc_si128(x, sk[1]);
277 x = _mm_aesenc_si128(x, sk[2]);
278 x = _mm_aesenc_si128(x, sk[3]);
279 x = _mm_aesenc_si128(x, sk[4]);
280 x = _mm_aesenc_si128(x, sk[5]);
281 x = _mm_aesenc_si128(x, sk[6]);
282 x = _mm_aesenc_si128(x, sk[7]);
283 x = _mm_aesenc_si128(x, sk[8]);
284 x = _mm_aesenc_si128(x, sk[9]);
285 if (num_rounds == 10) {
286 x = _mm_aesenclast_si128(x, sk[10]);
287 } else if (num_rounds == 12) {
288 x = _mm_aesenc_si128(x, sk[10]);
289 x = _mm_aesenc_si128(x, sk[11]);
290 x = _mm_aesenclast_si128(x, sk[12]);
291 } else {
292 x = _mm_aesenc_si128(x, sk[10]);
293 x = _mm_aesenc_si128(x, sk[11]);
294 x = _mm_aesenc_si128(x, sk[12]);
295 x = _mm_aesenc_si128(x, sk[13]);
296 x = _mm_aesenclast_si128(x, sk[14]);
297 }
298 ivx = x;
299 buf += 16;
300 len -= 16;
301 }
302 _mm_storeu_si128(cbcmac, ivx);
303 }
304
305 /* see bearssl_block.h */
306 BR_TARGET("sse2,sse4.1,aes")
307 void
br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys * ctx,void * ctr,void * cbcmac,void * data,size_t len)308 br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
309 void *ctr, void *cbcmac, void *data, size_t len)
310 {
311 unsigned char *buf;
312 unsigned num_rounds;
313 __m128i sk[15];
314 __m128i ivx, cmx;
315 __m128i erev, zero, one;
316 unsigned u;
317 int first_iter;
318
319 num_rounds = ctx->num_rounds;
320 for (u = 0; u <= num_rounds; u ++) {
321 sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
322 }
323
324 /*
325 * Some SSE2 constants.
326 */
327 erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
328 8, 9, 10, 11, 12, 13, 14, 15);
329 zero = _mm_setzero_si128();
330 one = _mm_set_epi64x(0, 1);
331
332 /*
333 * Decode the counter in big-endian.
334 */
335 ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
336 cmx = _mm_loadu_si128(cbcmac);
337
338 buf = data;
339 first_iter = 1;
340 while (len > 0) {
341 __m128i dx, x0, x1;
342
343 /*
344 * Load initial values:
345 * dx encrypted block of data
346 * x0 counter (for CTR encryption)
347 * x1 input for CBC-MAC
348 */
349 dx = _mm_loadu_si128((void *)buf);
350 x0 = _mm_shuffle_epi8(ivx, erev);
351 x1 = cmx;
352
353 x0 = _mm_xor_si128(x0, sk[0]);
354 x1 = _mm_xor_si128(x1, sk[0]);
355 x0 = _mm_aesenc_si128(x0, sk[1]);
356 x1 = _mm_aesenc_si128(x1, sk[1]);
357 x0 = _mm_aesenc_si128(x0, sk[2]);
358 x1 = _mm_aesenc_si128(x1, sk[2]);
359 x0 = _mm_aesenc_si128(x0, sk[3]);
360 x1 = _mm_aesenc_si128(x1, sk[3]);
361 x0 = _mm_aesenc_si128(x0, sk[4]);
362 x1 = _mm_aesenc_si128(x1, sk[4]);
363 x0 = _mm_aesenc_si128(x0, sk[5]);
364 x1 = _mm_aesenc_si128(x1, sk[5]);
365 x0 = _mm_aesenc_si128(x0, sk[6]);
366 x1 = _mm_aesenc_si128(x1, sk[6]);
367 x0 = _mm_aesenc_si128(x0, sk[7]);
368 x1 = _mm_aesenc_si128(x1, sk[7]);
369 x0 = _mm_aesenc_si128(x0, sk[8]);
370 x1 = _mm_aesenc_si128(x1, sk[8]);
371 x0 = _mm_aesenc_si128(x0, sk[9]);
372 x1 = _mm_aesenc_si128(x1, sk[9]);
373 if (num_rounds == 10) {
374 x0 = _mm_aesenclast_si128(x0, sk[10]);
375 x1 = _mm_aesenclast_si128(x1, sk[10]);
376 } else if (num_rounds == 12) {
377 x0 = _mm_aesenc_si128(x0, sk[10]);
378 x1 = _mm_aesenc_si128(x1, sk[10]);
379 x0 = _mm_aesenc_si128(x0, sk[11]);
380 x1 = _mm_aesenc_si128(x1, sk[11]);
381 x0 = _mm_aesenclast_si128(x0, sk[12]);
382 x1 = _mm_aesenclast_si128(x1, sk[12]);
383 } else {
384 x0 = _mm_aesenc_si128(x0, sk[10]);
385 x1 = _mm_aesenc_si128(x1, sk[10]);
386 x0 = _mm_aesenc_si128(x0, sk[11]);
387 x1 = _mm_aesenc_si128(x1, sk[11]);
388 x0 = _mm_aesenc_si128(x0, sk[12]);
389 x1 = _mm_aesenc_si128(x1, sk[12]);
390 x0 = _mm_aesenc_si128(x0, sk[13]);
391 x1 = _mm_aesenc_si128(x1, sk[13]);
392 x0 = _mm_aesenclast_si128(x0, sk[14]);
393 x1 = _mm_aesenclast_si128(x1, sk[14]);
394 }
395
396 x0 = _mm_xor_si128(x0, dx);
397 if (first_iter) {
398 cmx = _mm_xor_si128(cmx, x0);
399 first_iter = 0;
400 } else {
401 cmx = _mm_xor_si128(x1, x0);
402 }
403 _mm_storeu_si128((void *)buf, x0);
404
405 buf += 16;
406 len -= 16;
407
408 /*
409 * Increment the counter value.
410 */
411 ivx = _mm_add_epi64(ivx, one);
412 ivx = _mm_sub_epi64(ivx,
413 _mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
414
415 /*
416 * If this was the last iteration, then compute the
417 * extra block encryption to complete CBC-MAC.
418 */
419 if (len == 0) {
420 cmx = _mm_xor_si128(cmx, sk[0]);
421 cmx = _mm_aesenc_si128(cmx, sk[1]);
422 cmx = _mm_aesenc_si128(cmx, sk[2]);
423 cmx = _mm_aesenc_si128(cmx, sk[3]);
424 cmx = _mm_aesenc_si128(cmx, sk[4]);
425 cmx = _mm_aesenc_si128(cmx, sk[5]);
426 cmx = _mm_aesenc_si128(cmx, sk[6]);
427 cmx = _mm_aesenc_si128(cmx, sk[7]);
428 cmx = _mm_aesenc_si128(cmx, sk[8]);
429 cmx = _mm_aesenc_si128(cmx, sk[9]);
430 if (num_rounds == 10) {
431 cmx = _mm_aesenclast_si128(cmx, sk[10]);
432 } else if (num_rounds == 12) {
433 cmx = _mm_aesenc_si128(cmx, sk[10]);
434 cmx = _mm_aesenc_si128(cmx, sk[11]);
435 cmx = _mm_aesenclast_si128(cmx, sk[12]);
436 } else {
437 cmx = _mm_aesenc_si128(cmx, sk[10]);
438 cmx = _mm_aesenc_si128(cmx, sk[11]);
439 cmx = _mm_aesenc_si128(cmx, sk[12]);
440 cmx = _mm_aesenc_si128(cmx, sk[13]);
441 cmx = _mm_aesenclast_si128(cmx, sk[14]);
442 }
443 break;
444 }
445 }
446
447 /*
448 * Write back new counter value and CBC-MAC value.
449 */
450 _mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
451 _mm_storeu_si128(cbcmac, cmx);
452 }
453
454 /* see bearssl_block.h */
455 BR_TARGET("sse2,sse4.1,aes")
456 void
br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys * ctx,void * ctr,void * cbcmac,void * data,size_t len)457 br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
458 void *ctr, void *cbcmac, void *data, size_t len)
459 {
460 unsigned char *buf;
461 unsigned num_rounds;
462 __m128i sk[15];
463 __m128i ivx, cmx;
464 __m128i erev, zero, one;
465 unsigned u;
466
467 num_rounds = ctx->num_rounds;
468 for (u = 0; u <= num_rounds; u ++) {
469 sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
470 }
471
472 /*
473 * Some SSE2 constants.
474 */
475 erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
476 8, 9, 10, 11, 12, 13, 14, 15);
477 zero = _mm_setzero_si128();
478 one = _mm_set_epi64x(0, 1);
479
480 /*
481 * Decode the counter in big-endian.
482 */
483 ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
484 cmx = _mm_loadu_si128(cbcmac);
485
486 buf = data;
487 while (len > 0) {
488 __m128i dx, x0, x1;
489
490 /*
491 * Load initial values:
492 * dx encrypted block of data
493 * x0 counter (for CTR encryption)
494 * x1 input for CBC-MAC
495 */
496 dx = _mm_loadu_si128((void *)buf);
497 x0 = _mm_shuffle_epi8(ivx, erev);
498 x1 = _mm_xor_si128(cmx, dx);
499
500 x0 = _mm_xor_si128(x0, sk[0]);
501 x1 = _mm_xor_si128(x1, sk[0]);
502 x0 = _mm_aesenc_si128(x0, sk[1]);
503 x1 = _mm_aesenc_si128(x1, sk[1]);
504 x0 = _mm_aesenc_si128(x0, sk[2]);
505 x1 = _mm_aesenc_si128(x1, sk[2]);
506 x0 = _mm_aesenc_si128(x0, sk[3]);
507 x1 = _mm_aesenc_si128(x1, sk[3]);
508 x0 = _mm_aesenc_si128(x0, sk[4]);
509 x1 = _mm_aesenc_si128(x1, sk[4]);
510 x0 = _mm_aesenc_si128(x0, sk[5]);
511 x1 = _mm_aesenc_si128(x1, sk[5]);
512 x0 = _mm_aesenc_si128(x0, sk[6]);
513 x1 = _mm_aesenc_si128(x1, sk[6]);
514 x0 = _mm_aesenc_si128(x0, sk[7]);
515 x1 = _mm_aesenc_si128(x1, sk[7]);
516 x0 = _mm_aesenc_si128(x0, sk[8]);
517 x1 = _mm_aesenc_si128(x1, sk[8]);
518 x0 = _mm_aesenc_si128(x0, sk[9]);
519 x1 = _mm_aesenc_si128(x1, sk[9]);
520 if (num_rounds == 10) {
521 x0 = _mm_aesenclast_si128(x0, sk[10]);
522 x1 = _mm_aesenclast_si128(x1, sk[10]);
523 } else if (num_rounds == 12) {
524 x0 = _mm_aesenc_si128(x0, sk[10]);
525 x1 = _mm_aesenc_si128(x1, sk[10]);
526 x0 = _mm_aesenc_si128(x0, sk[11]);
527 x1 = _mm_aesenc_si128(x1, sk[11]);
528 x0 = _mm_aesenclast_si128(x0, sk[12]);
529 x1 = _mm_aesenclast_si128(x1, sk[12]);
530 } else {
531 x0 = _mm_aesenc_si128(x0, sk[10]);
532 x1 = _mm_aesenc_si128(x1, sk[10]);
533 x0 = _mm_aesenc_si128(x0, sk[11]);
534 x1 = _mm_aesenc_si128(x1, sk[11]);
535 x0 = _mm_aesenc_si128(x0, sk[12]);
536 x1 = _mm_aesenc_si128(x1, sk[12]);
537 x0 = _mm_aesenc_si128(x0, sk[13]);
538 x1 = _mm_aesenc_si128(x1, sk[13]);
539 x0 = _mm_aesenclast_si128(x0, sk[14]);
540 x1 = _mm_aesenclast_si128(x1, sk[14]);
541 }
542 x0 = _mm_xor_si128(x0, dx);
543 cmx = x1;
544 _mm_storeu_si128((void *)buf, x0);
545
546 buf += 16;
547 len -= 16;
548
549 /*
550 * Increment the counter value.
551 */
552 ivx = _mm_add_epi64(ivx, one);
553 ivx = _mm_sub_epi64(ivx,
554 _mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
555 }
556
557 /*
558 * Write back new counter value and CBC-MAC value.
559 */
560 _mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
561 _mm_storeu_si128(cbcmac, cmx);
562 }
563
564 BR_TARGETS_X86_DOWN
565
566 /* see bearssl_block.h */
567 const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
568 sizeof(br_aes_x86ni_ctrcbc_keys),
569 16,
570 4,
571 (void (*)(const br_block_ctrcbc_class **, const void *, size_t))
572 &br_aes_x86ni_ctrcbc_init,
573 (void (*)(const br_block_ctrcbc_class *const *,
574 void *, void *, void *, size_t))
575 &br_aes_x86ni_ctrcbc_encrypt,
576 (void (*)(const br_block_ctrcbc_class *const *,
577 void *, void *, void *, size_t))
578 &br_aes_x86ni_ctrcbc_decrypt,
579 (void (*)(const br_block_ctrcbc_class *const *,
580 void *, void *, size_t))
581 &br_aes_x86ni_ctrcbc_ctr,
582 (void (*)(const br_block_ctrcbc_class *const *,
583 void *, const void *, size_t))
584 &br_aes_x86ni_ctrcbc_mac
585 };
586
587 #else
588
589 /* see bearssl_block.h */
590 const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)591 br_aes_x86ni_ctrcbc_get_vtable(void)
592 {
593 return NULL;
594 }
595
596 #endif
597