xref: /freebsd/contrib/bearssl/src/symcipher/aes_x86ni_ctrcbc.c (revision 146537449b45fb6c35e8bbe9fe82c5131484972d)
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #define BR_ENABLE_INTRINSICS   1
26 #include "inner.h"
27 
28 #if BR_AES_X86NI
29 
30 /* see bearssl_block.h */
31 const br_block_ctrcbc_class *
32 br_aes_x86ni_ctrcbc_get_vtable(void)
33 {
34 	return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
35 }
36 
37 /* see bearssl_block.h */
38 void
39 br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
40 	const void *key, size_t len)
41 {
42 	ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
43 	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
44 }
45 
46 BR_TARGETS_X86_UP
47 
48 /* see bearssl_block.h */
49 BR_TARGET("sse2,sse4.1,aes")
50 void
51 br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
52 	void *ctr, void *data, size_t len)
53 {
54 	unsigned char *buf;
55 	unsigned num_rounds;
56 	__m128i sk[15];
57 	__m128i ivx0, ivx1, ivx2, ivx3;
58 	__m128i erev, zero, one, four, notthree;
59 	unsigned u;
60 
61 	buf = data;
62 	num_rounds = ctx->num_rounds;
63 	for (u = 0; u <= num_rounds; u ++) {
64 		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
65 	}
66 
67 	/*
68 	 * Some SSE2 constants.
69 	 */
70 	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
71 		8, 9, 10, 11, 12, 13, 14, 15);
72 	zero = _mm_setzero_si128();
73 	one = _mm_set_epi64x(0, 1);
74 	four = _mm_set_epi64x(0, 4);
75 	notthree = _mm_sub_epi64(zero, four);
76 
77 	/*
78 	 * Decode the counter in big-endian and pre-increment the other
79 	 * three counters.
80 	 */
81 	ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
82 	ivx1 = _mm_add_epi64(ivx0, one);
83 	ivx1 = _mm_sub_epi64(ivx1,
84 		_mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
85 	ivx2 = _mm_add_epi64(ivx1, one);
86 	ivx2 = _mm_sub_epi64(ivx2,
87 		_mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
88 	ivx3 = _mm_add_epi64(ivx2, one);
89 	ivx3 = _mm_sub_epi64(ivx3,
90 		_mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
91 	while (len > 0) {
92 		__m128i x0, x1, x2, x3;
93 
94 		/*
95 		 * Load counter values; we need to byteswap them because
96 		 * the specification says that they use big-endian.
97 		 */
98 		x0 = _mm_shuffle_epi8(ivx0, erev);
99 		x1 = _mm_shuffle_epi8(ivx1, erev);
100 		x2 = _mm_shuffle_epi8(ivx2, erev);
101 		x3 = _mm_shuffle_epi8(ivx3, erev);
102 
103 		x0 = _mm_xor_si128(x0, sk[0]);
104 		x1 = _mm_xor_si128(x1, sk[0]);
105 		x2 = _mm_xor_si128(x2, sk[0]);
106 		x3 = _mm_xor_si128(x3, sk[0]);
107 		x0 = _mm_aesenc_si128(x0, sk[1]);
108 		x1 = _mm_aesenc_si128(x1, sk[1]);
109 		x2 = _mm_aesenc_si128(x2, sk[1]);
110 		x3 = _mm_aesenc_si128(x3, sk[1]);
111 		x0 = _mm_aesenc_si128(x0, sk[2]);
112 		x1 = _mm_aesenc_si128(x1, sk[2]);
113 		x2 = _mm_aesenc_si128(x2, sk[2]);
114 		x3 = _mm_aesenc_si128(x3, sk[2]);
115 		x0 = _mm_aesenc_si128(x0, sk[3]);
116 		x1 = _mm_aesenc_si128(x1, sk[3]);
117 		x2 = _mm_aesenc_si128(x2, sk[3]);
118 		x3 = _mm_aesenc_si128(x3, sk[3]);
119 		x0 = _mm_aesenc_si128(x0, sk[4]);
120 		x1 = _mm_aesenc_si128(x1, sk[4]);
121 		x2 = _mm_aesenc_si128(x2, sk[4]);
122 		x3 = _mm_aesenc_si128(x3, sk[4]);
123 		x0 = _mm_aesenc_si128(x0, sk[5]);
124 		x1 = _mm_aesenc_si128(x1, sk[5]);
125 		x2 = _mm_aesenc_si128(x2, sk[5]);
126 		x3 = _mm_aesenc_si128(x3, sk[5]);
127 		x0 = _mm_aesenc_si128(x0, sk[6]);
128 		x1 = _mm_aesenc_si128(x1, sk[6]);
129 		x2 = _mm_aesenc_si128(x2, sk[6]);
130 		x3 = _mm_aesenc_si128(x3, sk[6]);
131 		x0 = _mm_aesenc_si128(x0, sk[7]);
132 		x1 = _mm_aesenc_si128(x1, sk[7]);
133 		x2 = _mm_aesenc_si128(x2, sk[7]);
134 		x3 = _mm_aesenc_si128(x3, sk[7]);
135 		x0 = _mm_aesenc_si128(x0, sk[8]);
136 		x1 = _mm_aesenc_si128(x1, sk[8]);
137 		x2 = _mm_aesenc_si128(x2, sk[8]);
138 		x3 = _mm_aesenc_si128(x3, sk[8]);
139 		x0 = _mm_aesenc_si128(x0, sk[9]);
140 		x1 = _mm_aesenc_si128(x1, sk[9]);
141 		x2 = _mm_aesenc_si128(x2, sk[9]);
142 		x3 = _mm_aesenc_si128(x3, sk[9]);
143 		if (num_rounds == 10) {
144 			x0 = _mm_aesenclast_si128(x0, sk[10]);
145 			x1 = _mm_aesenclast_si128(x1, sk[10]);
146 			x2 = _mm_aesenclast_si128(x2, sk[10]);
147 			x3 = _mm_aesenclast_si128(x3, sk[10]);
148 		} else if (num_rounds == 12) {
149 			x0 = _mm_aesenc_si128(x0, sk[10]);
150 			x1 = _mm_aesenc_si128(x1, sk[10]);
151 			x2 = _mm_aesenc_si128(x2, sk[10]);
152 			x3 = _mm_aesenc_si128(x3, sk[10]);
153 			x0 = _mm_aesenc_si128(x0, sk[11]);
154 			x1 = _mm_aesenc_si128(x1, sk[11]);
155 			x2 = _mm_aesenc_si128(x2, sk[11]);
156 			x3 = _mm_aesenc_si128(x3, sk[11]);
157 			x0 = _mm_aesenclast_si128(x0, sk[12]);
158 			x1 = _mm_aesenclast_si128(x1, sk[12]);
159 			x2 = _mm_aesenclast_si128(x2, sk[12]);
160 			x3 = _mm_aesenclast_si128(x3, sk[12]);
161 		} else {
162 			x0 = _mm_aesenc_si128(x0, sk[10]);
163 			x1 = _mm_aesenc_si128(x1, sk[10]);
164 			x2 = _mm_aesenc_si128(x2, sk[10]);
165 			x3 = _mm_aesenc_si128(x3, sk[10]);
166 			x0 = _mm_aesenc_si128(x0, sk[11]);
167 			x1 = _mm_aesenc_si128(x1, sk[11]);
168 			x2 = _mm_aesenc_si128(x2, sk[11]);
169 			x3 = _mm_aesenc_si128(x3, sk[11]);
170 			x0 = _mm_aesenc_si128(x0, sk[12]);
171 			x1 = _mm_aesenc_si128(x1, sk[12]);
172 			x2 = _mm_aesenc_si128(x2, sk[12]);
173 			x3 = _mm_aesenc_si128(x3, sk[12]);
174 			x0 = _mm_aesenc_si128(x0, sk[13]);
175 			x1 = _mm_aesenc_si128(x1, sk[13]);
176 			x2 = _mm_aesenc_si128(x2, sk[13]);
177 			x3 = _mm_aesenc_si128(x3, sk[13]);
178 			x0 = _mm_aesenclast_si128(x0, sk[14]);
179 			x1 = _mm_aesenclast_si128(x1, sk[14]);
180 			x2 = _mm_aesenclast_si128(x2, sk[14]);
181 			x3 = _mm_aesenclast_si128(x3, sk[14]);
182 		}
183 		if (len >= 64) {
184 			x0 = _mm_xor_si128(x0,
185 				_mm_loadu_si128((void *)(buf +  0)));
186 			x1 = _mm_xor_si128(x1,
187 				_mm_loadu_si128((void *)(buf + 16)));
188 			x2 = _mm_xor_si128(x2,
189 				_mm_loadu_si128((void *)(buf + 32)));
190 			x3 = _mm_xor_si128(x3,
191 				_mm_loadu_si128((void *)(buf + 48)));
192 			_mm_storeu_si128((void *)(buf +  0), x0);
193 			_mm_storeu_si128((void *)(buf + 16), x1);
194 			_mm_storeu_si128((void *)(buf + 32), x2);
195 			_mm_storeu_si128((void *)(buf + 48), x3);
196 			buf += 64;
197 			len -= 64;
198 		} else {
199 			unsigned char tmp[64];
200 
201 			_mm_storeu_si128((void *)(tmp +  0), x0);
202 			_mm_storeu_si128((void *)(tmp + 16), x1);
203 			_mm_storeu_si128((void *)(tmp + 32), x2);
204 			_mm_storeu_si128((void *)(tmp + 48), x3);
205 			for (u = 0; u < len; u ++) {
206 				buf[u] ^= tmp[u];
207 			}
208 			switch (len) {
209 			case 16:
210 				ivx0 = ivx1;
211 				break;
212 			case 32:
213 				ivx0 = ivx2;
214 				break;
215 			case 48:
216 				ivx0 = ivx3;
217 				break;
218 			}
219 			break;
220 		}
221 
222 		/*
223 		 * Add 4 to each counter value. For carry propagation
224 		 * into the upper 64-bit words, we would need to compare
225 		 * the results with 4, but SSE2+ has only _signed_
226 		 * comparisons. Instead, we mask out the low two bits,
227 		 * and check whether the remaining bits are zero.
228 		 */
229 		ivx0 = _mm_add_epi64(ivx0, four);
230 		ivx1 = _mm_add_epi64(ivx1, four);
231 		ivx2 = _mm_add_epi64(ivx2, four);
232 		ivx3 = _mm_add_epi64(ivx3, four);
233 		ivx0 = _mm_sub_epi64(ivx0,
234 			_mm_slli_si128(_mm_cmpeq_epi64(
235 				_mm_and_si128(ivx0, notthree), zero), 8));
236 		ivx1 = _mm_sub_epi64(ivx1,
237 			_mm_slli_si128(_mm_cmpeq_epi64(
238 				_mm_and_si128(ivx1, notthree), zero), 8));
239 		ivx2 = _mm_sub_epi64(ivx2,
240 			_mm_slli_si128(_mm_cmpeq_epi64(
241 				_mm_and_si128(ivx2, notthree), zero), 8));
242 		ivx3 = _mm_sub_epi64(ivx3,
243 			_mm_slli_si128(_mm_cmpeq_epi64(
244 				_mm_and_si128(ivx3, notthree), zero), 8));
245 	}
246 
247 	/*
248 	 * Write back new counter value. The loop took care to put the
249 	 * right counter value in ivx0.
250 	 */
251 	_mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
252 }
253 
254 /* see bearssl_block.h */
255 BR_TARGET("sse2,sse4.1,aes")
256 void
257 br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
258 	void *cbcmac, const void *data, size_t len)
259 {
260 	const unsigned char *buf;
261 	unsigned num_rounds;
262 	__m128i sk[15], ivx;
263 	unsigned u;
264 
265 	buf = data;
266 	ivx = _mm_loadu_si128(cbcmac);
267 	num_rounds = ctx->num_rounds;
268 	for (u = 0; u <= num_rounds; u ++) {
269 		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
270 	}
271 	while (len > 0) {
272 		__m128i x;
273 
274 		x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
275 		x = _mm_xor_si128(x, sk[0]);
276 		x = _mm_aesenc_si128(x, sk[1]);
277 		x = _mm_aesenc_si128(x, sk[2]);
278 		x = _mm_aesenc_si128(x, sk[3]);
279 		x = _mm_aesenc_si128(x, sk[4]);
280 		x = _mm_aesenc_si128(x, sk[5]);
281 		x = _mm_aesenc_si128(x, sk[6]);
282 		x = _mm_aesenc_si128(x, sk[7]);
283 		x = _mm_aesenc_si128(x, sk[8]);
284 		x = _mm_aesenc_si128(x, sk[9]);
285 		if (num_rounds == 10) {
286 			x = _mm_aesenclast_si128(x, sk[10]);
287 		} else if (num_rounds == 12) {
288 			x = _mm_aesenc_si128(x, sk[10]);
289 			x = _mm_aesenc_si128(x, sk[11]);
290 			x = _mm_aesenclast_si128(x, sk[12]);
291 		} else {
292 			x = _mm_aesenc_si128(x, sk[10]);
293 			x = _mm_aesenc_si128(x, sk[11]);
294 			x = _mm_aesenc_si128(x, sk[12]);
295 			x = _mm_aesenc_si128(x, sk[13]);
296 			x = _mm_aesenclast_si128(x, sk[14]);
297 		}
298 		ivx = x;
299 		buf += 16;
300 		len -= 16;
301 	}
302 	_mm_storeu_si128(cbcmac, ivx);
303 }
304 
305 /* see bearssl_block.h */
306 BR_TARGET("sse2,sse4.1,aes")
307 void
308 br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
309 	void *ctr, void *cbcmac, void *data, size_t len)
310 {
311 	unsigned char *buf;
312 	unsigned num_rounds;
313 	__m128i sk[15];
314 	__m128i ivx, cmx;
315 	__m128i erev, zero, one;
316 	unsigned u;
317 	int first_iter;
318 
319 	num_rounds = ctx->num_rounds;
320 	for (u = 0; u <= num_rounds; u ++) {
321 		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
322 	}
323 
324 	/*
325 	 * Some SSE2 constants.
326 	 */
327 	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
328 		8, 9, 10, 11, 12, 13, 14, 15);
329 	zero = _mm_setzero_si128();
330 	one = _mm_set_epi64x(0, 1);
331 
332 	/*
333 	 * Decode the counter in big-endian.
334 	 */
335 	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
336 	cmx = _mm_loadu_si128(cbcmac);
337 
338 	buf = data;
339 	first_iter = 1;
340 	while (len > 0) {
341 		__m128i dx, x0, x1;
342 
343 		/*
344 		 * Load initial values:
345 		 *   dx   encrypted block of data
346 		 *   x0   counter (for CTR encryption)
347 		 *   x1   input for CBC-MAC
348 		 */
349 		dx = _mm_loadu_si128((void *)buf);
350 		x0 = _mm_shuffle_epi8(ivx, erev);
351 		x1 = cmx;
352 
353 		x0 = _mm_xor_si128(x0, sk[0]);
354 		x1 = _mm_xor_si128(x1, sk[0]);
355 		x0 = _mm_aesenc_si128(x0, sk[1]);
356 		x1 = _mm_aesenc_si128(x1, sk[1]);
357 		x0 = _mm_aesenc_si128(x0, sk[2]);
358 		x1 = _mm_aesenc_si128(x1, sk[2]);
359 		x0 = _mm_aesenc_si128(x0, sk[3]);
360 		x1 = _mm_aesenc_si128(x1, sk[3]);
361 		x0 = _mm_aesenc_si128(x0, sk[4]);
362 		x1 = _mm_aesenc_si128(x1, sk[4]);
363 		x0 = _mm_aesenc_si128(x0, sk[5]);
364 		x1 = _mm_aesenc_si128(x1, sk[5]);
365 		x0 = _mm_aesenc_si128(x0, sk[6]);
366 		x1 = _mm_aesenc_si128(x1, sk[6]);
367 		x0 = _mm_aesenc_si128(x0, sk[7]);
368 		x1 = _mm_aesenc_si128(x1, sk[7]);
369 		x0 = _mm_aesenc_si128(x0, sk[8]);
370 		x1 = _mm_aesenc_si128(x1, sk[8]);
371 		x0 = _mm_aesenc_si128(x0, sk[9]);
372 		x1 = _mm_aesenc_si128(x1, sk[9]);
373 		if (num_rounds == 10) {
374 			x0 = _mm_aesenclast_si128(x0, sk[10]);
375 			x1 = _mm_aesenclast_si128(x1, sk[10]);
376 		} else if (num_rounds == 12) {
377 			x0 = _mm_aesenc_si128(x0, sk[10]);
378 			x1 = _mm_aesenc_si128(x1, sk[10]);
379 			x0 = _mm_aesenc_si128(x0, sk[11]);
380 			x1 = _mm_aesenc_si128(x1, sk[11]);
381 			x0 = _mm_aesenclast_si128(x0, sk[12]);
382 			x1 = _mm_aesenclast_si128(x1, sk[12]);
383 		} else {
384 			x0 = _mm_aesenc_si128(x0, sk[10]);
385 			x1 = _mm_aesenc_si128(x1, sk[10]);
386 			x0 = _mm_aesenc_si128(x0, sk[11]);
387 			x1 = _mm_aesenc_si128(x1, sk[11]);
388 			x0 = _mm_aesenc_si128(x0, sk[12]);
389 			x1 = _mm_aesenc_si128(x1, sk[12]);
390 			x0 = _mm_aesenc_si128(x0, sk[13]);
391 			x1 = _mm_aesenc_si128(x1, sk[13]);
392 			x0 = _mm_aesenclast_si128(x0, sk[14]);
393 			x1 = _mm_aesenclast_si128(x1, sk[14]);
394 		}
395 
396 		x0 = _mm_xor_si128(x0, dx);
397 		if (first_iter) {
398 			cmx = _mm_xor_si128(cmx, x0);
399 			first_iter = 0;
400 		} else {
401 			cmx = _mm_xor_si128(x1, x0);
402 		}
403 		_mm_storeu_si128((void *)buf, x0);
404 
405 		buf += 16;
406 		len -= 16;
407 
408 		/*
409 		 * Increment the counter value.
410 		 */
411 		ivx = _mm_add_epi64(ivx, one);
412 		ivx = _mm_sub_epi64(ivx,
413 			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
414 
415 		/*
416 		 * If this was the last iteration, then compute the
417 		 * extra block encryption to complete CBC-MAC.
418 		 */
419 		if (len == 0) {
420 			cmx = _mm_xor_si128(cmx, sk[0]);
421 			cmx = _mm_aesenc_si128(cmx, sk[1]);
422 			cmx = _mm_aesenc_si128(cmx, sk[2]);
423 			cmx = _mm_aesenc_si128(cmx, sk[3]);
424 			cmx = _mm_aesenc_si128(cmx, sk[4]);
425 			cmx = _mm_aesenc_si128(cmx, sk[5]);
426 			cmx = _mm_aesenc_si128(cmx, sk[6]);
427 			cmx = _mm_aesenc_si128(cmx, sk[7]);
428 			cmx = _mm_aesenc_si128(cmx, sk[8]);
429 			cmx = _mm_aesenc_si128(cmx, sk[9]);
430 			if (num_rounds == 10) {
431 				cmx = _mm_aesenclast_si128(cmx, sk[10]);
432 			} else if (num_rounds == 12) {
433 				cmx = _mm_aesenc_si128(cmx, sk[10]);
434 				cmx = _mm_aesenc_si128(cmx, sk[11]);
435 				cmx = _mm_aesenclast_si128(cmx, sk[12]);
436 			} else {
437 				cmx = _mm_aesenc_si128(cmx, sk[10]);
438 				cmx = _mm_aesenc_si128(cmx, sk[11]);
439 				cmx = _mm_aesenc_si128(cmx, sk[12]);
440 				cmx = _mm_aesenc_si128(cmx, sk[13]);
441 				cmx = _mm_aesenclast_si128(cmx, sk[14]);
442 			}
443 			break;
444 		}
445 	}
446 
447 	/*
448 	 * Write back new counter value and CBC-MAC value.
449 	 */
450 	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
451 	_mm_storeu_si128(cbcmac, cmx);
452 }
453 
454 /* see bearssl_block.h */
455 BR_TARGET("sse2,sse4.1,aes")
456 void
457 br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
458 	void *ctr, void *cbcmac, void *data, size_t len)
459 {
460 	unsigned char *buf;
461 	unsigned num_rounds;
462 	__m128i sk[15];
463 	__m128i ivx, cmx;
464 	__m128i erev, zero, one;
465 	unsigned u;
466 
467 	num_rounds = ctx->num_rounds;
468 	for (u = 0; u <= num_rounds; u ++) {
469 		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
470 	}
471 
472 	/*
473 	 * Some SSE2 constants.
474 	 */
475 	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
476 		8, 9, 10, 11, 12, 13, 14, 15);
477 	zero = _mm_setzero_si128();
478 	one = _mm_set_epi64x(0, 1);
479 
480 	/*
481 	 * Decode the counter in big-endian.
482 	 */
483 	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
484 	cmx = _mm_loadu_si128(cbcmac);
485 
486 	buf = data;
487 	while (len > 0) {
488 		__m128i dx, x0, x1;
489 
490 		/*
491 		 * Load initial values:
492 		 *   dx   encrypted block of data
493 		 *   x0   counter (for CTR encryption)
494 		 *   x1   input for CBC-MAC
495 		 */
496 		dx = _mm_loadu_si128((void *)buf);
497 		x0 = _mm_shuffle_epi8(ivx, erev);
498 		x1 = _mm_xor_si128(cmx, dx);
499 
500 		x0 = _mm_xor_si128(x0, sk[0]);
501 		x1 = _mm_xor_si128(x1, sk[0]);
502 		x0 = _mm_aesenc_si128(x0, sk[1]);
503 		x1 = _mm_aesenc_si128(x1, sk[1]);
504 		x0 = _mm_aesenc_si128(x0, sk[2]);
505 		x1 = _mm_aesenc_si128(x1, sk[2]);
506 		x0 = _mm_aesenc_si128(x0, sk[3]);
507 		x1 = _mm_aesenc_si128(x1, sk[3]);
508 		x0 = _mm_aesenc_si128(x0, sk[4]);
509 		x1 = _mm_aesenc_si128(x1, sk[4]);
510 		x0 = _mm_aesenc_si128(x0, sk[5]);
511 		x1 = _mm_aesenc_si128(x1, sk[5]);
512 		x0 = _mm_aesenc_si128(x0, sk[6]);
513 		x1 = _mm_aesenc_si128(x1, sk[6]);
514 		x0 = _mm_aesenc_si128(x0, sk[7]);
515 		x1 = _mm_aesenc_si128(x1, sk[7]);
516 		x0 = _mm_aesenc_si128(x0, sk[8]);
517 		x1 = _mm_aesenc_si128(x1, sk[8]);
518 		x0 = _mm_aesenc_si128(x0, sk[9]);
519 		x1 = _mm_aesenc_si128(x1, sk[9]);
520 		if (num_rounds == 10) {
521 			x0 = _mm_aesenclast_si128(x0, sk[10]);
522 			x1 = _mm_aesenclast_si128(x1, sk[10]);
523 		} else if (num_rounds == 12) {
524 			x0 = _mm_aesenc_si128(x0, sk[10]);
525 			x1 = _mm_aesenc_si128(x1, sk[10]);
526 			x0 = _mm_aesenc_si128(x0, sk[11]);
527 			x1 = _mm_aesenc_si128(x1, sk[11]);
528 			x0 = _mm_aesenclast_si128(x0, sk[12]);
529 			x1 = _mm_aesenclast_si128(x1, sk[12]);
530 		} else {
531 			x0 = _mm_aesenc_si128(x0, sk[10]);
532 			x1 = _mm_aesenc_si128(x1, sk[10]);
533 			x0 = _mm_aesenc_si128(x0, sk[11]);
534 			x1 = _mm_aesenc_si128(x1, sk[11]);
535 			x0 = _mm_aesenc_si128(x0, sk[12]);
536 			x1 = _mm_aesenc_si128(x1, sk[12]);
537 			x0 = _mm_aesenc_si128(x0, sk[13]);
538 			x1 = _mm_aesenc_si128(x1, sk[13]);
539 			x0 = _mm_aesenclast_si128(x0, sk[14]);
540 			x1 = _mm_aesenclast_si128(x1, sk[14]);
541 		}
542 		x0 = _mm_xor_si128(x0, dx);
543 		cmx = x1;
544 		_mm_storeu_si128((void *)buf, x0);
545 
546 		buf += 16;
547 		len -= 16;
548 
549 		/*
550 		 * Increment the counter value.
551 		 */
552 		ivx = _mm_add_epi64(ivx, one);
553 		ivx = _mm_sub_epi64(ivx,
554 			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
555 	}
556 
557 	/*
558 	 * Write back new counter value and CBC-MAC value.
559 	 */
560 	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
561 	_mm_storeu_si128(cbcmac, cmx);
562 }
563 
564 BR_TARGETS_X86_DOWN
565 
566 /* see bearssl_block.h */
567 const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
568 	sizeof(br_aes_x86ni_ctrcbc_keys),
569 	16,
570 	4,
571 	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
572 		&br_aes_x86ni_ctrcbc_init,
573 	(void (*)(const br_block_ctrcbc_class *const *,
574 		void *, void *, void *, size_t))
575 		&br_aes_x86ni_ctrcbc_encrypt,
576 	(void (*)(const br_block_ctrcbc_class *const *,
577 		void *, void *, void *, size_t))
578 		&br_aes_x86ni_ctrcbc_decrypt,
579 	(void (*)(const br_block_ctrcbc_class *const *,
580 		void *, void *, size_t))
581 		&br_aes_x86ni_ctrcbc_ctr,
582 	(void (*)(const br_block_ctrcbc_class *const *,
583 		void *, const void *, size_t))
584 		&br_aes_x86ni_ctrcbc_mac
585 };
586 
587 #else
588 
589 /* see bearssl_block.h */
590 const br_block_ctrcbc_class *
591 br_aes_x86ni_ctrcbc_get_vtable(void)
592 {
593 	return NULL;
594 }
595 
596 #endif
597