/*-
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by John-Mark Gurney
 * under sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <crypto/aesni/aesni.h>

#include <opencrypto/gmac.h>

#include "aesencdec.h"
#include <smmintrin.h>

MALLOC_DECLARE(M_AESNI);

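/*
 * Eight contiguous AES blocks viewed as __m128i values.  The structure is
 * marked __packed, presumably so that it can be overlaid on input/output
 * buffers that are not 16-byte aligned and handed to the eight-wide
 * aesni_enc8()/aesni_dec8() paths below.
 */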
struct blocks8 {
	__m128i	blk[8];
} __packed;

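/*
 * CBC encryption is inherently serial: each ciphertext block becomes the
 * IV for the following block, so blocks are processed one at a time.
 */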
void
aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot, ivreg;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = _mm_loadu_si128((const __m128i *)iv);
	for (i = 0; i < len; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
		ivreg = tot;
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

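/*
 * CBC decryption has no dependency between output blocks, so eight blocks
 * are decrypted at a time with aesni_dec8() and then XORed with the
 * preceding ciphertext blocks; the tail loop handles the remaining whole
 * blocks one at a time.  Decryption is done in place in `buf'.
 */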
void
aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i blocks[8];
	struct blocks8 *blks;
	__m128i ivreg, nextiv;
	size_t i, j, cnt;

	ivreg = _mm_loadu_si128((const __m128i *)iv);
	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (struct blocks8 *)buf;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], &blocks[0]);
		for (j = 0; j < 8; j++) {
			nextiv = blks->blk[j];
			blks->blk[j] = blocks[j] ^ ivreg;
			ivreg = nextiv;
		}
		buf += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		nextiv = _mm_loadu_si128((void *)buf);
		_mm_storeu_si128((void *)buf,
		    aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
		ivreg = nextiv;
		buf += AES_BLOCK_LEN;
	}
}

void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	const struct blocks8 *blks;
	struct blocks8 *top;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

/*
 * Mixed-endian counter increment: the low 64 bits of the counter are kept
 * in the high 64-bit lane so that the value stays compatible with the
 * BSWAP_EPI64 shuffle used by _icm below.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
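	/*
	 * If the 64-bit lane holding the low counter word wrapped to zero,
	 * _mm_cmpeq_epi64() yields an all-ones mask there; moving that mask
	 * into the other lane and subtracting it propagates the carry into
	 * the high counter word.
	 */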
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	x = _mm_sub_epi64(x, t);

	return x;
}

void
aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i BSWAP_EPI64;
	__m128i tout[8];
	__m128i block;
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt, resid;

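	/*
	 * Shuffle mask that byte-swaps each 64-bit half of a 128-bit value,
	 * converting between the byte order of the IV/keystream blocks and
	 * the mixed-endian counter form that nextc() increments.
	 */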
	BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);

	ctr1 = _mm_loadu_si128((const __m128i *)iv);
	ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr2 = nextc(ctr1);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		ctr3 = nextc(ctr2);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		ctr4 = nextc(ctr3);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		ctr5 = nextc(ctr4);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		ctr6 = nextc(ctr5);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		ctr7 = nextc(ctr6);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		ctr8 = nextc(ctr7);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
		ctr1 = nextc(ctr8);

		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
		    tmp5, tmp6, tmp7, tmp8, tout);

		top->blk[0] = blks->blk[0] ^ tout[0];
		top->blk[1] = blks->blk[1] ^ tout[1];
		top->blk[2] = blks->blk[2] ^ tout[2];
		top->blk[3] = blks->blk[3] ^ tout[3];
		top->blk[4] = blks->blk[4] ^ tout[4];
		top->blk[5] = blks->blk[5] ^ tout[5];
		top->blk[6] = blks->blk[6] ^ tout[6];
		top->blk[7] = blks->blk[7] ^ tout[7];

		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = nextc(ctr1);

		tot = aesni_enc(rounds - 1, key_schedule, tmp1);

		tot = tot ^ _mm_loadu_si128((const __m128i *)from);
		_mm_storeu_si128((__m128i *)to, tot);

		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}

	/*
	 * Handle a remaining partial block.  Copy the leftover payload onto
	 * the stack so that a full 16-byte block can be loaded and stored
	 * safely.
	 */
	resid = len % AES_BLOCK_LEN;
	if (resid != 0) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
		block = _mm_setzero_si128();
		memcpy(&block, from, resid);
		tot = tot ^ _mm_loadu_si128(&block);
		memcpy(to, &tot, resid);
		explicit_bzero(&block, sizeof(block));
	}
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

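/*
 * Advance the XTS tweak by one block: multiply it by alpha (the polynomial
 * x) in GF(2^128).  The 128-bit left shift is performed as four 32-bit lane
 * shifts; the shuffle/srai/and sequence regenerates the bits carried across
 * lane boundaries and folds the bit shifted out of the top back in as the
 * 0x87 reduction term.
 */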
static inline __m128i
xts_crank_lfsr(__m128i inp)
{
	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
	__m128i xtweak, ret;

	/* set up xor mask */
	xtweak = _mm_shuffle_epi32(inp, 0x93);
	xtweak = _mm_srai_epi32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = _mm_slli_epi32(inp, 1);
	ret ^= xtweak;

	return ret;
}

static void
aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i block;

	block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;

	if (do_encrypt)
		block = aesni_enc(rounds - 1, key_schedule, block);
	else
		block = aesni_dec(rounds - 1, key_schedule, block);

	_mm_storeu_si128((__m128i *)to, block ^ *tweak);

	*tweak = xts_crank_lfsr(*tweak);
}

static void
aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i tmptweak;
	__m128i a, b, c, d, e, f, g, h;
	__m128i tweaks[8];
	__m128i tmp[8];
	__m128i *top;
	const __m128i *fromp;

	tmptweak = *tweak;

	/*
	 * Unroll the loop.  This lets the compiler keep the values directly
	 * in registers and saves memory accesses.
	 */
	fromp = (const __m128i *)from;
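	/*
	 * PREPINP() records the tweak for block `pos', XORs the input block
	 * with it, and advances the tweak for the next block.
	 */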
#define	PREPINP(v, pos) 					\
		do {						\
			tweaks[(pos)] = tmptweak;		\
			(v) = _mm_loadu_si128(&fromp[pos]) ^	\
			    tmptweak;				\
			tmptweak = xts_crank_lfsr(tmptweak);	\
		} while (0)
	PREPINP(a, 0);
	PREPINP(b, 1);
	PREPINP(c, 2);
	PREPINP(d, 3);
	PREPINP(e, 4);
	PREPINP(f, 5);
	PREPINP(g, 6);
	PREPINP(h, 7);
	*tweak = tmptweak;

	if (do_encrypt)
		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);
	else
		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);

	top = (__m128i *)to;
	_mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
	_mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
	_mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
	_mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
	_mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
	_mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
	_mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
	_mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
}

static void
aesni_crypt_xts(int rounds, const __m128i *data_schedule,
    const __m128i *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	__m128i tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare tweak as E_k2(IV).  IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE / 8;
	for (i = 0; i < cnt; i++) {
		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE * 8;
		to += AES_XTS_BLOCKSIZE * 8;
	}
	i *= 8;
	cnt = len / AES_XTS_BLOCKSIZE;
	for (; i < cnt; i++) {
		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}

void
aesni_encrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 1);
}

void
aesni_decrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 0);
}

void
aesni_cipher_setup_common(struct aesni_session *ses,
    const struct crypto_session_params *csp, const uint8_t *key, int keylen)
{
	int decsched;

	decsched = 1;

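	/*
	 * Counter-based modes (ICM, GCM, CCM) use the forward cipher for
	 * both encryption and decryption, so they do not need a decryption
	 * key schedule.
	 */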
	switch (csp->csp_cipher_alg) {
	case CRYPTO_AES_ICM:
	case CRYPTO_AES_NIST_GCM_16:
	case CRYPTO_AES_CCM_16:
		decsched = 0;
		break;
	}

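	/*
	 * An XTS key is two AES keys of equal length concatenated: the data
	 * key (first half) and the tweak key (second half).  Halve the
	 * supplied key length so the round count below reflects one key.
	 */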
	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
		keylen /= 2;

	switch (keylen * 8) {
	case 128:
		ses->rounds = AES128_ROUNDS;
		break;
	case 192:
		ses->rounds = AES192_ROUNDS;
		break;
	case 256:
		ses->rounds = AES256_ROUNDS;
		break;
	default:
		panic("shouldn't happen");
	}

	aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
	if (decsched)
		aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
		    ses->rounds);

	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
		aesni_set_enckey(key + keylen, ses->xts_schedule,
		    ses->rounds);
}