xref: /freebsd/sys/crypto/aesni/aesni_wrap.c (revision faf25f48d601ae39f5752602f3020e2e92605625)
1 /*-
2  * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
3  * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
4  * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
5  * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
6  * Copyright (c) 2014 The FreeBSD Foundation
7  * All rights reserved.
8  *
9  * Portions of this software were developed by John-Mark Gurney
10  * under sponsorship of the FreeBSD Foundation and
11  * Rubicon Communications, LLC (Netgate).
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/param.h>
39 #include <sys/libkern.h>
40 #include <sys/malloc.h>
41 #include <sys/proc.h>
42 #include <sys/systm.h>
43 #include <crypto/aesni/aesni.h>
44 
45 #include <opencrypto/gmac.h>
46 
47 #include "aesencdec.h"
48 #include <smmintrin.h>
49 
50 MALLOC_DECLARE(M_AESNI);
51 
/*
 * Eight consecutive 16-byte blocks viewed as a single unit.  The bulk
 * loops in this file cast caller-supplied byte pointers to this type and
 * read or write the blk[] members directly.
 * NOTE(review): __packed is presumably here so that those accesses are not
 * assumed to be 16-byte aligned -- confirm against the compiler's handling.
 */
struct blocks8 {
	__m128i	blk[8];
} __packed;
55 
56 void
57 aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
58     const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
59 {
60 	__m128i tot, ivreg;
61 	size_t i;
62 
63 	len /= AES_BLOCK_LEN;
64 	ivreg = _mm_loadu_si128((const __m128i *)iv);
65 	for (i = 0; i < len; i++) {
66 		tot = aesni_enc(rounds - 1, key_schedule,
67 		    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
68 		ivreg = tot;
69 		_mm_storeu_si128((__m128i *)to, tot);
70 		from += AES_BLOCK_LEN;
71 		to += AES_BLOCK_LEN;
72 	}
73 }
74 
75 void
76 aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
77     uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
78 {
79 	__m128i blocks[8];
80 	struct blocks8 *blks;
81 	__m128i ivreg, nextiv;
82 	size_t i, j, cnt;
83 
84 	ivreg = _mm_loadu_si128((const __m128i *)iv);
85 	cnt = len / AES_BLOCK_LEN / 8;
86 	for (i = 0; i < cnt; i++) {
87 		blks = (struct blocks8 *)buf;
88 		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
89 		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
90 		    blks->blk[6], blks->blk[7], &blocks[0]);
91 		for (j = 0; j < 8; j++) {
92 			nextiv = blks->blk[j];
93 			blks->blk[j] = blocks[j] ^ ivreg;
94 			ivreg = nextiv;
95 		}
96 		buf += AES_BLOCK_LEN * 8;
97 	}
98 	i *= 8;
99 	cnt = len / AES_BLOCK_LEN;
100 	for (; i < cnt; i++) {
101 		nextiv = _mm_loadu_si128((void *)buf);
102 		_mm_storeu_si128((void *)buf,
103 		    aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
104 		ivreg = nextiv;
105 		buf += AES_BLOCK_LEN;
106 	}
107 }
108 
109 void
110 aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
111     const uint8_t *from, uint8_t *to)
112 {
113 	__m128i tot;
114 	__m128i tout[8];
115 	struct blocks8 *top;
116 	const struct blocks8 *blks;
117 	size_t i, cnt;
118 
119 	cnt = len / AES_BLOCK_LEN / 8;
120 	for (i = 0; i < cnt; i++) {
121 		blks = (const struct blocks8 *)from;
122 		top = (struct blocks8 *)to;
123 		aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
124 		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
125 		    blks->blk[6], blks->blk[7], tout);
126 		top->blk[0] = tout[0];
127 		top->blk[1] = tout[1];
128 		top->blk[2] = tout[2];
129 		top->blk[3] = tout[3];
130 		top->blk[4] = tout[4];
131 		top->blk[5] = tout[5];
132 		top->blk[6] = tout[6];
133 		top->blk[7] = tout[7];
134 		from += AES_BLOCK_LEN * 8;
135 		to += AES_BLOCK_LEN * 8;
136 	}
137 	i *= 8;
138 	cnt = len / AES_BLOCK_LEN;
139 	for (; i < cnt; i++) {
140 		tot = aesni_enc(rounds - 1, key_schedule,
141 		    _mm_loadu_si128((const __m128i *)from));
142 		_mm_storeu_si128((__m128i *)to, tot);
143 		from += AES_BLOCK_LEN;
144 		to += AES_BLOCK_LEN;
145 	}
146 }
147 
148 void
149 aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
150     const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
151 {
152 	__m128i tot;
153 	__m128i tout[8];
154 	const struct blocks8 *blks;
155 	struct blocks8 *top;
156 	size_t i, cnt;
157 
158 	cnt = len / AES_BLOCK_LEN / 8;
159 	for (i = 0; i < cnt; i++) {
160 		blks = (const struct blocks8 *)from;
161 		top = (struct blocks8 *)to;
162 		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
163 		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
164 		    blks->blk[6], blks->blk[7], tout);
165 		top->blk[0] = tout[0];
166 		top->blk[1] = tout[1];
167 		top->blk[2] = tout[2];
168 		top->blk[3] = tout[3];
169 		top->blk[4] = tout[4];
170 		top->blk[5] = tout[5];
171 		top->blk[6] = tout[6];
172 		top->blk[7] = tout[7];
173 		from += AES_BLOCK_LEN * 8;
174 		to += AES_BLOCK_LEN * 8;
175 	}
176 	i *= 8;
177 	cnt = len / AES_BLOCK_LEN;
178 	for (; i < cnt; i++) {
179 		tot = aesni_dec(rounds - 1, key_schedule,
180 		    _mm_loadu_si128((const __m128i *)from));
181 		_mm_storeu_si128((__m128i *)to, tot);
182 		from += AES_BLOCK_LEN;
183 		to += AES_BLOCK_LEN;
184 	}
185 }
186 
187 /*
188  * mixed endian increment, low 64bits stored in hi word to be compatible
189  * with _icm's BSWAP.
190  */
191 static inline __m128i
192 nextc(__m128i x)
193 {
194 	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
195 	const __m128i ZERO = _mm_setzero_si128();
196 
197 	x = _mm_add_epi64(x, ONE);
198 	__m128i t = _mm_cmpeq_epi64(x, ZERO);
199 	t = _mm_unpackhi_epi64(t, ZERO);
200 	x = _mm_sub_epi64(x, t);
201 
202 	return x;
203 }
204 
205 void
206 aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
207     const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
208 {
209 	__m128i tot;
210 	__m128i tmp1, tmp2, tmp3, tmp4;
211 	__m128i tmp5, tmp6, tmp7, tmp8;
212 	__m128i ctr1, ctr2, ctr3, ctr4;
213 	__m128i ctr5, ctr6, ctr7, ctr8;
214 	__m128i BSWAP_EPI64;
215 	__m128i tout[8];
216 	__m128i block;
217 	struct blocks8 *top;
218 	const struct blocks8 *blks;
219 	size_t i, cnt, resid;
220 
221 	BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
222 
223 	ctr1 = _mm_loadu_si128((const __m128i *)iv);
224 	ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
225 
226 	cnt = len / AES_BLOCK_LEN / 8;
227 	for (i = 0; i < cnt; i++) {
228 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
229 		ctr2 = nextc(ctr1);
230 		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
231 		ctr3 = nextc(ctr2);
232 		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
233 		ctr4 = nextc(ctr3);
234 		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
235 		ctr5 = nextc(ctr4);
236 		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
237 		ctr6 = nextc(ctr5);
238 		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
239 		ctr7 = nextc(ctr6);
240 		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
241 		ctr8 = nextc(ctr7);
242 		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
243 		ctr1 = nextc(ctr8);
244 
245 		blks = (const struct blocks8 *)from;
246 		top = (struct blocks8 *)to;
247 		aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
248 		    tmp5, tmp6, tmp7, tmp8, tout);
249 
250 		top->blk[0] = blks->blk[0] ^ tout[0];
251 		top->blk[1] = blks->blk[1] ^ tout[1];
252 		top->blk[2] = blks->blk[2] ^ tout[2];
253 		top->blk[3] = blks->blk[3] ^ tout[3];
254 		top->blk[4] = blks->blk[4] ^ tout[4];
255 		top->blk[5] = blks->blk[5] ^ tout[5];
256 		top->blk[6] = blks->blk[6] ^ tout[6];
257 		top->blk[7] = blks->blk[7] ^ tout[7];
258 
259 		from += AES_BLOCK_LEN * 8;
260 		to += AES_BLOCK_LEN * 8;
261 	}
262 	i *= 8;
263 	cnt = len / AES_BLOCK_LEN;
264 	for (; i < cnt; i++) {
265 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
266 		ctr1 = nextc(ctr1);
267 
268 		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
269 
270 		tot = tot ^ _mm_loadu_si128((const __m128i *)from);
271 		_mm_storeu_si128((__m128i *)to, tot);
272 
273 		from += AES_BLOCK_LEN;
274 		to += AES_BLOCK_LEN;
275 	}
276 
277 	/*
278 	 * Handle remaining partial round.  Copy the remaining payload onto the
279 	 * stack to ensure that the full block can be loaded safely.
280 	 */
281 	resid = len % AES_BLOCK_LEN;
282 	if (resid != 0) {
283 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
284 		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
285 		block = _mm_setzero_si128();
286 		memcpy(&block, from, resid);
287 		tot = tot ^ _mm_loadu_si128(&block);
288 		memcpy(to, &tot, resid);
289 		explicit_bzero(&block, sizeof(block));
290 	}
291 }
292 
/*
 * XTS parameters used by the routines below: the block size in bytes, the
 * number of IV bytes copied into the tweak by aesni_crypt_xts(), and the
 * feedback constant applied by xts_crank_lfsr().
 */
#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */
296 
297 static inline __m128i
298 xts_crank_lfsr(__m128i inp)
299 {
300 	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
301 	__m128i xtweak, ret;
302 
303 	/* set up xor mask */
304 	xtweak = _mm_shuffle_epi32(inp, 0x93);
305 	xtweak = _mm_srai_epi32(xtweak, 31);
306 	xtweak &= alphamask;
307 
308 	/* next term */
309 	ret = _mm_slli_epi32(inp, 1);
310 	ret ^= xtweak;
311 
312 	return ret;
313 }
314 
315 static void
316 aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
317     const uint8_t *from, uint8_t *to, int do_encrypt)
318 {
319 	__m128i block;
320 
321 	block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;
322 
323 	if (do_encrypt)
324 		block = aesni_enc(rounds - 1, key_schedule, block);
325 	else
326 		block = aesni_dec(rounds - 1, key_schedule, block);
327 
328 	_mm_storeu_si128((__m128i *)to, block ^ *tweak);
329 
330 	*tweak = xts_crank_lfsr(*tweak);
331 }
332 
333 static void
334 aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak,
335     const uint8_t *from, uint8_t *to, int do_encrypt)
336 {
337 	__m128i tmptweak;
338 	__m128i a, b, c, d, e, f, g, h;
339 	__m128i tweaks[8];
340 	__m128i tmp[8];
341 	__m128i *top;
342 	const __m128i *fromp;
343 
344 	tmptweak = *tweak;
345 
346 	/*
347 	 * unroll the loop.  This lets gcc put values directly in the
348 	 * register and saves memory accesses.
349 	 */
350 	fromp = (const __m128i *)from;
351 #define PREPINP(v, pos) 					\
352 		do {						\
353 			tweaks[(pos)] = tmptweak;		\
354 			(v) = _mm_loadu_si128(&fromp[pos]) ^	\
355 			    tmptweak;				\
356 			tmptweak = xts_crank_lfsr(tmptweak);	\
357 		} while (0)
358 	PREPINP(a, 0);
359 	PREPINP(b, 1);
360 	PREPINP(c, 2);
361 	PREPINP(d, 3);
362 	PREPINP(e, 4);
363 	PREPINP(f, 5);
364 	PREPINP(g, 6);
365 	PREPINP(h, 7);
366 	*tweak = tmptweak;
367 
368 	if (do_encrypt)
369 		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
370 		    tmp);
371 	else
372 		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
373 		    tmp);
374 
375 	top = (__m128i *)to;
376 	_mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
377 	_mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
378 	_mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
379 	_mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
380 	_mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
381 	_mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
382 	_mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
383 	_mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
384 }
385 
386 static void
387 aesni_crypt_xts(int rounds, const __m128i *data_schedule,
388     const __m128i *tweak_schedule, size_t len, const uint8_t *from,
389     uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
390 {
391 	__m128i tweakreg;
392 	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
393 	size_t i, cnt;
394 
395 	/*
396 	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
397 	 * of a 64-bit block number which we allow to be passed in directly.
398 	 */
399 #if BYTE_ORDER == LITTLE_ENDIAN
400 	bcopy(iv, tweak, AES_XTS_IVSIZE);
401 	/* Last 64 bits of IV are always zero. */
402 	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
403 #else
404 #error Only LITTLE_ENDIAN architectures are supported.
405 #endif
406 	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
407 	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);
408 
409 	cnt = len / AES_XTS_BLOCKSIZE / 8;
410 	for (i = 0; i < cnt; i++) {
411 		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
412 		    from, to, do_encrypt);
413 		from += AES_XTS_BLOCKSIZE * 8;
414 		to += AES_XTS_BLOCKSIZE * 8;
415 	}
416 	i *= 8;
417 	cnt = len / AES_XTS_BLOCKSIZE;
418 	for (; i < cnt; i++) {
419 		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
420 		    from, to, do_encrypt);
421 		from += AES_XTS_BLOCKSIZE;
422 		to += AES_XTS_BLOCKSIZE;
423 	}
424 }
425 
426 void
427 aesni_encrypt_xts(int rounds, const void *data_schedule,
428     const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
429     const uint8_t iv[static AES_BLOCK_LEN])
430 {
431 
432 	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
433 	    iv, 1);
434 }
435 
436 void
437 aesni_decrypt_xts(int rounds, const void *data_schedule,
438     const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
439     const uint8_t iv[static AES_BLOCK_LEN])
440 {
441 
442 	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
443 	    iv, 0);
444 }
445 
446 void
447 aesni_cipher_setup_common(struct aesni_session *ses,
448     const struct crypto_session_params *csp, const uint8_t *key, int keylen)
449 {
450 	int decsched;
451 
452 	decsched = 1;
453 
454 	switch (csp->csp_cipher_alg) {
455 	case CRYPTO_AES_ICM:
456 	case CRYPTO_AES_NIST_GCM_16:
457 	case CRYPTO_AES_CCM_16:
458 		decsched = 0;
459 		break;
460 	}
461 
462 	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
463 		keylen /= 2;
464 
465 	switch (keylen * 8) {
466 	case 128:
467 		ses->rounds = AES128_ROUNDS;
468 		break;
469 	case 192:
470 		ses->rounds = AES192_ROUNDS;
471 		break;
472 	case 256:
473 		ses->rounds = AES256_ROUNDS;
474 		break;
475 	default:
476 		panic("shouldn't happen");
477 	}
478 
479 	aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
480 	if (decsched)
481 		aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
482 		    ses->rounds);
483 
484 	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
485 		aesni_set_enckey(key + keylen, ses->xts_schedule,
486 		    ses->rounds);
487 }
488