/*-
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by John-Mark Gurney
 * under sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <crypto/aesni/aesni.h>

#include <opencrypto/gmac.h>

#include "aesencdec.h"
#include <smmintrin.h>

MALLOC_DECLARE(M_AESNI);

struct blocks8 {
	__m128i	blk[8];
} __packed;

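/*
 * CBC encryption is inherently serial: each plaintext block is XORed with
 * the previous ciphertext block before it is encrypted, so the blocks
 * cannot be processed eight at a time the way CBC decryption can below.
 */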
void
aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot, ivreg;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = _mm_loadu_si128((const __m128i *)iv);
	for (i = 0; i < len; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
		ivreg = tot;
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i blocks[8];
	struct blocks8 *blks;
	__m128i ivreg, nextiv;
	size_t i, j, cnt;

	ivreg = _mm_loadu_si128((const __m128i *)iv);
	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (struct blocks8 *)buf;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], &blocks[0]);
		for (j = 0; j < 8; j++) {
			nextiv = blks->blk[j];
			blks->blk[j] = blocks[j] ^ ivreg;
			ivreg = nextiv;
		}
		buf += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		nextiv = _mm_loadu_si128((void *)buf);
		_mm_storeu_si128((void *)buf,
		    aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
		ivreg = nextiv;
		buf += AES_BLOCK_LEN;
	}
}

void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	const struct blocks8 *blks;
	struct blocks8 *top;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

/*
 * Mixed-endian increment: the low 64 bits of the counter are stored in the
 * high qword so the result is compatible with _icm's BSWAP.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	x = _mm_sub_epi64(x, t);

	return x;
}

void
aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i BSWAP_EPI64;
	__m128i tout[8];
	__m128i block;
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt, resid;

	BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);

	ctr1 = _mm_loadu_si128((const __m128i *)iv);
	ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr2 = nextc(ctr1);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		ctr3 = nextc(ctr2);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		ctr4 = nextc(ctr3);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		ctr5 = nextc(ctr4);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		ctr6 = nextc(ctr5);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		ctr7 = nextc(ctr6);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		ctr8 = nextc(ctr7);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
		ctr1 = nextc(ctr8);

		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
		    tmp5, tmp6, tmp7, tmp8, tout);

		top->blk[0] = blks->blk[0] ^ tout[0];
		top->blk[1] = blks->blk[1] ^ tout[1];
		top->blk[2] = blks->blk[2] ^ tout[2];
		top->blk[3] = blks->blk[3] ^ tout[3];
		top->blk[4] = blks->blk[4] ^ tout[4];
		top->blk[5] = blks->blk[5] ^ tout[5];
		top->blk[6] = blks->blk[6] ^ tout[6];
		top->blk[7] = blks->blk[7] ^ tout[7];

		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = nextc(ctr1);

		tot = aesni_enc(rounds - 1, key_schedule, tmp1);

		tot = tot ^ _mm_loadu_si128((const __m128i *)from);
		_mm_storeu_si128((__m128i *)to, tot);

		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}

	/*
	 * Handle the remaining partial block.  Copy the remaining payload
	 * onto the stack to ensure that the full block can be loaded safely.
	 */
	resid = len % AES_BLOCK_LEN;
	if (resid != 0) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
		block = _mm_setzero_si128();
		memcpy(&block, from, resid);
		tot = tot ^ _mm_loadu_si128(&block);
		memcpy(to, &tot, resid);
		explicit_bzero(&block, sizeof(block));
	}
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

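/*
 * Advance the XTS tweak to the one for the next block, i.e. multiply it by
 * x in GF(2^128).  Each 32-bit lane is shifted left by one; the bit shifted
 * out of a lane is re-injected as the low bit of the lane above it, and the
 * bit shifted out of the top of the tweak is folded back into the bottom as
 * AES_XTS_ALPHA (0x87), the low terms of the XTS reduction polynomial.
 */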
static inline __m128i
xts_crank_lfsr(__m128i inp)
{
	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
	__m128i xtweak, ret;

	/* set up xor mask */
	xtweak = _mm_shuffle_epi32(inp, 0x93);
	xtweak = _mm_srai_epi32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = _mm_slli_epi32(inp, 1);
	ret ^= xtweak;

	return ret;
}

static void
aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i block;

	block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;

	if (do_encrypt)
		block = aesni_enc(rounds - 1, key_schedule, block);
	else
		block = aesni_dec(rounds - 1, key_schedule, block);

	_mm_storeu_si128((__m128i *)to, block ^ *tweak);

	*tweak = xts_crank_lfsr(*tweak);
}

static void
aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i tmptweak;
	__m128i a, b, c, d, e, f, g, h;
	__m128i tweaks[8];
	__m128i tmp[8];
	__m128i *top;
	const __m128i *fromp;

	tmptweak = *tweak;

	/*
	 * Unroll the loop.  This lets gcc keep the values directly in
	 * registers and saves memory accesses.
	 */
	fromp = (const __m128i *)from;
#define PREPINP(v, pos)						\
		do {						\
			tweaks[(pos)] = tmptweak;		\
			(v) = _mm_loadu_si128(&fromp[pos]) ^	\
			    tmptweak;				\
			tmptweak = xts_crank_lfsr(tmptweak);	\
		} while (0)
	PREPINP(a, 0);
	PREPINP(b, 1);
	PREPINP(c, 2);
	PREPINP(d, 3);
	PREPINP(e, 4);
	PREPINP(f, 5);
	PREPINP(g, 6);
	PREPINP(h, 7);
	*tweak = tmptweak;

	if (do_encrypt)
		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);
	else
		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);

	top = (__m128i *)to;
	_mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
	_mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
	_mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
	_mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
	_mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
	_mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
	_mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
	_mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
}

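/*
 * Common XTS path: derive the initial tweak by encrypting the block number
 * under the tweak key, then process the payload eight blocks at a time,
 * falling back to single blocks for the tail.  Trailing bytes that do not
 * fill a whole block are not handled here (no ciphertext stealing).
 */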
static void
aesni_crypt_xts(int rounds, const __m128i *data_schedule,
    const __m128i *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	__m128i tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare tweak as E_k2(IV).  IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE / 8;
	for (i = 0; i < cnt; i++) {
		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE * 8;
		to += AES_XTS_BLOCKSIZE * 8;
	}
	i *= 8;
	cnt = len / AES_XTS_BLOCKSIZE;
	for (; i < cnt; i++) {
		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}

void
aesni_encrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 1);
}

void
aesni_decrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 0);
}

void
aesni_cipher_setup_common(struct aesni_session *ses,
    const struct crypto_session_params *csp, const uint8_t *key, int keylen)
{
	int decsched;

	decsched = 1;

	switch (csp->csp_cipher_alg) {
	case CRYPTO_AES_ICM:
	case CRYPTO_AES_NIST_GCM_16:
	case CRYPTO_AES_CCM_16:
		/*
		 * Counter-based modes only run the block cipher in the
		 * forward direction, so no decryption schedule is needed.
		 */
		decsched = 0;
		break;
	}

	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
		keylen /= 2;

	switch (keylen * 8) {
	case 128:
		ses->rounds = AES128_ROUNDS;
		break;
	case 192:
		ses->rounds = AES192_ROUNDS;
		break;
	case 256:
		ses->rounds = AES256_ROUNDS;
		break;
	default:
		panic("shouldn't happen");
	}

	aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
	if (decsched)
		aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
		    ses->rounds);

	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
		aesni_set_enckey(key + keylen, ses->xts_schedule,
		    ses->rounds);
}