/*-
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by John-Mark Gurney
 * under sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <crypto/aesni/aesni.h>

#include <opencrypto/gmac.h>

#include "aesencdec.h"
#include <smmintrin.h>

MALLOC_DECLARE(M_AESNI);

struct blocks8 {
	__m128i	blk[8];
} __packed;

void
aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot, ivreg;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = _mm_loadu_si128((const __m128i *)iv);
	for (i = 0; i < len; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
		ivreg = tot;
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

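/*
 * Unlike CBC encryption, which is inherently serial (each block's input
 * depends on the previous ciphertext block), CBC decryption can be
 * parallelized: each plaintext block depends only on the current and
 * previous ciphertext blocks.  The routine below therefore decrypts the
 * bulk of the buffer eight blocks at a time, in place in buf.
 */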
void
aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i blocks[8];
	struct blocks8 *blks;
	__m128i ivreg, nextiv;
	size_t i, j, cnt;

	ivreg = _mm_loadu_si128((const __m128i *)iv);
	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (struct blocks8 *)buf;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], &blocks[0]);
		for (j = 0; j < 8; j++) {
			nextiv = blks->blk[j];
			blks->blk[j] = blocks[j] ^ ivreg;
			ivreg = nextiv;
		}
		buf += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		nextiv = _mm_loadu_si128((void *)buf);
		_mm_storeu_si128((void *)buf,
		    aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
		ivreg = nextiv;
		buf += AES_BLOCK_LEN;
	}
}

void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	const struct blocks8 *blks;
	struct blocks8 *top;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

/*
 * mixed endian increment, low 64bits stored in hi word to be compatible
 * with _icm's BSWAP.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	x = _mm_sub_epi64(x, t);

	return x;
}

void
aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i BSWAP_EPI64;
	__m128i tout[8];
	__m128i block;
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt, resid;

	BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);

	ctr1 = _mm_loadu_si128((const __m128i *)iv);
	ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr2 = nextc(ctr1);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		ctr3 = nextc(ctr2);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		ctr4 = nextc(ctr3);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		ctr5 = nextc(ctr4);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		ctr6 = nextc(ctr5);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		ctr7 = nextc(ctr6);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		ctr8 = nextc(ctr7);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
		ctr1 = nextc(ctr8);

		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
		    tmp5, tmp6, tmp7, tmp8, tout);

		top->blk[0] = blks->blk[0] ^ tout[0];
		top->blk[1] = blks->blk[1] ^ tout[1];
		top->blk[2] = blks->blk[2] ^ tout[2];
		top->blk[3] = blks->blk[3] ^ tout[3];
		top->blk[4] = blks->blk[4] ^ tout[4];
		top->blk[5] = blks->blk[5] ^ tout[5];
		top->blk[6] = blks->blk[6] ^ tout[6];
		top->blk[7] = blks->blk[7] ^ tout[7];

		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = nextc(ctr1);

		tot = aesni_enc(rounds - 1, key_schedule, tmp1);

		tot = tot ^ _mm_loadu_si128((const __m128i *)from);
		_mm_storeu_si128((__m128i *)to, tot);

		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}

	/*
	 * Handle remaining partial round.  Copy the remaining payload onto
	 * the stack to ensure that the full block can be loaded safely.
	 */
	resid = len % AES_BLOCK_LEN;
	if (resid != 0) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
		block = _mm_setzero_si128();
		memcpy(&block, from, resid);
		tot = tot ^ _mm_loadu_si128(&block);
		memcpy(to, &tot, resid);
		explicit_bzero(&block, sizeof(block));
	}
}

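/*
 * AES-XTS.  The initial tweak for a sector is E_k2(sector number); the
 * tweak for each subsequent 16-byte block is the previous tweak multiplied
 * by x (alpha) in GF(2^128).  xts_crank_lfsr() below performs that
 * multiplication with a shift and a conditional XOR against the reduction
 * polynomial 0x87.
 */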
#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

static inline __m128i
xts_crank_lfsr(__m128i inp)
{
	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
	__m128i xtweak, ret;

	/* set up xor mask */
	xtweak = _mm_shuffle_epi32(inp, 0x93);
	xtweak = _mm_srai_epi32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = _mm_slli_epi32(inp, 1);
	ret ^= xtweak;

	return ret;
}

static void
aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i block;

	block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;

	if (do_encrypt)
		block = aesni_enc(rounds - 1, key_schedule, block);
	else
		block = aesni_dec(rounds - 1, key_schedule, block);

	_mm_storeu_si128((__m128i *)to, block ^ *tweak);

	*tweak = xts_crank_lfsr(*tweak);
}

static void
aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i tmptweak;
	__m128i a, b, c, d, e, f, g, h;
	__m128i tweaks[8];
	__m128i tmp[8];
	__m128i *top;
	const __m128i *fromp;

	tmptweak = *tweak;

	/*
	 * unroll the loop.  This lets gcc put values directly in the
	 * register and saves memory accesses.
	 */
	fromp = (const __m128i *)from;
#define PREPINP(v, pos) 					\
		do {						\
			tweaks[(pos)] = tmptweak;		\
			(v) = _mm_loadu_si128(&fromp[pos]) ^	\
			    tmptweak;				\
			tmptweak = xts_crank_lfsr(tmptweak);	\
		} while (0)
	PREPINP(a, 0);
	PREPINP(b, 1);
	PREPINP(c, 2);
	PREPINP(d, 3);
	PREPINP(e, 4);
	PREPINP(f, 5);
	PREPINP(g, 6);
	PREPINP(h, 7);
	*tweak = tmptweak;

	if (do_encrypt)
		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);
	else
		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);

	top = (__m128i *)to;
	_mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
	_mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
	_mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
	_mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
	_mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
	_mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
	_mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
	_mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
}

static void
aesni_crypt_xts(int rounds, const __m128i *data_schedule,
    const __m128i *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	__m128i tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE / 8;
	for (i = 0; i < cnt; i++) {
		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE * 8;
		to += AES_XTS_BLOCKSIZE * 8;
	}
	i *= 8;
	cnt = len / AES_XTS_BLOCKSIZE;
	for (; i < cnt; i++) {
		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}

void
aesni_encrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 1);
}

void
aesni_decrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 0);
}

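/*
 * Expand the key schedules for a new session.  A decryption schedule is
 * only needed for modes that run the AES inverse cipher (CBC, ECB, XTS);
 * ICM, GCM and CCM use the encryption schedule in both directions.  For
 * XTS the supplied key contains both the data key and the tweak key, so
 * each half is expanded separately.
 */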
void
aesni_cipher_setup_common(struct aesni_session *ses,
    const struct crypto_session_params *csp, const uint8_t *key, int keylen)
{
	int decsched;

	decsched = 1;

	switch (csp->csp_cipher_alg) {
	case CRYPTO_AES_ICM:
	case CRYPTO_AES_NIST_GCM_16:
	case CRYPTO_AES_CCM_16:
		decsched = 0;
		break;
	}

	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
		keylen /= 2;

	switch (keylen * 8) {
	case 128:
		ses->rounds = AES128_ROUNDS;
		break;
	case 192:
		ses->rounds = AES192_ROUNDS;
		break;
	case 256:
		ses->rounds = AES256_ROUNDS;
		break;
	default:
		panic("shouldn't happen");
	}

	aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
	if (decsched)
		aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
		    ses->rounds);

	if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
		aesni_set_enckey(key + keylen, ses->xts_schedule,
		    ses->rounds);
}