1 /*- 2 * Copyright (C) 2008 Damien Miller <djm@mindrot.org> 3 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org> 4 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net> 5 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org> 6 * Copyright (c) 2014 The FreeBSD Foundation 7 * All rights reserved. 8 * 9 * Portions of this software were developed by John-Mark Gurney 10 * under sponsorship of the FreeBSD Foundation and 11 * Rubicon Communications, LLC (Netgate). 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include <sys/param.h> 39 #include <sys/libkern.h> 40 #include <sys/malloc.h> 41 #include <sys/proc.h> 42 #include <sys/systm.h> 43 #include <crypto/aesni/aesni.h> 44 45 #include <opencrypto/gmac.h> 46 47 #include "aesencdec.h" 48 #include <smmintrin.h> 49 50 MALLOC_DECLARE(M_AESNI); 51 52 struct blocks8 { 53 __m128i blk[8]; 54 } __packed; 55 56 void 57 aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len, 58 const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN]) 59 { 60 __m128i tot, ivreg; 61 size_t i; 62 63 len /= AES_BLOCK_LEN; 64 ivreg = _mm_loadu_si128((const __m128i *)iv); 65 for (i = 0; i < len; i++) { 66 tot = aesni_enc(rounds - 1, key_schedule, 67 _mm_loadu_si128((const __m128i *)from) ^ ivreg); 68 ivreg = tot; 69 _mm_storeu_si128((__m128i *)to, tot); 70 from += AES_BLOCK_LEN; 71 to += AES_BLOCK_LEN; 72 } 73 } 74 75 void 76 aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len, 77 uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN]) 78 { 79 __m128i blocks[8]; 80 struct blocks8 *blks; 81 __m128i ivreg, nextiv; 82 size_t i, j, cnt; 83 84 ivreg = _mm_loadu_si128((const __m128i *)iv); 85 cnt = len / AES_BLOCK_LEN / 8; 86 for (i = 0; i < cnt; i++) { 87 blks = (struct blocks8 *)buf; 88 aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], 89 blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], 90 blks->blk[6], blks->blk[7], &blocks[0]); 91 for (j = 0; j < 8; j++) { 92 nextiv = blks->blk[j]; 93 blks->blk[j] = blocks[j] ^ ivreg; 94 ivreg = nextiv; 95 } 96 buf += AES_BLOCK_LEN * 8; 97 } 98 i *= 8; 99 cnt = len / AES_BLOCK_LEN; 100 for (; i < cnt; i++) { 101 nextiv = _mm_loadu_si128((void *)buf); 102 _mm_storeu_si128((void *)buf, 103 aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg); 104 ivreg = nextiv; 105 buf += AES_BLOCK_LEN; 106 } 107 } 108 109 void 110 aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len, 111 const uint8_t *from, uint8_t *to) 112 { 113 __m128i tot; 114 __m128i tout[8]; 115 struct blocks8 *top; 116 const struct blocks8 *blks; 117 size_t i, cnt; 118 119 cnt = len / AES_BLOCK_LEN / 8; 120 for (i = 0; i < cnt; i++) { 121 blks = (const struct blocks8 *)from; 122 top = (struct blocks8 *)to; 123 aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], 124 blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], 125 blks->blk[6], blks->blk[7], tout); 126 top->blk[0] = tout[0]; 127 top->blk[1] = tout[1]; 128 top->blk[2] = tout[2]; 129 top->blk[3] = tout[3]; 130 top->blk[4] = tout[4]; 131 top->blk[5] = tout[5]; 132 top->blk[6] = tout[6]; 133 top->blk[7] = tout[7]; 134 from += AES_BLOCK_LEN * 8; 135 to += AES_BLOCK_LEN * 8; 136 } 137 i *= 8; 138 cnt = len / AES_BLOCK_LEN; 139 for (; i < cnt; i++) { 140 tot = aesni_enc(rounds - 1, key_schedule, 141 _mm_loadu_si128((const __m128i *)from)); 142 _mm_storeu_si128((__m128i *)to, tot); 143 from += AES_BLOCK_LEN; 144 to += AES_BLOCK_LEN; 145 } 146 } 147 148 void 149 aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len, 150 const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]) 151 { 152 __m128i tot; 153 __m128i tout[8]; 154 const struct blocks8 *blks; 155 struct blocks8 *top; 156 size_t i, cnt; 157 158 cnt = len / AES_BLOCK_LEN / 8; 159 for (i = 0; i < cnt; i++) { 160 blks = (const struct blocks8 *)from; 161 top = (struct blocks8 *)to; 162 aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], 163 blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], 164 blks->blk[6], blks->blk[7], tout); 165 top->blk[0] = tout[0]; 166 top->blk[1] = tout[1]; 167 top->blk[2] = tout[2]; 168 top->blk[3] = tout[3]; 169 top->blk[4] = tout[4]; 170 top->blk[5] = tout[5]; 171 top->blk[6] = tout[6]; 172 top->blk[7] = tout[7]; 173 from += AES_BLOCK_LEN * 8; 174 to += AES_BLOCK_LEN * 8; 175 } 176 i *= 8; 177 cnt = len / AES_BLOCK_LEN; 178 for (; i < cnt; i++) { 179 tot = aesni_dec(rounds - 1, key_schedule, 180 _mm_loadu_si128((const __m128i *)from)); 181 _mm_storeu_si128((__m128i *)to, tot); 182 from += AES_BLOCK_LEN; 183 to += AES_BLOCK_LEN; 184 } 185 } 186 187 /* 188 * mixed endian increment, low 64bits stored in hi word to be compatible 189 * with _icm's BSWAP. 190 */ 191 static inline __m128i 192 nextc(__m128i x) 193 { 194 const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0); 195 const __m128i ZERO = _mm_setzero_si128(); 196 197 x = _mm_add_epi64(x, ONE); 198 __m128i t = _mm_cmpeq_epi64(x, ZERO); 199 t = _mm_unpackhi_epi64(t, ZERO); 200 x = _mm_sub_epi64(x, t); 201 202 return x; 203 } 204 205 void 206 aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len, 207 const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN]) 208 { 209 __m128i tot; 210 __m128i tmp1, tmp2, tmp3, tmp4; 211 __m128i tmp5, tmp6, tmp7, tmp8; 212 __m128i ctr1, ctr2, ctr3, ctr4; 213 __m128i ctr5, ctr6, ctr7, ctr8; 214 __m128i BSWAP_EPI64; 215 __m128i tout[8]; 216 struct blocks8 *top; 217 const struct blocks8 *blks; 218 size_t i, cnt; 219 220 BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); 221 222 ctr1 = _mm_loadu_si128((__m128i*)iv); 223 ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 224 225 cnt = len / AES_BLOCK_LEN / 8; 226 for (i = 0; i < cnt; i++) { 227 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 228 ctr2 = nextc(ctr1); 229 tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); 230 ctr3 = nextc(ctr2); 231 tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); 232 ctr4 = nextc(ctr3); 233 tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64); 234 ctr5 = nextc(ctr4); 235 tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64); 236 ctr6 = nextc(ctr5); 237 tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64); 238 ctr7 = nextc(ctr6); 239 tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64); 240 ctr8 = nextc(ctr7); 241 tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64); 242 ctr1 = nextc(ctr8); 243 244 blks = (const struct blocks8 *)from; 245 top = (struct blocks8 *)to; 246 aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4, 247 tmp5, tmp6, tmp7, tmp8, tout); 248 249 top->blk[0] = blks->blk[0] ^ tout[0]; 250 top->blk[1] = blks->blk[1] ^ tout[1]; 251 top->blk[2] = blks->blk[2] ^ tout[2]; 252 top->blk[3] = blks->blk[3] ^ tout[3]; 253 top->blk[4] = blks->blk[4] ^ tout[4]; 254 top->blk[5] = blks->blk[5] ^ tout[5]; 255 top->blk[6] = blks->blk[6] ^ tout[6]; 256 top->blk[7] = blks->blk[7] ^ tout[7]; 257 258 from += AES_BLOCK_LEN * 8; 259 to += AES_BLOCK_LEN * 8; 260 } 261 i *= 8; 262 cnt = len / AES_BLOCK_LEN; 263 for (; i < cnt; i++) { 264 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 265 ctr1 = nextc(ctr1); 266 267 tot = aesni_enc(rounds - 1, key_schedule, tmp1); 268 269 tot = tot ^ _mm_loadu_si128((const __m128i *)from); 270 _mm_storeu_si128((__m128i *)to, tot); 271 272 from += AES_BLOCK_LEN; 273 to += AES_BLOCK_LEN; 274 } 275 276 /* handle remaining partial round */ 277 if (len % AES_BLOCK_LEN != 0) { 278 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 279 tot = aesni_enc(rounds - 1, key_schedule, tmp1); 280 tot = tot ^ _mm_loadu_si128((const __m128i *)from); 281 memcpy(to, &tot, len % AES_BLOCK_LEN); 282 } 283 } 284 285 #define AES_XTS_BLOCKSIZE 16 286 #define AES_XTS_IVSIZE 8 287 #define AES_XTS_ALPHA 0x87 /* GF(2^128) generator polynomial */ 288 289 static inline __m128i 290 xts_crank_lfsr(__m128i inp) 291 { 292 const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA); 293 __m128i xtweak, ret; 294 295 /* set up xor mask */ 296 xtweak = _mm_shuffle_epi32(inp, 0x93); 297 xtweak = _mm_srai_epi32(xtweak, 31); 298 xtweak &= alphamask; 299 300 /* next term */ 301 ret = _mm_slli_epi32(inp, 1); 302 ret ^= xtweak; 303 304 return ret; 305 } 306 307 static void 308 aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak, 309 const uint8_t *from, uint8_t *to, int do_encrypt) 310 { 311 __m128i block; 312 313 block = _mm_loadu_si128((const __m128i *)from) ^ *tweak; 314 315 if (do_encrypt) 316 block = aesni_enc(rounds - 1, key_schedule, block); 317 else 318 block = aesni_dec(rounds - 1, key_schedule, block); 319 320 _mm_storeu_si128((__m128i *)to, block ^ *tweak); 321 322 *tweak = xts_crank_lfsr(*tweak); 323 } 324 325 static void 326 aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak, 327 const uint8_t *from, uint8_t *to, int do_encrypt) 328 { 329 __m128i tmptweak; 330 __m128i a, b, c, d, e, f, g, h; 331 __m128i tweaks[8]; 332 __m128i tmp[8]; 333 __m128i *top; 334 const __m128i *fromp; 335 336 tmptweak = *tweak; 337 338 /* 339 * unroll the loop. This lets gcc put values directly in the 340 * register and saves memory accesses. 341 */ 342 fromp = (const __m128i *)from; 343 #define PREPINP(v, pos) \ 344 do { \ 345 tweaks[(pos)] = tmptweak; \ 346 (v) = _mm_loadu_si128(&fromp[pos]) ^ \ 347 tmptweak; \ 348 tmptweak = xts_crank_lfsr(tmptweak); \ 349 } while (0) 350 PREPINP(a, 0); 351 PREPINP(b, 1); 352 PREPINP(c, 2); 353 PREPINP(d, 3); 354 PREPINP(e, 4); 355 PREPINP(f, 5); 356 PREPINP(g, 6); 357 PREPINP(h, 7); 358 *tweak = tmptweak; 359 360 if (do_encrypt) 361 aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, 362 tmp); 363 else 364 aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, 365 tmp); 366 367 top = (__m128i *)to; 368 _mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]); 369 _mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]); 370 _mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]); 371 _mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]); 372 _mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]); 373 _mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]); 374 _mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]); 375 _mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]); 376 } 377 378 static void 379 aesni_crypt_xts(int rounds, const __m128i *data_schedule, 380 const __m128i *tweak_schedule, size_t len, const uint8_t *from, 381 uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt) 382 { 383 __m128i tweakreg; 384 uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16); 385 size_t i, cnt; 386 387 /* 388 * Prepare tweak as E_k2(IV). IV is specified as LE representation 389 * of a 64-bit block number which we allow to be passed in directly. 390 */ 391 #if BYTE_ORDER == LITTLE_ENDIAN 392 bcopy(iv, tweak, AES_XTS_IVSIZE); 393 /* Last 64 bits of IV are always zero. */ 394 bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE); 395 #else 396 #error Only LITTLE_ENDIAN architectures are supported. 397 #endif 398 tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]); 399 tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg); 400 401 cnt = len / AES_XTS_BLOCKSIZE / 8; 402 for (i = 0; i < cnt; i++) { 403 aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg, 404 from, to, do_encrypt); 405 from += AES_XTS_BLOCKSIZE * 8; 406 to += AES_XTS_BLOCKSIZE * 8; 407 } 408 i *= 8; 409 cnt = len / AES_XTS_BLOCKSIZE; 410 for (; i < cnt; i++) { 411 aesni_crypt_xts_block(rounds, data_schedule, &tweakreg, 412 from, to, do_encrypt); 413 from += AES_XTS_BLOCKSIZE; 414 to += AES_XTS_BLOCKSIZE; 415 } 416 } 417 418 void 419 aesni_encrypt_xts(int rounds, const void *data_schedule, 420 const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, 421 const uint8_t iv[static AES_BLOCK_LEN]) 422 { 423 424 aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to, 425 iv, 1); 426 } 427 428 void 429 aesni_decrypt_xts(int rounds, const void *data_schedule, 430 const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, 431 const uint8_t iv[static AES_BLOCK_LEN]) 432 { 433 434 aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to, 435 iv, 0); 436 } 437 438 int 439 aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key, 440 int keylen) 441 { 442 int decsched; 443 444 decsched = 1; 445 446 switch (ses->algo) { 447 case CRYPTO_AES_ICM: 448 case CRYPTO_AES_NIST_GCM_16: 449 decsched = 0; 450 /* FALLTHROUGH */ 451 case CRYPTO_AES_CBC: 452 switch (keylen) { 453 case 128: 454 ses->rounds = AES128_ROUNDS; 455 break; 456 case 192: 457 ses->rounds = AES192_ROUNDS; 458 break; 459 case 256: 460 ses->rounds = AES256_ROUNDS; 461 break; 462 default: 463 CRYPTDEB("invalid CBC/ICM/GCM key length"); 464 return (EINVAL); 465 } 466 break; 467 case CRYPTO_AES_XTS: 468 switch (keylen) { 469 case 256: 470 ses->rounds = AES128_ROUNDS; 471 break; 472 case 512: 473 ses->rounds = AES256_ROUNDS; 474 break; 475 default: 476 CRYPTDEB("invalid XTS key length"); 477 return (EINVAL); 478 } 479 break; 480 default: 481 return (EINVAL); 482 } 483 484 aesni_set_enckey(key, ses->enc_schedule, ses->rounds); 485 if (decsched) 486 aesni_set_deckey(ses->enc_schedule, ses->dec_schedule, 487 ses->rounds); 488 489 if (ses->algo == CRYPTO_AES_XTS) 490 aesni_set_enckey(key + keylen / 16, ses->xts_schedule, 491 ses->rounds); 492 493 return (0); 494 } 495