/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file is derived from aesni_wrap.c:
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 */

/*
 * This code is built with floating-point enabled.  Make sure to have entered
 * a floating-point context before calling any of these functions.
 */
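
/*
 * A caller typically does that with fpu_kern_enter()/fpu_kern_leave().
 * This is only a sketch; the exact flags depend on the calling context:
 *
 *	fpu_kern_enter(curthread, NULL, FPU_KERN_NORMAL | FPU_KERN_NOCTX);
 *	armv8_aes_encrypt_cbc(key, len, fromc, toc, iv);
 *	fpu_kern_leave(curthread, NULL);
 */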

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>
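
/*
 * AES round helpers built on the ARMv8 Cryptography Extensions intrinsics.
 * Callers pass aes_rounds - 1 as "rounds", so key schedule entries 0 through
 * rounds + 1 are used (the standard Nr + 1 round keys).  Each loop iteration
 * performs two AESE/AESMC (or AESD/AESIMC) pairs; the tail handles the last
 * two rounds, where the final round skips MixColumns and applies the last
 * round key with a plain XOR, matching the structure of the AES final round.
 */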

static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaeseq_u8(tmp, keysched[i]);
		tmp = vaesmcq_u8(tmp);
		tmp = vaeseq_u8(tmp, keysched[i + 1]);
		tmp = vaesmcq_u8(tmp);
	}

	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesmcq_u8(tmp);
	tmp = vaeseq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaesdq_u8(tmp, keysched[i]);
		tmp = vaesimcq_u8(tmp);
		tmp = vaesdq_u8(tmp, keysched[i + 1]);
		tmp = vaesimcq_u8(tmp);
	}

	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesimcq_u8(tmp);
	tmp = vaesdq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t tot, ivreg, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			tmp = vld1q_u8(block);
			tot = armv8_aes_enc(key->aes_rounds - 1,
			    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
			ivreg = tot;
			vst1q_u8(block, tot);
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				tmp = vld1q_u8(from);
				tot = armv8_aes_enc(key->aes_rounds - 1,
				    (const void *)key->aes_key,
				    veorq_u8(tmp, ivreg));
				ivreg = tot;
				vst1q_u8(to, tot);
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t ivreg, nextiv, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			nextiv = vld1q_u8(block);
			tmp = armv8_aes_dec(key->aes_rounds - 1,
			    (const void *)key->aes_key, nextiv);
			vst1q_u8(block, veorq_u8(tmp, ivreg));
			ivreg = nextiv;
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				nextiv = vld1q_u8(from);
				tmp = armv8_aes_dec(key->aes_rounds - 1,
				    (const void *)key->aes_key, nextiv);
				vst1q_u8(to, veorq_u8(tmp, ivreg));
				ivreg = nextiv;
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			crypto_cursor_advance(fromc, oseglen - seglen);
			crypto_cursor_advance(toc, oseglen - seglen);
			seglen = oseglen - seglen;
		}
	}

	explicit_bzero(block, sizeof(block));
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */
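
/*
 * Advance the XTS tweak by one block: multiply the 128-bit tweak by x in
 * GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.  Working on four 32-bit
 * lanes, each lane is shifted left by one and the carry out of each lane
 * (its former sign bit) is injected into the next lane; the carry out of
 * the most significant lane is reduced back into the low lane as 0x87.
 */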

static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t xtweak, ret;

	/* set up xor mask */
	xtweak = vextq_s32(inp, inp, 3);
	xtweak = vshrq_n_s32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = vshlq_n_s32(inp, 1);
	ret ^= xtweak;

	return (ret);
}

static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t block;

	block = vld1q_u8(from) ^ *tweak;

	if (do_encrypt)
		block = armv8_aes_enc(rounds - 1, key_schedule, block);
	else
		block = armv8_aes_dec(rounds - 1, key_schedule, block);

	vst1q_u8(to, block ^ *tweak);

	*tweak = vreinterpretq_u8_s32(
	    xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8x16_t tweakreg;
	uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_XTS_BLOCKSIZE == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	/*
	 * Prepare the tweak as E_k2(IV).  The IV is passed in directly as the
	 * little-endian representation of a 64-bit block number.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = vld1q_u8(tweak);
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_XTS_BLOCKSIZE) {
			crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
			armv8_aes_crypt_xts_block(rounds, data_schedule,
			    &tweakreg, block, block, do_encrypt);
			crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
			seglen = AES_XTS_BLOCKSIZE;
		} else {
			for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
			    seglen -= AES_XTS_BLOCKSIZE) {
				armv8_aes_crypt_xts_block(rounds, data_schedule,
				    &tweakreg, from, to, do_encrypt);
				from += AES_XTS_BLOCKSIZE;
				to += AES_XTS_BLOCKSIZE;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
    struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 0);
}

#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		    pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)

struct armv8_gcm_state {
	__uint128_val_t EK0;
	__uint128_val_t EKi;
	__uint128_val_t Xi;
	__uint128_val_t lenblock;
	uint8_t aes_counter[AES_BLOCK_LEN];
};
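
/*
 * Initialize the GCM state: build the initial counter block from the 96-bit
 * IV, save EK0 (the encryption of counter value 1), which later masks the
 * final tag, and fold the additional authenticated data into the GHASH
 * accumulator Xi, zero-padding any trailing partial block.
 */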

static void
armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint8_t *authdata, size_t authdatalen,
    const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
{
	uint8_t block[AES_BLOCK_LEN];
	size_t trailer;

	bzero(s->aes_counter, AES_BLOCK_LEN);
	memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);

	/* Set up the counter. */
	s->aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for the final GMAC round. */
	aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s->aes_counter[AES_BLOCK_LEN - 1] = 2;

	memset(s->Xi.c, 0, sizeof(s->Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
	}
}

static void
armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
    size_t authdatalen, const __uint128_val_t *Htable)
{
	/* Lengths block */
	s->lenblock.u[0] = s->lenblock.u[1] = 0;
	s->lenblock.d[1] = htobe32(authdatalen * 8);
	s->lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);

	s->Xi.u[0] ^= s->EK0.u[0];
	s->Xi.u[1] ^= s->EK0.u[1];
}

static void
armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
	AES_INC_COUNTER(s->aes_counter);
	to[0] = from[0] ^ s->EKi.u[0];
	to[1] = from[1] ^ s->EKi.u[1];
}

static void
armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
}

void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
	uint64_t *from64, *to64;
	size_t fromseglen, i, olen, oseglen, seglen, toseglen;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	for (olen = len; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, (int)seglen, block);

			if (seglen == AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key,
				    (uint64_t *)block, (uint64_t *)block);
			} else {
				aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
				AES_INC_COUNTER(s.aes_counter);
				for (i = 0; i < seglen; i++)
					block[i] ^= s.EKi.c[i];
			}
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
				    to64);
				gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
				    AES_BLOCK_LEN);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
}
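
/*
 * GCM decryption works in two passes over the ciphertext.  The first pass
 * walks a private copy of the source cursor to GHASH the ciphertext and
 * verify the tag; only if the tag is valid does the second pass decrypt the
 * payload into the destination, so no unauthenticated plaintext is ever
 * written out.  Returns EBADMSG if the tag does not match.
 */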

int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	struct crypto_buffer_cursor fromcc;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
	uint64_t *block64, *from64, *to64;
	size_t fromseglen, olen, oseglen, seglen, toseglen;
	int error;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	crypto_cursor_copy(fromc, &fromcc);
	for (olen = len; len > 0; len -= seglen) {
		from = crypto_cursor_segment(&fromcc, &fromseglen);
		seglen = ulmin(len, fromseglen);
		seglen -= seglen % AES_BLOCK_LEN;
		if (seglen > 0) {
			gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
			crypto_cursor_advance(&fromcc, seglen);
		} else {
			memset(block, 0, sizeof(block));
			seglen = ulmin(len, AES_BLOCK_LEN);
			crypto_cursor_copydata(&fromcc, seglen, block);
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);

	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	block64 = (uint64_t *)block;
	for (len = olen; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, seglen, block);

			armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
			    block64);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
				    to64);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	error = 0;
out:
	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
	return (error);
}