/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file is derived from aesni_wrap.c:
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 */

/*
 * This code is built with floating-point enabled. Make sure to have entered
 * into floating-point context before calling any of these functions.
 */

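/*
 * A typical caller-side sketch (an assumption about the consumers of this
 * file, not something it enforces) brackets these routines with the kernel
 * FPU interface:
 *
 *	fpu_kern_enter(curthread, NULL, FPU_KERN_NORMAL | FPU_KERN_NOCTX);
 *	armv8_aes_encrypt_cbc(key, len, &fromc, &toc, iv);
 *	fpu_kern_leave(curthread, NULL);
 */
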
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaeseq_u8(tmp, keysched[i]);
		tmp = vaesmcq_u8(tmp);
		tmp = vaeseq_u8(tmp, keysched[i + 1]);
		tmp = vaesmcq_u8(tmp);
	}

	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesmcq_u8(tmp);
	tmp = vaeseq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaesdq_u8(tmp, keysched[i]);
		tmp = vaesimcq_u8(tmp);
		tmp = vaesdq_u8(tmp, keysched[i+1]);
		tmp = vaesimcq_u8(tmp);
	}

	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesimcq_u8(tmp);
	tmp = vaesdq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t tot, ivreg, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			tmp = vld1q_u8(block);
			tot = armv8_aes_enc(key->aes_rounds - 1,
			    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
			ivreg = tot;
			vst1q_u8(block, tot);
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				tmp = vld1q_u8(from);
				tot = armv8_aes_enc(key->aes_rounds - 1,
				    (const void *)key->aes_key,
				    veorq_u8(tmp, ivreg));
				ivreg = tot;
				vst1q_u8(to, tot);
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t ivreg, nextiv, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			nextiv = vld1q_u8(block);
			tmp = armv8_aes_dec(key->aes_rounds - 1,
			    (const void *)key->aes_key, nextiv);
			vst1q_u8(block, veorq_u8(tmp, ivreg));
			ivreg = nextiv;
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				nextiv = vld1q_u8(from);
				tmp = armv8_aes_dec(key->aes_rounds - 1,
				    (const void *)key->aes_key, nextiv);
				vst1q_u8(to, veorq_u8(tmp, ivreg));
				ivreg = nextiv;
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			crypto_cursor_advance(fromc, oseglen - seglen);
			crypto_cursor_advance(toc, oseglen - seglen);
			seglen = oseglen - seglen;
		}
	}

	explicit_bzero(block, sizeof(block));
}

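/*
 * Single-block illustration of the CBC recurrence used above (a sketch for
 * readers, not called by the driver): each ciphertext block is
 * E_K(plaintext ^ previous ciphertext), and that ciphertext becomes the
 * chaining value for the next block.  Decryption computes
 * D_K(ciphertext) ^ previous ciphertext, which is why the loops above latch
 * the input block in nextiv before the output is stored; otherwise an
 * in-place operation would clobber the chaining value.
 */
static __unused uint8x16_t
armv8_aes_cbc_enc_block_ref(const AES_key_t *key, uint8x16_t prev_ct,
    uint8x16_t pt)
{
	return (armv8_aes_enc(key->aes_rounds - 1,
	    (const void *)key->aes_key, veorq_u8(pt, prev_ct)));
}
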
#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t xtweak, ret;

	/* set up xor mask */
	xtweak = vextq_s32(inp, inp, 3);
	xtweak = vshrq_n_s32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = vshlq_n_s32(inp, 1);
	ret ^= xtweak;

	return ret;
}

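/*
 * Byte-wise reference for the tweak update in xts_crank_lfsr() above (an
 * illustrative sketch, not used by the driver).  The 128-bit tweak, read as
 * a little-endian integer, is multiplied by x in GF(2^128) modulo
 * x^128 + x^7 + x^2 + x + 1: shift left one bit and fold 0x87 into the low
 * byte when a bit falls off the top.  The NEON version computes the same
 * thing across four 32-bit lanes at once.
 */
static __unused void
xts_gf128_double_ref(uint8_t tweak[AES_XTS_BLOCKSIZE])
{
	uint8_t carry, hibit;
	int i;

	/* Remember bit 127 before it is shifted out. */
	hibit = tweak[AES_XTS_BLOCKSIZE - 1] >> 7;

	/* Shift the little-endian 128-bit value left by one bit. */
	for (carry = 0, i = 0; i < AES_XTS_BLOCKSIZE; i++) {
		uint8_t c = tweak[i] >> 7;

		tweak[i] = (tweak[i] << 1) | carry;
		carry = c;
	}

	/* Reduce modulo the XTS polynomial. */
	if (hibit != 0)
		tweak[0] ^= AES_XTS_ALPHA;
}
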
static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t block;

	block = vld1q_u8(from) ^ *tweak;

	if (do_encrypt)
		block = armv8_aes_enc(rounds - 1, key_schedule, block);
	else
		block = armv8_aes_dec(rounds - 1, key_schedule, block);

	vst1q_u8(to, block ^ *tweak);

	*tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8x16_t tweakreg;
	uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_XTS_BLOCKSIZE == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = vld1q_u8(tweak);
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_XTS_BLOCKSIZE) {
			crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
			armv8_aes_crypt_xts_block(rounds, data_schedule,
			    &tweakreg, block, block, do_encrypt);
			crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
			seglen = AES_XTS_BLOCKSIZE;
		} else {
			for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
			    seglen -= AES_XTS_BLOCKSIZE) {
				armv8_aes_crypt_xts_block(rounds, data_schedule,
				    &tweakreg, from, to, do_encrypt);
				from += AES_XTS_BLOCKSIZE;
				to += AES_XTS_BLOCKSIZE;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
    struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 0);
}

#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		     pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)

struct armv8_gcm_state {
	__uint128_val_t EK0;
	__uint128_val_t EKi;
	__uint128_val_t Xi;
	__uint128_val_t lenblock;
	uint8_t aes_counter[AES_BLOCK_LEN];
};

static void
armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint8_t *authdata, size_t authdatalen,
    const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
{
	uint8_t block[AES_BLOCK_LEN];
	size_t trailer;

	bzero(s->aes_counter, AES_BLOCK_LEN);
	memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);

	/* Setup the counter */
	s->aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s->aes_counter[AES_BLOCK_LEN - 1] = 2;

	memset(s->Xi.c, 0, sizeof(s->Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
	}
}

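/*
 * armv8_aes_gmac_setup() and armv8_aes_gmac_finish() below implement the
 * standard GCM tag construction for a 96-bit IV: J0 = IV || 0x00000001,
 * E_K(J0) is saved as EK0 for the final step, and payload blocks are
 * encrypted with counters 2, 3, ....  Xi accumulates GHASH over the AAD
 * (zero-padded to a full block), then over the ciphertext, and finally over
 * a block holding the bit lengths of the AAD and the payload; the tag is
 * that digest XORed with EK0.
 */
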
static void
armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
    size_t authdatalen, const __uint128_val_t *Htable)
{
	/* Lengths block */
	s->lenblock.u[0] = s->lenblock.u[1] = 0;
	s->lenblock.d[1] = htobe32(authdatalen * 8);
	s->lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);

	s->Xi.u[0] ^= s->EK0.u[0];
	s->Xi.u[1] ^= s->EK0.u[1];
}

static void
armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
	AES_INC_COUNTER(s->aes_counter);
	to[0] = from[0] ^ s->EKi.u[0];
	to[1] = from[1] ^ s->EKi.u[1];
}

static void
armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
}

void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
	uint64_t *from64, *to64;
	size_t fromseglen, i, olen, oseglen, seglen, toseglen;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	for (olen = len; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, (int)seglen, block);

			if (seglen == AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key,
				    (uint64_t *)block, (uint64_t *)block);
			} else {
				aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
				AES_INC_COUNTER(s.aes_counter);
				for (i = 0; i < seglen; i++)
					block[i] ^= s.EKi.c[i];
			}
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
				    to64);
				gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
				    AES_BLOCK_LEN);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
}

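/*
 * Decryption verifies before it decrypts: a copy of the input cursor is used
 * to GHASH the ciphertext and check the tag, and only if the tag matches is
 * a second pass made to produce plaintext.  This keeps unauthenticated
 * plaintext from ever reaching the output buffer, at the cost of reading the
 * input twice.
 */
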
int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	struct crypto_buffer_cursor fromcc;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
	uint64_t *block64, *from64, *to64;
	size_t fromseglen, olen, oseglen, seglen, toseglen;
	int error;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	crypto_cursor_copy(fromc, &fromcc);
	for (olen = len; len > 0; len -= seglen) {
		from = crypto_cursor_segment(&fromcc, &fromseglen);
		seglen = ulmin(len, fromseglen);
		seglen -= seglen % AES_BLOCK_LEN;
		if (seglen > 0) {
			gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
			crypto_cursor_advance(&fromcc, seglen);
		} else {
			memset(block, 0, sizeof(block));
			seglen = ulmin(len, AES_BLOCK_LEN);
			crypto_cursor_copydata(&fromcc, seglen, block);
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);

	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	block64 = (uint64_t *)block;
	for (len = olen; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, seglen, block);

			armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
			    block64);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
				    to64);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	error = 0;
out:
	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
	return (error);
}