/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file is derived from aesni_wrap.c:
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 */

/*
 * This code is built with floating-point enabled. Make sure to have entered
 * into floating-point context before calling any of these functions.
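 * (In the kernel this is normally done by bracketing the call with
 * fpu_kern_enter()/fpu_kern_leave(), as in the armv8crypto driver glue.)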
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
        uint8x16_t tmp;
        int i;

        tmp = from;
        for (i = 0; i < rounds - 1; i += 2) {
                tmp = vaeseq_u8(tmp, keysched[i]);
                tmp = vaesmcq_u8(tmp);
                tmp = vaeseq_u8(tmp, keysched[i + 1]);
                tmp = vaesmcq_u8(tmp);
        }

        tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
        tmp = vaesmcq_u8(tmp);
        tmp = vaeseq_u8(tmp, keysched[rounds]);
        tmp = veorq_u8(tmp, keysched[rounds + 1]);

        return (tmp);
}

static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
        uint8x16_t tmp;
        int i;

        tmp = from;
        for (i = 0; i < rounds - 1; i += 2) {
                tmp = vaesdq_u8(tmp, keysched[i]);
                tmp = vaesimcq_u8(tmp);
                tmp = vaesdq_u8(tmp, keysched[i + 1]);
                tmp = vaesimcq_u8(tmp);
        }

        tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
        tmp = vaesimcq_u8(tmp);
        tmp = vaesdq_u8(tmp, keysched[rounds]);
        tmp = veorq_u8(tmp, keysched[rounds + 1]);

        return (tmp);
}

void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
        uint8x16_t tot, ivreg, tmp;
        uint8_t block[AES_BLOCK_LEN], *from, *to;
        size_t fromseglen, oseglen, seglen, toseglen;

        KASSERT(len % AES_BLOCK_LEN == 0,
            ("%s: length %zu not a multiple of the block size", __func__, len));

        ivreg = vld1q_u8(iv);
        for (; len > 0; len -= seglen) {
                from = crypto_cursor_segment(fromc, &fromseglen);
                to = crypto_cursor_segment(toc, &toseglen);

                seglen = ulmin(len, ulmin(fromseglen, toseglen));
                if (seglen < AES_BLOCK_LEN) {
                        crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
                        tmp = vld1q_u8(block);
                        tot = armv8_aes_enc(key->aes_rounds - 1,
                            (const void *)key->aes_key, veorq_u8(tmp, ivreg));
                        ivreg = tot;
                        vst1q_u8(block, tot);
                        crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
                        seglen = AES_BLOCK_LEN;
                } else {
                        for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
                            seglen -= AES_BLOCK_LEN) {
                                tmp = vld1q_u8(from);
                                tot = armv8_aes_enc(key->aes_rounds - 1,
                                    (const void *)key->aes_key,
                                    veorq_u8(tmp, ivreg));
                                ivreg = tot;
                                vst1q_u8(to, tot);
                                from += AES_BLOCK_LEN;
                                to += AES_BLOCK_LEN;
                        }
                        seglen = oseglen - seglen;
                        crypto_cursor_advance(fromc, seglen);
                        crypto_cursor_advance(toc, seglen);
                }
        }

        explicit_bzero(block, sizeof(block));
}

void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
        uint8x16_t ivreg, nextiv, tmp;
        uint8_t block[AES_BLOCK_LEN], *from, *to;
        size_t fromseglen, oseglen, seglen, toseglen;

        KASSERT(len % AES_BLOCK_LEN == 0,
            ("%s: length %zu not a multiple of the block size", __func__, len));

        ivreg = vld1q_u8(iv);
        for (; len > 0; len -= seglen) {
                from = crypto_cursor_segment(fromc, &fromseglen);
                to = crypto_cursor_segment(toc, &toseglen);

                seglen = ulmin(len, ulmin(fromseglen, toseglen));
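                /*
                 * If a full block is not contiguous in both cursors, bounce
                 * it through a local buffer; otherwise decrypt the contiguous
                 * run of blocks in place.
                 */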
                if (seglen < AES_BLOCK_LEN) {
                        crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
                        nextiv = vld1q_u8(block);
                        tmp = armv8_aes_dec(key->aes_rounds - 1,
                            (const void *)key->aes_key, nextiv);
                        vst1q_u8(block, veorq_u8(tmp, ivreg));
                        ivreg = nextiv;
                        crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
                        seglen = AES_BLOCK_LEN;
                } else {
                        for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
                            seglen -= AES_BLOCK_LEN) {
                                nextiv = vld1q_u8(from);
                                tmp = armv8_aes_dec(key->aes_rounds - 1,
                                    (const void *)key->aes_key, nextiv);
                                vst1q_u8(to, veorq_u8(tmp, ivreg));
                                ivreg = nextiv;
                                from += AES_BLOCK_LEN;
                                to += AES_BLOCK_LEN;
                        }
                        crypto_cursor_advance(fromc, oseglen - seglen);
                        crypto_cursor_advance(toc, oseglen - seglen);
                        seglen = oseglen - seglen;
                }
        }

        explicit_bzero(block, sizeof(block));
}

#define AES_XTS_BLOCKSIZE       16
#define AES_XTS_IVSIZE          8
#define AES_XTS_ALPHA           0x87    /* GF(2^128) generator polynomial */

static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
        const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
        int32x4_t xtweak, ret;

        /* set up xor mask */
        xtweak = vextq_s32(inp, inp, 3);
        xtweak = vshrq_n_s32(xtweak, 31);
        xtweak &= alphamask;

        /* next term */
        ret = vshlq_n_s32(inp, 1);
        ret ^= xtweak;

        return ret;
}

static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
        uint8x16_t block;

        block = vld1q_u8(from) ^ *tweak;

        if (do_encrypt)
                block = armv8_aes_enc(rounds - 1, key_schedule, block);
        else
                block = armv8_aes_dec(rounds - 1, key_schedule, block);

        vst1q_u8(to, block ^ *tweak);

        *tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
        uint8x16_t tweakreg;
        uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
        uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
        uint8_t *from, *to;
        size_t fromseglen, oseglen, seglen, toseglen;

        KASSERT(len % AES_XTS_BLOCKSIZE == 0,
            ("%s: length %zu not a multiple of the block size", __func__, len));

        /*
         * Prepare tweak as E_k2(IV). IV is specified as LE representation
         * of a 64-bit block number which we allow to be passed in directly.
         */
#if BYTE_ORDER == LITTLE_ENDIAN
        bcopy(iv, tweak, AES_XTS_IVSIZE);
        /* Last 64 bits of IV are always zero. */
        bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
        tweakreg = vld1q_u8(tweak);
        tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

        for (; len > 0; len -= seglen) {
                from = crypto_cursor_segment(fromc, &fromseglen);
                to = crypto_cursor_segment(toc, &toseglen);

                seglen = ulmin(len, ulmin(fromseglen, toseglen));
                if (seglen < AES_XTS_BLOCKSIZE) {
                        crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
                        armv8_aes_crypt_xts_block(rounds, data_schedule,
                            &tweakreg, block, block, do_encrypt);
                        crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
                        seglen = AES_XTS_BLOCKSIZE;
                } else {
                        for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
                            seglen -= AES_XTS_BLOCKSIZE) {
                                armv8_aes_crypt_xts_block(rounds, data_schedule,
                                    &tweakreg, from, to, do_encrypt);
                                from += AES_XTS_BLOCKSIZE;
                                to += AES_XTS_BLOCKSIZE;
                        }
                        seglen = oseglen - seglen;
                        crypto_cursor_advance(fromc, seglen);
                        crypto_cursor_advance(toc, seglen);
                }
        }

        explicit_bzero(block, sizeof(block));
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
    struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
{
        armv8_aes_crypt_xts(data_schedule->aes_rounds,
            (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
            toc, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
        armv8_aes_crypt_xts(data_schedule->aes_rounds,
            (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
            toc, iv, 0);
}

#define AES_INC_COUNTER(counter)                                \
        do {                                                    \
                for (int pos = AES_BLOCK_LEN - 1;               \
                    pos >= 0; pos--)                            \
                        if (++(counter)[pos])                   \
                                break;                          \
        } while (0)

struct armv8_gcm_state {
        __uint128_val_t EK0;
        __uint128_val_t EKi;
        __uint128_val_t Xi;
        __uint128_val_t lenblock;
        uint8_t aes_counter[AES_BLOCK_LEN];
};

static void
armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint8_t *authdata, size_t authdatalen,
    const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
{
        uint8_t block[AES_BLOCK_LEN];
        size_t trailer;

        bzero(s->aes_counter, AES_BLOCK_LEN);
        memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);

        /* Setup the counter */
        s->aes_counter[AES_BLOCK_LEN - 1] = 1;

        /* EK0 for a final GMAC round */
        aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);

        /*
         * GCM starts with 2 as counter, 1 is used for final xor of tag.
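         * (The value 1 produced EK0 just above; armv8_aes_gmac_finish XORs
         * EK0 into the final GHASH value to form the tag.)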
         */
        s->aes_counter[AES_BLOCK_LEN - 1] = 2;

        memset(s->Xi.c, 0, sizeof(s->Xi.c));
        trailer = authdatalen % AES_BLOCK_LEN;
        if (authdatalen - trailer > 0) {
                gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
                authdata += authdatalen - trailer;
        }
        if (trailer > 0 || authdatalen == 0) {
                memset(block, 0, sizeof(block));
                memcpy(block, authdata, trailer);
                gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
        }
}

static void
armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
    size_t authdatalen, const __uint128_val_t *Htable)
{
        /* Lengths block */
        s->lenblock.u[0] = s->lenblock.u[1] = 0;
        s->lenblock.d[1] = htobe32(authdatalen * 8);
        s->lenblock.d[3] = htobe32(len * 8);
        gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);

        s->Xi.u[0] ^= s->EK0.u[0];
        s->Xi.u[1] ^= s->EK0.u[1];
}

static void
armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
        aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
        AES_INC_COUNTER(s->aes_counter);
        to[0] = from[0] ^ s->EKi.u[0];
        to[1] = from[1] ^ s->EKi.u[1];
}

static void
armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
        armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
}

void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
        struct armv8_gcm_state s;
        uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
        uint64_t *from64, *to64;
        size_t fromseglen, i, olen, oseglen, seglen, toseglen;

        armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

        for (olen = len; len > 0; len -= seglen) {
                from64 = crypto_cursor_segment(fromc, &fromseglen);
                to64 = crypto_cursor_segment(toc, &toseglen);

                seglen = ulmin(len, ulmin(fromseglen, toseglen));
                if (seglen < AES_BLOCK_LEN) {
                        seglen = ulmin(len, AES_BLOCK_LEN);

                        memset(block, 0, sizeof(block));
                        crypto_cursor_copydata(fromc, (int)seglen, block);

                        if (seglen == AES_BLOCK_LEN) {
                                armv8_aes_encrypt_gcm_block(&s, aes_key,
                                    (uint64_t *)block, (uint64_t *)block);
                        } else {
                                aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
                                AES_INC_COUNTER(s.aes_counter);
                                for (i = 0; i < seglen; i++)
                                        block[i] ^= s.EKi.c[i];
                        }
                        gcm_ghash_v8(s.Xi.u, Htable, block, seglen);

                        crypto_cursor_copyback(toc, (int)seglen, block);
                } else {
                        for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
                            seglen -= AES_BLOCK_LEN) {
                                armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
                                    to64);
                                gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
                                    AES_BLOCK_LEN);

                                from64 += 2;
                                to64 += 2;
                        }

                        seglen = oseglen - seglen;
                        crypto_cursor_advance(fromc, seglen);
                        crypto_cursor_advance(toc, seglen);
                }
        }

        armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
        memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

        explicit_bzero(block, sizeof(block));
        explicit_bzero(&s, sizeof(s));
}

int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen,
    const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
        struct armv8_gcm_state s;
        struct crypto_buffer_cursor fromcc;
        uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
        uint64_t *block64, *from64, *to64;
        size_t fromseglen, olen, oseglen, seglen, toseglen;
        int error;

        armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

        /* Compute the GMAC over the ciphertext before decrypting anything. */
        crypto_cursor_copy(fromc, &fromcc);
        for (olen = len; len > 0; len -= seglen) {
                from = crypto_cursor_segment(&fromcc, &fromseglen);
                seglen = ulmin(len, fromseglen);
                seglen -= seglen % AES_BLOCK_LEN;
                if (seglen > 0) {
                        gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
                        crypto_cursor_advance(&fromcc, seglen);
                } else {
                        memset(block, 0, sizeof(block));
                        seglen = ulmin(len, AES_BLOCK_LEN);
                        crypto_cursor_copydata(&fromcc, seglen, block);
                        gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
                }
        }

        armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);

        if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
                error = EBADMSG;
                goto out;
        }

        block64 = (uint64_t *)block;
        for (len = olen; len > 0; len -= seglen) {
                from64 = crypto_cursor_segment(fromc, &fromseglen);
                to64 = crypto_cursor_segment(toc, &toseglen);

                seglen = ulmin(len, ulmin(fromseglen, toseglen));
                if (seglen < AES_BLOCK_LEN) {
                        seglen = ulmin(len, AES_BLOCK_LEN);

                        memset(block, 0, sizeof(block));
                        crypto_cursor_copydata(fromc, seglen, block);

                        armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
                            block64);

                        crypto_cursor_copyback(toc, (int)seglen, block);
                } else {
                        for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
                            seglen -= AES_BLOCK_LEN) {
                                armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
                                    to64);

                                from64 += 2;
                                to64 += 2;
                        }

                        seglen = oseglen - seglen;
                        crypto_cursor_advance(fromc, seglen);
                        crypto_cursor_advance(toc, seglen);
                }
        }

        error = 0;
out:
        explicit_bzero(block, sizeof(block));
        explicit_bzero(&s, sizeof(s));
        return (error);
}