/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code is built with floating-point enabled. Make sure to have entered
 * into floating-point context before calling any of these functions.
 */
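
/*
 * A minimal sketch of the expected bracketing, assuming the usual
 * fpu_kern_enter()/fpu_kern_leave() KPI; the exact flags and context
 * handling depend on the caller (the armv8crypto driver takes care of
 * this in its request-processing path):
 *
 *      fpu_kern_enter(curthread, NULL, FPU_KERN_NORMAL | FPU_KERN_NOCTX);
 *      armv8_aes_encrypt_cbc(key, len, from, to, iv);
 *      fpu_kern_leave(curthread, NULL);
 */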

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
        uint8x16_t tmp;
        int i;

        tmp = from;
        for (i = 0; i < rounds - 1; i += 2) {
                tmp = vaeseq_u8(tmp, keysched[i]);
                tmp = vaesmcq_u8(tmp);
                tmp = vaeseq_u8(tmp, keysched[i + 1]);
                tmp = vaesmcq_u8(tmp);
        }

        tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
        tmp = vaesmcq_u8(tmp);
        tmp = vaeseq_u8(tmp, keysched[rounds]);
        tmp = veorq_u8(tmp, keysched[rounds + 1]);

        return (tmp);
}

static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
        uint8x16_t tmp;
        int i;

        tmp = from;
        for (i = 0; i < rounds - 1; i += 2) {
                tmp = vaesdq_u8(tmp, keysched[i]);
                tmp = vaesimcq_u8(tmp);
                tmp = vaesdq_u8(tmp, keysched[i + 1]);
                tmp = vaesimcq_u8(tmp);
        }

        tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
        tmp = vaesimcq_u8(tmp);
        tmp = vaesdq_u8(tmp, keysched[rounds]);
        tmp = veorq_u8(tmp, keysched[rounds + 1]);

        return (tmp);
}

void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
        uint8x16_t tot, ivreg, tmp;
        size_t i;

        len /= AES_BLOCK_LEN;
        ivreg = vld1q_u8(iv);
        for (i = 0; i < len; i++) {
                tmp = vld1q_u8(from);
                tot = armv8_aes_enc(key->aes_rounds - 1,
                    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
                ivreg = tot;
                vst1q_u8(to, tot);
                from += AES_BLOCK_LEN;
                to += AES_BLOCK_LEN;
        }
}

void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
        uint8x16_t ivreg, nextiv, tmp;
        size_t i;

        len /= AES_BLOCK_LEN;
        ivreg = vld1q_u8(iv);
        for (i = 0; i < len; i++) {
                nextiv = vld1q_u8(buf);
                tmp = armv8_aes_dec(key->aes_rounds - 1,
                    (const void *)key->aes_key, nextiv);
                vst1q_u8(buf, veorq_u8(tmp, ivreg));
                ivreg = nextiv;
                buf += AES_BLOCK_LEN;
        }
}

#define AES_XTS_BLOCKSIZE       16
#define AES_XTS_IVSIZE          8
#define AES_XTS_ALPHA           0x87    /* GF(2^128) generator polynomial */
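
/*
 * Advance the XTS tweak to the next block: multiply the 128-bit tweak by x
 * in GF(2^128).  Each 32-bit lane is shifted left by one bit, the bit
 * shifted out of a lane is carried into the next lane, and when the bit
 * shifted out of the top of the 128-bit value is set, the low lane is
 * folded back with the reduction constant AES_XTS_ALPHA (0x87).
 */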
static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
        const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
        int32x4_t xtweak, ret;

        /* set up xor mask */
        xtweak = vextq_s32(inp, inp, 3);
        xtweak = vshrq_n_s32(xtweak, 31);
        xtweak &= alphamask;

        /* next term */
        ret = vshlq_n_s32(inp, 1);
        ret ^= xtweak;

        return ret;
}

static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
        uint8x16_t block;

        block = vld1q_u8(from) ^ *tweak;

        if (do_encrypt)
                block = armv8_aes_enc(rounds - 1, key_schedule, block);
        else
                block = armv8_aes_dec(rounds - 1, key_schedule, block);

        vst1q_u8(to, block ^ *tweak);

        *tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
        uint8x16_t tweakreg;
        uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
        size_t i, cnt;

        /*
         * Prepare tweak as E_k2(IV). IV is specified as LE representation
         * of a 64-bit block number which we allow to be passed in directly.
         */
#if BYTE_ORDER == LITTLE_ENDIAN
        bcopy(iv, tweak, AES_XTS_IVSIZE);
        /* Last 64 bits of IV are always zero. */
        bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
        tweakreg = vld1q_u8(tweak);
        tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

        cnt = len / AES_XTS_BLOCKSIZE;
        for (i = 0; i < cnt; i++) {
                armv8_aes_crypt_xts_block(rounds, data_schedule, &tweakreg,
                    from, to, do_encrypt);
                from += AES_XTS_BLOCKSIZE;
                to += AES_XTS_BLOCKSIZE;
        }
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

        armv8_aes_crypt_xts(data_schedule->aes_rounds,
            (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
            to, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

        armv8_aes_crypt_xts(data_schedule->aes_rounds,
            (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
            to, iv, 0);
}

#define AES_INC_COUNTER(counter)                                \
        do {                                                    \
                for (int pos = AES_BLOCK_LEN - 1;               \
                     pos >= 0; pos--)                           \
                        if (++(counter)[pos])                   \
                                break;                          \
        } while (0)
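
/*
 * AES-GCM encryption: the payload is processed in counter (CTR) mode and
 * authenticated with GHASH.  The 96-bit IV forms the upper bytes of the
 * counter block; with the counter field set to 1, E_k(counter) gives EK0,
 * which is kept to mask the final GHASH value into the tag.  Bulk
 * encryption then starts with the counter set to 2.  GHASH is run over a
 * single zero-padded block of AAD, over each ciphertext block, and finally
 * over a block holding the AAD and payload bit lengths.
 */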
void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    const uint8_t *from, uint8_t *to,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
        size_t i;
        const uint64_t *from64;
        uint64_t *to64;
        uint8_t aes_counter[AES_BLOCK_LEN];
        uint8_t block[AES_BLOCK_LEN];
        size_t trailer;
        __uint128_val_t EK0, EKi, Xi, lenblock;

        bzero(&aes_counter, AES_BLOCK_LEN);
        memcpy(aes_counter, iv, AES_GCM_IV_LEN);

        /* Setup the counter */
        aes_counter[AES_BLOCK_LEN - 1] = 1;

        /* EK0 for a final GMAC round */
        aes_v8_encrypt(aes_counter, EK0.c, aes_key);

        /* GCM starts with 2 as counter, 1 is used for final xor of tag. */
        aes_counter[AES_BLOCK_LEN - 1] = 2;

        memset(Xi.c, 0, sizeof(Xi.c));
        memset(block, 0, sizeof(block));
        memcpy(block, authdata, min(authdatalen, sizeof(block)));
        gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);

        from64 = (const uint64_t *)from;
        to64 = (uint64_t *)to;
        trailer = len % AES_BLOCK_LEN;

        for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
                aes_v8_encrypt(aes_counter, EKi.c, aes_key);
                AES_INC_COUNTER(aes_counter);
                to64[0] = from64[0] ^ EKi.u[0];
                to64[1] = from64[1] ^ EKi.u[1];
                gcm_ghash_v8(Xi.u, Htable, (uint8_t *)to64, AES_BLOCK_LEN);

                to64 += 2;
                from64 += 2;
        }

        to += (len - trailer);
        from += (len - trailer);

        if (trailer) {
                aes_v8_encrypt(aes_counter, EKi.c, aes_key);
                AES_INC_COUNTER(aes_counter);
                for (i = 0; i < trailer; i++) {
                        block[i] = to[i] = from[i] ^ EKi.c[i % AES_BLOCK_LEN];
                }

                for (; i < AES_BLOCK_LEN; i++)
                        block[i] = 0;

                gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);
        }

        /* Lengths block */
        lenblock.u[0] = lenblock.u[1] = 0;
        lenblock.d[1] = htobe32(authdatalen * 8);
        lenblock.d[3] = htobe32(len * 8);
        gcm_ghash_v8(Xi.u, Htable, lenblock.c, AES_BLOCK_LEN);

        Xi.u[0] ^= EK0.u[0];
        Xi.u[1] ^= EK0.u[1];
        memcpy(tag, Xi.c, GMAC_DIGEST_LEN);

        explicit_bzero(aes_counter, sizeof(aes_counter));
        explicit_bzero(Xi.c, sizeof(Xi.c));
        explicit_bzero(EK0.c, sizeof(EK0.c));
        explicit_bzero(EKi.c, sizeof(EKi.c));
        explicit_bzero(lenblock.c, sizeof(lenblock.c));
}
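
/*
 * AES-GCM decryption verifies before it decrypts: GHASH is computed over
 * the AAD, the ciphertext and the lengths block, masked with EK0 and
 * compared against the supplied tag using timingsafe_bcmp().  Only if the
 * tags match is the ciphertext decrypted in CTR mode (counter starting at
 * 2, as on the encrypt side); otherwise EBADMSG is returned and the output
 * buffer is left untouched.
 */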
int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    const uint8_t *from, uint8_t *to,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
        size_t i;
        const uint64_t *from64;
        uint64_t *to64;
        uint8_t aes_counter[AES_BLOCK_LEN];
        uint8_t block[AES_BLOCK_LEN];
        size_t trailer;
        __uint128_val_t EK0, EKi, Xi, lenblock;
        int error;

        error = 0;
        bzero(&aes_counter, AES_BLOCK_LEN);
        memcpy(aes_counter, iv, AES_GCM_IV_LEN);

        /* Setup the counter */
        aes_counter[AES_BLOCK_LEN - 1] = 1;

        /* EK0 for a final GMAC round */
        aes_v8_encrypt(aes_counter, EK0.c, aes_key);

        memset(Xi.c, 0, sizeof(Xi.c));
        memset(block, 0, sizeof(block));
        memcpy(block, authdata, min(authdatalen, sizeof(block)));
        gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);
        trailer = len % AES_BLOCK_LEN;
        gcm_ghash_v8(Xi.u, Htable, from, len - trailer);

        if (trailer) {
                for (i = 0; i < trailer; i++)
                        block[i] = from[len - trailer + i];
                for (; i < AES_BLOCK_LEN; i++)
                        block[i] = 0;
                gcm_ghash_v8(Xi.u, Htable, block, AES_BLOCK_LEN);
        }

        /* Lengths block */
        lenblock.u[0] = lenblock.u[1] = 0;
        lenblock.d[1] = htobe32(authdatalen * 8);
        lenblock.d[3] = htobe32(len * 8);
        gcm_ghash_v8(Xi.u, Htable, lenblock.c, AES_BLOCK_LEN);

        Xi.u[0] ^= EK0.u[0];
        Xi.u[1] ^= EK0.u[1];
        if (timingsafe_bcmp(tag, Xi.c, GMAC_DIGEST_LEN) != 0) {
                error = EBADMSG;
                goto out;
        }

        /* GCM starts with 2 as counter, 1 is used for final xor of tag. */
        aes_counter[AES_BLOCK_LEN - 1] = 2;

        from64 = (const uint64_t *)from;
        to64 = (uint64_t *)to;

        for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
                aes_v8_encrypt(aes_counter, EKi.c, aes_key);
                AES_INC_COUNTER(aes_counter);
                to64[0] = from64[0] ^ EKi.u[0];
                to64[1] = from64[1] ^ EKi.u[1];
                to64 += 2;
                from64 += 2;
        }

        to += (len - trailer);
        from += (len - trailer);

        if (trailer) {
                aes_v8_encrypt(aes_counter, EKi.c, aes_key);
                AES_INC_COUNTER(aes_counter);
                for (i = 0; i < trailer; i++)
                        to[i] = from[i] ^ EKi.c[i % AES_BLOCK_LEN];
        }

out:
        explicit_bzero(aes_counter, sizeof(aes_counter));
        explicit_bzero(Xi.c, sizeof(Xi.c));
        explicit_bzero(EK0.c, sizeof(EK0.c));
        explicit_bzero(EKi.c, sizeof(EKi.c));
        explicit_bzero(lenblock.c, sizeof(lenblock.c));

        return (error);
}