/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code is built with floating-point enabled. Callers must have
 * entered a floating-point (SIMD) context before calling any of these
 * functions.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

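/*
 * AES round helpers built on the ARMv8 Crypto Extensions. Callers pass
 * "rounds" as one less than the AES round count: the loop runs pairs of
 * full rounds, and the tail performs the last two rounds plus the final
 * round-key xor, consuming keysched[0] through keysched[rounds + 1].
 */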
static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaeseq_u8(tmp, keysched[i]);
		tmp = vaesmcq_u8(tmp);
		tmp = vaeseq_u8(tmp, keysched[i + 1]);
		tmp = vaesmcq_u8(tmp);
	}

	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesmcq_u8(tmp);
	tmp = vaeseq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaesdq_u8(tmp, keysched[i]);
		tmp = vaesimcq_u8(tmp);
		tmp = vaesdq_u8(tmp, keysched[i + 1]);
		tmp = vaesimcq_u8(tmp);
	}

	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesimcq_u8(tmp);
	tmp = vaesdq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t tot, ivreg, tmp;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = vld1q_u8(iv);
	for (i = 0; i < len; i++) {
		tmp = vld1q_u8(from);
		tot = armv8_aes_enc(key->aes_rounds - 1,
		    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
		ivreg = tot;
		vst1q_u8(to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t ivreg, nextiv, tmp;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = vld1q_u8(iv);
	for (i = 0; i < len; i++) {
		nextiv = vld1q_u8(buf);
		tmp = armv8_aes_dec(key->aes_rounds - 1,
		    (const void *)key->aes_key, nextiv);
		vst1q_u8(buf, veorq_u8(tmp, ivreg));
		ivreg = nextiv;
		buf += AES_BLOCK_LEN;
	}
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

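/*
 * Advance the XTS tweak: multiply the 128-bit value by x in GF(2^128).
 * Each 32-bit lane is shifted left by one; the bit shifted out of a lane
 * is carried into the next lane via the rotated sign mask, and the bit
 * shifted out of the whole value is reduced back in with AES_XTS_ALPHA.
 */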
static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t xtweak, ret;

	/* set up xor mask */
	xtweak = vextq_s32(inp, inp, 3);
	xtweak = vshrq_n_s32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = vshlq_n_s32(inp, 1);
	ret ^= xtweak;

	return ret;
}

static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t block;

	block = vld1q_u8(from) ^ *tweak;

	if (do_encrypt)
		block = armv8_aes_enc(rounds - 1, key_schedule, block);
	else
		block = armv8_aes_dec(rounds - 1, key_schedule, block);

	vst1q_u8(to, block ^ *tweak);

	*tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8x16_t tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = vld1q_u8(tweak);
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE;
	for (i = 0; i < cnt; i++) {
		armv8_aes_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
	    to, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
	    to, iv, 0);
}

#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		     pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)

struct armv8_gcm_state {
	__uint128_val_t EK0;
	__uint128_val_t EKi;
	__uint128_val_t Xi;
	__uint128_val_t lenblock;
	uint8_t aes_counter[AES_BLOCK_LEN];
};

void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    const uint8_t *from, uint8_t *to,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	const uint64_t *from64;
	uint64_t *to64;
	uint8_t block[AES_BLOCK_LEN];
	size_t i, trailer;

	bzero(&s.aes_counter, AES_BLOCK_LEN);
	memcpy(s.aes_counter, iv, AES_GCM_IV_LEN);

	/* Setup the counter */
	s.aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s.aes_counter, s.EK0.c, aes_key);

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s.aes_counter[AES_BLOCK_LEN - 1] = 2;

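	/*
	 * Initialize the GHASH accumulator and hash the additional
	 * authenticated data, zero-padding any partial trailing block.
	 */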
	memset(s.Xi.c, 0, sizeof(s.Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s.Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

	from64 = (const uint64_t *)from;
	to64 = (uint64_t *)to;
	trailer = len % AES_BLOCK_LEN;

	for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		to64[0] = from64[0] ^ s.EKi.u[0];
		to64[1] = from64[1] ^ s.EKi.u[1];
		gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64, AES_BLOCK_LEN);

		to64 += 2;
		from64 += 2;
	}

	to += (len - trailer);
	from += (len - trailer);

	if (trailer) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		memset(block, 0, sizeof(block));
		for (i = 0; i < trailer; i++) {
			block[i] = to[i] = from[i] ^ s.EKi.c[i];
		}

		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

	/* Lengths block */
	s.lenblock.u[0] = s.lenblock.u[1] = 0;
	s.lenblock.d[1] = htobe32(authdatalen * 8);
	s.lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s.Xi.u, Htable, s.lenblock.c, AES_BLOCK_LEN);

	s.Xi.u[0] ^= s.EK0.u[0];
	s.Xi.u[1] ^= s.EK0.u[1];
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(&s, sizeof(s));
}

int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    const uint8_t *from, uint8_t *to,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	const uint64_t *from64;
	uint64_t *to64;
	uint8_t block[AES_BLOCK_LEN];
	size_t i, trailer;
	int error;

	error = 0;
	bzero(&s.aes_counter, AES_BLOCK_LEN);
	memcpy(s.aes_counter, iv, AES_GCM_IV_LEN);

	/* Setup the counter */
	s.aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s.aes_counter, s.EK0.c, aes_key);

	memset(s.Xi.c, 0, sizeof(s.Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s.Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

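	/*
	 * Unlike encryption, GHASH the ciphertext up front so the tag can
	 * be verified before any plaintext is produced.
	 */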
	trailer = len % AES_BLOCK_LEN;
	if (len - trailer > 0)
		gcm_ghash_v8(s.Xi.u, Htable, from, len - trailer);
	if (trailer > 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, from + len - trailer, trailer);
		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

	/* Lengths block */
	s.lenblock.u[0] = s.lenblock.u[1] = 0;
	s.lenblock.d[1] = htobe32(authdatalen * 8);
	s.lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s.Xi.u, Htable, s.lenblock.c, AES_BLOCK_LEN);

	s.Xi.u[0] ^= s.EK0.u[0];
	s.Xi.u[1] ^= s.EK0.u[1];
	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s.aes_counter[AES_BLOCK_LEN - 1] = 2;

	from64 = (const uint64_t *)from;
	to64 = (uint64_t *)to;

	for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		to64[0] = from64[0] ^ s.EKi.u[0];
		to64[1] = from64[1] ^ s.EKi.u[1];
		to64 += 2;
		from64 += 2;
	}

	to += (len - trailer);
	from += (len - trailer);

	if (trailer) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		for (i = 0; i < trailer; i++)
			to[i] = from[i] ^ s.EKi.c[i];
	}

out:
	explicit_bzero(&s, sizeof(s));
	return (error);
}
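
/*
 * Illustrative sketch, not built (guarded by "#if 0") and not part of this
 * file's interface: how a caller might bracket one of the routines above
 * with a floating-point kernel context, per the note at the top of the
 * file. The header, the fpu_kern_enter()/fpu_kern_leave() flags, and the
 * NULL context below are assumptions of this sketch; see the armv8_crypto
 * driver for the real call sites.
 */
#if 0
#include <machine/vfp.h>

static void
armv8_aes_cbc_encrypt_example(const AES_key_t *key, uint8_t *buf, size_t len,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	/* Enter a SIMD/FP context so the NEON AES instructions may be used. */
	fpu_kern_enter(curthread, NULL, FPU_KERN_NORMAL | FPU_KERN_NOCTX);
	/* Encrypt "buf" in place; "len" must be a multiple of AES_BLOCK_LEN. */
	armv8_aes_encrypt_cbc(key, len, buf, buf, iv);
	fpu_kern_leave(curthread, NULL);
}
#endif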