// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/cmn_err.h>
#include <modes/modes.h>
#include <sys/crypto/common.h>
#include <sys/crypto/icp.h>
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <modes/gcm_impl.h>
#ifdef CAN_USE_GCM_ASM
#include <aes/aes_impl.h>
#endif

#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));

/* Select GCM implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#if CAN_USE_GCM_ASM >= 2
#define	IMPL_AVX2	(UINT32_MAX-3)
#endif
#endif
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
static uint32_t user_sel_impl = IMPL_FASTEST;

#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 */
static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
#define	GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

static inline boolean_t gcm_avx_will_work(void);
static inline boolean_t gcm_avx2_will_work(void);
static inline void gcm_use_impl(gcm_impl impl);
static inline gcm_impl gcm_toggle_impl(void);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */
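/*
 * The GHASH() macro above performs one step of the GHASH chain defined in
 * NIST SP 800-38D: the 16-byte block d is XORed into the running hash
 * gcm_ghash, which is then multiplied by the hash subkey gcm_H in
 * GF(2^128) using the selected implementation's mul() routine. A rough
 * usage sketch (assuming an initialized gcm_ctx_t *ctx):
 *
 *	const gcm_impl_ops_t *gops = gcm_impl_get_ops();
 *	uint64_t block[2] = { 0 };
 *
 *	GHASH(ctx, block, ctx->gcm_ghash, gops);
 */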
/*
 * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode
 * is done in another function.
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap, length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}
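/*
 * Note on the routine above: input that does not fill a whole block is
 * buffered in ctx->gcm_remainder and consumed either by the next call or
 * by gcm_encrypt_final(). The counter arithmetic deliberately touches only
 * the low 32 bits of the last counter-block word, matching the 32-bit
 * increment function GCM uses for its CTR mode.
 */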
int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}
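/*
 * For reference, the tag produced by gcm_encrypt_final() follows
 * NIST SP 800-38D: the final GHASH block is the bit length of the AAD
 * concatenated with the bit length of the ciphertext, and the tag is
 * E(K, J0) XOR the GHASH state, truncated to gcm_tag_len bytes. The layout
 * of that final length block, with aad_len and ct_len standing in for the
 * AAD and ciphertext byte counts:
 *
 *	ctx->gcm_len_a_len_c[0] = htonll(CRYPTO_BYTES2BITS(aad_len));
 *	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ct_len));
 */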
/*
 * This will only deal with decrypting the last block of the input that
 * might not be a multiple of block length.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits.
	 */
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}

int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
	    (void) xor_block;
	size_t new_len;
	uint8_t *new;

	/*
	 * Copy contiguous ciphertext input blocks to plaintext buffer.
	 * Ciphertext will be decrypted in the final.
	 */
	if (length > 0) {
		new_len = ctx->gcm_pt_buf_len + length;
		new = vmem_alloc(new_len, KM_SLEEP);
		if (new == NULL) {
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			ctx->gcm_pt_buf = NULL;
			return (CRYPTO_HOST_MEMORY);
		}

		if (ctx->gcm_pt_buf != NULL) {
			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
		} else {
			ASSERT0(ctx->gcm_pt_buf_len);
		}

		ctx->gcm_pt_buf = new;
		ctx->gcm_pt_buf_len = new_len;
		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
		    length);
		ctx->gcm_processed_data_len += length;
	}

	ctx->gcm_remainder_len = 0;
	return (CRYPTO_SUCCESS);
}
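/*
 * Note on gcm_mode_decrypt_contiguous_blocks(): no decryption happens here;
 * ciphertext (including the trailing tag) is only accumulated in
 * ctx->gcm_pt_buf. gcm_decrypt_final() needs the complete input so it can
 * split off the gcm_tag_len tag bytes and verify them before any plaintext
 * is handed back to the caller.
 */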
int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * not expecting any more ciphertext, just
			 * compute plaintext for the remaining input
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	/* compare the input authentication tag with what we calculated */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}

static int
gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
{
	size_t tag_len;

	/*
	 * Check the length of the authentication tag (in bits).
	 */
	tag_len = gcm_param->ulTagBits;
	switch (tag_len) {
	case 32:
	case 64:
	case 96:
	case 104:
	case 112:
	case 120:
	case 128:
		break;
	default:
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	if (gcm_param->ulIvLen == 0)
		return (CRYPTO_MECHANISM_PARAM_INVALID);

	return (CRYPTO_SUCCESS);
}
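/*
 * A minimal sketch of parameters that pass gcm_validate_args(), using only
 * the CK_AES_GCM_PARAMS fields this file references (iv, aad and aad_len
 * are placeholders supplied by the caller):
 *
 *	CK_AES_GCM_PARAMS params = {
 *		.pIv = iv,
 *		.ulIvLen = 12,
 *		.pAAD = aad,
 *		.ulAADLen = aad_len,
 *		.ulTagBits = 128
 *	};
 */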
static void
gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}

static int
gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	do {
		if (remainder < block_size) {
			/*
			 * There's not a block full of data, pad rest of
			 * buffer with zero
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}
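/*
 * For reference, gcm_format_initial_blocks() builds the pre-counter block
 * J0 exactly as NIST SP 800-38D specifies: for the common 96-bit IV,
 * J0 = IV || 0x00000001 (a big-endian 1 in the last four bytes); for any
 * other IV length, J0 = GHASH(IV zero-padded to a block boundary ||
 * 64 zero bits || [len(IV) in bits] as a 64-bit big-endian value).
 */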
/*
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		/* GCM mode. */
		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}
		gcm_ctx->gcm_flags |= GCM_MODE;

		size_t tbits = gcm_param->ulTagBits;
		tag_len = CRYPTO_BITS2BYTES(tbits);
		iv_len = gcm_param->ulIvLen;

		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->impl = GCM_IMPL_USED;
	} else {
		/*
		 * Handle the "cycle" implementation by creating different
		 * contexts, one per implementation.
		 */
		gcm_ctx->impl = gcm_toggle_impl();

		/* The AVX impl. doesn't handle byte swapped key schedules. */
		if (needs_bswap == B_TRUE) {
			gcm_ctx->impl = GCM_IMPL_GENERIC;
		}
		/*
		 * If this is an AVX context, use the MOVBE and the BSWAP
		 * variants alternately.
		 */
		if (gcm_ctx->impl == GCM_IMPL_AVX &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * We don't handle byte swapped key schedules in the avx code path,
	 * but they could still be created by the aes generic implementation.
	 * Make sure not to use them since we'll corrupt data if we do.
	 */
	if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
		gcm_ctx->impl = GCM_IMPL_GENERIC;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx or avx2-vaes "
		    "implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/*
	 * AVX implementations use Htable with sizes depending on
	 * implementation.
	 */
	if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
		rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size);
	}
	else
#endif /* ifdef CAN_USE_GCM_ASM */
	if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
	    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
		rv = CRYPTO_MECHANISM_PARAM_INVALID;
	}

	return (rv);
}

void *
gcm_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GCM_MODE;
	return (gcm_ctx);
}
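/*
 * Note on context setup: gcm_alloc_ctx() only zeroes the context and sets
 * GCM_MODE; the caller is expected to install the AES key schedule in
 * gcm_keysched before calling gcm_init_ctx(), since both the byteswap check
 * above and gcm_init()/gcm_init_avx() dereference it.
 */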
/* GCM implementation that contains the fastest methods */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that implementation selection has been initialized */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

/*
 * Returns the GCM operations for encrypt/decrypt/key setup. When a
 * SIMD implementation is not allowed in the current context, then
 * fall back to the fastest generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
#if CAN_USE_GCM_ASM >= 2
	case IMPL_AVX2:
#endif
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation since there still
		 * may be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

/*
 * Initialize all supported implementations.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
#if CAN_USE_GCM_ASM >= 2
	if (gcm_avx2_will_work()) {
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_use_impl(GCM_IMPL_AVX2);
		}
	} else
#endif
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_use_impl(GCM_IMPL_AVX);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}
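/*
 * Selecting an implementation by name is done through gcm_impl_set(),
 * defined below. A minimal usage sketch (the names "cycle", "fastest",
 * "avx" and "avx2-vaes" come from gcm_impl_opts[]):
 *
 *	if (gcm_impl_set("fastest") != 0) {
 *		// unknown or unsupported implementation name
 *	}
 */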
static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
	{ "cycle",	IMPL_CYCLE },
	{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
	{ "avx",	IMPL_AVX },
	{ "avx2-vaes",	IMPL_AVX2 },
#endif
};

/*
 * Function sets desired gcm implementation.
 *
 * If we are called before init(), user preference will be saved in
 * user_sel_impl, and applied in later init() call. This occurs when module
 * parameter is specified on module load. Otherwise, directly update
 * icp_gcm_impl.
 *
 * @val		Name of gcm implementation to use
 */
int
gcm_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[GCM_IMPL_NAME_MAX];
	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, GCM_IMPL_NAME_MAX);
	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
	while (i > 0 && isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
#if CAN_USE_GCM_ASM >= 2
		/* Ignore avx2 implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
		    !gcm_avx2_will_work()) {
			continue;
		}
#endif
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
			impl = gcm_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && gcm_impl_initialized) {
		/* check all supported implementations */
		for (i = 0; i < gcm_supp_impl_cnt; i++) {
			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}
#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if available and the requested one is
	 * avx or fastest.
	 */
#if CAN_USE_GCM_ASM >= 2
	if (gcm_avx2_will_work() == B_TRUE &&
	    (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
		gcm_use_impl(GCM_IMPL_AVX2);
	} else
#endif
	if (gcm_avx_will_work() == B_TRUE &&
	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
		gcm_use_impl(GCM_IMPL_AVX);
	} else {
		gcm_use_impl(GCM_IMPL_GENERIC);
	}
#endif

	if (err == 0) {
		if (gcm_impl_initialized)
			atomic_swap_32(&icp_gcm_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}

#if defined(_KERNEL) && defined(__linux__)

static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}

static int
icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementations if they won't work. */
#if CAN_USE_GCM_ASM >= 2
		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
		    !gcm_avx2_will_work()) {
			continue;
		}
#endif
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < gcm_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
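/*
 * Note on icp_gcm_impl_get() above: the generated list shows the mandatory
 * options first and then every compiled-in implementation, with the active
 * selection in brackets; e.g. reading the parameter may produce something
 * like "cycle [fastest] avx generic pclmulqdq", depending on CPU features.
 */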
#ifdef CAN_USE_GCM_ASM
#define	GCM_BLOCK_LEN 16
/*
 * The openssl asm routines are 6x aggregated and need that many bytes
 * at minimum.
 */
#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
 */
#define	GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()

#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)

/* Get the chunk size module parameter. */
#define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size

/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
 * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
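/*
 * For reference: with GCM_BLOCK_LEN == 16 the minimum decrypt unit is
 * 16 * 6 = 96 bytes, so the default chunk size above works out to
 * (32768 / 96) * 96 = 32736 bytes, and GCM_AVX_MAX_CHUNK_SIZE to
 * (131072 / 96) * 96 = 131040 bytes.
 */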
/*
 * GCM definitions: uint128_t is copied from include/crypto/modes.h
 * Avoiding u128 because it is already defined in kernel sources.
 */
typedef struct {
	uint64_t hi, lo;
} uint128_t;

extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
    const uint64_t H[2]);
#endif
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
    const uint64_t *Htable, const uint8_t *in, size_t len);
#endif
static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
{
	switch (ctx->impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash,
		    (const uint64_t *)ctx->gcm_Htable, in, len);
		break;
#endif

	case GCM_IMPL_AVX:
		gcm_ghash_avx(ctx->gcm_ghash,
		    (const uint64_t *)ctx->gcm_Htable, in, len);
		break;

	default:
		VERIFY(B_FALSE);
	}
}

typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
    size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
    uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
    const uint128_t Htable[16], uint8_t Xi[16]);
#endif

typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
    size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
    uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
    const uint128_t Htable[16], uint8_t Xi[16]);
#endif

static inline boolean_t
gcm_avx2_will_work(void)
{
	return (kfpu_allowed() &&
	    zfs_avx2_available() && zfs_vaes_available() &&
	    zfs_vpclmulqdq_available());
}

static inline boolean_t
gcm_avx_will_work(void)
{
	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
	return (kfpu_allowed() &&
	    zfs_avx_available() && zfs_aes_available() &&
	    zfs_pclmulqdq_available());
}

static inline void
gcm_use_impl(gcm_impl impl)
{
	switch (impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		if (gcm_avx2_will_work() == B_TRUE) {
			atomic_swap_32(&gcm_impl_used, impl);
			return;
		}

		zfs_fallthrough;
#endif

	case GCM_IMPL_AVX:
		if (gcm_avx_will_work() == B_TRUE) {
			atomic_swap_32(&gcm_impl_used, impl);
			return;
		}

		zfs_fallthrough;

	default:
		atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
	}
}

static inline boolean_t
gcm_impl_will_work(gcm_impl impl)
{
	switch (impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		return (gcm_avx2_will_work());
#endif

	case GCM_IMPL_AVX:
		return (gcm_avx_will_work());

	default:
		return (B_TRUE);
	}
}

static inline gcm_impl
gcm_toggle_impl(void)
{
	gcm_impl current_impl, new_impl;
	do { /* handle races */
		current_impl = atomic_load_32(&gcm_impl_used);
		new_impl = current_impl;
		while (B_TRUE) { /* handle incompatible implementations */
			new_impl = (new_impl + 1) % GCM_IMPL_MAX;
			if (gcm_impl_will_work(new_impl)) {
				break;
			}
		}

	} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
	    current_impl);

	return (new_impl);
}


/* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}

static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_encrypt(in, out, len, key, iv, Xip));
}

#if CAN_USE_GCM_ASM >= 2
// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
// This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
static const size_t kSizeTWithoutLower4Bits = (size_t)-16;

/* The following CRYPTO methods are from boringssl/crypto/internal.h */
static inline uint32_t CRYPTO_bswap4(uint32_t x) {
	return (__builtin_bswap32(x));
}

static inline uint32_t CRYPTO_load_u32_be(const void *in) {
	uint32_t v;
	memcpy(&v, in, sizeof (v));
	return (CRYPTO_bswap4(v));
}

static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
	v = CRYPTO_bswap4(v);
	memcpy(out, &v, sizeof (v));
}

static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */
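/*
 * Note on the avx2 wrappers above and below: the BoringSSL update routines
 * consume whole 16-byte blocks only (len is masked with
 * kSizeTWithoutLower4Bits) and take the counter block as a const argument,
 * so the wrapper advances the 32-bit big-endian counter at ivec[12..15] by
 * len / 16 itself. For example, the default 32736-byte chunk advances the
 * counter by 32736 / 16 = 2046 blocks.
 */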
/*
 * Encrypt multiple blocks of data in GCM mode.
 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
 * if possible. While processing a chunk the FPU is "locked".
 */
static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
    size_t length, crypto_data_t *out, size_t block_size)
{
	size_t bleft = length;
	size_t need = 0;
	size_t done = 0;
	uint8_t *datap = (uint8_t *)data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	aesni_gcm_encrypt_impl *encrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_encrypt_avx2 :
#endif
	    aesni_gcm_encrypt_avx;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint64_t *ghash = ctx->gcm_ghash;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *cb = ctx->gcm_cb;
	uint8_t *ct_buf = NULL;
	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);
	/*
	 * If the last call left an incomplete block, try to fill
	 * it first.
	 */
	if (ctx->gcm_remainder_len > 0) {
		need = block_size - ctx->gcm_remainder_len;
		if (length < need) {
			/* Accumulate bytes here and return. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, length);

			ctx->gcm_remainder_len += length;
			if (ctx->gcm_copy_to == NULL) {
				ctx->gcm_copy_to = datap;
			}
			return (CRYPTO_SUCCESS);
		} else {
			/* Complete incomplete block. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, need);

			ctx->gcm_copy_to = NULL;
		}
	}

	/* Allocate a buffer to encrypt to if there is enough input. */
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
		if (ct_buf == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}

	/* If we completed an incomplete block, encrypt and write it out. */
	if (ctx->gcm_remainder_len > 0) {
		kfpu_begin();
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		clear_fpu_regs();
		kfpu_end();
		rv = crypto_put_output_data(tmp, out, block_size);
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		bleft -= need;
		datap += need;
		ctx->gcm_remainder_len = 0;
	}

	/* Do the bulk encryption in chunk_size blocks. */
	for (; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = encrypt_blocks(
		    datap, ct_buf, chunk_size, key, cb, htable, ghash);

		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			rv = CRYPTO_FAILED;
			goto out_nofpu;
		}
		rv = crypto_put_output_data(ct_buf, out, chunk_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out_nofpu;
		}
		out->cd_offset += chunk_size;
		datap += chunk_size;
		ctx->gcm_processed_data_len += chunk_size;
	}
	/* Check if we are already done. */
	if (bleft == 0) {
		goto out_nofpu;
	}
	/* Bulk encrypt the remaining data. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
		    ghash);
		if (done == 0) {
			rv = CRYPTO_FAILED;
			goto out;
		}
		rv = crypto_put_output_data(ct_buf, out, done);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += done;
		ctx->gcm_processed_data_len += done;
		datap += done;
		bleft -= done;

	}
	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
	while (bleft > 0) {
		if (bleft < block_size) {
			memcpy(ctx->gcm_remainder, datap, bleft);
			ctx->gcm_remainder_len = bleft;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		/* Encrypt, hash and write out. */
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx(datap, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		rv = crypto_put_output_data(tmp, out, block_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		datap += block_size;
		bleft -= block_size;
	}
out:
	clear_fpu_regs();
	kfpu_end();
out_nofpu:
	if (ct_buf != NULL) {
		vmem_free(ct_buf, chunk_size);
	}
	return (rv);
}
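/*
 * A note on FPU handling in the routines above and below: SIMD state is
 * only held for one chunk of at most GCM_CHUNK_SIZE_READ bytes at a time,
 * following the pattern sketched here (clear_fpu_regs() scrubs key material
 * from the vector registers before they are released):
 *
 *	kfpu_begin();
 *	... process at most one chunk with the AVX routines ...
 *	clear_fpu_regs();
 *	kfpu_end();
 */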
/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}
static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_decrypt(in, out, len, key, iv, Xip));
}

#if CAN_USE_GCM_ASM >= 2
static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */

/*
 * Finalize decryption: We have just accumulated ciphertext, so now we
 * decrypt it here in place.
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	aesni_gcm_decrypt_impl *decrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_decrypt_avx2 :
#endif
	    aesni_gcm_decrypt_avx;
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = decrypt_blocks(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = decrypt_blocks(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/* Compare the input authentication tag with what we calculated. */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}
/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
static int
gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	const uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t htab_len = 0;
#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		/*
		 * BoringSSL's API specifies uint128_t[16] for htab; but only
		 * uint128_t[12] are used.
		 * See https://github.com/google/boringssl/blob/
		 * 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
		 * modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
		 */
		htab_len = (2 * 8 * sizeof (uint128_t));
	} else
#endif /* CAN_USE_GCM_ASM >= 2 */
	{
		htab_len = (2 * 6 * sizeof (uint128_t));
	}

	ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
	if (ctx->gcm_Htable == NULL) {
		return (CRYPTO_HOST_MEMORY);
	}

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
	} else
#endif /* if CAN_USE_GCM_ASM >= 2 */
	{
		gcm_init_htab_avx(ctx->gcm_Htable, H);
	}

	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));

	/* Openssl post increments the counter, adjust for that. */
	gcm_incr_counter_block(ctx);

	/* Ghash AAD in chunk_size blocks. */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}
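/*
 * For reference: with sizeof (uint128_t) == 16 the Htable allocations in
 * gcm_init_avx() come to 2 * 8 * 16 = 256 bytes for the avx2-vaes code path
 * (BoringSSL's uint128_t[16] layout) and 2 * 6 * 16 = 192 bytes for the avx
 * code path.
 */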
#if defined(_KERNEL)
static int
icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long val;
	char val_rounded[16];
	int error = 0;

	error = kstrtoul(buf, 0, &val);
	if (error)
		return (error);

	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
		return (-EINVAL);

	snprintf(val_rounded, 16, "%u", (uint32_t)val);
	error = param_set_uint(val_rounded, kp);
	return (error);
}

module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
    "How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
#endif /* ifdef CAN_USE_GCM_ASM */