// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/cmn_err.h>
#include <modes/modes.h>
#include <sys/crypto/common.h>
#include <sys/crypto/icp.h>
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <modes/gcm_impl.h>
#ifdef CAN_USE_GCM_ASM
#include <aes/aes_impl.h>
#endif

#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));

/* Select GCM implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#endif
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
static uint32_t user_sel_impl = IMPL_FASTEST;

#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 * Set to true if module parameter icp_gcm_impl == "avx".
 */
static boolean_t gcm_use_avx = B_FALSE;
#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */

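/*
 * Illustrative expansion of the GHASH() macro above (a sketch, not compiled):
 * one invocation folds a 16 byte block into the running hash, roughly
 *
 *	GHASH(ctx, block, ctx->gcm_ghash, gops);
 *		->	ghash ^= block;
 *			ghash  = ghash x H	(carry-less GF(2^128) product)
 *
 * where the multiplication is supplied by the selected gcm_impl_ops_t
 * backend (generic or pclmulqdq) and H is the hash subkey in ctx->gcm_H.
 */
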
/*
 * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
 * is done in another function.
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter.  Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}

int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}

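/*
 * Illustrative call sequence (a sketch; the real consumer is the ICP AES
 * provider and the callback names below are its block helpers): a GCM
 * encryption streams through
 *
 *	gcm_init_ctx(ctx, (char *)&gcm_params, AES_BLOCK_LEN,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 *	gcm_mode_encrypt_contiguous_blocks(ctx, pt, pt_len, out,
 *	    AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block);
 *	...	repeated once per input buffer
 *	gcm_encrypt_final(ctx, out, AES_BLOCK_LEN, aes_encrypt_block,
 *	    aes_copy_block, aes_xor_block);
 *
 * Decryption mirrors this, except that gcm_mode_decrypt_contiguous_blocks()
 * only accumulates ciphertext and all decryption plus tag verification
 * happens in gcm_decrypt_final().
 */
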
/*
 * This will only deal with decrypting the last block of the input that
 * might not be a multiple of block length.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits
	 */
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}

int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
	    (void) xor_block;
	size_t new_len;
	uint8_t *new;

	/*
	 * Copy contiguous ciphertext input blocks to plaintext buffer.
	 * Ciphertext will be decrypted in the final.
	 */
	if (length > 0) {
		new_len = ctx->gcm_pt_buf_len + length;
		new = vmem_alloc(new_len, KM_SLEEP);
		if (new == NULL) {
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			ctx->gcm_pt_buf = NULL;
			return (CRYPTO_HOST_MEMORY);
		}

		if (ctx->gcm_pt_buf != NULL) {
			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
		} else {
			ASSERT0(ctx->gcm_pt_buf_len);
		}

		ctx->gcm_pt_buf = new;
		ctx->gcm_pt_buf_len = new_len;
		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
		    length);
		ctx->gcm_processed_data_len += length;
	}

	ctx->gcm_remainder_len = 0;
	return (CRYPTO_SUCCESS);
}

int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * not expecting any more ciphertext, just
			 * compute plaintext for the remaining input
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	/* compare the input authentication tag with what we calculated */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}

static int
gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
{
	size_t tag_len;

	/*
	 * Check the length of the authentication tag (in bits).
	 */
	tag_len = gcm_param->ulTagBits;
	switch (tag_len) {
	case 32:
	case 64:
	case 96:
	case 104:
	case 112:
	case 120:
	case 128:
		break;
	default:
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	if (gcm_param->ulIvLen == 0)
		return (CRYPTO_MECHANISM_PARAM_INVALID);

	return (CRYPTO_SUCCESS);
}

static void
gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}

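/*
 * Worked example of the initial counter block built above, assuming the
 * common 96 bit IV: with iv = i0 i1 ... i11 the pre-counter block is
 *
 *	J0 = cb = i0 i1 ... i11 00 00 00 01
 *
 * and only the trailing 32 bits of cb are incremented for successive
 * blocks.  For any other IV length, J0 is instead GHASH(IV zero-padded to
 * a block boundary || 64 bit zero || 64 bit IV length in bits), as computed
 * in the else branch above.
 */
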
static int
gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	do {
		if (remainder < block_size) {
			/*
			 * There's not a block full of data, pad rest of
			 * buffer with zero
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}

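/*
 * Illustrative parameter block (a sketch; only the fields this file
 * dereferences are shown, the struct may contain additional members):
 *
 *	CK_AES_GCM_PARAMS gcm_params;
 *
 *	gcm_params.pIv = iv;		IV buffer, 12 bytes is the
 *	gcm_params.ulIvLen = 12;	common fast-path length
 *	gcm_params.pAAD = aad;		additional authenticated data
 *	gcm_params.ulAADLen = aad_len;
 *	gcm_params.ulTagBits = 128;	one of the sizes accepted by
 *					gcm_validate_args()
 *
 * A pointer to such a struct is what gcm_init_ctx() below expects as its
 * 'param' argument.
 */
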
/*
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		/* GCM mode. */
		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}
		gcm_ctx->gcm_flags |= GCM_MODE;

		size_t tbits = gcm_param->ulTagBits;
		tag_len = CRYPTO_BITS2BYTES(tbits);
		iv_len = gcm_param->ulIvLen;

		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
	} else {
		/*
		 * Handle the "cycle" implementation by creating avx and
		 * non-avx contexts alternately.
		 */
		gcm_ctx->gcm_use_avx = gcm_toggle_avx();

		/* The avx impl. doesn't handle byte swapped key schedules. */
		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
			gcm_ctx->gcm_use_avx = B_FALSE;
		}
		/*
		 * If this is a GCM context, use the MOVBE and the BSWAP
		 * variants alternately.
		 */
		if (gcm_ctx->gcm_use_avx == B_TRUE &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * We don't handle byte swapped key schedules in the avx code path,
	 * but they could still be created by the aes generic implementation.
	 * Make sure not to use them since we'll corrupt data if we do.
	 */
	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
		gcm_ctx->gcm_use_avx = B_FALSE;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/* Allocate Htab memory as needed. */
	if (gcm_ctx->gcm_use_avx == B_TRUE) {
		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

		if (htab_len == 0) {
			return (CRYPTO_MECHANISM_PARAM_INVALID);
		}
		gcm_ctx->gcm_htab_len = htab_len;
		gcm_ctx->gcm_Htable =
		    kmem_alloc(htab_len, KM_SLEEP);

		if (gcm_ctx->gcm_Htable == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}
	/* Avx and non avx context initialization differs from here on. */
	if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
#ifdef CAN_USE_GCM_ASM
	} else {
		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
	}
#endif /* ifdef CAN_USE_GCM_ASM */

	return (rv);
}

void *
gcm_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GCM_MODE;
	return (gcm_ctx);
}

/* GCM implementation that contains the fastest methods */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

/*
 * Returns the GCM operations for encrypt/decrypt/key setup.  When a
 * SIMD implementation is not allowed in the current context, then
 * fall back to the fastest generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation since there still
		 * may be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

/*
 * Initialize all supported implementations.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_set_avx(B_TRUE);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}

static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
#endif
};

/*
 * Function sets desired gcm implementation.
 *
 * If we are called before init(), user preference will be saved in
 * user_sel_impl, and applied in later init() call. This occurs when module
 * parameter is specified on module load. Otherwise, directly update
 * icp_gcm_impl.
 *
 * @val		Name of gcm implementation to use
 * @param	Unused.
 */
int
gcm_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[GCM_IMPL_NAME_MAX];
	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, GCM_IMPL_NAME_MAX);
	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
	while (i > 0 && isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
			impl = gcm_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && gcm_impl_initialized) {
		/* check all supported implementations */
		for (i = 0; i < gcm_supp_impl_cnt; i++) {
			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}
#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if available and the requested one is
	 * avx or fastest.
	 */
	if (gcm_avx_will_work() == B_TRUE &&
	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
		gcm_set_avx(B_TRUE);
	} else {
		gcm_set_avx(B_FALSE);
	}
#endif

	if (err == 0) {
		if (gcm_impl_initialized)
			atomic_swap_32(&icp_gcm_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}

#if defined(_KERNEL) && defined(__linux__)

static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}

static int
icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	ASSERT(gcm_impl_initialized);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < gcm_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */

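/*
 * Usage sketch (outside this file): with the Linux icp module built, the
 * implementation can be chosen at load time or at runtime, e.g.
 *
 *	# modprobe icp icp_gcm_impl=avx
 *	# echo fastest > /sys/module/icp/parameters/icp_gcm_impl
 *	# cat /sys/module/icp/parameters/icp_gcm_impl
 *	cycle [fastest] avx generic pclmulqdq
 *
 * The bracketed name in the read-back is the active selection; the exact
 * list depends on what the running CPU supports.
 */
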
#ifdef CAN_USE_GCM_ASM
#define	GCM_BLOCK_LEN 16
/*
 * The openssl asm routines are 6x aggregated and need that many bytes
 * at minimum.
 */
#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
 */
#define	GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()
#define	GHASH_AVX(ctx, in, len) \
    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
    in, len)

#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)

/* Get the chunk size module parameter. */
#define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size

/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
 * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);

extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

static inline boolean_t
gcm_avx_will_work(void)
{
	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
	return (kfpu_allowed() &&
	    zfs_avx_available() && zfs_aes_available() &&
	    zfs_pclmulqdq_available());
}

static inline void
gcm_set_avx(boolean_t val)
{
	if (gcm_avx_will_work() == B_TRUE) {
		atomic_swap_32(&gcm_use_avx, val);
	}
}

static inline boolean_t
gcm_toggle_avx(void)
{
	if (gcm_avx_will_work() == B_TRUE) {
		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
	} else {
		return (B_FALSE);
	}
}

static inline size_t
gcm_simd_get_htab_size(boolean_t simd_mode)
{
	switch (simd_mode) {
	case B_TRUE:
		return (2 * 6 * 2 * sizeof (uint64_t));

	default:
		return (0);
	}
}


/* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}

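/*
 * Note on the chunking below: the AVX routines are only called between
 * kfpu_begin()/kfpu_end() pairs and never for more than gcm_avx_chunk_size
 * bytes at a time, so the FPU (and, depending on the platform, preemption)
 * is not held for unbounded stretches.  With the defaults above the chunk
 * size works out to
 *
 *	(32 * 1024 / 96) * 96 = 32736 bytes
 *
 * and the hard upper bound GCM_AVX_MAX_CHUNK_SIZE to
 *
 *	(128 * 1024 / 96) * 96 = 131040 bytes.
 */
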
/*
 * Encrypt multiple blocks of data in GCM mode.
 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
 * if possible. While processing a chunk the FPU is "locked".
 */
static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
    size_t length, crypto_data_t *out, size_t block_size)
{
	size_t bleft = length;
	size_t need = 0;
	size_t done = 0;
	uint8_t *datap = (uint8_t *)data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint64_t *ghash = ctx->gcm_ghash;
	uint64_t *cb = ctx->gcm_cb;
	uint8_t *ct_buf = NULL;
	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);
	/*
	 * If the last call left an incomplete block, try to fill
	 * it first.
	 */
	if (ctx->gcm_remainder_len > 0) {
		need = block_size - ctx->gcm_remainder_len;
		if (length < need) {
			/* Accumulate bytes here and return. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, length);

			ctx->gcm_remainder_len += length;
			if (ctx->gcm_copy_to == NULL) {
				ctx->gcm_copy_to = datap;
			}
			return (CRYPTO_SUCCESS);
		} else {
			/* Complete incomplete block. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, need);

			ctx->gcm_copy_to = NULL;
		}
	}

	/* Allocate a buffer to encrypt to if there is enough input. */
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
		if (ct_buf == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}

	/* If we completed an incomplete block, encrypt and write it out. */
	if (ctx->gcm_remainder_len > 0) {
		kfpu_begin();
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		clear_fpu_regs();
		kfpu_end();
		rv = crypto_put_output_data(tmp, out, block_size);
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		bleft -= need;
		datap += need;
		ctx->gcm_remainder_len = 0;
	}

	/* Do the bulk encryption in chunk_size blocks. */
	for (; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = aesni_gcm_encrypt(
		    datap, ct_buf, chunk_size, key, cb, ghash);

		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			rv = CRYPTO_FAILED;
			goto out_nofpu;
		}
		rv = crypto_put_output_data(ct_buf, out, chunk_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out_nofpu;
		}
		out->cd_offset += chunk_size;
		datap += chunk_size;
		ctx->gcm_processed_data_len += chunk_size;
	}
	/* Check if we are already done. */
	if (bleft == 0) {
		goto out_nofpu;
	}
	/* Bulk encrypt the remaining data. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
		if (done == 0) {
			rv = CRYPTO_FAILED;
			goto out;
		}
		rv = crypto_put_output_data(ct_buf, out, done);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += done;
		ctx->gcm_processed_data_len += done;
		datap += done;
		bleft -= done;

	}
	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
	while (bleft > 0) {
		if (bleft < block_size) {
			memcpy(ctx->gcm_remainder, datap, bleft);
			ctx->gcm_remainder_len = bleft;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		/* Encrypt, hash and write out. */
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx(datap, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		rv = crypto_put_output_data(tmp, out, block_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		datap += block_size;
		bleft -= block_size;
	}
out:
	clear_fpu_regs();
	kfpu_end();
out_nofpu:
	if (ct_buf != NULL) {
		vmem_free(ct_buf, chunk_size);
	}
	return (rv);
}

/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out any
 * incomplete last block.  Encrypt the ICB.  Calculate the tag and write
 * it out.
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}

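/*
 * Layout reminder for the decrypt path below (follows from
 * gcm_mode_decrypt_contiguous_blocks() above): gcm_pt_buf holds the
 * accumulated input, i.e. ciphertext followed by the tag,
 *
 *	gcm_pt_buf = C[0 .. pt_len-1] || T[0 .. gcm_tag_len-1]
 *	pt_len     = gcm_processed_data_len - gcm_tag_len
 *
 * The ciphertext is decrypted in place and only copied to the caller's
 * crypto_data_t once the computed tag matches T.
 */
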
/*
 * Finalize decryption: We just have accumulated ciphertext, so now we
 * decrypt it here in place.
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is guaranteed to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = aesni_gcm_decrypt(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = aesni_gcm_decrypt(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/* Compare the input authentication tag with what we calculated. */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}

/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
static int
gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	const uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

	gcm_init_htab_avx(ctx->gcm_Htable, H);

	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));

	/* Openssl post increments the counter, adjust for that. */
	gcm_incr_counter_block(ctx);

	/* Ghash AAD in chunk_size blocks. */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}

#if defined(_KERNEL)
static int
icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long val;
	char val_rounded[16];
	int error = 0;

	error = kstrtoul(buf, 0, &val);
	if (error)
		return (error);

	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
		return (-EINVAL);

	snprintf(val_rounded, 16, "%u", (uint32_t)val);
	error = param_set_uint(val_rounded, kp);
	return (error);
}

module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
    "How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
#endif /* ifdef CAN_USE_GCM_ASM */

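/*
 * Tuning sketch (not part of the module logic): on Linux the chunk size is
 * exposed as a module parameter and can be adjusted at runtime, e.g.
 *
 *	# echo 65472 > /sys/module/icp/parameters/icp_gcm_avx_chunk_size
 *
 * Values are rounded down to a multiple of GCM_AVX_MIN_DECRYPT_BYTES (96)
 * by icp_gcm_avx_set_chunk_size() and rejected outside the
 * [GCM_AVX_MIN_ENCRYPT_BYTES, GCM_AVX_MAX_CHUNK_SIZE] range.
 */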