/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/cmn_err.h>
#include <modes/modes.h>
#include <sys/crypto/common.h>
#include <sys/crypto/icp.h>
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <modes/gcm_impl.h>
#ifdef CAN_USE_GCM_ASM
#include <aes/aes_impl.h>
#endif

#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));

/* Select GCM implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#endif
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
static uint32_t user_sel_impl = IMPL_FASTEST;

static inline int gcm_init_ctx_impl(boolean_t, gcm_ctx_t *, char *, size_t,
    int (*)(const void *, const uint8_t *, uint8_t *),
    void (*)(uint8_t *, uint8_t *),
    void (*)(uint8_t *, uint8_t *));

#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 * Set to true if module parameter icp_gcm_impl == "avx".
 */
static boolean_t gcm_use_avx = B_FALSE;
#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */

/*
 * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode
 * is done in another function.
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}

int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}

/*
 * This will only deal with decrypting the last block of the input that
 * might not be a multiple of block length.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits
	 */
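	/*
	 * The 32-bit block counter occupies the last four bytes of the
	 * 16-byte counter block and is interpreted big-endian, as required
	 * by the GCM specification. Applying ntohll() to the
	 * 0x00000000ffffffff mask keeps the mask aligned with those trailing
	 * bytes regardless of host endianness, so the extract/increment/merge
	 * sequence below works on both little- and big-endian machines.
	 */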
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}

int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
	    (void) xor_block;
	size_t new_len;
	uint8_t *new;

	/*
	 * Copy contiguous ciphertext input blocks to plaintext buffer.
	 * Ciphertext will be decrypted in the final.
	 */
	if (length > 0) {
		new_len = ctx->gcm_pt_buf_len + length;
		new = vmem_alloc(new_len, KM_SLEEP);
		if (new == NULL) {
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			ctx->gcm_pt_buf = NULL;
			return (CRYPTO_HOST_MEMORY);
		}

		if (ctx->gcm_pt_buf != NULL) {
			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
		} else {
			ASSERT0(ctx->gcm_pt_buf_len);
		}

		ctx->gcm_pt_buf = new;
		ctx->gcm_pt_buf_len = new_len;
		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
		    length);
		ctx->gcm_processed_data_len += length;
	}

	ctx->gcm_remainder_len = 0;
	return (CRYPTO_SUCCESS);
}

int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * not expecting any more ciphertext; just
			 * compute plaintext for the remaining input
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	/* compare the input authentication tag with what we calculated */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}

static int
gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
{
	size_t tag_len;

	/*
	 * Check the length of the authentication tag (in bits).
	 */
	tag_len = gcm_param->ulTagBits;
	switch (tag_len) {
	case 32:
	case 64:
	case 96:
	case 104:
	case 112:
	case 120:
	case 128:
		break;
	default:
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	if (gcm_param->ulIvLen == 0)
		return (CRYPTO_MECHANISM_PARAM_INVALID);

	return (CRYPTO_SUCCESS);
}

static void
gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}

static int
gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	do {
		if (remainder < block_size) {
			/*
			 * There's not a full block of data; pad the rest
			 * of the buffer with zeros.
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}

/*
 * The following function is called at encrypt or decrypt init time
 * for AES GCM mode.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	return (gcm_init_ctx_impl(B_FALSE, gcm_ctx, param, block_size,
	    encrypt_block, copy_block, xor_block));
}

/*
 * The following function is called at encrypt or decrypt init time
 * for AES GMAC mode.
 */
int
gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	return (gcm_init_ctx_impl(B_TRUE, gcm_ctx, param, block_size,
	    encrypt_block, copy_block, xor_block));
}

/*
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 * Initialization of a GMAC context differs slightly from a GCM context.
 */
static inline int
gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		if (gmac_mode == B_FALSE) {
			/* GCM mode. */
			if ((rv = gcm_validate_args(gcm_param)) != 0) {
				return (rv);
			}
			gcm_ctx->gcm_flags |= GCM_MODE;

			size_t tbits = gcm_param->ulTagBits;
			tag_len = CRYPTO_BITS2BYTES(tbits);
			iv_len = gcm_param->ulIvLen;
		} else {
			/* GMAC mode. */
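			/*
			 * GMAC is GCM with no payload to encrypt: only the
			 * AAD is authenticated. The tag and IV sizes are
			 * fixed by AES_GMAC_TAG_BITS and AES_GMAC_IV_LEN
			 * below instead of being taken from the mechanism
			 * parameters.
			 */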
			gcm_ctx->gcm_flags |= GMAC_MODE;
			tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
			iv_len = AES_GMAC_IV_LEN;
		}
		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
	} else {
		/*
		 * Handle the "cycle" implementation by creating avx and
		 * non-avx contexts alternately.
		 */
		gcm_ctx->gcm_use_avx = gcm_toggle_avx();

		/* The avx impl. doesn't handle byte swapped key schedules. */
		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
			gcm_ctx->gcm_use_avx = B_FALSE;
		}
		/*
		 * If this is a GCM context, use the MOVBE and the BSWAP
		 * variants alternately. GMAC context code paths do not
		 * use the MOVBE instruction.
		 */
		if (gcm_ctx->gcm_use_avx == B_TRUE && gmac_mode == B_FALSE &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * The avx code path does not handle byte-swapped key schedules,
	 * but they could still be created by the aes generic implementation.
	 * Make sure not to use them, since we would corrupt data if we did.
	 */
	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
		gcm_ctx->gcm_use_avx = B_FALSE;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/* Allocate Htab memory as needed. */
	if (gcm_ctx->gcm_use_avx == B_TRUE) {
		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

		if (htab_len == 0) {
			return (CRYPTO_MECHANISM_PARAM_INVALID);
		}
		gcm_ctx->gcm_htab_len = htab_len;
		gcm_ctx->gcm_Htable =
		    kmem_alloc(htab_len, KM_SLEEP);

		if (gcm_ctx->gcm_Htable == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}
	/* AVX and non-AVX context initialization differ from here on. */
	if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
#ifdef CAN_USE_GCM_ASM
	} else {
		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
	}
#endif /* ifdef CAN_USE_GCM_ASM */

	return (rv);
}

void *
gcm_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GCM_MODE;
	return (gcm_ctx);
}

void *
gmac_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GMAC_MODE;
	return (gcm_ctx);
}

/* GCM implementation that contains the fastest methods */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

/*
 * Returns the GCM operations for encrypt/decrypt/key setup. If a SIMD
 * implementation is not allowed in the current context, fall back to
 * the fastest generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation since there still
		 * may be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

/*
 * Initialize all supported implementations.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_set_avx(B_TRUE);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}

static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
#endif
};

/*
 * Function sets desired gcm implementation.
 *
 * If we are called before init(), the user preference will be saved in
 * user_sel_impl and applied in a later init() call. This occurs when the
 * module parameter is specified on module load. Otherwise, directly update
 * icp_gcm_impl.
 *
 * @val		Name of gcm implementation to use
 * @param	Unused.
 */
int
gcm_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[GCM_IMPL_NAME_MAX];
	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, GCM_IMPL_NAME_MAX);
	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
	while (i > 0 && isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
			impl = gcm_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && gcm_impl_initialized) {
		/* check all supported implementations */
		for (i = 0; i < gcm_supp_impl_cnt; i++) {
			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}
#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if available and the requested one is
	 * avx or fastest.
	 */
	if (gcm_avx_will_work() == B_TRUE &&
	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
		gcm_set_avx(B_TRUE);
	} else {
		gcm_set_avx(B_FALSE);
	}
#endif

	if (err == 0) {
		if (gcm_impl_initialized)
			atomic_swap_32(&icp_gcm_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}

#if defined(_KERNEL) && defined(__linux__)

static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}

static int
icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	ASSERT(gcm_impl_initialized);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < gcm_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */

#ifdef CAN_USE_GCM_ASM
#define	GCM_BLOCK_LEN 16
/*
 * The openssl asm routines are 6x aggregated and need that many bytes
 * at minimum.
 */
#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
 */
#define	GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()
#define	GHASH_AVX(ctx, in, len) \
	gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
	in, len)

#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)

/* Get the chunk size module parameter. */
#define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size

/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
 * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
 */
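/*
 * With GCM_AVX_MIN_DECRYPT_BYTES == 96 (16-byte blocks, 6x aggregation),
 * the default below works out to 32736 bytes: 32 KiB rounded down to a
 * multiple of 96.
 */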
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);

extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

static inline boolean_t
gcm_avx_will_work(void)
{
	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
	return (kfpu_allowed() &&
	    zfs_avx_available() && zfs_aes_available() &&
	    zfs_pclmulqdq_available());
}

static inline void
gcm_set_avx(boolean_t val)
{
	if (gcm_avx_will_work() == B_TRUE) {
		atomic_swap_32(&gcm_use_avx, val);
	}
}

static inline boolean_t
gcm_toggle_avx(void)
{
	if (gcm_avx_will_work() == B_TRUE) {
		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
	} else {
		return (B_FALSE);
	}
}

static inline size_t
gcm_simd_get_htab_size(boolean_t simd_mode)
{
	switch (simd_mode) {
	case B_TRUE:
		return (2 * 6 * 2 * sizeof (uint64_t));

	default:
		return (0);
	}
}


/* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}

/*
 * Encrypt multiple blocks of data in GCM mode.
 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
 * if possible. While processing a chunk the FPU is "locked".
 */
static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
    size_t length, crypto_data_t *out, size_t block_size)
{
	size_t bleft = length;
	size_t need = 0;
	size_t done = 0;
	uint8_t *datap = (uint8_t *)data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint64_t *ghash = ctx->gcm_ghash;
	uint64_t *cb = ctx->gcm_cb;
	uint8_t *ct_buf = NULL;
	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);
	/*
	 * If the last call left an incomplete block, try to fill
	 * it first.
	 */
	if (ctx->gcm_remainder_len > 0) {
		need = block_size - ctx->gcm_remainder_len;
		if (length < need) {
			/* Accumulate bytes here and return. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, length);

			ctx->gcm_remainder_len += length;
			if (ctx->gcm_copy_to == NULL) {
				ctx->gcm_copy_to = datap;
			}
			return (CRYPTO_SUCCESS);
		} else {
			/* Complete incomplete block. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, need);

			ctx->gcm_copy_to = NULL;
		}
	}

	/* Allocate a buffer to encrypt to if there is enough input. */
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
		if (ct_buf == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}

	/* If we completed an incomplete block, encrypt and write it out. */
	if (ctx->gcm_remainder_len > 0) {
		kfpu_begin();
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		clear_fpu_regs();
		kfpu_end();
		rv = crypto_put_output_data(tmp, out, block_size);
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		bleft -= need;
		datap += need;
		ctx->gcm_remainder_len = 0;
	}

	/* Do the bulk encryption in chunk_size blocks. */
	for (; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = aesni_gcm_encrypt(
		    datap, ct_buf, chunk_size, key, cb, ghash);

		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			rv = CRYPTO_FAILED;
			goto out_nofpu;
		}
		rv = crypto_put_output_data(ct_buf, out, chunk_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out_nofpu;
		}
		out->cd_offset += chunk_size;
		datap += chunk_size;
		ctx->gcm_processed_data_len += chunk_size;
	}
	/* Check if we are already done. */
	if (bleft == 0) {
		goto out_nofpu;
	}
	/* Bulk encrypt the remaining data. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
		if (done == 0) {
			rv = CRYPTO_FAILED;
			goto out;
		}
		rv = crypto_put_output_data(ct_buf, out, done);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += done;
		ctx->gcm_processed_data_len += done;
		datap += done;
		bleft -= done;

	}
	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
	while (bleft > 0) {
		if (bleft < block_size) {
			memcpy(ctx->gcm_remainder, datap, bleft);
			ctx->gcm_remainder_len = bleft;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		/* Encrypt, hash and write out. */
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx(datap, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		rv = crypto_put_output_data(tmp, out, block_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		datap += block_size;
		bleft -= block_size;
	}
out:
	clear_fpu_regs();
	kfpu_end();
out_nofpu:
	if (ct_buf != NULL) {
		vmem_free(ct_buf, chunk_size);
	}
	return (rv);
}

/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}

/*
 * Finalize decryption: we have just accumulated ciphertext, so now we
 * decrypt it here in place.
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = aesni_gcm_decrypt(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = aesni_gcm_decrypt(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/* Compare the input authentication tag with what we calculated. */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}

/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
static int
gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	const uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

	gcm_init_htab_avx(ctx->gcm_Htable, H);

	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
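		/*
		 * Per NIST SP 800-38D, a 96-bit IV yields the pre-counter
		 * block J0 = IV || 0^31 || 1, which is what cb holds at
		 * this point.
		 */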
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	/* OpenSSL post-increments the counter; adjust for that. */
	gcm_incr_counter_block(ctx);

	/* Ghash AAD in chunk_size blocks. */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}

#if defined(_KERNEL)
static int
icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long val;
	char val_rounded[16];
	int error = 0;

	error = kstrtoul(buf, 0, &val);
	if (error)
		return (error);

	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
		return (-EINVAL);

	snprintf(val_rounded, 16, "%u", (uint32_t)val);
	error = param_set_uint(val_rounded, kp);
	return (error);
}

module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
    "How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
#endif /* ifdef CAN_USE_GCM_ASM */
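
/*
 * Illustrative call sequence for the non-AVX GCM entry points above (a
 * simplified, uncompiled sketch; in practice the ICP AES provider drives
 * these functions, and key schedule setup, crypto_data_t handling and
 * error checking are more involved than shown; the variable names below
 * are placeholders):
 *
 *	gcm_ctx_t *ctx = gcm_alloc_ctx(KM_SLEEP);
 *	ctx->gcm_keysched = keysched;	(expanded AES key schedule)
 *	(void) gcm_init_ctx(ctx, (char *)&gcm_params, AES_BLOCK_LEN,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 *	(void) gcm_mode_encrypt_contiguous_blocks(ctx, plaintext, pt_len,
 *	    &out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
 *	    aes_xor_block);
 *	(void) gcm_encrypt_final(ctx, &out, AES_BLOCK_LEN, aes_encrypt_block,
 *	    aes_copy_block, aes_xor_block);
 */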