/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <modes/modes.h>
#include <sys/crypto/common.h>
#include <sys/crypto/icp.h>
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <modes/gcm_impl.h>
#ifdef CAN_USE_GCM_ASM
#include <aes/aes_impl.h>
#endif

#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));

/* Select GCM implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#endif
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
static uint32_t user_sel_impl = IMPL_FASTEST;

#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 * Set to true if module parameter icp_gcm_impl == "avx".
 */
static boolean_t gcm_use_avx = B_FALSE;
#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */

/*
 * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode
 * is done in another function.
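 *
 * Each complete block is handled CTR-style: the counter block is encrypted
 * and XORed with the plaintext, and the resulting ciphertext is folded into
 * the running GHASH. A trailing partial block is buffered in
 * ctx->gcm_remainder until more input arrives or gcm_encrypt_final() pads
 * and flushes it.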
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}

int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}

/*
 * This will only deal with decrypting the last block of the input that
 * might not be a multiple of block length.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits
	 */
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}

int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
	    (void) xor_block;
	size_t new_len;
	uint8_t *new;

	/*
	 * Copy contiguous ciphertext input blocks to plaintext buffer.
	 * Ciphertext will be decrypted in the final.
	 */
	if (length > 0) {
		new_len = ctx->gcm_pt_buf_len + length;
		new = vmem_alloc(new_len, KM_SLEEP);
		if (new == NULL) {
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			ctx->gcm_pt_buf = NULL;
			return (CRYPTO_HOST_MEMORY);
		}

		if (ctx->gcm_pt_buf != NULL) {
			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
		} else {
			ASSERT0(ctx->gcm_pt_buf_len);
		}

		ctx->gcm_pt_buf = new;
		ctx->gcm_pt_buf_len = new_len;
		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
		    length);
		ctx->gcm_processed_data_len += length;
	}

	ctx->gcm_remainder_len = 0;
	return (CRYPTO_SUCCESS);
}

int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * not expecting any more ciphertext, just
			 * compute plaintext for the remaining input
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	/* compare the input authentication tag with what we calculated */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}

static int
gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
{
	size_t tag_len;

	/*
	 * Check the length of the authentication tag (in bits).
	 */
	tag_len = gcm_param->ulTagBits;
	switch (tag_len) {
	case 32:
	case 64:
	case 96:
	case 104:
	case 112:
	case 120:
	case 128:
		break;
	default:
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	if (gcm_param->ulIvLen == 0)
		return (CRYPTO_MECHANISM_PARAM_INVALID);

	return (CRYPTO_SUCCESS);
}

static void
gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}

static int
gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
    unsigned char *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	do {
		if (remainder < block_size) {
			/*
			 * There's not a block full of data, pad rest of
			 * buffer with zero
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}

/*
 * The following function is called at encrypt or decrypt init time
 * for AES GCM mode.
 *
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	int rv;
	CK_AES_GCM_PARAMS *gcm_param;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}

		gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
		gcm_ctx->gcm_tag_len >>= 3;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));

		rv = CRYPTO_SUCCESS;
		gcm_ctx->gcm_flags |= GCM_MODE;
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

#ifdef CAN_USE_GCM_ASM
	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
	} else {
		/*
		 * Handle the "cycle" implementation by creating avx and
		 * non-avx contexts alternately.
		 */
		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
		/*
		 * We don't handle byte swapped key schedules in the avx
		 * code path.
		 */
		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
		if (ks->ops->needs_byteswap == B_TRUE) {
			gcm_ctx->gcm_use_avx = B_FALSE;
		}
		/* Use the MOVBE and the BSWAP variants alternately. */
		if (gcm_ctx->gcm_use_avx == B_TRUE &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/* Allocate Htab memory as needed. */
	if (gcm_ctx->gcm_use_avx == B_TRUE) {
		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

		if (htab_len == 0) {
			return (CRYPTO_MECHANISM_PARAM_INVALID);
		}
		gcm_ctx->gcm_htab_len = htab_len;
		gcm_ctx->gcm_Htable =
		    kmem_alloc(htab_len, KM_SLEEP);

		if (gcm_ctx->gcm_Htable == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}
	/* Avx and non avx context initialization differs from here on. */
	if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
		if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
		    gcm_param->pAAD, gcm_param->ulAADLen, block_size,
		    encrypt_block, copy_block, xor_block) != 0) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
#ifdef CAN_USE_GCM_ASM
	} else {
		if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
		    gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
	}
#endif /* ifdef CAN_USE_GCM_ASM */

	return (rv);
}

int
gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	int rv;
	CK_AES_GMAC_PARAMS *gmac_param;

	if (param != NULL) {
		gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;

		gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));

		rv = CRYPTO_SUCCESS;
		gcm_ctx->gcm_flags |= GMAC_MODE;
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

#ifdef CAN_USE_GCM_ASM
	/*
	 * Handle the "cycle" implementation by creating avx and non avx
	 * contexts alternately.
	 */
	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
	} else {
		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
	}
	/* We don't handle byte swapped key schedules in the avx code path. */
	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
	if (ks->ops->needs_byteswap == B_TRUE) {
		gcm_ctx->gcm_use_avx = B_FALSE;
	}
	/* Allocate Htab memory as needed. */
	if (gcm_ctx->gcm_use_avx == B_TRUE) {
		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

		if (htab_len == 0) {
			return (CRYPTO_MECHANISM_PARAM_INVALID);
		}
		gcm_ctx->gcm_htab_len = htab_len;
		gcm_ctx->gcm_Htable =
		    kmem_alloc(htab_len, KM_SLEEP);

		if (gcm_ctx->gcm_Htable == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}

	/* Avx and non avx context initialization differs from here on. */
	if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
		if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
		    gmac_param->pAAD, gmac_param->ulAADLen, block_size,
		    encrypt_block, copy_block, xor_block) != 0) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
#ifdef CAN_USE_GCM_ASM
	} else {
		if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
		    gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
	}
#endif /* ifdef CAN_USE_GCM_ASM */

	return (rv);
}

void *
gcm_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GCM_MODE;
	return (gcm_ctx);
}

void *
gmac_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GMAC_MODE;
	return (gcm_ctx);
}

/* GCM implementation that contains the fastest methods */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

/*
 * Returns the GCM operations for encrypt/decrypt/key setup. When a
 * SIMD implementation is not allowed in the current context, fall back
 * to the fastest generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation since there still
		 * may be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

/*
 * Initialize all supported implementations.
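 *
 * Called from the module init path: collect the implementations whose
 * is_supported() callback returns true, copy the hardware accelerated one
 * (or the generic fallback) into gcm_fastest_impl, and finally apply any
 * implementation the user selected before the module was loaded.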
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_set_avx(B_TRUE);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}

static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
#endif
};

/*
 * Function sets desired gcm implementation.
 *
 * If we are called before init(), the user preference is saved in
 * user_sel_impl and applied in the later init() call. This happens when the
 * module parameter is specified on module load. Otherwise, icp_gcm_impl is
 * updated directly.
 *
 * @val		Name of gcm implementation to use
 * @param	Unused.
 */
int
gcm_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[GCM_IMPL_NAME_MAX];
	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, GCM_IMPL_NAME_MAX);
	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
	while (i > 0 && isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
			impl = gcm_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && gcm_impl_initialized) {
		/* check all supported implementations */
		for (i = 0; i < gcm_supp_impl_cnt; i++) {
			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}
#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if available and the requested one is
	 * avx or fastest.
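	 * Any other selection disables the avx code path; the flag is only
	 * consulted when new GCM contexts are initialized, so contexts that
	 * are already in flight keep the implementation they started with.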
	 */
	if (gcm_avx_will_work() == B_TRUE &&
	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
		gcm_set_avx(B_TRUE);
	} else {
		gcm_set_avx(B_FALSE);
	}
#endif

	if (err == 0) {
		if (gcm_impl_initialized)
			atomic_swap_32(&icp_gcm_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}

#if defined(_KERNEL) && defined(__linux__)

static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}

static int
icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	ASSERT(gcm_impl_initialized);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < gcm_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */

#ifdef CAN_USE_GCM_ASM
#define	GCM_BLOCK_LEN	16
/*
 * The openssl asm routines are 6x aggregated and need that many bytes
 * at minimum.
 */
#define	GCM_AVX_MIN_DECRYPT_BYTES	(GCM_BLOCK_LEN * 6)
#define	GCM_AVX_MIN_ENCRYPT_BYTES	(GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
 */
#define	GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()
#define	GHASH_AVX(ctx, in, len) \
	gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
	in, len)

#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)

/* Get the chunk size module parameter. */
#define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size

/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
 * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
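 * With the defaults this works out to ((32 * 1024) / 96) * 96 = 32736 bytes,
 * since GCM_AVX_MIN_DECRYPT_BYTES is 6 * GCM_BLOCK_LEN = 96.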
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);

extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

static inline boolean_t
gcm_avx_will_work(void)
{
	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
	return (kfpu_allowed() &&
	    zfs_avx_available() && zfs_aes_available() &&
	    zfs_pclmulqdq_available());
}

static inline void
gcm_set_avx(boolean_t val)
{
	if (gcm_avx_will_work() == B_TRUE) {
		atomic_swap_32(&gcm_use_avx, val);
	}
}

static inline boolean_t
gcm_toggle_avx(void)
{
	if (gcm_avx_will_work() == B_TRUE) {
		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
	} else {
		return (B_FALSE);
	}
}

static inline size_t
gcm_simd_get_htab_size(boolean_t simd_mode)
{
	switch (simd_mode) {
	case B_TRUE:
		return (2 * 6 * 2 * sizeof (uint64_t));

	default:
		return (0);
	}
}

/*
 * Clear sensitive data in the context.
 *
 * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and
 * ctx->gcm_Htable contain the hash sub key which protects authentication.
 *
 * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for
 * a known plaintext attack; they consist of the IV and the first and last
 * counter respectively. Whether they should be cleared is debatable.
 */
static inline void
gcm_clear_ctx(gcm_ctx_t *ctx)
{
	memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder));
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0));
	memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp));
}

/* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}

/*
 * Encrypt multiple blocks of data in GCM mode.
 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
 * if possible. While processing a chunk the FPU is "locked".
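 *
 * The flow below: top off a previously buffered partial block first, then
 * bulk encrypt chunk_size bytes per kfpu_begin()/kfpu_end() section via
 * aesni_gcm_encrypt(), and finally handle what is left block by block,
 * buffering any trailing partial block for the next call or the final.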
1165 */ 1166 static int 1167 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, 1168 size_t length, crypto_data_t *out, size_t block_size) 1169 { 1170 size_t bleft = length; 1171 size_t need = 0; 1172 size_t done = 0; 1173 uint8_t *datap = (uint8_t *)data; 1174 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; 1175 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); 1176 uint64_t *ghash = ctx->gcm_ghash; 1177 uint64_t *cb = ctx->gcm_cb; 1178 uint8_t *ct_buf = NULL; 1179 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; 1180 int rv = CRYPTO_SUCCESS; 1181 1182 ASSERT(block_size == GCM_BLOCK_LEN); 1183 /* 1184 * If the last call left an incomplete block, try to fill 1185 * it first. 1186 */ 1187 if (ctx->gcm_remainder_len > 0) { 1188 need = block_size - ctx->gcm_remainder_len; 1189 if (length < need) { 1190 /* Accumulate bytes here and return. */ 1191 memcpy((uint8_t *)ctx->gcm_remainder + 1192 ctx->gcm_remainder_len, datap, length); 1193 1194 ctx->gcm_remainder_len += length; 1195 if (ctx->gcm_copy_to == NULL) { 1196 ctx->gcm_copy_to = datap; 1197 } 1198 return (CRYPTO_SUCCESS); 1199 } else { 1200 /* Complete incomplete block. */ 1201 memcpy((uint8_t *)ctx->gcm_remainder + 1202 ctx->gcm_remainder_len, datap, need); 1203 1204 ctx->gcm_copy_to = NULL; 1205 } 1206 } 1207 1208 /* Allocate a buffer to encrypt to if there is enough input. */ 1209 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { 1210 ct_buf = vmem_alloc(chunk_size, KM_SLEEP); 1211 if (ct_buf == NULL) { 1212 return (CRYPTO_HOST_MEMORY); 1213 } 1214 } 1215 1216 /* If we completed an incomplete block, encrypt and write it out. */ 1217 if (ctx->gcm_remainder_len > 0) { 1218 kfpu_begin(); 1219 aes_encrypt_intel(key->encr_ks.ks32, key->nr, 1220 (const uint32_t *)cb, (uint32_t *)tmp); 1221 1222 gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); 1223 GHASH_AVX(ctx, tmp, block_size); 1224 clear_fpu_regs(); 1225 kfpu_end(); 1226 rv = crypto_put_output_data(tmp, out, block_size); 1227 out->cd_offset += block_size; 1228 gcm_incr_counter_block(ctx); 1229 ctx->gcm_processed_data_len += block_size; 1230 bleft -= need; 1231 datap += need; 1232 ctx->gcm_remainder_len = 0; 1233 } 1234 1235 /* Do the bulk encryption in chunk_size blocks. */ 1236 for (; bleft >= chunk_size; bleft -= chunk_size) { 1237 kfpu_begin(); 1238 done = aesni_gcm_encrypt( 1239 datap, ct_buf, chunk_size, key, cb, ghash); 1240 1241 clear_fpu_regs(); 1242 kfpu_end(); 1243 if (done != chunk_size) { 1244 rv = CRYPTO_FAILED; 1245 goto out_nofpu; 1246 } 1247 rv = crypto_put_output_data(ct_buf, out, chunk_size); 1248 if (rv != CRYPTO_SUCCESS) { 1249 goto out_nofpu; 1250 } 1251 out->cd_offset += chunk_size; 1252 datap += chunk_size; 1253 ctx->gcm_processed_data_len += chunk_size; 1254 } 1255 /* Check if we are already done. */ 1256 if (bleft == 0) { 1257 goto out_nofpu; 1258 } 1259 /* Bulk encrypt the remaining data. */ 1260 kfpu_begin(); 1261 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { 1262 done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); 1263 if (done == 0) { 1264 rv = CRYPTO_FAILED; 1265 goto out; 1266 } 1267 rv = crypto_put_output_data(ct_buf, out, done); 1268 if (rv != CRYPTO_SUCCESS) { 1269 goto out; 1270 } 1271 out->cd_offset += done; 1272 ctx->gcm_processed_data_len += done; 1273 datap += done; 1274 bleft -= done; 1275 1276 } 1277 /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. 
	while (bleft > 0) {
		if (bleft < block_size) {
			memcpy(ctx->gcm_remainder, datap, bleft);
			ctx->gcm_remainder_len = bleft;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		/* Encrypt, hash and write out. */
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx(datap, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		rv = crypto_put_output_data(tmp, out, block_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		datap += block_size;
		bleft -= block_size;
	}
out:
	clear_fpu_regs();
	kfpu_end();
out_nofpu:
	if (ct_buf != NULL) {
		vmem_free(ct_buf, chunk_size);
	}
	return (rv);
}

/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out any
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);

	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	/* Clear sensitive data in the context before returning. */
	gcm_clear_ctx(ctx);
	return (CRYPTO_SUCCESS);
}

/*
 * Finalize decryption: We just have accumulated ciphertext, so now we
 * decrypt it here in place.
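 * The tag is checked against the last gcm_tag_len bytes of the buffered
 * input before any plaintext is copied out; on mismatch CRYPTO_INVALID_MAC
 * is returned and no output is produced.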
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = aesni_gcm_decrypt(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = aesni_gcm_decrypt(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/* Compare the input authentication tag with what we calculated. */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	gcm_clear_ctx(ctx);
	return (CRYPTO_SUCCESS);
}

/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
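 *
 * H is obtained by encrypting the all-zero block with the provided key
 * schedule and gcm_init_htab_avx() precomputes the Htable from it. A 12 byte
 * IV forms the counter block directly; other IV lengths fall back to
 * gcm_format_initial_blocks(). The AAD is then GHASHed in chunk_size pieces.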
 */
static int
gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
    unsigned char *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

	gcm_init_htab_avx(ctx->gcm_Htable, H);

	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	/* Openssl post increments the counter, adjust for that. */
	gcm_incr_counter_block(ctx);

	/* Ghash AAD in chunk_size blocks. */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}

#if defined(_KERNEL)
static int
icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long val;
	char val_rounded[16];
	int error = 0;

	error = kstrtoul(buf, 0, &val);
	if (error)
		return (error);

	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
		return (-EINVAL);

	snprintf(val_rounded, 16, "%u", (uint32_t)val);
	error = param_set_uint(val_rounded, kp);
	return (error);
}

module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
    "How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
#endif /* ifdef CAN_USE_GCM_ASM */