1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/zfs_context.h> 26 #include <modes/modes.h> 27 #include <sys/crypto/common.h> 28 #include <sys/crypto/icp.h> 29 #include <sys/crypto/impl.h> 30 #include <sys/byteorder.h> 31 #include <sys/simd.h> 32 #include <modes/gcm_impl.h> 33 #ifdef CAN_USE_GCM_ASM 34 #include <aes/aes_impl.h> 35 #endif 36 37 #define GHASH(c, d, t, o) \ 38 xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ 39 (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ 40 (uint64_t *)(void *)(t)); 41 42 /* Select GCM implementation */ 43 #define IMPL_FASTEST (UINT32_MAX) 44 #define IMPL_CYCLE (UINT32_MAX-1) 45 #ifdef CAN_USE_GCM_ASM 46 #define IMPL_AVX (UINT32_MAX-2) 47 #endif 48 #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) 49 static uint32_t icp_gcm_impl = IMPL_FASTEST; 50 static uint32_t user_sel_impl = IMPL_FASTEST; 51 52 #ifdef CAN_USE_GCM_ASM 53 /* Does the architecture we run on support the MOVBE instruction? */ 54 boolean_t gcm_avx_can_use_movbe = B_FALSE; 55 /* 56 * Whether to use the optimized openssl gcm and ghash implementations. 57 * Set to true if module parameter icp_gcm_impl == "avx". 58 */ 59 static boolean_t gcm_use_avx = B_FALSE; 60 #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) 61 62 extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); 63 64 static inline boolean_t gcm_avx_will_work(void); 65 static inline void gcm_set_avx(boolean_t); 66 static inline boolean_t gcm_toggle_avx(void); 67 static inline size_t gcm_simd_get_htab_size(boolean_t); 68 69 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, 70 crypto_data_t *, size_t); 71 72 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); 73 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); 74 static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, 75 size_t, size_t); 76 #endif /* ifdef CAN_USE_GCM_ASM */ 77 78 /* 79 * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode 80 * is done in another function. 81 */ 82 int 83 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, 84 crypto_data_t *out, size_t block_size, 85 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 86 void (*copy_block)(uint8_t *, uint8_t *), 87 void (*xor_block)(uint8_t *, uint8_t *)) 88 { 89 #ifdef CAN_USE_GCM_ASM 90 if (ctx->gcm_use_avx == B_TRUE) 91 return (gcm_mode_encrypt_contiguous_blocks_avx( 92 ctx, data, length, out, block_size)); 93 #endif 94 95 const gcm_impl_ops_t *gops; 96 size_t remainder = length; 97 size_t need = 0; 98 uint8_t *datap = (uint8_t *)data; 99 uint8_t *blockp; 100 uint8_t *lastp; 101 void *iov_or_mp; 102 offset_t offset; 103 uint8_t *out_data_1; 104 uint8_t *out_data_2; 105 size_t out_data_1_len; 106 uint64_t counter; 107 uint64_t counter_mask = ntohll(0x00000000ffffffffULL); 108 109 if (length + ctx->gcm_remainder_len < block_size) { 110 /* accumulate bytes here and return */ 111 memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, 112 datap, 113 length); 114 ctx->gcm_remainder_len += length; 115 if (ctx->gcm_copy_to == NULL) { 116 ctx->gcm_copy_to = datap; 117 } 118 return (CRYPTO_SUCCESS); 119 } 120 121 lastp = (uint8_t *)ctx->gcm_cb; 122 crypto_init_ptrs(out, &iov_or_mp, &offset); 123 124 gops = gcm_impl_get_ops(); 125 do { 126 /* Unprocessed data from last call. */ 127 if (ctx->gcm_remainder_len > 0) { 128 need = block_size - ctx->gcm_remainder_len; 129 130 if (need > remainder) 131 return (CRYPTO_DATA_LEN_RANGE); 132 133 memcpy(&((uint8_t *)ctx->gcm_remainder) 134 [ctx->gcm_remainder_len], datap, need); 135 136 blockp = (uint8_t *)ctx->gcm_remainder; 137 } else { 138 blockp = datap; 139 } 140 141 /* 142 * Increment counter. Counter bits are confined 143 * to the bottom 32 bits of the counter block. 144 */ 145 counter = ntohll(ctx->gcm_cb[1] & counter_mask); 146 counter = htonll(counter + 1); 147 counter &= counter_mask; 148 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; 149 150 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, 151 (uint8_t *)ctx->gcm_tmp); 152 xor_block(blockp, (uint8_t *)ctx->gcm_tmp); 153 154 lastp = (uint8_t *)ctx->gcm_tmp; 155 156 ctx->gcm_processed_data_len += block_size; 157 158 crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, 159 &out_data_1_len, &out_data_2, block_size); 160 161 /* copy block to where it belongs */ 162 if (out_data_1_len == block_size) { 163 copy_block(lastp, out_data_1); 164 } else { 165 memcpy(out_data_1, lastp, out_data_1_len); 166 if (out_data_2 != NULL) { 167 memcpy(out_data_2, 168 lastp + out_data_1_len, 169 block_size - out_data_1_len); 170 } 171 } 172 /* update offset */ 173 out->cd_offset += block_size; 174 175 /* add ciphertext to the hash */ 176 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops); 177 178 /* Update pointer to next block of data to be processed. */ 179 if (ctx->gcm_remainder_len != 0) { 180 datap += need; 181 ctx->gcm_remainder_len = 0; 182 } else { 183 datap += block_size; 184 } 185 186 remainder = (size_t)&data[length] - (size_t)datap; 187 188 /* Incomplete last block. */ 189 if (remainder > 0 && remainder < block_size) { 190 memcpy(ctx->gcm_remainder, datap, remainder); 191 ctx->gcm_remainder_len = remainder; 192 ctx->gcm_copy_to = datap; 193 goto out; 194 } 195 ctx->gcm_copy_to = NULL; 196 197 } while (remainder > 0); 198 out: 199 return (CRYPTO_SUCCESS); 200 } 201 202 int 203 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, 204 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 205 void (*copy_block)(uint8_t *, uint8_t *), 206 void (*xor_block)(uint8_t *, uint8_t *)) 207 { 208 (void) copy_block; 209 #ifdef CAN_USE_GCM_ASM 210 if (ctx->gcm_use_avx == B_TRUE) 211 return (gcm_encrypt_final_avx(ctx, out, block_size)); 212 #endif 213 214 const gcm_impl_ops_t *gops; 215 uint64_t counter_mask = ntohll(0x00000000ffffffffULL); 216 uint8_t *ghash, *macp = NULL; 217 int i, rv; 218 219 if (out->cd_length < 220 (ctx->gcm_remainder_len + ctx->gcm_tag_len)) { 221 return (CRYPTO_DATA_LEN_RANGE); 222 } 223 224 gops = gcm_impl_get_ops(); 225 ghash = (uint8_t *)ctx->gcm_ghash; 226 227 if (ctx->gcm_remainder_len > 0) { 228 uint64_t counter; 229 uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp; 230 231 /* 232 * Here is where we deal with data that is not a 233 * multiple of the block size. 234 */ 235 236 /* 237 * Increment counter. 238 */ 239 counter = ntohll(ctx->gcm_cb[1] & counter_mask); 240 counter = htonll(counter + 1); 241 counter &= counter_mask; 242 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; 243 244 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, 245 (uint8_t *)ctx->gcm_tmp); 246 247 macp = (uint8_t *)ctx->gcm_remainder; 248 memset(macp + ctx->gcm_remainder_len, 0, 249 block_size - ctx->gcm_remainder_len); 250 251 /* XOR with counter block */ 252 for (i = 0; i < ctx->gcm_remainder_len; i++) { 253 macp[i] ^= tmpp[i]; 254 } 255 256 /* add ciphertext to the hash */ 257 GHASH(ctx, macp, ghash, gops); 258 259 ctx->gcm_processed_data_len += ctx->gcm_remainder_len; 260 } 261 262 ctx->gcm_len_a_len_c[1] = 263 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); 264 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); 265 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, 266 (uint8_t *)ctx->gcm_J0); 267 xor_block((uint8_t *)ctx->gcm_J0, ghash); 268 269 if (ctx->gcm_remainder_len > 0) { 270 rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len); 271 if (rv != CRYPTO_SUCCESS) 272 return (rv); 273 } 274 out->cd_offset += ctx->gcm_remainder_len; 275 ctx->gcm_remainder_len = 0; 276 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); 277 if (rv != CRYPTO_SUCCESS) 278 return (rv); 279 out->cd_offset += ctx->gcm_tag_len; 280 281 return (CRYPTO_SUCCESS); 282 } 283 284 /* 285 * This will only deal with decrypting the last block of the input that 286 * might not be a multiple of block length. 287 */ 288 static void 289 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index, 290 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 291 void (*xor_block)(uint8_t *, uint8_t *)) 292 { 293 uint8_t *datap, *outp, *counterp; 294 uint64_t counter; 295 uint64_t counter_mask = ntohll(0x00000000ffffffffULL); 296 int i; 297 298 /* 299 * Increment counter. 300 * Counter bits are confined to the bottom 32 bits 301 */ 302 counter = ntohll(ctx->gcm_cb[1] & counter_mask); 303 counter = htonll(counter + 1); 304 counter &= counter_mask; 305 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; 306 307 datap = (uint8_t *)ctx->gcm_remainder; 308 outp = &((ctx->gcm_pt_buf)[index]); 309 counterp = (uint8_t *)ctx->gcm_tmp; 310 311 /* authentication tag */ 312 memset((uint8_t *)ctx->gcm_tmp, 0, block_size); 313 memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len); 314 315 /* add ciphertext to the hash */ 316 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops()); 317 318 /* decrypt remaining ciphertext */ 319 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp); 320 321 /* XOR with counter block */ 322 for (i = 0; i < ctx->gcm_remainder_len; i++) { 323 outp[i] = datap[i] ^ counterp[i]; 324 } 325 } 326 327 int 328 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, 329 crypto_data_t *out, size_t block_size, 330 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 331 void (*copy_block)(uint8_t *, uint8_t *), 332 void (*xor_block)(uint8_t *, uint8_t *)) 333 { 334 (void) out, (void) block_size, (void) encrypt_block, (void) copy_block, 335 (void) xor_block; 336 size_t new_len; 337 uint8_t *new; 338 339 /* 340 * Copy contiguous ciphertext input blocks to plaintext buffer. 341 * Ciphertext will be decrypted in the final. 342 */ 343 if (length > 0) { 344 new_len = ctx->gcm_pt_buf_len + length; 345 new = vmem_alloc(new_len, KM_SLEEP); 346 if (new == NULL) { 347 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); 348 ctx->gcm_pt_buf = NULL; 349 return (CRYPTO_HOST_MEMORY); 350 } 351 352 if (ctx->gcm_pt_buf != NULL) { 353 memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); 354 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); 355 } else { 356 ASSERT0(ctx->gcm_pt_buf_len); 357 } 358 359 ctx->gcm_pt_buf = new; 360 ctx->gcm_pt_buf_len = new_len; 361 memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data, 362 length); 363 ctx->gcm_processed_data_len += length; 364 } 365 366 ctx->gcm_remainder_len = 0; 367 return (CRYPTO_SUCCESS); 368 } 369 370 int 371 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, 372 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 373 void (*xor_block)(uint8_t *, uint8_t *)) 374 { 375 #ifdef CAN_USE_GCM_ASM 376 if (ctx->gcm_use_avx == B_TRUE) 377 return (gcm_decrypt_final_avx(ctx, out, block_size)); 378 #endif 379 380 const gcm_impl_ops_t *gops; 381 size_t pt_len; 382 size_t remainder; 383 uint8_t *ghash; 384 uint8_t *blockp; 385 uint8_t *cbp; 386 uint64_t counter; 387 uint64_t counter_mask = ntohll(0x00000000ffffffffULL); 388 int processed = 0, rv; 389 390 ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len); 391 392 gops = gcm_impl_get_ops(); 393 pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; 394 ghash = (uint8_t *)ctx->gcm_ghash; 395 blockp = ctx->gcm_pt_buf; 396 remainder = pt_len; 397 while (remainder > 0) { 398 /* Incomplete last block */ 399 if (remainder < block_size) { 400 memcpy(ctx->gcm_remainder, blockp, remainder); 401 ctx->gcm_remainder_len = remainder; 402 /* 403 * not expecting anymore ciphertext, just 404 * compute plaintext for the remaining input 405 */ 406 gcm_decrypt_incomplete_block(ctx, block_size, 407 processed, encrypt_block, xor_block); 408 ctx->gcm_remainder_len = 0; 409 goto out; 410 } 411 /* add ciphertext to the hash */ 412 GHASH(ctx, blockp, ghash, gops); 413 414 /* 415 * Increment counter. 416 * Counter bits are confined to the bottom 32 bits 417 */ 418 counter = ntohll(ctx->gcm_cb[1] & counter_mask); 419 counter = htonll(counter + 1); 420 counter &= counter_mask; 421 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; 422 423 cbp = (uint8_t *)ctx->gcm_tmp; 424 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp); 425 426 /* XOR with ciphertext */ 427 xor_block(cbp, blockp); 428 429 processed += block_size; 430 blockp += block_size; 431 remainder -= block_size; 432 } 433 out: 434 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); 435 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); 436 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, 437 (uint8_t *)ctx->gcm_J0); 438 xor_block((uint8_t *)ctx->gcm_J0, ghash); 439 440 /* compare the input authentication tag with what we calculated */ 441 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { 442 /* They don't match */ 443 return (CRYPTO_INVALID_MAC); 444 } else { 445 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); 446 if (rv != CRYPTO_SUCCESS) 447 return (rv); 448 out->cd_offset += pt_len; 449 } 450 return (CRYPTO_SUCCESS); 451 } 452 453 static int 454 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param) 455 { 456 size_t tag_len; 457 458 /* 459 * Check the length of the authentication tag (in bits). 460 */ 461 tag_len = gcm_param->ulTagBits; 462 switch (tag_len) { 463 case 32: 464 case 64: 465 case 96: 466 case 104: 467 case 112: 468 case 120: 469 case 128: 470 break; 471 default: 472 return (CRYPTO_MECHANISM_PARAM_INVALID); 473 } 474 475 if (gcm_param->ulIvLen == 0) 476 return (CRYPTO_MECHANISM_PARAM_INVALID); 477 478 return (CRYPTO_SUCCESS); 479 } 480 481 static void 482 gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, 483 gcm_ctx_t *ctx, size_t block_size, 484 void (*copy_block)(uint8_t *, uint8_t *), 485 void (*xor_block)(uint8_t *, uint8_t *)) 486 { 487 const gcm_impl_ops_t *gops; 488 uint8_t *cb; 489 ulong_t remainder = iv_len; 490 ulong_t processed = 0; 491 uint8_t *datap, *ghash; 492 uint64_t len_a_len_c[2]; 493 494 gops = gcm_impl_get_ops(); 495 ghash = (uint8_t *)ctx->gcm_ghash; 496 cb = (uint8_t *)ctx->gcm_cb; 497 if (iv_len == 12) { 498 memcpy(cb, iv, 12); 499 cb[12] = 0; 500 cb[13] = 0; 501 cb[14] = 0; 502 cb[15] = 1; 503 /* J0 will be used again in the final */ 504 copy_block(cb, (uint8_t *)ctx->gcm_J0); 505 } else { 506 /* GHASH the IV */ 507 do { 508 if (remainder < block_size) { 509 memset(cb, 0, block_size); 510 memcpy(cb, &(iv[processed]), remainder); 511 datap = (uint8_t *)cb; 512 remainder = 0; 513 } else { 514 datap = (uint8_t *)(&(iv[processed])); 515 processed += block_size; 516 remainder -= block_size; 517 } 518 GHASH(ctx, datap, ghash, gops); 519 } while (remainder > 0); 520 521 len_a_len_c[0] = 0; 522 len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len)); 523 GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops); 524 525 /* J0 will be used again in the final */ 526 copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb); 527 } 528 } 529 530 static int 531 gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, 532 unsigned char *auth_data, size_t auth_data_len, size_t block_size, 533 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 534 void (*copy_block)(uint8_t *, uint8_t *), 535 void (*xor_block)(uint8_t *, uint8_t *)) 536 { 537 const gcm_impl_ops_t *gops; 538 uint8_t *ghash, *datap, *authp; 539 size_t remainder, processed; 540 541 /* encrypt zero block to get subkey H */ 542 memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); 543 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H, 544 (uint8_t *)ctx->gcm_H); 545 546 gcm_format_initial_blocks(iv, iv_len, ctx, block_size, 547 copy_block, xor_block); 548 549 gops = gcm_impl_get_ops(); 550 authp = (uint8_t *)ctx->gcm_tmp; 551 ghash = (uint8_t *)ctx->gcm_ghash; 552 memset(authp, 0, block_size); 553 memset(ghash, 0, block_size); 554 555 processed = 0; 556 remainder = auth_data_len; 557 do { 558 if (remainder < block_size) { 559 /* 560 * There's not a block full of data, pad rest of 561 * buffer with zero 562 */ 563 564 if (auth_data != NULL) { 565 memset(authp, 0, block_size); 566 memcpy(authp, &(auth_data[processed]), 567 remainder); 568 } else { 569 ASSERT0(remainder); 570 } 571 572 datap = (uint8_t *)authp; 573 remainder = 0; 574 } else { 575 datap = (uint8_t *)(&(auth_data[processed])); 576 processed += block_size; 577 remainder -= block_size; 578 } 579 580 /* add auth data to the hash */ 581 GHASH(ctx, datap, ghash, gops); 582 583 } while (remainder > 0); 584 585 return (CRYPTO_SUCCESS); 586 } 587 588 /* 589 * The following function is called at encrypt or decrypt init time 590 * for AES GCM mode. 591 * 592 * Init the GCM context struct. Handle the cycle and avx implementations here. 593 */ 594 int 595 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, 596 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 597 void (*copy_block)(uint8_t *, uint8_t *), 598 void (*xor_block)(uint8_t *, uint8_t *)) 599 { 600 int rv; 601 CK_AES_GCM_PARAMS *gcm_param; 602 603 if (param != NULL) { 604 gcm_param = (CK_AES_GCM_PARAMS *)(void *)param; 605 606 if ((rv = gcm_validate_args(gcm_param)) != 0) { 607 return (rv); 608 } 609 610 gcm_ctx->gcm_tag_len = gcm_param->ulTagBits; 611 gcm_ctx->gcm_tag_len >>= 3; 612 gcm_ctx->gcm_processed_data_len = 0; 613 614 /* these values are in bits */ 615 gcm_ctx->gcm_len_a_len_c[0] 616 = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen)); 617 618 rv = CRYPTO_SUCCESS; 619 gcm_ctx->gcm_flags |= GCM_MODE; 620 } else { 621 return (CRYPTO_MECHANISM_PARAM_INVALID); 622 } 623 624 #ifdef CAN_USE_GCM_ASM 625 if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { 626 gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; 627 } else { 628 /* 629 * Handle the "cycle" implementation by creating avx and 630 * non-avx contexts alternately. 631 */ 632 gcm_ctx->gcm_use_avx = gcm_toggle_avx(); 633 /* 634 * We don't handle byte swapped key schedules in the avx 635 * code path. 636 */ 637 aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; 638 if (ks->ops->needs_byteswap == B_TRUE) { 639 gcm_ctx->gcm_use_avx = B_FALSE; 640 } 641 /* Use the MOVBE and the BSWAP variants alternately. */ 642 if (gcm_ctx->gcm_use_avx == B_TRUE && 643 zfs_movbe_available() == B_TRUE) { 644 (void) atomic_toggle_boolean_nv( 645 (volatile boolean_t *)&gcm_avx_can_use_movbe); 646 } 647 } 648 /* Allocate Htab memory as needed. */ 649 if (gcm_ctx->gcm_use_avx == B_TRUE) { 650 size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); 651 652 if (htab_len == 0) { 653 return (CRYPTO_MECHANISM_PARAM_INVALID); 654 } 655 gcm_ctx->gcm_htab_len = htab_len; 656 gcm_ctx->gcm_Htable = 657 (uint64_t *)kmem_alloc(htab_len, KM_SLEEP); 658 659 if (gcm_ctx->gcm_Htable == NULL) { 660 return (CRYPTO_HOST_MEMORY); 661 } 662 } 663 /* Avx and non avx context initialization differs from here on. */ 664 if (gcm_ctx->gcm_use_avx == B_FALSE) { 665 #endif /* ifdef CAN_USE_GCM_ASM */ 666 if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, 667 gcm_param->pAAD, gcm_param->ulAADLen, block_size, 668 encrypt_block, copy_block, xor_block) != 0) { 669 rv = CRYPTO_MECHANISM_PARAM_INVALID; 670 } 671 #ifdef CAN_USE_GCM_ASM 672 } else { 673 if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, 674 gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { 675 rv = CRYPTO_MECHANISM_PARAM_INVALID; 676 } 677 } 678 #endif /* ifdef CAN_USE_GCM_ASM */ 679 680 return (rv); 681 } 682 683 int 684 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, 685 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), 686 void (*copy_block)(uint8_t *, uint8_t *), 687 void (*xor_block)(uint8_t *, uint8_t *)) 688 { 689 int rv; 690 CK_AES_GMAC_PARAMS *gmac_param; 691 692 if (param != NULL) { 693 gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param; 694 695 gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS); 696 gcm_ctx->gcm_processed_data_len = 0; 697 698 /* these values are in bits */ 699 gcm_ctx->gcm_len_a_len_c[0] 700 = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen)); 701 702 rv = CRYPTO_SUCCESS; 703 gcm_ctx->gcm_flags |= GMAC_MODE; 704 } else { 705 return (CRYPTO_MECHANISM_PARAM_INVALID); 706 } 707 708 #ifdef CAN_USE_GCM_ASM 709 /* 710 * Handle the "cycle" implementation by creating avx and non avx 711 * contexts alternately. 712 */ 713 if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { 714 gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; 715 } else { 716 gcm_ctx->gcm_use_avx = gcm_toggle_avx(); 717 } 718 /* We don't handle byte swapped key schedules in the avx code path. */ 719 aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; 720 if (ks->ops->needs_byteswap == B_TRUE) { 721 gcm_ctx->gcm_use_avx = B_FALSE; 722 } 723 /* Allocate Htab memory as needed. */ 724 if (gcm_ctx->gcm_use_avx == B_TRUE) { 725 size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); 726 727 if (htab_len == 0) { 728 return (CRYPTO_MECHANISM_PARAM_INVALID); 729 } 730 gcm_ctx->gcm_htab_len = htab_len; 731 gcm_ctx->gcm_Htable = 732 (uint64_t *)kmem_alloc(htab_len, KM_SLEEP); 733 734 if (gcm_ctx->gcm_Htable == NULL) { 735 return (CRYPTO_HOST_MEMORY); 736 } 737 } 738 739 /* Avx and non avx context initialization differs from here on. */ 740 if (gcm_ctx->gcm_use_avx == B_FALSE) { 741 #endif /* ifdef CAN_USE_GCM_ASM */ 742 if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, 743 gmac_param->pAAD, gmac_param->ulAADLen, block_size, 744 encrypt_block, copy_block, xor_block) != 0) { 745 rv = CRYPTO_MECHANISM_PARAM_INVALID; 746 } 747 #ifdef CAN_USE_GCM_ASM 748 } else { 749 if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, 750 gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { 751 rv = CRYPTO_MECHANISM_PARAM_INVALID; 752 } 753 } 754 #endif /* ifdef CAN_USE_GCM_ASM */ 755 756 return (rv); 757 } 758 759 void * 760 gcm_alloc_ctx(int kmflag) 761 { 762 gcm_ctx_t *gcm_ctx; 763 764 if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) 765 return (NULL); 766 767 gcm_ctx->gcm_flags = GCM_MODE; 768 return (gcm_ctx); 769 } 770 771 void * 772 gmac_alloc_ctx(int kmflag) 773 { 774 gcm_ctx_t *gcm_ctx; 775 776 if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) 777 return (NULL); 778 779 gcm_ctx->gcm_flags = GMAC_MODE; 780 return (gcm_ctx); 781 } 782 783 /* GCM implementation that contains the fastest methods */ 784 static gcm_impl_ops_t gcm_fastest_impl = { 785 .name = "fastest" 786 }; 787 788 /* All compiled in implementations */ 789 static const gcm_impl_ops_t *gcm_all_impl[] = { 790 &gcm_generic_impl, 791 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) 792 &gcm_pclmulqdq_impl, 793 #endif 794 }; 795 796 /* Indicate that benchmark has been completed */ 797 static boolean_t gcm_impl_initialized = B_FALSE; 798 799 /* Hold all supported implementations */ 800 static size_t gcm_supp_impl_cnt = 0; 801 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; 802 803 /* 804 * Returns the GCM operations for encrypt/decrypt/key setup. When a 805 * SIMD implementation is not allowed in the current context, then 806 * fallback to the fastest generic implementation. 807 */ 808 const gcm_impl_ops_t * 809 gcm_impl_get_ops(void) 810 { 811 if (!kfpu_allowed()) 812 return (&gcm_generic_impl); 813 814 const gcm_impl_ops_t *ops = NULL; 815 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); 816 817 switch (impl) { 818 case IMPL_FASTEST: 819 ASSERT(gcm_impl_initialized); 820 ops = &gcm_fastest_impl; 821 break; 822 case IMPL_CYCLE: 823 /* Cycle through supported implementations */ 824 ASSERT(gcm_impl_initialized); 825 ASSERT3U(gcm_supp_impl_cnt, >, 0); 826 static size_t cycle_impl_idx = 0; 827 size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; 828 ops = gcm_supp_impl[idx]; 829 break; 830 #ifdef CAN_USE_GCM_ASM 831 case IMPL_AVX: 832 /* 833 * Make sure that we return a valid implementation while 834 * switching to the avx implementation since there still 835 * may be unfinished non-avx contexts around. 836 */ 837 ops = &gcm_generic_impl; 838 break; 839 #endif 840 default: 841 ASSERT3U(impl, <, gcm_supp_impl_cnt); 842 ASSERT3U(gcm_supp_impl_cnt, >, 0); 843 if (impl < ARRAY_SIZE(gcm_all_impl)) 844 ops = gcm_supp_impl[impl]; 845 break; 846 } 847 848 ASSERT3P(ops, !=, NULL); 849 850 return (ops); 851 } 852 853 /* 854 * Initialize all supported implementations. 855 */ 856 void 857 gcm_impl_init(void) 858 { 859 gcm_impl_ops_t *curr_impl; 860 int i, c; 861 862 /* Move supported implementations into gcm_supp_impls */ 863 for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { 864 curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; 865 866 if (curr_impl->is_supported()) 867 gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl; 868 } 869 gcm_supp_impl_cnt = c; 870 871 /* 872 * Set the fastest implementation given the assumption that the 873 * hardware accelerated version is the fastest. 874 */ 875 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) 876 if (gcm_pclmulqdq_impl.is_supported()) { 877 memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, 878 sizeof (gcm_fastest_impl)); 879 } else 880 #endif 881 { 882 memcpy(&gcm_fastest_impl, &gcm_generic_impl, 883 sizeof (gcm_fastest_impl)); 884 } 885 886 strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX); 887 888 #ifdef CAN_USE_GCM_ASM 889 /* 890 * Use the avx implementation if it's available and the implementation 891 * hasn't changed from its default value of fastest on module load. 892 */ 893 if (gcm_avx_will_work()) { 894 #ifdef HAVE_MOVBE 895 if (zfs_movbe_available() == B_TRUE) { 896 atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); 897 } 898 #endif 899 if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { 900 gcm_set_avx(B_TRUE); 901 } 902 } 903 #endif 904 /* Finish initialization */ 905 atomic_swap_32(&icp_gcm_impl, user_sel_impl); 906 gcm_impl_initialized = B_TRUE; 907 } 908 909 static const struct { 910 const char *name; 911 uint32_t sel; 912 } gcm_impl_opts[] = { 913 { "cycle", IMPL_CYCLE }, 914 { "fastest", IMPL_FASTEST }, 915 #ifdef CAN_USE_GCM_ASM 916 { "avx", IMPL_AVX }, 917 #endif 918 }; 919 920 /* 921 * Function sets desired gcm implementation. 922 * 923 * If we are called before init(), user preference will be saved in 924 * user_sel_impl, and applied in later init() call. This occurs when module 925 * parameter is specified on module load. Otherwise, directly update 926 * icp_gcm_impl. 927 * 928 * @val Name of gcm implementation to use 929 * @param Unused. 930 */ 931 int 932 gcm_impl_set(const char *val) 933 { 934 int err = -EINVAL; 935 char req_name[GCM_IMPL_NAME_MAX]; 936 uint32_t impl = GCM_IMPL_READ(user_sel_impl); 937 size_t i; 938 939 /* sanitize input */ 940 i = strnlen(val, GCM_IMPL_NAME_MAX); 941 if (i == 0 || i >= GCM_IMPL_NAME_MAX) 942 return (err); 943 944 strlcpy(req_name, val, GCM_IMPL_NAME_MAX); 945 while (i > 0 && isspace(req_name[i-1])) 946 i--; 947 req_name[i] = '\0'; 948 949 /* Check mandatory options */ 950 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { 951 #ifdef CAN_USE_GCM_ASM 952 /* Ignore avx implementation if it won't work. */ 953 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { 954 continue; 955 } 956 #endif 957 if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { 958 impl = gcm_impl_opts[i].sel; 959 err = 0; 960 break; 961 } 962 } 963 964 /* check all supported impl if init() was already called */ 965 if (err != 0 && gcm_impl_initialized) { 966 /* check all supported implementations */ 967 for (i = 0; i < gcm_supp_impl_cnt; i++) { 968 if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) { 969 impl = i; 970 err = 0; 971 break; 972 } 973 } 974 } 975 #ifdef CAN_USE_GCM_ASM 976 /* 977 * Use the avx implementation if available and the requested one is 978 * avx or fastest. 979 */ 980 if (gcm_avx_will_work() == B_TRUE && 981 (impl == IMPL_AVX || impl == IMPL_FASTEST)) { 982 gcm_set_avx(B_TRUE); 983 } else { 984 gcm_set_avx(B_FALSE); 985 } 986 #endif 987 988 if (err == 0) { 989 if (gcm_impl_initialized) 990 atomic_swap_32(&icp_gcm_impl, impl); 991 else 992 atomic_swap_32(&user_sel_impl, impl); 993 } 994 995 return (err); 996 } 997 998 #if defined(_KERNEL) && defined(__linux__) 999 1000 static int 1001 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp) 1002 { 1003 return (gcm_impl_set(val)); 1004 } 1005 1006 static int 1007 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) 1008 { 1009 int i, cnt = 0; 1010 char *fmt; 1011 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); 1012 1013 ASSERT(gcm_impl_initialized); 1014 1015 /* list mandatory options */ 1016 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { 1017 #ifdef CAN_USE_GCM_ASM 1018 /* Ignore avx implementation if it won't work. */ 1019 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { 1020 continue; 1021 } 1022 #endif 1023 fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; 1024 cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name); 1025 } 1026 1027 /* list all supported implementations */ 1028 for (i = 0; i < gcm_supp_impl_cnt; i++) { 1029 fmt = (i == impl) ? "[%s] " : "%s "; 1030 cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name); 1031 } 1032 1033 return (cnt); 1034 } 1035 1036 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, 1037 NULL, 0644); 1038 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); 1039 #endif /* defined(__KERNEL) */ 1040 1041 #ifdef CAN_USE_GCM_ASM 1042 #define GCM_BLOCK_LEN 16 1043 /* 1044 * The openssl asm routines are 6x aggregated and need that many bytes 1045 * at minimum. 1046 */ 1047 #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) 1048 #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) 1049 /* 1050 * Ensure the chunk size is reasonable since we are allocating a 1051 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. 1052 */ 1053 #define GCM_AVX_MAX_CHUNK_SIZE \ 1054 (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) 1055 1056 /* Clear the FPU registers since they hold sensitive internal state. */ 1057 #define clear_fpu_regs() clear_fpu_regs_avx() 1058 #define GHASH_AVX(ctx, in, len) \ 1059 gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ 1060 in, len) 1061 1062 #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) 1063 1064 /* Get the chunk size module parameter. */ 1065 #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size 1066 1067 /* 1068 * Module parameter: number of bytes to process at once while owning the FPU. 1069 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is 1070 * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. 1071 */ 1072 static uint32_t gcm_avx_chunk_size = 1073 ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; 1074 1075 extern void clear_fpu_regs_avx(void); 1076 extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); 1077 extern void aes_encrypt_intel(const uint32_t rk[], int nr, 1078 const uint32_t pt[4], uint32_t ct[4]); 1079 1080 extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); 1081 extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, 1082 const uint8_t *in, size_t len); 1083 1084 extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, 1085 const void *, uint64_t *, uint64_t *); 1086 1087 extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, 1088 const void *, uint64_t *, uint64_t *); 1089 1090 static inline boolean_t 1091 gcm_avx_will_work(void) 1092 { 1093 /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */ 1094 return (kfpu_allowed() && 1095 zfs_avx_available() && zfs_aes_available() && 1096 zfs_pclmulqdq_available()); 1097 } 1098 1099 static inline void 1100 gcm_set_avx(boolean_t val) 1101 { 1102 if (gcm_avx_will_work() == B_TRUE) { 1103 atomic_swap_32(&gcm_use_avx, val); 1104 } 1105 } 1106 1107 static inline boolean_t 1108 gcm_toggle_avx(void) 1109 { 1110 if (gcm_avx_will_work() == B_TRUE) { 1111 return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); 1112 } else { 1113 return (B_FALSE); 1114 } 1115 } 1116 1117 static inline size_t 1118 gcm_simd_get_htab_size(boolean_t simd_mode) 1119 { 1120 switch (simd_mode) { 1121 case B_TRUE: 1122 return (2 * 6 * 2 * sizeof (uint64_t)); 1123 1124 default: 1125 return (0); 1126 } 1127 } 1128 1129 /* 1130 * Clear sensitive data in the context. 1131 * 1132 * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and 1133 * ctx->gcm_Htable contain the hash sub key which protects authentication. 1134 * 1135 * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for 1136 * a known plaintext attack, they consists of the IV and the first and last 1137 * counter respectively. If they should be cleared is debatable. 1138 */ 1139 static inline void 1140 gcm_clear_ctx(gcm_ctx_t *ctx) 1141 { 1142 memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); 1143 memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); 1144 memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0)); 1145 memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp)); 1146 } 1147 1148 /* Increment the GCM counter block by n. */ 1149 static inline void 1150 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) 1151 { 1152 uint64_t counter_mask = ntohll(0x00000000ffffffffULL); 1153 uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); 1154 1155 counter = htonll(counter + n); 1156 counter &= counter_mask; 1157 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; 1158 } 1159 1160 /* 1161 * Encrypt multiple blocks of data in GCM mode. 1162 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines 1163 * if possible. While processing a chunk the FPU is "locked". 1164 */ 1165 static int 1166 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, 1167 size_t length, crypto_data_t *out, size_t block_size) 1168 { 1169 size_t bleft = length; 1170 size_t need = 0; 1171 size_t done = 0; 1172 uint8_t *datap = (uint8_t *)data; 1173 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; 1174 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); 1175 uint64_t *ghash = ctx->gcm_ghash; 1176 uint64_t *cb = ctx->gcm_cb; 1177 uint8_t *ct_buf = NULL; 1178 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; 1179 int rv = CRYPTO_SUCCESS; 1180 1181 ASSERT(block_size == GCM_BLOCK_LEN); 1182 /* 1183 * If the last call left an incomplete block, try to fill 1184 * it first. 1185 */ 1186 if (ctx->gcm_remainder_len > 0) { 1187 need = block_size - ctx->gcm_remainder_len; 1188 if (length < need) { 1189 /* Accumulate bytes here and return. */ 1190 memcpy((uint8_t *)ctx->gcm_remainder + 1191 ctx->gcm_remainder_len, datap, length); 1192 1193 ctx->gcm_remainder_len += length; 1194 if (ctx->gcm_copy_to == NULL) { 1195 ctx->gcm_copy_to = datap; 1196 } 1197 return (CRYPTO_SUCCESS); 1198 } else { 1199 /* Complete incomplete block. */ 1200 memcpy((uint8_t *)ctx->gcm_remainder + 1201 ctx->gcm_remainder_len, datap, need); 1202 1203 ctx->gcm_copy_to = NULL; 1204 } 1205 } 1206 1207 /* Allocate a buffer to encrypt to if there is enough input. */ 1208 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { 1209 ct_buf = vmem_alloc(chunk_size, KM_SLEEP); 1210 if (ct_buf == NULL) { 1211 return (CRYPTO_HOST_MEMORY); 1212 } 1213 } 1214 1215 /* If we completed an incomplete block, encrypt and write it out. */ 1216 if (ctx->gcm_remainder_len > 0) { 1217 kfpu_begin(); 1218 aes_encrypt_intel(key->encr_ks.ks32, key->nr, 1219 (const uint32_t *)cb, (uint32_t *)tmp); 1220 1221 gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); 1222 GHASH_AVX(ctx, tmp, block_size); 1223 clear_fpu_regs(); 1224 kfpu_end(); 1225 rv = crypto_put_output_data(tmp, out, block_size); 1226 out->cd_offset += block_size; 1227 gcm_incr_counter_block(ctx); 1228 ctx->gcm_processed_data_len += block_size; 1229 bleft -= need; 1230 datap += need; 1231 ctx->gcm_remainder_len = 0; 1232 } 1233 1234 /* Do the bulk encryption in chunk_size blocks. */ 1235 for (; bleft >= chunk_size; bleft -= chunk_size) { 1236 kfpu_begin(); 1237 done = aesni_gcm_encrypt( 1238 datap, ct_buf, chunk_size, key, cb, ghash); 1239 1240 clear_fpu_regs(); 1241 kfpu_end(); 1242 if (done != chunk_size) { 1243 rv = CRYPTO_FAILED; 1244 goto out_nofpu; 1245 } 1246 rv = crypto_put_output_data(ct_buf, out, chunk_size); 1247 if (rv != CRYPTO_SUCCESS) { 1248 goto out_nofpu; 1249 } 1250 out->cd_offset += chunk_size; 1251 datap += chunk_size; 1252 ctx->gcm_processed_data_len += chunk_size; 1253 } 1254 /* Check if we are already done. */ 1255 if (bleft == 0) { 1256 goto out_nofpu; 1257 } 1258 /* Bulk encrypt the remaining data. */ 1259 kfpu_begin(); 1260 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { 1261 done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); 1262 if (done == 0) { 1263 rv = CRYPTO_FAILED; 1264 goto out; 1265 } 1266 rv = crypto_put_output_data(ct_buf, out, done); 1267 if (rv != CRYPTO_SUCCESS) { 1268 goto out; 1269 } 1270 out->cd_offset += done; 1271 ctx->gcm_processed_data_len += done; 1272 datap += done; 1273 bleft -= done; 1274 1275 } 1276 /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ 1277 while (bleft > 0) { 1278 if (bleft < block_size) { 1279 memcpy(ctx->gcm_remainder, datap, bleft); 1280 ctx->gcm_remainder_len = bleft; 1281 ctx->gcm_copy_to = datap; 1282 goto out; 1283 } 1284 /* Encrypt, hash and write out. */ 1285 aes_encrypt_intel(key->encr_ks.ks32, key->nr, 1286 (const uint32_t *)cb, (uint32_t *)tmp); 1287 1288 gcm_xor_avx(datap, tmp); 1289 GHASH_AVX(ctx, tmp, block_size); 1290 rv = crypto_put_output_data(tmp, out, block_size); 1291 if (rv != CRYPTO_SUCCESS) { 1292 goto out; 1293 } 1294 out->cd_offset += block_size; 1295 gcm_incr_counter_block(ctx); 1296 ctx->gcm_processed_data_len += block_size; 1297 datap += block_size; 1298 bleft -= block_size; 1299 } 1300 out: 1301 clear_fpu_regs(); 1302 kfpu_end(); 1303 out_nofpu: 1304 if (ct_buf != NULL) { 1305 vmem_free(ct_buf, chunk_size); 1306 } 1307 return (rv); 1308 } 1309 1310 /* 1311 * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual 1312 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. 1313 */ 1314 static int 1315 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) 1316 { 1317 uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; 1318 uint32_t *J0 = (uint32_t *)ctx->gcm_J0; 1319 uint8_t *remainder = (uint8_t *)ctx->gcm_remainder; 1320 size_t rem_len = ctx->gcm_remainder_len; 1321 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; 1322 int aes_rounds = ((aes_key_t *)keysched)->nr; 1323 int rv; 1324 1325 ASSERT(block_size == GCM_BLOCK_LEN); 1326 1327 if (out->cd_length < (rem_len + ctx->gcm_tag_len)) { 1328 return (CRYPTO_DATA_LEN_RANGE); 1329 } 1330 1331 kfpu_begin(); 1332 /* Pad last incomplete block with zeros, encrypt and hash. */ 1333 if (rem_len > 0) { 1334 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; 1335 const uint32_t *cb = (uint32_t *)ctx->gcm_cb; 1336 1337 aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); 1338 memset(remainder + rem_len, 0, block_size - rem_len); 1339 for (int i = 0; i < rem_len; i++) { 1340 remainder[i] ^= tmp[i]; 1341 } 1342 GHASH_AVX(ctx, remainder, block_size); 1343 ctx->gcm_processed_data_len += rem_len; 1344 /* No need to increment counter_block, it's the last block. */ 1345 } 1346 /* Finish tag. */ 1347 ctx->gcm_len_a_len_c[1] = 1348 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); 1349 GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); 1350 aes_encrypt_intel(keysched, aes_rounds, J0, J0); 1351 1352 gcm_xor_avx((uint8_t *)J0, ghash); 1353 clear_fpu_regs(); 1354 kfpu_end(); 1355 1356 /* Output remainder. */ 1357 if (rem_len > 0) { 1358 rv = crypto_put_output_data(remainder, out, rem_len); 1359 if (rv != CRYPTO_SUCCESS) 1360 return (rv); 1361 } 1362 out->cd_offset += rem_len; 1363 ctx->gcm_remainder_len = 0; 1364 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); 1365 if (rv != CRYPTO_SUCCESS) 1366 return (rv); 1367 1368 out->cd_offset += ctx->gcm_tag_len; 1369 /* Clear sensitive data in the context before returning. */ 1370 gcm_clear_ctx(ctx); 1371 return (CRYPTO_SUCCESS); 1372 } 1373 1374 /* 1375 * Finalize decryption: We just have accumulated crypto text, so now we 1376 * decrypt it here inplace. 1377 */ 1378 static int 1379 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) 1380 { 1381 ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); 1382 ASSERT3U(block_size, ==, 16); 1383 1384 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; 1385 size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; 1386 uint8_t *datap = ctx->gcm_pt_buf; 1387 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); 1388 uint32_t *cb = (uint32_t *)ctx->gcm_cb; 1389 uint64_t *ghash = ctx->gcm_ghash; 1390 uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; 1391 int rv = CRYPTO_SUCCESS; 1392 size_t bleft, done; 1393 1394 /* 1395 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be 1396 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of 1397 * GCM_AVX_MIN_DECRYPT_BYTES. 1398 */ 1399 for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { 1400 kfpu_begin(); 1401 done = aesni_gcm_decrypt(datap, datap, chunk_size, 1402 (const void *)key, ctx->gcm_cb, ghash); 1403 clear_fpu_regs(); 1404 kfpu_end(); 1405 if (done != chunk_size) { 1406 return (CRYPTO_FAILED); 1407 } 1408 datap += done; 1409 } 1410 /* Decrypt remainder, which is less than chunk size, in one go. */ 1411 kfpu_begin(); 1412 if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { 1413 done = aesni_gcm_decrypt(datap, datap, bleft, 1414 (const void *)key, ctx->gcm_cb, ghash); 1415 if (done == 0) { 1416 clear_fpu_regs(); 1417 kfpu_end(); 1418 return (CRYPTO_FAILED); 1419 } 1420 datap += done; 1421 bleft -= done; 1422 } 1423 ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); 1424 1425 /* 1426 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain, 1427 * decrypt them block by block. 1428 */ 1429 while (bleft > 0) { 1430 /* Incomplete last block. */ 1431 if (bleft < block_size) { 1432 uint8_t *lastb = (uint8_t *)ctx->gcm_remainder; 1433 1434 memset(lastb, 0, block_size); 1435 memcpy(lastb, datap, bleft); 1436 /* The GCM processing. */ 1437 GHASH_AVX(ctx, lastb, block_size); 1438 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); 1439 for (size_t i = 0; i < bleft; i++) { 1440 datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; 1441 } 1442 break; 1443 } 1444 /* The GCM processing. */ 1445 GHASH_AVX(ctx, datap, block_size); 1446 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); 1447 gcm_xor_avx((uint8_t *)tmp, datap); 1448 gcm_incr_counter_block(ctx); 1449 1450 datap += block_size; 1451 bleft -= block_size; 1452 } 1453 if (rv != CRYPTO_SUCCESS) { 1454 clear_fpu_regs(); 1455 kfpu_end(); 1456 return (rv); 1457 } 1458 /* Decryption done, finish the tag. */ 1459 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); 1460 GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); 1461 aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, 1462 (uint32_t *)ctx->gcm_J0); 1463 1464 gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); 1465 1466 /* We are done with the FPU, restore its state. */ 1467 clear_fpu_regs(); 1468 kfpu_end(); 1469 1470 /* Compare the input authentication tag with what we calculated. */ 1471 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { 1472 /* They don't match. */ 1473 return (CRYPTO_INVALID_MAC); 1474 } 1475 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); 1476 if (rv != CRYPTO_SUCCESS) { 1477 return (rv); 1478 } 1479 out->cd_offset += pt_len; 1480 gcm_clear_ctx(ctx); 1481 return (CRYPTO_SUCCESS); 1482 } 1483 1484 /* 1485 * Initialize the GCM params H, Htabtle and the counter block. Save the 1486 * initial counter block. 1487 */ 1488 static int 1489 gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, 1490 unsigned char *auth_data, size_t auth_data_len, size_t block_size) 1491 { 1492 uint8_t *cb = (uint8_t *)ctx->gcm_cb; 1493 uint64_t *H = ctx->gcm_H; 1494 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; 1495 int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; 1496 uint8_t *datap = auth_data; 1497 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; 1498 size_t bleft; 1499 1500 ASSERT(block_size == GCM_BLOCK_LEN); 1501 1502 /* Init H (encrypt zero block) and create the initial counter block. */ 1503 memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash)); 1504 memset(H, 0, sizeof (ctx->gcm_H)); 1505 kfpu_begin(); 1506 aes_encrypt_intel(keysched, aes_rounds, 1507 (const uint32_t *)H, (uint32_t *)H); 1508 1509 gcm_init_htab_avx(ctx->gcm_Htable, H); 1510 1511 if (iv_len == 12) { 1512 memcpy(cb, iv, 12); 1513 cb[12] = 0; 1514 cb[13] = 0; 1515 cb[14] = 0; 1516 cb[15] = 1; 1517 /* We need the ICB later. */ 1518 memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0)); 1519 } else { 1520 /* 1521 * Most consumers use 12 byte IVs, so it's OK to use the 1522 * original routines for other IV sizes, just avoid nesting 1523 * kfpu_begin calls. 1524 */ 1525 clear_fpu_regs(); 1526 kfpu_end(); 1527 gcm_format_initial_blocks(iv, iv_len, ctx, block_size, 1528 aes_copy_block, aes_xor_block); 1529 kfpu_begin(); 1530 } 1531 1532 /* Openssl post increments the counter, adjust for that. */ 1533 gcm_incr_counter_block(ctx); 1534 1535 /* Ghash AAD in chunk_size blocks. */ 1536 for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { 1537 GHASH_AVX(ctx, datap, chunk_size); 1538 datap += chunk_size; 1539 clear_fpu_regs(); 1540 kfpu_end(); 1541 kfpu_begin(); 1542 } 1543 /* Ghash the remainder and handle possible incomplete GCM block. */ 1544 if (bleft > 0) { 1545 size_t incomp = bleft % block_size; 1546 1547 bleft -= incomp; 1548 if (bleft > 0) { 1549 GHASH_AVX(ctx, datap, bleft); 1550 datap += bleft; 1551 } 1552 if (incomp > 0) { 1553 /* Zero pad and hash incomplete last block. */ 1554 uint8_t *authp = (uint8_t *)ctx->gcm_tmp; 1555 1556 memset(authp, 0, block_size); 1557 memcpy(authp, datap, incomp); 1558 GHASH_AVX(ctx, authp, block_size); 1559 } 1560 } 1561 clear_fpu_regs(); 1562 kfpu_end(); 1563 return (CRYPTO_SUCCESS); 1564 } 1565 1566 #if defined(_KERNEL) 1567 static int 1568 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) 1569 { 1570 unsigned long val; 1571 char val_rounded[16]; 1572 int error = 0; 1573 1574 error = kstrtoul(buf, 0, &val); 1575 if (error) 1576 return (error); 1577 1578 val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; 1579 1580 if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) 1581 return (-EINVAL); 1582 1583 snprintf(val_rounded, 16, "%u", (uint32_t)val); 1584 error = param_set_uint(val_rounded, kp); 1585 return (error); 1586 } 1587 1588 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, 1589 param_get_uint, &gcm_avx_chunk_size, 0644); 1590 1591 MODULE_PARM_DESC(icp_gcm_avx_chunk_size, 1592 "How many bytes to process while owning the FPU"); 1593 1594 #endif /* defined(__KERNEL) */ 1595 #endif /* ifdef CAN_USE_GCM_ASM */ 1596