1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/cmn_err.h>
28 #include <modes/modes.h>
29 #include <sys/crypto/common.h>
30 #include <sys/crypto/icp.h>
31 #include <sys/crypto/impl.h>
32 #include <sys/byteorder.h>
33 #include <sys/simd.h>
34 #include <modes/gcm_impl.h>
35 #ifdef CAN_USE_GCM_ASM
36 #include <aes/aes_impl.h>
37 #endif
38
/*
 * Fold one 16-byte block (d) into the running GHASH accumulator of
 * context (c) using implementation ops (o); the product is written
 * through (t) (normally the gcm_ghash field itself).
 *
 * Wrapped in do/while (0) so the two-statement expansion behaves as a
 * single statement at every call site (safe under an unbraced if/else).
 *
 * NOTE: expands a call through the local xor_block function pointer, so
 * it may only be used where xor_block is in scope.
 */
#define	GHASH(c, d, t, o) \
	do { \
		xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
		(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
		    (uint64_t *)(void *)(t)); \
	} while (0)

44 /* Select GCM implementation */
45 #define IMPL_FASTEST (UINT32_MAX)
46 #define IMPL_CYCLE (UINT32_MAX-1)
47 #ifdef CAN_USE_GCM_ASM
48 #define IMPL_AVX (UINT32_MAX-2)
49 #endif
50 #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
51 static uint32_t icp_gcm_impl = IMPL_FASTEST;
52 static uint32_t user_sel_impl = IMPL_FASTEST;
53
54 #ifdef CAN_USE_GCM_ASM
55 /* Does the architecture we run on support the MOVBE instruction? */
56 boolean_t gcm_avx_can_use_movbe = B_FALSE;
57 /*
58 * Whether to use the optimized openssl gcm and ghash implementations.
59 * Set to true if module parameter icp_gcm_impl == "avx".
60 */
61 static boolean_t gcm_use_avx = B_FALSE;
62 #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
63
64 extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
65
66 static inline boolean_t gcm_avx_will_work(void);
67 static inline void gcm_set_avx(boolean_t);
68 static inline boolean_t gcm_toggle_avx(void);
69 static inline size_t gcm_simd_get_htab_size(boolean_t);
70
71 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
72 crypto_data_t *, size_t);
73
74 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
75 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
76 static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
77 size_t, size_t);
78 #endif /* ifdef CAN_USE_GCM_ASM */
79
/*
 * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode
 * is done in another function.
 *
 * Runs CTR-mode encryption over the input, writes the ciphertext to
 * 'out' via the crypto framework scatter helpers and folds every
 * ciphertext block into the running GHASH. Partial trailing input is
 * buffered in ctx->gcm_remainder until more data (or the final) arrives.
 * Returns CRYPTO_SUCCESS, or CRYPTO_DATA_LEN_RANGE on short input while
 * completing a buffered block.
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	/* Contexts set up for the avx implementation take the avx path. */
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	/* Mask selecting the low (big-endian) 32 counter bits of gcm_cb[1]. */
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	/* Not yet a full block including carry-over: just buffer and return. */
	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			/* Top up the buffered partial block from new input. */
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		/* CTR step: E(K, CB) into gcm_tmp, then XOR the plaintext. */
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			/* Ciphertext block straddles two output buffers. */
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}
202
/*
 * Finish a GCM encryption: encrypt any buffered partial block, close the
 * GHASH with len(A) || len(C), compute the tag as E(K, J0) XOR GHASH and
 * append the partial ciphertext plus the tag to 'out'.
 * Returns CRYPTO_SUCCESS, CRYPTO_DATA_LEN_RANGE if 'out' is too small,
 * or an error from crypto_put_output_data().
 */
int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	/* Output must hold the remaining ciphertext plus the tag. */
	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		/* Zero-pad the partial block before hashing it. */
		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	/* Close the GHASH with the AAD/ciphertext bit lengths. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	/* Tag = E(K, J0) XOR GHASH; overwrites gcm_J0 and ghash in place. */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}
284
/*
 * This will only deal with decrypting the last block of the input that
 * might not be a multiple of block length.
 *
 * The partial ciphertext lives in ctx->gcm_remainder
 * (ctx->gcm_remainder_len bytes); the plaintext is written into
 * ctx->gcm_pt_buf at 'index'. The zero-padded ciphertext is folded into
 * the GHASH before decryption.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits
	 */
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}
327
328 int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t * ctx,char * data,size_t length,crypto_data_t * out,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))329 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
330 crypto_data_t *out, size_t block_size,
331 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
332 void (*copy_block)(uint8_t *, uint8_t *),
333 void (*xor_block)(uint8_t *, uint8_t *))
334 {
335 (void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
336 (void) xor_block;
337 size_t new_len;
338 uint8_t *new;
339
340 /*
341 * Copy contiguous ciphertext input blocks to plaintext buffer.
342 * Ciphertext will be decrypted in the final.
343 */
344 if (length > 0) {
345 new_len = ctx->gcm_pt_buf_len + length;
346 new = vmem_alloc(new_len, KM_SLEEP);
347 if (new == NULL) {
348 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
349 ctx->gcm_pt_buf = NULL;
350 return (CRYPTO_HOST_MEMORY);
351 }
352
353 if (ctx->gcm_pt_buf != NULL) {
354 memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
355 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
356 } else {
357 ASSERT0(ctx->gcm_pt_buf_len);
358 }
359
360 ctx->gcm_pt_buf = new;
361 ctx->gcm_pt_buf_len = new_len;
362 memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
363 length);
364 ctx->gcm_processed_data_len += length;
365 }
366
367 ctx->gcm_remainder_len = 0;
368 return (CRYPTO_SUCCESS);
369 }
370
/*
 * Finish a GCM decryption: run CTR decryption and GHASH over the whole
 * buffered ciphertext in ctx->gcm_pt_buf (which ends with the tag),
 * verify the tag and, only on success, release the plaintext to 'out'.
 * Returns CRYPTO_INVALID_MAC on tag mismatch.
 */
int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	/* The tag occupies the last gcm_tag_len bytes of the buffer. */
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * not expecting anymore ciphertext, just
			 * compute plaintext for the remaining input
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext; decrypts the block in place. */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	/* Close the GHASH and compute the expected tag from J0. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	/*
	 * compare the input authentication tag with what we calculated
	 * NOTE(review): memcmp() is not constant-time; presumably
	 * acceptable here since the attacker-visible result is only
	 * pass/fail — confirm against project threat model.
	 */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}
453
454 static int
gcm_validate_args(CK_AES_GCM_PARAMS * gcm_param)455 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
456 {
457 size_t tag_len;
458
459 /*
460 * Check the length of the authentication tag (in bits).
461 */
462 tag_len = gcm_param->ulTagBits;
463 switch (tag_len) {
464 case 32:
465 case 64:
466 case 96:
467 case 104:
468 case 112:
469 case 120:
470 case 128:
471 break;
472 default:
473 return (CRYPTO_MECHANISM_PARAM_INVALID);
474 }
475
476 if (gcm_param->ulIvLen == 0)
477 return (CRYPTO_MECHANISM_PARAM_INVALID);
478
479 return (CRYPTO_SUCCESS);
480 }
481
/*
 * Derive the pre-counter block J0 and the initial counter block from the
 * IV, per the GCM specification: a 96-bit IV is used directly with the
 * counter set to 1; any other length is run through GHASH together with
 * its bit length. The result is stored in ctx->gcm_cb and ctx->gcm_J0.
 */
static void
gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		/* 96-bit IV: J0 = IV || 0^31 || 1, no hashing needed. */
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				/* Zero-pad the last partial IV block. */
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		/* Close the IV hash with 0^64 || len(IV) in bits. */
		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}
530
/*
 * Initialize the non-avx GCM state: derive the hash subkey H by
 * encrypting the zero block, format J0/the counter block from the IV,
 * and fold the (zero-padded) AAD into the GHASH accumulator.
 * Always returns CRYPTO_SUCCESS.
 */
static int
gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	/* GHASH accumulator starts at zero. */
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	/*
	 * Note: runs at least once even with no AAD, hashing one zero
	 * block in that case (authp was just zeroed above).
	 */
	do {
		if (remainder < block_size) {
			/*
			 * There's not a block full of data, pad rest of
			 * buffer with zero
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}
588
/*
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 *
 * 'param' must point to a CK_AES_GCM_PARAMS; its tag/IV parameters are
 * validated and recorded in the context. Whether this context will use
 * the avx code path is decided here (and Htable memory allocated if so),
 * then the matching init routine is run.
 * Returns CRYPTO_SUCCESS, CRYPTO_MECHANISM_PARAM_INVALID or
 * CRYPTO_HOST_MEMORY.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		/* GCM mode. */
		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}
		gcm_ctx->gcm_flags |= GCM_MODE;

		size_t tbits = gcm_param->ulTagBits;
		tag_len = CRYPTO_BITS2BYTES(tbits);
		iv_len = gcm_param->ulIvLen;

		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		/* Use the module-wide avx selection as-is. */
		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
	} else {
		/*
		 * Handle the "cycle" implementation by creating avx and
		 * non-avx contexts alternately.
		 */
		gcm_ctx->gcm_use_avx = gcm_toggle_avx();

		/* The avx impl. doesn't handle byte swapped key schedules. */
		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
			gcm_ctx->gcm_use_avx = B_FALSE;
		}
		/*
		 * If this is a GCM context, use the MOVBE and the BSWAP
		 * variants alternately.
		 */
		if (gcm_ctx->gcm_use_avx == B_TRUE &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * We don't handle byte swapped key schedules in the avx code path,
	 * still they could be created by the aes generic implementation.
	 * Make sure not to use them since we'll corrupt data if we do.
	 */
	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
		gcm_ctx->gcm_use_avx = B_FALSE;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/* Allocate Htab memory as needed. */
	if (gcm_ctx->gcm_use_avx == B_TRUE) {
		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

		if (htab_len == 0) {
			return (CRYPTO_MECHANISM_PARAM_INVALID);
		}
		gcm_ctx->gcm_htab_len = htab_len;
		gcm_ctx->gcm_Htable =
		    kmem_alloc(htab_len, KM_SLEEP);

		if (gcm_ctx->gcm_Htable == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}
	/* Avx and non avx context initialization differs from here on. */
	if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
#ifdef CAN_USE_GCM_ASM
	} else {
		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
	}
#endif /* ifdef CAN_USE_GCM_ASM */

	return (rv);
}
708
709 void *
gcm_alloc_ctx(int kmflag)710 gcm_alloc_ctx(int kmflag)
711 {
712 gcm_ctx_t *gcm_ctx;
713
714 if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
715 return (NULL);
716
717 gcm_ctx->gcm_flags = GCM_MODE;
718 return (gcm_ctx);
719 }
720
/*
 * GCM implementation that contains the fastest methods; populated by
 * gcm_impl_init() from the best supported implementation.
 */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations; filled in by gcm_impl_init(). */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
740
/*
 * Returns the GCM operations for encrypt/decrypt/key setup. When a
 * SIMD implementation is not allowed in the current context, then
 * fallback to the fastest generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	/* SIMD not usable here (e.g. no FPU access): force generic. */
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		/* Per-call rotation index; races are harmless here. */
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation since there still
		 * may be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		/* A specific implementation was selected by index. */
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}
790
/*
 * Initialize all supported implementations.
 *
 * Populates gcm_supp_impl[], picks the "fastest" implementation, enables
 * the avx code path when usable, applies any pre-init user selection and
 * finally marks the framework initialized.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	/* The copied ops keep their own name; relabel as "fastest". */
	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_set_avx(B_TRUE);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}
846
847 static const struct {
848 const char *name;
849 uint32_t sel;
850 } gcm_impl_opts[] = {
851 { "cycle", IMPL_CYCLE },
852 { "fastest", IMPL_FASTEST },
853 #ifdef CAN_USE_GCM_ASM
854 { "avx", IMPL_AVX },
855 #endif
856 };
857
/*
 * Function sets desired gcm implementation.
 *
 * If we are called before init(), user preference will be saved in
 * user_sel_impl, and applied in later init() call. This occurs when module
 * parameter is specified on module load. Otherwise, directly update
 * icp_gcm_impl.
 *
 * @val		Name of gcm implementation to use
 * @return	0 on success, -EINVAL if the name is not recognized.
 */
int
gcm_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[GCM_IMPL_NAME_MAX];
	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, GCM_IMPL_NAME_MAX);
	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
	/* Strip trailing whitespace (e.g. the newline from a sysfs write). */
	while (i > 0 && isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
			impl = gcm_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && gcm_impl_initialized) {
		/* check all supported implementations */
		for (i = 0; i < gcm_supp_impl_cnt; i++) {
			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}
#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if available and the requested one is
	 * avx or fastest. On lookup failure 'impl' still holds the current
	 * user selection, so the avx flag stays consistent with it.
	 */
	if (gcm_avx_will_work() == B_TRUE &&
	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
		gcm_set_avx(B_TRUE);
	} else {
		gcm_set_avx(B_FALSE);
	}
#endif

	if (err == 0) {
		if (gcm_impl_initialized)
			atomic_swap_32(&icp_gcm_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}
935
936 #if defined(_KERNEL) && defined(__linux__)
937
938 static int
icp_gcm_impl_set(const char * val,zfs_kernel_param_t * kp)939 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
940 {
941 return (gcm_impl_set(val));
942 }
943
944 static int
icp_gcm_impl_get(char * buffer,zfs_kernel_param_t * kp)945 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
946 {
947 int i, cnt = 0;
948 char *fmt;
949 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
950
951 ASSERT(gcm_impl_initialized);
952
953 /* list mandatory options */
954 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
955 #ifdef CAN_USE_GCM_ASM
956 /* Ignore avx implementation if it won't work. */
957 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
958 continue;
959 }
960 #endif
961 fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
962 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
963 gcm_impl_opts[i].name);
964 }
965
966 /* list all supported implementations */
967 for (i = 0; i < gcm_supp_impl_cnt; i++) {
968 fmt = (i == impl) ? "[%s] " : "%s ";
969 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
970 gcm_supp_impl[i]->name);
971 }
972
973 return (cnt);
974 }
975
976 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
977 NULL, 0644);
978 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
979 #endif /* defined(__KERNEL) */
980
981 #ifdef CAN_USE_GCM_ASM
982 #define GCM_BLOCK_LEN 16
983 /*
984 * The openssl asm routines are 6x aggregated and need that many bytes
985 * at minimum.
986 */
987 #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
988 #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
989 /*
990 * Ensure the chunk size is reasonable since we are allocating a
991 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
992 */
993 #define GCM_AVX_MAX_CHUNK_SIZE \
994 (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
995
996 /* Clear the FPU registers since they hold sensitive internal state. */
997 #define clear_fpu_regs() clear_fpu_regs_avx()
998 #define GHASH_AVX(ctx, in, len) \
999 gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
1000 in, len)
1001
1002 #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1003
1004 /* Get the chunk size module parameter. */
1005 #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1006
1007 /*
1008 * Module parameter: number of bytes to process at once while owning the FPU.
1009 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is
1010 * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES.
1011 */
1012 static uint32_t gcm_avx_chunk_size =
1013 ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1014
1015 extern void ASMABI clear_fpu_regs_avx(void);
1016 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1017 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1018 const uint32_t pt[4], uint32_t ct[4]);
1019
1020 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1021 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1022 const uint8_t *in, size_t len);
1023
1024 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1025 const void *, uint64_t *, uint64_t *);
1026
1027 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1028 const void *, uint64_t *, uint64_t *);
1029
1030 static inline boolean_t
gcm_avx_will_work(void)1031 gcm_avx_will_work(void)
1032 {
1033 /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1034 return (kfpu_allowed() &&
1035 zfs_avx_available() && zfs_aes_available() &&
1036 zfs_pclmulqdq_available());
1037 }
1038
1039 static inline void
gcm_set_avx(boolean_t val)1040 gcm_set_avx(boolean_t val)
1041 {
1042 if (gcm_avx_will_work() == B_TRUE) {
1043 atomic_swap_32(&gcm_use_avx, val);
1044 }
1045 }
1046
1047 static inline boolean_t
gcm_toggle_avx(void)1048 gcm_toggle_avx(void)
1049 {
1050 if (gcm_avx_will_work() == B_TRUE) {
1051 return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1052 } else {
1053 return (B_FALSE);
1054 }
1055 }
1056
1057 static inline size_t
gcm_simd_get_htab_size(boolean_t simd_mode)1058 gcm_simd_get_htab_size(boolean_t simd_mode)
1059 {
1060 switch (simd_mode) {
1061 case B_TRUE:
1062 return (2 * 6 * 2 * sizeof (uint64_t));
1063
1064 default:
1065 return (0);
1066 }
1067 }
1068
1069
/* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	/*
	 * The counter occupies the low-order 32 bits of the second 64 bit
	 * word of the counter block and is stored big-endian; the mask
	 * selects exactly those bytes in the block's in-memory layout.
	 */
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	/*
	 * Add n in host byte order, convert back, and re-apply the mask so
	 * the counter wraps within its 32 bits instead of spilling into
	 * the adjacent IV bytes.
	 */
	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}
1081
1082 /*
1083 * Encrypt multiple blocks of data in GCM mode.
1084 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1085 * if possible. While processing a chunk the FPU is "locked".
1086 */
1087 static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t * ctx,char * data,size_t length,crypto_data_t * out,size_t block_size)1088 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1089 size_t length, crypto_data_t *out, size_t block_size)
1090 {
1091 size_t bleft = length;
1092 size_t need = 0;
1093 size_t done = 0;
1094 uint8_t *datap = (uint8_t *)data;
1095 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1096 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1097 uint64_t *ghash = ctx->gcm_ghash;
1098 uint64_t *cb = ctx->gcm_cb;
1099 uint8_t *ct_buf = NULL;
1100 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1101 int rv = CRYPTO_SUCCESS;
1102
1103 ASSERT(block_size == GCM_BLOCK_LEN);
1104 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1105 B_FALSE);
1106 /*
1107 * If the last call left an incomplete block, try to fill
1108 * it first.
1109 */
1110 if (ctx->gcm_remainder_len > 0) {
1111 need = block_size - ctx->gcm_remainder_len;
1112 if (length < need) {
1113 /* Accumulate bytes here and return. */
1114 memcpy((uint8_t *)ctx->gcm_remainder +
1115 ctx->gcm_remainder_len, datap, length);
1116
1117 ctx->gcm_remainder_len += length;
1118 if (ctx->gcm_copy_to == NULL) {
1119 ctx->gcm_copy_to = datap;
1120 }
1121 return (CRYPTO_SUCCESS);
1122 } else {
1123 /* Complete incomplete block. */
1124 memcpy((uint8_t *)ctx->gcm_remainder +
1125 ctx->gcm_remainder_len, datap, need);
1126
1127 ctx->gcm_copy_to = NULL;
1128 }
1129 }
1130
1131 /* Allocate a buffer to encrypt to if there is enough input. */
1132 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1133 ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1134 if (ct_buf == NULL) {
1135 return (CRYPTO_HOST_MEMORY);
1136 }
1137 }
1138
1139 /* If we completed an incomplete block, encrypt and write it out. */
1140 if (ctx->gcm_remainder_len > 0) {
1141 kfpu_begin();
1142 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1143 (const uint32_t *)cb, (uint32_t *)tmp);
1144
1145 gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1146 GHASH_AVX(ctx, tmp, block_size);
1147 clear_fpu_regs();
1148 kfpu_end();
1149 rv = crypto_put_output_data(tmp, out, block_size);
1150 out->cd_offset += block_size;
1151 gcm_incr_counter_block(ctx);
1152 ctx->gcm_processed_data_len += block_size;
1153 bleft -= need;
1154 datap += need;
1155 ctx->gcm_remainder_len = 0;
1156 }
1157
1158 /* Do the bulk encryption in chunk_size blocks. */
1159 for (; bleft >= chunk_size; bleft -= chunk_size) {
1160 kfpu_begin();
1161 done = aesni_gcm_encrypt(
1162 datap, ct_buf, chunk_size, key, cb, ghash);
1163
1164 clear_fpu_regs();
1165 kfpu_end();
1166 if (done != chunk_size) {
1167 rv = CRYPTO_FAILED;
1168 goto out_nofpu;
1169 }
1170 rv = crypto_put_output_data(ct_buf, out, chunk_size);
1171 if (rv != CRYPTO_SUCCESS) {
1172 goto out_nofpu;
1173 }
1174 out->cd_offset += chunk_size;
1175 datap += chunk_size;
1176 ctx->gcm_processed_data_len += chunk_size;
1177 }
1178 /* Check if we are already done. */
1179 if (bleft == 0) {
1180 goto out_nofpu;
1181 }
1182 /* Bulk encrypt the remaining data. */
1183 kfpu_begin();
1184 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1185 done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1186 if (done == 0) {
1187 rv = CRYPTO_FAILED;
1188 goto out;
1189 }
1190 rv = crypto_put_output_data(ct_buf, out, done);
1191 if (rv != CRYPTO_SUCCESS) {
1192 goto out;
1193 }
1194 out->cd_offset += done;
1195 ctx->gcm_processed_data_len += done;
1196 datap += done;
1197 bleft -= done;
1198
1199 }
1200 /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1201 while (bleft > 0) {
1202 if (bleft < block_size) {
1203 memcpy(ctx->gcm_remainder, datap, bleft);
1204 ctx->gcm_remainder_len = bleft;
1205 ctx->gcm_copy_to = datap;
1206 goto out;
1207 }
1208 /* Encrypt, hash and write out. */
1209 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1210 (const uint32_t *)cb, (uint32_t *)tmp);
1211
1212 gcm_xor_avx(datap, tmp);
1213 GHASH_AVX(ctx, tmp, block_size);
1214 rv = crypto_put_output_data(tmp, out, block_size);
1215 if (rv != CRYPTO_SUCCESS) {
1216 goto out;
1217 }
1218 out->cd_offset += block_size;
1219 gcm_incr_counter_block(ctx);
1220 ctx->gcm_processed_data_len += block_size;
1221 datap += block_size;
1222 bleft -= block_size;
1223 }
1224 out:
1225 clear_fpu_regs();
1226 kfpu_end();
1227 out_nofpu:
1228 if (ct_buf != NULL) {
1229 vmem_free(ct_buf, chunk_size);
1230 }
1231 return (rv);
1232 }
1233
/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 *
 * Returns CRYPTO_SUCCESS, CRYPTO_DATA_LEN_RANGE if the output can't hold
 * the remainder plus the tag, or the error from crypto_put_output_data().
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	/*
	 * NOTE(review): reading nr through keysched relies on encr_ks being
	 * the first member of aes_key_t — confirm against aes_impl.h.
	 */
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* The output must hold the remainder plus the authentication tag. */
	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		/* CTR step: XOR only the rem_len keystream bytes in. */
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	/* Encrypt the saved initial counter block (J0) in place. */
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	/* Tag = GHASH state XOR E(K, J0); result lands in gcm_ghash. */
	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	/* Write out the authentication tag. */
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}
1297
/*
 * Finalize decryption: We just have accumulated crypto text, so now we
 * decrypt it here inplace.
 *
 * Returns CRYPTO_SUCCESS, CRYPTO_FAILED on a short assembler result,
 * CRYPTO_INVALID_MAC if the tag doesn't verify, or the error from
 * crypto_put_output_data().
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	/* The accumulated buffer holds the ciphertext followed by the tag. */
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
	 * GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		/* In-place decrypt: datap is both source and destination. */
		done = aesni_gcm_decrypt(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = aesni_gcm_decrypt(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			/* Zero-pad so GHASH sees a full block. */
			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	/*
	 * rv is never reassigned above, so this branch can't trigger at
	 * present; kept as defensive symmetry with the other error paths.
	 */
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	/* Expected tag = GHASH state XOR E(K, J0); lands in gcm_ghash. */
	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/*
	 * Compare the input authentication tag with what we calculated.
	 * NOTE(review): memcmp is not constant-time; confirm timing of the
	 * tag comparison is acceptable in this context.
	 */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}
1408
/*
 * Initialize the GCM params H, Htabtle and the counter block. Save the
 * initial counter block.
 *
 * For the common 12 byte IV the counter block is IV || 0^31 || 1; other IV
 * lengths are delegated to gcm_format_initial_blocks(). The AAD is hashed
 * into gcm_ghash in chunk_size pieces, releasing the FPU between chunks.
 */
static int
gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	const uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	/* H = E(K, 0^128), computed in place over the zeroed buffer. */
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

	/* Precompute the hash table used by the AVX GHASH routines. */
	gcm_init_htab_avx(ctx->gcm_Htable, H);

	if (iv_len == 12) {
		/* Counter block is IV || 0x00000001 (big-endian counter). */
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	/* Start GHASH from a zeroed accumulator. */
	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));

	/* Openssl post increments the counter, adjust for that. */
	gcm_incr_counter_block(ctx);

	/* Ghash AAD in chunk_size blocks. */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		/* Release and re-acquire the FPU between chunks. */
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}
1493
1494 #if defined(_KERNEL)
1495 static int
icp_gcm_avx_set_chunk_size(const char * buf,zfs_kernel_param_t * kp)1496 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1497 {
1498 unsigned long val;
1499 char val_rounded[16];
1500 int error = 0;
1501
1502 error = kstrtoul(buf, 0, &val);
1503 if (error)
1504 return (error);
1505
1506 val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1507
1508 if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1509 return (-EINVAL);
1510
1511 snprintf(val_rounded, 16, "%u", (uint32_t)val);
1512 error = param_set_uint(val_rounded, kp);
1513 return (error);
1514 }
1515
1516 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1517 param_get_uint, &gcm_avx_chunk_size, 0644);
1518
1519 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1520 "How many bytes to process while owning the FPU");
1521
#endif /* defined(_KERNEL) */
1523 #endif /* ifdef CAN_USE_GCM_ASM */
1524