xref: /freebsd/sys/contrib/openzfs/module/icp/algs/modes/gcm.c (revision ae8d58814089308028046ac80aeeb9cbb784bd0a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <sys/cmn_err.h>
27 #include <modes/modes.h>
28 #include <sys/crypto/common.h>
29 #include <sys/crypto/icp.h>
30 #include <sys/crypto/impl.h>
31 #include <sys/byteorder.h>
32 #include <sys/simd.h>
33 #include <modes/gcm_impl.h>
34 #ifdef CAN_USE_GCM_ASM
35 #include <aes/aes_impl.h>
36 #endif
37 
/*
 * GHASH(c, d, t, o): fold one block into the running GHASH state.
 *
 * First calls the xor_block callback that must be in scope at the expansion
 * site with block `d` and (c)->gcm_ghash, then calls (o)->mul() with
 * (c)->gcm_ghash, the hash subkey (c)->gcm_H and `t` as third argument.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and stays correct inside an unbraced if/else.  The caller
 * supplies the terminating semicolon.  `c` is evaluated more than once.
 */
#define	GHASH(c, d, t, o) \
	do { \
		xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
		(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
		    (uint64_t *)(void *)(t)); \
	} while (0)
42 
/*
 * Select GCM implementation.
 *
 * The selector is a uint32_t.  The values at the top of the range are
 * reserved for the special modes defined here; any other value is treated
 * by gcm_impl_get_ops() as an index into gcm_supp_impl[].
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#endif
/* Read a selector variable through a volatile-qualified lvalue. */
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
/* Selector currently in effect; consulted by gcm_impl_get_ops(). */
static uint32_t icp_gcm_impl = IMPL_FASTEST;
/*
 * Selection made through gcm_impl_set() before gcm_impl_init() has run;
 * gcm_impl_init() copies it into icp_gcm_impl.
 */
static uint32_t user_sel_impl = IMPL_FASTEST;
52 
#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 * Turned on by gcm_impl_init() / gcm_impl_set() when gcm_avx_will_work()
 * holds and the selected implementation is "avx" or "fastest"; toggled per
 * context by gcm_init_ctx() when the selection is "cycle".
 */
static boolean_t gcm_use_avx = B_FALSE;
/* Access gcm_use_avx through a volatile-qualified lvalue. */
#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)

/*
 * Defined outside this file.  NOTE(review): presumably flips the boolean
 * atomically and returns the new value (the _nv suffix) -- confirm.
 */
extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

/* Helpers managing the avx selection; defined later in this file. */
static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);

/*
 * AVX counterparts that the generic entry points in this file dispatch to
 * when ctx->gcm_use_avx is set.
 */
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */
78 
/*
 * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
 * is done in another function.
 *
 * Dispatches to gcm_mode_encrypt_contiguous_blocks_avx() when the context
 * was set up for avx.  Otherwise `length` bytes from `data` are processed
 * one block at a time: the counter block is incremented and encrypted, the
 * plaintext block is XORed in, the resulting ciphertext is written to `out`
 * and folded into the running GHASH.
 *
 * Input that does not fill a whole block is kept in ctx->gcm_remainder (with
 * its length in ctx->gcm_remainder_len) and is consumed first on the next
 * call.
 *
 * Returns CRYPTO_SUCCESS, or CRYPTO_DATA_LEN_RANGE when buffered bytes cannot
 * be completed to a full block from what is left of `data`.
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	/* Selects the counter bits within the second word of the counter block. */
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			/* Bytes needed to top the buffered data up to a block. */
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		/*
		 * Encrypt the counter block into gcm_tmp, then XOR the input
		 * block into it; gcm_tmp is what gets written out and hashed.
		 */
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		/*
		 * Fetch the destination for this block; it may come back as
		 * two pieces (out_data_1 and out_data_2).
		 */
		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			/* Only `need` bytes of this block came from `data`. */
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		/* Bytes of `data` not consumed yet. */
		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}
201 
202 int
203 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
204     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
205     void (*copy_block)(uint8_t *, uint8_t *),
206     void (*xor_block)(uint8_t *, uint8_t *))
207 {
208 	(void) copy_block;
209 #ifdef CAN_USE_GCM_ASM
210 	if (ctx->gcm_use_avx == B_TRUE)
211 		return (gcm_encrypt_final_avx(ctx, out, block_size));
212 #endif
213 
214 	const gcm_impl_ops_t *gops;
215 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
216 	uint8_t *ghash, *macp = NULL;
217 	int i, rv;
218 
219 	if (out->cd_length <
220 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
221 		return (CRYPTO_DATA_LEN_RANGE);
222 	}
223 
224 	gops = gcm_impl_get_ops();
225 	ghash = (uint8_t *)ctx->gcm_ghash;
226 
227 	if (ctx->gcm_remainder_len > 0) {
228 		uint64_t counter;
229 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
230 
231 		/*
232 		 * Here is where we deal with data that is not a
233 		 * multiple of the block size.
234 		 */
235 
236 		/*
237 		 * Increment counter.
238 		 */
239 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
240 		counter = htonll(counter + 1);
241 		counter &= counter_mask;
242 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
243 
244 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
245 		    (uint8_t *)ctx->gcm_tmp);
246 
247 		macp = (uint8_t *)ctx->gcm_remainder;
248 		memset(macp + ctx->gcm_remainder_len, 0,
249 		    block_size - ctx->gcm_remainder_len);
250 
251 		/* XOR with counter block */
252 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
253 			macp[i] ^= tmpp[i];
254 		}
255 
256 		/* add ciphertext to the hash */
257 		GHASH(ctx, macp, ghash, gops);
258 
259 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
260 	}
261 
262 	ctx->gcm_len_a_len_c[1] =
263 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
264 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
265 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
266 	    (uint8_t *)ctx->gcm_J0);
267 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
268 
269 	if (ctx->gcm_remainder_len > 0) {
270 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
271 		if (rv != CRYPTO_SUCCESS)
272 			return (rv);
273 	}
274 	out->cd_offset += ctx->gcm_remainder_len;
275 	ctx->gcm_remainder_len = 0;
276 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
277 	if (rv != CRYPTO_SUCCESS)
278 		return (rv);
279 	out->cd_offset += ctx->gcm_tag_len;
280 
281 	return (CRYPTO_SUCCESS);
282 }
283 
284 /*
285  * This will only deal with decrypting the last block of the input that
286  * might not be a multiple of block length.
287  */
288 static void
289 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
290     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
291     void (*xor_block)(uint8_t *, uint8_t *))
292 {
293 	uint8_t *datap, *outp, *counterp;
294 	uint64_t counter;
295 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
296 	int i;
297 
298 	/*
299 	 * Increment counter.
300 	 * Counter bits are confined to the bottom 32 bits
301 	 */
302 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
303 	counter = htonll(counter + 1);
304 	counter &= counter_mask;
305 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
306 
307 	datap = (uint8_t *)ctx->gcm_remainder;
308 	outp = &((ctx->gcm_pt_buf)[index]);
309 	counterp = (uint8_t *)ctx->gcm_tmp;
310 
311 	/* authentication tag */
312 	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
313 	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
314 
315 	/* add ciphertext to the hash */
316 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
317 
318 	/* decrypt remaining ciphertext */
319 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
320 
321 	/* XOR with counter block */
322 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
323 		outp[i] = datap[i] ^ counterp[i];
324 	}
325 }
326 
327 int
328 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
329     crypto_data_t *out, size_t block_size,
330     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
331     void (*copy_block)(uint8_t *, uint8_t *),
332     void (*xor_block)(uint8_t *, uint8_t *))
333 {
334 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
335 	    (void) xor_block;
336 	size_t new_len;
337 	uint8_t *new;
338 
339 	/*
340 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
341 	 * Ciphertext will be decrypted in the final.
342 	 */
343 	if (length > 0) {
344 		new_len = ctx->gcm_pt_buf_len + length;
345 		new = vmem_alloc(new_len, KM_SLEEP);
346 		if (new == NULL) {
347 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
348 			ctx->gcm_pt_buf = NULL;
349 			return (CRYPTO_HOST_MEMORY);
350 		}
351 
352 		if (ctx->gcm_pt_buf != NULL) {
353 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
354 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
355 		} else {
356 			ASSERT0(ctx->gcm_pt_buf_len);
357 		}
358 
359 		ctx->gcm_pt_buf = new;
360 		ctx->gcm_pt_buf_len = new_len;
361 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
362 		    length);
363 		ctx->gcm_processed_data_len += length;
364 	}
365 
366 	ctx->gcm_remainder_len = 0;
367 	return (CRYPTO_SUCCESS);
368 }
369 
370 int
371 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
372     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
373     void (*xor_block)(uint8_t *, uint8_t *))
374 {
375 #ifdef CAN_USE_GCM_ASM
376 	if (ctx->gcm_use_avx == B_TRUE)
377 		return (gcm_decrypt_final_avx(ctx, out, block_size));
378 #endif
379 
380 	const gcm_impl_ops_t *gops;
381 	size_t pt_len;
382 	size_t remainder;
383 	uint8_t *ghash;
384 	uint8_t *blockp;
385 	uint8_t *cbp;
386 	uint64_t counter;
387 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
388 	int processed = 0, rv;
389 
390 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
391 
392 	gops = gcm_impl_get_ops();
393 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
394 	ghash = (uint8_t *)ctx->gcm_ghash;
395 	blockp = ctx->gcm_pt_buf;
396 	remainder = pt_len;
397 	while (remainder > 0) {
398 		/* Incomplete last block */
399 		if (remainder < block_size) {
400 			memcpy(ctx->gcm_remainder, blockp, remainder);
401 			ctx->gcm_remainder_len = remainder;
402 			/*
403 			 * not expecting anymore ciphertext, just
404 			 * compute plaintext for the remaining input
405 			 */
406 			gcm_decrypt_incomplete_block(ctx, block_size,
407 			    processed, encrypt_block, xor_block);
408 			ctx->gcm_remainder_len = 0;
409 			goto out;
410 		}
411 		/* add ciphertext to the hash */
412 		GHASH(ctx, blockp, ghash, gops);
413 
414 		/*
415 		 * Increment counter.
416 		 * Counter bits are confined to the bottom 32 bits
417 		 */
418 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
419 		counter = htonll(counter + 1);
420 		counter &= counter_mask;
421 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
422 
423 		cbp = (uint8_t *)ctx->gcm_tmp;
424 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
425 
426 		/* XOR with ciphertext */
427 		xor_block(cbp, blockp);
428 
429 		processed += block_size;
430 		blockp += block_size;
431 		remainder -= block_size;
432 	}
433 out:
434 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
435 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
436 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
437 	    (uint8_t *)ctx->gcm_J0);
438 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
439 
440 	/* compare the input authentication tag with what we calculated */
441 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
442 		/* They don't match */
443 		return (CRYPTO_INVALID_MAC);
444 	} else {
445 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
446 		if (rv != CRYPTO_SUCCESS)
447 			return (rv);
448 		out->cd_offset += pt_len;
449 	}
450 	return (CRYPTO_SUCCESS);
451 }
452 
453 static int
454 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
455 {
456 	size_t tag_len;
457 
458 	/*
459 	 * Check the length of the authentication tag (in bits).
460 	 */
461 	tag_len = gcm_param->ulTagBits;
462 	switch (tag_len) {
463 	case 32:
464 	case 64:
465 	case 96:
466 	case 104:
467 	case 112:
468 	case 120:
469 	case 128:
470 		break;
471 	default:
472 		return (CRYPTO_MECHANISM_PARAM_INVALID);
473 	}
474 
475 	if (gcm_param->ulIvLen == 0)
476 		return (CRYPTO_MECHANISM_PARAM_INVALID);
477 
478 	return (CRYPTO_SUCCESS);
479 }
480 
481 static void
482 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
483     gcm_ctx_t *ctx, size_t block_size,
484     void (*copy_block)(uint8_t *, uint8_t *),
485     void (*xor_block)(uint8_t *, uint8_t *))
486 {
487 	const gcm_impl_ops_t *gops;
488 	uint8_t *cb;
489 	ulong_t remainder = iv_len;
490 	ulong_t processed = 0;
491 	uint8_t *datap, *ghash;
492 	uint64_t len_a_len_c[2];
493 
494 	gops = gcm_impl_get_ops();
495 	ghash = (uint8_t *)ctx->gcm_ghash;
496 	cb = (uint8_t *)ctx->gcm_cb;
497 	if (iv_len == 12) {
498 		memcpy(cb, iv, 12);
499 		cb[12] = 0;
500 		cb[13] = 0;
501 		cb[14] = 0;
502 		cb[15] = 1;
503 		/* J0 will be used again in the final */
504 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
505 	} else {
506 		/* GHASH the IV */
507 		do {
508 			if (remainder < block_size) {
509 				memset(cb, 0, block_size);
510 				memcpy(cb, &(iv[processed]), remainder);
511 				datap = (uint8_t *)cb;
512 				remainder = 0;
513 			} else {
514 				datap = (uint8_t *)(&(iv[processed]));
515 				processed += block_size;
516 				remainder -= block_size;
517 			}
518 			GHASH(ctx, datap, ghash, gops);
519 		} while (remainder > 0);
520 
521 		len_a_len_c[0] = 0;
522 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
523 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
524 
525 		/* J0 will be used again in the final */
526 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
527 	}
528 }
529 
530 static int
531 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
532     const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
533     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
534     void (*copy_block)(uint8_t *, uint8_t *),
535     void (*xor_block)(uint8_t *, uint8_t *))
536 {
537 	const gcm_impl_ops_t *gops;
538 	uint8_t *ghash, *datap, *authp;
539 	size_t remainder, processed;
540 
541 	/* encrypt zero block to get subkey H */
542 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
543 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
544 	    (uint8_t *)ctx->gcm_H);
545 
546 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
547 	    copy_block, xor_block);
548 
549 	gops = gcm_impl_get_ops();
550 	authp = (uint8_t *)ctx->gcm_tmp;
551 	ghash = (uint8_t *)ctx->gcm_ghash;
552 	memset(authp, 0, block_size);
553 	memset(ghash, 0, block_size);
554 
555 	processed = 0;
556 	remainder = auth_data_len;
557 	do {
558 		if (remainder < block_size) {
559 			/*
560 			 * There's not a block full of data, pad rest of
561 			 * buffer with zero
562 			 */
563 
564 			if (auth_data != NULL) {
565 				memset(authp, 0, block_size);
566 				memcpy(authp, &(auth_data[processed]),
567 				    remainder);
568 			} else {
569 				ASSERT0(remainder);
570 			}
571 
572 			datap = (uint8_t *)authp;
573 			remainder = 0;
574 		} else {
575 			datap = (uint8_t *)(&(auth_data[processed]));
576 			processed += block_size;
577 			remainder -= block_size;
578 		}
579 
580 		/* add auth data to the hash */
581 		GHASH(ctx, datap, ghash, gops);
582 
583 	} while (remainder > 0);
584 
585 	return (CRYPTO_SUCCESS);
586 }
587 
/*
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 *
 * `param` is interpreted as a CK_AES_GCM_PARAMS.  After validating it, the
 * tag length, processed length and AAD bit length are stored in the context.
 * With CAN_USE_GCM_ASM the function then decides whether this context uses
 * the avx code path, allocates the Htable if it does, and finishes through
 * gcm_init_avx(); otherwise gcm_init() is used.
 *
 * Returns CRYPTO_SUCCESS on success, CRYPTO_MECHANISM_PARAM_INVALID if
 * `param` is NULL, fails validation, yields a zero Htable size or the
 * selected init routine fails, and CRYPTO_HOST_MEMORY if the Htable cannot
 * be allocated.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		/* GCM mode. */
		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}
		gcm_ctx->gcm_flags |= GCM_MODE;

		size_t tbits = gcm_param->ulTagBits;
		tag_len = CRYPTO_BITS2BYTES(tbits);
		iv_len = gcm_param->ulIvLen;

		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	/* gcm_param is valid from here on: the NULL case returned above. */
	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		/* Follow the global avx setting. */
		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
	} else {
		/*
		 * Handle the "cycle" implementation by creating avx and
		 * non-avx contexts alternately.
		 */
		gcm_ctx->gcm_use_avx = gcm_toggle_avx();

		/* The avx impl. doesn't handle byte swapped key schedules. */
		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
			gcm_ctx->gcm_use_avx = B_FALSE;
		}
		/*
		 * If this is a GCM context, use the MOVBE and the BSWAP
		 * variants alternately.
		 */
		if (gcm_ctx->gcm_use_avx == B_TRUE &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * We don't handle byte swapped key schedules in the avx code path,
	 * still they could be created by the aes generic implementation.
	 * Make sure not to use them since we'll corrupt data if we do.
	 */
	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
		gcm_ctx->gcm_use_avx = B_FALSE;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/* Allocate Htab memory as needed. */
	if (gcm_ctx->gcm_use_avx == B_TRUE) {
		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

		if (htab_len == 0) {
			return (CRYPTO_MECHANISM_PARAM_INVALID);
		}
		gcm_ctx->gcm_htab_len = htab_len;
		gcm_ctx->gcm_Htable =
		    kmem_alloc(htab_len, KM_SLEEP);

		if (gcm_ctx->gcm_Htable == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}
	/* Avx and non avx context initialization differs from here on. */
	if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
		/* Any gcm_init() failure is reported as an invalid parameter. */
		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
#ifdef CAN_USE_GCM_ASM
	} else {
		/*
		 * NOTE(review): the Htable allocated above is not released
		 * here if gcm_init_avx() fails -- presumably the caller's
		 * context teardown frees it; confirm.
		 */
		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
	}
#endif /* ifdef CAN_USE_GCM_ASM */

	return (rv);
}
707 
708 void *
709 gcm_alloc_ctx(int kmflag)
710 {
711 	gcm_ctx_t *gcm_ctx;
712 
713 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
714 		return (NULL);
715 
716 	gcm_ctx->gcm_flags = GCM_MODE;
717 	return (gcm_ctx);
718 }
719 
/*
 * GCM implementation that contains the fastest methods.
 * Only the name is set here; gcm_impl_init() overwrites the structure with
 * a copy of the chosen implementation and then restores the name.
 */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/*
 * Indicate that benchmark has been completed.
 * Set to B_TRUE at the end of gcm_impl_init().
 */
static boolean_t gcm_impl_initialized = B_FALSE;

/*
 * Hold all supported implementations: gcm_impl_init() stores here every
 * entry of gcm_all_impl[] whose is_supported() returns true and records how
 * many there are in gcm_supp_impl_cnt.
 */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
739 
740 /*
741  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
742  * SIMD implementation is not allowed in the current context, then
743  * fallback to the fastest generic implementation.
744  */
745 const gcm_impl_ops_t *
746 gcm_impl_get_ops(void)
747 {
748 	if (!kfpu_allowed())
749 		return (&gcm_generic_impl);
750 
751 	const gcm_impl_ops_t *ops = NULL;
752 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
753 
754 	switch (impl) {
755 	case IMPL_FASTEST:
756 		ASSERT(gcm_impl_initialized);
757 		ops = &gcm_fastest_impl;
758 		break;
759 	case IMPL_CYCLE:
760 		/* Cycle through supported implementations */
761 		ASSERT(gcm_impl_initialized);
762 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
763 		static size_t cycle_impl_idx = 0;
764 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
765 		ops = gcm_supp_impl[idx];
766 		break;
767 #ifdef CAN_USE_GCM_ASM
768 	case IMPL_AVX:
769 		/*
770 		 * Make sure that we return a valid implementation while
771 		 * switching to the avx implementation since there still
772 		 * may be unfinished non-avx contexts around.
773 		 */
774 		ops = &gcm_generic_impl;
775 		break;
776 #endif
777 	default:
778 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
779 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
780 		if (impl < ARRAY_SIZE(gcm_all_impl))
781 			ops = gcm_supp_impl[impl];
782 		break;
783 	}
784 
785 	ASSERT3P(ops, !=, NULL);
786 
787 	return (ops);
788 }
789 
/*
 * Initialize all supported implementations.
 *
 * Collects the supported entries of gcm_all_impl[] in gcm_supp_impl[],
 * fills in gcm_fastest_impl, enables the avx code path where applicable,
 * applies the selection saved in user_sel_impl and finally marks the
 * module as initialized.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	/* The memcpy above replaced the name as well; put it back. */
	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_set_avx(B_TRUE);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}
845 
/*
 * Names of the special implementation selectors and the selector value each
 * one maps to.  Used by gcm_impl_set() to parse a requested name.
 */
static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
#endif
};
856 
857 /*
858  * Function sets desired gcm implementation.
859  *
860  * If we are called before init(), user preference will be saved in
861  * user_sel_impl, and applied in later init() call. This occurs when module
862  * parameter is specified on module load. Otherwise, directly update
863  * icp_gcm_impl.
864  *
865  * @val		Name of gcm implementation to use
866  * @param	Unused.
867  */
868 int
869 gcm_impl_set(const char *val)
870 {
871 	int err = -EINVAL;
872 	char req_name[GCM_IMPL_NAME_MAX];
873 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
874 	size_t i;
875 
876 	/* sanitize input */
877 	i = strnlen(val, GCM_IMPL_NAME_MAX);
878 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
879 		return (err);
880 
881 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
882 	while (i > 0 && isspace(req_name[i-1]))
883 		i--;
884 	req_name[i] = '\0';
885 
886 	/* Check mandatory options */
887 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
888 #ifdef CAN_USE_GCM_ASM
889 		/* Ignore avx implementation if it won't work. */
890 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
891 			continue;
892 		}
893 #endif
894 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
895 			impl = gcm_impl_opts[i].sel;
896 			err = 0;
897 			break;
898 		}
899 	}
900 
901 	/* check all supported impl if init() was already called */
902 	if (err != 0 && gcm_impl_initialized) {
903 		/* check all supported implementations */
904 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
905 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
906 				impl = i;
907 				err = 0;
908 				break;
909 			}
910 		}
911 	}
912 #ifdef CAN_USE_GCM_ASM
913 	/*
914 	 * Use the avx implementation if available and the requested one is
915 	 * avx or fastest.
916 	 */
917 	if (gcm_avx_will_work() == B_TRUE &&
918 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
919 		gcm_set_avx(B_TRUE);
920 	} else {
921 		gcm_set_avx(B_FALSE);
922 	}
923 #endif
924 
925 	if (err == 0) {
926 		if (gcm_impl_initialized)
927 			atomic_swap_32(&icp_gcm_impl, impl);
928 		else
929 			atomic_swap_32(&user_sel_impl, impl);
930 	}
931 
932 	return (err);
933 }
934 
935 #if defined(_KERNEL) && defined(__linux__)
936 
/*
 * Setter registered for the icp_gcm_impl module parameter.
 * Forwards the value to gcm_impl_set() and returns its result; `kp` is
 * not used.
 */
static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}
942 
943 static int
944 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
945 {
946 	int i, cnt = 0;
947 	char *fmt;
948 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
949 
950 	ASSERT(gcm_impl_initialized);
951 
952 	/* list mandatory options */
953 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
954 #ifdef CAN_USE_GCM_ASM
955 		/* Ignore avx implementation if it won't work. */
956 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
957 			continue;
958 		}
959 #endif
960 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
961 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
962 		    gcm_impl_opts[i].name);
963 	}
964 
965 	/* list all supported implementations */
966 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
967 		fmt = (i == impl) ? "[%s] " : "%s ";
968 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
969 		    gcm_supp_impl[i]->name);
970 	}
971 
972 	return (cnt);
973 }
974 
/* Register icp_gcm_impl as a module parameter with the handlers above. */
module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
979 
980 #ifdef CAN_USE_GCM_ASM
/* GCM block length in bytes. */
#define	GCM_BLOCK_LEN 16
/*
 * The openssl asm routines are 6x aggregated and need that many bytes
 * at minimum.
 */
#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
 * This is 128 KiB rounded down to a multiple of GCM_AVX_MIN_DECRYPT_BYTES.
 */
#define	GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()
/* Run gcm_ghash_avx() over `len` bytes at `in` with the context's state. */
#define	GHASH_AVX(ctx, in, len) \
    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
    (in), (len))

#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by((ctx), 1)

/*
 * Get the chunk size module parameter.  The expansion is parenthesized so
 * it behaves as a single operand in any surrounding expression.
 */
#define	GCM_CHUNK_SIZE_READ (*(volatile uint32_t *)&gcm_avx_chunk_size)

/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is
 * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES.
 * The default is 32 KiB rounded down in that way.
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1013 
/*
 * Routines with the ASMABI calling convention that are defined outside this
 * file.  NOTE(review): the per-routine notes below are inferred from the
 * names and from the comments in this file, not from the implementations --
 * confirm against the assembler sources.
 */

/* Backs the clear_fpu_regs() macro. */
extern void ASMABI clear_fpu_regs_avx(void);
/* Presumably XORs the block at src into dst. */
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
/* Presumably encrypts one block (pt -> ct) with an nr-round key schedule. */
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

/* Presumably derives the GHASH table from the hash subkey H. */
extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
/* Backs the GHASH_AVX() macro. */
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);

/* The 6x aggregated openssl bulk encryption routine. */
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

/* The 6x aggregated openssl bulk decryption routine. */
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
1028 
1029 static inline boolean_t
1030 gcm_avx_will_work(void)
1031 {
1032 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1033 	return (kfpu_allowed() &&
1034 	    zfs_avx_available() && zfs_aes_available() &&
1035 	    zfs_pclmulqdq_available());
1036 }
1037 
1038 static inline void
1039 gcm_set_avx(boolean_t val)
1040 {
1041 	if (gcm_avx_will_work() == B_TRUE) {
1042 		atomic_swap_32(&gcm_use_avx, val);
1043 	}
1044 }
1045 
1046 static inline boolean_t
1047 gcm_toggle_avx(void)
1048 {
1049 	if (gcm_avx_will_work() == B_TRUE) {
1050 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1051 	} else {
1052 		return (B_FALSE);
1053 	}
1054 }
1055 
1056 static inline size_t
1057 gcm_simd_get_htab_size(boolean_t simd_mode)
1058 {
1059 	switch (simd_mode) {
1060 	case B_TRUE:
1061 		return (2 * 6 * 2 * sizeof (uint64_t));
1062 
1063 	default:
1064 		return (0);
1065 	}
1066 }
1067 
1068 
1069 /* Increment the GCM counter block by n. */
1070 static inline void
1071 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1072 {
1073 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1074 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1075 
1076 	counter = htonll(counter + n);
1077 	counter &= counter_mask;
1078 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1079 }
1080 
1081 /*
1082  * Encrypt multiple blocks of data in GCM mode.
1083  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1084  * if possible. While processing a chunk the FPU is "locked".
1085  */
1086 static int
1087 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1088     size_t length, crypto_data_t *out, size_t block_size)
1089 {
1090 	size_t bleft = length;
1091 	size_t need = 0;
1092 	size_t done = 0;
1093 	uint8_t *datap = (uint8_t *)data;
1094 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1095 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1096 	uint64_t *ghash = ctx->gcm_ghash;
1097 	uint64_t *cb = ctx->gcm_cb;
1098 	uint8_t *ct_buf = NULL;
1099 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1100 	int rv = CRYPTO_SUCCESS;
1101 
1102 	ASSERT(block_size == GCM_BLOCK_LEN);
1103 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1104 	    B_FALSE);
1105 	/*
1106 	 * If the last call left an incomplete block, try to fill
1107 	 * it first.
1108 	 */
1109 	if (ctx->gcm_remainder_len > 0) {
1110 		need = block_size - ctx->gcm_remainder_len;
1111 		if (length < need) {
1112 			/* Accumulate bytes here and return. */
1113 			memcpy((uint8_t *)ctx->gcm_remainder +
1114 			    ctx->gcm_remainder_len, datap, length);
1115 
1116 			ctx->gcm_remainder_len += length;
1117 			if (ctx->gcm_copy_to == NULL) {
1118 				ctx->gcm_copy_to = datap;
1119 			}
1120 			return (CRYPTO_SUCCESS);
1121 		} else {
1122 			/* Complete incomplete block. */
1123 			memcpy((uint8_t *)ctx->gcm_remainder +
1124 			    ctx->gcm_remainder_len, datap, need);
1125 
1126 			ctx->gcm_copy_to = NULL;
1127 		}
1128 	}
1129 
1130 	/* Allocate a buffer to encrypt to if there is enough input. */
1131 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1132 		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1133 		if (ct_buf == NULL) {
1134 			return (CRYPTO_HOST_MEMORY);
1135 		}
1136 	}
1137 
1138 	/* If we completed an incomplete block, encrypt and write it out. */
1139 	if (ctx->gcm_remainder_len > 0) {
1140 		kfpu_begin();
1141 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1142 		    (const uint32_t *)cb, (uint32_t *)tmp);
1143 
1144 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1145 		GHASH_AVX(ctx, tmp, block_size);
1146 		clear_fpu_regs();
1147 		kfpu_end();
1148 		rv = crypto_put_output_data(tmp, out, block_size);
1149 		out->cd_offset += block_size;
1150 		gcm_incr_counter_block(ctx);
1151 		ctx->gcm_processed_data_len += block_size;
1152 		bleft -= need;
1153 		datap += need;
1154 		ctx->gcm_remainder_len = 0;
1155 	}
1156 
1157 	/* Do the bulk encryption in chunk_size blocks. */
1158 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1159 		kfpu_begin();
1160 		done = aesni_gcm_encrypt(
1161 		    datap, ct_buf, chunk_size, key, cb, ghash);
1162 
1163 		clear_fpu_regs();
1164 		kfpu_end();
1165 		if (done != chunk_size) {
1166 			rv = CRYPTO_FAILED;
1167 			goto out_nofpu;
1168 		}
1169 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1170 		if (rv != CRYPTO_SUCCESS) {
1171 			goto out_nofpu;
1172 		}
1173 		out->cd_offset += chunk_size;
1174 		datap += chunk_size;
1175 		ctx->gcm_processed_data_len += chunk_size;
1176 	}
1177 	/* Check if we are already done. */
1178 	if (bleft == 0) {
1179 		goto out_nofpu;
1180 	}
1181 	/* Bulk encrypt the remaining data. */
1182 	kfpu_begin();
1183 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1184 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1185 		if (done == 0) {
1186 			rv = CRYPTO_FAILED;
1187 			goto out;
1188 		}
1189 		rv = crypto_put_output_data(ct_buf, out, done);
1190 		if (rv != CRYPTO_SUCCESS) {
1191 			goto out;
1192 		}
1193 		out->cd_offset += done;
1194 		ctx->gcm_processed_data_len += done;
1195 		datap += done;
1196 		bleft -= done;
1197 
1198 	}
1199 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1200 	while (bleft > 0) {
1201 		if (bleft < block_size) {
1202 			memcpy(ctx->gcm_remainder, datap, bleft);
1203 			ctx->gcm_remainder_len = bleft;
1204 			ctx->gcm_copy_to = datap;
1205 			goto out;
1206 		}
1207 		/* Encrypt, hash and write out. */
1208 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1209 		    (const uint32_t *)cb, (uint32_t *)tmp);
1210 
1211 		gcm_xor_avx(datap, tmp);
1212 		GHASH_AVX(ctx, tmp, block_size);
1213 		rv = crypto_put_output_data(tmp, out, block_size);
1214 		if (rv != CRYPTO_SUCCESS) {
1215 			goto out;
1216 		}
1217 		out->cd_offset += block_size;
1218 		gcm_incr_counter_block(ctx);
1219 		ctx->gcm_processed_data_len += block_size;
1220 		datap += block_size;
1221 		bleft -= block_size;
1222 	}
1223 out:
1224 	clear_fpu_regs();
1225 	kfpu_end();
1226 out_nofpu:
1227 	if (ct_buf != NULL) {
1228 		vmem_free(ct_buf, chunk_size);
1229 	}
1230 	return (rv);
1231 }
1232 
1233 /*
1234  * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
1235  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1236  */
1237 static int
1238 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1239 {
1240 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1241 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1242 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1243 	size_t rem_len = ctx->gcm_remainder_len;
1244 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1245 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1246 	int rv;
1247 
1248 	ASSERT(block_size == GCM_BLOCK_LEN);
1249 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1250 	    B_FALSE);
1251 
1252 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1253 		return (CRYPTO_DATA_LEN_RANGE);
1254 	}
1255 
1256 	kfpu_begin();
1257 	/* Pad last incomplete block with zeros, encrypt and hash. */
1258 	if (rem_len > 0) {
1259 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1260 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1261 
1262 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1263 		memset(remainder + rem_len, 0, block_size - rem_len);
1264 		for (int i = 0; i < rem_len; i++) {
1265 			remainder[i] ^= tmp[i];
1266 		}
1267 		GHASH_AVX(ctx, remainder, block_size);
1268 		ctx->gcm_processed_data_len += rem_len;
1269 		/* No need to increment counter_block, it's the last block. */
1270 	}
1271 	/* Finish tag. */
1272 	ctx->gcm_len_a_len_c[1] =
1273 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1274 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1275 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1276 
1277 	gcm_xor_avx((uint8_t *)J0, ghash);
1278 	clear_fpu_regs();
1279 	kfpu_end();
1280 
1281 	/* Output remainder. */
1282 	if (rem_len > 0) {
1283 		rv = crypto_put_output_data(remainder, out, rem_len);
1284 		if (rv != CRYPTO_SUCCESS)
1285 			return (rv);
1286 	}
1287 	out->cd_offset += rem_len;
1288 	ctx->gcm_remainder_len = 0;
1289 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1290 	if (rv != CRYPTO_SUCCESS)
1291 		return (rv);
1292 
1293 	out->cd_offset += ctx->gcm_tag_len;
1294 	return (CRYPTO_SUCCESS);
1295 }
1296 
1297 /*
1298  * Finalize decryption: We just have accumulated crypto text, so now we
1299  * decrypt it here inplace.
1300  */
1301 static int
1302 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1303 {
1304 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1305 	ASSERT3U(block_size, ==, 16);
1306 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1307 	    B_FALSE);
1308 
1309 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1310 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1311 	uint8_t *datap = ctx->gcm_pt_buf;
1312 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1313 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1314 	uint64_t *ghash = ctx->gcm_ghash;
1315 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1316 	int rv = CRYPTO_SUCCESS;
1317 	size_t bleft, done;
1318 
1319 	/*
1320 	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1321 	 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1322 	 * GCM_AVX_MIN_DECRYPT_BYTES.
1323 	 */
1324 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1325 		kfpu_begin();
1326 		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1327 		    (const void *)key, ctx->gcm_cb, ghash);
1328 		clear_fpu_regs();
1329 		kfpu_end();
1330 		if (done != chunk_size) {
1331 			return (CRYPTO_FAILED);
1332 		}
1333 		datap += done;
1334 	}
1335 	/* Decrypt remainder, which is less than chunk size, in one go. */
1336 	kfpu_begin();
1337 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1338 		done = aesni_gcm_decrypt(datap, datap, bleft,
1339 		    (const void *)key, ctx->gcm_cb, ghash);
1340 		if (done == 0) {
1341 			clear_fpu_regs();
1342 			kfpu_end();
1343 			return (CRYPTO_FAILED);
1344 		}
1345 		datap += done;
1346 		bleft -= done;
1347 	}
1348 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1349 
1350 	/*
1351 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1352 	 * decrypt them block by block.
1353 	 */
1354 	while (bleft > 0) {
1355 		/* Incomplete last block. */
1356 		if (bleft < block_size) {
1357 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1358 
1359 			memset(lastb, 0, block_size);
1360 			memcpy(lastb, datap, bleft);
1361 			/* The GCM processing. */
1362 			GHASH_AVX(ctx, lastb, block_size);
1363 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1364 			for (size_t i = 0; i < bleft; i++) {
1365 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1366 			}
1367 			break;
1368 		}
1369 		/* The GCM processing. */
1370 		GHASH_AVX(ctx, datap, block_size);
1371 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1372 		gcm_xor_avx((uint8_t *)tmp, datap);
1373 		gcm_incr_counter_block(ctx);
1374 
1375 		datap += block_size;
1376 		bleft -= block_size;
1377 	}
1378 	if (rv != CRYPTO_SUCCESS) {
1379 		clear_fpu_regs();
1380 		kfpu_end();
1381 		return (rv);
1382 	}
1383 	/* Decryption done, finish the tag. */
1384 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1385 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1386 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1387 	    (uint32_t *)ctx->gcm_J0);
1388 
1389 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1390 
1391 	/* We are done with the FPU, restore its state. */
1392 	clear_fpu_regs();
1393 	kfpu_end();
1394 
1395 	/* Compare the input authentication tag with what we calculated. */
1396 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1397 		/* They don't match. */
1398 		return (CRYPTO_INVALID_MAC);
1399 	}
1400 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1401 	if (rv != CRYPTO_SUCCESS) {
1402 		return (rv);
1403 	}
1404 	out->cd_offset += pt_len;
1405 	return (CRYPTO_SUCCESS);
1406 }
1407 
1408 /*
1409  * Initialize the GCM params H, Htabtle and the counter block. Save the
1410  * initial counter block.
1411  */
1412 static int
1413 gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1414     const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1415 {
1416 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1417 	uint64_t *H = ctx->gcm_H;
1418 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1419 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1420 	const uint8_t *datap = auth_data;
1421 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1422 	size_t bleft;
1423 
1424 	ASSERT(block_size == GCM_BLOCK_LEN);
1425 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1426 	    B_FALSE);
1427 
1428 	/* Init H (encrypt zero block) and create the initial counter block. */
1429 	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1430 	memset(H, 0, sizeof (ctx->gcm_H));
1431 	kfpu_begin();
1432 	aes_encrypt_intel(keysched, aes_rounds,
1433 	    (const uint32_t *)H, (uint32_t *)H);
1434 
1435 	gcm_init_htab_avx(ctx->gcm_Htable, H);
1436 
1437 	if (iv_len == 12) {
1438 		memcpy(cb, iv, 12);
1439 		cb[12] = 0;
1440 		cb[13] = 0;
1441 		cb[14] = 0;
1442 		cb[15] = 1;
1443 		/* We need the ICB later. */
1444 		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1445 	} else {
1446 		/*
1447 		 * Most consumers use 12 byte IVs, so it's OK to use the
1448 		 * original routines for other IV sizes, just avoid nesting
1449 		 * kfpu_begin calls.
1450 		 */
1451 		clear_fpu_regs();
1452 		kfpu_end();
1453 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1454 		    aes_copy_block, aes_xor_block);
1455 		kfpu_begin();
1456 	}
1457 
1458 	/* Openssl post increments the counter, adjust for that. */
1459 	gcm_incr_counter_block(ctx);
1460 
1461 	/* Ghash AAD in chunk_size blocks. */
1462 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1463 		GHASH_AVX(ctx, datap, chunk_size);
1464 		datap += chunk_size;
1465 		clear_fpu_regs();
1466 		kfpu_end();
1467 		kfpu_begin();
1468 	}
1469 	/* Ghash the remainder and handle possible incomplete GCM block. */
1470 	if (bleft > 0) {
1471 		size_t incomp = bleft % block_size;
1472 
1473 		bleft -= incomp;
1474 		if (bleft > 0) {
1475 			GHASH_AVX(ctx, datap, bleft);
1476 			datap += bleft;
1477 		}
1478 		if (incomp > 0) {
1479 			/* Zero pad and hash incomplete last block. */
1480 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1481 
1482 			memset(authp, 0, block_size);
1483 			memcpy(authp, datap, incomp);
1484 			GHASH_AVX(ctx, authp, block_size);
1485 		}
1486 	}
1487 	clear_fpu_regs();
1488 	kfpu_end();
1489 	return (CRYPTO_SUCCESS);
1490 }
1491 
#if defined(_KERNEL)
/*
 * Setter for the icp_gcm_avx_chunk_size module parameter.
 *
 * Parses buf as an unsigned number, rounds it down to a multiple of
 * GCM_AVX_MIN_DECRYPT_BYTES and hands the rounded value to param_set_uint().
 *
 * Returns the error from kstrtoul() or param_set_uint(), or -EINVAL if the
 * rounded value is below GCM_AVX_MIN_ENCRYPT_BYTES or above
 * GCM_AVX_MAX_CHUNK_SIZE.
 */
static int
icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
{
	char rounded_str[16];
	unsigned long requested;
	int err;

	err = kstrtoul(buf, 0, &requested);
	if (err != 0)
		return (err);

	/* Round down to a whole number of minimum decrypt units. */
	requested -= requested % GCM_AVX_MIN_DECRYPT_BYTES;

	if (requested < GCM_AVX_MIN_ENCRYPT_BYTES ||
	    requested > GCM_AVX_MAX_CHUNK_SIZE)
		return (-EINVAL);

	/* param_set_uint() takes a string, so format the rounded value. */
	snprintf(rounded_str, sizeof (rounded_str), "%u",
	    (uint32_t)requested);
	return (param_set_uint(rounded_str, kp));
}

module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
	"How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
1521 #endif /* ifdef CAN_USE_GCM_ASM */
1522