xref: /freebsd/sys/contrib/openzfs/module/icp/algs/modes/gcm.c (revision 51015e6d0f570239b0c2088dc6cf2b018928375d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <modes/modes.h>
27 #include <sys/crypto/common.h>
28 #include <sys/crypto/icp.h>
29 #include <sys/crypto/impl.h>
30 #include <sys/byteorder.h>
31 #include <sys/simd.h>
32 #include <modes/gcm_impl.h>
33 #ifdef CAN_USE_GCM_ASM
34 #include <aes/aes_impl.h>
35 #endif
36 
/*
 * Fold one block into the running GHASH value of context (c):
 * XOR block (d) into (c)->gcm_ghash, then call the mul() hook of the
 * implementation (o) with (c)->gcm_ghash, the hash subkey (c)->gcm_H and (t).
 *
 * NOTE: this expands to two statements and calls xor_block(), which is not
 * a macro parameter; the name must be visible at the point of use (in this
 * file it is the xor_block function-pointer parameter of the calling
 * function). Since the expansion is not wrapped in do { } while (0), do not
 * use it as the unbraced body of an if/else/loop.
 */
#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));
41 
/*
 * Select GCM implementation.
 *
 * The IMPL_* values are selector codes taken from the top of the uint32_t
 * range. Any other value stored in icp_gcm_impl is used as an index into
 * gcm_supp_impl[] (see gcm_impl_get_ops() and gcm_impl_set()).
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#endif
/* Read a selector through a volatile-qualified lvalue. */
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
/* Selector currently in effect. */
static uint32_t icp_gcm_impl = IMPL_FASTEST;
/*
 * Selection requested through gcm_impl_set() before gcm_impl_init() has run;
 * gcm_impl_init() copies it into icp_gcm_impl.
 */
static uint32_t user_sel_impl = IMPL_FASTEST;
51 
#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 * Enabled by gcm_impl_init() and gcm_impl_set() when the avx code will work
 * and the selected implementation is "avx" or "fastest"; toggled per context
 * while the "cycle" implementation is selected.
 */
static boolean_t gcm_use_avx = B_FALSE;
/* Read gcm_use_avx through a volatile-qualified lvalue. */
#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

/* Helpers and avx entry points defined further down in this file. */
static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */
77 
/*
 * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
 * is done in another function.
 *
 * ctx		GCM context; carries the counter block, the running GHASH and
 *		up to one block of buffered, not yet encrypted input.
 * data/length	Plaintext to process in this call.
 * out		Destination for the ciphertext; out->cd_offset is advanced by
 *		block_size for every block written.
 * block_size	Cipher block size in bytes.
 * encrypt_block, copy_block, xor_block
 *		Block primitives supplied by the caller. xor_block must keep
 *		this exact name because the GHASH() macro refers to it.
 *
 * Input that does not fill a whole block is kept in ctx->gcm_remainder until
 * a later call (or the final) supplies the rest.
 *
 * Returns CRYPTO_SUCCESS, or CRYPTO_DATA_LEN_RANGE if the buffered bytes
 * cannot be completed to a full block from the supplied data. Contexts set
 * up for avx are handed to gcm_mode_encrypt_contiguous_blocks_avx().
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	/* Mask selecting the 32 counter bits of gcm_cb[1] as stored. */
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			/* Bytes needed to top the buffered data up to a block. */
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		/*
		 * Encrypt the counter block into gcm_tmp and combine it with
		 * the input block; gcm_tmp is what gets copied out and hashed
		 * below.
		 */
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			/* The block is split over two output regions. */
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			/* Only 'need' bytes of this call's data were used. */
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		/* Bytes of this call's input that are still unprocessed. */
		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			/* Stash the tail for the next call or the final. */
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}
200 
201 int
202 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
203     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
204     void (*copy_block)(uint8_t *, uint8_t *),
205     void (*xor_block)(uint8_t *, uint8_t *))
206 {
207 	(void) copy_block;
208 #ifdef CAN_USE_GCM_ASM
209 	if (ctx->gcm_use_avx == B_TRUE)
210 		return (gcm_encrypt_final_avx(ctx, out, block_size));
211 #endif
212 
213 	const gcm_impl_ops_t *gops;
214 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
215 	uint8_t *ghash, *macp = NULL;
216 	int i, rv;
217 
218 	if (out->cd_length <
219 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
220 		return (CRYPTO_DATA_LEN_RANGE);
221 	}
222 
223 	gops = gcm_impl_get_ops();
224 	ghash = (uint8_t *)ctx->gcm_ghash;
225 
226 	if (ctx->gcm_remainder_len > 0) {
227 		uint64_t counter;
228 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
229 
230 		/*
231 		 * Here is where we deal with data that is not a
232 		 * multiple of the block size.
233 		 */
234 
235 		/*
236 		 * Increment counter.
237 		 */
238 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
239 		counter = htonll(counter + 1);
240 		counter &= counter_mask;
241 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
242 
243 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
244 		    (uint8_t *)ctx->gcm_tmp);
245 
246 		macp = (uint8_t *)ctx->gcm_remainder;
247 		memset(macp + ctx->gcm_remainder_len, 0,
248 		    block_size - ctx->gcm_remainder_len);
249 
250 		/* XOR with counter block */
251 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
252 			macp[i] ^= tmpp[i];
253 		}
254 
255 		/* add ciphertext to the hash */
256 		GHASH(ctx, macp, ghash, gops);
257 
258 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
259 	}
260 
261 	ctx->gcm_len_a_len_c[1] =
262 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
263 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
264 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
265 	    (uint8_t *)ctx->gcm_J0);
266 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
267 
268 	if (ctx->gcm_remainder_len > 0) {
269 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
270 		if (rv != CRYPTO_SUCCESS)
271 			return (rv);
272 	}
273 	out->cd_offset += ctx->gcm_remainder_len;
274 	ctx->gcm_remainder_len = 0;
275 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
276 	if (rv != CRYPTO_SUCCESS)
277 		return (rv);
278 	out->cd_offset += ctx->gcm_tag_len;
279 
280 	return (CRYPTO_SUCCESS);
281 }
282 
283 /*
284  * This will only deal with decrypting the last block of the input that
285  * might not be a multiple of block length.
286  */
287 static void
288 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
289     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
290     void (*xor_block)(uint8_t *, uint8_t *))
291 {
292 	uint8_t *datap, *outp, *counterp;
293 	uint64_t counter;
294 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
295 	int i;
296 
297 	/*
298 	 * Increment counter.
299 	 * Counter bits are confined to the bottom 32 bits
300 	 */
301 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
302 	counter = htonll(counter + 1);
303 	counter &= counter_mask;
304 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
305 
306 	datap = (uint8_t *)ctx->gcm_remainder;
307 	outp = &((ctx->gcm_pt_buf)[index]);
308 	counterp = (uint8_t *)ctx->gcm_tmp;
309 
310 	/* authentication tag */
311 	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
312 	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
313 
314 	/* add ciphertext to the hash */
315 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
316 
317 	/* decrypt remaining ciphertext */
318 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
319 
320 	/* XOR with counter block */
321 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
322 		outp[i] = datap[i] ^ counterp[i];
323 	}
324 }
325 
326 int
327 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
328     crypto_data_t *out, size_t block_size,
329     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
330     void (*copy_block)(uint8_t *, uint8_t *),
331     void (*xor_block)(uint8_t *, uint8_t *))
332 {
333 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
334 	    (void) xor_block;
335 	size_t new_len;
336 	uint8_t *new;
337 
338 	/*
339 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
340 	 * Ciphertext will be decrypted in the final.
341 	 */
342 	if (length > 0) {
343 		new_len = ctx->gcm_pt_buf_len + length;
344 		new = vmem_alloc(new_len, KM_SLEEP);
345 		if (new == NULL) {
346 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
347 			ctx->gcm_pt_buf = NULL;
348 			return (CRYPTO_HOST_MEMORY);
349 		}
350 
351 		if (ctx->gcm_pt_buf != NULL) {
352 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
353 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
354 		} else {
355 			ASSERT0(ctx->gcm_pt_buf_len);
356 		}
357 
358 		ctx->gcm_pt_buf = new;
359 		ctx->gcm_pt_buf_len = new_len;
360 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
361 		    length);
362 		ctx->gcm_processed_data_len += length;
363 	}
364 
365 	ctx->gcm_remainder_len = 0;
366 	return (CRYPTO_SUCCESS);
367 }
368 
369 int
370 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
371     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
372     void (*xor_block)(uint8_t *, uint8_t *))
373 {
374 #ifdef CAN_USE_GCM_ASM
375 	if (ctx->gcm_use_avx == B_TRUE)
376 		return (gcm_decrypt_final_avx(ctx, out, block_size));
377 #endif
378 
379 	const gcm_impl_ops_t *gops;
380 	size_t pt_len;
381 	size_t remainder;
382 	uint8_t *ghash;
383 	uint8_t *blockp;
384 	uint8_t *cbp;
385 	uint64_t counter;
386 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
387 	int processed = 0, rv;
388 
389 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
390 
391 	gops = gcm_impl_get_ops();
392 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
393 	ghash = (uint8_t *)ctx->gcm_ghash;
394 	blockp = ctx->gcm_pt_buf;
395 	remainder = pt_len;
396 	while (remainder > 0) {
397 		/* Incomplete last block */
398 		if (remainder < block_size) {
399 			memcpy(ctx->gcm_remainder, blockp, remainder);
400 			ctx->gcm_remainder_len = remainder;
401 			/*
402 			 * not expecting anymore ciphertext, just
403 			 * compute plaintext for the remaining input
404 			 */
405 			gcm_decrypt_incomplete_block(ctx, block_size,
406 			    processed, encrypt_block, xor_block);
407 			ctx->gcm_remainder_len = 0;
408 			goto out;
409 		}
410 		/* add ciphertext to the hash */
411 		GHASH(ctx, blockp, ghash, gops);
412 
413 		/*
414 		 * Increment counter.
415 		 * Counter bits are confined to the bottom 32 bits
416 		 */
417 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
418 		counter = htonll(counter + 1);
419 		counter &= counter_mask;
420 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
421 
422 		cbp = (uint8_t *)ctx->gcm_tmp;
423 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
424 
425 		/* XOR with ciphertext */
426 		xor_block(cbp, blockp);
427 
428 		processed += block_size;
429 		blockp += block_size;
430 		remainder -= block_size;
431 	}
432 out:
433 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
434 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
435 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
436 	    (uint8_t *)ctx->gcm_J0);
437 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
438 
439 	/* compare the input authentication tag with what we calculated */
440 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
441 		/* They don't match */
442 		return (CRYPTO_INVALID_MAC);
443 	} else {
444 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
445 		if (rv != CRYPTO_SUCCESS)
446 			return (rv);
447 		out->cd_offset += pt_len;
448 	}
449 	return (CRYPTO_SUCCESS);
450 }
451 
452 static int
453 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
454 {
455 	size_t tag_len;
456 
457 	/*
458 	 * Check the length of the authentication tag (in bits).
459 	 */
460 	tag_len = gcm_param->ulTagBits;
461 	switch (tag_len) {
462 	case 32:
463 	case 64:
464 	case 96:
465 	case 104:
466 	case 112:
467 	case 120:
468 	case 128:
469 		break;
470 	default:
471 		return (CRYPTO_MECHANISM_PARAM_INVALID);
472 	}
473 
474 	if (gcm_param->ulIvLen == 0)
475 		return (CRYPTO_MECHANISM_PARAM_INVALID);
476 
477 	return (CRYPTO_SUCCESS);
478 }
479 
/*
 * Derive the initial counter block from the IV and store it in both
 * ctx->gcm_cb and ctx->gcm_J0.
 *
 * A 12 byte IV is used directly: the IV followed by a 32-bit counter value
 * of 1. Any other length is run through GHASH together with a length block.
 * The caller is expected to have set ctx->gcm_H already; gcm_init() does so
 * before calling this function.
 *
 * copy_block(src, dst) copies one block. xor_block must keep this exact name
 * because the GHASH() macro refers to it.
 */
static void
gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		/* Counter block = IV || 0x00000001. */
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/*
		 * GHASH the IV. The running value accumulates in
		 * ctx->gcm_ghash, which is used as found on entry; gcm_init()
		 * clears it again after this function returns.
		 */
		do {
			if (remainder < block_size) {
				/*
				 * Last, partial chunk: cb is not needed yet,
				 * so borrow it to hold the zero-padded bytes.
				 */
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		/* Length block: zero in the first word, IV length in bits. */
		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		/* Here the result of the multiplication is stored in J0. */
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}
528 
529 static int
530 gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
531     unsigned char *auth_data, size_t auth_data_len, size_t block_size,
532     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
533     void (*copy_block)(uint8_t *, uint8_t *),
534     void (*xor_block)(uint8_t *, uint8_t *))
535 {
536 	const gcm_impl_ops_t *gops;
537 	uint8_t *ghash, *datap, *authp;
538 	size_t remainder, processed;
539 
540 	/* encrypt zero block to get subkey H */
541 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
542 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
543 	    (uint8_t *)ctx->gcm_H);
544 
545 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
546 	    copy_block, xor_block);
547 
548 	gops = gcm_impl_get_ops();
549 	authp = (uint8_t *)ctx->gcm_tmp;
550 	ghash = (uint8_t *)ctx->gcm_ghash;
551 	memset(authp, 0, block_size);
552 	memset(ghash, 0, block_size);
553 
554 	processed = 0;
555 	remainder = auth_data_len;
556 	do {
557 		if (remainder < block_size) {
558 			/*
559 			 * There's not a block full of data, pad rest of
560 			 * buffer with zero
561 			 */
562 
563 			if (auth_data != NULL) {
564 				memset(authp, 0, block_size);
565 				memcpy(authp, &(auth_data[processed]),
566 				    remainder);
567 			} else {
568 				ASSERT0(remainder);
569 			}
570 
571 			datap = (uint8_t *)authp;
572 			remainder = 0;
573 		} else {
574 			datap = (uint8_t *)(&(auth_data[processed]));
575 			processed += block_size;
576 			remainder -= block_size;
577 		}
578 
579 		/* add auth data to the hash */
580 		GHASH(ctx, datap, ghash, gops);
581 
582 	} while (remainder > 0);
583 
584 	return (CRYPTO_SUCCESS);
585 }
586 
587 /*
588  * The following function is called at encrypt or decrypt init time
589  * for AES GCM mode.
590  *
591  * Init the GCM context struct. Handle the cycle and avx implementations here.
592  */
593 int
594 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
595     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
596     void (*copy_block)(uint8_t *, uint8_t *),
597     void (*xor_block)(uint8_t *, uint8_t *))
598 {
599 	int rv;
600 	CK_AES_GCM_PARAMS *gcm_param;
601 
602 	if (param != NULL) {
603 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
604 
605 		if ((rv = gcm_validate_args(gcm_param)) != 0) {
606 			return (rv);
607 		}
608 
609 		gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
610 		gcm_ctx->gcm_tag_len >>= 3;
611 		gcm_ctx->gcm_processed_data_len = 0;
612 
613 		/* these values are in bits */
614 		gcm_ctx->gcm_len_a_len_c[0]
615 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
616 
617 		rv = CRYPTO_SUCCESS;
618 		gcm_ctx->gcm_flags |= GCM_MODE;
619 	} else {
620 		return (CRYPTO_MECHANISM_PARAM_INVALID);
621 	}
622 
623 #ifdef CAN_USE_GCM_ASM
624 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
625 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
626 	} else {
627 		/*
628 		 * Handle the "cycle" implementation by creating avx and
629 		 * non-avx contexts alternately.
630 		 */
631 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
632 		/*
633 		 * We don't handle byte swapped key schedules in the avx
634 		 * code path.
635 		 */
636 		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
637 		if (ks->ops->needs_byteswap == B_TRUE) {
638 			gcm_ctx->gcm_use_avx = B_FALSE;
639 		}
640 		/* Use the MOVBE and the BSWAP variants alternately. */
641 		if (gcm_ctx->gcm_use_avx == B_TRUE &&
642 		    zfs_movbe_available() == B_TRUE) {
643 			(void) atomic_toggle_boolean_nv(
644 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
645 		}
646 	}
647 	/* Allocate Htab memory as needed. */
648 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
649 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
650 
651 		if (htab_len == 0) {
652 			return (CRYPTO_MECHANISM_PARAM_INVALID);
653 		}
654 		gcm_ctx->gcm_htab_len = htab_len;
655 		gcm_ctx->gcm_Htable =
656 		    kmem_alloc(htab_len, KM_SLEEP);
657 
658 		if (gcm_ctx->gcm_Htable == NULL) {
659 			return (CRYPTO_HOST_MEMORY);
660 		}
661 	}
662 	/* Avx and non avx context initialization differs from here on. */
663 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
664 #endif /* ifdef CAN_USE_GCM_ASM */
665 		if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
666 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size,
667 		    encrypt_block, copy_block, xor_block) != 0) {
668 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
669 		}
670 #ifdef CAN_USE_GCM_ASM
671 	} else {
672 		if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
673 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
674 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
675 		}
676 	}
677 #endif /* ifdef CAN_USE_GCM_ASM */
678 
679 	return (rv);
680 }
681 
682 int
683 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
684     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
685     void (*copy_block)(uint8_t *, uint8_t *),
686     void (*xor_block)(uint8_t *, uint8_t *))
687 {
688 	int rv;
689 	CK_AES_GMAC_PARAMS *gmac_param;
690 
691 	if (param != NULL) {
692 		gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;
693 
694 		gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
695 		gcm_ctx->gcm_processed_data_len = 0;
696 
697 		/* these values are in bits */
698 		gcm_ctx->gcm_len_a_len_c[0]
699 		    = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));
700 
701 		rv = CRYPTO_SUCCESS;
702 		gcm_ctx->gcm_flags |= GMAC_MODE;
703 	} else {
704 		return (CRYPTO_MECHANISM_PARAM_INVALID);
705 	}
706 
707 #ifdef CAN_USE_GCM_ASM
708 	/*
709 	 * Handle the "cycle" implementation by creating avx and non avx
710 	 * contexts alternately.
711 	 */
712 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
713 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
714 	} else {
715 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
716 	}
717 	/* We don't handle byte swapped key schedules in the avx code path. */
718 	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
719 	if (ks->ops->needs_byteswap == B_TRUE) {
720 		gcm_ctx->gcm_use_avx = B_FALSE;
721 	}
722 	/* Allocate Htab memory as needed. */
723 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
724 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
725 
726 		if (htab_len == 0) {
727 			return (CRYPTO_MECHANISM_PARAM_INVALID);
728 		}
729 		gcm_ctx->gcm_htab_len = htab_len;
730 		gcm_ctx->gcm_Htable =
731 		    kmem_alloc(htab_len, KM_SLEEP);
732 
733 		if (gcm_ctx->gcm_Htable == NULL) {
734 			return (CRYPTO_HOST_MEMORY);
735 		}
736 	}
737 
738 	/* Avx and non avx context initialization differs from here on. */
739 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
740 #endif	/* ifdef CAN_USE_GCM_ASM */
741 		if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
742 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size,
743 		    encrypt_block, copy_block, xor_block) != 0) {
744 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
745 		}
746 #ifdef CAN_USE_GCM_ASM
747 	} else {
748 		if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
749 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
750 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
751 		}
752 	}
753 #endif /* ifdef CAN_USE_GCM_ASM */
754 
755 	return (rv);
756 }
757 
758 void *
759 gcm_alloc_ctx(int kmflag)
760 {
761 	gcm_ctx_t *gcm_ctx;
762 
763 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
764 		return (NULL);
765 
766 	gcm_ctx->gcm_flags = GCM_MODE;
767 	return (gcm_ctx);
768 }
769 
770 void *
771 gmac_alloc_ctx(int kmflag)
772 {
773 	gcm_ctx_t *gcm_ctx;
774 
775 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
776 		return (NULL);
777 
778 	gcm_ctx->gcm_flags = GMAC_MODE;
779 	return (gcm_ctx);
780 }
781 
/*
 * GCM implementation that contains the fastest methods.
 * Only the name is set statically; gcm_impl_init() overwrites the struct
 * with a copy of the chosen implementation and then restores the name.
 */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Set by gcm_impl_init() once the tables below have been filled in. */
static boolean_t gcm_impl_initialized = B_FALSE;

/*
 * Hold all supported implementations: the first gcm_supp_impl_cnt entries of
 * gcm_supp_impl[] are the members of gcm_all_impl[] whose is_supported()
 * hook returned true in gcm_impl_init().
 */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
801 
802 /*
803  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
804  * SIMD implementation is not allowed in the current context, then
805  * fallback to the fastest generic implementation.
806  */
807 const gcm_impl_ops_t *
808 gcm_impl_get_ops(void)
809 {
810 	if (!kfpu_allowed())
811 		return (&gcm_generic_impl);
812 
813 	const gcm_impl_ops_t *ops = NULL;
814 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
815 
816 	switch (impl) {
817 	case IMPL_FASTEST:
818 		ASSERT(gcm_impl_initialized);
819 		ops = &gcm_fastest_impl;
820 		break;
821 	case IMPL_CYCLE:
822 		/* Cycle through supported implementations */
823 		ASSERT(gcm_impl_initialized);
824 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
825 		static size_t cycle_impl_idx = 0;
826 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
827 		ops = gcm_supp_impl[idx];
828 		break;
829 #ifdef CAN_USE_GCM_ASM
830 	case IMPL_AVX:
831 		/*
832 		 * Make sure that we return a valid implementation while
833 		 * switching to the avx implementation since there still
834 		 * may be unfinished non-avx contexts around.
835 		 */
836 		ops = &gcm_generic_impl;
837 		break;
838 #endif
839 	default:
840 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
841 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
842 		if (impl < ARRAY_SIZE(gcm_all_impl))
843 			ops = gcm_supp_impl[impl];
844 		break;
845 	}
846 
847 	ASSERT3P(ops, !=, NULL);
848 
849 	return (ops);
850 }
851 
852 /*
853  * Initialize all supported implementations.
854  */
855 void
856 gcm_impl_init(void)
857 {
858 	gcm_impl_ops_t *curr_impl;
859 	int i, c;
860 
861 	/* Move supported implementations into gcm_supp_impls */
862 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
863 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
864 
865 		if (curr_impl->is_supported())
866 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
867 	}
868 	gcm_supp_impl_cnt = c;
869 
870 	/*
871 	 * Set the fastest implementation given the assumption that the
872 	 * hardware accelerated version is the fastest.
873 	 */
874 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
875 	if (gcm_pclmulqdq_impl.is_supported()) {
876 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
877 		    sizeof (gcm_fastest_impl));
878 	} else
879 #endif
880 	{
881 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
882 		    sizeof (gcm_fastest_impl));
883 	}
884 
885 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
886 
887 #ifdef CAN_USE_GCM_ASM
888 	/*
889 	 * Use the avx implementation if it's available and the implementation
890 	 * hasn't changed from its default value of fastest on module load.
891 	 */
892 	if (gcm_avx_will_work()) {
893 #ifdef HAVE_MOVBE
894 		if (zfs_movbe_available() == B_TRUE) {
895 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
896 		}
897 #endif
898 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
899 			gcm_set_avx(B_TRUE);
900 		}
901 	}
902 #endif
903 	/* Finish initialization */
904 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
905 	gcm_impl_initialized = B_TRUE;
906 }
907 
/*
 * Implementation names that map to an IMPL_* selector rather than to an
 * entry of gcm_supp_impl[]. gcm_impl_set() checks these first.
 */
static const struct {
	const char *name;	/* name as written by the user */
	uint32_t sel;		/* selector stored in icp_gcm_impl */
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
#endif
};
918 
919 /*
920  * Function sets desired gcm implementation.
921  *
922  * If we are called before init(), user preference will be saved in
923  * user_sel_impl, and applied in later init() call. This occurs when module
924  * parameter is specified on module load. Otherwise, directly update
925  * icp_gcm_impl.
926  *
927  * @val		Name of gcm implementation to use
928  * @param	Unused.
929  */
930 int
931 gcm_impl_set(const char *val)
932 {
933 	int err = -EINVAL;
934 	char req_name[GCM_IMPL_NAME_MAX];
935 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
936 	size_t i;
937 
938 	/* sanitize input */
939 	i = strnlen(val, GCM_IMPL_NAME_MAX);
940 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
941 		return (err);
942 
943 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
944 	while (i > 0 && isspace(req_name[i-1]))
945 		i--;
946 	req_name[i] = '\0';
947 
948 	/* Check mandatory options */
949 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
950 #ifdef CAN_USE_GCM_ASM
951 		/* Ignore avx implementation if it won't work. */
952 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
953 			continue;
954 		}
955 #endif
956 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
957 			impl = gcm_impl_opts[i].sel;
958 			err = 0;
959 			break;
960 		}
961 	}
962 
963 	/* check all supported impl if init() was already called */
964 	if (err != 0 && gcm_impl_initialized) {
965 		/* check all supported implementations */
966 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
967 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
968 				impl = i;
969 				err = 0;
970 				break;
971 			}
972 		}
973 	}
974 #ifdef CAN_USE_GCM_ASM
975 	/*
976 	 * Use the avx implementation if available and the requested one is
977 	 * avx or fastest.
978 	 */
979 	if (gcm_avx_will_work() == B_TRUE &&
980 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
981 		gcm_set_avx(B_TRUE);
982 	} else {
983 		gcm_set_avx(B_FALSE);
984 	}
985 #endif
986 
987 	if (err == 0) {
988 		if (gcm_impl_initialized)
989 			atomic_swap_32(&icp_gcm_impl, impl);
990 		else
991 			atomic_swap_32(&user_sel_impl, impl);
992 	}
993 
994 	return (err);
995 }
996 
997 #if defined(_KERNEL) && defined(__linux__)
998 
999 static int
1000 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
1001 {
1002 	return (gcm_impl_set(val));
1003 }
1004 
1005 static int
1006 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
1007 {
1008 	int i, cnt = 0;
1009 	char *fmt;
1010 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
1011 
1012 	ASSERT(gcm_impl_initialized);
1013 
1014 	/* list mandatory options */
1015 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
1016 #ifdef CAN_USE_GCM_ASM
1017 		/* Ignore avx implementation if it won't work. */
1018 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
1019 			continue;
1020 		}
1021 #endif
1022 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
1023 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1024 		    gcm_impl_opts[i].name);
1025 	}
1026 
1027 	/* list all supported implementations */
1028 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
1029 		fmt = (i == impl) ? "[%s] " : "%s ";
1030 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1031 		    gcm_supp_impl[i]->name);
1032 	}
1033 
1034 	return (cnt);
1035 }
1036 
1037 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
1038     NULL, 0644);
1039 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
1041 
1042 #ifdef CAN_USE_GCM_ASM
/* Size of one GCM block in bytes. */
#define	GCM_BLOCK_LEN 16
/*
 * The openssl asm routines are 6x aggregated and need that many bytes
 * at minimum.
 */
#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
 * This is 128 KiB rounded down to a multiple of GCM_AVX_MIN_DECRYPT_BYTES.
 */
#define	GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()
/* Fold len bytes starting at in into the running hash of ctx. */
#define	GHASH_AVX(ctx, in, len) \
    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
    (in), (len))

/* Advance the counter block of ctx by one. */
#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by((ctx), 1)

/*
 * Get the chunk size module parameter. The expansion is parenthesized so
 * the macro can be used safely inside any larger expression.
 */
#define	GCM_CHUNK_SIZE_READ (*(volatile uint32_t *) &gcm_avx_chunk_size)
1067 
/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * The default is 32 KiB rounded down to a multiple of
 * GCM_AVX_MIN_DECRYPT_BYTES. In kernel builds icp_gcm_avx_set_chunk_size()
 * rounds new values down the same way and rejects results smaller than
 * GCM_AVX_MIN_ENCRYPT_BYTES or larger than GCM_AVX_MAX_CHUNK_SIZE.
 * Readers access it through GCM_CHUNK_SIZE_READ.
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1075 
/*
 * Externally implemented (ASMABI) helpers used by the AVX code path.
 * The callers in this file only invoke them between kfpu_begin() and
 * kfpu_end().
 */

/* Invoked through clear_fpu_regs() before each kfpu_end(). */
extern void ASMABI clear_fpu_regs_avx(void);
/*
 * Callers pass a 16 byte block as src and use dst as the result afterwards;
 * presumably dst ^= src -- confirm against the assembly.
 */
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
/* Encrypt one block pt into ct using round keys rk and nr rounds. */
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

/* Called once per gcm_init_avx() to fill Htable from the hash subkey H. */
extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
/* Fold len bytes at in into ghash; used via the GHASH_AVX() macro. */
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);

/*
 * Bulk routines. Callers pass (input, output, length, key, counter block,
 * ghash) and interpret the return value as the number of bytes processed:
 * for full chunks anything other than the requested length is treated as
 * failure, for the trailing partial chunk a return of 0 is.
 */
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
1090 
1091 static inline boolean_t
1092 gcm_avx_will_work(void)
1093 {
1094 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1095 	return (kfpu_allowed() &&
1096 	    zfs_avx_available() && zfs_aes_available() &&
1097 	    zfs_pclmulqdq_available());
1098 }
1099 
1100 static inline void
1101 gcm_set_avx(boolean_t val)
1102 {
1103 	if (gcm_avx_will_work() == B_TRUE) {
1104 		atomic_swap_32(&gcm_use_avx, val);
1105 	}
1106 }
1107 
1108 static inline boolean_t
1109 gcm_toggle_avx(void)
1110 {
1111 	if (gcm_avx_will_work() == B_TRUE) {
1112 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1113 	} else {
1114 		return (B_FALSE);
1115 	}
1116 }
1117 
1118 static inline size_t
1119 gcm_simd_get_htab_size(boolean_t simd_mode)
1120 {
1121 	switch (simd_mode) {
1122 	case B_TRUE:
1123 		return (2 * 6 * 2 * sizeof (uint64_t));
1124 
1125 	default:
1126 		return (0);
1127 	}
1128 }
1129 
/*
 * Clear sensitive data in the context.
 *
 * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and
 * ctx->gcm_Htable contain the hash sub key which protects authentication.
 *
 * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for
 * a known plaintext attack, they consists of the IV and the first and last
 * counter respectively. If they should be cleared is debatable.
 *
 * This function zeroes gcm_remainder, gcm_H, gcm_J0 and gcm_tmp only.
 * NOTE(review): ctx->gcm_Htable is not cleared here; presumably that happens
 * where the table is released -- confirm.
 */
static inline void
gcm_clear_ctx(gcm_ctx_t *ctx)
{
	memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder));
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0));
	memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp));
}
1148 
1149 /* Increment the GCM counter block by n. */
1150 static inline void
1151 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1152 {
1153 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1154 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1155 
1156 	counter = htonll(counter + n);
1157 	counter &= counter_mask;
1158 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1159 }
1160 
1161 /*
1162  * Encrypt multiple blocks of data in GCM mode.
1163  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1164  * if possible. While processing a chunk the FPU is "locked".
1165  */
1166 static int
1167 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1168     size_t length, crypto_data_t *out, size_t block_size)
1169 {
1170 	size_t bleft = length;
1171 	size_t need = 0;
1172 	size_t done = 0;
1173 	uint8_t *datap = (uint8_t *)data;
1174 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1175 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1176 	uint64_t *ghash = ctx->gcm_ghash;
1177 	uint64_t *cb = ctx->gcm_cb;
1178 	uint8_t *ct_buf = NULL;
1179 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1180 	int rv = CRYPTO_SUCCESS;
1181 
1182 	ASSERT(block_size == GCM_BLOCK_LEN);
1183 	/*
1184 	 * If the last call left an incomplete block, try to fill
1185 	 * it first.
1186 	 */
1187 	if (ctx->gcm_remainder_len > 0) {
1188 		need = block_size - ctx->gcm_remainder_len;
1189 		if (length < need) {
1190 			/* Accumulate bytes here and return. */
1191 			memcpy((uint8_t *)ctx->gcm_remainder +
1192 			    ctx->gcm_remainder_len, datap, length);
1193 
1194 			ctx->gcm_remainder_len += length;
1195 			if (ctx->gcm_copy_to == NULL) {
1196 				ctx->gcm_copy_to = datap;
1197 			}
1198 			return (CRYPTO_SUCCESS);
1199 		} else {
1200 			/* Complete incomplete block. */
1201 			memcpy((uint8_t *)ctx->gcm_remainder +
1202 			    ctx->gcm_remainder_len, datap, need);
1203 
1204 			ctx->gcm_copy_to = NULL;
1205 		}
1206 	}
1207 
1208 	/* Allocate a buffer to encrypt to if there is enough input. */
1209 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1210 		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1211 		if (ct_buf == NULL) {
1212 			return (CRYPTO_HOST_MEMORY);
1213 		}
1214 	}
1215 
1216 	/* If we completed an incomplete block, encrypt and write it out. */
1217 	if (ctx->gcm_remainder_len > 0) {
1218 		kfpu_begin();
1219 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1220 		    (const uint32_t *)cb, (uint32_t *)tmp);
1221 
1222 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1223 		GHASH_AVX(ctx, tmp, block_size);
1224 		clear_fpu_regs();
1225 		kfpu_end();
1226 		rv = crypto_put_output_data(tmp, out, block_size);
1227 		out->cd_offset += block_size;
1228 		gcm_incr_counter_block(ctx);
1229 		ctx->gcm_processed_data_len += block_size;
1230 		bleft -= need;
1231 		datap += need;
1232 		ctx->gcm_remainder_len = 0;
1233 	}
1234 
1235 	/* Do the bulk encryption in chunk_size blocks. */
1236 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1237 		kfpu_begin();
1238 		done = aesni_gcm_encrypt(
1239 		    datap, ct_buf, chunk_size, key, cb, ghash);
1240 
1241 		clear_fpu_regs();
1242 		kfpu_end();
1243 		if (done != chunk_size) {
1244 			rv = CRYPTO_FAILED;
1245 			goto out_nofpu;
1246 		}
1247 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1248 		if (rv != CRYPTO_SUCCESS) {
1249 			goto out_nofpu;
1250 		}
1251 		out->cd_offset += chunk_size;
1252 		datap += chunk_size;
1253 		ctx->gcm_processed_data_len += chunk_size;
1254 	}
1255 	/* Check if we are already done. */
1256 	if (bleft == 0) {
1257 		goto out_nofpu;
1258 	}
1259 	/* Bulk encrypt the remaining data. */
1260 	kfpu_begin();
1261 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1262 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1263 		if (done == 0) {
1264 			rv = CRYPTO_FAILED;
1265 			goto out;
1266 		}
1267 		rv = crypto_put_output_data(ct_buf, out, done);
1268 		if (rv != CRYPTO_SUCCESS) {
1269 			goto out;
1270 		}
1271 		out->cd_offset += done;
1272 		ctx->gcm_processed_data_len += done;
1273 		datap += done;
1274 		bleft -= done;
1275 
1276 	}
1277 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1278 	while (bleft > 0) {
1279 		if (bleft < block_size) {
1280 			memcpy(ctx->gcm_remainder, datap, bleft);
1281 			ctx->gcm_remainder_len = bleft;
1282 			ctx->gcm_copy_to = datap;
1283 			goto out;
1284 		}
1285 		/* Encrypt, hash and write out. */
1286 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1287 		    (const uint32_t *)cb, (uint32_t *)tmp);
1288 
1289 		gcm_xor_avx(datap, tmp);
1290 		GHASH_AVX(ctx, tmp, block_size);
1291 		rv = crypto_put_output_data(tmp, out, block_size);
1292 		if (rv != CRYPTO_SUCCESS) {
1293 			goto out;
1294 		}
1295 		out->cd_offset += block_size;
1296 		gcm_incr_counter_block(ctx);
1297 		ctx->gcm_processed_data_len += block_size;
1298 		datap += block_size;
1299 		bleft -= block_size;
1300 	}
1301 out:
1302 	clear_fpu_regs();
1303 	kfpu_end();
1304 out_nofpu:
1305 	if (ct_buf != NULL) {
1306 		vmem_free(ct_buf, chunk_size);
1307 	}
1308 	return (rv);
1309 }
1310 
1311 /*
1312  * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
1313  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1314  */
1315 static int
1316 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1317 {
1318 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1319 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1320 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1321 	size_t rem_len = ctx->gcm_remainder_len;
1322 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1323 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1324 	int rv;
1325 
1326 	ASSERT(block_size == GCM_BLOCK_LEN);
1327 
1328 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1329 		return (CRYPTO_DATA_LEN_RANGE);
1330 	}
1331 
1332 	kfpu_begin();
1333 	/* Pad last incomplete block with zeros, encrypt and hash. */
1334 	if (rem_len > 0) {
1335 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1336 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1337 
1338 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1339 		memset(remainder + rem_len, 0, block_size - rem_len);
1340 		for (int i = 0; i < rem_len; i++) {
1341 			remainder[i] ^= tmp[i];
1342 		}
1343 		GHASH_AVX(ctx, remainder, block_size);
1344 		ctx->gcm_processed_data_len += rem_len;
1345 		/* No need to increment counter_block, it's the last block. */
1346 	}
1347 	/* Finish tag. */
1348 	ctx->gcm_len_a_len_c[1] =
1349 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1350 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1351 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1352 
1353 	gcm_xor_avx((uint8_t *)J0, ghash);
1354 	clear_fpu_regs();
1355 	kfpu_end();
1356 
1357 	/* Output remainder. */
1358 	if (rem_len > 0) {
1359 		rv = crypto_put_output_data(remainder, out, rem_len);
1360 		if (rv != CRYPTO_SUCCESS)
1361 			return (rv);
1362 	}
1363 	out->cd_offset += rem_len;
1364 	ctx->gcm_remainder_len = 0;
1365 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1366 	if (rv != CRYPTO_SUCCESS)
1367 		return (rv);
1368 
1369 	out->cd_offset += ctx->gcm_tag_len;
1370 	/* Clear sensitive data in the context before returning. */
1371 	gcm_clear_ctx(ctx);
1372 	return (CRYPTO_SUCCESS);
1373 }
1374 
1375 /*
1376  * Finalize decryption: We just have accumulated crypto text, so now we
1377  * decrypt it here inplace.
1378  */
1379 static int
1380 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1381 {
1382 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1383 	ASSERT3U(block_size, ==, 16);
1384 
1385 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1386 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1387 	uint8_t *datap = ctx->gcm_pt_buf;
1388 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1389 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1390 	uint64_t *ghash = ctx->gcm_ghash;
1391 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1392 	int rv = CRYPTO_SUCCESS;
1393 	size_t bleft, done;
1394 
1395 	/*
1396 	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1397 	 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1398 	 * GCM_AVX_MIN_DECRYPT_BYTES.
1399 	 */
1400 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1401 		kfpu_begin();
1402 		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1403 		    (const void *)key, ctx->gcm_cb, ghash);
1404 		clear_fpu_regs();
1405 		kfpu_end();
1406 		if (done != chunk_size) {
1407 			return (CRYPTO_FAILED);
1408 		}
1409 		datap += done;
1410 	}
1411 	/* Decrypt remainder, which is less than chunk size, in one go. */
1412 	kfpu_begin();
1413 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1414 		done = aesni_gcm_decrypt(datap, datap, bleft,
1415 		    (const void *)key, ctx->gcm_cb, ghash);
1416 		if (done == 0) {
1417 			clear_fpu_regs();
1418 			kfpu_end();
1419 			return (CRYPTO_FAILED);
1420 		}
1421 		datap += done;
1422 		bleft -= done;
1423 	}
1424 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1425 
1426 	/*
1427 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1428 	 * decrypt them block by block.
1429 	 */
1430 	while (bleft > 0) {
1431 		/* Incomplete last block. */
1432 		if (bleft < block_size) {
1433 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1434 
1435 			memset(lastb, 0, block_size);
1436 			memcpy(lastb, datap, bleft);
1437 			/* The GCM processing. */
1438 			GHASH_AVX(ctx, lastb, block_size);
1439 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1440 			for (size_t i = 0; i < bleft; i++) {
1441 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1442 			}
1443 			break;
1444 		}
1445 		/* The GCM processing. */
1446 		GHASH_AVX(ctx, datap, block_size);
1447 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1448 		gcm_xor_avx((uint8_t *)tmp, datap);
1449 		gcm_incr_counter_block(ctx);
1450 
1451 		datap += block_size;
1452 		bleft -= block_size;
1453 	}
1454 	if (rv != CRYPTO_SUCCESS) {
1455 		clear_fpu_regs();
1456 		kfpu_end();
1457 		return (rv);
1458 	}
1459 	/* Decryption done, finish the tag. */
1460 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1461 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1462 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1463 	    (uint32_t *)ctx->gcm_J0);
1464 
1465 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1466 
1467 	/* We are done with the FPU, restore its state. */
1468 	clear_fpu_regs();
1469 	kfpu_end();
1470 
1471 	/* Compare the input authentication tag with what we calculated. */
1472 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1473 		/* They don't match. */
1474 		return (CRYPTO_INVALID_MAC);
1475 	}
1476 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1477 	if (rv != CRYPTO_SUCCESS) {
1478 		return (rv);
1479 	}
1480 	out->cd_offset += pt_len;
1481 	gcm_clear_ctx(ctx);
1482 	return (CRYPTO_SUCCESS);
1483 }
1484 
1485 /*
1486  * Initialize the GCM params H, Htabtle and the counter block. Save the
1487  * initial counter block.
1488  */
1489 static int
1490 gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
1491     unsigned char *auth_data, size_t auth_data_len, size_t block_size)
1492 {
1493 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1494 	uint64_t *H = ctx->gcm_H;
1495 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1496 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1497 	uint8_t *datap = auth_data;
1498 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1499 	size_t bleft;
1500 
1501 	ASSERT(block_size == GCM_BLOCK_LEN);
1502 
1503 	/* Init H (encrypt zero block) and create the initial counter block. */
1504 	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1505 	memset(H, 0, sizeof (ctx->gcm_H));
1506 	kfpu_begin();
1507 	aes_encrypt_intel(keysched, aes_rounds,
1508 	    (const uint32_t *)H, (uint32_t *)H);
1509 
1510 	gcm_init_htab_avx(ctx->gcm_Htable, H);
1511 
1512 	if (iv_len == 12) {
1513 		memcpy(cb, iv, 12);
1514 		cb[12] = 0;
1515 		cb[13] = 0;
1516 		cb[14] = 0;
1517 		cb[15] = 1;
1518 		/* We need the ICB later. */
1519 		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1520 	} else {
1521 		/*
1522 		 * Most consumers use 12 byte IVs, so it's OK to use the
1523 		 * original routines for other IV sizes, just avoid nesting
1524 		 * kfpu_begin calls.
1525 		 */
1526 		clear_fpu_regs();
1527 		kfpu_end();
1528 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1529 		    aes_copy_block, aes_xor_block);
1530 		kfpu_begin();
1531 	}
1532 
1533 	/* Openssl post increments the counter, adjust for that. */
1534 	gcm_incr_counter_block(ctx);
1535 
1536 	/* Ghash AAD in chunk_size blocks. */
1537 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1538 		GHASH_AVX(ctx, datap, chunk_size);
1539 		datap += chunk_size;
1540 		clear_fpu_regs();
1541 		kfpu_end();
1542 		kfpu_begin();
1543 	}
1544 	/* Ghash the remainder and handle possible incomplete GCM block. */
1545 	if (bleft > 0) {
1546 		size_t incomp = bleft % block_size;
1547 
1548 		bleft -= incomp;
1549 		if (bleft > 0) {
1550 			GHASH_AVX(ctx, datap, bleft);
1551 			datap += bleft;
1552 		}
1553 		if (incomp > 0) {
1554 			/* Zero pad and hash incomplete last block. */
1555 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1556 
1557 			memset(authp, 0, block_size);
1558 			memcpy(authp, datap, incomp);
1559 			GHASH_AVX(ctx, authp, block_size);
1560 		}
1561 	}
1562 	clear_fpu_regs();
1563 	kfpu_end();
1564 	return (CRYPTO_SUCCESS);
1565 }
1566 
1567 #if defined(_KERNEL)
1568 static int
1569 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1570 {
1571 	unsigned long val;
1572 	char val_rounded[16];
1573 	int error = 0;
1574 
1575 	error = kstrtoul(buf, 0, &val);
1576 	if (error)
1577 		return (error);
1578 
1579 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1580 
1581 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1582 		return (-EINVAL);
1583 
1584 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1585 	error = param_set_uint(val_rounded, kp);
1586 	return (error);
1587 }
1588 
/*
 * Expose gcm_avx_chunk_size as module parameter icp_gcm_avx_chunk_size with
 * mode 0644. Writes go through icp_gcm_avx_set_chunk_size(), reads through
 * param_get_uint().
 */
module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
	"How many bytes to process while owning the FPU");
1594 
#endif /* defined(_KERNEL) */
1596 #endif /* ifdef CAN_USE_GCM_ASM */
1597