xref: /freebsd/sys/contrib/openzfs/module/icp/algs/modes/gcm.c (revision e92ffd9b626833ebdbf2742c8ffddc6cd94b963e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <modes/modes.h>
27 #include <sys/crypto/common.h>
28 #include <sys/crypto/icp.h>
29 #include <sys/crypto/impl.h>
30 #include <sys/byteorder.h>
31 #include <sys/simd.h>
32 #include <modes/gcm_impl.h>
33 #ifdef CAN_USE_GCM_ASM
34 #include <aes/aes_impl.h>
35 #endif
36 
/*
 * GHASH(c, d, t, o): fold one block into the running hash of context c.
 *
 * XORs block d into (c)->gcm_ghash through the xor_block callback, which
 * must be in scope at the expansion site, and then calls (o)->mul() with
 * (c)->gcm_ghash, (c)->gcm_H and t as its three arguments.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement and remains correct in an unbraced if/else.  Note that c is
 * evaluated more than once; pass an expression without side effects.
 */
#define	GHASH(c, d, t, o) \
	do { \
		xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
		(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
		    (uint64_t *)(void *)(t)); \
	} while (0)
41 
/*
 * GCM implementation selection.
 *
 * The named selectors live at the top of the uint32_t range.  Any other
 * value is treated by gcm_impl_get_ops() as an index into gcm_supp_impl[].
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX - 2)
#endif

/* Read a selector through a volatile-qualified lvalue. */
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))

/* Selector currently in effect. */
static uint32_t icp_gcm_impl = IMPL_FASTEST;
/* Selector requested before gcm_impl_init() ran; applied by it. */
static uint32_t user_sel_impl = IMPL_FASTEST;
51 
#ifdef CAN_USE_GCM_ASM
/* B_TRUE if the MOVBE instruction may be used by the avx code path. */
boolean_t gcm_avx_can_use_movbe = B_FALSE;

/*
 * Selects the optimized openssl gcm and ghash implementations.
 * Set to true if module parameter icp_gcm_impl == "avx".
 */
static boolean_t gcm_use_avx = B_FALSE;
/* Read gcm_use_avx through a volatile-qualified lvalue. */
#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)

extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);

/* Helpers for querying and switching the avx implementation. */
static inline boolean_t gcm_avx_will_work(void);
static inline boolean_t gcm_toggle_avx(void);
static inline void gcm_set_avx(boolean_t);
static inline size_t gcm_simd_get_htab_size(boolean_t);

/* avx counterparts of the init, update and final entry points. */
static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
    size_t, size_t);
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);
static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */
77 
78 /*
79  * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
80  * is done in another function.
81  */
82 int
83 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
84     crypto_data_t *out, size_t block_size,
85     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
86     void (*copy_block)(uint8_t *, uint8_t *),
87     void (*xor_block)(uint8_t *, uint8_t *))
88 {
89 #ifdef CAN_USE_GCM_ASM
90 	if (ctx->gcm_use_avx == B_TRUE)
91 		return (gcm_mode_encrypt_contiguous_blocks_avx(
92 		    ctx, data, length, out, block_size));
93 #endif
94 
95 	const gcm_impl_ops_t *gops;
96 	size_t remainder = length;
97 	size_t need = 0;
98 	uint8_t *datap = (uint8_t *)data;
99 	uint8_t *blockp;
100 	uint8_t *lastp;
101 	void *iov_or_mp;
102 	offset_t offset;
103 	uint8_t *out_data_1;
104 	uint8_t *out_data_2;
105 	size_t out_data_1_len;
106 	uint64_t counter;
107 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
108 
109 	if (length + ctx->gcm_remainder_len < block_size) {
110 		/* accumulate bytes here and return */
111 		bcopy(datap,
112 		    (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
113 		    length);
114 		ctx->gcm_remainder_len += length;
115 		if (ctx->gcm_copy_to == NULL) {
116 			ctx->gcm_copy_to = datap;
117 		}
118 		return (CRYPTO_SUCCESS);
119 	}
120 
121 	lastp = (uint8_t *)ctx->gcm_cb;
122 	crypto_init_ptrs(out, &iov_or_mp, &offset);
123 
124 	gops = gcm_impl_get_ops();
125 	do {
126 		/* Unprocessed data from last call. */
127 		if (ctx->gcm_remainder_len > 0) {
128 			need = block_size - ctx->gcm_remainder_len;
129 
130 			if (need > remainder)
131 				return (CRYPTO_DATA_LEN_RANGE);
132 
133 			bcopy(datap, &((uint8_t *)ctx->gcm_remainder)
134 			    [ctx->gcm_remainder_len], need);
135 
136 			blockp = (uint8_t *)ctx->gcm_remainder;
137 		} else {
138 			blockp = datap;
139 		}
140 
141 		/*
142 		 * Increment counter. Counter bits are confined
143 		 * to the bottom 32 bits of the counter block.
144 		 */
145 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
146 		counter = htonll(counter + 1);
147 		counter &= counter_mask;
148 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
149 
150 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
151 		    (uint8_t *)ctx->gcm_tmp);
152 		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
153 
154 		lastp = (uint8_t *)ctx->gcm_tmp;
155 
156 		ctx->gcm_processed_data_len += block_size;
157 
158 		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
159 		    &out_data_1_len, &out_data_2, block_size);
160 
161 		/* copy block to where it belongs */
162 		if (out_data_1_len == block_size) {
163 			copy_block(lastp, out_data_1);
164 		} else {
165 			bcopy(lastp, out_data_1, out_data_1_len);
166 			if (out_data_2 != NULL) {
167 				bcopy(lastp + out_data_1_len,
168 				    out_data_2,
169 				    block_size - out_data_1_len);
170 			}
171 		}
172 		/* update offset */
173 		out->cd_offset += block_size;
174 
175 		/* add ciphertext to the hash */
176 		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
177 
178 		/* Update pointer to next block of data to be processed. */
179 		if (ctx->gcm_remainder_len != 0) {
180 			datap += need;
181 			ctx->gcm_remainder_len = 0;
182 		} else {
183 			datap += block_size;
184 		}
185 
186 		remainder = (size_t)&data[length] - (size_t)datap;
187 
188 		/* Incomplete last block. */
189 		if (remainder > 0 && remainder < block_size) {
190 			bcopy(datap, ctx->gcm_remainder, remainder);
191 			ctx->gcm_remainder_len = remainder;
192 			ctx->gcm_copy_to = datap;
193 			goto out;
194 		}
195 		ctx->gcm_copy_to = NULL;
196 
197 	} while (remainder > 0);
198 out:
199 	return (CRYPTO_SUCCESS);
200 }
201 
202 int
203 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
204     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
205     void (*copy_block)(uint8_t *, uint8_t *),
206     void (*xor_block)(uint8_t *, uint8_t *))
207 {
208 	(void) copy_block;
209 #ifdef CAN_USE_GCM_ASM
210 	if (ctx->gcm_use_avx == B_TRUE)
211 		return (gcm_encrypt_final_avx(ctx, out, block_size));
212 #endif
213 
214 	const gcm_impl_ops_t *gops;
215 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
216 	uint8_t *ghash, *macp = NULL;
217 	int i, rv;
218 
219 	if (out->cd_length <
220 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
221 		return (CRYPTO_DATA_LEN_RANGE);
222 	}
223 
224 	gops = gcm_impl_get_ops();
225 	ghash = (uint8_t *)ctx->gcm_ghash;
226 
227 	if (ctx->gcm_remainder_len > 0) {
228 		uint64_t counter;
229 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
230 
231 		/*
232 		 * Here is where we deal with data that is not a
233 		 * multiple of the block size.
234 		 */
235 
236 		/*
237 		 * Increment counter.
238 		 */
239 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
240 		counter = htonll(counter + 1);
241 		counter &= counter_mask;
242 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
243 
244 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
245 		    (uint8_t *)ctx->gcm_tmp);
246 
247 		macp = (uint8_t *)ctx->gcm_remainder;
248 		bzero(macp + ctx->gcm_remainder_len,
249 		    block_size - ctx->gcm_remainder_len);
250 
251 		/* XOR with counter block */
252 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
253 			macp[i] ^= tmpp[i];
254 		}
255 
256 		/* add ciphertext to the hash */
257 		GHASH(ctx, macp, ghash, gops);
258 
259 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
260 	}
261 
262 	ctx->gcm_len_a_len_c[1] =
263 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
264 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
265 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
266 	    (uint8_t *)ctx->gcm_J0);
267 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
268 
269 	if (ctx->gcm_remainder_len > 0) {
270 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
271 		if (rv != CRYPTO_SUCCESS)
272 			return (rv);
273 	}
274 	out->cd_offset += ctx->gcm_remainder_len;
275 	ctx->gcm_remainder_len = 0;
276 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
277 	if (rv != CRYPTO_SUCCESS)
278 		return (rv);
279 	out->cd_offset += ctx->gcm_tag_len;
280 
281 	return (CRYPTO_SUCCESS);
282 }
283 
284 /*
285  * This will only deal with decrypting the last block of the input that
286  * might not be a multiple of block length.
287  */
288 static void
289 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
290     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
291     void (*xor_block)(uint8_t *, uint8_t *))
292 {
293 	uint8_t *datap, *outp, *counterp;
294 	uint64_t counter;
295 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
296 	int i;
297 
298 	/*
299 	 * Increment counter.
300 	 * Counter bits are confined to the bottom 32 bits
301 	 */
302 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
303 	counter = htonll(counter + 1);
304 	counter &= counter_mask;
305 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
306 
307 	datap = (uint8_t *)ctx->gcm_remainder;
308 	outp = &((ctx->gcm_pt_buf)[index]);
309 	counterp = (uint8_t *)ctx->gcm_tmp;
310 
311 	/* authentication tag */
312 	bzero((uint8_t *)ctx->gcm_tmp, block_size);
313 	bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);
314 
315 	/* add ciphertext to the hash */
316 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
317 
318 	/* decrypt remaining ciphertext */
319 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
320 
321 	/* XOR with counter block */
322 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
323 		outp[i] = datap[i] ^ counterp[i];
324 	}
325 }
326 
327 int
328 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
329     crypto_data_t *out, size_t block_size,
330     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
331     void (*copy_block)(uint8_t *, uint8_t *),
332     void (*xor_block)(uint8_t *, uint8_t *))
333 {
334 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
335 	    (void) xor_block;
336 	size_t new_len;
337 	uint8_t *new;
338 
339 	/*
340 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
341 	 * Ciphertext will be decrypted in the final.
342 	 */
343 	if (length > 0) {
344 		new_len = ctx->gcm_pt_buf_len + length;
345 		new = vmem_alloc(new_len, ctx->gcm_kmflag);
346 		if (new == NULL) {
347 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
348 			ctx->gcm_pt_buf = NULL;
349 			return (CRYPTO_HOST_MEMORY);
350 		}
351 		bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len);
352 		vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
353 		ctx->gcm_pt_buf = new;
354 		ctx->gcm_pt_buf_len = new_len;
355 		bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len],
356 		    length);
357 		ctx->gcm_processed_data_len += length;
358 	}
359 
360 	ctx->gcm_remainder_len = 0;
361 	return (CRYPTO_SUCCESS);
362 }
363 
364 int
365 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
366     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
367     void (*xor_block)(uint8_t *, uint8_t *))
368 {
369 #ifdef CAN_USE_GCM_ASM
370 	if (ctx->gcm_use_avx == B_TRUE)
371 		return (gcm_decrypt_final_avx(ctx, out, block_size));
372 #endif
373 
374 	const gcm_impl_ops_t *gops;
375 	size_t pt_len;
376 	size_t remainder;
377 	uint8_t *ghash;
378 	uint8_t *blockp;
379 	uint8_t *cbp;
380 	uint64_t counter;
381 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
382 	int processed = 0, rv;
383 
384 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
385 
386 	gops = gcm_impl_get_ops();
387 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
388 	ghash = (uint8_t *)ctx->gcm_ghash;
389 	blockp = ctx->gcm_pt_buf;
390 	remainder = pt_len;
391 	while (remainder > 0) {
392 		/* Incomplete last block */
393 		if (remainder < block_size) {
394 			bcopy(blockp, ctx->gcm_remainder, remainder);
395 			ctx->gcm_remainder_len = remainder;
396 			/*
397 			 * not expecting anymore ciphertext, just
398 			 * compute plaintext for the remaining input
399 			 */
400 			gcm_decrypt_incomplete_block(ctx, block_size,
401 			    processed, encrypt_block, xor_block);
402 			ctx->gcm_remainder_len = 0;
403 			goto out;
404 		}
405 		/* add ciphertext to the hash */
406 		GHASH(ctx, blockp, ghash, gops);
407 
408 		/*
409 		 * Increment counter.
410 		 * Counter bits are confined to the bottom 32 bits
411 		 */
412 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
413 		counter = htonll(counter + 1);
414 		counter &= counter_mask;
415 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
416 
417 		cbp = (uint8_t *)ctx->gcm_tmp;
418 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
419 
420 		/* XOR with ciphertext */
421 		xor_block(cbp, blockp);
422 
423 		processed += block_size;
424 		blockp += block_size;
425 		remainder -= block_size;
426 	}
427 out:
428 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
429 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
430 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
431 	    (uint8_t *)ctx->gcm_J0);
432 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
433 
434 	/* compare the input authentication tag with what we calculated */
435 	if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
436 		/* They don't match */
437 		return (CRYPTO_INVALID_MAC);
438 	} else {
439 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
440 		if (rv != CRYPTO_SUCCESS)
441 			return (rv);
442 		out->cd_offset += pt_len;
443 	}
444 	return (CRYPTO_SUCCESS);
445 }
446 
447 static int
448 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
449 {
450 	size_t tag_len;
451 
452 	/*
453 	 * Check the length of the authentication tag (in bits).
454 	 */
455 	tag_len = gcm_param->ulTagBits;
456 	switch (tag_len) {
457 	case 32:
458 	case 64:
459 	case 96:
460 	case 104:
461 	case 112:
462 	case 120:
463 	case 128:
464 		break;
465 	default:
466 		return (CRYPTO_MECHANISM_PARAM_INVALID);
467 	}
468 
469 	if (gcm_param->ulIvLen == 0)
470 		return (CRYPTO_MECHANISM_PARAM_INVALID);
471 
472 	return (CRYPTO_SUCCESS);
473 }
474 
475 static void
476 gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
477     gcm_ctx_t *ctx, size_t block_size,
478     void (*copy_block)(uint8_t *, uint8_t *),
479     void (*xor_block)(uint8_t *, uint8_t *))
480 {
481 	const gcm_impl_ops_t *gops;
482 	uint8_t *cb;
483 	ulong_t remainder = iv_len;
484 	ulong_t processed = 0;
485 	uint8_t *datap, *ghash;
486 	uint64_t len_a_len_c[2];
487 
488 	gops = gcm_impl_get_ops();
489 	ghash = (uint8_t *)ctx->gcm_ghash;
490 	cb = (uint8_t *)ctx->gcm_cb;
491 	if (iv_len == 12) {
492 		bcopy(iv, cb, 12);
493 		cb[12] = 0;
494 		cb[13] = 0;
495 		cb[14] = 0;
496 		cb[15] = 1;
497 		/* J0 will be used again in the final */
498 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
499 	} else {
500 		/* GHASH the IV */
501 		do {
502 			if (remainder < block_size) {
503 				bzero(cb, block_size);
504 				bcopy(&(iv[processed]), cb, remainder);
505 				datap = (uint8_t *)cb;
506 				remainder = 0;
507 			} else {
508 				datap = (uint8_t *)(&(iv[processed]));
509 				processed += block_size;
510 				remainder -= block_size;
511 			}
512 			GHASH(ctx, datap, ghash, gops);
513 		} while (remainder > 0);
514 
515 		len_a_len_c[0] = 0;
516 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
517 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
518 
519 		/* J0 will be used again in the final */
520 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
521 	}
522 }
523 
524 static int
525 gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
526     unsigned char *auth_data, size_t auth_data_len, size_t block_size,
527     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
528     void (*copy_block)(uint8_t *, uint8_t *),
529     void (*xor_block)(uint8_t *, uint8_t *))
530 {
531 	const gcm_impl_ops_t *gops;
532 	uint8_t *ghash, *datap, *authp;
533 	size_t remainder, processed;
534 
535 	/* encrypt zero block to get subkey H */
536 	bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
537 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
538 	    (uint8_t *)ctx->gcm_H);
539 
540 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
541 	    copy_block, xor_block);
542 
543 	gops = gcm_impl_get_ops();
544 	authp = (uint8_t *)ctx->gcm_tmp;
545 	ghash = (uint8_t *)ctx->gcm_ghash;
546 	bzero(authp, block_size);
547 	bzero(ghash, block_size);
548 
549 	processed = 0;
550 	remainder = auth_data_len;
551 	do {
552 		if (remainder < block_size) {
553 			/*
554 			 * There's not a block full of data, pad rest of
555 			 * buffer with zero
556 			 */
557 			bzero(authp, block_size);
558 			bcopy(&(auth_data[processed]), authp, remainder);
559 			datap = (uint8_t *)authp;
560 			remainder = 0;
561 		} else {
562 			datap = (uint8_t *)(&(auth_data[processed]));
563 			processed += block_size;
564 			remainder -= block_size;
565 		}
566 
567 		/* add auth data to the hash */
568 		GHASH(ctx, datap, ghash, gops);
569 
570 	} while (remainder > 0);
571 
572 	return (CRYPTO_SUCCESS);
573 }
574 
575 /*
576  * The following function is called at encrypt or decrypt init time
577  * for AES GCM mode.
578  *
579  * Init the GCM context struct. Handle the cycle and avx implementations here.
580  */
581 int
582 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
583     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
584     void (*copy_block)(uint8_t *, uint8_t *),
585     void (*xor_block)(uint8_t *, uint8_t *))
586 {
587 	int rv;
588 	CK_AES_GCM_PARAMS *gcm_param;
589 
590 	if (param != NULL) {
591 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
592 
593 		if ((rv = gcm_validate_args(gcm_param)) != 0) {
594 			return (rv);
595 		}
596 
597 		gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
598 		gcm_ctx->gcm_tag_len >>= 3;
599 		gcm_ctx->gcm_processed_data_len = 0;
600 
601 		/* these values are in bits */
602 		gcm_ctx->gcm_len_a_len_c[0]
603 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
604 
605 		rv = CRYPTO_SUCCESS;
606 		gcm_ctx->gcm_flags |= GCM_MODE;
607 	} else {
608 		return (CRYPTO_MECHANISM_PARAM_INVALID);
609 	}
610 
611 #ifdef CAN_USE_GCM_ASM
612 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
613 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
614 	} else {
615 		/*
616 		 * Handle the "cycle" implementation by creating avx and
617 		 * non-avx contexts alternately.
618 		 */
619 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
620 		/*
621 		 * We don't handle byte swapped key schedules in the avx
622 		 * code path.
623 		 */
624 		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
625 		if (ks->ops->needs_byteswap == B_TRUE) {
626 			gcm_ctx->gcm_use_avx = B_FALSE;
627 		}
628 		/* Use the MOVBE and the BSWAP variants alternately. */
629 		if (gcm_ctx->gcm_use_avx == B_TRUE &&
630 		    zfs_movbe_available() == B_TRUE) {
631 			(void) atomic_toggle_boolean_nv(
632 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
633 		}
634 	}
635 	/* Allocate Htab memory as needed. */
636 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
637 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
638 
639 		if (htab_len == 0) {
640 			return (CRYPTO_MECHANISM_PARAM_INVALID);
641 		}
642 		gcm_ctx->gcm_htab_len = htab_len;
643 		gcm_ctx->gcm_Htable =
644 		    (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
645 
646 		if (gcm_ctx->gcm_Htable == NULL) {
647 			return (CRYPTO_HOST_MEMORY);
648 		}
649 	}
650 	/* Avx and non avx context initialization differs from here on. */
651 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
652 #endif /* ifdef CAN_USE_GCM_ASM */
653 		if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
654 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size,
655 		    encrypt_block, copy_block, xor_block) != 0) {
656 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
657 		}
658 #ifdef CAN_USE_GCM_ASM
659 	} else {
660 		if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
661 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
662 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
663 		}
664 	}
665 #endif /* ifdef CAN_USE_GCM_ASM */
666 
667 	return (rv);
668 }
669 
670 int
671 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
672     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
673     void (*copy_block)(uint8_t *, uint8_t *),
674     void (*xor_block)(uint8_t *, uint8_t *))
675 {
676 	int rv;
677 	CK_AES_GMAC_PARAMS *gmac_param;
678 
679 	if (param != NULL) {
680 		gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;
681 
682 		gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
683 		gcm_ctx->gcm_processed_data_len = 0;
684 
685 		/* these values are in bits */
686 		gcm_ctx->gcm_len_a_len_c[0]
687 		    = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));
688 
689 		rv = CRYPTO_SUCCESS;
690 		gcm_ctx->gcm_flags |= GMAC_MODE;
691 	} else {
692 		return (CRYPTO_MECHANISM_PARAM_INVALID);
693 	}
694 
695 #ifdef CAN_USE_GCM_ASM
696 	/*
697 	 * Handle the "cycle" implementation by creating avx and non avx
698 	 * contexts alternately.
699 	 */
700 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
701 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
702 	} else {
703 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
704 	}
705 	/* We don't handle byte swapped key schedules in the avx code path. */
706 	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
707 	if (ks->ops->needs_byteswap == B_TRUE) {
708 		gcm_ctx->gcm_use_avx = B_FALSE;
709 	}
710 	/* Allocate Htab memory as needed. */
711 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
712 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
713 
714 		if (htab_len == 0) {
715 			return (CRYPTO_MECHANISM_PARAM_INVALID);
716 		}
717 		gcm_ctx->gcm_htab_len = htab_len;
718 		gcm_ctx->gcm_Htable =
719 		    (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
720 
721 		if (gcm_ctx->gcm_Htable == NULL) {
722 			return (CRYPTO_HOST_MEMORY);
723 		}
724 	}
725 
726 	/* Avx and non avx context initialization differs from here on. */
727 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
728 #endif	/* ifdef CAN_USE_GCM_ASM */
729 		if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
730 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size,
731 		    encrypt_block, copy_block, xor_block) != 0) {
732 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
733 		}
734 #ifdef CAN_USE_GCM_ASM
735 	} else {
736 		if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
737 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
738 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
739 		}
740 	}
741 #endif /* ifdef CAN_USE_GCM_ASM */
742 
743 	return (rv);
744 }
745 
746 void *
747 gcm_alloc_ctx(int kmflag)
748 {
749 	gcm_ctx_t *gcm_ctx;
750 
751 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
752 		return (NULL);
753 
754 	gcm_ctx->gcm_flags = GCM_MODE;
755 	return (gcm_ctx);
756 }
757 
758 void *
759 gmac_alloc_ctx(int kmflag)
760 {
761 	gcm_ctx_t *gcm_ctx;
762 
763 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
764 		return (NULL);
765 
766 	gcm_ctx->gcm_flags = GMAC_MODE;
767 	return (gcm_ctx);
768 }
769 
770 void
771 gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag)
772 {
773 	ctx->gcm_kmflag = kmflag;
774 }
775 
776 /* GCM implementation that contains the fastest methods */
777 static gcm_impl_ops_t gcm_fastest_impl = {
778 	.name = "fastest"
779 };
780 
781 /* All compiled in implementations */
782 static const gcm_impl_ops_t *gcm_all_impl[] = {
783 	&gcm_generic_impl,
784 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
785 	&gcm_pclmulqdq_impl,
786 #endif
787 };
788 
789 /* Indicate that benchmark has been completed */
790 static boolean_t gcm_impl_initialized = B_FALSE;
791 
792 /* Hold all supported implementations */
793 static size_t gcm_supp_impl_cnt = 0;
794 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
795 
796 /*
797  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
798  * SIMD implementation is not allowed in the current context, then
799  * fallback to the fastest generic implementation.
800  */
801 const gcm_impl_ops_t *
802 gcm_impl_get_ops()
803 {
804 	if (!kfpu_allowed())
805 		return (&gcm_generic_impl);
806 
807 	const gcm_impl_ops_t *ops = NULL;
808 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
809 
810 	switch (impl) {
811 	case IMPL_FASTEST:
812 		ASSERT(gcm_impl_initialized);
813 		ops = &gcm_fastest_impl;
814 		break;
815 	case IMPL_CYCLE:
816 		/* Cycle through supported implementations */
817 		ASSERT(gcm_impl_initialized);
818 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
819 		static size_t cycle_impl_idx = 0;
820 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
821 		ops = gcm_supp_impl[idx];
822 		break;
823 #ifdef CAN_USE_GCM_ASM
824 	case IMPL_AVX:
825 		/*
826 		 * Make sure that we return a valid implementation while
827 		 * switching to the avx implementation since there still
828 		 * may be unfinished non-avx contexts around.
829 		 */
830 		ops = &gcm_generic_impl;
831 		break;
832 #endif
833 	default:
834 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
835 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
836 		if (impl < ARRAY_SIZE(gcm_all_impl))
837 			ops = gcm_supp_impl[impl];
838 		break;
839 	}
840 
841 	ASSERT3P(ops, !=, NULL);
842 
843 	return (ops);
844 }
845 
846 /*
847  * Initialize all supported implementations.
848  */
849 void
850 gcm_impl_init(void)
851 {
852 	gcm_impl_ops_t *curr_impl;
853 	int i, c;
854 
855 	/* Move supported implementations into gcm_supp_impls */
856 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
857 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
858 
859 		if (curr_impl->is_supported())
860 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
861 	}
862 	gcm_supp_impl_cnt = c;
863 
864 	/*
865 	 * Set the fastest implementation given the assumption that the
866 	 * hardware accelerated version is the fastest.
867 	 */
868 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
869 	if (gcm_pclmulqdq_impl.is_supported()) {
870 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
871 		    sizeof (gcm_fastest_impl));
872 	} else
873 #endif
874 	{
875 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
876 		    sizeof (gcm_fastest_impl));
877 	}
878 
879 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
880 
881 #ifdef CAN_USE_GCM_ASM
882 	/*
883 	 * Use the avx implementation if it's available and the implementation
884 	 * hasn't changed from its default value of fastest on module load.
885 	 */
886 	if (gcm_avx_will_work()) {
887 #ifdef HAVE_MOVBE
888 		if (zfs_movbe_available() == B_TRUE) {
889 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
890 		}
891 #endif
892 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
893 			gcm_set_avx(B_TRUE);
894 		}
895 	}
896 #endif
897 	/* Finish initialization */
898 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
899 	gcm_impl_initialized = B_TRUE;
900 }
901 
902 static const struct {
903 	char *name;
904 	uint32_t sel;
905 } gcm_impl_opts[] = {
906 		{ "cycle",	IMPL_CYCLE },
907 		{ "fastest",	IMPL_FASTEST },
908 #ifdef CAN_USE_GCM_ASM
909 		{ "avx",	IMPL_AVX },
910 #endif
911 };
912 
913 /*
914  * Function sets desired gcm implementation.
915  *
916  * If we are called before init(), user preference will be saved in
917  * user_sel_impl, and applied in later init() call. This occurs when module
918  * parameter is specified on module load. Otherwise, directly update
919  * icp_gcm_impl.
920  *
921  * @val		Name of gcm implementation to use
922  * @param	Unused.
923  */
924 int
925 gcm_impl_set(const char *val)
926 {
927 	int err = -EINVAL;
928 	char req_name[GCM_IMPL_NAME_MAX];
929 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
930 	size_t i;
931 
932 	/* sanitize input */
933 	i = strnlen(val, GCM_IMPL_NAME_MAX);
934 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
935 		return (err);
936 
937 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
938 	while (i > 0 && isspace(req_name[i-1]))
939 		i--;
940 	req_name[i] = '\0';
941 
942 	/* Check mandatory options */
943 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
944 #ifdef CAN_USE_GCM_ASM
945 		/* Ignore avx implementation if it won't work. */
946 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
947 			continue;
948 		}
949 #endif
950 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
951 			impl = gcm_impl_opts[i].sel;
952 			err = 0;
953 			break;
954 		}
955 	}
956 
957 	/* check all supported impl if init() was already called */
958 	if (err != 0 && gcm_impl_initialized) {
959 		/* check all supported implementations */
960 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
961 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
962 				impl = i;
963 				err = 0;
964 				break;
965 			}
966 		}
967 	}
968 #ifdef CAN_USE_GCM_ASM
969 	/*
970 	 * Use the avx implementation if available and the requested one is
971 	 * avx or fastest.
972 	 */
973 	if (gcm_avx_will_work() == B_TRUE &&
974 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
975 		gcm_set_avx(B_TRUE);
976 	} else {
977 		gcm_set_avx(B_FALSE);
978 	}
979 #endif
980 
981 	if (err == 0) {
982 		if (gcm_impl_initialized)
983 			atomic_swap_32(&icp_gcm_impl, impl);
984 		else
985 			atomic_swap_32(&user_sel_impl, impl);
986 	}
987 
988 	return (err);
989 }
990 
991 #if defined(_KERNEL) && defined(__linux__)
992 
993 static int
994 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
995 {
996 	return (gcm_impl_set(val));
997 }
998 
999 static int
1000 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
1001 {
1002 	int i, cnt = 0;
1003 	char *fmt;
1004 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
1005 
1006 	ASSERT(gcm_impl_initialized);
1007 
1008 	/* list mandatory options */
1009 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
1010 #ifdef CAN_USE_GCM_ASM
1011 		/* Ignore avx implementation if it won't work. */
1012 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
1013 			continue;
1014 		}
1015 #endif
1016 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
1017 		cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name);
1018 	}
1019 
1020 	/* list all supported implementations */
1021 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
1022 		fmt = (i == impl) ? "[%s] " : "%s ";
1023 		cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name);
1024 	}
1025 
1026 	return (cnt);
1027 }
1028 
1029 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
1030     NULL, 0644);
1031 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
1033 
1034 #ifdef CAN_USE_GCM_ASM
1035 #define	GCM_BLOCK_LEN 16
1036 /*
1037  * The openssl asm routines are 6x aggregated and need that many bytes
1038  * at minimum.
1039  */
1040 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1041 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
1042 /*
1043  * Ensure the chunk size is reasonable since we are allocating a
1044  * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
1045  */
1046 #define	GCM_AVX_MAX_CHUNK_SIZE \
1047 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
1048 
/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()
/*
 * Run gcm_ghash_avx() over len bytes starting at in, using the running hash
 * (ctx->gcm_ghash) and the hash table (ctx->gcm_Htable) stored in ctx.
 */
#define	GHASH_AVX(ctx, in, len) \
    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
    in, len)

/* Advance the counter block of ctx by exactly one. */
#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1056 
/*
 * Get the chunk size module parameter. The read goes through a volatile
 * lvalue so that it is performed anew at every use. The expansion is fully
 * parenthesized (like GCM_IMPL_READ) so it stays a single operand no matter
 * which operators surround it at the point of use.
 */
#define	GCM_CHUNK_SIZE_READ (*(volatile uint32_t *)&gcm_avx_chunk_size)
1059 
/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Values written through icp_gcm_avx_set_chunk_size() are rounded down to a
 * multiple of GCM_AVX_MIN_DECRYPT_BYTES and are rejected unless the rounded
 * result lies between GCM_AVX_MIN_ENCRYPT_BYTES and GCM_AVX_MAX_CHUNK_SIZE
 * (both inclusive). The default is 32 KiB rounded down the same way.
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1067 
/*
 * Routines defined outside of this file. The notes below describe how they
 * are used here; NOTE(review): their exact contracts live with their
 * implementations and should be confirmed there.
 */

/* Backs the clear_fpu_regs() macro, which is called before each kfpu_end(). */
extern void clear_fpu_regs_avx(void);
/* Callers use this to combine the block at src into the block at dst. */
extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
/* Encrypts the block pt into ct using the round keys rk and nr rounds. */
extern void aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

/* Called once per context in gcm_init_avx(), right after H was computed. */
extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
/* Updates ghash with len bytes of input; see the GHASH_AVX() wrapper. */
extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);

/*
 * Bulk routines. Callers pass (input, output, length, AES key, counter
 * block, ghash) and interpret the return value as the number of bytes that
 * were processed; a return of 0 is treated as failure.
 */
extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);

extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
1082 
1083 static inline boolean_t
1084 gcm_avx_will_work(void)
1085 {
1086 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1087 	return (kfpu_allowed() &&
1088 	    zfs_avx_available() && zfs_aes_available() &&
1089 	    zfs_pclmulqdq_available());
1090 }
1091 
1092 static inline void
1093 gcm_set_avx(boolean_t val)
1094 {
1095 	if (gcm_avx_will_work() == B_TRUE) {
1096 		atomic_swap_32(&gcm_use_avx, val);
1097 	}
1098 }
1099 
1100 static inline boolean_t
1101 gcm_toggle_avx(void)
1102 {
1103 	if (gcm_avx_will_work() == B_TRUE) {
1104 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1105 	} else {
1106 		return (B_FALSE);
1107 	}
1108 }
1109 
1110 static inline size_t
1111 gcm_simd_get_htab_size(boolean_t simd_mode)
1112 {
1113 	switch (simd_mode) {
1114 	case B_TRUE:
1115 		return (2 * 6 * 2 * sizeof (uint64_t));
1116 
1117 	default:
1118 		return (0);
1119 	}
1120 }
1121 
1122 /*
1123  * Clear sensitive data in the context.
1124  *
1125  * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and
1126  * ctx->gcm_Htable contain the hash sub key which protects authentication.
1127  *
1128  * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for
1129  * a known plaintext attack, they consists of the IV and the first and last
1130  * counter respectively. If they should be cleared is debatable.
1131  */
1132 static inline void
1133 gcm_clear_ctx(gcm_ctx_t *ctx)
1134 {
1135 	bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder));
1136 	bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
1137 	bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0));
1138 	bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp));
1139 }
1140 
1141 /* Increment the GCM counter block by n. */
1142 static inline void
1143 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1144 {
1145 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1146 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1147 
1148 	counter = htonll(counter + n);
1149 	counter &= counter_mask;
1150 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1151 }
1152 
1153 /*
1154  * Encrypt multiple blocks of data in GCM mode.
1155  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1156  * if possible. While processing a chunk the FPU is "locked".
1157  */
1158 static int
1159 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1160     size_t length, crypto_data_t *out, size_t block_size)
1161 {
1162 	size_t bleft = length;
1163 	size_t need = 0;
1164 	size_t done = 0;
1165 	uint8_t *datap = (uint8_t *)data;
1166 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1167 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1168 	uint64_t *ghash = ctx->gcm_ghash;
1169 	uint64_t *cb = ctx->gcm_cb;
1170 	uint8_t *ct_buf = NULL;
1171 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1172 	int rv = CRYPTO_SUCCESS;
1173 
1174 	ASSERT(block_size == GCM_BLOCK_LEN);
1175 	/*
1176 	 * If the last call left an incomplete block, try to fill
1177 	 * it first.
1178 	 */
1179 	if (ctx->gcm_remainder_len > 0) {
1180 		need = block_size - ctx->gcm_remainder_len;
1181 		if (length < need) {
1182 			/* Accumulate bytes here and return. */
1183 			bcopy(datap, (uint8_t *)ctx->gcm_remainder +
1184 			    ctx->gcm_remainder_len, length);
1185 
1186 			ctx->gcm_remainder_len += length;
1187 			if (ctx->gcm_copy_to == NULL) {
1188 				ctx->gcm_copy_to = datap;
1189 			}
1190 			return (CRYPTO_SUCCESS);
1191 		} else {
1192 			/* Complete incomplete block. */
1193 			bcopy(datap, (uint8_t *)ctx->gcm_remainder +
1194 			    ctx->gcm_remainder_len, need);
1195 
1196 			ctx->gcm_copy_to = NULL;
1197 		}
1198 	}
1199 
1200 	/* Allocate a buffer to encrypt to if there is enough input. */
1201 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1202 		ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag);
1203 		if (ct_buf == NULL) {
1204 			return (CRYPTO_HOST_MEMORY);
1205 		}
1206 	}
1207 
1208 	/* If we completed an incomplete block, encrypt and write it out. */
1209 	if (ctx->gcm_remainder_len > 0) {
1210 		kfpu_begin();
1211 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1212 		    (const uint32_t *)cb, (uint32_t *)tmp);
1213 
1214 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1215 		GHASH_AVX(ctx, tmp, block_size);
1216 		clear_fpu_regs();
1217 		kfpu_end();
1218 		rv = crypto_put_output_data(tmp, out, block_size);
1219 		out->cd_offset += block_size;
1220 		gcm_incr_counter_block(ctx);
1221 		ctx->gcm_processed_data_len += block_size;
1222 		bleft -= need;
1223 		datap += need;
1224 		ctx->gcm_remainder_len = 0;
1225 	}
1226 
1227 	/* Do the bulk encryption in chunk_size blocks. */
1228 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1229 		kfpu_begin();
1230 		done = aesni_gcm_encrypt(
1231 		    datap, ct_buf, chunk_size, key, cb, ghash);
1232 
1233 		clear_fpu_regs();
1234 		kfpu_end();
1235 		if (done != chunk_size) {
1236 			rv = CRYPTO_FAILED;
1237 			goto out_nofpu;
1238 		}
1239 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1240 		if (rv != CRYPTO_SUCCESS) {
1241 			goto out_nofpu;
1242 		}
1243 		out->cd_offset += chunk_size;
1244 		datap += chunk_size;
1245 		ctx->gcm_processed_data_len += chunk_size;
1246 	}
1247 	/* Check if we are already done. */
1248 	if (bleft == 0) {
1249 		goto out_nofpu;
1250 	}
1251 	/* Bulk encrypt the remaining data. */
1252 	kfpu_begin();
1253 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1254 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1255 		if (done == 0) {
1256 			rv = CRYPTO_FAILED;
1257 			goto out;
1258 		}
1259 		rv = crypto_put_output_data(ct_buf, out, done);
1260 		if (rv != CRYPTO_SUCCESS) {
1261 			goto out;
1262 		}
1263 		out->cd_offset += done;
1264 		ctx->gcm_processed_data_len += done;
1265 		datap += done;
1266 		bleft -= done;
1267 
1268 	}
1269 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1270 	while (bleft > 0) {
1271 		if (bleft < block_size) {
1272 			bcopy(datap, ctx->gcm_remainder, bleft);
1273 			ctx->gcm_remainder_len = bleft;
1274 			ctx->gcm_copy_to = datap;
1275 			goto out;
1276 		}
1277 		/* Encrypt, hash and write out. */
1278 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1279 		    (const uint32_t *)cb, (uint32_t *)tmp);
1280 
1281 		gcm_xor_avx(datap, tmp);
1282 		GHASH_AVX(ctx, tmp, block_size);
1283 		rv = crypto_put_output_data(tmp, out, block_size);
1284 		if (rv != CRYPTO_SUCCESS) {
1285 			goto out;
1286 		}
1287 		out->cd_offset += block_size;
1288 		gcm_incr_counter_block(ctx);
1289 		ctx->gcm_processed_data_len += block_size;
1290 		datap += block_size;
1291 		bleft -= block_size;
1292 	}
1293 out:
1294 	clear_fpu_regs();
1295 	kfpu_end();
1296 out_nofpu:
1297 	if (ct_buf != NULL) {
1298 		vmem_free(ct_buf, chunk_size);
1299 	}
1300 	return (rv);
1301 }
1302 
1303 /*
1304  * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
1305  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1306  */
1307 static int
1308 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1309 {
1310 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1311 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1312 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1313 	size_t rem_len = ctx->gcm_remainder_len;
1314 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1315 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1316 	int rv;
1317 
1318 	ASSERT(block_size == GCM_BLOCK_LEN);
1319 
1320 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1321 		return (CRYPTO_DATA_LEN_RANGE);
1322 	}
1323 
1324 	kfpu_begin();
1325 	/* Pad last incomplete block with zeros, encrypt and hash. */
1326 	if (rem_len > 0) {
1327 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1328 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1329 
1330 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1331 		bzero(remainder + rem_len, block_size - rem_len);
1332 		for (int i = 0; i < rem_len; i++) {
1333 			remainder[i] ^= tmp[i];
1334 		}
1335 		GHASH_AVX(ctx, remainder, block_size);
1336 		ctx->gcm_processed_data_len += rem_len;
1337 		/* No need to increment counter_block, it's the last block. */
1338 	}
1339 	/* Finish tag. */
1340 	ctx->gcm_len_a_len_c[1] =
1341 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1342 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1343 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1344 
1345 	gcm_xor_avx((uint8_t *)J0, ghash);
1346 	clear_fpu_regs();
1347 	kfpu_end();
1348 
1349 	/* Output remainder. */
1350 	if (rem_len > 0) {
1351 		rv = crypto_put_output_data(remainder, out, rem_len);
1352 		if (rv != CRYPTO_SUCCESS)
1353 			return (rv);
1354 	}
1355 	out->cd_offset += rem_len;
1356 	ctx->gcm_remainder_len = 0;
1357 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1358 	if (rv != CRYPTO_SUCCESS)
1359 		return (rv);
1360 
1361 	out->cd_offset += ctx->gcm_tag_len;
1362 	/* Clear sensitive data in the context before returning. */
1363 	gcm_clear_ctx(ctx);
1364 	return (CRYPTO_SUCCESS);
1365 }
1366 
1367 /*
1368  * Finalize decryption: We just have accumulated crypto text, so now we
1369  * decrypt it here inplace.
1370  */
1371 static int
1372 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1373 {
1374 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1375 	ASSERT3U(block_size, ==, 16);
1376 
1377 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1378 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1379 	uint8_t *datap = ctx->gcm_pt_buf;
1380 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1381 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1382 	uint64_t *ghash = ctx->gcm_ghash;
1383 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1384 	int rv = CRYPTO_SUCCESS;
1385 	size_t bleft, done;
1386 
1387 	/*
1388 	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1389 	 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1390 	 * GCM_AVX_MIN_DECRYPT_BYTES.
1391 	 */
1392 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1393 		kfpu_begin();
1394 		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1395 		    (const void *)key, ctx->gcm_cb, ghash);
1396 		clear_fpu_regs();
1397 		kfpu_end();
1398 		if (done != chunk_size) {
1399 			return (CRYPTO_FAILED);
1400 		}
1401 		datap += done;
1402 	}
1403 	/* Decrypt remainder, which is less than chunk size, in one go. */
1404 	kfpu_begin();
1405 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1406 		done = aesni_gcm_decrypt(datap, datap, bleft,
1407 		    (const void *)key, ctx->gcm_cb, ghash);
1408 		if (done == 0) {
1409 			clear_fpu_regs();
1410 			kfpu_end();
1411 			return (CRYPTO_FAILED);
1412 		}
1413 		datap += done;
1414 		bleft -= done;
1415 	}
1416 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1417 
1418 	/*
1419 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1420 	 * decrypt them block by block.
1421 	 */
1422 	while (bleft > 0) {
1423 		/* Incomplete last block. */
1424 		if (bleft < block_size) {
1425 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1426 
1427 			bzero(lastb, block_size);
1428 			bcopy(datap, lastb, bleft);
1429 			/* The GCM processing. */
1430 			GHASH_AVX(ctx, lastb, block_size);
1431 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1432 			for (size_t i = 0; i < bleft; i++) {
1433 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1434 			}
1435 			break;
1436 		}
1437 		/* The GCM processing. */
1438 		GHASH_AVX(ctx, datap, block_size);
1439 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1440 		gcm_xor_avx((uint8_t *)tmp, datap);
1441 		gcm_incr_counter_block(ctx);
1442 
1443 		datap += block_size;
1444 		bleft -= block_size;
1445 	}
1446 	if (rv != CRYPTO_SUCCESS) {
1447 		clear_fpu_regs();
1448 		kfpu_end();
1449 		return (rv);
1450 	}
1451 	/* Decryption done, finish the tag. */
1452 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1453 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1454 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1455 	    (uint32_t *)ctx->gcm_J0);
1456 
1457 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1458 
1459 	/* We are done with the FPU, restore its state. */
1460 	clear_fpu_regs();
1461 	kfpu_end();
1462 
1463 	/* Compare the input authentication tag with what we calculated. */
1464 	if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1465 		/* They don't match. */
1466 		return (CRYPTO_INVALID_MAC);
1467 	}
1468 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1469 	if (rv != CRYPTO_SUCCESS) {
1470 		return (rv);
1471 	}
1472 	out->cd_offset += pt_len;
1473 	gcm_clear_ctx(ctx);
1474 	return (CRYPTO_SUCCESS);
1475 }
1476 
1477 /*
1478  * Initialize the GCM params H, Htabtle and the counter block. Save the
1479  * initial counter block.
1480  */
1481 static int
1482 gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
1483     unsigned char *auth_data, size_t auth_data_len, size_t block_size)
1484 {
1485 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1486 	uint64_t *H = ctx->gcm_H;
1487 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1488 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1489 	uint8_t *datap = auth_data;
1490 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1491 	size_t bleft;
1492 
1493 	ASSERT(block_size == GCM_BLOCK_LEN);
1494 
1495 	/* Init H (encrypt zero block) and create the initial counter block. */
1496 	bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash));
1497 	bzero(H, sizeof (ctx->gcm_H));
1498 	kfpu_begin();
1499 	aes_encrypt_intel(keysched, aes_rounds,
1500 	    (const uint32_t *)H, (uint32_t *)H);
1501 
1502 	gcm_init_htab_avx(ctx->gcm_Htable, H);
1503 
1504 	if (iv_len == 12) {
1505 		bcopy(iv, cb, 12);
1506 		cb[12] = 0;
1507 		cb[13] = 0;
1508 		cb[14] = 0;
1509 		cb[15] = 1;
1510 		/* We need the ICB later. */
1511 		bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0));
1512 	} else {
1513 		/*
1514 		 * Most consumers use 12 byte IVs, so it's OK to use the
1515 		 * original routines for other IV sizes, just avoid nesting
1516 		 * kfpu_begin calls.
1517 		 */
1518 		clear_fpu_regs();
1519 		kfpu_end();
1520 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1521 		    aes_copy_block, aes_xor_block);
1522 		kfpu_begin();
1523 	}
1524 
1525 	/* Openssl post increments the counter, adjust for that. */
1526 	gcm_incr_counter_block(ctx);
1527 
1528 	/* Ghash AAD in chunk_size blocks. */
1529 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1530 		GHASH_AVX(ctx, datap, chunk_size);
1531 		datap += chunk_size;
1532 		clear_fpu_regs();
1533 		kfpu_end();
1534 		kfpu_begin();
1535 	}
1536 	/* Ghash the remainder and handle possible incomplete GCM block. */
1537 	if (bleft > 0) {
1538 		size_t incomp = bleft % block_size;
1539 
1540 		bleft -= incomp;
1541 		if (bleft > 0) {
1542 			GHASH_AVX(ctx, datap, bleft);
1543 			datap += bleft;
1544 		}
1545 		if (incomp > 0) {
1546 			/* Zero pad and hash incomplete last block. */
1547 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1548 
1549 			bzero(authp, block_size);
1550 			bcopy(datap, authp, incomp);
1551 			GHASH_AVX(ctx, authp, block_size);
1552 		}
1553 	}
1554 	clear_fpu_regs();
1555 	kfpu_end();
1556 	return (CRYPTO_SUCCESS);
1557 }
1558 
1559 #if defined(_KERNEL)
1560 static int
1561 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1562 {
1563 	unsigned long val;
1564 	char val_rounded[16];
1565 	int error = 0;
1566 
1567 	error = kstrtoul(buf, 0, &val);
1568 	if (error)
1569 		return (error);
1570 
1571 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1572 
1573 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1574 		return (-EINVAL);
1575 
1576 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1577 	error = param_set_uint(val_rounded, kp);
1578 	return (error);
1579 }
1580 
/*
 * Expose gcm_avx_chunk_size as the module parameter
 * "icp_gcm_avx_chunk_size". Writes are validated and rounded by
 * icp_gcm_avx_set_chunk_size(), reads use the stock param_get_uint().
 * Mode 0644: writable by the owner, readable by everyone.
 */
module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
	"How many bytes to process while owning the FPU");
1586 
#endif /* defined(_KERNEL) */
1588 #endif /* ifdef CAN_USE_GCM_ASM */
1589