xref: /freebsd/sys/contrib/openzfs/module/icp/algs/modes/gcm.c (revision 089104e0e01f080c9cd45dc5f34c4f636dea4ca7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <sys/cmn_err.h>
27 #include <modes/modes.h>
28 #include <sys/crypto/common.h>
29 #include <sys/crypto/icp.h>
30 #include <sys/crypto/impl.h>
31 #include <sys/byteorder.h>
32 #include <sys/simd.h>
33 #include <modes/gcm_impl.h>
34 #ifdef CAN_USE_GCM_ASM
35 #include <aes/aes_impl.h>
36 #endif
37 
/*
 * GHASH step: absorb one block into the running hash of a GCM context.
 *
 *	c - GCM context providing gcm_ghash and gcm_H
 *	d - block to absorb
 *	t - third argument handed to mul(); the callers in this file pass
 *	    the buffer that is to receive the result
 *	o - gcm_impl_ops_t supplying the mul() routine
 *
 * The expansion first calls xor_block(d, gcm_ghash) and then
 * (o)->mul(gcm_ghash, gcm_H, t).  `xor_block` is not a macro argument: the
 * name must be in scope where the macro is used (the callers in this file
 * receive it as a function parameter).
 *
 * The body is wrapped in do { } while (0) so the two calls always form a
 * single statement, e.g. when the macro is the body of an unbraced if.
 * Invoke it with a trailing semicolon.
 */
#define	GHASH(c, d, t, o) \
	do { \
		xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
		(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
		    (uint64_t *)(void *)(t)); \
	} while (0)
42 
/*
 * Select GCM implementation.
 *
 * The values at the top of the uint32_t range name the pseudo
 * implementations below.  Any other value is treated by gcm_impl_get_ops()
 * as an index into gcm_supp_impl[].
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#endif
/* Read a selector through a volatile-qualified lvalue. */
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
/* Selector currently in effect; consulted by gcm_impl_get_ops(). */
static uint32_t icp_gcm_impl = IMPL_FASTEST;
/*
 * Selector requested through gcm_impl_set() before gcm_impl_init() has run;
 * gcm_impl_init() copies it into icp_gcm_impl.
 */
static uint32_t user_sel_impl = IMPL_FASTEST;
52 
/*
 * Forward declaration of the common context initializer; it is defined
 * after gcm_init_ctx() and gmac_init_ctx(), which both call it.  The leading
 * boolean_t selects GMAC (B_TRUE) or GCM (B_FALSE) initialization.
 */
static inline int gcm_init_ctx_impl(boolean_t, gcm_ctx_t *, char *, size_t,
    int (*)(const void *, const uint8_t *, uint8_t *),
    void (*)(uint8_t *, uint8_t *),
    void (*)(uint8_t *, uint8_t *));
57 
#ifdef CAN_USE_GCM_ASM
/*
 * Does the architecture we run on support the MOVBE instruction?
 * Set by gcm_impl_init() and toggled by gcm_init_ctx_impl() while the
 * "cycle" implementation is selected.
 */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 * gcm_impl_init() and gcm_impl_set() request it through gcm_set_avx() when
 * gcm_avx_will_work() and the selected implementation is "avx" or
 * "fastest".
 */
static boolean_t gcm_use_avx = B_FALSE;
/* Read gcm_use_avx through a volatile-qualified lvalue. */
#define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)

/*
 * Implemented outside this file.  NOTE(review): presumably flips the flag
 * atomically and returns the new value -- confirm against its definition.
 */
extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

/* Helpers for the avx code path; being static, they are defined below. */
static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
static inline size_t gcm_simd_get_htab_size(boolean_t);

/* avx counterpart of gcm_mode_encrypt_contiguous_blocks(). */
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

/* avx counterparts of gcm_encrypt_final(), gcm_decrypt_final(), gcm_init(). */
static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */
83 
84 /*
85  * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
86  * is done in another function.
87  */
88 int
89 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
90     crypto_data_t *out, size_t block_size,
91     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
92     void (*copy_block)(uint8_t *, uint8_t *),
93     void (*xor_block)(uint8_t *, uint8_t *))
94 {
95 #ifdef CAN_USE_GCM_ASM
96 	if (ctx->gcm_use_avx == B_TRUE)
97 		return (gcm_mode_encrypt_contiguous_blocks_avx(
98 		    ctx, data, length, out, block_size));
99 #endif
100 
101 	const gcm_impl_ops_t *gops;
102 	size_t remainder = length;
103 	size_t need = 0;
104 	uint8_t *datap = (uint8_t *)data;
105 	uint8_t *blockp;
106 	uint8_t *lastp;
107 	void *iov_or_mp;
108 	offset_t offset;
109 	uint8_t *out_data_1;
110 	uint8_t *out_data_2;
111 	size_t out_data_1_len;
112 	uint64_t counter;
113 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
114 
115 	if (length + ctx->gcm_remainder_len < block_size) {
116 		/* accumulate bytes here and return */
117 		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
118 		    datap,
119 		    length);
120 		ctx->gcm_remainder_len += length;
121 		if (ctx->gcm_copy_to == NULL) {
122 			ctx->gcm_copy_to = datap;
123 		}
124 		return (CRYPTO_SUCCESS);
125 	}
126 
127 	crypto_init_ptrs(out, &iov_or_mp, &offset);
128 
129 	gops = gcm_impl_get_ops();
130 	do {
131 		/* Unprocessed data from last call. */
132 		if (ctx->gcm_remainder_len > 0) {
133 			need = block_size - ctx->gcm_remainder_len;
134 
135 			if (need > remainder)
136 				return (CRYPTO_DATA_LEN_RANGE);
137 
138 			memcpy(&((uint8_t *)ctx->gcm_remainder)
139 			    [ctx->gcm_remainder_len], datap, need);
140 
141 			blockp = (uint8_t *)ctx->gcm_remainder;
142 		} else {
143 			blockp = datap;
144 		}
145 
146 		/*
147 		 * Increment counter. Counter bits are confined
148 		 * to the bottom 32 bits of the counter block.
149 		 */
150 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
151 		counter = htonll(counter + 1);
152 		counter &= counter_mask;
153 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
154 
155 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
156 		    (uint8_t *)ctx->gcm_tmp);
157 		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
158 
159 		lastp = (uint8_t *)ctx->gcm_tmp;
160 
161 		ctx->gcm_processed_data_len += block_size;
162 
163 		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
164 		    &out_data_1_len, &out_data_2, block_size);
165 
166 		/* copy block to where it belongs */
167 		if (out_data_1_len == block_size) {
168 			copy_block(lastp, out_data_1);
169 		} else {
170 			memcpy(out_data_1, lastp, out_data_1_len);
171 			if (out_data_2 != NULL) {
172 				memcpy(out_data_2,
173 				    lastp + out_data_1_len,
174 				    block_size - out_data_1_len);
175 			}
176 		}
177 		/* update offset */
178 		out->cd_offset += block_size;
179 
180 		/* add ciphertext to the hash */
181 		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
182 
183 		/* Update pointer to next block of data to be processed. */
184 		if (ctx->gcm_remainder_len != 0) {
185 			datap += need;
186 			ctx->gcm_remainder_len = 0;
187 		} else {
188 			datap += block_size;
189 		}
190 
191 		remainder = (size_t)&data[length] - (size_t)datap;
192 
193 		/* Incomplete last block. */
194 		if (remainder > 0 && remainder < block_size) {
195 			memcpy(ctx->gcm_remainder, datap, remainder);
196 			ctx->gcm_remainder_len = remainder;
197 			ctx->gcm_copy_to = datap;
198 			goto out;
199 		}
200 		ctx->gcm_copy_to = NULL;
201 
202 	} while (remainder > 0);
203 out:
204 	return (CRYPTO_SUCCESS);
205 }
206 
207 int
208 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
209     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
210     void (*copy_block)(uint8_t *, uint8_t *),
211     void (*xor_block)(uint8_t *, uint8_t *))
212 {
213 	(void) copy_block;
214 #ifdef CAN_USE_GCM_ASM
215 	if (ctx->gcm_use_avx == B_TRUE)
216 		return (gcm_encrypt_final_avx(ctx, out, block_size));
217 #endif
218 
219 	const gcm_impl_ops_t *gops;
220 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
221 	uint8_t *ghash, *macp = NULL;
222 	int i, rv;
223 
224 	if (out->cd_length <
225 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
226 		return (CRYPTO_DATA_LEN_RANGE);
227 	}
228 
229 	gops = gcm_impl_get_ops();
230 	ghash = (uint8_t *)ctx->gcm_ghash;
231 
232 	if (ctx->gcm_remainder_len > 0) {
233 		uint64_t counter;
234 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
235 
236 		/*
237 		 * Here is where we deal with data that is not a
238 		 * multiple of the block size.
239 		 */
240 
241 		/*
242 		 * Increment counter.
243 		 */
244 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
245 		counter = htonll(counter + 1);
246 		counter &= counter_mask;
247 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
248 
249 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
250 		    (uint8_t *)ctx->gcm_tmp);
251 
252 		macp = (uint8_t *)ctx->gcm_remainder;
253 		memset(macp + ctx->gcm_remainder_len, 0,
254 		    block_size - ctx->gcm_remainder_len);
255 
256 		/* XOR with counter block */
257 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
258 			macp[i] ^= tmpp[i];
259 		}
260 
261 		/* add ciphertext to the hash */
262 		GHASH(ctx, macp, ghash, gops);
263 
264 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
265 	}
266 
267 	ctx->gcm_len_a_len_c[1] =
268 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
269 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
270 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
271 	    (uint8_t *)ctx->gcm_J0);
272 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
273 
274 	if (ctx->gcm_remainder_len > 0) {
275 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
276 		if (rv != CRYPTO_SUCCESS)
277 			return (rv);
278 	}
279 	out->cd_offset += ctx->gcm_remainder_len;
280 	ctx->gcm_remainder_len = 0;
281 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
282 	if (rv != CRYPTO_SUCCESS)
283 		return (rv);
284 	out->cd_offset += ctx->gcm_tag_len;
285 
286 	return (CRYPTO_SUCCESS);
287 }
288 
289 /*
290  * This will only deal with decrypting the last block of the input that
291  * might not be a multiple of block length.
292  */
293 static void
294 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
295     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
296     void (*xor_block)(uint8_t *, uint8_t *))
297 {
298 	uint8_t *datap, *outp, *counterp;
299 	uint64_t counter;
300 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
301 	int i;
302 
303 	/*
304 	 * Increment counter.
305 	 * Counter bits are confined to the bottom 32 bits
306 	 */
307 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
308 	counter = htonll(counter + 1);
309 	counter &= counter_mask;
310 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
311 
312 	datap = (uint8_t *)ctx->gcm_remainder;
313 	outp = &((ctx->gcm_pt_buf)[index]);
314 	counterp = (uint8_t *)ctx->gcm_tmp;
315 
316 	/* authentication tag */
317 	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
318 	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
319 
320 	/* add ciphertext to the hash */
321 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
322 
323 	/* decrypt remaining ciphertext */
324 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
325 
326 	/* XOR with counter block */
327 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
328 		outp[i] = datap[i] ^ counterp[i];
329 	}
330 }
331 
332 int
333 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
334     crypto_data_t *out, size_t block_size,
335     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
336     void (*copy_block)(uint8_t *, uint8_t *),
337     void (*xor_block)(uint8_t *, uint8_t *))
338 {
339 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
340 	    (void) xor_block;
341 	size_t new_len;
342 	uint8_t *new;
343 
344 	/*
345 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
346 	 * Ciphertext will be decrypted in the final.
347 	 */
348 	if (length > 0) {
349 		new_len = ctx->gcm_pt_buf_len + length;
350 		new = vmem_alloc(new_len, KM_SLEEP);
351 		if (new == NULL) {
352 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
353 			ctx->gcm_pt_buf = NULL;
354 			return (CRYPTO_HOST_MEMORY);
355 		}
356 
357 		if (ctx->gcm_pt_buf != NULL) {
358 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
359 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
360 		} else {
361 			ASSERT0(ctx->gcm_pt_buf_len);
362 		}
363 
364 		ctx->gcm_pt_buf = new;
365 		ctx->gcm_pt_buf_len = new_len;
366 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
367 		    length);
368 		ctx->gcm_processed_data_len += length;
369 	}
370 
371 	ctx->gcm_remainder_len = 0;
372 	return (CRYPTO_SUCCESS);
373 }
374 
375 int
376 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
377     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
378     void (*xor_block)(uint8_t *, uint8_t *))
379 {
380 #ifdef CAN_USE_GCM_ASM
381 	if (ctx->gcm_use_avx == B_TRUE)
382 		return (gcm_decrypt_final_avx(ctx, out, block_size));
383 #endif
384 
385 	const gcm_impl_ops_t *gops;
386 	size_t pt_len;
387 	size_t remainder;
388 	uint8_t *ghash;
389 	uint8_t *blockp;
390 	uint8_t *cbp;
391 	uint64_t counter;
392 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
393 	int processed = 0, rv;
394 
395 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
396 
397 	gops = gcm_impl_get_ops();
398 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
399 	ghash = (uint8_t *)ctx->gcm_ghash;
400 	blockp = ctx->gcm_pt_buf;
401 	remainder = pt_len;
402 	while (remainder > 0) {
403 		/* Incomplete last block */
404 		if (remainder < block_size) {
405 			memcpy(ctx->gcm_remainder, blockp, remainder);
406 			ctx->gcm_remainder_len = remainder;
407 			/*
408 			 * not expecting anymore ciphertext, just
409 			 * compute plaintext for the remaining input
410 			 */
411 			gcm_decrypt_incomplete_block(ctx, block_size,
412 			    processed, encrypt_block, xor_block);
413 			ctx->gcm_remainder_len = 0;
414 			goto out;
415 		}
416 		/* add ciphertext to the hash */
417 		GHASH(ctx, blockp, ghash, gops);
418 
419 		/*
420 		 * Increment counter.
421 		 * Counter bits are confined to the bottom 32 bits
422 		 */
423 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
424 		counter = htonll(counter + 1);
425 		counter &= counter_mask;
426 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
427 
428 		cbp = (uint8_t *)ctx->gcm_tmp;
429 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
430 
431 		/* XOR with ciphertext */
432 		xor_block(cbp, blockp);
433 
434 		processed += block_size;
435 		blockp += block_size;
436 		remainder -= block_size;
437 	}
438 out:
439 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
440 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
441 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
442 	    (uint8_t *)ctx->gcm_J0);
443 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
444 
445 	/* compare the input authentication tag with what we calculated */
446 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
447 		/* They don't match */
448 		return (CRYPTO_INVALID_MAC);
449 	} else {
450 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
451 		if (rv != CRYPTO_SUCCESS)
452 			return (rv);
453 		out->cd_offset += pt_len;
454 	}
455 	return (CRYPTO_SUCCESS);
456 }
457 
458 static int
459 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
460 {
461 	size_t tag_len;
462 
463 	/*
464 	 * Check the length of the authentication tag (in bits).
465 	 */
466 	tag_len = gcm_param->ulTagBits;
467 	switch (tag_len) {
468 	case 32:
469 	case 64:
470 	case 96:
471 	case 104:
472 	case 112:
473 	case 120:
474 	case 128:
475 		break;
476 	default:
477 		return (CRYPTO_MECHANISM_PARAM_INVALID);
478 	}
479 
480 	if (gcm_param->ulIvLen == 0)
481 		return (CRYPTO_MECHANISM_PARAM_INVALID);
482 
483 	return (CRYPTO_SUCCESS);
484 }
485 
486 static void
487 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
488     gcm_ctx_t *ctx, size_t block_size,
489     void (*copy_block)(uint8_t *, uint8_t *),
490     void (*xor_block)(uint8_t *, uint8_t *))
491 {
492 	const gcm_impl_ops_t *gops;
493 	uint8_t *cb;
494 	ulong_t remainder = iv_len;
495 	ulong_t processed = 0;
496 	uint8_t *datap, *ghash;
497 	uint64_t len_a_len_c[2];
498 
499 	gops = gcm_impl_get_ops();
500 	ghash = (uint8_t *)ctx->gcm_ghash;
501 	cb = (uint8_t *)ctx->gcm_cb;
502 	if (iv_len == 12) {
503 		memcpy(cb, iv, 12);
504 		cb[12] = 0;
505 		cb[13] = 0;
506 		cb[14] = 0;
507 		cb[15] = 1;
508 		/* J0 will be used again in the final */
509 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
510 	} else {
511 		/* GHASH the IV */
512 		do {
513 			if (remainder < block_size) {
514 				memset(cb, 0, block_size);
515 				memcpy(cb, &(iv[processed]), remainder);
516 				datap = (uint8_t *)cb;
517 				remainder = 0;
518 			} else {
519 				datap = (uint8_t *)(&(iv[processed]));
520 				processed += block_size;
521 				remainder -= block_size;
522 			}
523 			GHASH(ctx, datap, ghash, gops);
524 		} while (remainder > 0);
525 
526 		len_a_len_c[0] = 0;
527 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
528 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
529 
530 		/* J0 will be used again in the final */
531 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
532 	}
533 }
534 
535 static int
536 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
537     const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
538     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
539     void (*copy_block)(uint8_t *, uint8_t *),
540     void (*xor_block)(uint8_t *, uint8_t *))
541 {
542 	const gcm_impl_ops_t *gops;
543 	uint8_t *ghash, *datap, *authp;
544 	size_t remainder, processed;
545 
546 	/* encrypt zero block to get subkey H */
547 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
548 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
549 	    (uint8_t *)ctx->gcm_H);
550 
551 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
552 	    copy_block, xor_block);
553 
554 	gops = gcm_impl_get_ops();
555 	authp = (uint8_t *)ctx->gcm_tmp;
556 	ghash = (uint8_t *)ctx->gcm_ghash;
557 	memset(authp, 0, block_size);
558 	memset(ghash, 0, block_size);
559 
560 	processed = 0;
561 	remainder = auth_data_len;
562 	do {
563 		if (remainder < block_size) {
564 			/*
565 			 * There's not a block full of data, pad rest of
566 			 * buffer with zero
567 			 */
568 
569 			if (auth_data != NULL) {
570 				memset(authp, 0, block_size);
571 				memcpy(authp, &(auth_data[processed]),
572 				    remainder);
573 			} else {
574 				ASSERT0(remainder);
575 			}
576 
577 			datap = (uint8_t *)authp;
578 			remainder = 0;
579 		} else {
580 			datap = (uint8_t *)(&(auth_data[processed]));
581 			processed += block_size;
582 			remainder -= block_size;
583 		}
584 
585 		/* add auth data to the hash */
586 		GHASH(ctx, datap, ghash, gops);
587 
588 	} while (remainder > 0);
589 
590 	return (CRYPTO_SUCCESS);
591 }
592 
593 /*
594  * The following function is called at encrypt or decrypt init time
595  * for AES GCM mode.
596  */
597 int
598 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
599     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
600     void (*copy_block)(uint8_t *, uint8_t *),
601     void (*xor_block)(uint8_t *, uint8_t *))
602 {
603 	return (gcm_init_ctx_impl(B_FALSE, gcm_ctx, param, block_size,
604 	    encrypt_block, copy_block, xor_block));
605 }
606 
607 /*
608  * The following function is called at encrypt or decrypt init time
609  * for AES GMAC mode.
610  */
611 int
612 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
613     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
614     void (*copy_block)(uint8_t *, uint8_t *),
615     void (*xor_block)(uint8_t *, uint8_t *))
616 {
617 	return (gcm_init_ctx_impl(B_TRUE, gcm_ctx, param, block_size,
618 	    encrypt_block, copy_block, xor_block));
619 }
620 
621 /*
622  * Init the GCM context struct. Handle the cycle and avx implementations here.
623  * Initialization of a GMAC context differs slightly from a GCM context.
624  */
625 static inline int
626 gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param,
627     size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
628     uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
629     void (*xor_block)(uint8_t *, uint8_t *))
630 {
631 	CK_AES_GCM_PARAMS *gcm_param;
632 	int rv = CRYPTO_SUCCESS;
633 	size_t tag_len, iv_len;
634 
635 	if (param != NULL) {
636 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
637 
638 		if (gmac_mode == B_FALSE) {
639 			/* GCM mode. */
640 			if ((rv = gcm_validate_args(gcm_param)) != 0) {
641 				return (rv);
642 			}
643 			gcm_ctx->gcm_flags |= GCM_MODE;
644 
645 			size_t tbits = gcm_param->ulTagBits;
646 			tag_len = CRYPTO_BITS2BYTES(tbits);
647 			iv_len = gcm_param->ulIvLen;
648 		} else {
649 			/* GMAC mode. */
650 			gcm_ctx->gcm_flags |= GMAC_MODE;
651 			tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
652 			iv_len = AES_GMAC_IV_LEN;
653 		}
654 		gcm_ctx->gcm_tag_len = tag_len;
655 		gcm_ctx->gcm_processed_data_len = 0;
656 
657 		/* these values are in bits */
658 		gcm_ctx->gcm_len_a_len_c[0]
659 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
660 	} else {
661 		return (CRYPTO_MECHANISM_PARAM_INVALID);
662 	}
663 
664 	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
665 	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
666 	size_t aad_len = gcm_param->ulAADLen;
667 
668 #ifdef CAN_USE_GCM_ASM
669 	boolean_t needs_bswap =
670 	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
671 
672 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
673 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
674 	} else {
675 		/*
676 		 * Handle the "cycle" implementation by creating avx and
677 		 * non-avx contexts alternately.
678 		 */
679 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
680 
681 		/* The avx impl. doesn't handle byte swapped key schedules. */
682 		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
683 			gcm_ctx->gcm_use_avx = B_FALSE;
684 		}
685 		/*
686 		 * If this is a GCM context, use the MOVBE and the BSWAP
687 		 * variants alternately. GMAC contexts code paths do not
688 		 * use the MOVBE instruction.
689 		 */
690 		if (gcm_ctx->gcm_use_avx == B_TRUE && gmac_mode == B_FALSE &&
691 		    zfs_movbe_available() == B_TRUE) {
692 			(void) atomic_toggle_boolean_nv(
693 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
694 		}
695 	}
696 	/*
697 	 * We don't handle byte swapped key schedules in the avx code path,
698 	 * still they could be created by the aes generic implementation.
699 	 * Make sure not to use them since we'll corrupt data if we do.
700 	 */
701 	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
702 		gcm_ctx->gcm_use_avx = B_FALSE;
703 
704 		cmn_err_once(CE_WARN,
705 		    "ICP: Can't use the aes generic or cycle implementations "
706 		    "in combination with the gcm avx implementation!");
707 		cmn_err_once(CE_WARN,
708 		    "ICP: Falling back to a compatible implementation, "
709 		    "aes-gcm performance will likely be degraded.");
710 		cmn_err_once(CE_WARN,
711 		    "ICP: Choose at least the x86_64 aes implementation to "
712 		    "restore performance.");
713 	}
714 
715 	/* Allocate Htab memory as needed. */
716 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
717 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
718 
719 		if (htab_len == 0) {
720 			return (CRYPTO_MECHANISM_PARAM_INVALID);
721 		}
722 		gcm_ctx->gcm_htab_len = htab_len;
723 		gcm_ctx->gcm_Htable =
724 		    kmem_alloc(htab_len, KM_SLEEP);
725 
726 		if (gcm_ctx->gcm_Htable == NULL) {
727 			return (CRYPTO_HOST_MEMORY);
728 		}
729 	}
730 	/* Avx and non avx context initialization differs from here on. */
731 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
732 #endif /* ifdef CAN_USE_GCM_ASM */
733 		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
734 		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
735 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
736 		}
737 #ifdef CAN_USE_GCM_ASM
738 	} else {
739 		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
740 		    block_size) != CRYPTO_SUCCESS) {
741 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
742 		}
743 	}
744 #endif /* ifdef CAN_USE_GCM_ASM */
745 
746 	return (rv);
747 }
748 
749 void *
750 gcm_alloc_ctx(int kmflag)
751 {
752 	gcm_ctx_t *gcm_ctx;
753 
754 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
755 		return (NULL);
756 
757 	gcm_ctx->gcm_flags = GCM_MODE;
758 	return (gcm_ctx);
759 }
760 
761 void *
762 gmac_alloc_ctx(int kmflag)
763 {
764 	gcm_ctx_t *gcm_ctx;
765 
766 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
767 		return (NULL);
768 
769 	gcm_ctx->gcm_flags = GMAC_MODE;
770 	return (gcm_ctx);
771 }
772 
/*
 * GCM implementation that contains the fastest methods.  Only the name is
 * set here; gcm_impl_init() copies the chosen implementation over it.
 */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/*
 * Set to B_TRUE at the end of gcm_impl_init().  No benchmark is run there;
 * the flag only records that initialization has finished.
 */
static boolean_t gcm_impl_initialized = B_FALSE;

/*
 * Hold all supported implementations: the entries of gcm_all_impl[] whose
 * is_supported() returned true, filled in by gcm_impl_init().  Slots at or
 * beyond gcm_supp_impl_cnt are never written and stay NULL.
 */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
792 
793 /*
794  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
795  * SIMD implementation is not allowed in the current context, then
796  * fallback to the fastest generic implementation.
797  */
798 const gcm_impl_ops_t *
799 gcm_impl_get_ops(void)
800 {
801 	if (!kfpu_allowed())
802 		return (&gcm_generic_impl);
803 
804 	const gcm_impl_ops_t *ops = NULL;
805 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
806 
807 	switch (impl) {
808 	case IMPL_FASTEST:
809 		ASSERT(gcm_impl_initialized);
810 		ops = &gcm_fastest_impl;
811 		break;
812 	case IMPL_CYCLE:
813 		/* Cycle through supported implementations */
814 		ASSERT(gcm_impl_initialized);
815 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
816 		static size_t cycle_impl_idx = 0;
817 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
818 		ops = gcm_supp_impl[idx];
819 		break;
820 #ifdef CAN_USE_GCM_ASM
821 	case IMPL_AVX:
822 		/*
823 		 * Make sure that we return a valid implementation while
824 		 * switching to the avx implementation since there still
825 		 * may be unfinished non-avx contexts around.
826 		 */
827 		ops = &gcm_generic_impl;
828 		break;
829 #endif
830 	default:
831 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
832 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
833 		if (impl < ARRAY_SIZE(gcm_all_impl))
834 			ops = gcm_supp_impl[impl];
835 		break;
836 	}
837 
838 	ASSERT3P(ops, !=, NULL);
839 
840 	return (ops);
841 }
842 
843 /*
844  * Initialize all supported implementations.
845  */
846 void
847 gcm_impl_init(void)
848 {
849 	gcm_impl_ops_t *curr_impl;
850 	int i, c;
851 
852 	/* Move supported implementations into gcm_supp_impls */
853 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
854 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
855 
856 		if (curr_impl->is_supported())
857 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
858 	}
859 	gcm_supp_impl_cnt = c;
860 
861 	/*
862 	 * Set the fastest implementation given the assumption that the
863 	 * hardware accelerated version is the fastest.
864 	 */
865 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
866 	if (gcm_pclmulqdq_impl.is_supported()) {
867 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
868 		    sizeof (gcm_fastest_impl));
869 	} else
870 #endif
871 	{
872 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
873 		    sizeof (gcm_fastest_impl));
874 	}
875 
876 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
877 
878 #ifdef CAN_USE_GCM_ASM
879 	/*
880 	 * Use the avx implementation if it's available and the implementation
881 	 * hasn't changed from its default value of fastest on module load.
882 	 */
883 	if (gcm_avx_will_work()) {
884 #ifdef HAVE_MOVBE
885 		if (zfs_movbe_available() == B_TRUE) {
886 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
887 		}
888 #endif
889 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
890 			gcm_set_avx(B_TRUE);
891 		}
892 	}
893 #endif
894 	/* Finish initialization */
895 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
896 	gcm_impl_initialized = B_TRUE;
897 }
898 
/*
 * Implementation names that gcm_impl_set() always checks first, mapped to
 * their selector values.  The "avx" entry is skipped at run time, both when
 * setting and when listing, if gcm_avx_will_work() reports false.
 */
static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
#endif
};
909 
910 /*
911  * Function sets desired gcm implementation.
912  *
913  * If we are called before init(), user preference will be saved in
914  * user_sel_impl, and applied in later init() call. This occurs when module
915  * parameter is specified on module load. Otherwise, directly update
916  * icp_gcm_impl.
917  *
918  * @val		Name of gcm implementation to use
919  * @param	Unused.
920  */
921 int
922 gcm_impl_set(const char *val)
923 {
924 	int err = -EINVAL;
925 	char req_name[GCM_IMPL_NAME_MAX];
926 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
927 	size_t i;
928 
929 	/* sanitize input */
930 	i = strnlen(val, GCM_IMPL_NAME_MAX);
931 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
932 		return (err);
933 
934 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
935 	while (i > 0 && isspace(req_name[i-1]))
936 		i--;
937 	req_name[i] = '\0';
938 
939 	/* Check mandatory options */
940 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
941 #ifdef CAN_USE_GCM_ASM
942 		/* Ignore avx implementation if it won't work. */
943 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
944 			continue;
945 		}
946 #endif
947 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
948 			impl = gcm_impl_opts[i].sel;
949 			err = 0;
950 			break;
951 		}
952 	}
953 
954 	/* check all supported impl if init() was already called */
955 	if (err != 0 && gcm_impl_initialized) {
956 		/* check all supported implementations */
957 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
958 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
959 				impl = i;
960 				err = 0;
961 				break;
962 			}
963 		}
964 	}
965 #ifdef CAN_USE_GCM_ASM
966 	/*
967 	 * Use the avx implementation if available and the requested one is
968 	 * avx or fastest.
969 	 */
970 	if (gcm_avx_will_work() == B_TRUE &&
971 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
972 		gcm_set_avx(B_TRUE);
973 	} else {
974 		gcm_set_avx(B_FALSE);
975 	}
976 #endif
977 
978 	if (err == 0) {
979 		if (gcm_impl_initialized)
980 			atomic_swap_32(&icp_gcm_impl, impl);
981 		else
982 			atomic_swap_32(&user_sel_impl, impl);
983 	}
984 
985 	return (err);
986 }
987 
988 #if defined(_KERNEL) && defined(__linux__)
989 
990 static int
991 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
992 {
993 	return (gcm_impl_set(val));
994 }
995 
996 static int
997 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
998 {
999 	int i, cnt = 0;
1000 	char *fmt;
1001 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
1002 
1003 	ASSERT(gcm_impl_initialized);
1004 
1005 	/* list mandatory options */
1006 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
1007 #ifdef CAN_USE_GCM_ASM
1008 		/* Ignore avx implementation if it won't work. */
1009 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
1010 			continue;
1011 		}
1012 #endif
1013 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
1014 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1015 		    gcm_impl_opts[i].name);
1016 	}
1017 
1018 	/* list all supported implementations */
1019 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
1020 		fmt = (i == impl) ? "[%s] " : "%s ";
1021 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1022 		    gcm_supp_impl[i]->name);
1023 	}
1024 
1025 	return (cnt);
1026 }
1027 
/*
 * Register icp_gcm_impl as a module parameter (mode 0644) with
 * icp_gcm_impl_set() and icp_gcm_impl_get() as its handlers.
 */
module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
1032 
1033 #ifdef CAN_USE_GCM_ASM
1034 #define	GCM_BLOCK_LEN 16
1035 /*
1036  * The openssl asm routines are 6x aggregated and need that many bytes
1037  * at minimum.
1038  */
1039 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1040 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
1041 /*
1042  * Ensure the chunk size is reasonable since we are allocating a
1043  * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
1044  */
1045 #define	GCM_AVX_MAX_CHUNK_SIZE \
1046 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
1047 
1048 /* Clear the FPU registers since they hold sensitive internal state. */
1049 #define	clear_fpu_regs() clear_fpu_regs_avx()
1050 #define	GHASH_AVX(ctx, in, len) \
1051     gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
1052     in, len)
1053 
1054 #define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1055 
1056 /* Get the chunk size module parameter. */
1057 #define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1058 
1059 /*
1060  * Module parameter: number of bytes to process at once while owning the FPU.
1061  * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is
1062  * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES.
1063  */
1064 static uint32_t gcm_avx_chunk_size =
1065 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1066 
1067 extern void ASMABI clear_fpu_regs_avx(void);
1068 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1069 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1070     const uint32_t pt[4], uint32_t ct[4]);
1071 
1072 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1073 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1074     const uint8_t *in, size_t len);
1075 
1076 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1077     const void *, uint64_t *, uint64_t *);
1078 
1079 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1080     const void *, uint64_t *, uint64_t *);
1081 
1082 static inline boolean_t
1083 gcm_avx_will_work(void)
1084 {
1085 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1086 	return (kfpu_allowed() &&
1087 	    zfs_avx_available() && zfs_aes_available() &&
1088 	    zfs_pclmulqdq_available());
1089 }
1090 
1091 static inline void
1092 gcm_set_avx(boolean_t val)
1093 {
1094 	if (gcm_avx_will_work() == B_TRUE) {
1095 		atomic_swap_32(&gcm_use_avx, val);
1096 	}
1097 }
1098 
1099 static inline boolean_t
1100 gcm_toggle_avx(void)
1101 {
1102 	if (gcm_avx_will_work() == B_TRUE) {
1103 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1104 	} else {
1105 		return (B_FALSE);
1106 	}
1107 }
1108 
1109 static inline size_t
1110 gcm_simd_get_htab_size(boolean_t simd_mode)
1111 {
1112 	switch (simd_mode) {
1113 	case B_TRUE:
1114 		return (2 * 6 * 2 * sizeof (uint64_t));
1115 
1116 	default:
1117 		return (0);
1118 	}
1119 }
1120 
1121 
1122 /* Increment the GCM counter block by n. */
1123 static inline void
1124 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1125 {
1126 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1127 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1128 
1129 	counter = htonll(counter + n);
1130 	counter &= counter_mask;
1131 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1132 }
1133 
1134 /*
1135  * Encrypt multiple blocks of data in GCM mode.
1136  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1137  * if possible. While processing a chunk the FPU is "locked".
1138  */
1139 static int
1140 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1141     size_t length, crypto_data_t *out, size_t block_size)
1142 {
1143 	size_t bleft = length;
1144 	size_t need = 0;
1145 	size_t done = 0;
1146 	uint8_t *datap = (uint8_t *)data;
1147 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1148 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1149 	uint64_t *ghash = ctx->gcm_ghash;
1150 	uint64_t *cb = ctx->gcm_cb;
1151 	uint8_t *ct_buf = NULL;
1152 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1153 	int rv = CRYPTO_SUCCESS;
1154 
1155 	ASSERT(block_size == GCM_BLOCK_LEN);
1156 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1157 	    B_FALSE);
1158 	/*
1159 	 * If the last call left an incomplete block, try to fill
1160 	 * it first.
1161 	 */
1162 	if (ctx->gcm_remainder_len > 0) {
1163 		need = block_size - ctx->gcm_remainder_len;
1164 		if (length < need) {
1165 			/* Accumulate bytes here and return. */
1166 			memcpy((uint8_t *)ctx->gcm_remainder +
1167 			    ctx->gcm_remainder_len, datap, length);
1168 
1169 			ctx->gcm_remainder_len += length;
1170 			if (ctx->gcm_copy_to == NULL) {
1171 				ctx->gcm_copy_to = datap;
1172 			}
1173 			return (CRYPTO_SUCCESS);
1174 		} else {
1175 			/* Complete incomplete block. */
1176 			memcpy((uint8_t *)ctx->gcm_remainder +
1177 			    ctx->gcm_remainder_len, datap, need);
1178 
1179 			ctx->gcm_copy_to = NULL;
1180 		}
1181 	}
1182 
1183 	/* Allocate a buffer to encrypt to if there is enough input. */
1184 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1185 		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1186 		if (ct_buf == NULL) {
1187 			return (CRYPTO_HOST_MEMORY);
1188 		}
1189 	}
1190 
1191 	/* If we completed an incomplete block, encrypt and write it out. */
1192 	if (ctx->gcm_remainder_len > 0) {
1193 		kfpu_begin();
1194 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1195 		    (const uint32_t *)cb, (uint32_t *)tmp);
1196 
1197 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1198 		GHASH_AVX(ctx, tmp, block_size);
1199 		clear_fpu_regs();
1200 		kfpu_end();
1201 		rv = crypto_put_output_data(tmp, out, block_size);
1202 		out->cd_offset += block_size;
1203 		gcm_incr_counter_block(ctx);
1204 		ctx->gcm_processed_data_len += block_size;
1205 		bleft -= need;
1206 		datap += need;
1207 		ctx->gcm_remainder_len = 0;
1208 	}
1209 
1210 	/* Do the bulk encryption in chunk_size blocks. */
1211 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1212 		kfpu_begin();
1213 		done = aesni_gcm_encrypt(
1214 		    datap, ct_buf, chunk_size, key, cb, ghash);
1215 
1216 		clear_fpu_regs();
1217 		kfpu_end();
1218 		if (done != chunk_size) {
1219 			rv = CRYPTO_FAILED;
1220 			goto out_nofpu;
1221 		}
1222 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1223 		if (rv != CRYPTO_SUCCESS) {
1224 			goto out_nofpu;
1225 		}
1226 		out->cd_offset += chunk_size;
1227 		datap += chunk_size;
1228 		ctx->gcm_processed_data_len += chunk_size;
1229 	}
1230 	/* Check if we are already done. */
1231 	if (bleft == 0) {
1232 		goto out_nofpu;
1233 	}
1234 	/* Bulk encrypt the remaining data. */
1235 	kfpu_begin();
1236 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1237 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1238 		if (done == 0) {
1239 			rv = CRYPTO_FAILED;
1240 			goto out;
1241 		}
1242 		rv = crypto_put_output_data(ct_buf, out, done);
1243 		if (rv != CRYPTO_SUCCESS) {
1244 			goto out;
1245 		}
1246 		out->cd_offset += done;
1247 		ctx->gcm_processed_data_len += done;
1248 		datap += done;
1249 		bleft -= done;
1250 
1251 	}
1252 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1253 	while (bleft > 0) {
1254 		if (bleft < block_size) {
1255 			memcpy(ctx->gcm_remainder, datap, bleft);
1256 			ctx->gcm_remainder_len = bleft;
1257 			ctx->gcm_copy_to = datap;
1258 			goto out;
1259 		}
1260 		/* Encrypt, hash and write out. */
1261 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1262 		    (const uint32_t *)cb, (uint32_t *)tmp);
1263 
1264 		gcm_xor_avx(datap, tmp);
1265 		GHASH_AVX(ctx, tmp, block_size);
1266 		rv = crypto_put_output_data(tmp, out, block_size);
1267 		if (rv != CRYPTO_SUCCESS) {
1268 			goto out;
1269 		}
1270 		out->cd_offset += block_size;
1271 		gcm_incr_counter_block(ctx);
1272 		ctx->gcm_processed_data_len += block_size;
1273 		datap += block_size;
1274 		bleft -= block_size;
1275 	}
1276 out:
1277 	clear_fpu_regs();
1278 	kfpu_end();
1279 out_nofpu:
1280 	if (ct_buf != NULL) {
1281 		vmem_free(ct_buf, chunk_size);
1282 	}
1283 	return (rv);
1284 }
1285 
1286 /*
1287  * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
1288  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1289  */
1290 static int
1291 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1292 {
1293 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1294 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1295 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1296 	size_t rem_len = ctx->gcm_remainder_len;
1297 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1298 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1299 	int rv;
1300 
1301 	ASSERT(block_size == GCM_BLOCK_LEN);
1302 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1303 	    B_FALSE);
1304 
1305 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1306 		return (CRYPTO_DATA_LEN_RANGE);
1307 	}
1308 
1309 	kfpu_begin();
1310 	/* Pad last incomplete block with zeros, encrypt and hash. */
1311 	if (rem_len > 0) {
1312 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1313 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1314 
1315 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1316 		memset(remainder + rem_len, 0, block_size - rem_len);
1317 		for (int i = 0; i < rem_len; i++) {
1318 			remainder[i] ^= tmp[i];
1319 		}
1320 		GHASH_AVX(ctx, remainder, block_size);
1321 		ctx->gcm_processed_data_len += rem_len;
1322 		/* No need to increment counter_block, it's the last block. */
1323 	}
1324 	/* Finish tag. */
1325 	ctx->gcm_len_a_len_c[1] =
1326 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1327 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1328 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1329 
1330 	gcm_xor_avx((uint8_t *)J0, ghash);
1331 	clear_fpu_regs();
1332 	kfpu_end();
1333 
1334 	/* Output remainder. */
1335 	if (rem_len > 0) {
1336 		rv = crypto_put_output_data(remainder, out, rem_len);
1337 		if (rv != CRYPTO_SUCCESS)
1338 			return (rv);
1339 	}
1340 	out->cd_offset += rem_len;
1341 	ctx->gcm_remainder_len = 0;
1342 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1343 	if (rv != CRYPTO_SUCCESS)
1344 		return (rv);
1345 
1346 	out->cd_offset += ctx->gcm_tag_len;
1347 	return (CRYPTO_SUCCESS);
1348 }
1349 
1350 /*
1351  * Finalize decryption: We just have accumulated crypto text, so now we
1352  * decrypt it here inplace.
1353  */
1354 static int
1355 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1356 {
1357 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1358 	ASSERT3U(block_size, ==, 16);
1359 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1360 	    B_FALSE);
1361 
1362 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1363 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1364 	uint8_t *datap = ctx->gcm_pt_buf;
1365 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1366 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1367 	uint64_t *ghash = ctx->gcm_ghash;
1368 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1369 	int rv = CRYPTO_SUCCESS;
1370 	size_t bleft, done;
1371 
1372 	/*
1373 	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1374 	 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1375 	 * GCM_AVX_MIN_DECRYPT_BYTES.
1376 	 */
1377 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1378 		kfpu_begin();
1379 		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1380 		    (const void *)key, ctx->gcm_cb, ghash);
1381 		clear_fpu_regs();
1382 		kfpu_end();
1383 		if (done != chunk_size) {
1384 			return (CRYPTO_FAILED);
1385 		}
1386 		datap += done;
1387 	}
1388 	/* Decrypt remainder, which is less than chunk size, in one go. */
1389 	kfpu_begin();
1390 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1391 		done = aesni_gcm_decrypt(datap, datap, bleft,
1392 		    (const void *)key, ctx->gcm_cb, ghash);
1393 		if (done == 0) {
1394 			clear_fpu_regs();
1395 			kfpu_end();
1396 			return (CRYPTO_FAILED);
1397 		}
1398 		datap += done;
1399 		bleft -= done;
1400 	}
1401 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1402 
1403 	/*
1404 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1405 	 * decrypt them block by block.
1406 	 */
1407 	while (bleft > 0) {
1408 		/* Incomplete last block. */
1409 		if (bleft < block_size) {
1410 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1411 
1412 			memset(lastb, 0, block_size);
1413 			memcpy(lastb, datap, bleft);
1414 			/* The GCM processing. */
1415 			GHASH_AVX(ctx, lastb, block_size);
1416 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1417 			for (size_t i = 0; i < bleft; i++) {
1418 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1419 			}
1420 			break;
1421 		}
1422 		/* The GCM processing. */
1423 		GHASH_AVX(ctx, datap, block_size);
1424 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1425 		gcm_xor_avx((uint8_t *)tmp, datap);
1426 		gcm_incr_counter_block(ctx);
1427 
1428 		datap += block_size;
1429 		bleft -= block_size;
1430 	}
1431 	if (rv != CRYPTO_SUCCESS) {
1432 		clear_fpu_regs();
1433 		kfpu_end();
1434 		return (rv);
1435 	}
1436 	/* Decryption done, finish the tag. */
1437 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1438 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1439 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1440 	    (uint32_t *)ctx->gcm_J0);
1441 
1442 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1443 
1444 	/* We are done with the FPU, restore its state. */
1445 	clear_fpu_regs();
1446 	kfpu_end();
1447 
1448 	/* Compare the input authentication tag with what we calculated. */
1449 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1450 		/* They don't match. */
1451 		return (CRYPTO_INVALID_MAC);
1452 	}
1453 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1454 	if (rv != CRYPTO_SUCCESS) {
1455 		return (rv);
1456 	}
1457 	out->cd_offset += pt_len;
1458 	return (CRYPTO_SUCCESS);
1459 }
1460 
1461 /*
1462  * Initialize the GCM params H, Htabtle and the counter block. Save the
1463  * initial counter block.
1464  */
1465 static int
1466 gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1467     const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1468 {
1469 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1470 	uint64_t *H = ctx->gcm_H;
1471 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1472 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1473 	const uint8_t *datap = auth_data;
1474 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1475 	size_t bleft;
1476 
1477 	ASSERT(block_size == GCM_BLOCK_LEN);
1478 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1479 	    B_FALSE);
1480 
1481 	/* Init H (encrypt zero block) and create the initial counter block. */
1482 	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1483 	memset(H, 0, sizeof (ctx->gcm_H));
1484 	kfpu_begin();
1485 	aes_encrypt_intel(keysched, aes_rounds,
1486 	    (const uint32_t *)H, (uint32_t *)H);
1487 
1488 	gcm_init_htab_avx(ctx->gcm_Htable, H);
1489 
1490 	if (iv_len == 12) {
1491 		memcpy(cb, iv, 12);
1492 		cb[12] = 0;
1493 		cb[13] = 0;
1494 		cb[14] = 0;
1495 		cb[15] = 1;
1496 		/* We need the ICB later. */
1497 		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1498 	} else {
1499 		/*
1500 		 * Most consumers use 12 byte IVs, so it's OK to use the
1501 		 * original routines for other IV sizes, just avoid nesting
1502 		 * kfpu_begin calls.
1503 		 */
1504 		clear_fpu_regs();
1505 		kfpu_end();
1506 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1507 		    aes_copy_block, aes_xor_block);
1508 		kfpu_begin();
1509 	}
1510 
1511 	/* Openssl post increments the counter, adjust for that. */
1512 	gcm_incr_counter_block(ctx);
1513 
1514 	/* Ghash AAD in chunk_size blocks. */
1515 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1516 		GHASH_AVX(ctx, datap, chunk_size);
1517 		datap += chunk_size;
1518 		clear_fpu_regs();
1519 		kfpu_end();
1520 		kfpu_begin();
1521 	}
1522 	/* Ghash the remainder and handle possible incomplete GCM block. */
1523 	if (bleft > 0) {
1524 		size_t incomp = bleft % block_size;
1525 
1526 		bleft -= incomp;
1527 		if (bleft > 0) {
1528 			GHASH_AVX(ctx, datap, bleft);
1529 			datap += bleft;
1530 		}
1531 		if (incomp > 0) {
1532 			/* Zero pad and hash incomplete last block. */
1533 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1534 
1535 			memset(authp, 0, block_size);
1536 			memcpy(authp, datap, incomp);
1537 			GHASH_AVX(ctx, authp, block_size);
1538 		}
1539 	}
1540 	clear_fpu_regs();
1541 	kfpu_end();
1542 	return (CRYPTO_SUCCESS);
1543 }
1544 
1545 #if defined(_KERNEL)
1546 static int
1547 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1548 {
1549 	unsigned long val;
1550 	char val_rounded[16];
1551 	int error = 0;
1552 
1553 	error = kstrtoul(buf, 0, &val);
1554 	if (error)
1555 		return (error);
1556 
1557 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1558 
1559 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1560 		return (-EINVAL);
1561 
1562 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1563 	error = param_set_uint(val_rounded, kp);
1564 	return (error);
1565 }
1566 
/*
 * Register gcm_avx_chunk_size as the icp_gcm_avx_chunk_size module parameter
 * with mode 0644. Writes are routed through icp_gcm_avx_set_chunk_size(),
 * reads through param_get_uint().
 */
module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
	"How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
1574 #endif /* ifdef CAN_USE_GCM_ASM */
1575