xref: /freebsd/sys/contrib/openzfs/module/icp/algs/modes/gcm.c (revision 53a2e2635ab2d17bed1de7b4e0d782dd23ceb6ea)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/cmn_err.h>
28 #include <modes/modes.h>
29 #include <sys/crypto/common.h>
30 #include <sys/crypto/icp.h>
31 #include <sys/crypto/impl.h>
32 #include <sys/byteorder.h>
33 #include <sys/simd.h>
34 #include <modes/gcm_impl.h>
35 #ifdef CAN_USE_GCM_ASM
36 #include <aes/aes_impl.h>
37 #endif
38 
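/*
 * GHASH(c, d, t, o): XOR the 16-byte block d into the running hash of
 * context c, then multiply by the hash subkey c->gcm_H using the mul
 * routine of implementation o, storing the result in t.
 */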
39 #define	GHASH(c, d, t, o) \
40 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
41 	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
42 	(uint64_t *)(void *)(t));
43 
44 /* Select GCM implementation */
45 #define	IMPL_FASTEST	(UINT32_MAX)
46 #define	IMPL_CYCLE	(UINT32_MAX-1)
47 #ifdef CAN_USE_GCM_ASM
48 #define	IMPL_AVX	(UINT32_MAX-2)
49 #if CAN_USE_GCM_ASM >= 2
50 #define	IMPL_AVX2	(UINT32_MAX-3)
51 #endif
52 #endif
53 #define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
54 static uint32_t icp_gcm_impl = IMPL_FASTEST;
55 static uint32_t user_sel_impl = IMPL_FASTEST;
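/*
 * icp_gcm_impl holds the active selection: either one of the IMPL_*
 * sentinels above or an index into gcm_supp_impl[] (see
 * gcm_impl_get_ops()). The sentinels sit at the top of the uint32_t range
 * so they cannot collide with those indices. user_sel_impl records a
 * selection made before gcm_impl_init() has run; GCM_IMPL_READ() forces a
 * fresh volatile read since the value may change at runtime via the
 * module parameter.
 */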
56 
57 #ifdef CAN_USE_GCM_ASM
58 /* Does the architecture we run on support the MOVBE instruction? */
59 boolean_t gcm_avx_can_use_movbe = B_FALSE;
60 /*
61  * Whether to use the optimized openssl gcm and ghash implementations.
62  */
63 static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
64 #define	GCM_IMPL_USED	(*(volatile gcm_impl *)&gcm_impl_used)
65 
66 extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
67 
68 static inline boolean_t gcm_avx_will_work(void);
69 static inline boolean_t gcm_avx2_will_work(void);
70 static inline void gcm_use_impl(gcm_impl impl);
71 static inline gcm_impl gcm_toggle_impl(void);
72 
73 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
74     crypto_data_t *, size_t);
75 
76 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
77 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
78 static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
79     size_t, size_t);
80 #endif /* ifdef CAN_USE_GCM_ASM */
81 
82 /*
83  * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
84  * is done in another function.
85  */
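/*
 * Callers may hand in data in arbitrarily sized pieces; anything shorter
 * than a block is buffered in ctx->gcm_remainder until a full block has
 * accumulated.
 */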
86 int
87 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
88     crypto_data_t *out, size_t block_size,
89     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
90     void (*copy_block)(uint8_t *, uint8_t *),
91     void (*xor_block)(uint8_t *, uint8_t *))
92 {
93 #ifdef CAN_USE_GCM_ASM
94 	if (ctx->impl != GCM_IMPL_GENERIC)
95 		return (gcm_mode_encrypt_contiguous_blocks_avx(
96 		    ctx, data, length, out, block_size));
97 #endif
98 
99 	const gcm_impl_ops_t *gops;
100 	size_t remainder = length;
101 	size_t need = 0;
102 	uint8_t *datap = (uint8_t *)data;
103 	uint8_t *blockp;
104 	uint8_t *lastp;
105 	void *iov_or_mp;
106 	offset_t offset;
107 	uint8_t *out_data_1;
108 	uint8_t *out_data_2;
109 	size_t out_data_1_len;
110 	uint64_t counter;
111 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
112 
113 	if (length + ctx->gcm_remainder_len < block_size) {
114 		/* accumulate bytes here and return */
115 		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
116 		    datap,
117 		    length);
118 		ctx->gcm_remainder_len += length;
119 		if (ctx->gcm_copy_to == NULL) {
120 			ctx->gcm_copy_to = datap;
121 		}
122 		return (CRYPTO_SUCCESS);
123 	}
124 
125 	crypto_init_ptrs(out, &iov_or_mp, &offset);
126 
127 	gops = gcm_impl_get_ops();
128 	do {
129 		/* Unprocessed data from last call. */
130 		if (ctx->gcm_remainder_len > 0) {
131 			need = block_size - ctx->gcm_remainder_len;
132 
133 			if (need > remainder)
134 				return (CRYPTO_DATA_LEN_RANGE);
135 
136 			memcpy(&((uint8_t *)ctx->gcm_remainder)
137 			    [ctx->gcm_remainder_len], datap, need);
138 
139 			blockp = (uint8_t *)ctx->gcm_remainder;
140 		} else {
141 			blockp = datap;
142 		}
143 
144 		/*
145 		 * Increment counter. Counter bits are confined
146 		 * to the bottom 32 bits of the counter block.
147 		 */
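		/*
		 * This is the inc32() operation from NIST SP 800-38D: only the
		 * rightmost 32 bits of the counter block wrap; the IV part is
		 * left untouched. The ntohll()/htonll() pair brings the
		 * big-endian counter word into host order for the addition.
		 */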
148 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
149 		counter = htonll(counter + 1);
150 		counter &= counter_mask;
151 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
152 
153 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
154 		    (uint8_t *)ctx->gcm_tmp);
155 		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
156 
157 		lastp = (uint8_t *)ctx->gcm_tmp;
158 
159 		ctx->gcm_processed_data_len += block_size;
160 
161 		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
162 		    &out_data_1_len, &out_data_2, block_size);
163 
164 		/* copy block to where it belongs */
165 		if (out_data_1_len == block_size) {
166 			copy_block(lastp, out_data_1);
167 		} else {
168 			memcpy(out_data_1, lastp, out_data_1_len);
169 			if (out_data_2 != NULL) {
170 				memcpy(out_data_2,
171 				    lastp + out_data_1_len,
172 				    block_size - out_data_1_len);
173 			}
174 		}
175 		/* update offset */
176 		out->cd_offset += block_size;
177 
178 		/* add ciphertext to the hash */
179 		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
180 
181 		/* Update pointer to next block of data to be processed. */
182 		if (ctx->gcm_remainder_len != 0) {
183 			datap += need;
184 			ctx->gcm_remainder_len = 0;
185 		} else {
186 			datap += block_size;
187 		}
188 
189 		remainder = (size_t)&data[length] - (size_t)datap;
190 
191 		/* Incomplete last block. */
192 		if (remainder > 0 && remainder < block_size) {
193 			memcpy(ctx->gcm_remainder, datap, remainder);
194 			ctx->gcm_remainder_len = remainder;
195 			ctx->gcm_copy_to = datap;
196 			goto out;
197 		}
198 		ctx->gcm_copy_to = NULL;
199 
200 	} while (remainder > 0);
201 out:
202 	return (CRYPTO_SUCCESS);
203 }
204 
205 int
206 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
207     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
208     void (*copy_block)(uint8_t *, uint8_t *),
209     void (*xor_block)(uint8_t *, uint8_t *))
210 {
211 	(void) copy_block;
212 #ifdef CAN_USE_GCM_ASM
213 	if (ctx->impl != GCM_IMPL_GENERIC)
214 		return (gcm_encrypt_final_avx(ctx, out, block_size));
215 #endif
216 
217 	const gcm_impl_ops_t *gops;
218 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
219 	uint8_t *ghash, *macp = NULL;
220 	int i, rv;
221 
222 	if (out->cd_length <
223 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
224 		return (CRYPTO_DATA_LEN_RANGE);
225 	}
226 
227 	gops = gcm_impl_get_ops();
228 	ghash = (uint8_t *)ctx->gcm_ghash;
229 
230 	if (ctx->gcm_remainder_len > 0) {
231 		uint64_t counter;
232 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
233 
234 		/*
235 		 * Here is where we deal with data that is not a
236 		 * multiple of the block size.
237 		 */
238 
239 		/*
240 		 * Increment counter.
241 		 */
242 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
243 		counter = htonll(counter + 1);
244 		counter &= counter_mask;
245 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
246 
247 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
248 		    (uint8_t *)ctx->gcm_tmp);
249 
250 		macp = (uint8_t *)ctx->gcm_remainder;
251 		memset(macp + ctx->gcm_remainder_len, 0,
252 		    block_size - ctx->gcm_remainder_len);
253 
254 		/* XOR with counter block */
255 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
256 			macp[i] ^= tmpp[i];
257 		}
258 
259 		/* add ciphertext to the hash */
260 		GHASH(ctx, macp, ghash, gops);
261 
262 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
263 	}
264 
265 	ctx->gcm_len_a_len_c[1] =
266 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
267 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
268 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
269 	    (uint8_t *)ctx->gcm_J0);
270 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
271 
272 	if (ctx->gcm_remainder_len > 0) {
273 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
274 		if (rv != CRYPTO_SUCCESS)
275 			return (rv);
276 	}
277 	out->cd_offset += ctx->gcm_remainder_len;
278 	ctx->gcm_remainder_len = 0;
279 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
280 	if (rv != CRYPTO_SUCCESS)
281 		return (rv);
282 	out->cd_offset += ctx->gcm_tag_len;
283 
284 	return (CRYPTO_SUCCESS);
285 }
286 
287 /*
288  * This only deals with decrypting the last block of the input, which
289  * might not be a full block.
290  */
291 static void
292 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
293     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
294     void (*xor_block)(uint8_t *, uint8_t *))
295 {
296 	uint8_t *datap, *outp, *counterp;
297 	uint64_t counter;
298 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
299 	int i;
300 
301 	/*
302 	 * Increment counter.
303 	 * Counter bits are confined to the bottom 32 bits
304 	 */
305 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
306 	counter = htonll(counter + 1);
307 	counter &= counter_mask;
308 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
309 
310 	datap = (uint8_t *)ctx->gcm_remainder;
311 	outp = &((ctx->gcm_pt_buf)[index]);
312 	counterp = (uint8_t *)ctx->gcm_tmp;
313 
314 	/* authentication tag */
315 	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
316 	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
317 
318 	/* add ciphertext to the hash */
319 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
320 
321 	/* decrypt remaining ciphertext */
322 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
323 
324 	/* XOR with counter block */
325 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
326 		outp[i] = datap[i] ^ counterp[i];
327 	}
328 }
329 
330 int
331 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
332     crypto_data_t *out, size_t block_size,
333     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
334     void (*copy_block)(uint8_t *, uint8_t *),
335     void (*xor_block)(uint8_t *, uint8_t *))
336 {
337 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
338 	    (void) xor_block;
339 	size_t new_len;
340 	uint8_t *new;
341 
342 	/*
343 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
344 	 * Ciphertext will be decrypted in the final.
345 	 */
346 	if (length > 0) {
347 		new_len = ctx->gcm_pt_buf_len + length;
348 		new = vmem_alloc(new_len, KM_SLEEP);
349 		if (new == NULL) {
350 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
351 			ctx->gcm_pt_buf = NULL;
352 			return (CRYPTO_HOST_MEMORY);
353 		}
354 
355 		if (ctx->gcm_pt_buf != NULL) {
356 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
357 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
358 		} else {
359 			ASSERT0(ctx->gcm_pt_buf_len);
360 		}
361 
362 		ctx->gcm_pt_buf = new;
363 		ctx->gcm_pt_buf_len = new_len;
364 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
365 		    length);
366 		ctx->gcm_processed_data_len += length;
367 	}
368 
369 	ctx->gcm_remainder_len = 0;
370 	return (CRYPTO_SUCCESS);
371 }
372 
373 int
374 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
375     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
376     void (*xor_block)(uint8_t *, uint8_t *))
377 {
378 #ifdef CAN_USE_GCM_ASM
379 	if (ctx->impl != GCM_IMPL_GENERIC)
380 		return (gcm_decrypt_final_avx(ctx, out, block_size));
381 #endif
382 
383 	const gcm_impl_ops_t *gops;
384 	size_t pt_len;
385 	size_t remainder;
386 	uint8_t *ghash;
387 	uint8_t *blockp;
388 	uint8_t *cbp;
389 	uint64_t counter;
390 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
391 	int processed = 0, rv;
392 
393 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
394 
395 	gops = gcm_impl_get_ops();
396 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
397 	ghash = (uint8_t *)ctx->gcm_ghash;
398 	blockp = ctx->gcm_pt_buf;
399 	remainder = pt_len;
400 	while (remainder > 0) {
401 		/* Incomplete last block */
402 		if (remainder < block_size) {
403 			memcpy(ctx->gcm_remainder, blockp, remainder);
404 			ctx->gcm_remainder_len = remainder;
405 			/*
406 			 * not expecting any more ciphertext, just
407 			 * compute plaintext for the remaining input
408 			 */
409 			gcm_decrypt_incomplete_block(ctx, block_size,
410 			    processed, encrypt_block, xor_block);
411 			ctx->gcm_remainder_len = 0;
412 			goto out;
413 		}
414 		/* add ciphertext to the hash */
415 		GHASH(ctx, blockp, ghash, gops);
416 
417 		/*
418 		 * Increment counter.
419 		 * Counter bits are confined to the bottom 32 bits
420 		 */
421 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
422 		counter = htonll(counter + 1);
423 		counter &= counter_mask;
424 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
425 
426 		cbp = (uint8_t *)ctx->gcm_tmp;
427 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
428 
429 		/* XOR with ciphertext */
430 		xor_block(cbp, blockp);
431 
432 		processed += block_size;
433 		blockp += block_size;
434 		remainder -= block_size;
435 	}
436 out:
437 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
438 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
439 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
440 	    (uint8_t *)ctx->gcm_J0);
441 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
442 
443 	/* compare the input authentication tag with what we calculated */
444 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
445 		/* They don't match */
446 		return (CRYPTO_INVALID_MAC);
447 	} else {
448 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
449 		if (rv != CRYPTO_SUCCESS)
450 			return (rv);
451 		out->cd_offset += pt_len;
452 	}
453 	return (CRYPTO_SUCCESS);
454 }
455 
456 static int
457 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
458 {
459 	size_t tag_len;
460 
461 	/*
462 	 * Check the length of the authentication tag (in bits).
463 	 */
464 	tag_len = gcm_param->ulTagBits;
465 	switch (tag_len) {
466 	case 32:
467 	case 64:
468 	case 96:
469 	case 104:
470 	case 112:
471 	case 120:
472 	case 128:
473 		break;
474 	default:
475 		return (CRYPTO_MECHANISM_PARAM_INVALID);
476 	}
477 
478 	if (gcm_param->ulIvLen == 0)
479 		return (CRYPTO_MECHANISM_PARAM_INVALID);
480 
481 	return (CRYPTO_SUCCESS);
482 }
483 
484 static void
485 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
486     gcm_ctx_t *ctx, size_t block_size,
487     void (*copy_block)(uint8_t *, uint8_t *),
488     void (*xor_block)(uint8_t *, uint8_t *))
489 {
490 	const gcm_impl_ops_t *gops;
491 	uint8_t *cb;
492 	ulong_t remainder = iv_len;
493 	ulong_t processed = 0;
494 	uint8_t *datap, *ghash;
495 	uint64_t len_a_len_c[2];
496 
497 	gops = gcm_impl_get_ops();
498 	ghash = (uint8_t *)ctx->gcm_ghash;
499 	cb = (uint8_t *)ctx->gcm_cb;
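	/*
	 * Per NIST SP 800-38D: for a 96-bit IV, J0 = IV || 0^31 || 1;
	 * otherwise J0 = GHASH(IV zero-padded to full blocks || 0^64 ||
	 * [len(IV) in bits]_64).
	 */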
500 	if (iv_len == 12) {
501 		memcpy(cb, iv, 12);
502 		cb[12] = 0;
503 		cb[13] = 0;
504 		cb[14] = 0;
505 		cb[15] = 1;
506 		/* J0 will be used again in the final */
507 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
508 	} else {
509 		/* GHASH the IV */
510 		do {
511 			if (remainder < block_size) {
512 				memset(cb, 0, block_size);
513 				memcpy(cb, &(iv[processed]), remainder);
514 				datap = (uint8_t *)cb;
515 				remainder = 0;
516 			} else {
517 				datap = (uint8_t *)(&(iv[processed]));
518 				processed += block_size;
519 				remainder -= block_size;
520 			}
521 			GHASH(ctx, datap, ghash, gops);
522 		} while (remainder > 0);
523 
524 		len_a_len_c[0] = 0;
525 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
526 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
527 
528 		/* J0 will be used again in the final */
529 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
530 	}
531 }
532 
533 static int
534 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
535     const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
536     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
537     void (*copy_block)(uint8_t *, uint8_t *),
538     void (*xor_block)(uint8_t *, uint8_t *))
539 {
540 	const gcm_impl_ops_t *gops;
541 	uint8_t *ghash, *datap, *authp;
542 	size_t remainder, processed;
543 
544 	/* encrypt zero block to get subkey H */
545 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
546 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
547 	    (uint8_t *)ctx->gcm_H);
548 
549 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
550 	    copy_block, xor_block);
551 
552 	gops = gcm_impl_get_ops();
553 	authp = (uint8_t *)ctx->gcm_tmp;
554 	ghash = (uint8_t *)ctx->gcm_ghash;
555 	memset(authp, 0, block_size);
556 	memset(ghash, 0, block_size);
557 
558 	processed = 0;
559 	remainder = auth_data_len;
560 	do {
561 		if (remainder < block_size) {
562 			/*
563 			 * There isn't a full block of data; pad the rest of
564 			 * the buffer with zeros.
565 			 */
566 
567 			if (auth_data != NULL) {
568 				memset(authp, 0, block_size);
569 				memcpy(authp, &(auth_data[processed]),
570 				    remainder);
571 			} else {
572 				ASSERT0(remainder);
573 			}
574 
575 			datap = (uint8_t *)authp;
576 			remainder = 0;
577 		} else {
578 			datap = (uint8_t *)(&(auth_data[processed]));
579 			processed += block_size;
580 			remainder -= block_size;
581 		}
582 
583 		/* add auth data to the hash */
584 		GHASH(ctx, datap, ghash, gops);
585 
586 	} while (remainder > 0);
587 
588 	return (CRYPTO_SUCCESS);
589 }
590 
591 /*
592  * Init the GCM context struct. Handle the cycle and avx implementations here.
593  */
594 int
595 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
596     size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
597     uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
598     void (*xor_block)(uint8_t *, uint8_t *))
599 {
600 	CK_AES_GCM_PARAMS *gcm_param;
601 	int rv = CRYPTO_SUCCESS;
602 	size_t tag_len, iv_len;
603 
604 	if (param != NULL) {
605 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
606 
607 		/* GCM mode. */
608 		if ((rv = gcm_validate_args(gcm_param)) != 0) {
609 			return (rv);
610 		}
611 		gcm_ctx->gcm_flags |= GCM_MODE;
612 
613 		size_t tbits = gcm_param->ulTagBits;
614 		tag_len = CRYPTO_BITS2BYTES(tbits);
615 		iv_len = gcm_param->ulIvLen;
616 
617 		gcm_ctx->gcm_tag_len = tag_len;
618 		gcm_ctx->gcm_processed_data_len = 0;
619 
620 		/* these values are in bits */
621 		gcm_ctx->gcm_len_a_len_c[0]
622 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
623 	} else {
624 		return (CRYPTO_MECHANISM_PARAM_INVALID);
625 	}
626 
627 	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
628 	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
629 	size_t aad_len = gcm_param->ulAADLen;
630 
631 #ifdef CAN_USE_GCM_ASM
632 	boolean_t needs_bswap =
633 	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
634 
635 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
636 		gcm_ctx->impl = GCM_IMPL_USED;
637 	} else {
638 		/*
639 		 * Handle the "cycle" implementation by creating different
640 		 * contexts, one per implementation.
641 		 */
642 		gcm_ctx->impl = gcm_toggle_impl();
643 
644 		/* The AVX impl. doesn't handle byte swapped key schedules. */
645 		if (needs_bswap == B_TRUE) {
646 			gcm_ctx->impl = GCM_IMPL_GENERIC;
647 		}
648 		/*
649 		 * If this is an AVX context, use the MOVBE and the BSWAP
650 		 * variants alternately.
651 		 */
652 		if (gcm_ctx->impl == GCM_IMPL_AVX &&
653 		    zfs_movbe_available() == B_TRUE) {
654 			(void) atomic_toggle_boolean_nv(
655 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
656 		}
657 	}
658 	/*
659 	 * We don't handle byte swapped key schedules in the avx code path,
660 	 * but they could still be created by the aes generic implementation.
661 	 * Make sure not to use them since we'll corrupt data if we do.
662 	 */
663 	if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
664 		gcm_ctx->impl = GCM_IMPL_GENERIC;
665 
666 		cmn_err_once(CE_WARN,
667 		    "ICP: Can't use the aes generic or cycle implementations "
668 		    "in combination with the gcm avx or avx2-vaes "
669 		    "implementation!");
670 		cmn_err_once(CE_WARN,
671 		    "ICP: Falling back to a compatible implementation, "
672 		    "aes-gcm performance will likely be degraded.");
673 		cmn_err_once(CE_WARN,
674 		    "ICP: Choose at least the x86_64 aes implementation to "
675 		    "restore performance.");
676 	}
677 
678 	/*
679 	 * The AVX implementations use an Htable whose size depends on the
680 	 * selected implementation.
681 	 */
682 	if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
683 		rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
684 		    block_size);
685 	}
686 	else
687 #endif /* ifdef CAN_USE_GCM_ASM */
688 	if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
689 	    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
690 		rv = CRYPTO_MECHANISM_PARAM_INVALID;
691 	}
692 
693 	return (rv);
694 }
695 
696 void *
697 gcm_alloc_ctx(int kmflag)
698 {
699 	gcm_ctx_t *gcm_ctx;
700 
701 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
702 		return (NULL);
703 
704 	gcm_ctx->gcm_flags = GCM_MODE;
705 	return (gcm_ctx);
706 }
707 
708 /* GCM implementation that contains the fastest methods */
709 static gcm_impl_ops_t gcm_fastest_impl = {
710 	.name = "fastest"
711 };
712 
713 /* All compiled in implementations */
714 static const gcm_impl_ops_t *gcm_all_impl[] = {
715 	&gcm_generic_impl,
716 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
717 	&gcm_pclmulqdq_impl,
718 #endif
719 };
720 
721 /* Indicate that benchmark has been completed */
722 static boolean_t gcm_impl_initialized = B_FALSE;
723 
724 /* Hold all supported implementations */
725 static size_t gcm_supp_impl_cnt = 0;
726 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
727 
728 /*
729  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
730  * SIMD implementation is not allowed in the current context, fall back
731  * to the generic implementation.
732  */
733 const gcm_impl_ops_t *
734 gcm_impl_get_ops(void)
735 {
736 	if (!kfpu_allowed())
737 		return (&gcm_generic_impl);
738 
739 	const gcm_impl_ops_t *ops = NULL;
740 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
741 
742 	switch (impl) {
743 	case IMPL_FASTEST:
744 		ASSERT(gcm_impl_initialized);
745 		ops = &gcm_fastest_impl;
746 		break;
747 	case IMPL_CYCLE:
748 		/* Cycle through supported implementations */
749 		ASSERT(gcm_impl_initialized);
750 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
751 		static size_t cycle_impl_idx = 0;
752 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
753 		ops = gcm_supp_impl[idx];
754 		break;
755 #ifdef CAN_USE_GCM_ASM
756 	case IMPL_AVX:
757 #if CAN_USE_GCM_ASM >= 2
758 	case IMPL_AVX2:
759 #endif
760 		/*
761 		 * Make sure that we return a valid implementation while
762 		 * switching to the avx implementation since there still
763 		 * may be unfinished non-avx contexts around.
764 		 */
765 		ops = &gcm_generic_impl;
766 		break;
767 #endif
768 	default:
769 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
770 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
771 		if (impl < ARRAY_SIZE(gcm_all_impl))
772 			ops = gcm_supp_impl[impl];
773 		break;
774 	}
775 
776 	ASSERT3P(ops, !=, NULL);
777 
778 	return (ops);
779 }
780 
781 /*
782  * Initialize all supported implementations.
783  */
784 void
785 gcm_impl_init(void)
786 {
787 	gcm_impl_ops_t *curr_impl;
788 	int i, c;
789 
790 	/* Move supported implementations into gcm_supp_impls */
791 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
792 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
793 
794 		if (curr_impl->is_supported())
795 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
796 	}
797 	gcm_supp_impl_cnt = c;
798 
799 	/*
800 	 * Set the fastest implementation given the assumption that the
801 	 * hardware accelerated version is the fastest.
802 	 */
803 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
804 	if (gcm_pclmulqdq_impl.is_supported()) {
805 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
806 		    sizeof (gcm_fastest_impl));
807 	} else
808 #endif
809 	{
810 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
811 		    sizeof (gcm_fastest_impl));
812 	}
813 
814 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
815 
816 #ifdef CAN_USE_GCM_ASM
817 	/*
818 	 * Use the avx implementation if it's available and the implementation
819 	 * hasn't changed from its default value of fastest on module load.
820 	 */
821 #if CAN_USE_GCM_ASM >= 2
822 	if (gcm_avx2_will_work()) {
823 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
824 			gcm_use_impl(GCM_IMPL_AVX2);
825 		}
826 	} else
827 #endif
828 	if (gcm_avx_will_work()) {
829 #ifdef HAVE_MOVBE
830 		if (zfs_movbe_available() == B_TRUE) {
831 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
832 		}
833 #endif
834 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
835 			gcm_use_impl(GCM_IMPL_AVX);
836 		}
837 	}
838 #endif
839 	/* Finish initialization */
840 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
841 	gcm_impl_initialized = B_TRUE;
842 }
843 
844 static const struct {
845 	const char *name;
846 	uint32_t sel;
847 } gcm_impl_opts[] = {
848 		{ "cycle",	IMPL_CYCLE },
849 		{ "fastest",	IMPL_FASTEST },
850 #ifdef CAN_USE_GCM_ASM
851 		{ "avx",	IMPL_AVX },
852 		{ "avx2-vaes",	IMPL_AVX2 },
853 #endif
854 };
855 
856 /*
857  * Function sets desired gcm implementation.
858  *
859  * If we are called before init(), user preference will be saved in
860  * user_sel_impl and applied in a later init() call. This occurs when the
861  * module parameter is specified on module load. Otherwise, directly update
862  * icp_gcm_impl.
863  *
864  * @val		Name of gcm implementation to use
865  * @param	Unused.
866  */
867 int
868 gcm_impl_set(const char *val)
869 {
870 	int err = -EINVAL;
871 	char req_name[GCM_IMPL_NAME_MAX];
872 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
873 	size_t i;
874 
875 	/* sanitize input */
876 	i = strnlen(val, GCM_IMPL_NAME_MAX);
877 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
878 		return (err);
879 
880 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
881 	while (i > 0 && isspace(req_name[i-1]))
882 		i--;
883 	req_name[i] = '\0';
884 
885 	/* Check mandatory options */
886 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
887 #ifdef CAN_USE_GCM_ASM
888 #if CAN_USE_GCM_ASM >= 2
889 		/* Ignore avx implementation if it won't work. */
890 		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
891 		    !gcm_avx2_will_work()) {
892 			continue;
893 		}
894 #endif
895 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
896 			continue;
897 		}
898 #endif
899 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
900 			impl = gcm_impl_opts[i].sel;
901 			err = 0;
902 			break;
903 		}
904 	}
905 
906 	/* check all supported impl if init() was already called */
907 	if (err != 0 && gcm_impl_initialized) {
908 		/* check all supported implementations */
909 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
910 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
911 				impl = i;
912 				err = 0;
913 				break;
914 			}
915 		}
916 	}
917 #ifdef CAN_USE_GCM_ASM
918 	/*
919 	 * Use the avx implementation if available and the requested one is
920 	 * avx or fastest.
921 	 */
922 #if CAN_USE_GCM_ASM >= 2
923 	if (gcm_avx2_will_work() == B_TRUE &&
924 	    (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
925 		gcm_use_impl(GCM_IMPL_AVX2);
926 	} else
927 #endif
928 	if (gcm_avx_will_work() == B_TRUE &&
929 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
930 		gcm_use_impl(GCM_IMPL_AVX);
931 	} else {
932 		gcm_use_impl(GCM_IMPL_GENERIC);
933 	}
934 #endif
935 
936 	if (err == 0) {
937 		if (gcm_impl_initialized)
938 			atomic_swap_32(&icp_gcm_impl, impl);
939 		else
940 			atomic_swap_32(&user_sel_impl, impl);
941 	}
942 
943 	return (err);
944 }
945 
946 #if defined(_KERNEL) && defined(__linux__)
947 
948 static int
949 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
950 {
951 	return (gcm_impl_set(val));
952 }
953 
954 static int
955 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
956 {
957 	int i, cnt = 0;
958 	char *fmt;
959 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
960 
961 	/* list mandatory options */
962 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
963 #ifdef CAN_USE_GCM_ASM
964 		/* Ignore avx implementation if it won't work. */
965 #if CAN_USE_GCM_ASM >= 2
966 		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
967 		    !gcm_avx2_will_work()) {
968 			continue;
969 		}
970 #endif
971 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
972 			continue;
973 		}
974 #endif
975 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
976 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
977 		    gcm_impl_opts[i].name);
978 	}
979 
980 	/* list all supported implementations */
981 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
982 		fmt = (i == impl) ? "[%s] " : "%s ";
983 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
984 		    gcm_supp_impl[i]->name);
985 	}
986 
987 	return (cnt);
988 }
989 
990 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
991     NULL, 0644);
992 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
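/*
 * Usage sketch (the exact module name and sysfs path depend on how the icp
 * code is packaged into a kernel module):
 *   modprobe <module> icp_gcm_impl=avx2-vaes                     # at load time
 *   echo fastest > /sys/module/<module>/parameters/icp_gcm_impl  # at runtime
 */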
993 #endif /* defined(_KERNEL) && defined(__linux__) */
994 
995 #ifdef CAN_USE_GCM_ASM
996 #define	GCM_BLOCK_LEN 16
997 /*
998  * The openssl asm routines are 6x aggregated and need that many bytes
999  * at minimum.
1000  */
1001 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1002 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
1003 /*
1004  * Ensure the chunk size is reasonable since we are allocating a
1005  * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
1006  */
1007 #define	GCM_AVX_MAX_CHUNK_SIZE \
1008 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
1009 
1010 /* Clear the FPU registers since they hold sensitive internal state. */
1011 #define	clear_fpu_regs() clear_fpu_regs_avx()
1012 
1013 #define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1014 
1015 /* Get the chunk size module parameter. */
1016 #define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1017 
1018 /*
1019  * Module parameter: number of bytes to process at once while owning the FPU.
1020  * Rounded down to the nearest multiple of GCM_AVX_MIN_DECRYPT_BYTES and
1021  * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
1022  */
1023 static uint32_t gcm_avx_chunk_size =
1024 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
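/*
 * For example, with GCM_AVX_MIN_DECRYPT_BYTES == 96 the 32 KiB default
 * works out to (32768 / 96) * 96 = 32736 bytes per FPU-protected chunk.
 */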
1025 
1026 /*
1027  * GCM definitions: uint128_t is copied from include/crypto/modes.h
1028  * Avoiding u128 because it is already defined in kernel sources.
1029  */
1030 typedef struct {
1031     uint64_t hi, lo;
1032 } uint128_t;
1033 
1034 extern void ASMABI clear_fpu_regs_avx(void);
1035 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1036 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1037     const uint32_t pt[4], uint32_t ct[4]);
1038 
1039 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1040 #if CAN_USE_GCM_ASM >= 2
1041 extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
1042     const uint64_t H[2]);
1043 #endif
1044 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1045     const uint8_t *in, size_t len);
1046 #if CAN_USE_GCM_ASM >= 2
1047 extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
1048     const uint64_t *Htable, const uint8_t *in, size_t len);
1049 #endif
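/*
 * Dispatch a GHASH update to the SIMD back-end selected for this context.
 * Callers must hold the FPU (kfpu_begin()/kfpu_end()) around this call.
 */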
1050 static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
1051 {
1052 	switch (ctx->impl) {
1053 #if CAN_USE_GCM_ASM >= 2
1054 		case GCM_IMPL_AVX2:
1055 			gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash,
1056 			    (const uint64_t *)ctx->gcm_Htable, in, len);
1057 			break;
1058 #endif
1059 
1060 		case GCM_IMPL_AVX:
1061 			gcm_ghash_avx(ctx->gcm_ghash,
1062 			    (const uint64_t *)ctx->gcm_Htable, in, len);
1063 			break;
1064 
1065 		default:
1066 			VERIFY(B_FALSE);
1067 	}
1068 }
1069 
1070 typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
1071     size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
1072 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1073     const void *, uint64_t *, uint64_t *);
1074 #if CAN_USE_GCM_ASM >= 2
1075 extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
1076     uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
1077     const uint128_t Htable[16], uint8_t Xi[16]);
1078 #endif
1079 
1080 typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
1081     size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
1082 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1083     const void *, uint64_t *, uint64_t *);
1084 #if CAN_USE_GCM_ASM >= 2
1085 extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
1086     uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
1087     const uint128_t Htable[16], uint8_t Xi[16]);
1088 #endif
1089 
1090 static inline boolean_t
1091 gcm_avx2_will_work(void)
1092 {
1093 	return (kfpu_allowed() &&
1094 	    zfs_avx2_available() && zfs_vaes_available() &&
1095 	    zfs_vpclmulqdq_available());
1096 }
1097 
1098 static inline boolean_t
1099 gcm_avx_will_work(void)
1100 {
1101 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1102 	return (kfpu_allowed() &&
1103 	    zfs_avx_available() && zfs_aes_available() &&
1104 	    zfs_pclmulqdq_available());
1105 }
1106 
1107 static inline void
1108 gcm_use_impl(gcm_impl impl)
1109 {
1110 	switch (impl) {
1111 #if CAN_USE_GCM_ASM >= 2
1112 		case GCM_IMPL_AVX2:
1113 			if (gcm_avx2_will_work() == B_TRUE) {
1114 				atomic_swap_32(&gcm_impl_used, impl);
1115 				return;
1116 			}
1117 
1118 			zfs_fallthrough;
1119 #endif
1120 
1121 		case GCM_IMPL_AVX:
1122 			if (gcm_avx_will_work() == B_TRUE) {
1123 				atomic_swap_32(&gcm_impl_used, impl);
1124 				return;
1125 			}
1126 
1127 			zfs_fallthrough;
1128 
1129 		default:
1130 			atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
1131 	}
1132 }
1133 
1134 static inline boolean_t
1135 gcm_impl_will_work(gcm_impl impl)
1136 {
1137 	switch (impl) {
1138 #if CAN_USE_GCM_ASM >= 2
1139 		case GCM_IMPL_AVX2:
1140 			return (gcm_avx2_will_work());
1141 #endif
1142 
1143 		case GCM_IMPL_AVX:
1144 			return (gcm_avx_will_work());
1145 
1146 		default:
1147 			return (B_TRUE);
1148 	}
1149 }
1150 
1151 static inline gcm_impl
1152 gcm_toggle_impl(void)
1153 {
1154 	gcm_impl current_impl, new_impl;
1155 	do { /* handle races */
1156 		current_impl = atomic_load_32(&gcm_impl_used);
1157 		new_impl = current_impl;
1158 		while (B_TRUE) { /* handle incompatible implementations */
1159 			new_impl = (new_impl + 1) % GCM_IMPL_MAX;
1160 			if (gcm_impl_will_work(new_impl)) {
1161 				break;
1162 			}
1163 		}
1164 
1165 	} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
1166 	    current_impl);
1167 
1168 	return (new_impl);
1169 }
1170 
1171 
1172 /* Increment the GCM counter block by n. */
1173 static inline void
1174 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1175 {
1176 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1177 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1178 
1179 	counter = htonll(counter + n);
1180 	counter &= counter_mask;
1181 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1182 }
1183 
1184 static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out,
1185     size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
1186     uint64_t *Xip)
1187 {
1188 	(void) Htable;
1189 	return (aesni_gcm_encrypt(in, out, len, key, iv, Xip));
1190 }
1191 
1192 #if CAN_USE_GCM_ASM >= 2
1193 // kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
1194 // bits of a |size_t|.
1195 // This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
1196 static const size_t kSizeTWithoutLower4Bits = (size_t)-16;
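// (size_t)-16 has all bits set except the low four, so ANDing a length with
// it rounds the length down to a whole number of 16-byte blocks.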
1197 
1198 /* The following CRYPTO methods are from boringssl/crypto/internal.h */
1199 static inline uint32_t CRYPTO_bswap4(uint32_t x) {
1200 	return (__builtin_bswap32(x));
1201 }
1202 
1203 static inline uint32_t CRYPTO_load_u32_be(const void *in) {
1204 	uint32_t v;
1205 	memcpy(&v, in, sizeof (v));
1206 	return (CRYPTO_bswap4(v));
1207 }
1208 
1209 static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
1210 	v = CRYPTO_bswap4(v);
1211 	memcpy(out, &v, sizeof (v));
1212 }
1213 
1214 static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
1215     size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
1216     uint64_t *Xip)
1217 {
1218 	uint8_t *ivec = (uint8_t *)iv;
1219 	len &= kSizeTWithoutLower4Bits;
1220 	aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec,
1221 	    (const uint128_t *)Htable, (uint8_t *)Xip);
1222 	CRYPTO_store_u32_be(&ivec[12],
1223 	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
1224 	return (len);
1225 }
1226 #endif /* if CAN_USE_GCM_ASM >= 2 */
1227 
1228 /*
1229  * Encrypt multiple blocks of data in GCM mode.
1230  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1231  * if possible. While processing a chunk the FPU is "locked".
1232  */
1233 static int
1234 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1235     size_t length, crypto_data_t *out, size_t block_size)
1236 {
1237 	size_t bleft = length;
1238 	size_t need = 0;
1239 	size_t done = 0;
1240 	uint8_t *datap = (uint8_t *)data;
1241 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1242 	aesni_gcm_encrypt_impl *encrypt_blocks =
1243 #if CAN_USE_GCM_ASM >= 2
1244 	    ctx->impl == GCM_IMPL_AVX2 ?
1245 	    aesni_gcm_encrypt_avx2 :
1246 #endif
1247 	    aesni_gcm_encrypt_avx;
1248 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1249 	uint64_t *ghash = ctx->gcm_ghash;
1250 	uint64_t *htable = ctx->gcm_Htable;
1251 	uint64_t *cb = ctx->gcm_cb;
1252 	uint8_t *ct_buf = NULL;
1253 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1254 	int rv = CRYPTO_SUCCESS;
1255 
1256 	ASSERT(block_size == GCM_BLOCK_LEN);
1257 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1258 	    B_FALSE);
1259 	/*
1260 	 * If the last call left an incomplete block, try to fill
1261 	 * it first.
1262 	 */
1263 	if (ctx->gcm_remainder_len > 0) {
1264 		need = block_size - ctx->gcm_remainder_len;
1265 		if (length < need) {
1266 			/* Accumulate bytes here and return. */
1267 			memcpy((uint8_t *)ctx->gcm_remainder +
1268 			    ctx->gcm_remainder_len, datap, length);
1269 
1270 			ctx->gcm_remainder_len += length;
1271 			if (ctx->gcm_copy_to == NULL) {
1272 				ctx->gcm_copy_to = datap;
1273 			}
1274 			return (CRYPTO_SUCCESS);
1275 		} else {
1276 			/* Complete incomplete block. */
1277 			memcpy((uint8_t *)ctx->gcm_remainder +
1278 			    ctx->gcm_remainder_len, datap, need);
1279 
1280 			ctx->gcm_copy_to = NULL;
1281 		}
1282 	}
1283 
1284 	/* Allocate a buffer to encrypt to if there is enough input. */
1285 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1286 		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1287 		if (ct_buf == NULL) {
1288 			return (CRYPTO_HOST_MEMORY);
1289 		}
1290 	}
1291 
1292 	/* If we completed an incomplete block, encrypt and write it out. */
1293 	if (ctx->gcm_remainder_len > 0) {
1294 		kfpu_begin();
1295 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1296 		    (const uint32_t *)cb, (uint32_t *)tmp);
1297 
1298 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1299 		GHASH_AVX(ctx, tmp, block_size);
1300 		clear_fpu_regs();
1301 		kfpu_end();
1302 		rv = crypto_put_output_data(tmp, out, block_size);
1303 		out->cd_offset += block_size;
1304 		gcm_incr_counter_block(ctx);
1305 		ctx->gcm_processed_data_len += block_size;
1306 		bleft -= need;
1307 		datap += need;
1308 		ctx->gcm_remainder_len = 0;
1309 	}
1310 
1311 	/* Do the bulk encryption in chunk_size blocks. */
1312 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1313 		kfpu_begin();
1314 		done = encrypt_blocks(
1315 		    datap, ct_buf, chunk_size, key, cb, htable, ghash);
1316 
1317 		clear_fpu_regs();
1318 		kfpu_end();
1319 		if (done != chunk_size) {
1320 			rv = CRYPTO_FAILED;
1321 			goto out_nofpu;
1322 		}
1323 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1324 		if (rv != CRYPTO_SUCCESS) {
1325 			goto out_nofpu;
1326 		}
1327 		out->cd_offset += chunk_size;
1328 		datap += chunk_size;
1329 		ctx->gcm_processed_data_len += chunk_size;
1330 	}
1331 	/* Check if we are already done. */
1332 	if (bleft == 0) {
1333 		goto out_nofpu;
1334 	}
1335 	/* Bulk encrypt the remaining data. */
1336 	kfpu_begin();
1337 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1338 		done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
1339 		    ghash);
1340 		if (done == 0) {
1341 			rv = CRYPTO_FAILED;
1342 			goto out;
1343 		}
1344 		rv = crypto_put_output_data(ct_buf, out, done);
1345 		if (rv != CRYPTO_SUCCESS) {
1346 			goto out;
1347 		}
1348 		out->cd_offset += done;
1349 		ctx->gcm_processed_data_len += done;
1350 		datap += done;
1351 		bleft -= done;
1352 
1353 	}
1354 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1355 	while (bleft > 0) {
1356 		if (bleft < block_size) {
1357 			memcpy(ctx->gcm_remainder, datap, bleft);
1358 			ctx->gcm_remainder_len = bleft;
1359 			ctx->gcm_copy_to = datap;
1360 			goto out;
1361 		}
1362 		/* Encrypt, hash and write out. */
1363 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1364 		    (const uint32_t *)cb, (uint32_t *)tmp);
1365 
1366 		gcm_xor_avx(datap, tmp);
1367 		GHASH_AVX(ctx, tmp, block_size);
1368 		rv = crypto_put_output_data(tmp, out, block_size);
1369 		if (rv != CRYPTO_SUCCESS) {
1370 			goto out;
1371 		}
1372 		out->cd_offset += block_size;
1373 		gcm_incr_counter_block(ctx);
1374 		ctx->gcm_processed_data_len += block_size;
1375 		datap += block_size;
1376 		bleft -= block_size;
1377 	}
1378 out:
1379 	clear_fpu_regs();
1380 	kfpu_end();
1381 out_nofpu:
1382 	if (ct_buf != NULL) {
1383 		vmem_free(ct_buf, chunk_size);
1384 	}
1385 	return (rv);
1386 }
1387 
1388 /*
1389  * Finalize the encryption: Zero fill, encrypt, hash and write out any
1390  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1391  */
1392 static int
1393 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1394 {
1395 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1396 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1397 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1398 	size_t rem_len = ctx->gcm_remainder_len;
1399 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1400 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1401 	int rv;
1402 
1403 	ASSERT(block_size == GCM_BLOCK_LEN);
1404 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1405 	    B_FALSE);
1406 
1407 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1408 		return (CRYPTO_DATA_LEN_RANGE);
1409 	}
1410 
1411 	kfpu_begin();
1412 	/* Pad last incomplete block with zeros, encrypt and hash. */
1413 	if (rem_len > 0) {
1414 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1415 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1416 
1417 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1418 		memset(remainder + rem_len, 0, block_size - rem_len);
1419 		for (int i = 0; i < rem_len; i++) {
1420 			remainder[i] ^= tmp[i];
1421 		}
1422 		GHASH_AVX(ctx, remainder, block_size);
1423 		ctx->gcm_processed_data_len += rem_len;
1424 		/* No need to increment counter_block, it's the last block. */
1425 	}
1426 	/* Finish tag. */
1427 	ctx->gcm_len_a_len_c[1] =
1428 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1429 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1430 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1431 
1432 	gcm_xor_avx((uint8_t *)J0, ghash);
1433 	clear_fpu_regs();
1434 	kfpu_end();
1435 
1436 	/* Output remainder. */
1437 	if (rem_len > 0) {
1438 		rv = crypto_put_output_data(remainder, out, rem_len);
1439 		if (rv != CRYPTO_SUCCESS)
1440 			return (rv);
1441 	}
1442 	out->cd_offset += rem_len;
1443 	ctx->gcm_remainder_len = 0;
1444 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1445 	if (rv != CRYPTO_SUCCESS)
1446 		return (rv);
1447 
1448 	out->cd_offset += ctx->gcm_tag_len;
1449 	return (CRYPTO_SUCCESS);
1450 }
1451 
1452 static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out,
1453     size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
1454     uint64_t *Xip)
1455 {
1456 	(void) Htable;
1457 	return (aesni_gcm_decrypt(in, out, len, key, iv, Xip));
1458 }
1459 
1460 #if CAN_USE_GCM_ASM >= 2
1461 static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
1462     size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
1463     uint64_t *Xip)
1464 {
1465 	uint8_t *ivec = (uint8_t *)iv;
1466 	len &= kSizeTWithoutLower4Bits;
1467 	aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec,
1468 	    (const uint128_t *)Htable, (uint8_t *)Xip);
1469 	CRYPTO_store_u32_be(&ivec[12],
1470 	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
1471 	return (len);
1472 }
1473 #endif /* if CAN_USE_GCM_ASM >= 2 */
1474 
1475 /*
1476  * Finalize decryption: so far we have only accumulated ciphertext, so now
1477  * we decrypt it here in place.
1478  */
1479 static int
1480 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1481 {
1482 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1483 	ASSERT3U(block_size, ==, 16);
1484 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1485 	    B_FALSE);
1486 
1487 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1488 	aesni_gcm_decrypt_impl *decrypt_blocks =
1489 #if CAN_USE_GCM_ASM >= 2
1490 	    ctx->impl == GCM_IMPL_AVX2 ?
1491 	    aesni_gcm_decrypt_avx2 :
1492 #endif
1493 	    aesni_gcm_decrypt_avx;
1494 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1495 	uint8_t *datap = ctx->gcm_pt_buf;
1496 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1497 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1498 	uint64_t *htable = ctx->gcm_Htable;
1499 	uint64_t *ghash = ctx->gcm_ghash;
1500 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1501 	int rv = CRYPTO_SUCCESS;
1502 	size_t bleft, done;
1503 
1504 	/*
1505 	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1506 	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1507 	 * GCM_AVX_MIN_DECRYPT_BYTES.
1508 	 */
1509 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1510 		kfpu_begin();
1511 		done = decrypt_blocks(datap, datap, chunk_size,
1512 		    (const void *)key, ctx->gcm_cb, htable, ghash);
1513 		clear_fpu_regs();
1514 		kfpu_end();
1515 		if (done != chunk_size) {
1516 			return (CRYPTO_FAILED);
1517 		}
1518 		datap += done;
1519 	}
1520 	/* Decrypt remainder, which is less than chunk size, in one go. */
1521 	kfpu_begin();
1522 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1523 		done = decrypt_blocks(datap, datap, bleft,
1524 		    (const void *)key, ctx->gcm_cb, htable, ghash);
1525 		if (done == 0) {
1526 			clear_fpu_regs();
1527 			kfpu_end();
1528 			return (CRYPTO_FAILED);
1529 		}
1530 		datap += done;
1531 		bleft -= done;
1532 	}
1533 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1534 
1535 	/*
1536 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1537 	 * decrypt them block by block.
1538 	 */
1539 	while (bleft > 0) {
1540 		/* Incomplete last block. */
1541 		if (bleft < block_size) {
1542 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1543 
1544 			memset(lastb, 0, block_size);
1545 			memcpy(lastb, datap, bleft);
1546 			/* The GCM processing. */
1547 			GHASH_AVX(ctx, lastb, block_size);
1548 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1549 			for (size_t i = 0; i < bleft; i++) {
1550 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1551 			}
1552 			break;
1553 		}
1554 		/* The GCM processing. */
1555 		GHASH_AVX(ctx, datap, block_size);
1556 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1557 		gcm_xor_avx((uint8_t *)tmp, datap);
1558 		gcm_incr_counter_block(ctx);
1559 
1560 		datap += block_size;
1561 		bleft -= block_size;
1562 	}
1563 	if (rv != CRYPTO_SUCCESS) {
1564 		clear_fpu_regs();
1565 		kfpu_end();
1566 		return (rv);
1567 	}
1568 	/* Decryption done, finish the tag. */
1569 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1570 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1571 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1572 	    (uint32_t *)ctx->gcm_J0);
1573 
1574 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1575 
1576 	/* We are done with the FPU, restore its state. */
1577 	clear_fpu_regs();
1578 	kfpu_end();
1579 
1580 	/* Compare the input authentication tag with what we calculated. */
1581 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1582 		/* They don't match. */
1583 		return (CRYPTO_INVALID_MAC);
1584 	}
1585 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1586 	if (rv != CRYPTO_SUCCESS) {
1587 		return (rv);
1588 	}
1589 	out->cd_offset += pt_len;
1590 	return (CRYPTO_SUCCESS);
1591 }
1592 
1593 /*
1594  * Initialize the GCM params H, Htable and the counter block. Save the
1595  * initial counter block.
1596  */
1597 static int
1598 gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1599     const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1600 {
1601 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1602 	uint64_t *H = ctx->gcm_H;
1603 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1604 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1605 	const uint8_t *datap = auth_data;
1606 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1607 	size_t bleft;
1608 
1609 	ASSERT(block_size == GCM_BLOCK_LEN);
1610 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1611 	    B_FALSE);
1612 
1613 	size_t htab_len = 0;
1614 #if CAN_USE_GCM_ASM >= 2
1615 	if (ctx->impl == GCM_IMPL_AVX2) {
1616 		/*
1617 		 * BoringSSL's API specifies uint128_t[16] for htab; but only
1618 		 * uint128_t[16] for htab, but only
1619 		 * See https://github.com/google/boringssl/blob/
1620 		 * 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
1621 		 * modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
1622 		 */
1623 		htab_len = (2 * 8 * sizeof (uint128_t));
1624 	} else
1625 #endif /* CAN_USE_GCM_ASM >= 2 */
1626 	{
1627 		htab_len = (2 * 6 * sizeof (uint128_t));
1628 	}
1629 
1630 	ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
1631 	if (ctx->gcm_Htable == NULL) {
1632 		return (CRYPTO_HOST_MEMORY);
1633 	}
1634 
1635 	/* Init H (encrypt zero block) and create the initial counter block. */
1636 	memset(H, 0, sizeof (ctx->gcm_H));
1637 	kfpu_begin();
1638 	aes_encrypt_intel(keysched, aes_rounds,
1639 	    (const uint32_t *)H, (uint32_t *)H);
1640 
1641 #if CAN_USE_GCM_ASM >= 2
1642 	if (ctx->impl == GCM_IMPL_AVX2) {
1643 		gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
1644 	} else
1645 #endif /* if CAN_USE_GCM_ASM >= 2 */
1646 	{
1647 		gcm_init_htab_avx(ctx->gcm_Htable, H);
1648 	}
1649 
1650 	if (iv_len == 12) {
1651 		memcpy(cb, iv, 12);
1652 		cb[12] = 0;
1653 		cb[13] = 0;
1654 		cb[14] = 0;
1655 		cb[15] = 1;
1656 		/* We need the ICB later. */
1657 		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1658 	} else {
1659 		/*
1660 		 * Most consumers use 12 byte IVs, so it's OK to use the
1661 		 * original routines for other IV sizes, just avoid nesting
1662 		 * kfpu_begin calls.
1663 		 */
1664 		clear_fpu_regs();
1665 		kfpu_end();
1666 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1667 		    aes_copy_block, aes_xor_block);
1668 		kfpu_begin();
1669 	}
1670 
1671 	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1672 
1673 	/* Openssl post increments the counter, adjust for that. */
1674 	gcm_incr_counter_block(ctx);
1675 
1676 	/* Ghash AAD in chunk_size blocks. */
1677 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1678 		GHASH_AVX(ctx, datap, chunk_size);
1679 		datap += chunk_size;
1680 		clear_fpu_regs();
1681 		kfpu_end();
1682 		kfpu_begin();
1683 	}
1684 	/* Ghash the remainder and handle possible incomplete GCM block. */
1685 	if (bleft > 0) {
1686 		size_t incomp = bleft % block_size;
1687 
1688 		bleft -= incomp;
1689 		if (bleft > 0) {
1690 			GHASH_AVX(ctx, datap, bleft);
1691 			datap += bleft;
1692 		}
1693 		if (incomp > 0) {
1694 			/* Zero pad and hash incomplete last block. */
1695 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1696 
1697 			memset(authp, 0, block_size);
1698 			memcpy(authp, datap, incomp);
1699 			GHASH_AVX(ctx, authp, block_size);
1700 		}
1701 	}
1702 	clear_fpu_regs();
1703 	kfpu_end();
1704 	return (CRYPTO_SUCCESS);
1705 }
1706 
1707 #if defined(_KERNEL)
1708 static int
1709 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1710 {
1711 	unsigned long val;
1712 	char val_rounded[16];
1713 	int error = 0;
1714 
1715 	error = kstrtoul(buf, 0, &val);
1716 	if (error)
1717 		return (error);
1718 
1719 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1720 
1721 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1722 		return (-EINVAL);
1723 
1724 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1725 	error = param_set_uint(val_rounded, kp);
1726 	return (error);
1727 }
1728 
1729 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1730     param_get_uint, &gcm_avx_chunk_size, 0644);
1731 
1732 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1733 	"How many bytes to process while owning the FPU");
1734 
1735 #endif /* defined(_KERNEL) */
1736 #endif /* ifdef CAN_USE_GCM_ASM */
1737