1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/cmn_err.h>
28 #include <modes/modes.h>
29 #include <sys/crypto/common.h>
30 #include <sys/crypto/icp.h>
31 #include <sys/crypto/impl.h>
32 #include <sys/byteorder.h>
33 #include <sys/simd.h>
34 #include <modes/gcm_impl.h>
35 #ifdef CAN_USE_GCM_ASM
36 #include <aes/aes_impl.h>
37 #endif
38 
39 #define	GHASH(c, d, t, o) \
40 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
41 	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
42 	(uint64_t *)(void *)(t));
43 
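/*
 * For illustration: a call like GHASH(ctx, blockp, ghash, gops) folds one
 * 16-byte block into the running hash.  It XORs the block into gcm_ghash and
 * then multiplies by the hash subkey H in GF(2^128) using the selected
 * implementation's mul() routine, roughly:
 *
 *	xor_block((uint8_t *)blockp, (uint8_t *)ctx->gcm_ghash);
 *	gops->mul((uint64_t *)(void *)ctx->gcm_ghash, ctx->gcm_H,
 *	    (uint64_t *)(void *)ghash);
 */
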
44 /* Select GCM implementation */
45 #define	IMPL_FASTEST	(UINT32_MAX)
46 #define	IMPL_CYCLE	(UINT32_MAX-1)
47 #ifdef CAN_USE_GCM_ASM
48 #define	IMPL_AVX	(UINT32_MAX-2)
49 #endif
50 #define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
51 static uint32_t icp_gcm_impl = IMPL_FASTEST;
52 static uint32_t user_sel_impl = IMPL_FASTEST;
53 
54 #ifdef CAN_USE_GCM_ASM
55 /* Does the architecture we run on support the MOVBE instruction? */
56 boolean_t gcm_avx_can_use_movbe = B_FALSE;
57 /*
58  * Whether to use the optimized openssl gcm and ghash implementations.
59  * Set to true if module parameter icp_gcm_impl == "avx".
60  */
61 static boolean_t gcm_use_avx = B_FALSE;
62 #define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)
63 
64 extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
65 
66 static inline boolean_t gcm_avx_will_work(void);
67 static inline void gcm_set_avx(boolean_t);
68 static inline boolean_t gcm_toggle_avx(void);
69 static inline size_t gcm_simd_get_htab_size(boolean_t);
70 
71 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
72     crypto_data_t *, size_t);
73 
74 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
75 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
76 static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
77     size_t, size_t);
78 #endif /* ifdef CAN_USE_GCM_ASM */
79 
80 /*
81  * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
82  * is done in another function.
83  */
84 int
85 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
86     crypto_data_t *out, size_t block_size,
87     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
88     void (*copy_block)(uint8_t *, uint8_t *),
89     void (*xor_block)(uint8_t *, uint8_t *))
90 {
91 #ifdef CAN_USE_GCM_ASM
92 	if (ctx->gcm_use_avx == B_TRUE)
93 		return (gcm_mode_encrypt_contiguous_blocks_avx(
94 		    ctx, data, length, out, block_size));
95 #endif
96 
97 	const gcm_impl_ops_t *gops;
98 	size_t remainder = length;
99 	size_t need = 0;
100 	uint8_t *datap = (uint8_t *)data;
101 	uint8_t *blockp;
102 	uint8_t *lastp;
103 	void *iov_or_mp;
104 	offset_t offset;
105 	uint8_t *out_data_1;
106 	uint8_t *out_data_2;
107 	size_t out_data_1_len;
108 	uint64_t counter;
109 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
110 
111 	if (length + ctx->gcm_remainder_len < block_size) {
112 		/* accumulate bytes here and return */
113 		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
114 		    datap,
115 		    length);
116 		ctx->gcm_remainder_len += length;
117 		if (ctx->gcm_copy_to == NULL) {
118 			ctx->gcm_copy_to = datap;
119 		}
120 		return (CRYPTO_SUCCESS);
121 	}
122 
123 	crypto_init_ptrs(out, &iov_or_mp, &offset);
124 
125 	gops = gcm_impl_get_ops();
126 	do {
127 		/* Unprocessed data from last call. */
128 		if (ctx->gcm_remainder_len > 0) {
129 			need = block_size - ctx->gcm_remainder_len;
130 
131 			if (need > remainder)
132 				return (CRYPTO_DATA_LEN_RANGE);
133 
134 			memcpy(&((uint8_t *)ctx->gcm_remainder)
135 			    [ctx->gcm_remainder_len], datap, need);
136 
137 			blockp = (uint8_t *)ctx->gcm_remainder;
138 		} else {
139 			blockp = datap;
140 		}
141 
142 		/*
143 		 * Increment counter. Counter bits are confined
144 		 * to the bottom 32 bits of the counter block.
145 		 */
146 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
147 		counter = htonll(counter + 1);
148 		counter &= counter_mask;
149 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
150 
151 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
152 		    (uint8_t *)ctx->gcm_tmp);
153 		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
154 
155 		lastp = (uint8_t *)ctx->gcm_tmp;
156 
157 		ctx->gcm_processed_data_len += block_size;
158 
159 		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
160 		    &out_data_1_len, &out_data_2, block_size);
161 
162 		/* copy block to where it belongs */
163 		if (out_data_1_len == block_size) {
164 			copy_block(lastp, out_data_1);
165 		} else {
166 			memcpy(out_data_1, lastp, out_data_1_len);
167 			if (out_data_2 != NULL) {
168 				memcpy(out_data_2,
169 				    lastp + out_data_1_len,
170 				    block_size - out_data_1_len);
171 			}
172 		}
173 		/* update offset */
174 		out->cd_offset += block_size;
175 
176 		/* add ciphertext to the hash */
177 		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
178 
179 		/* Update pointer to next block of data to be processed. */
180 		if (ctx->gcm_remainder_len != 0) {
181 			datap += need;
182 			ctx->gcm_remainder_len = 0;
183 		} else {
184 			datap += block_size;
185 		}
186 
187 		remainder = (size_t)&data[length] - (size_t)datap;
188 
189 		/* Incomplete last block. */
190 		if (remainder > 0 && remainder < block_size) {
191 			memcpy(ctx->gcm_remainder, datap, remainder);
192 			ctx->gcm_remainder_len = remainder;
193 			ctx->gcm_copy_to = datap;
194 			goto out;
195 		}
196 		ctx->gcm_copy_to = NULL;
197 
198 	} while (remainder > 0);
199 out:
200 	return (CRYPTO_SUCCESS);
201 }
202 
203 int
204 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
205     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
206     void (*copy_block)(uint8_t *, uint8_t *),
207     void (*xor_block)(uint8_t *, uint8_t *))
208 {
209 	(void) copy_block;
210 #ifdef CAN_USE_GCM_ASM
211 	if (ctx->gcm_use_avx == B_TRUE)
212 		return (gcm_encrypt_final_avx(ctx, out, block_size));
213 #endif
214 
215 	const gcm_impl_ops_t *gops;
216 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
217 	uint8_t *ghash, *macp = NULL;
218 	int i, rv;
219 
220 	if (out->cd_length <
221 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
222 		return (CRYPTO_DATA_LEN_RANGE);
223 	}
224 
225 	gops = gcm_impl_get_ops();
226 	ghash = (uint8_t *)ctx->gcm_ghash;
227 
228 	if (ctx->gcm_remainder_len > 0) {
229 		uint64_t counter;
230 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
231 
232 		/*
233 		 * Here is where we deal with data that is not a
234 		 * multiple of the block size.
235 		 */
236 
237 		/*
238 		 * Increment counter.
239 		 */
240 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
241 		counter = htonll(counter + 1);
242 		counter &= counter_mask;
243 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
244 
245 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
246 		    (uint8_t *)ctx->gcm_tmp);
247 
248 		macp = (uint8_t *)ctx->gcm_remainder;
249 		memset(macp + ctx->gcm_remainder_len, 0,
250 		    block_size - ctx->gcm_remainder_len);
251 
252 		/* XOR with counter block */
253 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
254 			macp[i] ^= tmpp[i];
255 		}
256 
257 		/* add ciphertext to the hash */
258 		GHASH(ctx, macp, ghash, gops);
259 
260 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
261 	}
262 
263 	ctx->gcm_len_a_len_c[1] =
264 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
265 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
266 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
267 	    (uint8_t *)ctx->gcm_J0);
268 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
269 
270 	if (ctx->gcm_remainder_len > 0) {
271 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
272 		if (rv != CRYPTO_SUCCESS)
273 			return (rv);
274 	}
275 	out->cd_offset += ctx->gcm_remainder_len;
276 	ctx->gcm_remainder_len = 0;
277 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
278 	if (rv != CRYPTO_SUCCESS)
279 		return (rv);
280 	out->cd_offset += ctx->gcm_tag_len;
281 
282 	return (CRYPTO_SUCCESS);
283 }
284 
285 /*
286  * This only deals with decrypting the last block of the input, which
287  * might not be a full block.
288  */
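/*
 * In other words, after the zero-padded ciphertext block has been folded
 * into GHASH, the partial block is handled as plain CTR mode:
 *
 *	P[i] = C[i] ^ E_K(CB)[i]	for 0 <= i < gcm_remainder_len
 */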
289 static void
290 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
291     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
292     void (*xor_block)(uint8_t *, uint8_t *))
293 {
294 	uint8_t *datap, *outp, *counterp;
295 	uint64_t counter;
296 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
297 	int i;
298 
299 	/*
300 	 * Increment counter.
301 	 * Counter bits are confined to the bottom 32 bits
302 	 */
303 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
304 	counter = htonll(counter + 1);
305 	counter &= counter_mask;
306 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
307 
308 	datap = (uint8_t *)ctx->gcm_remainder;
309 	outp = &((ctx->gcm_pt_buf)[index]);
310 	counterp = (uint8_t *)ctx->gcm_tmp;
311 
312 	/* authentication tag */
313 	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
314 	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
315 
316 	/* add ciphertext to the hash */
317 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
318 
319 	/* decrypt remaining ciphertext */
320 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
321 
322 	/* XOR with counter block */
323 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
324 		outp[i] = datap[i] ^ counterp[i];
325 	}
326 }
327 
328 int
329 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
330     crypto_data_t *out, size_t block_size,
331     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
332     void (*copy_block)(uint8_t *, uint8_t *),
333     void (*xor_block)(uint8_t *, uint8_t *))
334 {
335 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
336 	    (void) xor_block;
337 	size_t new_len;
338 	uint8_t *new;
339 
340 	/*
341 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
342 	 * The ciphertext will be decrypted in gcm_decrypt_final().
343 	 */
344 	if (length > 0) {
345 		new_len = ctx->gcm_pt_buf_len + length;
346 		new = vmem_alloc(new_len, KM_SLEEP);
347 		if (new == NULL) {
348 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
349 			ctx->gcm_pt_buf = NULL;
350 			return (CRYPTO_HOST_MEMORY);
351 		}
352 
353 		if (ctx->gcm_pt_buf != NULL) {
354 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
355 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
356 		} else {
357 			ASSERT0(ctx->gcm_pt_buf_len);
358 		}
359 
360 		ctx->gcm_pt_buf = new;
361 		ctx->gcm_pt_buf_len = new_len;
362 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
363 		    length);
364 		ctx->gcm_processed_data_len += length;
365 	}
366 
367 	ctx->gcm_remainder_len = 0;
368 	return (CRYPTO_SUCCESS);
369 }
370 
371 int
372 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
373     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
374     void (*xor_block)(uint8_t *, uint8_t *))
375 {
376 #ifdef CAN_USE_GCM_ASM
377 	if (ctx->gcm_use_avx == B_TRUE)
378 		return (gcm_decrypt_final_avx(ctx, out, block_size));
379 #endif
380 
381 	const gcm_impl_ops_t *gops;
382 	size_t pt_len;
383 	size_t remainder;
384 	uint8_t *ghash;
385 	uint8_t *blockp;
386 	uint8_t *cbp;
387 	uint64_t counter;
388 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
389 	int processed = 0, rv;
390 
391 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
392 
393 	gops = gcm_impl_get_ops();
394 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
395 	ghash = (uint8_t *)ctx->gcm_ghash;
396 	blockp = ctx->gcm_pt_buf;
397 	remainder = pt_len;
398 	while (remainder > 0) {
399 		/* Incomplete last block */
400 		if (remainder < block_size) {
401 			memcpy(ctx->gcm_remainder, blockp, remainder);
402 			ctx->gcm_remainder_len = remainder;
403 			/*
404 			 * Not expecting any more ciphertext; just
405 			 * compute plaintext for the remaining input.
406 			 */
407 			gcm_decrypt_incomplete_block(ctx, block_size,
408 			    processed, encrypt_block, xor_block);
409 			ctx->gcm_remainder_len = 0;
410 			goto out;
411 		}
412 		/* add ciphertext to the hash */
413 		GHASH(ctx, blockp, ghash, gops);
414 
415 		/*
416 		 * Increment counter.
417 		 * Counter bits are confined to the bottom 32 bits
418 		 */
419 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
420 		counter = htonll(counter + 1);
421 		counter &= counter_mask;
422 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
423 
424 		cbp = (uint8_t *)ctx->gcm_tmp;
425 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
426 
427 		/* XOR with ciphertext */
428 		xor_block(cbp, blockp);
429 
430 		processed += block_size;
431 		blockp += block_size;
432 		remainder -= block_size;
433 	}
434 out:
435 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
436 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
437 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
438 	    (uint8_t *)ctx->gcm_J0);
439 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
440 
441 	/* compare the input authentication tag with what we calculated */
442 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
443 		/* They don't match */
444 		return (CRYPTO_INVALID_MAC);
445 	} else {
446 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
447 		if (rv != CRYPTO_SUCCESS)
448 			return (rv);
449 		out->cd_offset += pt_len;
450 	}
451 	return (CRYPTO_SUCCESS);
452 }
453 
454 static int
455 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
456 {
457 	size_t tag_len;
458 
459 	/*
460 	 * Check the length of the authentication tag (in bits).
461 	 */
462 	tag_len = gcm_param->ulTagBits;
463 	switch (tag_len) {
464 	case 32:
465 	case 64:
466 	case 96:
467 	case 104:
468 	case 112:
469 	case 120:
470 	case 128:
471 		break;
472 	default:
473 		return (CRYPTO_MECHANISM_PARAM_INVALID);
474 	}
475 
476 	if (gcm_param->ulIvLen == 0)
477 		return (CRYPTO_MECHANISM_PARAM_INVALID);
478 
479 	return (CRYPTO_SUCCESS);
480 }
481 
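/*
 * A hypothetical parameter block that passes the checks above (field names
 * as used in this file, values chosen only as an example):
 *
 *	CK_AES_GCM_PARAMS p;
 *
 *	p.pIv = iv_buf;			(ulIvLen must be non-zero)
 *	p.ulIvLen = 12;
 *	p.pAAD = aad_buf;
 *	p.ulAADLen = aad_len;
 *	p.ulTagBits = 128;		(one of the tag lengths accepted above)
 *
 *	if (gcm_validate_args(&p) == CRYPTO_SUCCESS)
 *		... proceed with gcm_init_ctx() ...
 */
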
482 static void
483 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
484     gcm_ctx_t *ctx, size_t block_size,
485     void (*copy_block)(uint8_t *, uint8_t *),
486     void (*xor_block)(uint8_t *, uint8_t *))
487 {
488 	const gcm_impl_ops_t *gops;
489 	uint8_t *cb;
490 	ulong_t remainder = iv_len;
491 	ulong_t processed = 0;
492 	uint8_t *datap, *ghash;
493 	uint64_t len_a_len_c[2];
494 
495 	gops = gcm_impl_get_ops();
496 	ghash = (uint8_t *)ctx->gcm_ghash;
497 	cb = (uint8_t *)ctx->gcm_cb;
498 	if (iv_len == 12) {
499 		memcpy(cb, iv, 12);
500 		cb[12] = 0;
501 		cb[13] = 0;
502 		cb[14] = 0;
503 		cb[15] = 1;
504 		/* J0 will be used again in the final */
505 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
506 	} else {
507 		/* GHASH the IV */
508 		do {
509 			if (remainder < block_size) {
510 				memset(cb, 0, block_size);
511 				memcpy(cb, &(iv[processed]), remainder);
512 				datap = (uint8_t *)cb;
513 				remainder = 0;
514 			} else {
515 				datap = (uint8_t *)(&(iv[processed]));
516 				processed += block_size;
517 				remainder -= block_size;
518 			}
519 			GHASH(ctx, datap, ghash, gops);
520 		} while (remainder > 0);
521 
522 		len_a_len_c[0] = 0;
523 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
524 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
525 
526 		/* J0 will be used again in the final */
527 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
528 	}
529 }
530 
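/*
 * In NIST SP 800-38D terms, the code above computes the pre-counter block
 *
 *	J0 = IV || 0^31 || 1				if len(IV) == 96 bits
 *	J0 = GHASH_H(IV || 0^(s+64) || [len(IV)]_64)	otherwise
 *
 * The encrypt/decrypt paths start counting from inc_32(J0), while J0 itself
 * is kept for the final tag computation.
 */
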
531 static int
532 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
533     const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
534     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
535     void (*copy_block)(uint8_t *, uint8_t *),
536     void (*xor_block)(uint8_t *, uint8_t *))
537 {
538 	const gcm_impl_ops_t *gops;
539 	uint8_t *ghash, *datap, *authp;
540 	size_t remainder, processed;
541 
542 	/* encrypt zero block to get subkey H */
543 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
544 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
545 	    (uint8_t *)ctx->gcm_H);
546 
547 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
548 	    copy_block, xor_block);
549 
550 	gops = gcm_impl_get_ops();
551 	authp = (uint8_t *)ctx->gcm_tmp;
552 	ghash = (uint8_t *)ctx->gcm_ghash;
553 	memset(authp, 0, block_size);
554 	memset(ghash, 0, block_size);
555 
556 	processed = 0;
557 	remainder = auth_data_len;
558 	do {
559 		if (remainder < block_size) {
560 			/*
561 			 * There's not a block full of data, pad rest of
562 			 * buffer with zero
563 			 */
564 
565 			if (auth_data != NULL) {
566 				memset(authp, 0, block_size);
567 				memcpy(authp, &(auth_data[processed]),
568 				    remainder);
569 			} else {
570 				ASSERT0(remainder);
571 			}
572 
573 			datap = (uint8_t *)authp;
574 			remainder = 0;
575 		} else {
576 			datap = (uint8_t *)(&(auth_data[processed]));
577 			processed += block_size;
578 			remainder -= block_size;
579 		}
580 
581 		/* add auth data to the hash */
582 		GHASH(ctx, datap, ghash, gops);
583 
584 	} while (remainder > 0);
585 
586 	return (CRYPTO_SUCCESS);
587 }
588 
589 /*
590  * Init the GCM context struct. Handle the cycle and avx implementations here.
591  */
592 int
593 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
594     size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
595     uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
596     void (*xor_block)(uint8_t *, uint8_t *))
597 {
598 	CK_AES_GCM_PARAMS *gcm_param;
599 	int rv = CRYPTO_SUCCESS;
600 	size_t tag_len, iv_len;
601 
602 	if (param != NULL) {
603 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
604 
605 		/* GCM mode. */
606 		if ((rv = gcm_validate_args(gcm_param)) != 0) {
607 			return (rv);
608 		}
609 		gcm_ctx->gcm_flags |= GCM_MODE;
610 
611 		size_t tbits = gcm_param->ulTagBits;
612 		tag_len = CRYPTO_BITS2BYTES(tbits);
613 		iv_len = gcm_param->ulIvLen;
614 
615 		gcm_ctx->gcm_tag_len = tag_len;
616 		gcm_ctx->gcm_processed_data_len = 0;
617 
618 		/* these values are in bits */
619 		gcm_ctx->gcm_len_a_len_c[0]
620 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
621 	} else {
622 		return (CRYPTO_MECHANISM_PARAM_INVALID);
623 	}
624 
625 	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
626 	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
627 	size_t aad_len = gcm_param->ulAADLen;
628 
629 #ifdef CAN_USE_GCM_ASM
630 	boolean_t needs_bswap =
631 	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
632 
633 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
634 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
635 	} else {
636 		/*
637 		 * Handle the "cycle" implementation by creating avx and
638 		 * non-avx contexts alternately.
639 		 */
640 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
641 
642 		/* The avx impl. doesn't handle byte swapped key schedules. */
643 		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
644 			gcm_ctx->gcm_use_avx = B_FALSE;
645 		}
646 		/*
647 		 * If this is a GCM context, use the MOVBE and the BSWAP
648 		 * variants alternately.
649 		 */
650 		if (gcm_ctx->gcm_use_avx == B_TRUE &&
651 		    zfs_movbe_available() == B_TRUE) {
652 			(void) atomic_toggle_boolean_nv(
653 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
654 		}
655 	}
656 	/*
657 	 * We don't handle byte swapped key schedules in the avx code path,
658 	 * but they could still be created by the aes generic implementation.
659 	 * Make sure not to use them, since we'd corrupt data if we did.
660 	 */
661 	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
662 		gcm_ctx->gcm_use_avx = B_FALSE;
663 
664 		cmn_err_once(CE_WARN,
665 		    "ICP: Can't use the aes generic or cycle implementations "
666 		    "in combination with the gcm avx implementation!");
667 		cmn_err_once(CE_WARN,
668 		    "ICP: Falling back to a compatible implementation, "
669 		    "aes-gcm performance will likely be degraded.");
670 		cmn_err_once(CE_WARN,
671 		    "ICP: Choose at least the x86_64 aes implementation to "
672 		    "restore performance.");
673 	}
674 
675 	/* Allocate Htab memory as needed. */
676 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
677 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
678 
679 		if (htab_len == 0) {
680 			return (CRYPTO_MECHANISM_PARAM_INVALID);
681 		}
682 		gcm_ctx->gcm_htab_len = htab_len;
683 		gcm_ctx->gcm_Htable =
684 		    kmem_alloc(htab_len, KM_SLEEP);
685 
686 		if (gcm_ctx->gcm_Htable == NULL) {
687 			return (CRYPTO_HOST_MEMORY);
688 		}
689 	}
690 	/* AVX and non-AVX context initialization differ from here on. */
691 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
692 #endif /* ifdef CAN_USE_GCM_ASM */
693 		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
694 		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
695 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
696 		}
697 #ifdef CAN_USE_GCM_ASM
698 	} else {
699 		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
700 		    block_size) != CRYPTO_SUCCESS) {
701 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
702 		}
703 	}
704 #endif /* ifdef CAN_USE_GCM_ASM */
705 
706 	return (rv);
707 }
708 
709 void *
710 gcm_alloc_ctx(int kmflag)
711 {
712 	gcm_ctx_t *gcm_ctx;
713 
714 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
715 		return (NULL);
716 
717 	gcm_ctx->gcm_flags = GCM_MODE;
718 	return (gcm_ctx);
719 }
720 
721 /* GCM implementation that contains the fastest methods */
722 static gcm_impl_ops_t gcm_fastest_impl = {
723 	.name = "fastest"
724 };
725 
726 /* All compiled in implementations */
727 static const gcm_impl_ops_t *gcm_all_impl[] = {
728 	&gcm_generic_impl,
729 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
730 	&gcm_pclmulqdq_impl,
731 #endif
732 };
733 
734 /* Indicate that benchmark has been completed */
735 static boolean_t gcm_impl_initialized = B_FALSE;
736 
737 /* Hold all supported implementations */
738 static size_t gcm_supp_impl_cnt = 0;
739 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
740 
741 /*
742  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
743  * SIMD implementation is not allowed in the current context, fall
744  * back to the generic implementation.
745  */
746 const gcm_impl_ops_t *
747 gcm_impl_get_ops(void)
748 {
749 	if (!kfpu_allowed())
750 		return (&gcm_generic_impl);
751 
752 	const gcm_impl_ops_t *ops = NULL;
753 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
754 
755 	switch (impl) {
756 	case IMPL_FASTEST:
757 		ASSERT(gcm_impl_initialized);
758 		ops = &gcm_fastest_impl;
759 		break;
760 	case IMPL_CYCLE:
761 		/* Cycle through supported implementations */
762 		ASSERT(gcm_impl_initialized);
763 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
764 		static size_t cycle_impl_idx = 0;
765 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
766 		ops = gcm_supp_impl[idx];
767 		break;
768 #ifdef CAN_USE_GCM_ASM
769 	case IMPL_AVX:
770 		/*
771 		 * Make sure that we return a valid implementation while
772 		 * switching to the avx implementation since there still
773 		 * may be unfinished non-avx contexts around.
774 		 */
775 		ops = &gcm_generic_impl;
776 		break;
777 #endif
778 	default:
779 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
780 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
781 		if (impl < ARRAY_SIZE(gcm_all_impl))
782 			ops = gcm_supp_impl[impl];
783 		break;
784 	}
785 
786 	ASSERT3P(ops, !=, NULL);
787 
788 	return (ops);
789 }
790 
791 /*
792  * Initialize all supported implementations.
793  */
794 void
795 gcm_impl_init(void)
796 {
797 	gcm_impl_ops_t *curr_impl;
798 	int i, c;
799 
800 	/* Move supported implementations into gcm_supp_impls */
801 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
802 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
803 
804 		if (curr_impl->is_supported())
805 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
806 	}
807 	gcm_supp_impl_cnt = c;
808 
809 	/*
810 	 * Set the fastest implementation given the assumption that the
811 	 * hardware accelerated version is the fastest.
812 	 */
813 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
814 	if (gcm_pclmulqdq_impl.is_supported()) {
815 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
816 		    sizeof (gcm_fastest_impl));
817 	} else
818 #endif
819 	{
820 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
821 		    sizeof (gcm_fastest_impl));
822 	}
823 
824 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
825 
826 #ifdef CAN_USE_GCM_ASM
827 	/*
828 	 * Use the avx implementation if it's available and the implementation
829 	 * hasn't changed from its default value of fastest on module load.
830 	 */
831 	if (gcm_avx_will_work()) {
832 #ifdef HAVE_MOVBE
833 		if (zfs_movbe_available() == B_TRUE) {
834 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
835 		}
836 #endif
837 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
838 			gcm_set_avx(B_TRUE);
839 		}
840 	}
841 #endif
842 	/* Finish initialization */
843 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
844 	gcm_impl_initialized = B_TRUE;
845 }
846 
847 static const struct {
848 	const char *name;
849 	uint32_t sel;
850 } gcm_impl_opts[] = {
851 		{ "cycle",	IMPL_CYCLE },
852 		{ "fastest",	IMPL_FASTEST },
853 #ifdef CAN_USE_GCM_ASM
854 		{ "avx",	IMPL_AVX },
855 #endif
856 };
857 
858 /*
859  * Function sets desired gcm implementation.
860  *
861  * If we are called before init(), user preference will be saved in
862  * user_sel_impl, and applied in a later init() call. This occurs when the
863  * module parameter is specified on module load. Otherwise, directly update
864  * icp_gcm_impl.
865  *
866  * @val		Name of gcm implementation to use
867  * @param	Unused.
868  */
869 int
870 gcm_impl_set(const char *val)
871 {
872 	int err = -EINVAL;
873 	char req_name[GCM_IMPL_NAME_MAX];
874 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
875 	size_t i;
876 
877 	/* sanitize input */
878 	i = strnlen(val, GCM_IMPL_NAME_MAX);
879 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
880 		return (err);
881 
882 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
883 	while (i > 0 && isspace(req_name[i-1]))
884 		i--;
885 	req_name[i] = '\0';
886 
887 	/* Check mandatory options */
888 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
889 #ifdef CAN_USE_GCM_ASM
890 		/* Ignore avx implementation if it won't work. */
891 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
892 			continue;
893 		}
894 #endif
895 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
896 			impl = gcm_impl_opts[i].sel;
897 			err = 0;
898 			break;
899 		}
900 	}
901 
902 	/* check all supported impl if init() was already called */
903 	if (err != 0 && gcm_impl_initialized) {
904 		/* check all supported implementations */
905 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
906 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
907 				impl = i;
908 				err = 0;
909 				break;
910 			}
911 		}
912 	}
913 #ifdef CAN_USE_GCM_ASM
914 	/*
915 	 * Use the avx implementation if available and the requested one is
916 	 * avx or fastest.
917 	 */
918 	if (gcm_avx_will_work() == B_TRUE &&
919 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
920 		gcm_set_avx(B_TRUE);
921 	} else {
922 		gcm_set_avx(B_FALSE);
923 	}
924 #endif
925 
926 	if (err == 0) {
927 		if (gcm_impl_initialized)
928 			atomic_swap_32(&icp_gcm_impl, impl);
929 		else
930 			atomic_swap_32(&user_sel_impl, impl);
931 	}
932 
933 	return (err);
934 }
935 
936 #if defined(_KERNEL) && defined(__linux__)
937 
938 static int
939 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
940 {
941 	return (gcm_impl_set(val));
942 }
943 
944 static int
945 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
946 {
947 	int i, cnt = 0;
948 	char *fmt;
949 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
950 
951 	ASSERT(gcm_impl_initialized);
952 
953 	/* list mandatory options */
954 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
955 #ifdef CAN_USE_GCM_ASM
956 		/* Ignore avx implementation if it won't work. */
957 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
958 			continue;
959 		}
960 #endif
961 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
962 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
963 		    gcm_impl_opts[i].name);
964 	}
965 
966 	/* list all supported implementations */
967 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
968 		fmt = (i == impl) ? "[%s] " : "%s ";
969 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
970 		    gcm_supp_impl[i]->name);
971 	}
972 
973 	return (cnt);
974 }
975 
976 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
977     NULL, 0644);
978 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
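
/*
 * On Linux the active implementation can typically be inspected and changed
 * at runtime through sysfs (the exact path is an assumption and depends on
 * how the icp module is packaged):
 *
 *	cat /sys/module/icp/parameters/icp_gcm_impl
 *	echo avx > /sys/module/icp/parameters/icp_gcm_impl
 *
 * The bracketed entry in the output of the first command marks the current
 * selection, as formatted by icp_gcm_impl_get() above.
 */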
979 #endif /* defined(_KERNEL) && defined(__linux__) */
980 
981 #ifdef CAN_USE_GCM_ASM
982 #define	GCM_BLOCK_LEN 16
983 /*
984  * The openssl asm routines are 6x aggregated and need that many bytes
985  * at minimum.
986  */
987 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
988 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
989 /*
990  * Ensure the chunk size is reasonable, since we are allocating a buffer of
991  * up to GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts.
992  */
993 #define	GCM_AVX_MAX_CHUNK_SIZE \
994 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
995 
996 /* Clear the FPU registers since they hold sensitive internal state. */
997 #define	clear_fpu_regs() clear_fpu_regs_avx()
998 #define	GHASH_AVX(ctx, in, len) \
999     gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
1000     in, len)
1001 
1002 #define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1003 
1004 /* Get the chunk size module parameter. */
1005 #define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1006 
1007 /*
1008  * Module parameter: number of bytes to process at once while owning the FPU.
1009  * It is rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES boundary and
1010  * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
1011  */
1012 static uint32_t gcm_avx_chunk_size =
1013 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1014 
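/*
 * Worked out: GCM_AVX_MIN_DECRYPT_BYTES is 16 * 6 = 96, so the default chunk
 * size is (32768 / 96) * 96 = 341 * 96 = 32736 bytes, and
 * GCM_AVX_MAX_CHUNK_SIZE is (131072 / 96) * 96 = 1365 * 96 = 131040 bytes.
 * Both are exact multiples of the 6-block (96-byte) granularity the openssl
 * asm routines operate on.
 */
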
1015 extern void ASMABI clear_fpu_regs_avx(void);
1016 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1017 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1018     const uint32_t pt[4], uint32_t ct[4]);
1019 
1020 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1021 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1022     const uint8_t *in, size_t len);
1023 
1024 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1025     const void *, uint64_t *, uint64_t *);
1026 
1027 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1028     const void *, uint64_t *, uint64_t *);
1029 
1030 static inline boolean_t
1031 gcm_avx_will_work(void)
1032 {
1033 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1034 	return (kfpu_allowed() &&
1035 	    zfs_avx_available() && zfs_aes_available() &&
1036 	    zfs_pclmulqdq_available());
1037 }
1038 
1039 static inline void
1040 gcm_set_avx(boolean_t val)
1041 {
1042 	if (gcm_avx_will_work() == B_TRUE) {
1043 		atomic_swap_32(&gcm_use_avx, val);
1044 	}
1045 }
1046 
1047 static inline boolean_t
1048 gcm_toggle_avx(void)
1049 {
1050 	if (gcm_avx_will_work() == B_TRUE) {
1051 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1052 	} else {
1053 		return (B_FALSE);
1054 	}
1055 }
1056 
1057 static inline size_t
1058 gcm_simd_get_htab_size(boolean_t simd_mode)
1059 {
1060 	switch (simd_mode) {
1061 	case B_TRUE:
1062 		return (2 * 6 * 2 * sizeof (uint64_t));
1063 
1064 	default:
1065 		return (0);
1066 	}
1067 }
1068 
1069 
1070 /* Increment the GCM counter block by n. */
1071 static inline void
1072 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1073 {
1074 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1075 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1076 
1077 	counter = htonll(counter + n);
1078 	counter &= counter_mask;
1079 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1080 }
1081 
1082 /*
1083  * Encrypt multiple blocks of data in GCM mode.
1084  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1085  * if possible. While processing a chunk the FPU is "locked".
1086  */
1087 static int
1088 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1089     size_t length, crypto_data_t *out, size_t block_size)
1090 {
1091 	size_t bleft = length;
1092 	size_t need = 0;
1093 	size_t done = 0;
1094 	uint8_t *datap = (uint8_t *)data;
1095 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1096 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1097 	uint64_t *ghash = ctx->gcm_ghash;
1098 	uint64_t *cb = ctx->gcm_cb;
1099 	uint8_t *ct_buf = NULL;
1100 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1101 	int rv = CRYPTO_SUCCESS;
1102 
1103 	ASSERT(block_size == GCM_BLOCK_LEN);
1104 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1105 	    B_FALSE);
1106 	/*
1107 	 * If the last call left an incomplete block, try to fill
1108 	 * it first.
1109 	 */
1110 	if (ctx->gcm_remainder_len > 0) {
1111 		need = block_size - ctx->gcm_remainder_len;
1112 		if (length < need) {
1113 			/* Accumulate bytes here and return. */
1114 			memcpy((uint8_t *)ctx->gcm_remainder +
1115 			    ctx->gcm_remainder_len, datap, length);
1116 
1117 			ctx->gcm_remainder_len += length;
1118 			if (ctx->gcm_copy_to == NULL) {
1119 				ctx->gcm_copy_to = datap;
1120 			}
1121 			return (CRYPTO_SUCCESS);
1122 		} else {
1123 			/* Complete incomplete block. */
1124 			memcpy((uint8_t *)ctx->gcm_remainder +
1125 			    ctx->gcm_remainder_len, datap, need);
1126 
1127 			ctx->gcm_copy_to = NULL;
1128 		}
1129 	}
1130 
1131 	/* Allocate a buffer to encrypt to if there is enough input. */
1132 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1133 		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1134 		if (ct_buf == NULL) {
1135 			return (CRYPTO_HOST_MEMORY);
1136 		}
1137 	}
1138 
1139 	/* If we completed an incomplete block, encrypt and write it out. */
1140 	if (ctx->gcm_remainder_len > 0) {
1141 		kfpu_begin();
1142 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1143 		    (const uint32_t *)cb, (uint32_t *)tmp);
1144 
1145 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1146 		GHASH_AVX(ctx, tmp, block_size);
1147 		clear_fpu_regs();
1148 		kfpu_end();
1149 		rv = crypto_put_output_data(tmp, out, block_size);
1150 		out->cd_offset += block_size;
1151 		gcm_incr_counter_block(ctx);
1152 		ctx->gcm_processed_data_len += block_size;
1153 		bleft -= need;
1154 		datap += need;
1155 		ctx->gcm_remainder_len = 0;
1156 	}
1157 
1158 	/* Do the bulk encryption in chunk_size blocks. */
1159 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1160 		kfpu_begin();
1161 		done = aesni_gcm_encrypt(
1162 		    datap, ct_buf, chunk_size, key, cb, ghash);
1163 
1164 		clear_fpu_regs();
1165 		kfpu_end();
1166 		if (done != chunk_size) {
1167 			rv = CRYPTO_FAILED;
1168 			goto out_nofpu;
1169 		}
1170 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1171 		if (rv != CRYPTO_SUCCESS) {
1172 			goto out_nofpu;
1173 		}
1174 		out->cd_offset += chunk_size;
1175 		datap += chunk_size;
1176 		ctx->gcm_processed_data_len += chunk_size;
1177 	}
1178 	/* Check if we are already done. */
1179 	if (bleft == 0) {
1180 		goto out_nofpu;
1181 	}
1182 	/* Bulk encrypt the remaining data. */
1183 	kfpu_begin();
1184 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1185 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1186 		if (done == 0) {
1187 			rv = CRYPTO_FAILED;
1188 			goto out;
1189 		}
1190 		rv = crypto_put_output_data(ct_buf, out, done);
1191 		if (rv != CRYPTO_SUCCESS) {
1192 			goto out;
1193 		}
1194 		out->cd_offset += done;
1195 		ctx->gcm_processed_data_len += done;
1196 		datap += done;
1197 		bleft -= done;
1198 
1199 	}
1200 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1201 	while (bleft > 0) {
1202 		if (bleft < block_size) {
1203 			memcpy(ctx->gcm_remainder, datap, bleft);
1204 			ctx->gcm_remainder_len = bleft;
1205 			ctx->gcm_copy_to = datap;
1206 			goto out;
1207 		}
1208 		/* Encrypt, hash and write out. */
1209 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1210 		    (const uint32_t *)cb, (uint32_t *)tmp);
1211 
1212 		gcm_xor_avx(datap, tmp);
1213 		GHASH_AVX(ctx, tmp, block_size);
1214 		rv = crypto_put_output_data(tmp, out, block_size);
1215 		if (rv != CRYPTO_SUCCESS) {
1216 			goto out;
1217 		}
1218 		out->cd_offset += block_size;
1219 		gcm_incr_counter_block(ctx);
1220 		ctx->gcm_processed_data_len += block_size;
1221 		datap += block_size;
1222 		bleft -= block_size;
1223 	}
1224 out:
1225 	clear_fpu_regs();
1226 	kfpu_end();
1227 out_nofpu:
1228 	if (ct_buf != NULL) {
1229 		vmem_free(ct_buf, chunk_size);
1230 	}
1231 	return (rv);
1232 }
1233 
1234 /*
1235  * Finalize the encryption: Zero fill, encrypt, hash and write out any
1236  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1237  */
1238 static int
1239 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1240 {
1241 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1242 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1243 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1244 	size_t rem_len = ctx->gcm_remainder_len;
1245 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1246 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1247 	int rv;
1248 
1249 	ASSERT(block_size == GCM_BLOCK_LEN);
1250 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1251 	    B_FALSE);
1252 
1253 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1254 		return (CRYPTO_DATA_LEN_RANGE);
1255 	}
1256 
1257 	kfpu_begin();
1258 	/* Pad last incomplete block with zeros, encrypt and hash. */
1259 	if (rem_len > 0) {
1260 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1261 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1262 
1263 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1264 		memset(remainder + rem_len, 0, block_size - rem_len);
1265 		for (int i = 0; i < rem_len; i++) {
1266 			remainder[i] ^= tmp[i];
1267 		}
1268 		GHASH_AVX(ctx, remainder, block_size);
1269 		ctx->gcm_processed_data_len += rem_len;
1270 		/* No need to increment counter_block, it's the last block. */
1271 	}
1272 	/* Finish tag. */
1273 	ctx->gcm_len_a_len_c[1] =
1274 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1275 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1276 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1277 
1278 	gcm_xor_avx((uint8_t *)J0, ghash);
1279 	clear_fpu_regs();
1280 	kfpu_end();
1281 
1282 	/* Output remainder. */
1283 	if (rem_len > 0) {
1284 		rv = crypto_put_output_data(remainder, out, rem_len);
1285 		if (rv != CRYPTO_SUCCESS)
1286 			return (rv);
1287 	}
1288 	out->cd_offset += rem_len;
1289 	ctx->gcm_remainder_len = 0;
1290 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1291 	if (rv != CRYPTO_SUCCESS)
1292 		return (rv);
1293 
1294 	out->cd_offset += ctx->gcm_tag_len;
1295 	return (CRYPTO_SUCCESS);
1296 }
1297 
1298 /*
1299  * Finalize decryption: So far we have only accumulated ciphertext, so now
1300  * we decrypt it here in place.
1301  */
1302 static int
1303 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1304 {
1305 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1306 	ASSERT3U(block_size, ==, 16);
1307 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1308 	    B_FALSE);
1309 
1310 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1311 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1312 	uint8_t *datap = ctx->gcm_pt_buf;
1313 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1314 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1315 	uint64_t *ghash = ctx->gcm_ghash;
1316 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1317 	int rv = CRYPTO_SUCCESS;
1318 	size_t bleft, done;
1319 
1320 	/*
1321 	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1322 	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1323 	 * GCM_AVX_MIN_DECRYPT_BYTES.
1324 	 */
1325 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1326 		kfpu_begin();
1327 		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1328 		    (const void *)key, ctx->gcm_cb, ghash);
1329 		clear_fpu_regs();
1330 		kfpu_end();
1331 		if (done != chunk_size) {
1332 			return (CRYPTO_FAILED);
1333 		}
1334 		datap += done;
1335 	}
1336 	/* Decrypt remainder, which is less than chunk size, in one go. */
1337 	kfpu_begin();
1338 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1339 		done = aesni_gcm_decrypt(datap, datap, bleft,
1340 		    (const void *)key, ctx->gcm_cb, ghash);
1341 		if (done == 0) {
1342 			clear_fpu_regs();
1343 			kfpu_end();
1344 			return (CRYPTO_FAILED);
1345 		}
1346 		datap += done;
1347 		bleft -= done;
1348 	}
1349 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1350 
1351 	/*
1352 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1353 	 * decrypt them block by block.
1354 	 */
1355 	while (bleft > 0) {
1356 		/* Incomplete last block. */
1357 		if (bleft < block_size) {
1358 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1359 
1360 			memset(lastb, 0, block_size);
1361 			memcpy(lastb, datap, bleft);
1362 			/* The GCM processing. */
1363 			GHASH_AVX(ctx, lastb, block_size);
1364 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1365 			for (size_t i = 0; i < bleft; i++) {
1366 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1367 			}
1368 			break;
1369 		}
1370 		/* The GCM processing. */
1371 		GHASH_AVX(ctx, datap, block_size);
1372 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1373 		gcm_xor_avx((uint8_t *)tmp, datap);
1374 		gcm_incr_counter_block(ctx);
1375 
1376 		datap += block_size;
1377 		bleft -= block_size;
1378 	}
1379 	if (rv != CRYPTO_SUCCESS) {
1380 		clear_fpu_regs();
1381 		kfpu_end();
1382 		return (rv);
1383 	}
1384 	/* Decryption done, finish the tag. */
1385 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1386 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1387 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1388 	    (uint32_t *)ctx->gcm_J0);
1389 
1390 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1391 
1392 	/* We are done with the FPU, restore its state. */
1393 	clear_fpu_regs();
1394 	kfpu_end();
1395 
1396 	/* Compare the input authentication tag with what we calculated. */
1397 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1398 		/* They don't match. */
1399 		return (CRYPTO_INVALID_MAC);
1400 	}
1401 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1402 	if (rv != CRYPTO_SUCCESS) {
1403 		return (rv);
1404 	}
1405 	out->cd_offset += pt_len;
1406 	return (CRYPTO_SUCCESS);
1407 }
1408 
1409 /*
1410  * Initialize the GCM params H, Htable and the counter block. Save the
1411  * initial counter block.
1412  */
1413 static int
1414 gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1415     const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1416 {
1417 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1418 	uint64_t *H = ctx->gcm_H;
1419 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1420 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1421 	const uint8_t *datap = auth_data;
1422 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1423 	size_t bleft;
1424 
1425 	ASSERT(block_size == GCM_BLOCK_LEN);
1426 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1427 	    B_FALSE);
1428 
1429 	/* Init H (encrypt zero block) and create the initial counter block. */
1430 	memset(H, 0, sizeof (ctx->gcm_H));
1431 	kfpu_begin();
1432 	aes_encrypt_intel(keysched, aes_rounds,
1433 	    (const uint32_t *)H, (uint32_t *)H);
1434 
1435 	gcm_init_htab_avx(ctx->gcm_Htable, H);
1436 
1437 	if (iv_len == 12) {
1438 		memcpy(cb, iv, 12);
1439 		cb[12] = 0;
1440 		cb[13] = 0;
1441 		cb[14] = 0;
1442 		cb[15] = 1;
1443 		/* We need the ICB later. */
1444 		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1445 	} else {
1446 		/*
1447 		 * Most consumers use 12 byte IVs, so it's OK to use the
1448 		 * original routines for other IV sizes, just avoid nesting
1449 		 * kfpu_begin calls.
1450 		 */
1451 		clear_fpu_regs();
1452 		kfpu_end();
1453 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1454 		    aes_copy_block, aes_xor_block);
1455 		kfpu_begin();
1456 	}
1457 
1458 	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1459 
1460 	/* OpenSSL post-increments the counter; adjust for that. */
1461 	gcm_incr_counter_block(ctx);
1462 
1463 	/* Ghash AAD in chunk_size blocks. */
1464 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1465 		GHASH_AVX(ctx, datap, chunk_size);
1466 		datap += chunk_size;
1467 		clear_fpu_regs();
1468 		kfpu_end();
1469 		kfpu_begin();
1470 	}
1471 	/* Ghash the remainder and handle possible incomplete GCM block. */
1472 	if (bleft > 0) {
1473 		size_t incomp = bleft % block_size;
1474 
1475 		bleft -= incomp;
1476 		if (bleft > 0) {
1477 			GHASH_AVX(ctx, datap, bleft);
1478 			datap += bleft;
1479 		}
1480 		if (incomp > 0) {
1481 			/* Zero pad and hash incomplete last block. */
1482 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1483 
1484 			memset(authp, 0, block_size);
1485 			memcpy(authp, datap, incomp);
1486 			GHASH_AVX(ctx, authp, block_size);
1487 		}
1488 	}
1489 	clear_fpu_regs();
1490 	kfpu_end();
1491 	return (CRYPTO_SUCCESS);
1492 }
1493 
1494 #if defined(_KERNEL)
1495 static int
1496 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1497 {
1498 	unsigned long val;
1499 	char val_rounded[16];
1500 	int error = 0;
1501 
1502 	error = kstrtoul(buf, 0, &val);
1503 	if (error)
1504 		return (error);
1505 
1506 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1507 
1508 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1509 		return (-EINVAL);
1510 
1511 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1512 	error = param_set_uint(val_rounded, kp);
1513 	return (error);
1514 }
1515 
1516 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1517     param_get_uint, &gcm_avx_chunk_size, 0644);
1518 
1519 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1520 	"How many bytes to process while owning the FPU");
1521 
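/*
 * Example (value chosen for illustration): writing 40000 to
 * icp_gcm_avx_chunk_size rounds down to the next 96-byte multiple, 39936.
 * That value is accepted because it lies between GCM_AVX_MIN_ENCRYPT_BYTES
 * (288) and GCM_AVX_MAX_CHUNK_SIZE (131040); values outside that range are
 * rejected with -EINVAL.
 */
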
1522 #endif /* defined(_KERNEL) */
1523 #endif /* ifdef CAN_USE_GCM_ASM */
1524