xref: /freebsd/sys/contrib/openzfs/module/icp/algs/modes/gcm.c (revision b1c1ee4429fcca8f69873a8be66184e68e1b19d7)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/cmn_err.h>
28 #include <modes/modes.h>
29 #include <sys/crypto/common.h>
30 #include <sys/crypto/icp.h>
31 #include <sys/crypto/impl.h>
32 #include <sys/byteorder.h>
33 #include <sys/simd.h>
34 #include <modes/gcm_impl.h>
35 #ifdef CAN_USE_GCM_ASM
36 #include <aes/aes_impl.h>
37 #endif
38 
39 #define	GHASH(c, d, t, o) \
40 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
41 	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
42 	(uint64_t *)(void *)(t));
43 
44 /* Select GCM implementation */
45 #define	IMPL_FASTEST	(UINT32_MAX)
46 #define	IMPL_CYCLE	(UINT32_MAX-1)
47 #ifdef CAN_USE_GCM_ASM
48 #define	IMPL_AVX	(UINT32_MAX-2)
49 #endif
50 #define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
51 static uint32_t icp_gcm_impl = IMPL_FASTEST;
52 static uint32_t user_sel_impl = IMPL_FASTEST;
53 
54 #ifdef CAN_USE_GCM_ASM
55 /* Does the architecture we run on support the MOVBE instruction? */
56 boolean_t gcm_avx_can_use_movbe = B_FALSE;
57 /*
58  * Whether to use the optimized openssl gcm and ghash implementations.
59  * Set to true if module parameter icp_gcm_impl == "avx".
60  */
61 static boolean_t gcm_use_avx = B_FALSE;
62 #define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)
63 
64 extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
65 
66 static inline boolean_t gcm_avx_will_work(void);
67 static inline void gcm_set_avx(boolean_t);
68 static inline boolean_t gcm_toggle_avx(void);
69 static inline size_t gcm_simd_get_htab_size(boolean_t);
70 
71 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
72     crypto_data_t *, size_t);
73 
74 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
75 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
76 static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
77     size_t, size_t);
78 #endif /* ifdef CAN_USE_GCM_ASM */
79 
/*
 * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
 * is done in another function.
 *
 * Ciphertext is produced block by block (CTR mode), written to 'out'
 * and folded into the running GHASH (ctx->gcm_ghash).  Input that does
 * not fill a whole block is buffered in ctx->gcm_remainder and is
 * consumed by a later call or by gcm_encrypt_final().
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	/* Contexts set up for the avx implementation take that path. */
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	/* Selects the low 32 (big-endian) bits of the counter block. */
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			/* Top the remainder buffer up to a full block. */
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		/* CTR: ciphertext = plaintext XOR E(K, counter block). */
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		/*
		 * The output block may span two non-contiguous output
		 * buffers; fetch pointers to both pieces.
		 */
		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		/* Bytes of 'data' not yet consumed. */
		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}
202 
/*
 * Finish GCM encryption: encrypt any buffered partial block, fold the
 * bit lengths of AAD and ciphertext into the GHASH, and form the
 * authentication tag as GHASH XOR E(K, J0).  The remaining ciphertext
 * (if any) followed by the tag is appended to 'out'.
 */
int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	/* The output must hold the leftover ciphertext plus the tag. */
	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		/* Zero-pad the partial block before hashing it. */
		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	/* Hash in len(A) || len(C), both in bits. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	/* Tag = GHASH XOR E(K, J0); J0 is overwritten here. */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}
284 
/*
 * This will only deal with decrypting the last block of the input that
 * might not be a multiple of block length.
 *
 * The partial ciphertext block was stashed in ctx->gcm_remainder by
 * gcm_decrypt_final(); the resulting plaintext is written back into
 * ctx->gcm_pt_buf at byte offset 'index'.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits
	 */
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag: zero-pad the ciphertext to a full block */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext (CTR keystream reuses gcm_tmp) */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}
327 
328 int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t * ctx,char * data,size_t length,crypto_data_t * out,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))329 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
330     crypto_data_t *out, size_t block_size,
331     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
332     void (*copy_block)(uint8_t *, uint8_t *),
333     void (*xor_block)(uint8_t *, uint8_t *))
334 {
335 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
336 	    (void) xor_block;
337 	size_t new_len;
338 	uint8_t *new;
339 
340 	/*
341 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
342 	 * Ciphertext will be decrypted in the final.
343 	 */
344 	if (length > 0) {
345 		new_len = ctx->gcm_pt_buf_len + length;
346 		new = vmem_alloc(new_len, KM_SLEEP);
347 		if (new == NULL) {
348 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
349 			ctx->gcm_pt_buf = NULL;
350 			return (CRYPTO_HOST_MEMORY);
351 		}
352 
353 		if (ctx->gcm_pt_buf != NULL) {
354 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
355 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
356 		} else {
357 			ASSERT0(ctx->gcm_pt_buf_len);
358 		}
359 
360 		ctx->gcm_pt_buf = new;
361 		ctx->gcm_pt_buf_len = new_len;
362 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
363 		    length);
364 		ctx->gcm_processed_data_len += length;
365 	}
366 
367 	ctx->gcm_remainder_len = 0;
368 	return (CRYPTO_SUCCESS);
369 }
370 
/*
 * Finish GCM decryption: the entire ciphertext plus the trailing tag
 * has been collected in ctx->gcm_pt_buf.  Decrypt it in place while
 * recomputing the GHASH, then compare the computed tag with the
 * received one; plaintext is released to 'out' only on a match.
 */
int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->gcm_use_avx == B_TRUE)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	/* The last gcm_tag_len bytes of the buffer are the received tag. */
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * not expecting anymore ciphertext, just
			 * compute plaintext for the remaining input
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext: decrypts the block in place */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	/* Complete the GHASH with len(A) || len(C) and derive the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	/*
	 * compare the input authentication tag with what we calculated
	 * NOTE(review): memcmp() is not a constant-time comparison;
	 * confirm whether a fixed-time compare is warranted here.
	 */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}
453 
454 static int
gcm_validate_args(CK_AES_GCM_PARAMS * gcm_param)455 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
456 {
457 	size_t tag_len;
458 
459 	/*
460 	 * Check the length of the authentication tag (in bits).
461 	 */
462 	tag_len = gcm_param->ulTagBits;
463 	switch (tag_len) {
464 	case 32:
465 	case 64:
466 	case 96:
467 	case 104:
468 	case 112:
469 	case 120:
470 	case 128:
471 		break;
472 	default:
473 		return (CRYPTO_MECHANISM_PARAM_INVALID);
474 	}
475 
476 	if (gcm_param->ulIvLen == 0)
477 		return (CRYPTO_MECHANISM_PARAM_INVALID);
478 
479 	return (CRYPTO_SUCCESS);
480 }
481 
/*
 * Derive the pre-counter block J0 from the IV per NIST SP 800-38D:
 * a 96-bit IV is used verbatim with a 32-bit block counter of 1
 * appended; any other IV length is GHASHed together with its bit
 * length.  Both ctx->gcm_cb (working counter) and ctx->gcm_J0
 * (saved for the final tag computation) are initialized.
 */
static void
gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		/* Fast path: 96-bit IV, counter starts at 1. */
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				/* Zero-pad the last partial IV block. */
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		/* Close the hash with 0 || len(IV) in bits. */
		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}
530 
/*
 * Common (non-avx) GCM context setup: derive the hash subkey H by
 * encrypting the all-zero block, build the initial counter block from
 * the IV, then GHASH the additional authenticated data (AAD),
 * zero-padding its last partial block.
 */
static int
gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	do {
		if (remainder < block_size) {
			/*
			 * There's not a block full of data, pad rest of
			 * buffer with zero
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				/* No AAD at all; hash one zero block. */
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}
588 
/*
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 *
 * 'param' must point to a CK_AES_GCM_PARAMS describing IV, AAD and tag
 * length; returns CRYPTO_MECHANISM_PARAM_INVALID when it is NULL or
 * fails validation.  On x86 with CAN_USE_GCM_ASM this also decides
 * whether the context uses the avx code path and allocates its Htable.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		/* GCM mode. */
		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}
		gcm_ctx->gcm_flags |= GCM_MODE;

		size_t tbits = gcm_param->ulTagBits;
		tag_len = CRYPTO_BITS2BYTES(tbits);
		iv_len = gcm_param->ulIvLen;

		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
	} else {
		/*
		 * Handle the "cycle" implementation by creating avx and
		 * non-avx contexts alternately.
		 */
		gcm_ctx->gcm_use_avx = gcm_toggle_avx();

		/* The avx impl. doesn't handle byte swapped key schedules. */
		if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
			gcm_ctx->gcm_use_avx = B_FALSE;
		}
		/*
		 * If this is a GCM context, use the MOVBE and the BSWAP
		 * variants alternately.
		 */
		if (gcm_ctx->gcm_use_avx == B_TRUE &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * We don't handle byte swapped key schedules in the avx code path,
	 * still they could be created by the aes generic implementation.
	 * Make sure not to use them since we'll corrupt data if we do.
	 */
	if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
		gcm_ctx->gcm_use_avx = B_FALSE;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/* Allocate Htab memory as needed. */
	if (gcm_ctx->gcm_use_avx == B_TRUE) {
		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

		if (htab_len == 0) {
			return (CRYPTO_MECHANISM_PARAM_INVALID);
		}
		gcm_ctx->gcm_htab_len = htab_len;
		gcm_ctx->gcm_Htable =
		    kmem_alloc(htab_len, KM_SLEEP);

		if (gcm_ctx->gcm_Htable == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}
	/* Avx and non avx context initialization differs from here on. */
	if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
		if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
		    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
#ifdef CAN_USE_GCM_ASM
	} else {
		if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size) != CRYPTO_SUCCESS) {
			rv = CRYPTO_MECHANISM_PARAM_INVALID;
		}
	}
#endif /* ifdef CAN_USE_GCM_ASM */

	return (rv);
}
708 
709 void *
gcm_alloc_ctx(int kmflag)710 gcm_alloc_ctx(int kmflag)
711 {
712 	gcm_ctx_t *gcm_ctx;
713 
714 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
715 		return (NULL);
716 
717 	gcm_ctx->gcm_flags = GCM_MODE;
718 	return (gcm_ctx);
719 }
720 
/*
 * GCM implementation that contains the fastest methods; filled in by
 * gcm_impl_init() from the fastest supported implementation.
 */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations; populated by gcm_impl_init() */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
740 
/*
 * Returns the GCM operations for encrypt/decrypt/key setup.  When a
 * SIMD implementation is not allowed in the current context, then
 * fallback to the fastest generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		/* Function-local static: advances on every call. */
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation since there still
		 * may be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		/* 'impl' is an index into gcm_supp_impl here. */
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}
790 
/*
 * Initialize all supported implementations.
 *
 * Called once at module load: collects the supported implementations,
 * picks the fastest one, optionally enables the avx code path, and
 * applies any implementation the user selected before init.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_set_avx(B_TRUE);
		}
	}
#endif
	/* Finish initialization: publish any pre-init user selection. */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}
846 
/*
 * Mandatory selector options accepted by gcm_impl_set() in addition to
 * the names of the compiled-in implementations.
 */
static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
#endif
};
857 
858 /*
859  * Function sets desired gcm implementation.
860  *
861  * If we are called before init(), user preference will be saved in
862  * user_sel_impl, and applied in later init() call. This occurs when module
863  * parameter is specified on module load. Otherwise, directly update
864  * icp_gcm_impl.
865  *
866  * @val		Name of gcm implementation to use
867  * @param	Unused.
868  */
869 int
gcm_impl_set(const char * val)870 gcm_impl_set(const char *val)
871 {
872 	int err = -EINVAL;
873 	char req_name[GCM_IMPL_NAME_MAX];
874 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
875 	size_t i;
876 
877 	/* sanitize input */
878 	i = strnlen(val, GCM_IMPL_NAME_MAX);
879 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
880 		return (err);
881 
882 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
883 	while (i > 0 && isspace(req_name[i-1]))
884 		i--;
885 	req_name[i] = '\0';
886 
887 	/* Check mandatory options */
888 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
889 #ifdef CAN_USE_GCM_ASM
890 		/* Ignore avx implementation if it won't work. */
891 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
892 			continue;
893 		}
894 #endif
895 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
896 			impl = gcm_impl_opts[i].sel;
897 			err = 0;
898 			break;
899 		}
900 	}
901 
902 	/* check all supported impl if init() was already called */
903 	if (err != 0 && gcm_impl_initialized) {
904 		/* check all supported implementations */
905 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
906 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
907 				impl = i;
908 				err = 0;
909 				break;
910 			}
911 		}
912 	}
913 #ifdef CAN_USE_GCM_ASM
914 	/*
915 	 * Use the avx implementation if available and the requested one is
916 	 * avx or fastest.
917 	 */
918 	if (gcm_avx_will_work() == B_TRUE &&
919 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
920 		gcm_set_avx(B_TRUE);
921 	} else {
922 		gcm_set_avx(B_FALSE);
923 	}
924 #endif
925 
926 	if (err == 0) {
927 		if (gcm_impl_initialized)
928 			atomic_swap_32(&icp_gcm_impl, impl);
929 		else
930 			atomic_swap_32(&user_sel_impl, impl);
931 	}
932 
933 	return (err);
934 }
935 
936 #if defined(_KERNEL) && defined(__linux__)
937 
/* Linux module-parameter setter: forwards to the common gcm_impl_set(). */
static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}
943 
944 static int
icp_gcm_impl_get(char * buffer,zfs_kernel_param_t * kp)945 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
946 {
947 	int i, cnt = 0;
948 	char *fmt;
949 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
950 
951 	/* list mandatory options */
952 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
953 #ifdef CAN_USE_GCM_ASM
954 		/* Ignore avx implementation if it won't work. */
955 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
956 			continue;
957 		}
958 #endif
959 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
960 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
961 		    gcm_impl_opts[i].name);
962 	}
963 
964 	/* list all supported implementations */
965 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
966 		fmt = (i == impl) ? "[%s] " : "%s ";
967 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
968 		    gcm_supp_impl[i]->name);
969 	}
970 
971 	return (cnt);
972 }
973 
/*
 * Expose the implementation selector as a writable module parameter:
 * writing selects an implementation by name, reading lists all choices
 * with the active one bracketed.
 */
module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
978 
979 #ifdef CAN_USE_GCM_ASM
980 #define	GCM_BLOCK_LEN 16
981 /*
982  * The openssl asm routines are 6x aggregated and need that many bytes
983  * at minimum.
984  */
985 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
986 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
987 /*
988  * Ensure the chunk size is reasonable since we are allocating a
989  * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
990  */
991 #define	GCM_AVX_MAX_CHUNK_SIZE \
992 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
993 
994 /* Clear the FPU registers since they hold sensitive internal state. */
995 #define	clear_fpu_regs() clear_fpu_regs_avx()
996 #define	GHASH_AVX(ctx, in, len) \
997     gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
998     in, len)
999 
1000 #define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1001 
1002 /* Get the chunk size module parameter. */
1003 #define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1004 
1005 /*
1006  * Module parameter: number of bytes to process at once while owning the FPU.
1007  * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is
1008  * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES.
1009  */
1010 static uint32_t gcm_avx_chunk_size =
1011 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1012 
1013 extern void ASMABI clear_fpu_regs_avx(void);
1014 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1015 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1016     const uint32_t pt[4], uint32_t ct[4]);
1017 
1018 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1019 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1020     const uint8_t *in, size_t len);
1021 
1022 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1023     const void *, uint64_t *, uint64_t *);
1024 
1025 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1026     const void *, uint64_t *, uint64_t *);
1027 
1028 static inline boolean_t
gcm_avx_will_work(void)1029 gcm_avx_will_work(void)
1030 {
1031 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1032 	return (kfpu_allowed() &&
1033 	    zfs_avx_available() && zfs_aes_available() &&
1034 	    zfs_pclmulqdq_available());
1035 }
1036 
1037 static inline void
gcm_set_avx(boolean_t val)1038 gcm_set_avx(boolean_t val)
1039 {
1040 	if (gcm_avx_will_work() == B_TRUE) {
1041 		atomic_swap_32(&gcm_use_avx, val);
1042 	}
1043 }
1044 
1045 static inline boolean_t
gcm_toggle_avx(void)1046 gcm_toggle_avx(void)
1047 {
1048 	if (gcm_avx_will_work() == B_TRUE) {
1049 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1050 	} else {
1051 		return (B_FALSE);
1052 	}
1053 }
1054 
1055 static inline size_t
gcm_simd_get_htab_size(boolean_t simd_mode)1056 gcm_simd_get_htab_size(boolean_t simd_mode)
1057 {
1058 	switch (simd_mode) {
1059 	case B_TRUE:
1060 		return (2 * 6 * 2 * sizeof (uint64_t));
1061 
1062 	default:
1063 		return (0);
1064 	}
1065 }
1066 
1067 
/*
 * Increment the GCM counter block by n.
 *
 * Only the low 32 bits of the second counter-block word hold the
 * counter; it is stored big-endian in memory (hence the ntohll/htonll
 * conversions), and the remaining bits carry IV material that must be
 * preserved untouched.
 */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	/* Selects the counter's four bytes within the big-endian word. */
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	/* Pull the counter into host byte order for the addition. */
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	counter = htonll(counter + n);
	/* Re-mask so a carry out of 32 bits wraps instead of leaking. */
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}
1079 
1080 /*
1081  * Encrypt multiple blocks of data in GCM mode.
1082  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1083  * if possible. While processing a chunk the FPU is "locked".
1084  */
1085 static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t * ctx,char * data,size_t length,crypto_data_t * out,size_t block_size)1086 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1087     size_t length, crypto_data_t *out, size_t block_size)
1088 {
1089 	size_t bleft = length;
1090 	size_t need = 0;
1091 	size_t done = 0;
1092 	uint8_t *datap = (uint8_t *)data;
1093 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1094 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1095 	uint64_t *ghash = ctx->gcm_ghash;
1096 	uint64_t *cb = ctx->gcm_cb;
1097 	uint8_t *ct_buf = NULL;
1098 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1099 	int rv = CRYPTO_SUCCESS;
1100 
1101 	ASSERT(block_size == GCM_BLOCK_LEN);
1102 	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1103 	    B_FALSE);
1104 	/*
1105 	 * If the last call left an incomplete block, try to fill
1106 	 * it first.
1107 	 */
1108 	if (ctx->gcm_remainder_len > 0) {
1109 		need = block_size - ctx->gcm_remainder_len;
1110 		if (length < need) {
1111 			/* Accumulate bytes here and return. */
1112 			memcpy((uint8_t *)ctx->gcm_remainder +
1113 			    ctx->gcm_remainder_len, datap, length);
1114 
1115 			ctx->gcm_remainder_len += length;
1116 			if (ctx->gcm_copy_to == NULL) {
1117 				ctx->gcm_copy_to = datap;
1118 			}
1119 			return (CRYPTO_SUCCESS);
1120 		} else {
1121 			/* Complete incomplete block. */
1122 			memcpy((uint8_t *)ctx->gcm_remainder +
1123 			    ctx->gcm_remainder_len, datap, need);
1124 
1125 			ctx->gcm_copy_to = NULL;
1126 		}
1127 	}
1128 
1129 	/* Allocate a buffer to encrypt to if there is enough input. */
1130 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1131 		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1132 		if (ct_buf == NULL) {
1133 			return (CRYPTO_HOST_MEMORY);
1134 		}
1135 	}
1136 
1137 	/* If we completed an incomplete block, encrypt and write it out. */
1138 	if (ctx->gcm_remainder_len > 0) {
1139 		kfpu_begin();
1140 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1141 		    (const uint32_t *)cb, (uint32_t *)tmp);
1142 
1143 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1144 		GHASH_AVX(ctx, tmp, block_size);
1145 		clear_fpu_regs();
1146 		kfpu_end();
1147 		rv = crypto_put_output_data(tmp, out, block_size);
1148 		out->cd_offset += block_size;
1149 		gcm_incr_counter_block(ctx);
1150 		ctx->gcm_processed_data_len += block_size;
1151 		bleft -= need;
1152 		datap += need;
1153 		ctx->gcm_remainder_len = 0;
1154 	}
1155 
1156 	/* Do the bulk encryption in chunk_size blocks. */
1157 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1158 		kfpu_begin();
1159 		done = aesni_gcm_encrypt(
1160 		    datap, ct_buf, chunk_size, key, cb, ghash);
1161 
1162 		clear_fpu_regs();
1163 		kfpu_end();
1164 		if (done != chunk_size) {
1165 			rv = CRYPTO_FAILED;
1166 			goto out_nofpu;
1167 		}
1168 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1169 		if (rv != CRYPTO_SUCCESS) {
1170 			goto out_nofpu;
1171 		}
1172 		out->cd_offset += chunk_size;
1173 		datap += chunk_size;
1174 		ctx->gcm_processed_data_len += chunk_size;
1175 	}
1176 	/* Check if we are already done. */
1177 	if (bleft == 0) {
1178 		goto out_nofpu;
1179 	}
1180 	/* Bulk encrypt the remaining data. */
1181 	kfpu_begin();
1182 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1183 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1184 		if (done == 0) {
1185 			rv = CRYPTO_FAILED;
1186 			goto out;
1187 		}
1188 		rv = crypto_put_output_data(ct_buf, out, done);
1189 		if (rv != CRYPTO_SUCCESS) {
1190 			goto out;
1191 		}
1192 		out->cd_offset += done;
1193 		ctx->gcm_processed_data_len += done;
1194 		datap += done;
1195 		bleft -= done;
1196 
1197 	}
1198 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1199 	while (bleft > 0) {
1200 		if (bleft < block_size) {
1201 			memcpy(ctx->gcm_remainder, datap, bleft);
1202 			ctx->gcm_remainder_len = bleft;
1203 			ctx->gcm_copy_to = datap;
1204 			goto out;
1205 		}
1206 		/* Encrypt, hash and write out. */
1207 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1208 		    (const uint32_t *)cb, (uint32_t *)tmp);
1209 
1210 		gcm_xor_avx(datap, tmp);
1211 		GHASH_AVX(ctx, tmp, block_size);
1212 		rv = crypto_put_output_data(tmp, out, block_size);
1213 		if (rv != CRYPTO_SUCCESS) {
1214 			goto out;
1215 		}
1216 		out->cd_offset += block_size;
1217 		gcm_incr_counter_block(ctx);
1218 		ctx->gcm_processed_data_len += block_size;
1219 		datap += block_size;
1220 		bleft -= block_size;
1221 	}
1222 out:
1223 	clear_fpu_regs();
1224 	kfpu_end();
1225 out_nofpu:
1226 	if (ct_buf != NULL) {
1227 		vmem_free(ct_buf, chunk_size);
1228 	}
1229 	return (rv);
1230 }
1231 
/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	/* J0 is the initial counter block saved at init time. */
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	/* The asm routines only work with a non-byteswapped key schedule. */
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* The output must hold the last partial block plus the tag. */
	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		/* CTR step: xor the plaintext with the encrypted counter. */
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag: hash in len(A) || len(C) in bits. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	/* The tag is GHASH xored with the encrypted ICB (J0). */
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	/* Clear sensitive FPU state before releasing it. */
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	/* Write out the computed tag, which now lives in gcm_ghash. */
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}
1295 
/*
 * Finalize decryption: We just have accumulated crypto text, so now we
 * decrypt it here inplace.
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	/* The asm routines only work with a non-byteswapped key schedule. */
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	/* gcm_pt_buf holds the ciphertext followed by the auth tag. */
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
	 * GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = aesni_gcm_decrypt(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = aesni_gcm_decrypt(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			/* Zero pad so a full block can be hashed. */
			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* Hash the ciphertext first, then CTR-decrypt. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* Hash the ciphertext first, then CTR-decrypt in place. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	/* rv is still CRYPTO_SUCCESS here; this check is defensive only. */
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	/* Tag is GHASH xored with the encrypted ICB (J0). */
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/*
	 * Compare the input authentication tag with what we calculated.
	 * NOTE(review): memcmp is not constant-time; confirm whether a
	 * timing-safe comparison is warranted here.
	 */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	/* Tag verified: release the plaintext to the caller. */
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}
1406 
/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
static int
gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	const uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);
	/* The asm routines only work with a non-byteswapped key schedule. */
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

	/* Build the Htable that gcm_ghash_avx() consumes from H. */
	gcm_init_htab_avx(ctx->gcm_Htable, H);

	if (iv_len == 12) {
		/* Standard 96-bit IV: counter block is IV || 0^31 || 1. */
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));

	/* Openssl post increments the counter, adjust for that. */
	gcm_incr_counter_block(ctx);

	/*
	 * Ghash AAD in chunk_size blocks, briefly releasing the FPU
	 * between chunks so it is not held for too long.
	 */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}
1491 
1492 #if defined(_KERNEL)
1493 static int
icp_gcm_avx_set_chunk_size(const char * buf,zfs_kernel_param_t * kp)1494 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1495 {
1496 	unsigned long val;
1497 	char val_rounded[16];
1498 	int error = 0;
1499 
1500 	error = kstrtoul(buf, 0, &val);
1501 	if (error)
1502 		return (error);
1503 
1504 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1505 
1506 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1507 		return (-EINVAL);
1508 
1509 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1510 	error = param_set_uint(val_rounded, kp);
1511 	return (error);
1512 }
1513 
/*
 * Expose the chunk size as a module parameter; writes go through the
 * rounding/range-checking setter above, reads use the plain uint getter.
 */
module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
	"How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
1521 #endif /* ifdef CAN_USE_GCM_ASM */
1522