1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/cmn_err.h>
28 #include <modes/modes.h>
29 #include <sys/crypto/common.h>
30 #include <sys/crypto/icp.h>
31 #include <sys/crypto/impl.h>
32 #include <sys/byteorder.h>
33 #include <sys/simd.h>
34 #include <modes/gcm_impl.h>
35 #ifdef CAN_USE_GCM_ASM
36 #include <aes/aes_impl.h>
37 #endif
38
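/*
 * GHASH helper: XOR the next 128-bit block (d) into the running hash
 * (c)->gcm_ghash, then multiply the result by the hash subkey H in
 * GF(2^128) using the selected implementation (o), storing the product
 * in (t). This is the GHASH update step of NIST SP 800-38D.
 */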
39 #define GHASH(c, d, t, o) \
40 xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
41 (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
42 (uint64_t *)(void *)(t));
43
44 /* Select GCM implementation */
45 #define IMPL_FASTEST (UINT32_MAX)
46 #define IMPL_CYCLE (UINT32_MAX-1)
47 #ifdef CAN_USE_GCM_ASM
48 #define IMPL_AVX (UINT32_MAX-2)
49 #endif
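/*
 * IMPL_FASTEST, IMPL_CYCLE and IMPL_AVX are sentinel values above the range
 * of valid indices into gcm_supp_impl[]; any smaller value selects a specific
 * supported implementation directly. The volatile read below ensures that
 * updates made through the module parameter are observed.
 */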
50 #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
51 static uint32_t icp_gcm_impl = IMPL_FASTEST;
52 static uint32_t user_sel_impl = IMPL_FASTEST;
53
54 #ifdef CAN_USE_GCM_ASM
55 /* Does the architecture we run on support the MOVBE instruction? */
56 boolean_t gcm_avx_can_use_movbe = B_FALSE;
57 /*
58 * Whether to use the optimized openssl gcm and ghash implementations.
59 * Set to true if module parameter icp_gcm_impl == "avx".
60 */
61 static boolean_t gcm_use_avx = B_FALSE;
62 #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
63
64 extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
65
66 static inline boolean_t gcm_avx_will_work(void);
67 static inline void gcm_set_avx(boolean_t);
68 static inline boolean_t gcm_toggle_avx(void);
69 static inline size_t gcm_simd_get_htab_size(boolean_t);
70
71 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
72 crypto_data_t *, size_t);
73
74 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
75 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
76 static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
77 size_t, size_t);
78 #endif /* ifdef CAN_USE_GCM_ASM */
79
80 /*
81 * Encrypt multiple blocks of data in GCM mode. Decryption for GCM mode
82 * is done in a separate function.
83 */
84 int
85 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
86 crypto_data_t *out, size_t block_size,
87 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
88 void (*copy_block)(uint8_t *, uint8_t *),
89 void (*xor_block)(uint8_t *, uint8_t *))
90 {
91 #ifdef CAN_USE_GCM_ASM
92 if (ctx->gcm_use_avx == B_TRUE)
93 return (gcm_mode_encrypt_contiguous_blocks_avx(
94 ctx, data, length, out, block_size));
95 #endif
96
97 const gcm_impl_ops_t *gops;
98 size_t remainder = length;
99 size_t need = 0;
100 uint8_t *datap = (uint8_t *)data;
101 uint8_t *blockp;
102 uint8_t *lastp;
103 void *iov_or_mp;
104 offset_t offset;
105 uint8_t *out_data_1;
106 uint8_t *out_data_2;
107 size_t out_data_1_len;
108 uint64_t counter;
109 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
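/*
 * counter_mask selects the last four bytes of the counter block
 * (the 32-bit counter field) regardless of host byte order.
 */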
110
111 if (length + ctx->gcm_remainder_len < block_size) {
112 /* accumulate bytes here and return */
113 memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
114 datap,
115 length);
116 ctx->gcm_remainder_len += length;
117 if (ctx->gcm_copy_to == NULL) {
118 ctx->gcm_copy_to = datap;
119 }
120 return (CRYPTO_SUCCESS);
121 }
122
123 crypto_init_ptrs(out, &iov_or_mp, &offset);
124
125 gops = gcm_impl_get_ops();
126 do {
127 /* Unprocessed data from last call. */
128 if (ctx->gcm_remainder_len > 0) {
129 need = block_size - ctx->gcm_remainder_len;
130
131 if (need > remainder)
132 return (CRYPTO_DATA_LEN_RANGE);
133
134 memcpy(&((uint8_t *)ctx->gcm_remainder)
135 [ctx->gcm_remainder_len], datap, need);
136
137 blockp = (uint8_t *)ctx->gcm_remainder;
138 } else {
139 blockp = datap;
140 }
141
142 /*
143 * Increment counter. Counter bits are confined
144 * to the bottom 32 bits of the counter block.
145 */
146 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
147 counter = htonll(counter + 1);
148 counter &= counter_mask;
149 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
150
151 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
152 (uint8_t *)ctx->gcm_tmp);
153 xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
154
155 lastp = (uint8_t *)ctx->gcm_tmp;
156
157 ctx->gcm_processed_data_len += block_size;
158
159 crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
160 &out_data_1_len, &out_data_2, block_size);
161
162 /* copy block to where it belongs */
163 if (out_data_1_len == block_size) {
164 copy_block(lastp, out_data_1);
165 } else {
166 memcpy(out_data_1, lastp, out_data_1_len);
167 if (out_data_2 != NULL) {
168 memcpy(out_data_2,
169 lastp + out_data_1_len,
170 block_size - out_data_1_len);
171 }
172 }
173 /* update offset */
174 out->cd_offset += block_size;
175
176 /* add ciphertext to the hash */
177 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
178
179 /* Update pointer to next block of data to be processed. */
180 if (ctx->gcm_remainder_len != 0) {
181 datap += need;
182 ctx->gcm_remainder_len = 0;
183 } else {
184 datap += block_size;
185 }
186
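/* Number of input bytes not yet consumed. */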
187 remainder = (size_t)&data[length] - (size_t)datap;
188
189 /* Incomplete last block. */
190 if (remainder > 0 && remainder < block_size) {
191 memcpy(ctx->gcm_remainder, datap, remainder);
192 ctx->gcm_remainder_len = remainder;
193 ctx->gcm_copy_to = datap;
194 goto out;
195 }
196 ctx->gcm_copy_to = NULL;
197
198 } while (remainder > 0);
199 out:
200 return (CRYPTO_SUCCESS);
201 }
202
203 int
204 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
205 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
206 void (*copy_block)(uint8_t *, uint8_t *),
207 void (*xor_block)(uint8_t *, uint8_t *))
208 {
209 (void) copy_block;
210 #ifdef CAN_USE_GCM_ASM
211 if (ctx->gcm_use_avx == B_TRUE)
212 return (gcm_encrypt_final_avx(ctx, out, block_size));
213 #endif
214
215 const gcm_impl_ops_t *gops;
216 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
217 uint8_t *ghash, *macp = NULL;
218 int i, rv;
219
220 if (out->cd_length <
221 (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
222 return (CRYPTO_DATA_LEN_RANGE);
223 }
224
225 gops = gcm_impl_get_ops();
226 ghash = (uint8_t *)ctx->gcm_ghash;
227
228 if (ctx->gcm_remainder_len > 0) {
229 uint64_t counter;
230 uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
231
232 /*
233 * Here is where we deal with data that is not a
234 * multiple of the block size.
235 */
236
237 /*
238 * Increment counter.
239 */
240 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
241 counter = htonll(counter + 1);
242 counter &= counter_mask;
243 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
244
245 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
246 (uint8_t *)ctx->gcm_tmp);
247
248 macp = (uint8_t *)ctx->gcm_remainder;
249 memset(macp + ctx->gcm_remainder_len, 0,
250 block_size - ctx->gcm_remainder_len);
251
252 /* XOR with counter block */
253 for (i = 0; i < ctx->gcm_remainder_len; i++) {
254 macp[i] ^= tmpp[i];
255 }
256
257 /* add ciphertext to the hash */
258 GHASH(ctx, macp, ghash, gops);
259
260 ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
261 }
262
263 ctx->gcm_len_a_len_c[1] =
264 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
265 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
266 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
267 (uint8_t *)ctx->gcm_J0);
268 xor_block((uint8_t *)ctx->gcm_J0, ghash);
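/* ghash now holds the authentication tag: GHASH(A,C) XOR E(K, J0). */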
269
270 if (ctx->gcm_remainder_len > 0) {
271 rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
272 if (rv != CRYPTO_SUCCESS)
273 return (rv);
274 }
275 out->cd_offset += ctx->gcm_remainder_len;
276 ctx->gcm_remainder_len = 0;
277 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
278 if (rv != CRYPTO_SUCCESS)
279 return (rv);
280 out->cd_offset += ctx->gcm_tag_len;
281
282 return (CRYPTO_SUCCESS);
283 }
284
285 /*
286 * This will only deal with decrypting the last block of the input that
287 * might not be a multiple of block length.
288 */
289 static void
290 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
291 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
292 void (*xor_block)(uint8_t *, uint8_t *))
293 {
294 uint8_t *datap, *outp, *counterp;
295 uint64_t counter;
296 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
297 int i;
298
299 /*
300 * Increment counter.
301 * Counter bits are confined to the bottom 32 bits
302 */
303 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
304 counter = htonll(counter + 1);
305 counter &= counter_mask;
306 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
307
308 datap = (uint8_t *)ctx->gcm_remainder;
309 outp = &((ctx->gcm_pt_buf)[index]);
310 counterp = (uint8_t *)ctx->gcm_tmp;
311
312 /* zero pad the remaining ciphertext for the tag computation */
313 memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
314 memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
315
316 /* add ciphertext to the hash */
317 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
318
319 /* decrypt remaining ciphertext */
320 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
321
322 /* XOR with counter block */
323 for (i = 0; i < ctx->gcm_remainder_len; i++) {
324 outp[i] = datap[i] ^ counterp[i];
325 }
326 }
327
328 int
329 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
330 crypto_data_t *out, size_t block_size,
331 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
332 void (*copy_block)(uint8_t *, uint8_t *),
333 void (*xor_block)(uint8_t *, uint8_t *))
334 {
335 (void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
336 (void) xor_block;
337 size_t new_len;
338 uint8_t *new;
339
340 /*
341 * Copy contiguous ciphertext input blocks to plaintext buffer.
342 * Ciphertext will be decrypted in the final.
343 */
344 if (length > 0) {
345 new_len = ctx->gcm_pt_buf_len + length;
346 new = vmem_alloc(new_len, KM_SLEEP);
347 if (new == NULL) {
348 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
349 ctx->gcm_pt_buf = NULL;
350 return (CRYPTO_HOST_MEMORY);
351 }
352
353 if (ctx->gcm_pt_buf != NULL) {
354 memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
355 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
356 } else {
357 ASSERT0(ctx->gcm_pt_buf_len);
358 }
359
360 ctx->gcm_pt_buf = new;
361 ctx->gcm_pt_buf_len = new_len;
362 memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
363 length);
364 ctx->gcm_processed_data_len += length;
365 }
366
367 ctx->gcm_remainder_len = 0;
368 return (CRYPTO_SUCCESS);
369 }
370
371 int
372 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
373 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
374 void (*xor_block)(uint8_t *, uint8_t *))
375 {
376 #ifdef CAN_USE_GCM_ASM
377 if (ctx->gcm_use_avx == B_TRUE)
378 return (gcm_decrypt_final_avx(ctx, out, block_size));
379 #endif
380
381 const gcm_impl_ops_t *gops;
382 size_t pt_len;
383 size_t remainder;
384 uint8_t *ghash;
385 uint8_t *blockp;
386 uint8_t *cbp;
387 uint64_t counter;
388 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
389 int processed = 0, rv;
390
391 ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
392
393 gops = gcm_impl_get_ops();
394 pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
395 ghash = (uint8_t *)ctx->gcm_ghash;
396 blockp = ctx->gcm_pt_buf;
397 remainder = pt_len;
398 while (remainder > 0) {
399 /* Incomplete last block */
400 if (remainder < block_size) {
401 memcpy(ctx->gcm_remainder, blockp, remainder);
402 ctx->gcm_remainder_len = remainder;
403 /*
404 * not expecting any more ciphertext, just
405 * compute plaintext for the remaining input
406 */
407 gcm_decrypt_incomplete_block(ctx, block_size,
408 processed, encrypt_block, xor_block);
409 ctx->gcm_remainder_len = 0;
410 goto out;
411 }
412 /* add ciphertext to the hash */
413 GHASH(ctx, blockp, ghash, gops);
414
415 /*
416 * Increment counter.
417 * Counter bits are confined to the bottom 32 bits
418 */
419 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
420 counter = htonll(counter + 1);
421 counter &= counter_mask;
422 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
423
424 cbp = (uint8_t *)ctx->gcm_tmp;
425 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
426
427 /* XOR with ciphertext */
428 xor_block(cbp, blockp);
429
430 processed += block_size;
431 blockp += block_size;
432 remainder -= block_size;
433 }
434 out:
435 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
436 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
437 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
438 (uint8_t *)ctx->gcm_J0);
439 xor_block((uint8_t *)ctx->gcm_J0, ghash);
440
441 /* compare the input authentication tag with what we calculated */
442 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
443 /* They don't match */
444 return (CRYPTO_INVALID_MAC);
445 } else {
446 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
447 if (rv != CRYPTO_SUCCESS)
448 return (rv);
449 out->cd_offset += pt_len;
450 }
451 return (CRYPTO_SUCCESS);
452 }
453
454 static int
455 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
456 {
457 size_t tag_len;
458
459 /*
460 * Check the length of the authentication tag (in bits).
461 */
462 tag_len = gcm_param->ulTagBits;
463 switch (tag_len) {
464 case 32:
465 case 64:
466 case 96:
467 case 104:
468 case 112:
469 case 120:
470 case 128:
471 break;
472 default:
473 return (CRYPTO_MECHANISM_PARAM_INVALID);
474 }
475
476 if (gcm_param->ulIvLen == 0)
477 return (CRYPTO_MECHANISM_PARAM_INVALID);
478
479 return (CRYPTO_SUCCESS);
480 }
481
482 static void
483 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
484 gcm_ctx_t *ctx, size_t block_size,
485 void (*copy_block)(uint8_t *, uint8_t *),
486 void (*xor_block)(uint8_t *, uint8_t *))
487 {
488 const gcm_impl_ops_t *gops;
489 uint8_t *cb;
490 ulong_t remainder = iv_len;
491 ulong_t processed = 0;
492 uint8_t *datap, *ghash;
493 uint64_t len_a_len_c[2];
494
495 gops = gcm_impl_get_ops();
496 ghash = (uint8_t *)ctx->gcm_ghash;
497 cb = (uint8_t *)ctx->gcm_cb;
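/*
 * Per the GCM spec, a 96-bit IV forms J0 directly as IV || 0^31 || 1.
 * Any other IV length is zero padded, GHASHed and finished with the
 * 64-bit IV bit length to derive J0.
 */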
498 if (iv_len == 12) {
499 memcpy(cb, iv, 12);
500 cb[12] = 0;
501 cb[13] = 0;
502 cb[14] = 0;
503 cb[15] = 1;
504 /* J0 will be used again in the final */
505 copy_block(cb, (uint8_t *)ctx->gcm_J0);
506 } else {
507 /* GHASH the IV */
508 do {
509 if (remainder < block_size) {
510 memset(cb, 0, block_size);
511 memcpy(cb, &(iv[processed]), remainder);
512 datap = (uint8_t *)cb;
513 remainder = 0;
514 } else {
515 datap = (uint8_t *)(&(iv[processed]));
516 processed += block_size;
517 remainder -= block_size;
518 }
519 GHASH(ctx, datap, ghash, gops);
520 } while (remainder > 0);
521
522 len_a_len_c[0] = 0;
523 len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
524 GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
525
526 /* J0 will be used again in the final */
527 copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
528 }
529 }
530
531 static int
532 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
533 const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
534 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
535 void (*copy_block)(uint8_t *, uint8_t *),
536 void (*xor_block)(uint8_t *, uint8_t *))
537 {
538 const gcm_impl_ops_t *gops;
539 uint8_t *ghash, *datap, *authp;
540 size_t remainder, processed;
541
542 /* encrypt zero block to get subkey H */
543 memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
544 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
545 (uint8_t *)ctx->gcm_H);
546
547 gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
548 copy_block, xor_block);
549
550 gops = gcm_impl_get_ops();
551 authp = (uint8_t *)ctx->gcm_tmp;
552 ghash = (uint8_t *)ctx->gcm_ghash;
553 memset(authp, 0, block_size);
554 memset(ghash, 0, block_size);
555
556 processed = 0;
557 remainder = auth_data_len;
558 do {
559 if (remainder < block_size) {
560 /*
561 * There's not a full block of data, pad the rest of
562 * the buffer with zeros.
563 */
564
565 if (auth_data != NULL) {
566 memset(authp, 0, block_size);
567 memcpy(authp, &(auth_data[processed]),
568 remainder);
569 } else {
570 ASSERT0(remainder);
571 }
572
573 datap = (uint8_t *)authp;
574 remainder = 0;
575 } else {
576 datap = (uint8_t *)(&(auth_data[processed]));
577 processed += block_size;
578 remainder -= block_size;
579 }
580
581 /* add auth data to the hash */
582 GHASH(ctx, datap, ghash, gops);
583
584 } while (remainder > 0);
585
586 return (CRYPTO_SUCCESS);
587 }
588
589 /*
590 * Init the GCM context struct. Handle the cycle and avx implementations here.
591 */
592 int
593 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
594 size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
595 uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
596 void (*xor_block)(uint8_t *, uint8_t *))
597 {
598 CK_AES_GCM_PARAMS *gcm_param;
599 int rv = CRYPTO_SUCCESS;
600 size_t tag_len, iv_len;
601
602 if (param != NULL) {
603 gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
604
605 /* GCM mode. */
606 if ((rv = gcm_validate_args(gcm_param)) != 0) {
607 return (rv);
608 }
609 gcm_ctx->gcm_flags |= GCM_MODE;
610
611 size_t tbits = gcm_param->ulTagBits;
612 tag_len = CRYPTO_BITS2BYTES(tbits);
613 iv_len = gcm_param->ulIvLen;
614
615 gcm_ctx->gcm_tag_len = tag_len;
616 gcm_ctx->gcm_processed_data_len = 0;
617
618 /* these values are in bits */
619 gcm_ctx->gcm_len_a_len_c[0]
620 = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
621 } else {
622 return (CRYPTO_MECHANISM_PARAM_INVALID);
623 }
624
625 const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
626 const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
627 size_t aad_len = gcm_param->ulAADLen;
628
629 #ifdef CAN_USE_GCM_ASM
630 boolean_t needs_bswap =
631 ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
632
633 if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
634 gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
635 } else {
636 /*
637 * Handle the "cycle" implementation by creating avx and
638 * non-avx contexts alternately.
639 */
640 gcm_ctx->gcm_use_avx = gcm_toggle_avx();
641
642 /* The avx impl. doesn't handle byte swapped key schedules. */
643 if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
644 gcm_ctx->gcm_use_avx = B_FALSE;
645 }
646 /*
647 * If this is a GCM context, use the MOVBE and the BSWAP
648 * variants alternately.
649 */
650 if (gcm_ctx->gcm_use_avx == B_TRUE &&
651 zfs_movbe_available() == B_TRUE) {
652 (void) atomic_toggle_boolean_nv(
653 (volatile boolean_t *)&gcm_avx_can_use_movbe);
654 }
655 }
656 /*
657 * We don't handle byte swapped key schedules in the avx code path,
658 * but they could still be created by the aes generic implementation.
659 * Make sure not to use them since we'll corrupt data if we do.
660 */
661 if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
662 gcm_ctx->gcm_use_avx = B_FALSE;
663
664 cmn_err_once(CE_WARN,
665 "ICP: Can't use the aes generic or cycle implementations "
666 "in combination with the gcm avx implementation!");
667 cmn_err_once(CE_WARN,
668 "ICP: Falling back to a compatible implementation, "
669 "aes-gcm performance will likely be degraded.");
670 cmn_err_once(CE_WARN,
671 "ICP: Choose at least the x86_64 aes implementation to "
672 "restore performance.");
673 }
674
675 /* Allocate Htab memory as needed. */
676 if (gcm_ctx->gcm_use_avx == B_TRUE) {
677 size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
678
679 if (htab_len == 0) {
680 return (CRYPTO_MECHANISM_PARAM_INVALID);
681 }
682 gcm_ctx->gcm_htab_len = htab_len;
683 gcm_ctx->gcm_Htable =
684 kmem_alloc(htab_len, KM_SLEEP);
685
686 if (gcm_ctx->gcm_Htable == NULL) {
687 return (CRYPTO_HOST_MEMORY);
688 }
689 }
690 /* Avx and non avx context initialization differs from here on. */
691 if (gcm_ctx->gcm_use_avx == B_FALSE) {
692 #endif /* ifdef CAN_USE_GCM_ASM */
693 if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
694 encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
695 rv = CRYPTO_MECHANISM_PARAM_INVALID;
696 }
697 #ifdef CAN_USE_GCM_ASM
698 } else {
699 if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
700 block_size) != CRYPTO_SUCCESS) {
701 rv = CRYPTO_MECHANISM_PARAM_INVALID;
702 }
703 }
704 #endif /* ifdef CAN_USE_GCM_ASM */
705
706 return (rv);
707 }
708
709 void *
710 gcm_alloc_ctx(int kmflag)
711 {
712 gcm_ctx_t *gcm_ctx;
713
714 if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
715 return (NULL);
716
717 gcm_ctx->gcm_flags = GCM_MODE;
718 return (gcm_ctx);
719 }
720
721 /* GCM implementation that contains the fastest methods */
722 static gcm_impl_ops_t gcm_fastest_impl = {
723 .name = "fastest"
724 };
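/*
 * Note: only the name is set here; gcm_impl_init() copies the fastest
 * supported implementation's ops over this struct and restores the name.
 */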
725
726 /* All compiled in implementations */
727 static const gcm_impl_ops_t *gcm_all_impl[] = {
728 &gcm_generic_impl,
729 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
730 &gcm_pclmulqdq_impl,
731 #endif
732 };
733
734 /* Indicate that initialization (gcm_impl_init) has been completed */
735 static boolean_t gcm_impl_initialized = B_FALSE;
736
737 /* Hold all supported implementations */
738 static size_t gcm_supp_impl_cnt = 0;
739 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
740
741 /*
742 * Returns the GCM operations for encrypt/decrypt/key setup. When a
743 * SIMD implementation is not allowed in the current context, fall
744 * back to the generic implementation.
745 */
746 const gcm_impl_ops_t *
747 gcm_impl_get_ops(void)
748 {
749 if (!kfpu_allowed())
750 return (&gcm_generic_impl);
751
752 const gcm_impl_ops_t *ops = NULL;
753 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
754
755 switch (impl) {
756 case IMPL_FASTEST:
757 ASSERT(gcm_impl_initialized);
758 ops = &gcm_fastest_impl;
759 break;
760 case IMPL_CYCLE:
761 /* Cycle through supported implementations */
762 ASSERT(gcm_impl_initialized);
763 ASSERT3U(gcm_supp_impl_cnt, >, 0);
764 static size_t cycle_impl_idx = 0;
765 size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
766 ops = gcm_supp_impl[idx];
767 break;
768 #ifdef CAN_USE_GCM_ASM
769 case IMPL_AVX:
770 /*
771 * Make sure that we return a valid implementation while
772 * switching to the avx implementation since there still
773 * may be unfinished non-avx contexts around.
774 */
775 ops = &gcm_generic_impl;
776 break;
777 #endif
778 default:
779 ASSERT3U(impl, <, gcm_supp_impl_cnt);
780 ASSERT3U(gcm_supp_impl_cnt, >, 0);
781 if (impl < ARRAY_SIZE(gcm_all_impl))
782 ops = gcm_supp_impl[impl];
783 break;
784 }
785
786 ASSERT3P(ops, !=, NULL);
787
788 return (ops);
789 }
790
791 /*
792 * Initialize all supported implementations.
793 */
794 void
795 gcm_impl_init(void)
796 {
797 gcm_impl_ops_t *curr_impl;
798 int i, c;
799
800 /* Move supported implementations into gcm_supp_impl */
801 for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
802 curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
803
804 if (curr_impl->is_supported())
805 gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
806 }
807 gcm_supp_impl_cnt = c;
808
809 /*
810 * Set the fastest implementation given the assumption that the
811 * hardware accelerated version is the fastest.
812 */
813 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
814 if (gcm_pclmulqdq_impl.is_supported()) {
815 memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
816 sizeof (gcm_fastest_impl));
817 } else
818 #endif
819 {
820 memcpy(&gcm_fastest_impl, &gcm_generic_impl,
821 sizeof (gcm_fastest_impl));
822 }
823
824 strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
825
826 #ifdef CAN_USE_GCM_ASM
827 /*
828 * Use the avx implementation if it's available and the implementation
829 * hasn't changed from its default value of fastest on module load.
830 */
831 if (gcm_avx_will_work()) {
832 #ifdef HAVE_MOVBE
833 if (zfs_movbe_available() == B_TRUE) {
834 atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
835 }
836 #endif
837 if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
838 gcm_set_avx(B_TRUE);
839 }
840 }
841 #endif
842 /* Finish initialization */
843 atomic_swap_32(&icp_gcm_impl, user_sel_impl);
844 gcm_impl_initialized = B_TRUE;
845 }
846
847 static const struct {
848 const char *name;
849 uint32_t sel;
850 } gcm_impl_opts[] = {
851 { "cycle", IMPL_CYCLE },
852 { "fastest", IMPL_FASTEST },
853 #ifdef CAN_USE_GCM_ASM
854 { "avx", IMPL_AVX },
855 #endif
856 };
857
858 /*
859 * Function sets desired gcm implementation.
860 *
861 * If we are called before init(), user preference will be saved in
862 * user_sel_impl, and applied in a later init() call. This occurs when the
863 * module parameter is specified on module load. Otherwise, directly update
864 * icp_gcm_impl.
865 *
866 * @val Name of gcm implementation to use
867 * @param Unused.
868 */
869 int
870 gcm_impl_set(const char *val)
871 {
872 int err = -EINVAL;
873 char req_name[GCM_IMPL_NAME_MAX];
874 uint32_t impl = GCM_IMPL_READ(user_sel_impl);
875 size_t i;
876
877 /* sanitize input */
878 i = strnlen(val, GCM_IMPL_NAME_MAX);
879 if (i == 0 || i >= GCM_IMPL_NAME_MAX)
880 return (err);
881
882 strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
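/* Trim trailing whitespace (e.g. a newline from the module parameter). */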
883 while (i > 0 && isspace(req_name[i-1]))
884 i--;
885 req_name[i] = '\0';
886
887 /* Check mandatory options */
888 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
889 #ifdef CAN_USE_GCM_ASM
890 /* Ignore avx implementation if it won't work. */
891 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
892 continue;
893 }
894 #endif
895 if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
896 impl = gcm_impl_opts[i].sel;
897 err = 0;
898 break;
899 }
900 }
901
902 /* check all supported impl if init() was already called */
903 if (err != 0 && gcm_impl_initialized) {
904 /* check all supported implementations */
905 for (i = 0; i < gcm_supp_impl_cnt; i++) {
906 if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
907 impl = i;
908 err = 0;
909 break;
910 }
911 }
912 }
913 #ifdef CAN_USE_GCM_ASM
914 /*
915 * Use the avx implementation if available and the requested one is
916 * avx or fastest.
917 */
918 if (gcm_avx_will_work() == B_TRUE &&
919 (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
920 gcm_set_avx(B_TRUE);
921 } else {
922 gcm_set_avx(B_FALSE);
923 }
924 #endif
925
926 if (err == 0) {
927 if (gcm_impl_initialized)
928 atomic_swap_32(&icp_gcm_impl, impl);
929 else
930 atomic_swap_32(&user_sel_impl, impl);
931 }
932
933 return (err);
934 }
935
936 #if defined(_KERNEL) && defined(__linux__)
937
938 static int
939 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
940 {
941 return (gcm_impl_set(val));
942 }
943
944 static int
945 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
946 {
947 int i, cnt = 0;
948 char *fmt;
949 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
950
951 /* list mandatory options */
952 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
953 #ifdef CAN_USE_GCM_ASM
954 /* Ignore avx implementation if it won't work. */
955 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
956 continue;
957 }
958 #endif
959 fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
960 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
961 gcm_impl_opts[i].name);
962 }
963
964 /* list all supported implementations */
965 for (i = 0; i < gcm_supp_impl_cnt; i++) {
966 fmt = (i == impl) ? "[%s] " : "%s ";
967 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
968 gcm_supp_impl[i]->name);
969 }
970
971 return (cnt);
972 }
973
974 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
975 NULL, 0644);
976 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
977 #endif /* defined(_KERNEL) && defined(__linux__) */
978
979 #ifdef CAN_USE_GCM_ASM
980 #define GCM_BLOCK_LEN 16
981 /*
982 * The openssl asm routines are 6x aggregated and need that many bytes
983 * at minimum.
984 */
985 #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
986 #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
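/*
 * The encrypt minimum is three times larger: buffers smaller than
 * GCM_AVX_MIN_ENCRYPT_BYTES are not passed to aesni_gcm_encrypt() and are
 * instead processed block by block (presumably matching the minimum input
 * the openssl encrypt routine accepts).
 */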
987 /*
988 * Ensure the chunk size is reasonable since we are allocating a buffer of
989 * up to GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts.
990 */
991 #define GCM_AVX_MAX_CHUNK_SIZE \
992 (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
993
994 /* Clear the FPU registers since they hold sensitive internal state. */
995 #define clear_fpu_regs() clear_fpu_regs_avx()
996 #define GHASH_AVX(ctx, in, len) \
997 gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
998 in, len)
999
1000 #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1001
1002 /* Get the chunk size module parameter. */
1003 #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1004
1005 /*
1006 * Module parameter: number of bytes to process at once while owning the FPU.
1007 * Rounded down to the nearest multiple of GCM_AVX_MIN_DECRYPT_BYTES and
1008 * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
1009 */
1010 static uint32_t gcm_avx_chunk_size =
1011 ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1012
1013 extern void ASMABI clear_fpu_regs_avx(void);
1014 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1015 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1016 const uint32_t pt[4], uint32_t ct[4]);
1017
1018 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1019 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1020 const uint8_t *in, size_t len);
1021
1022 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1023 const void *, uint64_t *, uint64_t *);
1024
1025 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1026 const void *, uint64_t *, uint64_t *);
1027
1028 static inline boolean_t
1029 gcm_avx_will_work(void)
1030 {
1031 /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1032 return (kfpu_allowed() &&
1033 zfs_avx_available() && zfs_aes_available() &&
1034 zfs_pclmulqdq_available());
1035 }
1036
1037 static inline void
1038 gcm_set_avx(boolean_t val)
1039 {
1040 if (gcm_avx_will_work() == B_TRUE) {
1041 atomic_swap_32(&gcm_use_avx, val);
1042 }
1043 }
1044
1045 static inline boolean_t
1046 gcm_toggle_avx(void)
1047 {
1048 if (gcm_avx_will_work() == B_TRUE) {
1049 return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1050 } else {
1051 return (B_FALSE);
1052 }
1053 }
1054
1055 static inline size_t
1056 gcm_simd_get_htab_size(boolean_t simd_mode)
1057 {
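/*
 * Room for 2 * 6 Htable entries of 16 bytes each, as consumed by the
 * 6x aggregated avx GHASH routines (presumably precomputed powers of H).
 */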
1058 switch (simd_mode) {
1059 case B_TRUE:
1060 return (2 * 6 * 2 * sizeof (uint64_t));
1061
1062 default:
1063 return (0);
1064 }
1065 }
1066
1067
1068 /* Increment the GCM counter block by n. */
1069 static inline void
1070 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1071 {
1072 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1073 uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1074
1075 counter = htonll(counter + n);
1076 counter &= counter_mask;
1077 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1078 }
1079
1080 /*
1081 * Encrypt multiple blocks of data in GCM mode.
1082 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1083 * if possible. While processing a chunk the FPU is "locked".
1084 */
1085 static int
1086 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1087 size_t length, crypto_data_t *out, size_t block_size)
1088 {
1089 size_t bleft = length;
1090 size_t need = 0;
1091 size_t done = 0;
1092 uint8_t *datap = (uint8_t *)data;
1093 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1094 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1095 uint64_t *ghash = ctx->gcm_ghash;
1096 uint64_t *cb = ctx->gcm_cb;
1097 uint8_t *ct_buf = NULL;
1098 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1099 int rv = CRYPTO_SUCCESS;
1100
1101 ASSERT(block_size == GCM_BLOCK_LEN);
1102 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1103 B_FALSE);
1104 /*
1105 * If the last call left an incomplete block, try to fill
1106 * it first.
1107 */
1108 if (ctx->gcm_remainder_len > 0) {
1109 need = block_size - ctx->gcm_remainder_len;
1110 if (length < need) {
1111 /* Accumulate bytes here and return. */
1112 memcpy((uint8_t *)ctx->gcm_remainder +
1113 ctx->gcm_remainder_len, datap, length);
1114
1115 ctx->gcm_remainder_len += length;
1116 if (ctx->gcm_copy_to == NULL) {
1117 ctx->gcm_copy_to = datap;
1118 }
1119 return (CRYPTO_SUCCESS);
1120 } else {
1121 /* Complete incomplete block. */
1122 memcpy((uint8_t *)ctx->gcm_remainder +
1123 ctx->gcm_remainder_len, datap, need);
1124
1125 ctx->gcm_copy_to = NULL;
1126 }
1127 }
1128
1129 /* Allocate a buffer to encrypt to if there is enough input. */
1130 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1131 ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1132 if (ct_buf == NULL) {
1133 return (CRYPTO_HOST_MEMORY);
1134 }
1135 }
1136
1137 /* If we completed an incomplete block, encrypt and write it out. */
1138 if (ctx->gcm_remainder_len > 0) {
1139 kfpu_begin();
1140 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1141 (const uint32_t *)cb, (uint32_t *)tmp);
1142
1143 gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1144 GHASH_AVX(ctx, tmp, block_size);
1145 clear_fpu_regs();
1146 kfpu_end();
1147 rv = crypto_put_output_data(tmp, out, block_size);
1148 out->cd_offset += block_size;
1149 gcm_incr_counter_block(ctx);
1150 ctx->gcm_processed_data_len += block_size;
1151 bleft -= need;
1152 datap += need;
1153 ctx->gcm_remainder_len = 0;
1154 }
1155
1156 /* Do the bulk encryption in chunk_size blocks. */
1157 for (; bleft >= chunk_size; bleft -= chunk_size) {
1158 kfpu_begin();
1159 done = aesni_gcm_encrypt(
1160 datap, ct_buf, chunk_size, key, cb, ghash);
1161
1162 clear_fpu_regs();
1163 kfpu_end();
1164 if (done != chunk_size) {
1165 rv = CRYPTO_FAILED;
1166 goto out_nofpu;
1167 }
1168 rv = crypto_put_output_data(ct_buf, out, chunk_size);
1169 if (rv != CRYPTO_SUCCESS) {
1170 goto out_nofpu;
1171 }
1172 out->cd_offset += chunk_size;
1173 datap += chunk_size;
1174 ctx->gcm_processed_data_len += chunk_size;
1175 }
1176 /* Check if we are already done. */
1177 if (bleft == 0) {
1178 goto out_nofpu;
1179 }
1180 /* Bulk encrypt the remaining data. */
1181 kfpu_begin();
1182 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1183 done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1184 if (done == 0) {
1185 rv = CRYPTO_FAILED;
1186 goto out;
1187 }
1188 rv = crypto_put_output_data(ct_buf, out, done);
1189 if (rv != CRYPTO_SUCCESS) {
1190 goto out;
1191 }
1192 out->cd_offset += done;
1193 ctx->gcm_processed_data_len += done;
1194 datap += done;
1195 bleft -= done;
1196
1197 }
1198 /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1199 while (bleft > 0) {
1200 if (bleft < block_size) {
1201 memcpy(ctx->gcm_remainder, datap, bleft);
1202 ctx->gcm_remainder_len = bleft;
1203 ctx->gcm_copy_to = datap;
1204 goto out;
1205 }
1206 /* Encrypt, hash and write out. */
1207 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1208 (const uint32_t *)cb, (uint32_t *)tmp);
1209
1210 gcm_xor_avx(datap, tmp);
1211 GHASH_AVX(ctx, tmp, block_size);
1212 rv = crypto_put_output_data(tmp, out, block_size);
1213 if (rv != CRYPTO_SUCCESS) {
1214 goto out;
1215 }
1216 out->cd_offset += block_size;
1217 gcm_incr_counter_block(ctx);
1218 ctx->gcm_processed_data_len += block_size;
1219 datap += block_size;
1220 bleft -= block_size;
1221 }
1222 out:
1223 clear_fpu_regs();
1224 kfpu_end();
1225 out_nofpu:
1226 if (ct_buf != NULL) {
1227 vmem_free(ct_buf, chunk_size);
1228 }
1229 return (rv);
1230 }
1231
1232 /*
1233 * Finalize the encryption: Zero fill, encrypt, hash and write out any
1234 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1235 */
1236 static int
1237 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1238 {
1239 uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1240 uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1241 uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1242 size_t rem_len = ctx->gcm_remainder_len;
1243 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1244 int aes_rounds = ((aes_key_t *)keysched)->nr;
1245 int rv;
1246
1247 ASSERT(block_size == GCM_BLOCK_LEN);
1248 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1249 B_FALSE);
1250
1251 if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1252 return (CRYPTO_DATA_LEN_RANGE);
1253 }
1254
1255 kfpu_begin();
1256 /* Pad last incomplete block with zeros, encrypt and hash. */
1257 if (rem_len > 0) {
1258 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1259 const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1260
1261 aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1262 memset(remainder + rem_len, 0, block_size - rem_len);
1263 for (int i = 0; i < rem_len; i++) {
1264 remainder[i] ^= tmp[i];
1265 }
1266 GHASH_AVX(ctx, remainder, block_size);
1267 ctx->gcm_processed_data_len += rem_len;
1268 /* No need to increment counter_block, it's the last block. */
1269 }
1270 /* Finish tag. */
1271 ctx->gcm_len_a_len_c[1] =
1272 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1273 GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1274 aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1275
1276 gcm_xor_avx((uint8_t *)J0, ghash);
1277 clear_fpu_regs();
1278 kfpu_end();
1279
1280 /* Output remainder. */
1281 if (rem_len > 0) {
1282 rv = crypto_put_output_data(remainder, out, rem_len);
1283 if (rv != CRYPTO_SUCCESS)
1284 return (rv);
1285 }
1286 out->cd_offset += rem_len;
1287 ctx->gcm_remainder_len = 0;
1288 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1289 if (rv != CRYPTO_SUCCESS)
1290 return (rv);
1291
1292 out->cd_offset += ctx->gcm_tag_len;
1293 return (CRYPTO_SUCCESS);
1294 }
1295
1296 /*
1297 * Finalize decryption: so far we have only accumulated the ciphertext, so
1298 * now decrypt it here in place.
1299 */
1300 static int
1301 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1302 {
1303 ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1304 ASSERT3U(block_size, ==, 16);
1305 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1306 B_FALSE);
1307
1308 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1309 size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1310 uint8_t *datap = ctx->gcm_pt_buf;
1311 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1312 uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1313 uint64_t *ghash = ctx->gcm_ghash;
1314 uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1315 int rv = CRYPTO_SUCCESS;
1316 size_t bleft, done;
1317
1318 /*
1319 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1320 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1321 * GCM_AVX_MIN_DECRYPT_BYTES.
1322 */
1323 for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1324 kfpu_begin();
1325 done = aesni_gcm_decrypt(datap, datap, chunk_size,
1326 (const void *)key, ctx->gcm_cb, ghash);
1327 clear_fpu_regs();
1328 kfpu_end();
1329 if (done != chunk_size) {
1330 return (CRYPTO_FAILED);
1331 }
1332 datap += done;
1333 }
1334 /* Decrypt remainder, which is less than chunk size, in one go. */
1335 kfpu_begin();
1336 if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1337 done = aesni_gcm_decrypt(datap, datap, bleft,
1338 (const void *)key, ctx->gcm_cb, ghash);
1339 if (done == 0) {
1340 clear_fpu_regs();
1341 kfpu_end();
1342 return (CRYPTO_FAILED);
1343 }
1344 datap += done;
1345 bleft -= done;
1346 }
1347 ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1348
1349 /*
1350 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1351 * decrypt them block by block.
1352 */
1353 while (bleft > 0) {
1354 /* Incomplete last block. */
1355 if (bleft < block_size) {
1356 uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1357
1358 memset(lastb, 0, block_size);
1359 memcpy(lastb, datap, bleft);
1360 /* The GCM processing. */
1361 GHASH_AVX(ctx, lastb, block_size);
1362 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1363 for (size_t i = 0; i < bleft; i++) {
1364 datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1365 }
1366 break;
1367 }
1368 /* The GCM processing. */
1369 GHASH_AVX(ctx, datap, block_size);
1370 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1371 gcm_xor_avx((uint8_t *)tmp, datap);
1372 gcm_incr_counter_block(ctx);
1373
1374 datap += block_size;
1375 bleft -= block_size;
1376 }
1377 if (rv != CRYPTO_SUCCESS) {
1378 clear_fpu_regs();
1379 kfpu_end();
1380 return (rv);
1381 }
1382 /* Decryption done, finish the tag. */
1383 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1384 GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1385 aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1386 (uint32_t *)ctx->gcm_J0);
1387
1388 gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1389
1390 /* We are done with the FPU, restore its state. */
1391 clear_fpu_regs();
1392 kfpu_end();
1393
1394 /* Compare the input authentication tag with what we calculated. */
1395 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1396 /* They don't match. */
1397 return (CRYPTO_INVALID_MAC);
1398 }
1399 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1400 if (rv != CRYPTO_SUCCESS) {
1401 return (rv);
1402 }
1403 out->cd_offset += pt_len;
1404 return (CRYPTO_SUCCESS);
1405 }
1406
1407 /*
1408 * Initialize the GCM params H, Htable and the counter block. Save the
1409 * initial counter block.
1410 */
1411 static int
1412 gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1413 const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1414 {
1415 uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1416 uint64_t *H = ctx->gcm_H;
1417 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1418 int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1419 const uint8_t *datap = auth_data;
1420 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1421 size_t bleft;
1422
1423 ASSERT(block_size == GCM_BLOCK_LEN);
1424 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1425 B_FALSE);
1426
1427 /* Init H (encrypt zero block) and create the initial counter block. */
1428 memset(H, 0, sizeof (ctx->gcm_H));
1429 kfpu_begin();
1430 aes_encrypt_intel(keysched, aes_rounds,
1431 (const uint32_t *)H, (uint32_t *)H);
1432
1433 gcm_init_htab_avx(ctx->gcm_Htable, H);
1434
1435 if (iv_len == 12) {
1436 memcpy(cb, iv, 12);
1437 cb[12] = 0;
1438 cb[13] = 0;
1439 cb[14] = 0;
1440 cb[15] = 1;
1441 /* We need the ICB later. */
1442 memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1443 } else {
1444 /*
1445 * Most consumers use 12 byte IVs, so it's OK to use the
1446 * original routines for other IV sizes, just avoid nesting
1447 * kfpu_begin calls.
1448 */
1449 clear_fpu_regs();
1450 kfpu_end();
1451 gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1452 aes_copy_block, aes_xor_block);
1453 kfpu_begin();
1454 }
1455
1456 memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1457
1458 /* OpenSSL post-increments the counter, adjust for that. */
1459 gcm_incr_counter_block(ctx);
1460
1461 /* Ghash AAD in chunk_size blocks. */
1462 for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1463 GHASH_AVX(ctx, datap, chunk_size);
1464 datap += chunk_size;
1465 clear_fpu_regs();
1466 kfpu_end();
1467 kfpu_begin();
1468 }
1469 /* Ghash the remainder and handle possible incomplete GCM block. */
1470 if (bleft > 0) {
1471 size_t incomp = bleft % block_size;
1472
1473 bleft -= incomp;
1474 if (bleft > 0) {
1475 GHASH_AVX(ctx, datap, bleft);
1476 datap += bleft;
1477 }
1478 if (incomp > 0) {
1479 /* Zero pad and hash incomplete last block. */
1480 uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1481
1482 memset(authp, 0, block_size);
1483 memcpy(authp, datap, incomp);
1484 GHASH_AVX(ctx, authp, block_size);
1485 }
1486 }
1487 clear_fpu_regs();
1488 kfpu_end();
1489 return (CRYPTO_SUCCESS);
1490 }
1491
1492 #if defined(_KERNEL)
1493 static int
1494 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1495 {
1496 unsigned long val;
1497 char val_rounded[16];
1498 int error = 0;
1499
1500 error = kstrtoul(buf, 0, &val);
1501 if (error)
1502 return (error);
1503
1504 val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1505
1506 if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1507 return (-EINVAL);
1508
1509 snprintf(val_rounded, 16, "%u", (uint32_t)val);
1510 error = param_set_uint(val_rounded, kp);
1511 return (error);
1512 }
1513
1514 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1515 param_get_uint, &gcm_avx_chunk_size, 0644);
1516
1517 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1518 "How many bytes to process while owning the FPU");
1519
1520 #endif /* defined(_KERNEL) */
1521 #endif /* ifdef CAN_USE_GCM_ASM */
1522