1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/zfs_context.h>
26 #include <sys/cmn_err.h>
27 #include <modes/modes.h>
28 #include <sys/crypto/common.h>
29 #include <sys/crypto/icp.h>
30 #include <sys/crypto/impl.h>
31 #include <sys/byteorder.h>
32 #include <sys/simd.h>
33 #include <modes/gcm_impl.h>
34 #ifdef CAN_USE_GCM_ASM
35 #include <aes/aes_impl.h>
36 #endif
37
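/*
 * Absorb one block into the GHASH state: XOR the data block (d) into the
 * running hash of context (c), then multiply the result by the hash subkey
 * H in GF(2^128) using the selected implementation (o), storing the product
 * in (t). All callers pass the context's own hash (or J0) as (t).
 */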
38 #define GHASH(c, d, t, o) \
39 xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
40 (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
41 (uint64_t *)(void *)(t));
42
43 /* Select GCM implementation */
44 #define IMPL_FASTEST (UINT32_MAX)
45 #define IMPL_CYCLE (UINT32_MAX-1)
46 #ifdef CAN_USE_GCM_ASM
47 #define IMPL_AVX (UINT32_MAX-2)
48 #endif
49 #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
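/*
 * icp_gcm_impl holds the active selection: one of the sentinel values above
 * or an index into gcm_supp_impl[]. user_sel_impl caches a selection made
 * via gcm_impl_set() before gcm_impl_init() has run; init() then promotes
 * it to icp_gcm_impl.
 */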
50 static uint32_t icp_gcm_impl = IMPL_FASTEST;
51 static uint32_t user_sel_impl = IMPL_FASTEST;
52
53 #ifdef CAN_USE_GCM_ASM
54 /* Does the architecture we run on support the MOVBE instruction? */
55 boolean_t gcm_avx_can_use_movbe = B_FALSE;
56 /*
57 * Whether to use the optimized openssl gcm and ghash implementations.
58 * Set to true if module parameter icp_gcm_impl == "avx".
59 */
60 static boolean_t gcm_use_avx = B_FALSE;
61 #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
62
63 extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
64
65 static inline boolean_t gcm_avx_will_work(void);
66 static inline void gcm_set_avx(boolean_t);
67 static inline boolean_t gcm_toggle_avx(void);
68 static inline size_t gcm_simd_get_htab_size(boolean_t);
69
70 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
71 crypto_data_t *, size_t);
72
73 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
74 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
75 static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
76 size_t, size_t);
77 #endif /* ifdef CAN_USE_GCM_ASM */
78
79 /*
80  * Encrypt multiple blocks of data in GCM mode. Decryption for GCM mode
81  * is handled in a separate function.
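 *
 * Partial input is collected in gcm_remainder until a full block is
 * available. Each full block is handled CTR style: the counter block is
 * incremented and encrypted, the result is XORed with the plaintext, and
 * the resulting ciphertext is written out and absorbed into the GHASH.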
82 */
83 int
84 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
85 crypto_data_t *out, size_t block_size,
86 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
87 void (*copy_block)(uint8_t *, uint8_t *),
88 void (*xor_block)(uint8_t *, uint8_t *))
89 {
90 #ifdef CAN_USE_GCM_ASM
91 if (ctx->gcm_use_avx == B_TRUE)
92 return (gcm_mode_encrypt_contiguous_blocks_avx(
93 ctx, data, length, out, block_size));
94 #endif
95
96 const gcm_impl_ops_t *gops;
97 size_t remainder = length;
98 size_t need = 0;
99 uint8_t *datap = (uint8_t *)data;
100 uint8_t *blockp;
101 uint8_t *lastp;
102 void *iov_or_mp;
103 offset_t offset;
104 uint8_t *out_data_1;
105 uint8_t *out_data_2;
106 size_t out_data_1_len;
107 uint64_t counter;
108 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
109
110 if (length + ctx->gcm_remainder_len < block_size) {
111 /* accumulate bytes here and return */
112 memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
113 datap,
114 length);
115 ctx->gcm_remainder_len += length;
116 if (ctx->gcm_copy_to == NULL) {
117 ctx->gcm_copy_to = datap;
118 }
119 return (CRYPTO_SUCCESS);
120 }
121
122 crypto_init_ptrs(out, &iov_or_mp, &offset);
123
124 gops = gcm_impl_get_ops();
125 do {
126 /* Unprocessed data from last call. */
127 if (ctx->gcm_remainder_len > 0) {
128 need = block_size - ctx->gcm_remainder_len;
129
130 if (need > remainder)
131 return (CRYPTO_DATA_LEN_RANGE);
132
133 memcpy(&((uint8_t *)ctx->gcm_remainder)
134 [ctx->gcm_remainder_len], datap, need);
135
136 blockp = (uint8_t *)ctx->gcm_remainder;
137 } else {
138 blockp = datap;
139 }
140
141 /*
142 * Increment counter. Counter bits are confined
143 * to the bottom 32 bits of the counter block.
144 */
145 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
146 counter = htonll(counter + 1);
147 counter &= counter_mask;
148 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
149
150 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
151 (uint8_t *)ctx->gcm_tmp);
152 xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
153
154 lastp = (uint8_t *)ctx->gcm_tmp;
155
156 ctx->gcm_processed_data_len += block_size;
157
158 crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
159 &out_data_1_len, &out_data_2, block_size);
160
161 /* copy block to where it belongs */
162 if (out_data_1_len == block_size) {
163 copy_block(lastp, out_data_1);
164 } else {
165 memcpy(out_data_1, lastp, out_data_1_len);
166 if (out_data_2 != NULL) {
167 memcpy(out_data_2,
168 lastp + out_data_1_len,
169 block_size - out_data_1_len);
170 }
171 }
172 /* update offset */
173 out->cd_offset += block_size;
174
175 /* add ciphertext to the hash */
176 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
177
178 /* Update pointer to next block of data to be processed. */
179 if (ctx->gcm_remainder_len != 0) {
180 datap += need;
181 ctx->gcm_remainder_len = 0;
182 } else {
183 datap += block_size;
184 }
185
186 remainder = (size_t)&data[length] - (size_t)datap;
187
188 /* Incomplete last block. */
189 if (remainder > 0 && remainder < block_size) {
190 memcpy(ctx->gcm_remainder, datap, remainder);
191 ctx->gcm_remainder_len = remainder;
192 ctx->gcm_copy_to = datap;
193 goto out;
194 }
195 ctx->gcm_copy_to = NULL;
196
197 } while (remainder > 0);
198 out:
199 return (CRYPTO_SUCCESS);
200 }
201
202 int
203 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
204 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
205 void (*copy_block)(uint8_t *, uint8_t *),
206 void (*xor_block)(uint8_t *, uint8_t *))
207 {
208 (void) copy_block;
209 #ifdef CAN_USE_GCM_ASM
210 if (ctx->gcm_use_avx == B_TRUE)
211 return (gcm_encrypt_final_avx(ctx, out, block_size));
212 #endif
213
214 const gcm_impl_ops_t *gops;
215 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
216 uint8_t *ghash, *macp = NULL;
217 int i, rv;
218
219 if (out->cd_length <
220 (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
221 return (CRYPTO_DATA_LEN_RANGE);
222 }
223
224 gops = gcm_impl_get_ops();
225 ghash = (uint8_t *)ctx->gcm_ghash;
226
227 if (ctx->gcm_remainder_len > 0) {
228 uint64_t counter;
229 uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
230
231 /*
232 * Here is where we deal with data that is not a
233 * multiple of the block size.
234 */
235
236 /*
237 * Increment counter.
238 */
239 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
240 counter = htonll(counter + 1);
241 counter &= counter_mask;
242 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
243
244 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
245 (uint8_t *)ctx->gcm_tmp);
246
247 macp = (uint8_t *)ctx->gcm_remainder;
248 memset(macp + ctx->gcm_remainder_len, 0,
249 block_size - ctx->gcm_remainder_len);
250
251 /* XOR with counter block */
252 for (i = 0; i < ctx->gcm_remainder_len; i++) {
253 macp[i] ^= tmpp[i];
254 }
255
256 /* add ciphertext to the hash */
257 GHASH(ctx, macp, ghash, gops);
258
259 ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
260 }
261
262 ctx->gcm_len_a_len_c[1] =
263 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
264 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
265 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
266 (uint8_t *)ctx->gcm_J0);
267 xor_block((uint8_t *)ctx->gcm_J0, ghash);
268
269 if (ctx->gcm_remainder_len > 0) {
270 rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
271 if (rv != CRYPTO_SUCCESS)
272 return (rv);
273 }
274 out->cd_offset += ctx->gcm_remainder_len;
275 ctx->gcm_remainder_len = 0;
276 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
277 if (rv != CRYPTO_SUCCESS)
278 return (rv);
279 out->cd_offset += ctx->gcm_tag_len;
280
281 return (CRYPTO_SUCCESS);
282 }
283
284 /*
285  * This only deals with decrypting the last block of the input, which
286  * might not be a multiple of the block length.
287 */
288 static void
289 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
290 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
291 void (*xor_block)(uint8_t *, uint8_t *))
292 {
293 uint8_t *datap, *outp, *counterp;
294 uint64_t counter;
295 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
296 int i;
297
298 /*
299 * Increment counter.
300 * Counter bits are confined to the bottom 32 bits
301 */
302 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
303 counter = htonll(counter + 1);
304 counter &= counter_mask;
305 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
306
307 datap = (uint8_t *)ctx->gcm_remainder;
308 outp = &((ctx->gcm_pt_buf)[index]);
309 counterp = (uint8_t *)ctx->gcm_tmp;
310
311 	/* zero pad the incomplete last ciphertext block before hashing it */
312 memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
313 memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
314
315 /* add ciphertext to the hash */
316 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
317
318 /* decrypt remaining ciphertext */
319 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
320
321 /* XOR with counter block */
322 for (i = 0; i < ctx->gcm_remainder_len; i++) {
323 outp[i] = datap[i] ^ counterp[i];
324 }
325 }
326
327 int
328 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
329 crypto_data_t *out, size_t block_size,
330 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
331 void (*copy_block)(uint8_t *, uint8_t *),
332 void (*xor_block)(uint8_t *, uint8_t *))
333 {
334 (void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
335 (void) xor_block;
336 size_t new_len;
337 uint8_t *new;
338
339 /*
340  * Copy contiguous ciphertext input blocks to the plaintext buffer.
341  * The ciphertext will be decrypted in the final call.
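 * The authentication tag trails the ciphertext, so decryption has to wait
 * until gcm_decrypt_final(), when the total length and therefore the tag
 * boundary are known.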
342 */
343 if (length > 0) {
344 new_len = ctx->gcm_pt_buf_len + length;
345 new = vmem_alloc(new_len, KM_SLEEP);
346 if (new == NULL) {
347 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
348 ctx->gcm_pt_buf = NULL;
349 return (CRYPTO_HOST_MEMORY);
350 }
351
352 if (ctx->gcm_pt_buf != NULL) {
353 memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
354 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
355 } else {
356 ASSERT0(ctx->gcm_pt_buf_len);
357 }
358
359 ctx->gcm_pt_buf = new;
360 ctx->gcm_pt_buf_len = new_len;
361 memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
362 length);
363 ctx->gcm_processed_data_len += length;
364 }
365
366 ctx->gcm_remainder_len = 0;
367 return (CRYPTO_SUCCESS);
368 }
369
370 int
371 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
372 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
373 void (*xor_block)(uint8_t *, uint8_t *))
374 {
375 #ifdef CAN_USE_GCM_ASM
376 if (ctx->gcm_use_avx == B_TRUE)
377 return (gcm_decrypt_final_avx(ctx, out, block_size));
378 #endif
379
380 const gcm_impl_ops_t *gops;
381 size_t pt_len;
382 size_t remainder;
383 uint8_t *ghash;
384 uint8_t *blockp;
385 uint8_t *cbp;
386 uint64_t counter;
387 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
388 int processed = 0, rv;
389
390 ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
391
392 gops = gcm_impl_get_ops();
393 pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
394 ghash = (uint8_t *)ctx->gcm_ghash;
395 blockp = ctx->gcm_pt_buf;
396 remainder = pt_len;
397 while (remainder > 0) {
398 /* Incomplete last block */
399 if (remainder < block_size) {
400 memcpy(ctx->gcm_remainder, blockp, remainder);
401 ctx->gcm_remainder_len = remainder;
402 /*
403 	 * not expecting any more ciphertext, just
404 	 * compute the plaintext for the remaining input
405 */
406 gcm_decrypt_incomplete_block(ctx, block_size,
407 processed, encrypt_block, xor_block);
408 ctx->gcm_remainder_len = 0;
409 goto out;
410 }
411 /* add ciphertext to the hash */
412 GHASH(ctx, blockp, ghash, gops);
413
414 /*
415 * Increment counter.
416 * Counter bits are confined to the bottom 32 bits
417 */
418 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
419 counter = htonll(counter + 1);
420 counter &= counter_mask;
421 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
422
423 cbp = (uint8_t *)ctx->gcm_tmp;
424 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
425
426 /* XOR with ciphertext */
427 xor_block(cbp, blockp);
428
429 processed += block_size;
430 blockp += block_size;
431 remainder -= block_size;
432 }
433 out:
434 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
435 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
436 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
437 (uint8_t *)ctx->gcm_J0);
438 xor_block((uint8_t *)ctx->gcm_J0, ghash);
439
440 /* compare the input authentication tag with what we calculated */
441 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
442 /* They don't match */
443 return (CRYPTO_INVALID_MAC);
444 } else {
445 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
446 if (rv != CRYPTO_SUCCESS)
447 return (rv);
448 out->cd_offset += pt_len;
449 }
450 return (CRYPTO_SUCCESS);
451 }
452
453 static int
454 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
455 {
456 size_t tag_len;
457
458 /*
459 * Check the length of the authentication tag (in bits).
460 */
461 tag_len = gcm_param->ulTagBits;
462 switch (tag_len) {
463 case 32:
464 case 64:
465 case 96:
466 case 104:
467 case 112:
468 case 120:
469 case 128:
470 break;
471 default:
472 return (CRYPTO_MECHANISM_PARAM_INVALID);
473 }
474
475 if (gcm_param->ulIvLen == 0)
476 return (CRYPTO_MECHANISM_PARAM_INVALID);
477
478 return (CRYPTO_SUCCESS);
479 }
480
481 static void
482 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
483 gcm_ctx_t *ctx, size_t block_size,
484 void (*copy_block)(uint8_t *, uint8_t *),
485 void (*xor_block)(uint8_t *, uint8_t *))
486 {
487 const gcm_impl_ops_t *gops;
488 uint8_t *cb;
489 ulong_t remainder = iv_len;
490 ulong_t processed = 0;
491 uint8_t *datap, *ghash;
492 uint64_t len_a_len_c[2];
493
494 gops = gcm_impl_get_ops();
495 ghash = (uint8_t *)ctx->gcm_ghash;
496 cb = (uint8_t *)ctx->gcm_cb;
497 if (iv_len == 12) {
498 memcpy(cb, iv, 12);
499 cb[12] = 0;
500 cb[13] = 0;
501 cb[14] = 0;
502 cb[15] = 1;
503 /* J0 will be used again in the final */
504 copy_block(cb, (uint8_t *)ctx->gcm_J0);
505 } else {
506 /* GHASH the IV */
507 do {
508 if (remainder < block_size) {
509 memset(cb, 0, block_size);
510 memcpy(cb, &(iv[processed]), remainder);
511 datap = (uint8_t *)cb;
512 remainder = 0;
513 } else {
514 datap = (uint8_t *)(&(iv[processed]));
515 processed += block_size;
516 remainder -= block_size;
517 }
518 GHASH(ctx, datap, ghash, gops);
519 } while (remainder > 0);
520
521 len_a_len_c[0] = 0;
522 len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
523 GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
524
525 /* J0 will be used again in the final */
526 copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
527 }
528 }
529
530 static int
531 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
532 const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
533 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
534 void (*copy_block)(uint8_t *, uint8_t *),
535 void (*xor_block)(uint8_t *, uint8_t *))
536 {
537 const gcm_impl_ops_t *gops;
538 uint8_t *ghash, *datap, *authp;
539 size_t remainder, processed;
540
541 /* encrypt zero block to get subkey H */
542 memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
543 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
544 (uint8_t *)ctx->gcm_H);
545
546 gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
547 copy_block, xor_block);
548
549 gops = gcm_impl_get_ops();
550 authp = (uint8_t *)ctx->gcm_tmp;
551 ghash = (uint8_t *)ctx->gcm_ghash;
552 memset(authp, 0, block_size);
553 memset(ghash, 0, block_size);
554
555 processed = 0;
556 remainder = auth_data_len;
557 do {
558 if (remainder < block_size) {
559 /*
560 	 * There's not a full block of data; pad the rest of
561 	 * the buffer with zeros.
562 */
563
564 if (auth_data != NULL) {
565 memset(authp, 0, block_size);
566 memcpy(authp, &(auth_data[processed]),
567 remainder);
568 } else {
569 ASSERT0(remainder);
570 }
571
572 datap = (uint8_t *)authp;
573 remainder = 0;
574 } else {
575 datap = (uint8_t *)(&(auth_data[processed]));
576 processed += block_size;
577 remainder -= block_size;
578 }
579
580 /* add auth data to the hash */
581 GHASH(ctx, datap, ghash, gops);
582
583 } while (remainder > 0);
584
585 return (CRYPTO_SUCCESS);
586 }
587
588 /*
589 * Init the GCM context struct. Handle the cycle and avx implementations here.
590 */
591 int
592 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
593 size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
594 uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
595 void (*xor_block)(uint8_t *, uint8_t *))
596 {
597 CK_AES_GCM_PARAMS *gcm_param;
598 int rv = CRYPTO_SUCCESS;
599 size_t tag_len, iv_len;
600
601 if (param != NULL) {
602 gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
603
604 /* GCM mode. */
605 if ((rv = gcm_validate_args(gcm_param)) != 0) {
606 return (rv);
607 }
608 gcm_ctx->gcm_flags |= GCM_MODE;
609
610 size_t tbits = gcm_param->ulTagBits;
611 tag_len = CRYPTO_BITS2BYTES(tbits);
612 iv_len = gcm_param->ulIvLen;
613
614 gcm_ctx->gcm_tag_len = tag_len;
615 gcm_ctx->gcm_processed_data_len = 0;
616
617 /* these values are in bits */
618 gcm_ctx->gcm_len_a_len_c[0]
619 = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
620 } else {
621 return (CRYPTO_MECHANISM_PARAM_INVALID);
622 }
623
624 const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
625 const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
626 size_t aad_len = gcm_param->ulAADLen;
627
628 #ifdef CAN_USE_GCM_ASM
629 boolean_t needs_bswap =
630 ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
631
632 if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
633 gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
634 } else {
635 /*
636 * Handle the "cycle" implementation by creating avx and
637 * non-avx contexts alternately.
638 */
639 gcm_ctx->gcm_use_avx = gcm_toggle_avx();
640
641 /* The avx impl. doesn't handle byte swapped key schedules. */
642 if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
643 gcm_ctx->gcm_use_avx = B_FALSE;
644 }
645 /*
646 * If this is a GCM context, use the MOVBE and the BSWAP
647 * variants alternately.
648 */
649 if (gcm_ctx->gcm_use_avx == B_TRUE &&
650 zfs_movbe_available() == B_TRUE) {
651 (void) atomic_toggle_boolean_nv(
652 (volatile boolean_t *)&gcm_avx_can_use_movbe);
653 }
654 }
655 /*
656  * but they could still be created by the aes generic implementation.
657 * still they could be created by the aes generic implementation.
658 * Make sure not to use them since we'll corrupt data if we do.
659 */
660 if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
661 gcm_ctx->gcm_use_avx = B_FALSE;
662
663 cmn_err_once(CE_WARN,
664 "ICP: Can't use the aes generic or cycle implementations "
665 "in combination with the gcm avx implementation!");
666 cmn_err_once(CE_WARN,
667 "ICP: Falling back to a compatible implementation, "
668 "aes-gcm performance will likely be degraded.");
669 cmn_err_once(CE_WARN,
670 "ICP: Choose at least the x86_64 aes implementation to "
671 "restore performance.");
672 }
673
674 /* Allocate Htab memory as needed. */
675 if (gcm_ctx->gcm_use_avx == B_TRUE) {
676 size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
677
678 if (htab_len == 0) {
679 return (CRYPTO_MECHANISM_PARAM_INVALID);
680 }
681 gcm_ctx->gcm_htab_len = htab_len;
682 gcm_ctx->gcm_Htable =
683 kmem_alloc(htab_len, KM_SLEEP);
684
685 if (gcm_ctx->gcm_Htable == NULL) {
686 return (CRYPTO_HOST_MEMORY);
687 }
688 }
689 /* Avx and non avx context initialization differs from here on. */
690 if (gcm_ctx->gcm_use_avx == B_FALSE) {
691 #endif /* ifdef CAN_USE_GCM_ASM */
692 if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
693 encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
694 rv = CRYPTO_MECHANISM_PARAM_INVALID;
695 }
696 #ifdef CAN_USE_GCM_ASM
697 } else {
698 if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
699 block_size) != CRYPTO_SUCCESS) {
700 rv = CRYPTO_MECHANISM_PARAM_INVALID;
701 }
702 }
703 #endif /* ifdef CAN_USE_GCM_ASM */
704
705 return (rv);
706 }
707
708 void *
709 gcm_alloc_ctx(int kmflag)
710 {
711 gcm_ctx_t *gcm_ctx;
712
713 if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
714 return (NULL);
715
716 gcm_ctx->gcm_flags = GCM_MODE;
717 return (gcm_ctx);
718 }
719
720 /* GCM implementation that contains the fastest methods */
721 static gcm_impl_ops_t gcm_fastest_impl = {
722 .name = "fastest"
723 };
724
725 /* All compiled in implementations */
726 static const gcm_impl_ops_t *gcm_all_impl[] = {
727 &gcm_generic_impl,
728 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
729 &gcm_pclmulqdq_impl,
730 #endif
731 };
732
733 /* Indicate that the implementation initialization has been completed */
734 static boolean_t gcm_impl_initialized = B_FALSE;
735
736 /* Hold all supported implementations */
737 static size_t gcm_supp_impl_cnt = 0;
738 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
739
740 /*
741 * Returns the GCM operations for encrypt/decrypt/key setup. When a
742  * SIMD implementation is not allowed in the current context, fall
743  * back to the generic implementation.
744 */
745 const gcm_impl_ops_t *
746 gcm_impl_get_ops(void)
747 {
748 if (!kfpu_allowed())
749 return (&gcm_generic_impl);
750
751 const gcm_impl_ops_t *ops = NULL;
752 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
753
754 switch (impl) {
755 case IMPL_FASTEST:
756 ASSERT(gcm_impl_initialized);
757 ops = &gcm_fastest_impl;
758 break;
759 case IMPL_CYCLE:
760 /* Cycle through supported implementations */
761 ASSERT(gcm_impl_initialized);
762 ASSERT3U(gcm_supp_impl_cnt, >, 0);
763 static size_t cycle_impl_idx = 0;
764 size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
765 ops = gcm_supp_impl[idx];
766 break;
767 #ifdef CAN_USE_GCM_ASM
768 case IMPL_AVX:
769 /*
770 * Make sure that we return a valid implementation while
771 * switching to the avx implementation since there still
772 * may be unfinished non-avx contexts around.
773 */
774 ops = &gcm_generic_impl;
775 break;
776 #endif
777 default:
778 ASSERT3U(impl, <, gcm_supp_impl_cnt);
779 ASSERT3U(gcm_supp_impl_cnt, >, 0);
780 if (impl < ARRAY_SIZE(gcm_all_impl))
781 ops = gcm_supp_impl[impl];
782 break;
783 }
784
785 ASSERT3P(ops, !=, NULL);
786
787 return (ops);
788 }
789
790 /*
791 * Initialize all supported implementations.
792 */
793 void
794 gcm_impl_init(void)
795 {
796 gcm_impl_ops_t *curr_impl;
797 int i, c;
798
799 	/* Move supported implementations into gcm_supp_impl */
800 for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
801 curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
802
803 if (curr_impl->is_supported())
804 gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
805 }
806 gcm_supp_impl_cnt = c;
807
808 /*
809 * Set the fastest implementation given the assumption that the
810 * hardware accelerated version is the fastest.
811 */
812 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
813 if (gcm_pclmulqdq_impl.is_supported()) {
814 memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
815 sizeof (gcm_fastest_impl));
816 } else
817 #endif
818 {
819 memcpy(&gcm_fastest_impl, &gcm_generic_impl,
820 sizeof (gcm_fastest_impl));
821 }
822
823 strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
824
825 #ifdef CAN_USE_GCM_ASM
826 /*
827 * Use the avx implementation if it's available and the implementation
828 * hasn't changed from its default value of fastest on module load.
829 */
830 if (gcm_avx_will_work()) {
831 #ifdef HAVE_MOVBE
832 if (zfs_movbe_available() == B_TRUE) {
833 atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
834 }
835 #endif
836 if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
837 gcm_set_avx(B_TRUE);
838 }
839 }
840 #endif
841 /* Finish initialization */
842 atomic_swap_32(&icp_gcm_impl, user_sel_impl);
843 gcm_impl_initialized = B_TRUE;
844 }
845
846 static const struct {
847 const char *name;
848 uint32_t sel;
849 } gcm_impl_opts[] = {
850 { "cycle", IMPL_CYCLE },
851 { "fastest", IMPL_FASTEST },
852 #ifdef CAN_USE_GCM_ASM
853 { "avx", IMPL_AVX },
854 #endif
855 };
856
857 /*
858  * Set the desired gcm implementation.
859 *
860  * If we are called before init(), the user preference will be saved in
861  * user_sel_impl and applied in a later init() call. This occurs when the
862  * module parameter is specified at module load. Otherwise, directly update
863 * icp_gcm_impl.
864 *
865 * @val Name of gcm implementation to use
866 * @param Unused.
867 */
868 int
869 gcm_impl_set(const char *val)
870 {
871 int err = -EINVAL;
872 char req_name[GCM_IMPL_NAME_MAX];
873 uint32_t impl = GCM_IMPL_READ(user_sel_impl);
874 size_t i;
875
876 /* sanitize input */
877 i = strnlen(val, GCM_IMPL_NAME_MAX);
878 if (i == 0 || i >= GCM_IMPL_NAME_MAX)
879 return (err);
880
881 strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
882 while (i > 0 && isspace(req_name[i-1]))
883 i--;
884 req_name[i] = '\0';
885
886 /* Check mandatory options */
887 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
888 #ifdef CAN_USE_GCM_ASM
889 /* Ignore avx implementation if it won't work. */
890 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
891 continue;
892 }
893 #endif
894 if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
895 impl = gcm_impl_opts[i].sel;
896 err = 0;
897 break;
898 }
899 }
900
901 /* check all supported impl if init() was already called */
902 if (err != 0 && gcm_impl_initialized) {
903 /* check all supported implementations */
904 for (i = 0; i < gcm_supp_impl_cnt; i++) {
905 if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
906 impl = i;
907 err = 0;
908 break;
909 }
910 }
911 }
912 #ifdef CAN_USE_GCM_ASM
913 /*
914 * Use the avx implementation if available and the requested one is
915 * avx or fastest.
916 */
917 if (gcm_avx_will_work() == B_TRUE &&
918 (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
919 gcm_set_avx(B_TRUE);
920 } else {
921 gcm_set_avx(B_FALSE);
922 }
923 #endif
924
925 if (err == 0) {
926 if (gcm_impl_initialized)
927 atomic_swap_32(&icp_gcm_impl, impl);
928 else
929 atomic_swap_32(&user_sel_impl, impl);
930 }
931
932 return (err);
933 }
934
935 #if defined(_KERNEL) && defined(__linux__)
936
937 static int
938 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
939 {
940 return (gcm_impl_set(val));
941 }
942
943 static int
944 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
945 {
946 int i, cnt = 0;
947 char *fmt;
948 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
949
950 ASSERT(gcm_impl_initialized);
951
952 /* list mandatory options */
953 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
954 #ifdef CAN_USE_GCM_ASM
955 /* Ignore avx implementation if it won't work. */
956 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
957 continue;
958 }
959 #endif
960 fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
961 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
962 gcm_impl_opts[i].name);
963 }
964
965 /* list all supported implementations */
966 for (i = 0; i < gcm_supp_impl_cnt; i++) {
967 fmt = (i == impl) ? "[%s] " : "%s ";
968 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
969 gcm_supp_impl[i]->name);
970 }
971
972 return (cnt);
973 }
974
975 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
976 NULL, 0644);
977 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
978 #endif /* defined(_KERNEL) && defined(__linux__) */
979
980 #ifdef CAN_USE_GCM_ASM
981 #define GCM_BLOCK_LEN 16
982 /*
983 * The openssl asm routines are 6x aggregated and need that many bytes
984 * at minimum.
985 */
986 #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
987 #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
988 /*
989  * Ensure the chunk size is reasonable since we are allocating a buffer of
990  * up to GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts.
991 */
992 #define GCM_AVX_MAX_CHUNK_SIZE \
993 (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
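/* With GCM_AVX_MIN_DECRYPT_BYTES == 96 this works out to 131040 bytes. */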
994
995 /* Clear the FPU registers since they hold sensitive internal state. */
996 #define clear_fpu_regs() clear_fpu_regs_avx()
997 #define GHASH_AVX(ctx, in, len) \
998 gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
999 in, len)
1000
1001 #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1002
1003 /* Get the chunk size module parameter. */
1004 #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1005
1006 /*
1007 * Module parameter: number of bytes to process at once while owning the FPU.
1008  * Rounded down to the nearest GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
1009  * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
1010 */
1011 static uint32_t gcm_avx_chunk_size =
1012 ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
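/* The 32 KiB default rounds down to (32768 / 96) * 96 = 32736 bytes. */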
1013
1014 extern void ASMABI clear_fpu_regs_avx(void);
1015 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1016 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1017 const uint32_t pt[4], uint32_t ct[4]);
1018
1019 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1020 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1021 const uint8_t *in, size_t len);
1022
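/*
 * The OpenSSL bulk routines below return the number of bytes they actually
 * processed, a multiple of the 6-block aggregation width; the callers
 * handle any remaining tail block by block.
 */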
1023 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1024 const void *, uint64_t *, uint64_t *);
1025
1026 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1027 const void *, uint64_t *, uint64_t *);
1028
1029 static inline boolean_t
1030 gcm_avx_will_work(void)
1031 {
1032 /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1033 return (kfpu_allowed() &&
1034 zfs_avx_available() && zfs_aes_available() &&
1035 zfs_pclmulqdq_available());
1036 }
1037
1038 static inline void
1039 gcm_set_avx(boolean_t val)
1040 {
1041 if (gcm_avx_will_work() == B_TRUE) {
1042 atomic_swap_32(&gcm_use_avx, val);
1043 }
1044 }
1045
1046 static inline boolean_t
1047 gcm_toggle_avx(void)
1048 {
1049 if (gcm_avx_will_work() == B_TRUE) {
1050 return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1051 } else {
1052 return (B_FALSE);
1053 }
1054 }
1055
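/*
 * Size of the hash subkey table required by the SIMD code path:
 * 2 * 6 * 2 64-bit words, i.e. twelve 16-byte entries, filled by
 * gcm_init_htab_avx() and consumed by gcm_ghash_avx().
 */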
1056 static inline size_t
1057 gcm_simd_get_htab_size(boolean_t simd_mode)
1058 {
1059 switch (simd_mode) {
1060 case B_TRUE:
1061 return (2 * 6 * 2 * sizeof (uint64_t));
1062
1063 default:
1064 return (0);
1065 }
1066 }
1067
1068
1069 /* Increment the GCM counter block by n. */
1070 static inline void
1071 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1072 {
1073 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1074 uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1075
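	/*
	 * The 32-bit counter occupies the last four bytes of the counter block
	 * and is stored big-endian. counter_mask selects those bytes regardless
	 * of host endianness; the ntohll/htonll pair converts them to host
	 * order for the addition and back to block byte order afterwards.
	 */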
1076 counter = htonll(counter + n);
1077 counter &= counter_mask;
1078 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1079 }
1080
1081 /*
1082 * Encrypt multiple blocks of data in GCM mode.
1083 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1084 * if possible. While processing a chunk the FPU is "locked".
1085 */
1086 static int
1087 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1088 size_t length, crypto_data_t *out, size_t block_size)
1089 {
1090 size_t bleft = length;
1091 size_t need = 0;
1092 size_t done = 0;
1093 uint8_t *datap = (uint8_t *)data;
1094 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1095 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1096 uint64_t *ghash = ctx->gcm_ghash;
1097 uint64_t *cb = ctx->gcm_cb;
1098 uint8_t *ct_buf = NULL;
1099 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1100 int rv = CRYPTO_SUCCESS;
1101
1102 ASSERT(block_size == GCM_BLOCK_LEN);
1103 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1104 B_FALSE);
1105 /*
1106 * If the last call left an incomplete block, try to fill
1107 * it first.
1108 */
1109 if (ctx->gcm_remainder_len > 0) {
1110 need = block_size - ctx->gcm_remainder_len;
1111 if (length < need) {
1112 /* Accumulate bytes here and return. */
1113 memcpy((uint8_t *)ctx->gcm_remainder +
1114 ctx->gcm_remainder_len, datap, length);
1115
1116 ctx->gcm_remainder_len += length;
1117 if (ctx->gcm_copy_to == NULL) {
1118 ctx->gcm_copy_to = datap;
1119 }
1120 return (CRYPTO_SUCCESS);
1121 } else {
1122 /* Complete incomplete block. */
1123 memcpy((uint8_t *)ctx->gcm_remainder +
1124 ctx->gcm_remainder_len, datap, need);
1125
1126 ctx->gcm_copy_to = NULL;
1127 }
1128 }
1129
1130 /* Allocate a buffer to encrypt to if there is enough input. */
1131 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1132 ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1133 if (ct_buf == NULL) {
1134 return (CRYPTO_HOST_MEMORY);
1135 }
1136 }
1137
1138 /* If we completed an incomplete block, encrypt and write it out. */
1139 if (ctx->gcm_remainder_len > 0) {
1140 kfpu_begin();
1141 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1142 (const uint32_t *)cb, (uint32_t *)tmp);
1143
1144 gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1145 GHASH_AVX(ctx, tmp, block_size);
1146 clear_fpu_regs();
1147 kfpu_end();
1148 rv = crypto_put_output_data(tmp, out, block_size);
1149 out->cd_offset += block_size;
1150 gcm_incr_counter_block(ctx);
1151 ctx->gcm_processed_data_len += block_size;
1152 bleft -= need;
1153 datap += need;
1154 ctx->gcm_remainder_len = 0;
1155 }
1156
1157 /* Do the bulk encryption in chunk_size blocks. */
1158 for (; bleft >= chunk_size; bleft -= chunk_size) {
1159 kfpu_begin();
1160 done = aesni_gcm_encrypt(
1161 datap, ct_buf, chunk_size, key, cb, ghash);
1162
1163 clear_fpu_regs();
1164 kfpu_end();
1165 if (done != chunk_size) {
1166 rv = CRYPTO_FAILED;
1167 goto out_nofpu;
1168 }
1169 rv = crypto_put_output_data(ct_buf, out, chunk_size);
1170 if (rv != CRYPTO_SUCCESS) {
1171 goto out_nofpu;
1172 }
1173 out->cd_offset += chunk_size;
1174 datap += chunk_size;
1175 ctx->gcm_processed_data_len += chunk_size;
1176 }
1177 /* Check if we are already done. */
1178 if (bleft == 0) {
1179 goto out_nofpu;
1180 }
1181 /* Bulk encrypt the remaining data. */
1182 kfpu_begin();
1183 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1184 done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1185 if (done == 0) {
1186 rv = CRYPTO_FAILED;
1187 goto out;
1188 }
1189 rv = crypto_put_output_data(ct_buf, out, done);
1190 if (rv != CRYPTO_SUCCESS) {
1191 goto out;
1192 }
1193 out->cd_offset += done;
1194 ctx->gcm_processed_data_len += done;
1195 datap += done;
1196 bleft -= done;
1197
1198 }
1199 /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1200 while (bleft > 0) {
1201 if (bleft < block_size) {
1202 memcpy(ctx->gcm_remainder, datap, bleft);
1203 ctx->gcm_remainder_len = bleft;
1204 ctx->gcm_copy_to = datap;
1205 goto out;
1206 }
1207 /* Encrypt, hash and write out. */
1208 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1209 (const uint32_t *)cb, (uint32_t *)tmp);
1210
1211 gcm_xor_avx(datap, tmp);
1212 GHASH_AVX(ctx, tmp, block_size);
1213 rv = crypto_put_output_data(tmp, out, block_size);
1214 if (rv != CRYPTO_SUCCESS) {
1215 goto out;
1216 }
1217 out->cd_offset += block_size;
1218 gcm_incr_counter_block(ctx);
1219 ctx->gcm_processed_data_len += block_size;
1220 datap += block_size;
1221 bleft -= block_size;
1222 }
1223 out:
1224 clear_fpu_regs();
1225 kfpu_end();
1226 out_nofpu:
1227 if (ct_buf != NULL) {
1228 vmem_free(ct_buf, chunk_size);
1229 }
1230 return (rv);
1231 }
1232
1233 /*
1234  * Finalize the encryption: zero pad, encrypt, hash and write out any
1235  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1236 */
1237 static int
1238 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1239 {
1240 uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1241 uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1242 uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1243 size_t rem_len = ctx->gcm_remainder_len;
1244 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1245 int aes_rounds = ((aes_key_t *)keysched)->nr;
1246 int rv;
1247
1248 ASSERT(block_size == GCM_BLOCK_LEN);
1249 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1250 B_FALSE);
1251
1252 if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1253 return (CRYPTO_DATA_LEN_RANGE);
1254 }
1255
1256 kfpu_begin();
1257 /* Pad last incomplete block with zeros, encrypt and hash. */
1258 if (rem_len > 0) {
1259 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1260 const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1261
1262 aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1263 memset(remainder + rem_len, 0, block_size - rem_len);
1264 for (int i = 0; i < rem_len; i++) {
1265 remainder[i] ^= tmp[i];
1266 }
1267 GHASH_AVX(ctx, remainder, block_size);
1268 ctx->gcm_processed_data_len += rem_len;
1269 /* No need to increment counter_block, it's the last block. */
1270 }
1271 /* Finish tag. */
1272 ctx->gcm_len_a_len_c[1] =
1273 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1274 GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1275 aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1276
1277 gcm_xor_avx((uint8_t *)J0, ghash);
1278 clear_fpu_regs();
1279 kfpu_end();
1280
1281 /* Output remainder. */
1282 if (rem_len > 0) {
1283 rv = crypto_put_output_data(remainder, out, rem_len);
1284 if (rv != CRYPTO_SUCCESS)
1285 return (rv);
1286 }
1287 out->cd_offset += rem_len;
1288 ctx->gcm_remainder_len = 0;
1289 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1290 if (rv != CRYPTO_SUCCESS)
1291 return (rv);
1292
1293 out->cd_offset += ctx->gcm_tag_len;
1294 return (CRYPTO_SUCCESS);
1295 }
1296
1297 /*
1298  * Finalize decryption: so far we have only accumulated the ciphertext,
1299  * so now we decrypt it here in place.
1300 */
1301 static int
1302 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1303 {
1304 ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1305 ASSERT3U(block_size, ==, 16);
1306 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1307 B_FALSE);
1308
1309 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1310 size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1311 uint8_t *datap = ctx->gcm_pt_buf;
1312 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1313 uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1314 uint64_t *ghash = ctx->gcm_ghash;
1315 uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1316 int rv = CRYPTO_SUCCESS;
1317 size_t bleft, done;
1318
1319 /*
1320 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1321  * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1322 * GCM_AVX_MIN_DECRYPT_BYTES.
1323 */
1324 for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1325 kfpu_begin();
1326 done = aesni_gcm_decrypt(datap, datap, chunk_size,
1327 (const void *)key, ctx->gcm_cb, ghash);
1328 clear_fpu_regs();
1329 kfpu_end();
1330 if (done != chunk_size) {
1331 return (CRYPTO_FAILED);
1332 }
1333 datap += done;
1334 }
1335 /* Decrypt remainder, which is less than chunk size, in one go. */
1336 kfpu_begin();
1337 if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1338 done = aesni_gcm_decrypt(datap, datap, bleft,
1339 (const void *)key, ctx->gcm_cb, ghash);
1340 if (done == 0) {
1341 clear_fpu_regs();
1342 kfpu_end();
1343 return (CRYPTO_FAILED);
1344 }
1345 datap += done;
1346 bleft -= done;
1347 }
1348 ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1349
1350 /*
1351 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1352 * decrypt them block by block.
1353 */
1354 while (bleft > 0) {
1355 /* Incomplete last block. */
1356 if (bleft < block_size) {
1357 uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1358
1359 memset(lastb, 0, block_size);
1360 memcpy(lastb, datap, bleft);
1361 /* The GCM processing. */
1362 GHASH_AVX(ctx, lastb, block_size);
1363 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1364 for (size_t i = 0; i < bleft; i++) {
1365 datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1366 }
1367 break;
1368 }
1369 /* The GCM processing. */
1370 GHASH_AVX(ctx, datap, block_size);
1371 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1372 gcm_xor_avx((uint8_t *)tmp, datap);
1373 gcm_incr_counter_block(ctx);
1374
1375 datap += block_size;
1376 bleft -= block_size;
1377 }
1378 if (rv != CRYPTO_SUCCESS) {
1379 clear_fpu_regs();
1380 kfpu_end();
1381 return (rv);
1382 }
1383 /* Decryption done, finish the tag. */
1384 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1385 GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1386 aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1387 (uint32_t *)ctx->gcm_J0);
1388
1389 gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1390
1391 /* We are done with the FPU, restore its state. */
1392 clear_fpu_regs();
1393 kfpu_end();
1394
1395 /* Compare the input authentication tag with what we calculated. */
1396 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1397 /* They don't match. */
1398 return (CRYPTO_INVALID_MAC);
1399 }
1400 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1401 if (rv != CRYPTO_SUCCESS) {
1402 return (rv);
1403 }
1404 out->cd_offset += pt_len;
1405 return (CRYPTO_SUCCESS);
1406 }
1407
1408 /*
1409  * Initialize the GCM params H, Htable and the counter block. Save the
1410 * initial counter block.
1411 */
1412 static int
1413 gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1414 const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1415 {
1416 uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1417 uint64_t *H = ctx->gcm_H;
1418 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1419 int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1420 const uint8_t *datap = auth_data;
1421 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1422 size_t bleft;
1423
1424 ASSERT(block_size == GCM_BLOCK_LEN);
1425 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1426 B_FALSE);
1427
1428 /* Init H (encrypt zero block) and create the initial counter block. */
1429 memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1430 memset(H, 0, sizeof (ctx->gcm_H));
1431 kfpu_begin();
1432 aes_encrypt_intel(keysched, aes_rounds,
1433 (const uint32_t *)H, (uint32_t *)H);
1434
1435 gcm_init_htab_avx(ctx->gcm_Htable, H);
1436
1437 if (iv_len == 12) {
1438 memcpy(cb, iv, 12);
1439 cb[12] = 0;
1440 cb[13] = 0;
1441 cb[14] = 0;
1442 cb[15] = 1;
1443 /* We need the ICB later. */
1444 memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1445 } else {
1446 /*
1447 * Most consumers use 12 byte IVs, so it's OK to use the
1448 * original routines for other IV sizes, just avoid nesting
1449 * kfpu_begin calls.
1450 */
1451 clear_fpu_regs();
1452 kfpu_end();
1453 gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1454 aes_copy_block, aes_xor_block);
1455 kfpu_begin();
1456 }
1457
1458 	/* OpenSSL post-increments the counter; adjust for that. */
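	/*
	 * In GCM, E_K(J0) is reserved for masking the tag, so the first data
	 * block uses the next counter value. The generic code path gets there
	 * by incrementing before each block; here a single pre-increment
	 * suffices because the OpenSSL routines increment after use.
	 */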
1459 gcm_incr_counter_block(ctx);
1460
1461 /* Ghash AAD in chunk_size blocks. */
1462 for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1463 GHASH_AVX(ctx, datap, chunk_size);
1464 datap += chunk_size;
1465 clear_fpu_regs();
1466 kfpu_end();
1467 kfpu_begin();
1468 }
1469 /* Ghash the remainder and handle possible incomplete GCM block. */
1470 if (bleft > 0) {
1471 size_t incomp = bleft % block_size;
1472
1473 bleft -= incomp;
1474 if (bleft > 0) {
1475 GHASH_AVX(ctx, datap, bleft);
1476 datap += bleft;
1477 }
1478 if (incomp > 0) {
1479 /* Zero pad and hash incomplete last block. */
1480 uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1481
1482 memset(authp, 0, block_size);
1483 memcpy(authp, datap, incomp);
1484 GHASH_AVX(ctx, authp, block_size);
1485 }
1486 }
1487 clear_fpu_regs();
1488 kfpu_end();
1489 return (CRYPTO_SUCCESS);
1490 }
1491
1492 #if defined(_KERNEL)
1493 static int
1494 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1495 {
1496 unsigned long val;
1497 char val_rounded[16];
1498 int error = 0;
1499
1500 error = kstrtoul(buf, 0, &val);
1501 if (error)
1502 return (error);
1503
1504 val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1505
1506 if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1507 return (-EINVAL);
1508
1509 snprintf(val_rounded, 16, "%u", (uint32_t)val);
1510 error = param_set_uint(val_rounded, kp);
1511 return (error);
1512 }
1513
1514 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1515 param_get_uint, &gcm_avx_chunk_size, 0644);
1516
1517 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1518 "How many bytes to process while owning the FPU");
1519
1520 #endif /* defined(_KERNEL) */
1521 #endif /* ifdef CAN_USE_GCM_ASM */
1522