xref: /titanic_52/usr/src/common/bignum/bignumimpl.c (revision bb1fad37c75defa7a6ae25f00c1d4b356713b734)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Configuration guide
28  * -------------------
29  *
30  * There are 4 preprocessor symbols used to configure the bignum
31  * implementation.  This file contains no logic to configure based on
32  * processor; we leave that to the Makefiles to specify.
33  *
34  * USE_FLOATING_POINT
35  *   Meaning: There is support for a fast floating-point implementation of
36  *   Montgomery multiply.
37  *
38  * PSR_MUL
39  *   Meaning: There are processor-specific versions of the low level
40  *   functions to implement big_mul.  Those functions are: big_mul_set_vec,
41  *   big_mul_add_vec, big_mul_vec, and big_sqr_vec.  PSR_MUL implies support
42  *   for all 4 functions.  You cannot pick and choose which subset of these
43  *   functions to support; that would lead to a rat's nest of #ifdefs.
44  *
45  * HWCAP
46  *   Meaning: Call multiply support functions through a function pointer.
47  *   On x86, there are multiple implementations for different hardware
48  *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
49  *   a function is first used.  So, the support functions are called through
50  *   a function pointer.  There is no need for that on Sparc, because there
51  *   is only one implementation; support functions are called directly.
52  *   Later, if there were some new VIS instruction, or something, and a
53  *   run-time test were needed, rather than variant kernel modules and
54  *   libraries, then HWCAP would be defined for Sparc, as well.
55  *
56  * UMUL64
57  *   Meaning: It is safe to use generic C code that assumes the existence
58  *   of a 32 x 32 --> 64 bit unsigned multiply.  If this is not defined,
59  *   then the generic code for big_mul_add_vec() must necessarily be very slow,
60  *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
61  *
62  */
63 
64 
65 #include <sys/types.h>
66 #include "bignum.h"
67 
68 #ifdef	_KERNEL
69 #include <sys/ddi.h>
70 #include <sys/mdesc.h>
71 #include <sys/crypto/common.h>
72 
73 #include <sys/kmem.h>
74 #include <sys/param.h>
75 #include <sys/sunddi.h>
76 
77 #else
78 #include <stdlib.h>
79 #include <stdio.h>
80 #include <assert.h>
81 #define	ASSERT	assert
82 #endif	/* _KERNEL */
83 
84 #ifdef __amd64
85 #ifdef _KERNEL
86 #include <sys/x86_archext.h>	/* cpuid_getvendor() */
87 #include <sys/cpuvar.h>
88 #else
89 #include <sys/auxv.h>		/* getisax() */
90 #endif  /* _KERNEL */
91 #endif  /* __amd64 */
92 
93 
94 #ifdef	_LP64 /* truncate 64-bit size_t to 32-bits */
95 #define	UI32(ui)	((uint32_t)ui)
96 #else /* size_t already 32-bits */
97 #define	UI32(ui)	(ui)
98 #endif
99 
100 
101 #ifdef	_KERNEL
102 #define	big_malloc(size)	kmem_alloc(size, KM_NOSLEEP)
103 #define	big_free(ptr, size)	kmem_free(ptr, size)
104 
105 /*
106  * big_realloc()
107  * Allocate memory of newsize bytes and copy oldsize bytes
108  * to the newly-allocated memory, then free the
109  * previously-allocated memory.
110  * Note: newsize must be > oldsize
111  */
112 void *
113 big_realloc(void *from, size_t oldsize, size_t newsize)
114 {
115 	void *rv;
116 
117 	rv = kmem_alloc(newsize, KM_NOSLEEP);
118 	if (rv != NULL)
119 		bcopy(from, rv, oldsize);
120 	kmem_free(from, oldsize);
121 	return (rv);
122 }
123 
124 #else	/* _KERNEL */
125 
126 #ifndef MALLOC_DEBUG
127 
128 #define	big_malloc(size)	malloc(size)
129 #define	big_free(ptr, size)	free(ptr)
130 
131 #else
132 
133 void
134 big_free(void *ptr, size_t size)
135 {
136 	printf("freed %d bytes at %p\n", size, ptr);
137 	free(ptr);
138 }
139 
140 void *
141 big_malloc(size_t size)
142 {
143 	void *rv;
144 	rv = malloc(size);
145 	printf("malloced %d bytes, addr:%p\n", size, rv);
146 	return (rv);
147 }
148 #endif /* MALLOC_DEBUG */
149 
150 #define	big_realloc(x, y, z) realloc((x), (z))
151 
152 
153 /*
154  * printbignum()
155  * Print a BIGNUM type to stdout.
156  */
157 void
158 printbignum(char *aname, BIGNUM *a)
159 {
160 	int i;
161 
162 	(void) printf("\n%s\n%d\n", aname, a->sign*a->len);
163 	for (i = a->len - 1; i >= 0; i--) {
164 #ifdef BIGNUM_CHUNK_32
165 		(void) printf("%08x ", a->value[i]);
166 		if (((i & (BITSINBYTE - 1)) == 0) && (i != 0)) {
167 			(void) printf("\n");
168 		}
169 #else
170 		(void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
171 		    (uint32_t)((a->value[i]) & 0xffffffff));
172 		if (((i & 3) == 0) && (i != 0)) { /* end of this chunk */
173 			(void) printf("\n");
174 		}
175 #endif
176 	}
177 	(void) printf("\n");
178 }
179 
180 #endif	/* _KERNEL */
181 
182 
183 #ifdef  __amd64
184 /*
185  * Return 1 if executing on Intel, otherwise 0 (e.g., AMD64).
186  * Cache the result, as the CPU can't change.
187  *
188  * Note: the userland version uses getisax() and checks for an AMD-64-only
189  * feature.  The kernel version uses cpuid_getvendor().
190  */
191 static int
192 bignum_on_intel(void)
193 {
194 	static int	cached_result = -1;
195 
196 	if (cached_result == -1) { /* first time */
197 #ifdef _KERNEL
198 		cached_result = (cpuid_getvendor(CPU) == X86_VENDOR_Intel);
199 #else
200 		uint_t  ui;
201 
202 		(void) getisax(&ui, 1);
203 		cached_result = ((ui & AV_386_AMD_MMX) == 0);
204 #endif  /* _KERNEL */
205 	}
206 
207 	return (cached_result);
208 }
209 #endif  /* __amd64 */
210 
211 
212 /*
213  * big_init()
214  * Initialize and allocate memory for a BIGNUM type.
215  *
216  * big_init(number, size) is equivalent to big_init1(number, size, NULL, 0)
217  *
218  * Note: call big_finish() to free memory allocated by big_init().
219  *
220  * Input:
221  * number	Uninitialized memory for BIGNUM
222  * size		Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
223  *
224  * Output:
225  * number	Initialized BIGNUM
226  *
227  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
228  */
229 BIG_ERR_CODE
230 big_init(BIGNUM *number, int size)
231 {
232 	number->value = big_malloc(BIGNUM_WORDSIZE * size);
233 	if (number->value == NULL) {
234 		return (BIG_NO_MEM);
235 	}
236 	number->size = size;
237 	number->len = 0;
238 	number->sign = 1;
239 	number->malloced = 1;
240 	return (BIG_OK);
241 }
242 
243 
244 /*
245  * big_init1()
246  * Initialize and, if needed, allocate memory for a BIGNUM type.
247  * Use the buffer passed, buf, if any, instad of allocating memory
248  * if it's at least "size" bytes.
249  *
250  * Note: call big_finish() to free memory allocated by big_init().
251  *
252  * Input:
253  * number	Uninitialized memory for BIGNUM
254  * size		Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
255  * buf		Buffer for storing a BIGNUM.
256  *		If NULL, big_init1() will allocate a buffer
257  * bufsize	Size, in BIG_CHUNK_SIZE_bit words, of buf
258  *
259  * Output:
260  * number	Initialized BIGNUM
261  *
262  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
263  */
264 BIG_ERR_CODE
265 big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
266 {
267 	if ((buf == NULL) || (size > bufsize)) {
268 		number->value = big_malloc(BIGNUM_WORDSIZE * size);
269 		if (number->value == NULL) {
270 			return (BIG_NO_MEM);
271 		}
272 		number->size = size;
273 		number->malloced = 1;
274 	} else {
275 		number->value = buf;
276 		number->size = bufsize;
277 		number->malloced = 0;
278 	}
279 	number->len = 0;
280 	number->sign = 1;
281 
282 	return (BIG_OK);
283 }
284 
285 
286 /*
287  * big_finish()
288  * Free memory, if any, allocated by big_init() or big_init1().
289  */
290 void
291 big_finish(BIGNUM *number)
292 {
293 	if (number->malloced == 1) {
294 		big_free(number->value, BIGNUM_WORDSIZE * number->size);
295 		number->malloced = 0;
296 	}
297 }
298 
299 
300 /*
301  * bn->size should be at least
302  * (len + BIGNUM_WORDSIZE - 1) / BIGNUM_WORDSIZE bytes
303  * converts from byte-big-endian format to bignum format (words in
304  * little endian order, but bytes within the words big endian)
305  */
306 void
307 bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
308 {
309 	int		i, j;
310 	uint32_t	offs;
311 	const uint32_t	slen = UI32(len);
312 	BIG_CHUNK_TYPE	word;
313 	uchar_t		*knwordp;
314 
315 	if (slen == 0) {
316 		bn->len = 1;
317 		bn->value[0] = 0;
318 		return;
319 	}
320 
321 	offs = slen % BIGNUM_WORDSIZE;
322 	bn->len = slen / BIGNUM_WORDSIZE;
323 
324 	for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
325 		knwordp = &(kn[slen - BIGNUM_WORDSIZE * (i + 1)]);
326 		word = knwordp[0];
327 		for (j = 1; j < BIGNUM_WORDSIZE; j++) {
328 			word = (word << BITSINBYTE) + knwordp[j];
329 		}
330 		bn->value[i] = word;
331 	}
332 	if (offs > 0) {
333 		word = kn[0];
334 		for (i = 1; i < offs; i++) word = (word << BITSINBYTE) + kn[i];
335 		bn->value[bn->len++] = word;
336 	}
337 	while ((bn->len > 1) && (bn->value[bn->len - 1] == 0)) {
338 		bn->len --;
339 	}
340 }
341 
342 
343 /*
344  * copies the least significant len bytes if
345  * len < bn->len * BIGNUM_WORDSIZE
346  * converts from bignum format to byte-big-endian format.
347  * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
348  */
349 void
350 bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
351 {
352 	int		i, j;
353 	uint32_t	offs;
354 	const uint32_t	slen = UI32(len);
355 	BIG_CHUNK_TYPE	word;
356 
357 	if (len < BIGNUM_WORDSIZE * bn->len) {
358 		for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
359 			word = bn->value[i];
360 			for (j = 0; j < BIGNUM_WORDSIZE; j++) {
361 				kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
362 				    word & 0xff;
363 				word = word >> BITSINBYTE;
364 			}
365 		}
366 		offs = slen % BIGNUM_WORDSIZE;
367 		if (offs > 0) {
368 			word = bn->value[slen / BIGNUM_WORDSIZE];
369 			for (i =  slen % BIGNUM_WORDSIZE; i > 0; i --) {
370 				kn[i - 1] = word & 0xff;
371 				word = word >> BITSINBYTE;
372 			}
373 		}
374 	} else {
375 		for (i = 0; i < bn->len; i++) {
376 			word = bn->value[i];
377 			for (j = 0; j < BIGNUM_WORDSIZE; j++) {
378 				kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
379 				    word & 0xff;
380 				word = word >> BITSINBYTE;
381 			}
382 		}
383 		for (i = 0; i < slen - BIGNUM_WORDSIZE * bn->len; i++) {
384 			kn[i] = 0;
385 		}
386 	}
387 }
388 
389 
390 int
391 big_bitlength(BIGNUM *a)
392 {
393 	int		l = 0, b = 0;
394 	BIG_CHUNK_TYPE	c;
395 
396 	l = a->len - 1;
397 	while ((l > 0) && (a->value[l] == 0)) {
398 		l--;
399 	}
400 	b = BIG_CHUNK_SIZE;
401 	c = a->value[l];
402 	while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
403 		c = c << 1;
404 		b--;
405 	}
406 
407 	return (l * BIG_CHUNK_SIZE + b);
408 }
409 
410 
411 /*
412  * big_copy()
413  * Copy BIGNUM src to dest, allocating memory if needed.
414  */
415 BIG_ERR_CODE
416 big_copy(BIGNUM *dest, BIGNUM *src)
417 {
418 	BIG_CHUNK_TYPE	*newptr;
419 	int		i, len;
420 
421 	len = src->len;
422 	while ((len > 1) && (src->value[len - 1] == 0)) {
423 		len--;
424 	}
425 	src->len = len;
426 	if (dest->size < len) {
427 		if (dest->malloced == 1) {
428 			newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
429 			    BIGNUM_WORDSIZE * dest->size,
430 			    BIGNUM_WORDSIZE * len);
431 		} else {
432 			newptr = (BIG_CHUNK_TYPE *)
433 			    big_malloc(BIGNUM_WORDSIZE * len);
434 			if (newptr != NULL) {
435 				dest->malloced = 1;
436 			}
437 		}
438 		if (newptr == NULL) {
439 			return (BIG_NO_MEM);
440 		}
441 		dest->value = newptr;
442 		dest->size = len;
443 	}
444 	dest->len = len;
445 	dest->sign = src->sign;
446 	for (i = 0; i < len; i++) {
447 		dest->value[i] = src->value[i];
448 	}
449 
450 	return (BIG_OK);
451 }
452 
453 
454 /*
455  * big_extend()
456  * Allocate memory to extend BIGNUM number to size bignum chunks,
457  * if not at least that size already.
458  */
459 BIG_ERR_CODE
460 big_extend(BIGNUM *number, int size)
461 {
462 	BIG_CHUNK_TYPE	*newptr;
463 	int		i;
464 
465 	if (number->size >= size)
466 		return (BIG_OK);
467 	if (number->malloced) {
468 		number->value = big_realloc(number->value,
469 		    BIGNUM_WORDSIZE * number->size,
470 		    BIGNUM_WORDSIZE * size);
471 	} else {
472 		newptr = big_malloc(BIGNUM_WORDSIZE * size);
473 		if (newptr != NULL) {
474 			for (i = 0; i < number->size; i++) {
475 				newptr[i] = number->value[i];
476 			}
477 		}
478 		number->value = newptr;
479 	}
480 
481 	if (number->value == NULL) {
482 		return (BIG_NO_MEM);
483 	}
484 
485 	number->size = size;
486 	number->malloced = 1;
487 	return (BIG_OK);
488 }
489 
490 
491 /* returns 1 if n == 0 */
492 int
493 big_is_zero(BIGNUM *n)
494 {
495 	int	i, result;
496 
497 	result = 1;
498 	for (i = 0; i < n->len; i++) {
499 		if (n->value[i] != 0) {
500 			result = 0;
501 		}
502 	}
503 	return (result);
504 }
505 
506 
507 BIG_ERR_CODE
508 big_add_abs(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
509 {
510 	int		i, shorter, longer;
511 	BIG_CHUNK_TYPE	cy, ai;
512 	BIG_CHUNK_TYPE	*r, *a, *b, *c;
513 	BIG_ERR_CODE	err;
514 	BIGNUM		*longerarg;
515 
516 	if (aa->len > bb->len) {
517 		shorter = bb->len;
518 		longer = aa->len;
519 		longerarg = aa;
520 	} else {
521 		shorter = aa->len;
522 		longer = bb->len;
523 		longerarg = bb;
524 	}
525 	if (result->size < longer + 1) {
526 		err = big_extend(result, longer + 1);
527 		if (err != BIG_OK) {
528 			return (err);
529 		}
530 	}
531 
532 	r = result->value;
533 	a = aa->value;
534 	b = bb->value;
535 	c = longerarg->value;
536 	cy = 0;
537 	for (i = 0; i < shorter; i++) {
538 		ai = a[i];
539 		r[i] = ai + b[i] + cy;
540 		if (r[i] > ai) {
541 			cy = 0;
542 		} else if (r[i] < ai) {
543 			cy = 1;
544 		}
545 	}
546 	for (; i < longer; i++) {
547 		ai = c[i];
548 		r[i] = ai + cy;
549 		if (r[i] >= ai) {
550 			cy = 0;
551 		}
552 	}
553 	if (cy == 1) {
554 		r[i] = cy;
555 		result->len = longer + 1;
556 	} else {
557 		result->len = longer;
558 	}
559 	result->sign = 1;
560 	return (BIG_OK);
561 }
562 
563 
564 /* caller must make sure that result has at least len words allocated */
565 void
566 big_sub_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, BIG_CHUNK_TYPE *b, int len)
567 {
568 	int		i;
569 	BIG_CHUNK_TYPE	cy, ai;
570 
571 	cy = 1;
572 	for (i = 0; i < len; i++) {
573 		ai = a[i];
574 		r[i] = ai + (~b[i]) + cy;
575 		if (r[i] > ai) {
576 			cy = 0;
577 		} else if (r[i] < ai) {
578 			cy = 1;
579 		}
580 	}
581 }
582 
583 
584 /* result=aa-bb  it is assumed that aa>=bb */
585 BIG_ERR_CODE
586 big_sub_pos(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
587 {
588 	int		i, shorter;
589 	BIG_CHUNK_TYPE	cy = 1, ai;
590 	BIG_CHUNK_TYPE	*r, *a, *b;
591 	BIG_ERR_CODE	err = BIG_OK;
592 
593 	if (aa->len > bb->len) {
594 		shorter = bb->len;
595 	} else {
596 		shorter = aa->len;
597 	}
598 	if (result->size < aa->len) {
599 		err = big_extend(result, aa->len);
600 		if (err != BIG_OK) {
601 			return (err);
602 		}
603 	}
604 
605 	r = result->value;
606 	a = aa->value;
607 	b = bb->value;
608 	result->len = aa->len;
609 	cy = 1;
610 	for (i = 0; i < shorter; i++) {
611 		ai = a[i];
612 		r[i] = ai + (~b[i]) + cy;
613 		if (r[i] > ai) {
614 			cy = 0;
615 		} else if (r[i] < ai) {
616 			cy = 1;
617 		}
618 	}
619 	for (; i < aa->len; i++) {
620 		ai = a[i];
621 		r[i] = ai + (~0) + cy;
622 		if (r[i] < ai) {
623 			cy = 1;
624 		}
625 	}
626 	result->sign = 1;
627 
628 	if (cy == 0) {
629 		return (BIG_INVALID_ARGS);
630 	} else {
631 		return (BIG_OK);
632 	}
633 }
634 
635 
636 /* returns -1 if |aa|<|bb|, 0 if |aa|==|bb| 1 if |aa|>|bb| */
637 int
638 big_cmp_abs(BIGNUM *aa, BIGNUM *bb)
639 {
640 	int	i;
641 
642 	if (aa->len > bb->len) {
643 		for (i = aa->len - 1; i > bb->len - 1; i--) {
644 			if (aa->value[i] > 0) {
645 				return (1);
646 			}
647 		}
648 	} else if (aa->len < bb->len) {
649 		for (i = bb->len - 1; i > aa->len - 1; i--) {
650 			if (bb->value[i] > 0) {
651 				return (-1);
652 			}
653 		}
654 	} else {
655 		i = aa->len - 1;
656 	}
657 	for (; i >= 0; i--) {
658 		if (aa->value[i] > bb->value[i]) {
659 			return (1);
660 		} else if (aa->value[i] < bb->value[i]) {
661 			return (-1);
662 		}
663 	}
664 
665 	return (0);
666 }
667 
668 
669 BIG_ERR_CODE
670 big_sub(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
671 {
672 	BIG_ERR_CODE	err;
673 
674 	if ((bb->sign == -1) && (aa->sign == 1)) {
675 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
676 			return (err);
677 		}
678 		result->sign = 1;
679 	} else if ((aa->sign == -1) && (bb->sign == 1)) {
680 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
681 			return (err);
682 		}
683 		result->sign = -1;
684 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
685 		if (big_cmp_abs(aa, bb) >= 0) {
686 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
687 				return (err);
688 			}
689 			result->sign = 1;
690 		} else {
691 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
692 				return (err);
693 			}
694 			result->sign = -1;
695 		}
696 	} else {
697 		if (big_cmp_abs(aa, bb) >= 0) {
698 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
699 				return (err);
700 			}
701 			result->sign = -1;
702 		} else {
703 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
704 				return (err);
705 			}
706 			result->sign = 1;
707 		}
708 	}
709 	return (BIG_OK);
710 }
711 
712 
713 BIG_ERR_CODE
714 big_add(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
715 {
716 	BIG_ERR_CODE	err;
717 
718 	if ((bb->sign == -1) && (aa->sign == -1)) {
719 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
720 			return (err);
721 		}
722 		result->sign = -1;
723 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
724 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
725 			return (err);
726 		}
727 		result->sign = 1;
728 	} else if ((aa->sign == 1) && (bb->sign == -1)) {
729 		if (big_cmp_abs(aa, bb) >= 0) {
730 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
731 				return (err);
732 			}
733 			result->sign = 1;
734 		} else {
735 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
736 				return (err);
737 			}
738 			result->sign = -1;
739 		}
740 	} else {
741 		if (big_cmp_abs(aa, bb) >= 0) {
742 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
743 				return (err);
744 			}
745 			result->sign = -1;
746 		} else {
747 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
748 				return (err);
749 			}
750 			result->sign = 1;
751 		}
752 	}
753 	return (BIG_OK);
754 }
755 
756 
757 /* result = aa/2 */
758 BIG_ERR_CODE
759 big_half_pos(BIGNUM *result, BIGNUM *aa)
760 {
761 	BIG_ERR_CODE	err;
762 	int		i;
763 	BIG_CHUNK_TYPE	cy, cy1;
764 	BIG_CHUNK_TYPE	*a, *r;
765 
766 	if (result->size < aa->len) {
767 		err = big_extend(result, aa->len);
768 		if (err != BIG_OK) {
769 			return (err);
770 		}
771 	}
772 
773 	result->len = aa->len;
774 	a = aa->value;
775 	r = result->value;
776 	cy = 0;
777 	for (i = aa->len - 1; i >= 0; i--) {
778 		cy1 = a[i] << (BIG_CHUNK_SIZE - 1);
779 		r[i] = (cy | (a[i] >> 1));
780 		cy = cy1;
781 	}
782 	if (r[result->len - 1] == 0) {
783 		result->len--;
784 	}
785 
786 	return (BIG_OK);
787 }
788 
789 /* result  =  aa*2 */
790 BIG_ERR_CODE
791 big_double(BIGNUM *result, BIGNUM *aa)
792 {
793 	BIG_ERR_CODE	err;
794 	int		i, rsize;
795 	BIG_CHUNK_TYPE	cy, cy1;
796 	BIG_CHUNK_TYPE	*a, *r;
797 
798 	if ((aa->len > 0) &&
799 	    ((aa->value[aa->len - 1] & BIG_CHUNK_HIGHBIT) != 0)) {
800 		rsize = aa->len + 1;
801 	} else {
802 		rsize = aa->len;
803 	}
804 
805 	if (result->size < rsize) {
806 		err = big_extend(result, rsize);
807 		if (err != BIG_OK)
808 			return (err);
809 	}
810 
811 	a = aa->value;
812 	r = result->value;
813 	if (rsize == aa->len + 1) {
814 		r[rsize - 1] = 1;
815 	}
816 	cy = 0;
817 	for (i = 0; i < aa->len; i++) {
818 		cy1 = a[i] >> (BIG_CHUNK_SIZE - 1);
819 		r[i] = (cy | (a[i] << 1));
820 		cy = cy1;
821 	}
822 	result->len = rsize;
823 	return (BIG_OK);
824 }
825 
826 
827 /*
828  * returns aa mod b, aa must be nonneg, b must be a max
829  * (BIG_CHUNK_SIZE / 2)-bit integer
830  */
831 static uint32_t
832 big_modhalf_pos(BIGNUM *aa, uint32_t b)
833 {
834 	int		i;
835 	BIG_CHUNK_TYPE	rem;
836 
837 	if (aa->len == 0) {
838 		return (0);
839 	}
840 	rem = aa->value[aa->len - 1] % b;
841 	for (i = aa->len - 2; i >= 0; i--) {
842 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
843 		    (aa->value[i] >> (BIG_CHUNK_SIZE / 2))) % b;
844 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
845 		    (aa->value[i] & BIG_CHUNK_LOWHALFBITS)) % b;
846 	}
847 
848 	return ((uint32_t)rem);
849 }
850 
851 
852 /*
853  * result = aa - (2^BIG_CHUNK_SIZE)^lendiff * bb
854  * result->size should be at least aa->len at entry
855  * aa, bb, and result should be positive
856  */
857 void
858 big_sub_pos_high(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
859 {
860 	int i, lendiff;
861 	BIGNUM res1, aa1;
862 
863 	lendiff = aa->len - bb->len;
864 	res1.size = result->size - lendiff;
865 	res1.malloced = 0;
866 	res1.value = result->value + lendiff;
867 	aa1.size = aa->size - lendiff;
868 	aa1.value = aa->value + lendiff;
869 	aa1.len = bb->len;
870 	aa1.sign = 1;
871 	(void) big_sub_pos(&res1, &aa1, bb);
872 	if (result->value != aa->value) {
873 		for (i = 0; i < lendiff; i++) {
874 			result->value[i] = aa->value[i];
875 		}
876 	}
877 	result->len = aa->len;
878 }
879 
880 
881 /*
882  * returns 1, 0, or -1 depending on whether |aa| > , ==, or <
883  *					(2^BIG_CHUNK_SIZE)^lendiff * |bb|
884  * aa->len should be >= bb->len
885  */
886 int
887 big_cmp_abs_high(BIGNUM *aa, BIGNUM *bb)
888 {
889 	int		lendiff;
890 	BIGNUM		aa1;
891 
892 	lendiff = aa->len - bb->len;
893 	aa1.len = bb->len;
894 	aa1.size = aa->size - lendiff;
895 	aa1.malloced = 0;
896 	aa1.value = aa->value + lendiff;
897 	return (big_cmp_abs(&aa1, bb));
898 }
899 
900 
901 /*
902  * result = aa * b where b is a max. (BIG_CHUNK_SIZE / 2)-bit positive integer.
903  * result should have enough space allocated.
904  */
905 static void
906 big_mulhalf_low(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
907 {
908 	int		i;
909 	BIG_CHUNK_TYPE	t1, t2, ai, cy;
910 	BIG_CHUNK_TYPE	*a, *r;
911 
912 	a = aa->value;
913 	r = result->value;
914 	cy = 0;
915 	for (i = 0; i < aa->len; i++) {
916 		ai = a[i];
917 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
918 		t2 = (ai >> (BIG_CHUNK_SIZE / 2)) * b +
919 		    (t1 >> (BIG_CHUNK_SIZE / 2));
920 		r[i] = (t1 & BIG_CHUNK_LOWHALFBITS) |
921 		    (t2 << (BIG_CHUNK_SIZE / 2));
922 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
923 	}
924 	r[i] = cy;
925 	result->len = aa->len + 1;
926 	result->sign = aa->sign;
927 }
928 
929 
930 /*
931  * result = aa * b * 2^(BIG_CHUNK_SIZE / 2) where b is a max.
932  * (BIG_CHUNK_SIZE / 2)-bit positive integer.
933  * result should have enough space allocated.
934  */
935 static void
936 big_mulhalf_high(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
937 {
938 	int		i;
939 	BIG_CHUNK_TYPE	t1, t2, ai, cy, ri;
940 	BIG_CHUNK_TYPE	*a, *r;
941 
942 	a = aa->value;
943 	r = result->value;
944 	cy = 0;
945 	ri = 0;
946 	for (i = 0; i < aa->len; i++) {
947 		ai = a[i];
948 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
949 		t2 = (ai >>  (BIG_CHUNK_SIZE / 2)) * b +
950 		    (t1 >>  (BIG_CHUNK_SIZE / 2));
951 		r[i] = (t1 <<  (BIG_CHUNK_SIZE / 2)) + ri;
952 		ri = t2 & BIG_CHUNK_LOWHALFBITS;
953 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
954 	}
955 	r[i] = (cy <<  (BIG_CHUNK_SIZE / 2)) + ri;
956 	result->len = aa->len + 1;
957 	result->sign = aa->sign;
958 }
959 
960 
961 /* it is assumed that result->size is big enough */
962 void
963 big_shiftleft(BIGNUM *result, BIGNUM *aa, int offs)
964 {
965 	int		i;
966 	BIG_CHUNK_TYPE	cy, ai;
967 
968 	if (offs == 0) {
969 		if (result != aa) {
970 			(void) big_copy(result, aa);
971 		}
972 		return;
973 	}
974 	cy = 0;
975 	for (i = 0; i < aa->len; i++) {
976 		ai = aa->value[i];
977 		result->value[i] = (ai << offs) | cy;
978 		cy = ai >> (BIG_CHUNK_SIZE - offs);
979 	}
980 	if (cy != 0) {
981 		result->len = aa->len + 1;
982 		result->value[result->len - 1] = cy;
983 	} else {
984 		result->len = aa->len;
985 	}
986 	result->sign = aa->sign;
987 }
988 
989 
990 /* it is assumed that result->size is big enough */
991 void
992 big_shiftright(BIGNUM *result, BIGNUM *aa, int offs)
993 {
994 	int		 i;
995 	BIG_CHUNK_TYPE	cy, ai;
996 
997 	if (offs == 0) {
998 		if (result != aa) {
999 			(void) big_copy(result, aa);
1000 		}
1001 		return;
1002 	}
1003 	cy = aa->value[0] >> offs;
1004 	for (i = 1; i < aa->len; i++) {
1005 		ai = aa->value[i];
1006 		result->value[i - 1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
1007 		cy = ai >> offs;
1008 	}
1009 	result->len = aa->len;
1010 	result->value[result->len - 1] = cy;
1011 	result->sign = aa->sign;
1012 }
1013 
1014 
1015 /*
1016  * result = aa/bb   remainder = aa mod bb
1017  * it is assumed that aa and bb are positive
1018  */
1019 BIG_ERR_CODE
1020 big_div_pos(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
1021 {
1022 	BIG_ERR_CODE	err = BIG_OK;
1023 	int		i, alen, blen, tlen, rlen, offs;
1024 	BIG_CHUNK_TYPE	higha, highb, coeff;
1025 	BIG_CHUNK_TYPE	*a, *b;
1026 	BIGNUM		bbhigh, bblow, tresult, tmp1, tmp2;
1027 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1028 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
1029 	BIG_CHUNK_TYPE	tresultvalue[BIGTMPSIZE];
1030 	BIG_CHUNK_TYPE	bblowvalue[BIGTMPSIZE];
1031 	BIG_CHUNK_TYPE	bbhighvalue[BIGTMPSIZE];
1032 
1033 	a = aa->value;
1034 	b = bb->value;
1035 	alen = aa->len;
1036 	blen = bb->len;
1037 	while ((alen > 1) && (a[alen - 1] == 0)) {
1038 		alen = alen - 1;
1039 	}
1040 	aa->len = alen;
1041 	while ((blen > 1) && (b[blen - 1] == 0)) {
1042 		blen = blen - 1;
1043 	}
1044 	bb->len = blen;
1045 	if ((blen == 1) && (b[0] == 0)) {
1046 		return (BIG_DIV_BY_0);
1047 	}
1048 
1049 	if (big_cmp_abs(aa, bb) < 0) {
1050 		if ((remainder != NULL) &&
1051 		    ((err = big_copy(remainder, aa)) != BIG_OK)) {
1052 			return (err);
1053 		}
1054 		if (result != NULL) {
1055 			result->len = 1;
1056 			result->sign = 1;
1057 			result->value[0] = 0;
1058 		}
1059 		return (BIG_OK);
1060 	}
1061 
1062 	if ((err = big_init1(&bblow, blen + 1,
1063 	    bblowvalue, arraysize(bblowvalue))) != BIG_OK)
1064 		return (err);
1065 
1066 	if ((err = big_init1(&bbhigh, blen + 1,
1067 	    bbhighvalue, arraysize(bbhighvalue))) != BIG_OK)
1068 		goto ret1;
1069 
1070 	if ((err = big_init1(&tmp1, alen + 2,
1071 	    tmp1value, arraysize(tmp1value))) != BIG_OK)
1072 		goto ret2;
1073 
1074 	if ((err = big_init1(&tmp2, blen + 2,
1075 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
1076 		goto ret3;
1077 
1078 	if ((err = big_init1(&tresult, alen - blen + 2,
1079 	    tresultvalue, arraysize(tresultvalue))) != BIG_OK)
1080 		goto ret4;
1081 
1082 	offs = 0;
1083 	highb = b[blen - 1];
1084 	if (highb >= (BIG_CHUNK_HALF_HIGHBIT << 1)) {
1085 		highb = highb >> (BIG_CHUNK_SIZE / 2);
1086 		offs = (BIG_CHUNK_SIZE / 2);
1087 	}
1088 	while ((highb & BIG_CHUNK_HALF_HIGHBIT) == 0) {
1089 		highb = highb << 1;
1090 		offs++;
1091 	}
1092 
1093 	big_shiftleft(&bblow, bb, offs);
1094 
1095 	if (offs <= (BIG_CHUNK_SIZE / 2 - 1)) {
1096 		big_shiftleft(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1097 	} else {
1098 		big_shiftright(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1099 	}
1100 	if (bbhigh.value[bbhigh.len - 1] == 0) {
1101 		bbhigh.len--;
1102 	} else {
1103 		bbhigh.value[bbhigh.len] = 0;
1104 	}
1105 
1106 	highb = bblow.value[bblow.len - 1];
1107 
1108 	big_shiftleft(&tmp1, aa, offs);
1109 	rlen = tmp1.len - bblow.len + 1;
1110 	tresult.len = rlen;
1111 
1112 	tmp1.len++;
1113 	tlen = tmp1.len;
1114 	tmp1.value[tmp1.len - 1] = 0;
1115 	for (i = 0; i < rlen; i++) {
1116 		higha = (tmp1.value[tlen - 1] << (BIG_CHUNK_SIZE / 2)) +
1117 		    (tmp1.value[tlen - 2] >> (BIG_CHUNK_SIZE / 2));
1118 		coeff = higha / (highb + 1);
1119 		big_mulhalf_high(&tmp2, &bblow, coeff);
1120 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1121 		bbhigh.len++;
1122 		while (tmp1.value[tlen - 1] > 0) {
1123 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1124 			coeff++;
1125 		}
1126 		bbhigh.len--;
1127 		tlen--;
1128 		tmp1.len--;
1129 		while (big_cmp_abs_high(&tmp1, &bbhigh) >= 0) {
1130 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1131 			coeff++;
1132 		}
1133 		tresult.value[rlen - i - 1] = coeff << (BIG_CHUNK_SIZE / 2);
1134 		higha = tmp1.value[tlen - 1];
1135 		coeff = higha / (highb + 1);
1136 		big_mulhalf_low(&tmp2, &bblow, coeff);
1137 		tmp2.len--;
1138 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1139 		while (big_cmp_abs_high(&tmp1, &bblow) >= 0) {
1140 			big_sub_pos_high(&tmp1, &tmp1, &bblow);
1141 			coeff++;
1142 		}
1143 		tresult.value[rlen - i - 1] =
1144 		    tresult.value[rlen - i - 1] + coeff;
1145 	}
1146 
1147 	big_shiftright(&tmp1, &tmp1, offs);
1148 
1149 	err = BIG_OK;
1150 
1151 	if ((remainder != NULL) &&
1152 	    ((err = big_copy(remainder, &tmp1)) != BIG_OK))
1153 		goto ret;
1154 
1155 	if (result != NULL)
1156 		err = big_copy(result, &tresult);
1157 
1158 ret:
1159 	big_finish(&tresult);
1160 ret4:
1161 	big_finish(&tmp1);
1162 ret3:
1163 	big_finish(&tmp2);
1164 ret2:
1165 	big_finish(&bbhigh);
1166 ret1:
1167 	big_finish(&bblow);
1168 	return (err);
1169 }
1170 
1171 
1172 /*
1173  * If there is no processor-specific integer implementation of
1174  * the lower level multiply functions, then this code is provided
1175  * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
1176  * big_sqr_vec().
1177  *
1178  * There are two generic implementations.  One that assumes that
1179  * there is hardware and C compiler support for a 32 x 32 --> 64
1180  * bit unsigned multiply, but otherwise is not specific to any
1181  * processor, platform, or ISA.
1182  *
1183  * The other makes very few assumptions about hardware capabilities.
1184  * It does not even assume that there is any implementation of a
1185  * 32 x 32 --> 64 bit multiply that is accessible to C code and
1186  * appropriate to use.  It falls constructs 32 x 32 --> 64 bit
1187  * multiplies from 16 x 16 --> 32 bit multiplies.
1188  *
1189  */
1190 
1191 #if !defined(PSR_MUL)
1192 
1193 #ifdef UMUL64
1194 
1195 #if (BIG_CHUNK_SIZE == 32)
1196 
1197 #define	UNROLL8
1198 
1199 #define	MUL_SET_VEC_ROUND_PREFETCH(R) \
1200 	p = pf * d; \
1201 	pf = (uint64_t)a[R + 1]; \
1202 	t = p + cy; \
1203 	r[R] = (uint32_t)t; \
1204 	cy = t >> 32
1205 
1206 #define	MUL_SET_VEC_ROUND_NOPREFETCH(R) \
1207 	p = pf * d; \
1208 	t = p + cy; \
1209 	r[R] = (uint32_t)t; \
1210 	cy = t >> 32
1211 
1212 #define	MUL_ADD_VEC_ROUND_PREFETCH(R) \
1213 	t = (uint64_t)r[R]; \
1214 	p = pf * d; \
1215 	pf = (uint64_t)a[R + 1]; \
1216 	t = p + t + cy; \
1217 	r[R] = (uint32_t)t; \
1218 	cy = t >> 32
1219 
1220 #define	MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
1221 	t = (uint64_t)r[R]; \
1222 	p = pf * d; \
1223 	t = p + t + cy; \
1224 	r[R] = (uint32_t)t; \
1225 	cy = t >> 32
1226 
1227 #ifdef UNROLL8
1228 
1229 #define	UNROLL 8
1230 
1231 /*
1232  * r = a * b
1233  * where r and a are vectors; b is a single 32-bit digit
1234  */
1235 
1236 uint32_t
1237 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1238 {
1239 	uint64_t d, pf, p, t, cy;
1240 
1241 	if (len == 0)
1242 		return (0);
1243 	cy = 0;
1244 	d = (uint64_t)b;
1245 	pf = (uint64_t)a[0];
1246 	while (len > UNROLL) {
1247 		MUL_SET_VEC_ROUND_PREFETCH(0);
1248 		MUL_SET_VEC_ROUND_PREFETCH(1);
1249 		MUL_SET_VEC_ROUND_PREFETCH(2);
1250 		MUL_SET_VEC_ROUND_PREFETCH(3);
1251 		MUL_SET_VEC_ROUND_PREFETCH(4);
1252 		MUL_SET_VEC_ROUND_PREFETCH(5);
1253 		MUL_SET_VEC_ROUND_PREFETCH(6);
1254 		MUL_SET_VEC_ROUND_PREFETCH(7);
1255 		r += UNROLL;
1256 		a += UNROLL;
1257 		len -= UNROLL;
1258 	}
1259 	if (len == UNROLL) {
1260 		MUL_SET_VEC_ROUND_PREFETCH(0);
1261 		MUL_SET_VEC_ROUND_PREFETCH(1);
1262 		MUL_SET_VEC_ROUND_PREFETCH(2);
1263 		MUL_SET_VEC_ROUND_PREFETCH(3);
1264 		MUL_SET_VEC_ROUND_PREFETCH(4);
1265 		MUL_SET_VEC_ROUND_PREFETCH(5);
1266 		MUL_SET_VEC_ROUND_PREFETCH(6);
1267 		MUL_SET_VEC_ROUND_NOPREFETCH(7);
1268 		return ((uint32_t)cy);
1269 	}
1270 	while (len > 1) {
1271 		MUL_SET_VEC_ROUND_PREFETCH(0);
1272 		++r;
1273 		++a;
1274 		--len;
1275 	}
1276 	if (len > 0) {
1277 		MUL_SET_VEC_ROUND_NOPREFETCH(0);
1278 	}
1279 	return ((uint32_t)cy);
1280 }
1281 
1282 /*
1283  * r += a * b
1284  * where r and a are vectors; b is a single 32-bit digit
1285  */
1286 
1287 uint32_t
1288 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1289 {
1290 	uint64_t d, pf, p, t, cy;
1291 
1292 	if (len == 0)
1293 		return (0);
1294 	cy = 0;
1295 	d = (uint64_t)b;
1296 	pf = (uint64_t)a[0];
1297 	while (len > 8) {
1298 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1299 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1300 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1301 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1302 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1303 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1304 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1305 		MUL_ADD_VEC_ROUND_PREFETCH(7);
1306 		r += 8;
1307 		a += 8;
1308 		len -= 8;
1309 	}
1310 	if (len == 8) {
1311 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1312 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1313 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1314 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1315 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1316 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1317 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1318 		MUL_ADD_VEC_ROUND_NOPREFETCH(7);
1319 		return ((uint32_t)cy);
1320 	}
1321 	while (len > 1) {
1322 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1323 		++r;
1324 		++a;
1325 		--len;
1326 	}
1327 	if (len > 0) {
1328 		MUL_ADD_VEC_ROUND_NOPREFETCH(0);
1329 	}
1330 	return ((uint32_t)cy);
1331 }
1332 #endif /* UNROLL8 */
1333 
1334 void
1335 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1336 {
1337 	uint32_t	*tr, *ta;
1338 	int		tlen, row, col;
1339 	uint64_t	p, s, t, t2, cy;
1340 	uint32_t	d;
1341 
1342 	tr = r + 1;
1343 	ta = a;
1344 	tlen = len - 1;
1345 	tr[tlen] = big_mul_set_vec(tr, ta + 1, tlen, ta[0]);
1346 	while (--tlen > 0) {
1347 		tr += 2;
1348 		++ta;
1349 		tr[tlen] = big_mul_add_vec(tr, ta + 1, tlen, ta[0]);
1350 	}
1351 	s = (uint64_t)a[0];
1352 	s = s * s;
1353 	r[0] = (uint32_t)s;
1354 	cy = s >> 32;
1355 	p = ((uint64_t)r[1] << 1) + cy;
1356 	r[1] = (uint32_t)p;
1357 	cy = p >> 32;
1358 	row = 1;
1359 	col = 2;
1360 	while (row < len) {
1361 		s = (uint64_t)a[row];
1362 		s = s * s;
1363 		p = (uint64_t)r[col] << 1;
1364 		t = p + s;
1365 		d = (uint32_t)t;
1366 		t2 = (uint64_t)d + cy;
1367 		r[col] = (uint32_t)t2;
1368 		cy = (t >> 32) + (t2 >> 32);
1369 		if (row == len - 1)
1370 			break;
1371 		p = ((uint64_t)r[col + 1] << 1) + cy;
1372 		r[col + 1] = (uint32_t)p;
1373 		cy = p >> 32;
1374 		++row;
1375 		col += 2;
1376 	}
1377 	r[col + 1] = (uint32_t)cy;
1378 }
1379 
1380 #else /* BIG_CHUNK_SIZE == 64 */
1381 
1382 /*
1383  * r = r + a * digit, r and a are vectors of length len
1384  * returns the carry digit
1385  */
1386 BIG_CHUNK_TYPE
1387 big_mul_add_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1388     BIG_CHUNK_TYPE digit)
1389 {
1390 	BIG_CHUNK_TYPE	cy, cy1, retcy, dlow, dhigh;
1391 	int		i;
1392 
1393 	cy1 = 0;
1394 	dlow = digit & BIG_CHUNK_LOWHALFBITS;
1395 	dhigh = digit >> (BIG_CHUNK_SIZE / 2);
1396 	for (i = 0; i < len; i++) {
1397 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1398 		    dlow * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1399 		    (r[i] & BIG_CHUNK_LOWHALFBITS);
1400 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1401 		    dlow * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1402 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1403 		r[i] = (cy & BIG_CHUNK_LOWHALFBITS) |
1404 		    (cy1 << (BIG_CHUNK_SIZE / 2));
1405 	}
1406 	retcy = cy1 >> (BIG_CHUNK_SIZE / 2);
1407 
1408 	cy1 = r[0] & BIG_CHUNK_LOWHALFBITS;
1409 	for (i = 0; i < len - 1; i++) {
1410 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1411 		    dhigh * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1412 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1413 		r[i] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1414 		    (cy << (BIG_CHUNK_SIZE / 2));
1415 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1416 		    dhigh * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1417 		    (r[i + 1] & BIG_CHUNK_LOWHALFBITS);
1418 	}
1419 	cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1420 	    dhigh * (a[len - 1] & BIG_CHUNK_LOWHALFBITS) +
1421 	    (r[len - 1] >> (BIG_CHUNK_SIZE / 2));
1422 	r[len - 1] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1423 	    (cy << (BIG_CHUNK_SIZE / 2));
1424 	retcy = (cy >> (BIG_CHUNK_SIZE / 2)) +
1425 	    dhigh * (a[len - 1] >> (BIG_CHUNK_SIZE / 2)) + retcy;
1426 
1427 	return (retcy);
1428 }
1429 
1430 
1431 /*
1432  * r = a * digit, r and a are vectors of length len
1433  * returns the carry digit
1434  */
1435 BIG_CHUNK_TYPE
1436 big_mul_set_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1437     BIG_CHUNK_TYPE digit)
1438 {
1439 	int	i;
1440 
1441 	ASSERT(r != a);
1442 	for (i = 0; i < len; i++) {
1443 		r[i] = 0;
1444 	}
1445 	return (big_mul_add_vec(r, a, len, digit));
1446 }
1447 
1448 void
1449 big_sqr_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len)
1450 {
1451 	int i;
1452 
1453 	ASSERT(r != a);
1454 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1455 	for (i = 1; i < len; ++i)
1456 		r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1457 }
1458 
1459 #endif /* BIG_CHUNK_SIZE == 32/64 */
1460 
1461 
1462 #else /* ! UMUL64 */
1463 
1464 #if (BIG_CHUNK_SIZE != 32)
1465 #error Don't use 64-bit chunks without defining UMUL64
1466 #endif
1467 
1468 
1469 /*
1470  * r = r + a * digit, r and a are vectors of length len
1471  * returns the carry digit
1472  */
1473 uint32_t
1474 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1475 {
1476 	uint32_t cy, cy1, retcy, dlow, dhigh;
1477 	int i;
1478 
1479 	cy1 = 0;
1480 	dlow = digit & 0xffff;
1481 	dhigh = digit >> 16;
1482 	for (i = 0; i < len; i++) {
1483 		cy = (cy1 >> 16) + dlow * (a[i] & 0xffff) + (r[i] & 0xffff);
1484 		cy1 = (cy >> 16) + dlow * (a[i]>>16) + (r[i] >> 16);
1485 		r[i] = (cy & 0xffff) | (cy1 << 16);
1486 	}
1487 	retcy = cy1 >> 16;
1488 
1489 	cy1 = r[0] & 0xffff;
1490 	for (i = 0; i < len - 1; i++) {
1491 		cy = (cy1 >> 16) + dhigh * (a[i] & 0xffff) + (r[i] >> 16);
1492 		r[i] = (cy1 & 0xffff) | (cy << 16);
1493 		cy1 = (cy >> 16) + dhigh * (a[i] >> 16) + (r[i + 1] & 0xffff);
1494 	}
1495 	cy = (cy1 >> 16) + dhigh * (a[len - 1] & 0xffff) + (r[len - 1] >> 16);
1496 	r[len - 1] = (cy1 & 0xffff) | (cy << 16);
1497 	retcy = (cy >> 16) + dhigh * (a[len - 1] >> 16) + retcy;
1498 
1499 	return (retcy);
1500 }
1501 
1502 
1503 /*
1504  * r = a * digit, r and a are vectors of length len
1505  * returns the carry digit
1506  */
1507 uint32_t
1508 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1509 {
1510 	int	i;
1511 
1512 	ASSERT(r != a);
1513 	for (i = 0; i < len; i++) {
1514 		r[i] = 0;
1515 	}
1516 
1517 	return (big_mul_add_vec(r, a, len, digit));
1518 }
1519 
1520 void
1521 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1522 {
1523 	int i;
1524 
1525 	ASSERT(r != a);
1526 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1527 	for (i = 1; i < len; ++i)
1528 		r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1529 }
1530 
1531 #endif /* UMUL64 */
1532 
1533 void
1534 big_mul_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int alen,
1535     BIG_CHUNK_TYPE *b, int blen)
1536 {
1537 	int i;
1538 
1539 	r[alen] = big_mul_set_vec(r, a, alen, b[0]);
1540 	for (i = 1; i < blen; ++i)
1541 		r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
1542 }
1543 
1544 
1545 #endif /* ! PSR_MUL */
1546 
1547 
1548 /*
1549  * result = aa * bb  result->value should be big enough to hold the result
1550  *
1551  * Implementation: Standard grammar school algorithm
1552  *
1553  */
1554 BIG_ERR_CODE
1555 big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
1556 {
1557 	BIGNUM		tmp1;
1558 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1559 	BIG_CHUNK_TYPE	*r, *t, *a, *b;
1560 	BIG_ERR_CODE	err;
1561 	int		i, alen, blen, rsize, sign, diff;
1562 
1563 	if (aa == bb) {
1564 		diff = 0;
1565 	} else {
1566 		diff = big_cmp_abs(aa, bb);
1567 		if (diff < 0) {
1568 			BIGNUM *tt;
1569 			tt = aa;
1570 			aa = bb;
1571 			bb = tt;
1572 		}
1573 	}
1574 	a = aa->value;
1575 	b = bb->value;
1576 	alen = aa->len;
1577 	blen = bb->len;
1578 	while ((alen > 1) && (a[alen - 1] == 0)) {
1579 		alen--;
1580 	}
1581 	aa->len = alen;
1582 	while ((blen > 1) && (b[blen - 1] == 0)) {
1583 		blen--;
1584 	}
1585 	bb->len = blen;
1586 
1587 	rsize = alen + blen;
1588 	ASSERT(rsize > 0);
1589 	if (result->size < rsize) {
1590 		err = big_extend(result, rsize);
1591 		if (err != BIG_OK) {
1592 			return (err);
1593 		}
1594 		/* aa or bb might be an alias to result */
1595 		a = aa->value;
1596 		b = bb->value;
1597 	}
1598 	r = result->value;
1599 
1600 	if (((alen == 1) && (a[0] == 0)) || ((blen == 1) && (b[0] == 0))) {
1601 		result->len = 1;
1602 		result->sign = 1;
1603 		r[0] = 0;
1604 		return (BIG_OK);
1605 	}
1606 	sign = aa->sign * bb->sign;
1607 	if ((alen == 1) && (a[0] == 1)) {
1608 		for (i = 0; i < blen; i++) {
1609 			r[i] = b[i];
1610 		}
1611 		result->len = blen;
1612 		result->sign = sign;
1613 		return (BIG_OK);
1614 	}
1615 	if ((blen == 1) && (b[0] == 1)) {
1616 		for (i = 0; i < alen; i++) {
1617 			r[i] = a[i];
1618 		}
1619 		result->len = alen;
1620 		result->sign = sign;
1621 		return (BIG_OK);
1622 	}
1623 
1624 	if ((err = big_init1(&tmp1, rsize,
1625 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1626 		return (err);
1627 	}
1628 	t = tmp1.value;
1629 
1630 	for (i = 0; i < rsize; i++) {
1631 		t[i] = 0;
1632 	}
1633 
1634 	if (diff == 0 && alen > 2) {
1635 		BIG_SQR_VEC(t, a, alen);
1636 	} else if (blen > 0) {
1637 		BIG_MUL_VEC(t, a, alen, b, blen);
1638 	}
1639 
1640 	if (t[rsize - 1] == 0) {
1641 		tmp1.len = rsize - 1;
1642 	} else {
1643 		tmp1.len = rsize;
1644 	}
1645 
1646 	err = big_copy(result, &tmp1);
1647 
1648 	result->sign = sign;
1649 
1650 	big_finish(&tmp1);
1651 
1652 	return (err);
1653 }
1654 
1655 
1656 /*
1657  * big_mont_mul()
1658  * Montgomery multiplication.
1659  *
1660  * Caller must ensure that  a < n,  b < n,  ret->size >=  2 * n->len + 1,
1661  * and that ret is not n.
1662  */
1663 BIG_ERR_CODE
1664 big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
1665 {
1666 	int		i, j, nlen, needsubtract;
1667 	BIG_CHUNK_TYPE	*nn, *rr, *rrplusi;
1668 	BIG_CHUNK_TYPE	digit, c;
1669 	BIG_ERR_CODE	err;
1670 #ifdef	__amd64
1671 #define	BIG_CPU_UNKNOWN	0
1672 #define	BIG_CPU_AMD	1
1673 #define	BIG_CPU_INTEL	2
1674 	static int	big_cpu = BIG_CPU_UNKNOWN;
1675 	BIG_CHUNK_TYPE	carry[BIGTMPSIZE];
1676 
1677 	if (big_cpu == BIG_CPU_UNKNOWN) {
1678 		big_cpu = 1 + bignum_on_intel();
1679 	}
1680 #endif	/* __amd64 */
1681 
1682 	nlen = n->len;
1683 	nn = n->value;
1684 
1685 	rr = ret->value;
1686 
1687 	if ((err = big_mul(ret, a, b)) != BIG_OK) {
1688 		return (err);
1689 	}
1690 
1691 	rr = ret->value;
1692 	for (i = ret->len; i < 2 * nlen + 1; i++) {
1693 		rr[i] = 0;
1694 	}
1695 
1696 #ifdef	__amd64	/* pipelining optimization for Intel 64, but not AMD64 */
1697 	if ((big_cpu == BIG_CPU_INTEL) && (nlen <= BIGTMPSIZE)) {
1698 		/*
1699 		 * Perform the following in two for loops to reduce the
1700 		 * dependency between computing the carryover bits with
1701 		 * BIG_MUL_ADD_VEC() and adding them, thus improving pipelining.
1702 		 */
1703 		for (i = 0; i < nlen; i++) {
1704 			rrplusi = rr + i;
1705 			digit = *rrplusi * n0;
1706 			carry[i] = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
1707 		}
1708 		for (i = 0; i < nlen; i++) {
1709 			j = i + nlen;
1710 			rr[j] += carry[i];
1711 			while (rr[j] < carry[i]) {
1712 				rr[++j] += 1;
1713 				carry[i] = 1;
1714 			}
1715 		}
1716 	} else
1717 #endif	/* __amd64 */
1718 	{ /* no pipelining optimization */
1719 		for (i = 0; i < nlen; i++) {
1720 			rrplusi = rr + i;
1721 			digit = *rrplusi * n0;
1722 			c = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
1723 			j = i + nlen;
1724 			rr[j] += c;
1725 			while (rr[j] < c) {
1726 				rr[++j] += 1;
1727 				c = 1;
1728 			}
1729 		}
1730 	}
1731 
1732 	needsubtract = 0;
1733 	if ((rr[2 * nlen]  != 0))
1734 		needsubtract = 1;
1735 	else {
1736 		for (i = 2 * nlen - 1; i >= nlen; i--) {
1737 			if (rr[i] > nn[i - nlen]) {
1738 				needsubtract = 1;
1739 				break;
1740 			} else if (rr[i] < nn[i - nlen]) {
1741 				break;
1742 			}
1743 		}
1744 	}
1745 	if (needsubtract)
1746 		big_sub_vec(rr, rr + nlen, nn, nlen);
1747 	else {
1748 		for (i = 0; i < nlen; i++) {
1749 			rr[i] = rr[i + nlen];
1750 		}
1751 	}
1752 
1753 	/* Remove leading zeros, but keep at least 1 digit: */
1754 	for (i = nlen - 1; (i > 0) && (rr[i] == 0); i--)
1755 		;
1756 	ret->len = i + 1;
1757 
1758 	return (BIG_OK);
1759 }
1760 
1761 
1762 BIG_CHUNK_TYPE
1763 big_n0(BIG_CHUNK_TYPE n)
1764 {
1765 	int		i;
1766 	BIG_CHUNK_TYPE	result, tmp;
1767 
1768 	result = 0;
1769 	tmp = BIG_CHUNK_ALLBITS;
1770 	for (i = 0; i < BIG_CHUNK_SIZE; i++) {
1771 		if ((tmp & 1) == 1) {
1772 			result = (result >> 1) | BIG_CHUNK_HIGHBIT;
1773 			tmp = tmp - n;
1774 		} else {
1775 			result = (result >> 1);
1776 		}
1777 		tmp = tmp >> 1;
1778 	}
1779 
1780 	return (result);
1781 }
1782 
1783 
1784 int
1785 big_numbits(BIGNUM *n)
1786 {
1787 	int		i, j;
1788 	BIG_CHUNK_TYPE	t;
1789 
1790 	for (i = n->len - 1; i > 0; i--) {
1791 		if (n->value[i] != 0) {
1792 			break;
1793 		}
1794 	}
1795 	t = n->value[i];
1796 	for (j = BIG_CHUNK_SIZE; j > 0; j--) {
1797 		if ((t & BIG_CHUNK_HIGHBIT) == 0) {
1798 			t = t << 1;
1799 		} else {
1800 			return (BIG_CHUNK_SIZE * i + j);
1801 		}
1802 	}
1803 	return (0);
1804 }
1805 
1806 
1807 /* caller must make sure that a < n */
1808 BIG_ERR_CODE
1809 big_mont_rr(BIGNUM *result, BIGNUM *n)
1810 {
1811 	BIGNUM		rr;
1812 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1813 	int		len, i;
1814 	BIG_ERR_CODE	err;
1815 
1816 	rr.malloced = 0;
1817 	len = n->len;
1818 
1819 	if ((err = big_init1(&rr, 2 * len + 1,
1820 	    rrvalue, arraysize(rrvalue))) != BIG_OK) {
1821 		return (err);
1822 	}
1823 
1824 	for (i = 0; i < 2 * len; i++) {
1825 		rr.value[i] = 0;
1826 	}
1827 	rr.value[2 * len] = 1;
1828 	rr.len = 2 * len + 1;
1829 	if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1830 		goto ret;
1831 	}
1832 	err = big_copy(result, &rr);
1833 ret:
1834 	big_finish(&rr);
1835 	return (err);
1836 }
1837 
1838 
1839 /* caller must make sure that a < n */
1840 BIG_ERR_CODE
1841 big_mont_conv(BIGNUM *result, BIGNUM *a, BIGNUM *n, BIG_CHUNK_TYPE n0,
1842     BIGNUM *n_rr)
1843 {
1844 	BIGNUM		rr;
1845 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1846 	int		len, i;
1847 	BIG_ERR_CODE	err;
1848 
1849 	rr.malloced = 0;
1850 	len = n->len;
1851 
1852 	if ((err = big_init1(&rr, 2 * len + 1, rrvalue, arraysize(rrvalue)))
1853 	    != BIG_OK) {
1854 		return (err);
1855 	}
1856 
1857 	if (n_rr == NULL) {
1858 		for (i = 0; i < 2 * len; i++) {
1859 			rr.value[i] = 0;
1860 		}
1861 		rr.value[2 * len] = 1;
1862 		rr.len = 2 * len + 1;
1863 		if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1864 			goto ret;
1865 		}
1866 		n_rr = &rr;
1867 	}
1868 
1869 	if ((err = big_mont_mul(&rr, n_rr, a, n, n0)) != BIG_OK) {
1870 		goto ret;
1871 	}
1872 	err = big_copy(result, &rr);
1873 
1874 ret:
1875 	big_finish(&rr);
1876 	return (err);
1877 }
1878 
1879 
1880 #ifdef	USE_FLOATING_POINT
1881 #define	big_modexp_ncp_float	big_modexp_ncp_sw
1882 #else
1883 #define	big_modexp_ncp_int	big_modexp_ncp_sw
1884 #endif
1885 
1886 #define	MAX_EXP_BIT_GROUP_SIZE 6
1887 #define	APOWERS_MAX_SIZE (1 << (MAX_EXP_BIT_GROUP_SIZE - 1))
1888 
1889 /* ARGSUSED */
1890 static BIG_ERR_CODE
1891 big_modexp_ncp_int(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
1892     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
1893 
1894 {
1895 	BIGNUM		apowers[APOWERS_MAX_SIZE];
1896 	BIGNUM		tmp1;
1897 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1898 	int		i, j, k, l, m, p;
1899 	uint32_t	bit, bitind, bitcount, groupbits, apowerssize;
1900 	uint32_t	nbits;
1901 	BIG_ERR_CODE	err;
1902 
1903 	nbits = big_numbits(e);
1904 	if (nbits < 50) {
1905 		groupbits = 1;
1906 		apowerssize = 1;
1907 	} else {
1908 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
1909 		apowerssize = 1 << (groupbits - 1);
1910 	}
1911 
1912 
1913 	if ((err = big_init1(&tmp1, 2 * n->len + 1,
1914 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1915 		return (err);
1916 	}
1917 
1918 	/* clear the malloced bit to help cleanup */
1919 	for (i = 0; i < apowerssize; i++) {
1920 		apowers[i].malloced = 0;
1921 	}
1922 
1923 	for (i = 0; i < apowerssize; i++) {
1924 		if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
1925 		    BIG_OK) {
1926 			goto ret;
1927 		}
1928 	}
1929 
1930 	(void) big_copy(&(apowers[0]), ma);
1931 
1932 	if ((err = big_mont_mul(&tmp1, ma, ma, n, n0)) != BIG_OK) {
1933 		goto ret;
1934 	}
1935 	(void) big_copy(ma, &tmp1);
1936 
1937 	for (i = 1; i < apowerssize; i++) {
1938 		if ((err = big_mont_mul(&tmp1, ma,
1939 		    &(apowers[i - 1]), n, n0)) != BIG_OK) {
1940 			goto ret;
1941 		}
1942 		(void) big_copy(&apowers[i], &tmp1);
1943 	}
1944 
1945 	bitind = nbits % BIG_CHUNK_SIZE;
1946 	k = 0;
1947 	l = 0;
1948 	p = 0;
1949 	bitcount = 0;
1950 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
1951 		for (j = bitind - 1; j >= 0; j--) {
1952 			bit = (e->value[i] >> j) & 1;
1953 			if ((bitcount == 0) && (bit == 0)) {
1954 				if ((err = big_mont_mul(tmp,
1955 				    tmp, tmp, n, n0)) != BIG_OK) {
1956 					goto ret;
1957 				}
1958 			} else {
1959 				bitcount++;
1960 				p = p * 2 + bit;
1961 				if (bit == 1) {
1962 					k = k + l + 1;
1963 					l = 0;
1964 				} else {
1965 					l++;
1966 				}
1967 				if (bitcount == groupbits) {
1968 					for (m = 0; m < k; m++) {
1969 						if ((err = big_mont_mul(tmp,
1970 						    tmp, tmp, n, n0)) !=
1971 						    BIG_OK) {
1972 							goto ret;
1973 						}
1974 					}
1975 					if ((err = big_mont_mul(tmp, tmp,
1976 					    &(apowers[p >> (l + 1)]),
1977 					    n, n0)) != BIG_OK) {
1978 						goto ret;
1979 					}
1980 					for (m = 0; m < l; m++) {
1981 						if ((err = big_mont_mul(tmp,
1982 						    tmp, tmp, n, n0)) !=
1983 						    BIG_OK) {
1984 							goto ret;
1985 						}
1986 					}
1987 					k = 0;
1988 					l = 0;
1989 					p = 0;
1990 					bitcount = 0;
1991 				}
1992 			}
1993 		}
1994 		bitind = BIG_CHUNK_SIZE;
1995 	}
1996 
1997 	for (m = 0; m < k; m++) {
1998 		if ((err = big_mont_mul(tmp, tmp, tmp, n, n0)) != BIG_OK) {
1999 			goto ret;
2000 		}
2001 	}
2002 	if (p != 0) {
2003 		if ((err = big_mont_mul(tmp, tmp,
2004 		    &(apowers[p >> (l + 1)]), n, n0)) != BIG_OK) {
2005 			goto ret;
2006 		}
2007 	}
2008 	for (m = 0; m < l; m++) {
2009 		if ((err = big_mont_mul(result, tmp, tmp, n, n0)) != BIG_OK) {
2010 			goto ret;
2011 		}
2012 	}
2013 
2014 ret:
2015 	for (i = apowerssize - 1; i >= 0; i--) {
2016 		big_finish(&(apowers[i]));
2017 	}
2018 	big_finish(&tmp1);
2019 
2020 	return (err);
2021 }
2022 
2023 
2024 #ifdef USE_FLOATING_POINT
2025 
2026 #ifdef _KERNEL
2027 
2028 #include <sys/sysmacros.h>
2029 #include <sys/regset.h>
2030 #include <sys/fpu/fpusystm.h>
2031 
2032 /* the alignment for block stores to save fp registers */
2033 #define	FPR_ALIGN	(64)
2034 
2035 extern void big_savefp(kfpu_t *);
2036 extern void big_restorefp(kfpu_t *);
2037 
2038 #endif /* _KERNEL */
2039 
2040 /*
2041  * This version makes use of floating point for performance
2042  */
2043 static BIG_ERR_CODE
2044 big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
2045     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
2046 {
2047 
2048 	int		i, j, k, l, m, p;
2049 	uint32_t	bit, bitind, bitcount, nlen;
2050 	double		dn0;
2051 	double		*dn, *dt, *d16r, *d32r;
2052 	uint32_t	*nint, *prod;
2053 	double		*apowers[APOWERS_MAX_SIZE];
2054 	uint32_t	nbits, groupbits, apowerssize;
2055 	BIG_ERR_CODE	err = BIG_OK;
2056 
2057 #ifdef _KERNEL
2058 	uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
2059 	kfpu_t *fpu;
2060 
2061 #ifdef DEBUG
2062 	if (!fpu_exists)
2063 		return (BIG_GENERAL_ERR);
2064 #endif
2065 
2066 	fpu =  (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, FPR_ALIGN);
2067 	big_savefp(fpu);
2068 
2069 #endif /* _KERNEL */
2070 
2071 	nbits = big_numbits(e);
2072 	if (nbits < 50) {
2073 		groupbits = 1;
2074 		apowerssize = 1;
2075 	} else {
2076 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
2077 		apowerssize = 1 << (groupbits - 1);
2078 	}
2079 
2080 	nlen = (BIG_CHUNK_SIZE / 32) * n->len;
2081 	dn0 = (double)(n0 & 0xffff);
2082 
2083 	dn = dt = d16r = d32r = NULL;
2084 	nint = prod = NULL;
2085 	for (i = 0; i < apowerssize; i++) {
2086 		apowers[i] = NULL;
2087 	}
2088 
2089 	if ((dn = big_malloc(nlen * sizeof (double))) == NULL) {
2090 		err = BIG_NO_MEM;
2091 		goto ret;
2092 	}
2093 	if ((dt = big_malloc((4 * nlen + 2) * sizeof (double))) == NULL) {
2094 		err = BIG_NO_MEM;
2095 		goto ret;
2096 	}
2097 	if ((nint = big_malloc(nlen * sizeof (uint32_t))) == NULL) {
2098 		err = BIG_NO_MEM;
2099 		goto ret;
2100 	}
2101 	if ((prod = big_malloc((nlen + 1) * sizeof (uint32_t))) == NULL) {
2102 		err = BIG_NO_MEM;
2103 		goto ret;
2104 	}
2105 	if ((d16r = big_malloc((2 * nlen + 1) * sizeof (double))) == NULL) {
2106 		err = BIG_NO_MEM;
2107 		goto ret;
2108 	}
2109 	if ((d32r = big_malloc(nlen * sizeof (double))) == NULL) {
2110 		err = BIG_NO_MEM;
2111 		goto ret;
2112 	}
2113 	for (i = 0; i < apowerssize; i++) {
2114 		if ((apowers[i] = big_malloc((2 * nlen + 1) *
2115 		    sizeof (double))) == NULL) {
2116 			err = BIG_NO_MEM;
2117 			goto ret;
2118 		}
2119 	}
2120 
2121 #if (BIG_CHUNK_SIZE == 32)
2122 	for (i = 0; i < ma->len; i++) {
2123 		nint[i] = ma->value[i];
2124 	}
2125 	for (; i < nlen; i++) {
2126 		nint[i] = 0;
2127 	}
2128 #else
2129 	for (i = 0; i < ma->len; i++) {
2130 		nint[2 * i] = (uint32_t)(ma->value[i] & 0xffffffffULL);
2131 		nint[2 * i + 1] = (uint32_t)(ma->value[i] >> 32);
2132 	}
2133 	for (i = ma->len * 2; i < nlen; i++) {
2134 		nint[i] = 0;
2135 	}
2136 #endif
2137 	conv_i32_to_d32_and_d16(d32r, apowers[0], nint, nlen);
2138 
2139 #if (BIG_CHUNK_SIZE == 32)
2140 	for (i = 0; i < n->len; i++) {
2141 		nint[i] = n->value[i];
2142 	}
2143 	for (; i < nlen; i++) {
2144 		nint[i] = 0;
2145 	}
2146 #else
2147 	for (i = 0; i < n->len; i++) {
2148 		nint[2 * i] = (uint32_t)(n->value[i] & 0xffffffffULL);
2149 		nint[2 * i + 1] = (uint32_t)(n->value[i] >> 32);
2150 	}
2151 	for (i = n->len * 2; i < nlen; i++) {
2152 		nint[i] = 0;
2153 	}
2154 #endif
2155 	conv_i32_to_d32(dn, nint, nlen);
2156 
2157 	mont_mulf_noconv(prod, d32r, apowers[0], dt, dn, nint, nlen, dn0);
2158 	conv_i32_to_d32(d32r, prod, nlen);
2159 	for (i = 1; i < apowerssize; i++) {
2160 		mont_mulf_noconv(prod, d32r, apowers[i - 1],
2161 		    dt, dn, nint, nlen, dn0);
2162 		conv_i32_to_d16(apowers[i], prod, nlen);
2163 	}
2164 
2165 #if (BIG_CHUNK_SIZE == 32)
2166 	for (i = 0; i < tmp->len; i++) {
2167 		prod[i] = tmp->value[i];
2168 	}
2169 	for (; i < nlen + 1; i++) {
2170 		prod[i] = 0;
2171 	}
2172 #else
2173 	for (i = 0; i < tmp->len; i++) {
2174 		prod[2 * i] = (uint32_t)(tmp->value[i] & 0xffffffffULL);
2175 		prod[2 * i + 1] = (uint32_t)(tmp->value[i] >> 32);
2176 	}
2177 	for (i = tmp->len * 2; i < nlen + 1; i++) {
2178 		prod[i] = 0;
2179 	}
2180 #endif
2181 
2182 	bitind = nbits % BIG_CHUNK_SIZE;
2183 	k = 0;
2184 	l = 0;
2185 	p = 0;
2186 	bitcount = 0;
2187 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
2188 		for (j = bitind - 1; j >= 0; j--) {
2189 			bit = (e->value[i] >> j) & 1;
2190 			if ((bitcount == 0) && (bit == 0)) {
2191 				conv_i32_to_d32_and_d16(d32r, d16r,
2192 				    prod, nlen);
2193 				mont_mulf_noconv(prod, d32r, d16r,
2194 				    dt, dn, nint, nlen, dn0);
2195 			} else {
2196 				bitcount++;
2197 				p = p * 2 + bit;
2198 				if (bit == 1) {
2199 					k = k + l + 1;
2200 					l = 0;
2201 				} else {
2202 					l++;
2203 				}
2204 				if (bitcount == groupbits) {
2205 					for (m = 0; m < k; m++) {
2206 						conv_i32_to_d32_and_d16(d32r,
2207 						    d16r, prod, nlen);
2208 						mont_mulf_noconv(prod, d32r,
2209 						    d16r, dt, dn, nint,
2210 						    nlen, dn0);
2211 					}
2212 					conv_i32_to_d32(d32r, prod, nlen);
2213 					mont_mulf_noconv(prod, d32r,
2214 					    apowers[p >> (l + 1)],
2215 					    dt, dn, nint, nlen, dn0);
2216 					for (m = 0; m < l; m++) {
2217 						conv_i32_to_d32_and_d16(d32r,
2218 						    d16r, prod, nlen);
2219 						mont_mulf_noconv(prod, d32r,
2220 						    d16r, dt, dn, nint,
2221 						    nlen, dn0);
2222 					}
2223 					k = 0;
2224 					l = 0;
2225 					p = 0;
2226 					bitcount = 0;
2227 				}
2228 			}
2229 		}
2230 		bitind = BIG_CHUNK_SIZE;
2231 	}
2232 
2233 	for (m = 0; m < k; m++) {
2234 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2235 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2236 	}
2237 	if (p != 0) {
2238 		conv_i32_to_d32(d32r, prod, nlen);
2239 		mont_mulf_noconv(prod, d32r, apowers[p >> (l + 1)],
2240 		    dt, dn, nint, nlen, dn0);
2241 	}
2242 	for (m = 0; m < l; m++) {
2243 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2244 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2245 	}
2246 
2247 #if (BIG_CHUNK_SIZE == 32)
2248 	for (i = 0; i < nlen; i++) {
2249 		result->value[i] = prod[i];
2250 	}
2251 	for (i = nlen - 1; (i > 0) && (prod[i] == 0); i--)
2252 		;
2253 #else
2254 	for (i = 0; i < nlen / 2; i++) {
2255 		result->value[i] = (uint64_t)(prod[2 * i]) +
2256 		    (((uint64_t)(prod[2 * i + 1])) << 32);
2257 	}
2258 	for (i = nlen / 2 - 1; (i > 0) && (result->value[i] == 0); i--)
2259 		;
2260 #endif
2261 	result->len = i + 1;
2262 
2263 ret:
2264 	for (i = apowerssize - 1; i >= 0; i--) {
2265 		if (apowers[i] != NULL)
2266 			big_free(apowers[i], (2 * nlen + 1) * sizeof (double));
2267 	}
2268 	if (d32r != NULL) {
2269 		big_free(d32r, nlen * sizeof (double));
2270 	}
2271 	if (d16r != NULL) {
2272 		big_free(d16r, (2 * nlen + 1) * sizeof (double));
2273 	}
2274 	if (prod != NULL) {
2275 		big_free(prod, (nlen + 1) * sizeof (uint32_t));
2276 	}
2277 	if (nint != NULL) {
2278 		big_free(nint, nlen * sizeof (uint32_t));
2279 	}
2280 	if (dt != NULL) {
2281 		big_free(dt, (4 * nlen + 2) * sizeof (double));
2282 	}
2283 	if (dn != NULL) {
2284 		big_free(dn, nlen * sizeof (double));
2285 	}
2286 
2287 #ifdef _KERNEL
2288 	big_restorefp(fpu);
2289 #endif
2290 
2291 	return (err);
2292 }
2293 
2294 #endif /* USE_FLOATING_POINT */
2295 
2296 
2297 BIG_ERR_CODE
2298 big_modexp_ext(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr,
2299     big_modexp_ncp_info_t *info)
2300 {
2301 	BIGNUM		ma, tmp, rr;
2302 	BIG_CHUNK_TYPE	mavalue[BIGTMPSIZE];
2303 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2304 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
2305 	BIG_ERR_CODE	err;
2306 	BIG_CHUNK_TYPE	n0;
2307 
2308 	if ((err = big_init1(&ma, n->len, mavalue, arraysize(mavalue)))	!=
2309 	    BIG_OK) {
2310 		return (err);
2311 	}
2312 	ma.len = 1;
2313 	ma.value[0] = 0;
2314 
2315 	if ((err = big_init1(&tmp, 2 * n->len + 1,
2316 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2317 		goto ret1;
2318 	}
2319 
2320 	/* clear the malloced bit to help cleanup */
2321 	rr.malloced = 0;
2322 
2323 	if (n_rr == NULL) {
2324 		if ((err = big_init1(&rr, 2 * n->len + 1,
2325 		    rrvalue, arraysize(rrvalue))) != BIG_OK) {
2326 			goto ret2;
2327 		}
2328 		if (big_mont_rr(&rr, n) != BIG_OK) {
2329 			goto ret;
2330 		}
2331 		n_rr = &rr;
2332 	}
2333 
2334 	n0 = big_n0(n->value[0]);
2335 
2336 	if (big_cmp_abs(a, n) > 0) {
2337 		if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) {
2338 			goto ret;
2339 		}
2340 		err = big_mont_conv(&ma, &ma, n, n0, n_rr);
2341 	} else {
2342 		err = big_mont_conv(&ma, a, n, n0, n_rr);
2343 	}
2344 	if (err != BIG_OK) {
2345 		goto ret;
2346 	}
2347 
2348 	tmp.len = 1;
2349 	tmp.value[0] = 1;
2350 	if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) {
2351 		goto ret;
2352 	}
2353 
2354 	if ((info != NULL) && (info->func != NULL)) {
2355 		err = (*(info->func))(&tmp, &ma, e, n, &tmp, n0,
2356 		    info->ncp, info->reqp);
2357 	} else {
2358 		err = big_modexp_ncp_sw(&tmp, &ma, e, n, &tmp, n0);
2359 	}
2360 	if (err != BIG_OK) {
2361 		goto ret;
2362 	}
2363 
2364 	ma.value[0] = 1;
2365 	ma.len = 1;
2366 	if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) {
2367 		goto ret;
2368 	}
2369 	err = big_copy(result, &tmp);
2370 
2371 ret:
2372 	if (rr.malloced) {
2373 		big_finish(&rr);
2374 	}
2375 ret2:
2376 	big_finish(&tmp);
2377 ret1:
2378 	big_finish(&ma);
2379 
2380 	return (err);
2381 }
2382 
2383 BIG_ERR_CODE
2384 big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr)
2385 {
2386 	return (big_modexp_ext(result, a, e, n, n_rr, NULL));
2387 }
2388 
2389 
2390 BIG_ERR_CODE
2391 big_modexp_crt_ext(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2392     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2393     BIGNUM *p_rr, BIGNUM *q_rr, big_modexp_ncp_info_t *info)
2394 {
2395 	BIGNUM		ap, aq, tmp;
2396 	int		alen, biglen, sign;
2397 	BIG_ERR_CODE	err;
2398 
2399 	if (p->len > q->len) {
2400 		biglen = p->len;
2401 	} else {
2402 		biglen = q->len;
2403 	}
2404 
2405 	if ((err = big_init1(&ap, p->len, NULL, 0)) != BIG_OK) {
2406 		return (err);
2407 	}
2408 	if ((err = big_init1(&aq, q->len, NULL, 0)) != BIG_OK) {
2409 		goto ret1;
2410 	}
2411 	if ((err = big_init1(&tmp, biglen + q->len + 1, NULL, 0)) != BIG_OK) {
2412 		goto ret2;
2413 	}
2414 
2415 	/*
2416 	 * check whether a is too short - to avoid timing attacks
2417 	 */
2418 	alen = a->len;
2419 	while ((alen > p->len) && (a->value[alen - 1] == 0)) {
2420 		alen--;
2421 	}
2422 	if (alen < p->len + q->len) {
2423 		/*
2424 		 * a is too short, add p*q to it before
2425 		 * taking it modulo p and q
2426 		 * this will also affect timing, but this difference
2427 		 * does not depend on p or q, only on a
2428 		 * (in "normal" operation, this path will never be
2429 		 * taken, so it is not a performance penalty
2430 		 */
2431 		if ((err = big_mul(&tmp, p, q)) != BIG_OK) {
2432 			goto ret;
2433 		}
2434 		if ((err = big_add(&tmp, &tmp, a)) != BIG_OK) {
2435 			goto ret;
2436 		}
2437 		if ((err = big_div_pos(NULL, &ap, &tmp, p)) != BIG_OK) {
2438 			goto ret;
2439 		}
2440 		if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2441 			goto ret;
2442 		}
2443 	} else {
2444 		if ((err = big_div_pos(NULL, &ap, a, p)) != BIG_OK) {
2445 			goto ret;
2446 		}
2447 		if ((err = big_div_pos(NULL, &aq, a, q)) != BIG_OK) {
2448 			goto ret;
2449 		}
2450 	}
2451 
2452 	if ((err = big_modexp_ext(&ap, &ap, dmodpminus1, p, p_rr, info)) !=
2453 	    BIG_OK) {
2454 		goto ret;
2455 	}
2456 	if ((err = big_modexp_ext(&aq, &aq, dmodqminus1, q, q_rr, info)) !=
2457 	    BIG_OK) {
2458 		goto ret;
2459 	}
2460 	if ((err = big_sub(&tmp, &aq, &ap)) != BIG_OK) {
2461 		goto ret;
2462 	}
2463 	if ((err = big_mul(&tmp, &tmp, pinvmodq)) != BIG_OK) {
2464 		goto ret;
2465 	}
2466 	sign = tmp.sign;
2467 	tmp.sign = 1;
2468 	if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2469 		goto ret;
2470 	}
2471 	if ((sign == -1) && (!big_is_zero(&aq))) {
2472 		(void) big_sub_pos(&aq, q, &aq);
2473 	}
2474 	if ((err = big_mul(&tmp, &aq, p)) != BIG_OK) {
2475 		goto ret;
2476 	}
2477 	err = big_add_abs(result, &ap, &tmp);
2478 
2479 ret:
2480 	big_finish(&tmp);
2481 ret2:
2482 	big_finish(&aq);
2483 ret1:
2484 	big_finish(&ap);
2485 
2486 	return (err);
2487 }
2488 
2489 
2490 BIG_ERR_CODE
2491 big_modexp_crt(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2492     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2493     BIGNUM *p_rr, BIGNUM *q_rr)
2494 {
2495 	return (big_modexp_crt_ext(result, a, dmodpminus1, dmodqminus1,
2496 	    p, q, pinvmodq, p_rr, q_rr, NULL));
2497 }
2498 
2499 
2500 static BIG_CHUNK_TYPE onearr[1] = {(BIG_CHUNK_TYPE)1};
2501 BIGNUM big_One = {1, 1, 1, 0, onearr};
2502 
2503 static BIG_CHUNK_TYPE twoarr[1] = {(BIG_CHUNK_TYPE)2};
2504 BIGNUM big_Two = {1, 1, 1, 0, twoarr};
2505 
2506 static BIG_CHUNK_TYPE fourarr[1] = {(BIG_CHUNK_TYPE)4};
2507 static BIGNUM big_Four = {1, 1, 1, 0, fourarr};
2508 
2509 
2510 BIG_ERR_CODE
2511 big_sqrt_pos(BIGNUM *result, BIGNUM *n)
2512 {
2513 	BIGNUM		*high, *low, *mid, *t;
2514 	BIGNUM		t1, t2, t3, prod;
2515 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2516 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2517 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2518 	BIG_CHUNK_TYPE	prodvalue[BIGTMPSIZE];
2519 	int		i, diff;
2520 	uint32_t	nbits, nrootbits, highbits;
2521 	BIG_ERR_CODE	err;
2522 
2523 	nbits = big_numbits(n);
2524 
2525 	if ((err = big_init1(&t1, n->len + 1,
2526 	    t1value, arraysize(t1value))) != BIG_OK)
2527 		return (err);
2528 	if ((err = big_init1(&t2, n->len + 1,
2529 	    t2value, arraysize(t2value))) != BIG_OK)
2530 		goto ret1;
2531 	if ((err = big_init1(&t3, n->len + 1,
2532 	    t3value, arraysize(t3value))) != BIG_OK)
2533 		goto ret2;
2534 	if ((err = big_init1(&prod, n->len + 1,
2535 	    prodvalue, arraysize(prodvalue))) != BIG_OK)
2536 		goto ret3;
2537 
2538 	nrootbits = (nbits + 1) / 2;
2539 	t1.len = t2.len = t3.len = (nrootbits - 1) / BIG_CHUNK_SIZE + 1;
2540 	for (i = 0; i < t1.len; i++) {
2541 		t1.value[i] = 0;
2542 		t2.value[i] = BIG_CHUNK_ALLBITS;
2543 	}
2544 	highbits = nrootbits - BIG_CHUNK_SIZE * (t1.len - 1);
2545 	if (highbits == BIG_CHUNK_SIZE) {
2546 		t1.value[t1.len - 1] = BIG_CHUNK_HIGHBIT;
2547 		t2.value[t2.len - 1] = BIG_CHUNK_ALLBITS;
2548 	} else {
2549 		t1.value[t1.len - 1] = (BIG_CHUNK_TYPE)1 << (highbits - 1);
2550 		t2.value[t2.len - 1] = 2 * t1.value[t1.len - 1] - 1;
2551 	}
2552 
2553 	high = &t2;
2554 	low = &t1;
2555 	mid = &t3;
2556 
2557 	if ((err = big_mul(&prod, high, high)) != BIG_OK) {
2558 		goto ret;
2559 	}
2560 	diff = big_cmp_abs(&prod, n);
2561 	if (diff <= 0) {
2562 		err = big_copy(result, high);
2563 		goto ret;
2564 	}
2565 
2566 	(void) big_sub_pos(mid, high, low);
2567 	while (big_cmp_abs(&big_One, mid) != 0) {
2568 		(void) big_add_abs(mid, high, low);
2569 		(void) big_half_pos(mid, mid);
2570 		if ((err = big_mul(&prod, mid, mid)) != BIG_OK)
2571 			goto ret;
2572 		diff = big_cmp_abs(&prod, n);
2573 		if (diff > 0) {
2574 			t = high;
2575 			high = mid;
2576 			mid = t;
2577 		} else if (diff < 0) {
2578 			t = low;
2579 			low = mid;
2580 			mid = t;
2581 		} else {
2582 			err = big_copy(result, low);
2583 			goto ret;
2584 		}
2585 		(void) big_sub_pos(mid, high, low);
2586 	}
2587 
2588 	err = big_copy(result, low);
2589 ret:
2590 	if (prod.malloced) big_finish(&prod);
2591 ret3:
2592 	if (t3.malloced) big_finish(&t3);
2593 ret2:
2594 	if (t2.malloced) big_finish(&t2);
2595 ret1:
2596 	if (t1.malloced) big_finish(&t1);
2597 
2598 	return (err);
2599 }
2600 
2601 
2602 BIG_ERR_CODE
2603 big_Jacobi_pos(int *jac, BIGNUM *nn, BIGNUM *mm)
2604 {
2605 	BIGNUM		*t, *tmp2, *m, *n;
2606 	BIGNUM		t1, t2, t3;
2607 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2608 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2609 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2610 	int		len, err;
2611 
2612 	if (big_is_zero(nn) ||
2613 	    (((nn->value[0] & 1) | (mm->value[0] & 1)) == 0)) {
2614 		*jac = 0;
2615 		return (BIG_OK);
2616 	}
2617 
2618 	if (nn->len > mm->len) {
2619 		len = nn->len;
2620 	} else {
2621 		len = mm->len;
2622 	}
2623 
2624 	if ((err = big_init1(&t1, len,
2625 	    t1value, arraysize(t1value))) != BIG_OK) {
2626 		return (err);
2627 	}
2628 	if ((err = big_init1(&t2, len,
2629 	    t2value, arraysize(t2value))) != BIG_OK) {
2630 		goto ret1;
2631 	}
2632 	if ((err = big_init1(&t3, len,
2633 	    t3value, arraysize(t3value))) != BIG_OK) {
2634 		goto ret2;
2635 	}
2636 
2637 	n = &t1;
2638 	m = &t2;
2639 	tmp2 = &t3;
2640 
2641 	(void) big_copy(n, nn);
2642 	(void) big_copy(m, mm);
2643 
2644 	*jac = 1;
2645 	while (big_cmp_abs(&big_One, m) != 0) {
2646 		if (big_is_zero(n)) {
2647 			*jac = 0;
2648 			goto ret;
2649 		}
2650 		if ((m->value[0] & 1) == 0) {
2651 			if (((n->value[0] & 7) == 3) ||
2652 			    ((n->value[0] & 7) == 5))
2653 				*jac = -*jac;
2654 			(void) big_half_pos(m, m);
2655 		} else if ((n->value[0] & 1) == 0) {
2656 			if (((m->value[0] & 7) == 3) ||
2657 			    ((m->value[0] & 7) == 5))
2658 				*jac = -*jac;
2659 			(void) big_half_pos(n, n);
2660 		} else {
2661 			if (((m->value[0] & 3) == 3) &&
2662 			    ((n->value[0] & 3) == 3)) {
2663 				*jac = -*jac;
2664 			}
2665 			if ((err = big_div_pos(NULL, tmp2, m, n)) != BIG_OK) {
2666 				goto ret;
2667 			}
2668 			t = tmp2;
2669 			tmp2 = m;
2670 			m = n;
2671 			n = t;
2672 		}
2673 	}
2674 	err = BIG_OK;
2675 
2676 ret:
2677 	if (t3.malloced) big_finish(&t3);
2678 ret2:
2679 	if (t2.malloced) big_finish(&t2);
2680 ret1:
2681 	if (t1.malloced) big_finish(&t1);
2682 
2683 	return (err);
2684 }
2685 
2686 
2687 BIG_ERR_CODE
2688 big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
2689 {
2690 	int		i;
2691 	uint32_t	m, w;
2692 	BIG_CHUNK_TYPE	bit;
2693 	BIGNUM		ki, tmp, tmp2;
2694 	BIG_CHUNK_TYPE	kivalue[BIGTMPSIZE];
2695 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2696 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
2697 	BIG_ERR_CODE	err;
2698 
2699 	if (big_cmp_abs(k, &big_One) == 0) {
2700 		(void) big_copy(Lk, p);
2701 		(void) big_copy(Lkminus1, &big_Two);
2702 		return (BIG_OK);
2703 	}
2704 
2705 	if ((err = big_init1(&ki, k->len + 1,
2706 	    kivalue, arraysize(kivalue))) != BIG_OK)
2707 		return (err);
2708 
2709 	if ((err = big_init1(&tmp, 2 * n->len + 1,
2710 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK)
2711 		goto ret1;
2712 
2713 	if ((err = big_init1(&tmp2, n->len,
2714 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
2715 		goto ret2;
2716 
2717 	m = big_numbits(k);
2718 	ki.len = (m - 1) / BIG_CHUNK_SIZE + 1;
2719 	w = (m - 1) / BIG_CHUNK_SIZE;
2720 	bit = (BIG_CHUNK_TYPE)1 << ((m - 1) % BIG_CHUNK_SIZE);
2721 	for (i = 0; i < ki.len; i++) {
2722 		ki.value[i] = 0;
2723 	}
2724 	ki.value[ki.len - 1] = bit;
2725 	if (big_cmp_abs(k, &ki) != 0) {
2726 		(void) big_double(&ki, &ki);
2727 	}
2728 	(void) big_sub_pos(&ki, &ki, k);
2729 
2730 	(void) big_copy(Lk, p);
2731 	(void) big_copy(Lkminus1, &big_Two);
2732 
2733 	for (i = 0; i < m; i++) {
2734 		if ((err = big_mul(&tmp, Lk, Lkminus1)) != BIG_OK) {
2735 			goto ret;
2736 		}
2737 		(void) big_add_abs(&tmp, &tmp, n);
2738 		(void) big_sub_pos(&tmp, &tmp, p);
2739 		if ((err = big_div_pos(NULL, &tmp2, &tmp, n)) != BIG_OK) {
2740 			goto ret;
2741 		}
2742 		if ((ki.value[w] & bit) != 0) {
2743 			if ((err = big_mul(&tmp, Lkminus1, Lkminus1)) !=
2744 			    BIG_OK) {
2745 				goto ret;
2746 			}
2747 			(void) big_add_abs(&tmp, &tmp, n);
2748 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2749 			if ((err = big_div_pos(NULL, Lkminus1, &tmp, n)) !=
2750 			    BIG_OK) {
2751 				goto ret;
2752 			}
2753 			(void) big_copy(Lk, &tmp2);
2754 		} else {
2755 			if ((err = big_mul(&tmp, Lk, Lk)) != BIG_OK) {
2756 				goto ret;
2757 			}
2758 			(void) big_add_abs(&tmp, &tmp, n);
2759 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2760 			if ((err = big_div_pos(NULL, Lk, &tmp, n)) != BIG_OK) {
2761 				goto ret;
2762 			}
2763 			(void) big_copy(Lkminus1, &tmp2);
2764 		}
2765 		bit = bit >> 1;
2766 		if (bit == 0) {
2767 			bit = BIG_CHUNK_HIGHBIT;
2768 			w--;
2769 		}
2770 	}
2771 
2772 	err = BIG_OK;
2773 
2774 ret:
2775 	if (tmp2.malloced) big_finish(&tmp2);
2776 ret2:
2777 	if (tmp.malloced) big_finish(&tmp);
2778 ret1:
2779 	if (ki.malloced) big_finish(&ki);
2780 
2781 	return (err);
2782 }
2783 
2784 
2785 BIG_ERR_CODE
2786 big_isprime_pos_ext(BIGNUM *n, big_modexp_ncp_info_t *info)
2787 {
2788 	BIGNUM		o, nminus1, tmp, Lkminus1, Lk;
2789 	BIG_CHUNK_TYPE	ovalue[BIGTMPSIZE];
2790 	BIG_CHUNK_TYPE	nminus1value[BIGTMPSIZE];
2791 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2792 	BIG_CHUNK_TYPE	Lkminus1value[BIGTMPSIZE];
2793 	BIG_CHUNK_TYPE	Lkvalue[BIGTMPSIZE];
2794 	BIG_ERR_CODE	err;
2795 	int		e, i, jac;
2796 
2797 	if (big_cmp_abs(n, &big_One) == 0) {
2798 		return (BIG_FALSE);
2799 	}
2800 	if (big_cmp_abs(n, &big_Two) == 0) {
2801 		return (BIG_TRUE);
2802 	}
2803 	if ((n->value[0] & 1) == 0) {
2804 		return (BIG_FALSE);
2805 	}
2806 
2807 	if ((err = big_init1(&o, n->len, ovalue, arraysize(ovalue))) !=
2808 	    BIG_OK) {
2809 		return (err);
2810 	}
2811 
2812 	if ((err = big_init1(&nminus1, n->len,
2813 	    nminus1value, arraysize(nminus1value))) != BIG_OK) {
2814 		goto ret1;
2815 	}
2816 
2817 	if ((err = big_init1(&tmp, 2 * n->len,
2818 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2819 		goto ret2;
2820 	}
2821 
2822 	if ((err = big_init1(&Lkminus1, n->len,
2823 	    Lkminus1value, arraysize(Lkminus1value))) != BIG_OK) {
2824 		goto ret3;
2825 	}
2826 
2827 	if ((err = big_init1(&Lk, n->len,
2828 	    Lkvalue, arraysize(Lkvalue))) != BIG_OK) {
2829 		goto ret4;
2830 	}
2831 
2832 	(void) big_sub_pos(&o, n, &big_One);	/* cannot fail */
2833 	(void) big_copy(&nminus1, &o);		/* cannot fail */
2834 	e = 0;
2835 	while ((o.value[0] & 1) == 0) {
2836 		e++;
2837 		(void) big_half_pos(&o, &o);  /* cannot fail */
2838 	}
2839 	if ((err = big_modexp_ext(&tmp, &big_Two, &o, n, NULL, info)) !=
2840 	    BIG_OK) {
2841 		goto ret;
2842 	}
2843 
2844 	i = 0;
2845 	while ((i < e) &&
2846 	    (big_cmp_abs(&tmp, &big_One) != 0) &&
2847 	    (big_cmp_abs(&tmp, &nminus1) != 0)) {
2848 		if ((err =
2849 		    big_modexp_ext(&tmp, &tmp, &big_Two, n, NULL, info)) !=
2850 		    BIG_OK)
2851 			goto ret;
2852 		i++;
2853 	}
2854 
2855 	if (!((big_cmp_abs(&tmp, &nminus1) == 0) ||
2856 	    ((i == 0) && (big_cmp_abs(&tmp, &big_One) == 0)))) {
2857 		err = BIG_FALSE;
2858 		goto ret;
2859 	}
2860 
2861 	if ((err = big_sqrt_pos(&tmp, n)) != BIG_OK) {
2862 		goto ret;
2863 	}
2864 
2865 	if ((err = big_mul(&tmp, &tmp, &tmp)) != BIG_OK) {
2866 		goto ret;
2867 	}
2868 	if (big_cmp_abs(&tmp, n) == 0) {
2869 		err = BIG_FALSE;
2870 		goto ret;
2871 	}
2872 
2873 	(void) big_copy(&o, &big_Two);
2874 	do {
2875 		(void) big_add_abs(&o, &o, &big_One);
2876 		if ((err = big_mul(&tmp, &o, &o)) != BIG_OK) {
2877 			goto ret;
2878 		}
2879 		(void) big_sub_pos(&tmp, &tmp, &big_Four);
2880 		if ((err = big_Jacobi_pos(&jac, &tmp, n)) != BIG_OK) {
2881 			goto ret;
2882 		}
2883 	} while (jac != -1);
2884 
2885 	(void) big_add_abs(&tmp, n, &big_One);
2886 	if ((err = big_Lucas(&Lkminus1, &Lk, &o, &tmp, n)) != BIG_OK) {
2887 		goto ret;
2888 	}
2889 
2890 	if ((big_cmp_abs(&Lkminus1, &o) == 0) &&
2891 	    (big_cmp_abs(&Lk, &big_Two) == 0)) {
2892 		err = BIG_TRUE;
2893 	} else {
2894 		err = BIG_FALSE;
2895 	}
2896 
2897 ret:
2898 	if (Lk.malloced) big_finish(&Lk);
2899 ret4:
2900 	if (Lkminus1.malloced) big_finish(&Lkminus1);
2901 ret3:
2902 	if (tmp.malloced) big_finish(&tmp);
2903 ret2:
2904 	if (nminus1.malloced) big_finish(&nminus1);
2905 ret1:
2906 	if (o.malloced) big_finish(&o);
2907 
2908 	return (err);
2909 }
2910 
2911 
2912 BIG_ERR_CODE
2913 big_isprime_pos(BIGNUM *n)
2914 {
2915 	return (big_isprime_pos_ext(n, NULL));
2916 }
2917 
2918 
2919 #define	SIEVESIZE 1000
2920 
2921 
2922 BIG_ERR_CODE
2923 big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
2924 {
2925 	static const uint32_t smallprimes[] = {
2926 	    3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
2927 	    51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97 };
2928 	BIG_ERR_CODE	err;
2929 	int		sieve[SIEVESIZE];
2930 	int		i;
2931 	uint32_t	off, p;
2932 
2933 	if ((err = big_copy(result, n)) != BIG_OK) {
2934 		return (err);
2935 	}
2936 	result->value[0] |= 1;
2937 	/* CONSTCOND */
2938 	while (1) {
2939 		for (i = 0; i < SIEVESIZE; i++) sieve[i] = 0;
2940 		for (i = 0;
2941 		    i < sizeof (smallprimes) / sizeof (smallprimes[0]); i++) {
2942 			p = smallprimes[i];
2943 			off = big_modhalf_pos(result, p);
2944 			off = p - off;
2945 			if ((off % 2) == 1) {
2946 				off = off + p;
2947 			}
2948 			off = off / 2;
2949 			while (off < SIEVESIZE) {
2950 				sieve[off] = 1;
2951 				off = off + p;
2952 			}
2953 		}
2954 
2955 		for (i = 0; i < SIEVESIZE; i++) {
2956 			if (sieve[i] == 0) {
2957 				err = big_isprime_pos_ext(result, info);
2958 				if (err != BIG_FALSE) {
2959 					if (err != BIG_TRUE) {
2960 						return (err);
2961 					} else {
2962 						goto out;
2963 					}
2964 				}
2965 
2966 			}
2967 			if ((err = big_add_abs(result, result, &big_Two)) !=
2968 			    BIG_OK) {
2969 				return (err);
2970 			}
2971 		}
2972 	}
2973 
2974 out:
2975 	return (BIG_OK);
2976 }
2977 
2978 
2979 BIG_ERR_CODE
2980 big_nextprime_pos(BIGNUM *result, BIGNUM *n)
2981 {
2982 	return (big_nextprime_pos_ext(result, n, NULL));
2983 }
2984 
2985 
2986 BIG_ERR_CODE
2987 big_nextprime_pos_slow(BIGNUM *result, BIGNUM *n)
2988 {
2989 	BIG_ERR_CODE err;
2990 
2991 
2992 	if ((err = big_copy(result, n)) != BIG_OK)
2993 		return (err);
2994 	result->value[0] |= 1;
2995 	while ((err = big_isprime_pos_ext(result, NULL)) != BIG_TRUE) {
2996 		if (err != BIG_FALSE)
2997 			return (err);
2998 		if ((err = big_add_abs(result, result, &big_Two)) != BIG_OK)
2999 			return (err);
3000 	}
3001 	return (BIG_OK);
3002 }
3003 
3004 
3005 /*
3006  * given m and e, computes the rest in the equation
3007  * gcd(m, e) = cm * m + ce * e
3008  */
3009 BIG_ERR_CODE
3010 big_ext_gcd_pos(BIGNUM *gcd, BIGNUM *cm, BIGNUM *ce, BIGNUM *m, BIGNUM *e)
3011 {
3012 	BIGNUM		*xi, *ri, *riminus1, *riminus2, *t;
3013 	BIGNUM		*vmi, *vei, *vmiminus1, *veiminus1;
3014 	BIGNUM		t1, t2, t3, t4, t5, t6, t7, t8, tmp;
3015 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
3016 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
3017 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
3018 	BIG_CHUNK_TYPE	t4value[BIGTMPSIZE];
3019 	BIG_CHUNK_TYPE	t5value[BIGTMPSIZE];
3020 	BIG_CHUNK_TYPE	t6value[BIGTMPSIZE];
3021 	BIG_CHUNK_TYPE	t7value[BIGTMPSIZE];
3022 	BIG_CHUNK_TYPE	t8value[BIGTMPSIZE];
3023 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
3024 	BIG_ERR_CODE	err;
3025 	int		len;
3026 
3027 	if (big_cmp_abs(m, e) >= 0) {
3028 		len = m->len;
3029 	} else {
3030 		len = e->len;
3031 	}
3032 
3033 	if ((err = big_init1(&t1, len,
3034 	    t1value, arraysize(t1value))) != BIG_OK) {
3035 		return (err);
3036 	}
3037 	if ((err = big_init1(&t2, len,
3038 	    t2value, arraysize(t2value))) != BIG_OK) {
3039 		goto ret1;
3040 	}
3041 	if ((err = big_init1(&t3, len,
3042 	    t3value, arraysize(t3value))) != BIG_OK) {
3043 		goto ret2;
3044 	}
3045 	if ((err = big_init1(&t4, len,
3046 	    t4value, arraysize(t3value))) != BIG_OK) {
3047 		goto ret3;
3048 	}
3049 	if ((err = big_init1(&t5, len,
3050 	    t5value, arraysize(t5value))) != BIG_OK) {
3051 		goto ret4;
3052 	}
3053 	if ((err = big_init1(&t6, len,
3054 	    t6value, arraysize(t6value))) != BIG_OK) {
3055 		goto ret5;
3056 	}
3057 	if ((err = big_init1(&t7, len,
3058 	    t7value, arraysize(t7value))) != BIG_OK) {
3059 		goto ret6;
3060 	}
3061 	if ((err = big_init1(&t8, len,
3062 	    t8value, arraysize(t8value))) != BIG_OK) {
3063 		goto ret7;
3064 	}
3065 
3066 	if ((err = big_init1(&tmp, 2 * len,
3067 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
3068 		goto ret8;
3069 	}
3070 
3071 	ri = &t1;
3072 	ri->value[0] = 1;
3073 	ri->len = 1;
3074 	xi = &t2;
3075 	riminus1 = &t3;
3076 	riminus2 = &t4;
3077 	vmi = &t5;
3078 	vei = &t6;
3079 	vmiminus1 = &t7;
3080 	veiminus1 = &t8;
3081 
3082 	(void) big_copy(vmiminus1, &big_One);
3083 	(void) big_copy(vmi, &big_One);
3084 	(void) big_copy(veiminus1, &big_One);
3085 	(void) big_copy(xi, &big_One);
3086 	vei->len = 1;
3087 	vei->value[0] = 0;
3088 
3089 	(void) big_copy(riminus1, m);
3090 	(void) big_copy(ri, e);
3091 
3092 	while (!big_is_zero(ri)) {
3093 		t = riminus2;
3094 		riminus2 = riminus1;
3095 		riminus1 = ri;
3096 		ri = t;
3097 		if ((err = big_mul(&tmp, vmi, xi)) != BIG_OK) {
3098 			goto ret;
3099 		}
3100 		if ((err = big_sub(vmiminus1, vmiminus1, &tmp)) != BIG_OK) {
3101 			goto ret;
3102 		}
3103 		t = vmiminus1;
3104 		vmiminus1 = vmi;
3105 		vmi = t;
3106 		if ((err = big_mul(&tmp, vei, xi)) != BIG_OK) {
3107 			goto ret;
3108 		}
3109 		if ((err = big_sub(veiminus1, veiminus1, &tmp)) != BIG_OK) {
3110 			goto ret;
3111 		}
3112 		t = veiminus1;
3113 		veiminus1 = vei;
3114 		vei = t;
3115 		if ((err = big_div_pos(xi, ri, riminus2, riminus1)) !=
3116 		    BIG_OK) {
3117 			goto ret;
3118 		}
3119 	}
3120 	if ((gcd != NULL) && ((err = big_copy(gcd, riminus1)) != BIG_OK)) {
3121 		goto ret;
3122 	}
3123 	if ((cm != NULL) && ((err = big_copy(cm, vmi)) != BIG_OK)) {
3124 		goto ret;
3125 	}
3126 	if (ce != NULL) {
3127 		err = big_copy(ce, vei);
3128 	}
3129 ret:
3130 	if (tmp.malloced) big_finish(&tmp);
3131 ret8:
3132 	if (t8.malloced) big_finish(&t8);
3133 ret7:
3134 	if (t7.malloced) big_finish(&t7);
3135 ret6:
3136 	if (t6.malloced) big_finish(&t6);
3137 ret5:
3138 	if (t5.malloced) big_finish(&t5);
3139 ret4:
3140 	if (t4.malloced) big_finish(&t4);
3141 ret3:
3142 	if (t3.malloced) big_finish(&t3);
3143 ret2:
3144 	if (t2.malloced) big_finish(&t2);
3145 ret1:
3146 	if (t1.malloced) big_finish(&t1);
3147 
3148 	return (err);
3149 }
3150