xref: /titanic_44/usr/src/common/bignum/bignumimpl.c (revision 8793b36b40d14ad0a0fecc97738dc118a928f46c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #define	big_div_pos_fast big_div_pos
29 
30 #include "bignum.h"
31 
32 /*
33  * Configuration guide
34  * -------------------
35  *
36  * There are 4 preprocessor symbols used to configure the bignum
37  * implementation.  This file contains no logic to configure based on
38  * processor; we leave that to the Makefiles to specify.
39  *
40  * USE_FLOATING_POINT
41  *   Meaning: There is support for a fast floating-point implementation of
42  *   Montgomery multiply.
43  *
44  * PSR_MUL
45  *   Meaning: There are processor-specific versions of the low level
46  *   functions to implement big_mul.  Those functions are: big_mul_set_vec,
47  *   big_mul_add_vec, big_mul_vec, and big_sqr_vec.  PSR_MUL implies support
48  *   for all 4 functions.  You cannot pick and choose which subset of these
49  *   functions to support; that would lead to a rat's nest of #ifdefs.
50  *
51  * HWCAP
52  *   Meaning: Call multiply support functions through a function pointer.
53  *   On x86, there are multiple implementations for differnt hardware
54  *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
55  *   a function is first used.  So, the support functions are called through
56  *   a function pointer.  There is no need for that on Sparc, because there
57  *   is only one implementation; support functions are called directly.
58  *   Later, if there were some new VIS instruction, or something, and a
59  *   run-time test were needed, rather than variant kernel modules and
60  *   libraries, then HWCAP would be defined for Sparc, as well.
61  *
62  * UMUL64
63  *   Meaning: It is safe to use generic C code that assumes the existence
64  *   of a 32 x 32 --> 64 bit unsigned multiply.  If this is not defined,
65  *   then the generic code for big_mul_add_vec() must necessarily be very slow,
66  *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
67  *
68  */
69 
70 
71 #ifdef	_KERNEL
72 #include <sys/ddi.h>
73 #include <sys/mdesc.h>
74 #include <sys/crypto/common.h>
75 
76 #include <sys/types.h>
77 #include <sys/kmem.h>
78 #include <sys/param.h>
79 #include <sys/sunddi.h>
80 
81 #define	big_malloc(size)	kmem_alloc(size, KM_NOSLEEP)
82 #define	big_free(ptr, size)	kmem_free(ptr, size)
83 
84 void *
85 big_realloc(void *from, size_t oldsize, size_t newsize)
86 {
87 	void *rv;
88 
89 	rv = kmem_alloc(newsize, KM_NOSLEEP);
90 	if (rv != NULL)
91 		bcopy(from, rv, oldsize);
92 	kmem_free(from, oldsize);
93 	return (rv);
94 }
95 
96 #else	/* _KERNEL */
97 
98 #include <stdlib.h>
99 #include <stdio.h>
100 #include <assert.h>
101 #define	ASSERT	assert
102 
103 #ifndef MALLOC_DEBUG
104 
105 #define	big_malloc(size)	malloc(size)
106 #define	big_free(ptr, size)	free(ptr)
107 
108 #else
109 
110 void
111 big_free(void *ptr, size_t size)
112 {
113 	printf("freed %d bytes at %p\n", size, ptr);
114 	free(ptr);
115 }
116 
117 void *
118 big_malloc(size_t size)
119 {
120 	void *rv;
121 	rv = malloc(size);
122 	printf("malloced %d bytes, addr:%p\n", size, rv);
123 	return (rv);
124 }
125 #endif /* MALLOC_DEBUG */
126 
127 #define	big_realloc(x, y, z) realloc((x), (z))
128 
129 void
130 printbignum(char *aname, BIGNUM *a)
131 {
132 	int i;
133 
134 	(void) printf("\n%s\n%d\n", aname, a->sign*a->len);
135 	for (i = a->len - 1; i >= 0; i--) {
136 #ifdef BIGNUM_CHUNK_32
137 		(void) printf("%08x ", a->value[i]);
138 		if ((i % 8 == 0) && (i != 0)) {
139 			(void) printf("\n");
140 		}
141 #else
142 		(void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
143 		    (uint32_t)((a->value[i]) & 0xffffffff));
144 		if ((i % 4 == 0) && (i != 0)) {
145 			(void) printf("\n");
146 		}
147 #endif
148 	}
149 	(void) printf("\n");
150 }
151 
152 #endif	/* _KERNEL */
153 
154 
155 /* size in BIG_CHUNK_SIZE-bit words */
156 BIG_ERR_CODE
157 big_init(BIGNUM *number, int size)
158 {
159 	number->value = big_malloc(sizeof (BIG_CHUNK_TYPE) * size);
160 	if (number->value == NULL) {
161 		return (BIG_NO_MEM);
162 	}
163 	number->size = size;
164 	number->len = 0;
165 	number->sign = 1;
166 	number->malloced = 1;
167 	return (BIG_OK);
168 }
169 
170 /* size in BIG_CHUNK_SIZE-bit words */
171 BIG_ERR_CODE
172 big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
173 {
174 	if ((buf == NULL) || (size > bufsize)) {
175 		number->value = big_malloc(sizeof (BIG_CHUNK_TYPE) * size);
176 		if (number->value == NULL) {
177 			return (BIG_NO_MEM);
178 		}
179 		number->size = size;
180 		number->malloced = 1;
181 	} else {
182 		number->value = buf;
183 		number->size = bufsize;
184 		number->malloced = 0;
185 	}
186 		number->len = 0;
187 		number->sign = 1;
188 
189 	return (BIG_OK);
190 }
191 
192 void
193 big_finish(BIGNUM *number)
194 {
195 	if (number->malloced == 1) {
196 		big_free(number->value,
197 		    sizeof (BIG_CHUNK_TYPE) * number->size);
198 		number->malloced = 0;
199 	}
200 }
201 
202 
203 /*
204  *  bn->size should be at least
205  * (len + sizeof (BIG_CHUNK_TYPE) - 1) / sizeof (BIG_CHUNK_TYPE) bytes
206  * converts from byte-big-endian format to bignum format (words in
207  * little endian order, but bytes within the words big endian)
208  */
209 void
210 bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
211 {
212 	int		i, j, offs;
213 	BIG_CHUNK_TYPE	word;
214 	uchar_t		*knwordp;
215 
216 #ifdef	_LP64
217 	offs = (uint32_t)len % sizeof (BIG_CHUNK_TYPE);
218 	bn->len = (uint32_t)len / sizeof (BIG_CHUNK_TYPE);
219 
220 	for (i = 0; i < (uint32_t)len / sizeof (BIG_CHUNK_TYPE); i++) {
221 #else	/* !_LP64 */
222 	offs = len % sizeof (BIG_CHUNK_TYPE);
223 	bn->len = len / sizeof (BIG_CHUNK_TYPE);
224 	for (i = 0; i < len / sizeof (BIG_CHUNK_TYPE); i++) {
225 #endif	/* _LP64 */
226 		knwordp = &(kn[len - sizeof (BIG_CHUNK_TYPE) * (i + 1)]);
227 		word = knwordp[0];
228 		for (j = 1; j < sizeof (BIG_CHUNK_TYPE); j++) {
229 			word = (word << 8)+ knwordp[j];
230 		}
231 		bn->value[i] = word;
232 	}
233 	if (offs > 0) {
234 		word = kn[0];
235 		for (i = 1; i < offs; i++) word = (word << 8) + kn[i];
236 		bn->value[bn->len++] = word;
237 	}
238 	while ((bn->len > 1) && (bn->value[bn->len-1] == 0)) {
239 		bn->len --;
240 	}
241 }
242 
243 /*
244  * copies the least significant len bytes if
245  * len < bn->len * sizeof (BIG_CHUNK_TYPE)
246  * converts from bignum format to byte-big-endian format.
247  * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
248  */
249 void
250 bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
251 {
252 	int		i, j, offs;
253 	BIG_CHUNK_TYPE	word;
254 
255 	if (len < sizeof (BIG_CHUNK_TYPE) * bn->len) {
256 #ifdef	_LP64
257 		for (i = 0; i < (uint32_t)len / sizeof (BIG_CHUNK_TYPE); i++) {
258 #else	/* !_LP64 */
259 		for (i = 0; i < len / sizeof (BIG_CHUNK_TYPE); i++) {
260 #endif	/* _LP64 */
261 			word = bn->value[i];
262 			for (j = 0; j < sizeof (BIG_CHUNK_TYPE); j++) {
263 				kn[len - sizeof (BIG_CHUNK_TYPE) * i - j - 1] =
264 				    word & 0xff;
265 				word = word >> 8;
266 			}
267 		}
268 #ifdef	_LP64
269 		offs = (uint32_t)len % sizeof (BIG_CHUNK_TYPE);
270 #else	/* !_LP64 */
271 		offs = len % sizeof (BIG_CHUNK_TYPE);
272 #endif	/* _LP64 */
273 		if (offs > 0) {
274 			word = bn->value[len / sizeof (BIG_CHUNK_TYPE)];
275 #ifdef	_LP64
276 			for (i =  (uint32_t)len % sizeof (BIG_CHUNK_TYPE);
277 			    i > 0; i --) {
278 #else	/* !_LP64 */
279 			for (i = len % sizeof (BIG_CHUNK_TYPE);
280 			    i > 0; i --) {
281 #endif	/* _LP64 */
282 				kn[i - 1] = word & 0xff;
283 				word = word >> 8;
284 			}
285 		}
286 	} else {
287 		for (i = 0; i < bn->len; i++) {
288 			word = bn->value[i];
289 			for (j = 0; j < sizeof (BIG_CHUNK_TYPE); j++) {
290 				kn[len - sizeof (BIG_CHUNK_TYPE) * i - j - 1] =
291 				    word & 0xff;
292 				word = word >> 8;
293 			}
294 		}
295 #ifdef	_LP64
296 		for (i = 0;
297 		    i < (uint32_t)len - sizeof (BIG_CHUNK_TYPE) * bn->len;
298 		    i++) {
299 #else	/* !_LP64 */
300 		for (i = 0; i < len - sizeof (BIG_CHUNK_TYPE) * bn->len; i++) {
301 #endif	/* _LP64 */
302 			kn[i] = 0;
303 		}
304 	}
305 }
306 
307 
308 int
309 big_bitlength(BIGNUM *a)
310 {
311 	int		l = 0, b = 0;
312 	BIG_CHUNK_TYPE	c;
313 
314 	l = a->len - 1;
315 	while ((l > 0) && (a->value[l] == 0)) {
316 		l--;
317 	}
318 	b = sizeof (BIG_CHUNK_TYPE) * BITSINBYTE;
319 	c = a->value[l];
320 	while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
321 		c = c << 1;
322 		b--;
323 	}
324 
325 	return (l * sizeof (BIG_CHUNK_TYPE) * BITSINBYTE + b);
326 }
327 
328 
329 BIG_ERR_CODE
330 big_copy(BIGNUM *dest, BIGNUM *src)
331 {
332 	BIG_CHUNK_TYPE	*newptr;
333 	int		i, len;
334 
335 	len = src->len;
336 	while ((len > 1) && (src->value[len - 1] == 0)) {
337 		len--;
338 	}
339 	src->len = len;
340 	if (dest->size < len) {
341 		if (dest->malloced == 1) {
342 			newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
343 			    sizeof (BIG_CHUNK_TYPE) * dest->size,
344 			    sizeof (BIG_CHUNK_TYPE) * len);
345 		} else {
346 			newptr = (BIG_CHUNK_TYPE *)
347 			    big_malloc(sizeof (BIG_CHUNK_TYPE) * len);
348 			if (newptr != NULL) {
349 				dest->malloced = 1;
350 			}
351 		}
352 		if (newptr == NULL) {
353 			return (BIG_NO_MEM);
354 		}
355 		dest->value = newptr;
356 		dest->size = len;
357 	}
358 	dest->len = len;
359 	dest->sign = src->sign;
360 	for (i = 0; i < len; i++) {
361 		dest->value[i] = src->value[i];
362 	}
363 
364 	return (BIG_OK);
365 }
366 
367 
368 BIG_ERR_CODE
369 big_extend(BIGNUM *number, int size)
370 {
371 	BIG_CHUNK_TYPE	*newptr;
372 	int		i;
373 
374 	if (number->size >= size)
375 		return (BIG_OK);
376 	if (number->malloced) {
377 		number->value = big_realloc(number->value,
378 		    sizeof (BIG_CHUNK_TYPE) * number->size,
379 		    sizeof (BIG_CHUNK_TYPE) * size);
380 	} else {
381 		newptr = big_malloc(sizeof (BIG_CHUNK_TYPE) * size);
382 		if (newptr != NULL) {
383 			for (i = 0; i < number->size; i++) {
384 				newptr[i] = number->value[i];
385 			}
386 		}
387 		number->value = newptr;
388 	}
389 
390 	if (number->value == NULL) {
391 		return (BIG_NO_MEM);
392 	}
393 
394 	number->size = size;
395 	number->malloced = 1;
396 	return (BIG_OK);
397 }
398 
399 
400 /* returns 1 if n == 0 */
401 int
402 big_is_zero(BIGNUM *n)
403 {
404 	int	i, result;
405 
406 	result = 1;
407 	for (i = 0; i < n->len; i++) {
408 		if (n->value[i] != 0) {
409 			result = 0;
410 		}
411 	}
412 	return (result);
413 }
414 
415 
416 BIG_ERR_CODE
417 big_add_abs(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
418 {
419 	int		i, shorter, longer;
420 	BIG_CHUNK_TYPE	cy, ai;
421 	BIG_CHUNK_TYPE	*r, *a, *b, *c;
422 	BIG_ERR_CODE	err;
423 	BIGNUM		*longerarg;
424 
425 	if (aa->len > bb->len) {
426 		shorter = bb->len;
427 		longer = aa->len;
428 		longerarg = aa;
429 	} else {
430 		shorter = aa->len;
431 		longer = bb->len;
432 		longerarg = bb;
433 	}
434 	if (result->size < longer + 1) {
435 		err = big_extend(result, longer + 1);
436 		if (err != BIG_OK) {
437 			return (err);
438 		}
439 	}
440 
441 	r = result->value;
442 	a = aa->value;
443 	b = bb->value;
444 	c = longerarg->value;
445 	cy = 0;
446 	for (i = 0; i < shorter; i++) {
447 		ai = a[i];
448 		r[i] = ai + b[i] + cy;
449 		if (r[i] > ai) {
450 			cy = 0;
451 		} else if (r[i] < ai) {
452 			cy = 1;
453 		}
454 	}
455 	for (; i < longer; i++) {
456 		ai = c[i];
457 		r[i] = ai + cy;
458 		if (r[i] >= ai) {
459 			cy = 0;
460 		}
461 	}
462 	if (cy == 1) {
463 		r[i] = cy;
464 		result->len = longer + 1;
465 	} else {
466 		result->len = longer;
467 	}
468 	result->sign = 1;
469 	return (BIG_OK);
470 }
471 
472 
473 /* caller must make sure that result has at least len words allocated */
474 void
475 big_sub_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, BIG_CHUNK_TYPE *b, int len)
476 {
477 	int		i;
478 	BIG_CHUNK_TYPE	cy, ai;
479 
480 	cy = 1;
481 	for (i = 0; i < len; i++) {
482 		ai = a[i];
483 		r[i] = ai + (~b[i]) + cy;
484 		if (r[i] > ai) {
485 			cy = 0;
486 		} else if (r[i] < ai) {
487 			cy = 1;
488 		}
489 	}
490 }
491 
492 
493 /* result=aa-bb  it is assumed that aa>=bb */
494 BIG_ERR_CODE
495 big_sub_pos(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
496 {
497 	int		i, shorter;
498 	BIG_CHUNK_TYPE	cy = 1, ai;
499 	BIG_CHUNK_TYPE	*r, *a, *b;
500 	BIG_ERR_CODE	err = BIG_OK;
501 
502 	if (aa->len > bb->len) {
503 		shorter = bb->len;
504 	} else {
505 		shorter = aa->len;
506 	}
507 	if (result->size < aa->len) {
508 		err = big_extend(result, aa->len);
509 		if (err != BIG_OK) {
510 			return (err);
511 		}
512 	}
513 
514 	r = result->value;
515 	a = aa->value;
516 	b = bb->value;
517 	result->len = aa->len;
518 	cy = 1;
519 	for (i = 0; i < shorter; i++) {
520 		ai = a[i];
521 		r[i] = ai + (~b[i]) + cy;
522 		if (r[i] > ai) {
523 			cy = 0;
524 		} else if (r[i] < ai) {
525 			cy = 1;
526 		}
527 	}
528 	for (; i < aa->len; i++) {
529 		ai = a[i];
530 		r[i] = ai + (~0) + cy;
531 		if (r[i] < ai) {
532 			cy = 1;
533 		}
534 	}
535 	result->sign = 1;
536 
537 	if (cy == 0) {
538 		return (BIG_INVALID_ARGS);
539 	} else {
540 		return (BIG_OK);
541 	}
542 }
543 
544 
545 /* returns -1 if |aa|<|bb|, 0 if |aa|==|bb| 1 if |aa|>|bb| */
546 int
547 big_cmp_abs(BIGNUM *aa, BIGNUM *bb)
548 {
549 	int	i;
550 
551 	if (aa->len > bb->len) {
552 		for (i = aa->len - 1; i > bb->len - 1; i--) {
553 			if (aa->value[i] > 0) {
554 				return (1);
555 			}
556 		}
557 	} else if (aa->len < bb->len) {
558 		for (i = bb->len - 1; i > aa->len - 1; i--) {
559 			if (bb->value[i] > 0) {
560 				return (-1);
561 			}
562 		}
563 	} else {
564 		i = aa->len-1;
565 	}
566 	for (; i >= 0; i--) {
567 		if (aa->value[i] > bb->value[i]) {
568 			return (1);
569 		} else if (aa->value[i] < bb->value[i]) {
570 			return (-1);
571 		}
572 	}
573 
574 	return (0);
575 }
576 
577 
578 BIG_ERR_CODE
579 big_sub(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
580 {
581 	BIG_ERR_CODE	err;
582 
583 	if ((bb->sign == -1) && (aa->sign == 1)) {
584 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
585 			return (err);
586 		}
587 		result->sign = 1;
588 	} else if ((aa->sign == -1) && (bb->sign == 1)) {
589 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
590 			return (err);
591 		}
592 		result->sign = -1;
593 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
594 		if (big_cmp_abs(aa, bb) >= 0) {
595 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
596 				return (err);
597 			}
598 			result->sign = 1;
599 		} else {
600 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
601 				return (err);
602 			}
603 			result->sign = -1;
604 		}
605 	} else {
606 		if (big_cmp_abs(aa, bb) >= 0) {
607 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
608 				return (err);
609 			}
610 			result->sign = -1;
611 		} else {
612 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
613 				return (err);
614 			}
615 			result->sign = 1;
616 		}
617 	}
618 	return (BIG_OK);
619 }
620 
621 
622 BIG_ERR_CODE
623 big_add(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
624 {
625 	BIG_ERR_CODE	err;
626 
627 	if ((bb->sign == -1) && (aa->sign == -1)) {
628 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
629 			return (err);
630 		}
631 		result->sign = -1;
632 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
633 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
634 			return (err);
635 		}
636 		result->sign = 1;
637 	} else if ((aa->sign == 1) && (bb->sign == -1)) {
638 		if (big_cmp_abs(aa, bb) >= 0) {
639 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
640 				return (err);
641 			}
642 			result->sign = 1;
643 		} else {
644 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
645 				return (err);
646 			}
647 			result->sign = -1;
648 		}
649 	} else {
650 		if (big_cmp_abs(aa, bb) >= 0) {
651 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
652 				return (err);
653 			}
654 			result->sign = -1;
655 		} else {
656 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
657 				return (err);
658 			}
659 			result->sign = 1;
660 		}
661 	}
662 	return (BIG_OK);
663 }
664 
665 
666 /* result = aa/2 */
667 BIG_ERR_CODE
668 big_half_pos(BIGNUM *result, BIGNUM *aa)
669 {
670 	BIG_ERR_CODE	err;
671 	int		i;
672 	BIG_CHUNK_TYPE	cy, cy1;
673 	BIG_CHUNK_TYPE	*a, *r;
674 
675 	if (result->size < aa->len) {
676 		err = big_extend(result, aa->len);
677 		if (err != BIG_OK) {
678 			return (err);
679 		}
680 	}
681 
682 	result->len = aa->len;
683 	a = aa->value;
684 	r = result->value;
685 	cy = 0;
686 	for (i = aa->len - 1; i >= 0; i--) {
687 		cy1 = a[i] << (BIG_CHUNK_SIZE - 1);
688 		r[i] = (cy | (a[i] >> 1));
689 		cy = cy1;
690 	}
691 	if (r[result->len - 1] == 0) {
692 		result->len--;
693 	}
694 
695 	return (BIG_OK);
696 }
697 
698 /* result  =  aa*2 */
699 BIG_ERR_CODE
700 big_double(BIGNUM *result, BIGNUM *aa)
701 {
702 	BIG_ERR_CODE	err;
703 	int		i, rsize;
704 	BIG_CHUNK_TYPE	cy, cy1;
705 	BIG_CHUNK_TYPE	*a, *r;
706 
707 	if ((aa->len > 0) &&
708 	    ((aa->value[aa->len - 1] & BIG_CHUNK_HIGHBIT) != 0)) {
709 		rsize = aa->len + 1;
710 	} else {
711 		rsize = aa->len;
712 	}
713 
714 	if (result->size < rsize) {
715 		err = big_extend(result, rsize);
716 		if (err != BIG_OK)
717 			return (err);
718 	}
719 
720 	a = aa->value;
721 	r = result->value;
722 	if (rsize == aa->len + 1) {
723 		r[rsize - 1] = 1;
724 	}
725 	cy = 0;
726 	for (i = 0; i < aa->len; i++) {
727 		cy1 = a[i] >> (BIG_CHUNK_SIZE - 1);
728 		r[i] = (cy | (a[i] << 1));
729 		cy = cy1;
730 	}
731 	result->len = rsize;
732 	return (BIG_OK);
733 }
734 
735 
736 /*
737  * returns aa mod b, aa must be nonneg, b must be a max
738  * (BIG_CHUNK_SIZE / 2)-bit integer
739  */
740 static uint32_t
741 big_modhalf_pos(BIGNUM *aa, uint32_t b)
742 {
743 	int		i;
744 	BIG_CHUNK_TYPE	rem;
745 
746 	if (aa->len == 0) {
747 		return (0);
748 	}
749 	rem = aa->value[aa->len - 1] % b;
750 	for (i = aa->len - 2; i >= 0; i--) {
751 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
752 		    (aa->value[i] >> (BIG_CHUNK_SIZE / 2))) % b;
753 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
754 		    (aa->value[i] & BIG_CHUNK_LOWHALFBITS)) % b;
755 	}
756 
757 	return ((uint32_t)rem);
758 }
759 
760 
761 /*
762  * result = aa - (2^BIG_CHUNK_SIZE)^lendiff * bb
763  * result->size should be at least aa->len at entry
764  * aa, bb, and result should be positive
765  */
766 void
767 big_sub_pos_high(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
768 {
769 	int i, lendiff;
770 	BIGNUM res1, aa1;
771 
772 	lendiff = aa->len - bb->len;
773 	res1.size = result->size - lendiff;
774 	res1.malloced = 0;
775 	res1.value = result->value + lendiff;
776 	aa1.size = aa->size - lendiff;
777 	aa1.value = aa->value + lendiff;
778 	aa1.len = bb->len;
779 	aa1.sign = 1;
780 	(void) big_sub_pos(&res1, &aa1, bb);
781 	if (result->value != aa->value) {
782 		for (i = 0; i < lendiff; i++) {
783 			result->value[i] = aa->value[i];
784 		}
785 	}
786 	result->len = aa->len;
787 }
788 
789 
790 /*
791  * returns 1, 0, or -1 depending on whether |aa| > , ==, or <
792  *					(2^BIG_CHUNK_SIZE)^lendiff * |bb|
793  * aa->len should be >= bb->len
794  */
795 int
796 big_cmp_abs_high(BIGNUM *aa, BIGNUM *bb)
797 {
798 	int lendiff;
799 	BIGNUM aa1;
800 
801 	lendiff = aa->len - bb->len;
802 	aa1.len = bb->len;
803 	aa1.size = aa->size - lendiff;
804 	aa1.malloced = 0;
805 	aa1.value = aa->value + lendiff;
806 	return (big_cmp_abs(&aa1, bb));
807 }
808 
809 
810 /*
811  * result = aa * b where b is a max. (BIG_CHUNK_SIZE / 2)-bit positive integer.
812  * result should have enough space allocated.
813  */
814 static void
815 big_mulhalf_low(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
816 {
817 	int		i;
818 	BIG_CHUNK_TYPE	t1, t2, ai, cy;
819 	BIG_CHUNK_TYPE	*a, *r;
820 
821 	a = aa->value;
822 	r = result->value;
823 	cy = 0;
824 	for (i = 0; i < aa->len; i++) {
825 		ai = a[i];
826 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
827 		t2 = (ai >> (BIG_CHUNK_SIZE / 2)) * b +
828 		    (t1 >> (BIG_CHUNK_SIZE / 2));
829 		r[i] = (t1 & BIG_CHUNK_LOWHALFBITS) |
830 		    (t2 << (BIG_CHUNK_SIZE / 2));
831 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
832 	}
833 	r[i] = cy;
834 	result->len = aa->len + 1;
835 	result->sign = aa->sign;
836 }
837 
838 
839 /*
840  * result = aa * b * 2^(BIG_CHUNK_SIZE / 2) where b is a max.
841  * (BIG_CHUNK_SIZE / 2)-bit positive integer.
842  * result should have enough space allocated.
843  */
844 static void
845 big_mulhalf_high(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
846 {
847 	int		i;
848 	BIG_CHUNK_TYPE	t1, t2, ai, cy, ri;
849 	BIG_CHUNK_TYPE	*a, *r;
850 
851 	a = aa->value;
852 	r = result->value;
853 	cy = 0;
854 	ri = 0;
855 	for (i = 0; i < aa->len; i++) {
856 		ai = a[i];
857 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
858 		t2 = (ai >>  (BIG_CHUNK_SIZE / 2)) * b +
859 		    (t1 >>  (BIG_CHUNK_SIZE / 2));
860 		r[i] = (t1 <<  (BIG_CHUNK_SIZE / 2)) + ri;
861 		ri = t2 & BIG_CHUNK_LOWHALFBITS;
862 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
863 	}
864 	r[i] = (cy <<  (BIG_CHUNK_SIZE / 2)) + ri;
865 	result->len = aa->len + 1;
866 	result->sign = aa->sign;
867 }
868 
869 
870 /* it is assumed that result->size is big enough */
871 void
872 big_shiftleft(BIGNUM *result, BIGNUM *aa, int offs)
873 {
874 	int		i;
875 	BIG_CHUNK_TYPE	cy, ai;
876 
877 	if (offs == 0) {
878 		if (result != aa) {
879 			(void) big_copy(result, aa);
880 		}
881 		return;
882 	}
883 	cy = 0;
884 	for (i = 0; i < aa->len; i++) {
885 		ai = aa->value[i];
886 		result->value[i] = (ai << offs) | cy;
887 		cy = ai >> (BIG_CHUNK_SIZE - offs);
888 	}
889 	if (cy != 0) {
890 		result->len = aa->len + 1;
891 		result->value[result->len - 1] = cy;
892 	} else {
893 		result->len = aa->len;
894 	}
895 	result->sign = aa->sign;
896 }
897 
898 
899 /* it is assumed that result->size is big enough */
900 void
901 big_shiftright(BIGNUM *result, BIGNUM *aa, int offs)
902 {
903 	int		 i;
904 	BIG_CHUNK_TYPE	cy, ai;
905 
906 	if (offs == 0) {
907 		if (result != aa) {
908 			(void) big_copy(result, aa);
909 		}
910 		return;
911 	}
912 	cy = aa->value[0] >> offs;
913 	for (i = 1; i < aa->len; i++) {
914 		ai = aa->value[i];
915 		result->value[i-1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
916 		cy = ai >> offs;
917 	}
918 	result->len = aa->len;
919 	result->value[result->len - 1] = cy;
920 	result->sign = aa->sign;
921 }
922 
923 
924 /*
925  * result = aa/bb   remainder = aa mod bb
926  * it is assumed that aa and bb are positive
927  */
928 BIG_ERR_CODE
929 big_div_pos_fast(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
930 {
931 	BIG_ERR_CODE	err = BIG_OK;
932 	int		i, alen, blen, tlen, rlen, offs;
933 	BIG_CHUNK_TYPE	higha, highb, coeff;
934 	BIG_CHUNK_TYPE	*a, *b;
935 	BIGNUM		bbhigh, bblow, tresult, tmp1, tmp2;
936 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
937 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
938 	BIG_CHUNK_TYPE	tresultvalue[BIGTMPSIZE];
939 	BIG_CHUNK_TYPE	bblowvalue[BIGTMPSIZE];
940 	BIG_CHUNK_TYPE	bbhighvalue[BIGTMPSIZE];
941 
942 	a = aa->value;
943 	b = bb->value;
944 	alen = aa->len;
945 	blen = bb->len;
946 	while ((alen > 1) && (a[alen - 1] == 0)) {
947 		alen = alen - 1;
948 	}
949 	aa->len = alen;
950 	while ((blen > 1) && (b[blen - 1] == 0)) {
951 		blen = blen - 1;
952 	}
953 	bb->len = blen;
954 	if ((blen == 1) && (b[0] == 0)) {
955 		return (BIG_DIV_BY_0);
956 	}
957 
958 	if (big_cmp_abs(aa, bb) < 0) {
959 		if ((remainder != NULL) &&
960 		    ((err = big_copy(remainder, aa)) != BIG_OK)) {
961 			return (err);
962 		}
963 		if (result != NULL) {
964 			result->len = 1;
965 			result->sign = 1;
966 			result->value[0] = 0;
967 		}
968 		return (BIG_OK);
969 	}
970 
971 	if ((err = big_init1(&bblow, blen + 1,
972 	    bblowvalue, arraysize(bblowvalue))) != BIG_OK)
973 		return (err);
974 
975 	if ((err = big_init1(&bbhigh, blen + 1,
976 	    bbhighvalue, arraysize(bbhighvalue))) != BIG_OK)
977 		goto ret1;
978 
979 	if ((err = big_init1(&tmp1, alen + 2,
980 	    tmp1value, arraysize(tmp1value))) != BIG_OK)
981 		goto ret2;
982 
983 	if ((err = big_init1(&tmp2, blen + 2,
984 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
985 		goto ret3;
986 
987 	if ((err = big_init1(&tresult, alen - blen + 2,
988 	    tresultvalue, arraysize(tresultvalue))) != BIG_OK)
989 		goto ret4;
990 
991 	offs = 0;
992 	highb = b[blen - 1];
993 	if (highb >= (BIG_CHUNK_HALF_HIGHBIT << 1)) {
994 		highb = highb >> (BIG_CHUNK_SIZE / 2);
995 		offs = (BIG_CHUNK_SIZE / 2);
996 	}
997 	while ((highb & BIG_CHUNK_HALF_HIGHBIT) == 0) {
998 		highb = highb << 1;
999 		offs++;
1000 	}
1001 
1002 	big_shiftleft(&bblow, bb, offs);
1003 
1004 	if (offs <= (BIG_CHUNK_SIZE / 2 - 1)) {
1005 		big_shiftleft(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1006 	} else {
1007 		big_shiftright(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1008 	}
1009 	if (bbhigh.value[bbhigh.len - 1] == 0) {
1010 		bbhigh.len--;
1011 	} else {
1012 		bbhigh.value[bbhigh.len] = 0;
1013 	}
1014 
1015 	highb = bblow.value[bblow.len - 1];
1016 
1017 	big_shiftleft(&tmp1, aa, offs);
1018 	rlen = tmp1.len - bblow.len + 1;
1019 	tresult.len = rlen;
1020 
1021 	tmp1.len++;
1022 	tlen = tmp1.len;
1023 	tmp1.value[tmp1.len - 1] = 0;
1024 	for (i = 0; i < rlen; i++) {
1025 		higha = (tmp1.value[tlen - 1] << (BIG_CHUNK_SIZE / 2)) +
1026 		    (tmp1.value[tlen - 2] >> (BIG_CHUNK_SIZE / 2));
1027 		coeff = higha / (highb + 1);
1028 		big_mulhalf_high(&tmp2, &bblow, coeff);
1029 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1030 		bbhigh.len++;
1031 		while (tmp1.value[tlen - 1] > 0) {
1032 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1033 			coeff++;
1034 		}
1035 		bbhigh.len--;
1036 		tlen--;
1037 		tmp1.len--;
1038 		while (big_cmp_abs_high(&tmp1, &bbhigh) >= 0) {
1039 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1040 			coeff++;
1041 		}
1042 		tresult.value[rlen - i - 1] = coeff << (BIG_CHUNK_SIZE / 2);
1043 		higha = tmp1.value[tlen - 1];
1044 		coeff = higha / (highb + 1);
1045 		big_mulhalf_low(&tmp2, &bblow, coeff);
1046 		tmp2.len--;
1047 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1048 		while (big_cmp_abs_high(&tmp1, &bblow) >= 0) {
1049 			big_sub_pos_high(&tmp1, &tmp1, &bblow);
1050 			coeff++;
1051 		}
1052 		tresult.value[rlen - i - 1] =
1053 		    tresult.value[rlen - i - 1] + coeff;
1054 	}
1055 
1056 	big_shiftright(&tmp1, &tmp1, offs);
1057 
1058 	err = BIG_OK;
1059 
1060 	if ((remainder != NULL) &&
1061 	    ((err = big_copy(remainder, &tmp1)) != BIG_OK))
1062 		goto ret;
1063 
1064 	if (result != NULL)
1065 		err = big_copy(result, &tresult);
1066 
1067 ret:
1068 	big_finish(&tresult);
1069 ret4:
1070 	big_finish(&tmp1);
1071 ret3:
1072 	big_finish(&tmp2);
1073 ret2:
1074 	big_finish(&bbhigh);
1075 ret1:
1076 	big_finish(&bblow);
1077 	return (err);
1078 }
1079 
1080 /*
1081  * If there is no processor-specific integer implementation of
1082  * the lower level multiply functions, then this code is provided
1083  * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
1084  * big_sqr_vec().
1085  *
1086  * There are two generic implementations.  One that assumes that
1087  * there is hardware and C compiler support for a 32 x 32 --> 64
1088  * bit unsigned multiply, but otherwise is not specific to any
1089  * processor, platform, or ISA.
1090  *
1091  * The other makes very few assumptions about hardware capabilities.
1092  * It does not even assume that there is any implementation of a
1093  * 32 x 32 --> 64 bit multiply that is accessible to C code and
1094  * appropriate to use.  It falls constructs 32 x 32 --> 64 bit
1095  * multiplies from 16 x 16 --> 32 bit multiplies.
1096  *
1097  */
1098 
1099 #if !defined(PSR_MUL)
1100 
1101 #ifdef UMUL64
1102 
1103 #if (BIG_CHUNK_SIZE == 32)
1104 
1105 #define	UNROLL8
1106 
1107 #define	MUL_SET_VEC_ROUND_PREFETCH(R) \
1108 	p = pf * d; \
1109 	pf = (uint64_t)a[R+1]; \
1110 	t = p + cy; \
1111 	r[R] = (uint32_t)t; \
1112 	cy = t >> 32
1113 
1114 #define	MUL_SET_VEC_ROUND_NOPREFETCH(R) \
1115 	p = pf * d; \
1116 	t = p + cy; \
1117 	r[R] = (uint32_t)t; \
1118 	cy = t >> 32
1119 
1120 #define	MUL_ADD_VEC_ROUND_PREFETCH(R) \
1121 	t = (uint64_t)r[R]; \
1122 	p = pf * d; \
1123 	pf = (uint64_t)a[R+1]; \
1124 	t = p + t + cy; \
1125 	r[R] = (uint32_t)t; \
1126 	cy = t >> 32
1127 
1128 #define	MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
1129 	t = (uint64_t)r[R]; \
1130 	p = pf * d; \
1131 	t = p + t + cy; \
1132 	r[R] = (uint32_t)t; \
1133 	cy = t >> 32
1134 
1135 #ifdef UNROLL8
1136 
1137 #define	UNROLL 8
1138 
1139 /*
1140  * r = a * b
1141  * where r and a are vectors; b is a single 32-bit digit
1142  */
1143 
1144 uint32_t
1145 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1146 {
1147 	uint64_t d, pf, p, t, cy;
1148 
1149 	if (len == 0)
1150 		return (0);
1151 	cy = 0;
1152 	d = (uint64_t)b;
1153 	pf = (uint64_t)a[0];
1154 	while (len > UNROLL) {
1155 		MUL_SET_VEC_ROUND_PREFETCH(0);
1156 		MUL_SET_VEC_ROUND_PREFETCH(1);
1157 		MUL_SET_VEC_ROUND_PREFETCH(2);
1158 		MUL_SET_VEC_ROUND_PREFETCH(3);
1159 		MUL_SET_VEC_ROUND_PREFETCH(4);
1160 		MUL_SET_VEC_ROUND_PREFETCH(5);
1161 		MUL_SET_VEC_ROUND_PREFETCH(6);
1162 		MUL_SET_VEC_ROUND_PREFETCH(7);
1163 		r += UNROLL;
1164 		a += UNROLL;
1165 		len -= UNROLL;
1166 	}
1167 	if (len == UNROLL) {
1168 		MUL_SET_VEC_ROUND_PREFETCH(0);
1169 		MUL_SET_VEC_ROUND_PREFETCH(1);
1170 		MUL_SET_VEC_ROUND_PREFETCH(2);
1171 		MUL_SET_VEC_ROUND_PREFETCH(3);
1172 		MUL_SET_VEC_ROUND_PREFETCH(4);
1173 		MUL_SET_VEC_ROUND_PREFETCH(5);
1174 		MUL_SET_VEC_ROUND_PREFETCH(6);
1175 		MUL_SET_VEC_ROUND_NOPREFETCH(7);
1176 		return ((uint32_t)cy);
1177 	}
1178 	while (len > 1) {
1179 		MUL_SET_VEC_ROUND_PREFETCH(0);
1180 		++r;
1181 		++a;
1182 		--len;
1183 	}
1184 	if (len > 0) {
1185 		MUL_SET_VEC_ROUND_NOPREFETCH(0);
1186 	}
1187 	return ((uint32_t)cy);
1188 }
1189 
1190 /*
1191  * r += a * b
1192  * where r and a are vectors; b is a single 32-bit digit
1193  */
1194 
1195 uint32_t
1196 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1197 {
1198 	uint64_t d, pf, p, t, cy;
1199 
1200 	if (len == 0)
1201 		return (0);
1202 	cy = 0;
1203 	d = (uint64_t)b;
1204 	pf = (uint64_t)a[0];
1205 	while (len > 8) {
1206 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1207 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1208 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1209 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1210 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1211 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1212 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1213 		MUL_ADD_VEC_ROUND_PREFETCH(7);
1214 		r += 8;
1215 		a += 8;
1216 		len -= 8;
1217 	}
1218 	if (len == 8) {
1219 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1220 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1221 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1222 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1223 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1224 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1225 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1226 		MUL_ADD_VEC_ROUND_NOPREFETCH(7);
1227 		return ((uint32_t)cy);
1228 	}
1229 	while (len > 1) {
1230 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1231 		++r;
1232 		++a;
1233 		--len;
1234 	}
1235 	if (len > 0) {
1236 		MUL_ADD_VEC_ROUND_NOPREFETCH(0);
1237 	}
1238 	return ((uint32_t)cy);
1239 }
1240 #endif /* UNROLL8 */
1241 
1242 void
1243 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1244 {
1245 	uint32_t *tr, *ta;
1246 	int tlen, row, col;
1247 	uint64_t p, s, t, t2, cy;
1248 	uint32_t d;
1249 
1250 	tr = r + 1;
1251 	ta = a;
1252 	tlen = len - 1;
1253 	tr[tlen] = big_mul_set_vec(tr, ta + 1, tlen, ta[0]);
1254 	while (--tlen > 0) {
1255 		tr += 2;
1256 		++ta;
1257 		tr[tlen] = big_mul_add_vec(tr, ta + 1, tlen, ta[0]);
1258 	}
1259 	s = (uint64_t)a[0];
1260 	s = s * s;
1261 	r[0] = (uint32_t)s;
1262 	cy = s >> 32;
1263 	p = ((uint64_t)r[1] << 1) + cy;
1264 	r[1] = (uint32_t)p;
1265 	cy = p >> 32;
1266 	row = 1;
1267 	col = 2;
1268 	while (row < len) {
1269 		s = (uint64_t)a[row];
1270 		s = s * s;
1271 		p = (uint64_t)r[col] << 1;
1272 		t = p + s;
1273 		d = (uint32_t)t;
1274 		t2 = (uint64_t)d + cy;
1275 		r[col] = (uint32_t)t2;
1276 		cy = (t >> 32) + (t2 >> 32);
1277 		if (row == len - 1)
1278 			break;
1279 		p = ((uint64_t)r[col+1] << 1) + cy;
1280 		r[col+1] = (uint32_t)p;
1281 		cy = p >> 32;
1282 		++row;
1283 		col += 2;
1284 	}
1285 	r[col+1] = (uint32_t)cy;
1286 }
1287 
1288 #else /* BIG_CHUNK_SIZE == 64 */
1289 
1290 /*
1291  * r = r + a * digit, r and a are vectors of length len
1292  * returns the carry digit
1293  */
1294 BIG_CHUNK_TYPE
1295 big_mul_add_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1296     BIG_CHUNK_TYPE digit)
1297 {
1298 	BIG_CHUNK_TYPE	cy, cy1, retcy, dlow, dhigh;
1299 	int		i;
1300 
1301 	cy1 = 0;
1302 	dlow = digit & BIG_CHUNK_LOWHALFBITS;
1303 	dhigh = digit >> (BIG_CHUNK_SIZE / 2);
1304 	for (i = 0; i < len; i++) {
1305 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1306 		    dlow * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1307 		    (r[i] & BIG_CHUNK_LOWHALFBITS);
1308 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1309 		    dlow * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1310 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1311 		r[i] = (cy & BIG_CHUNK_LOWHALFBITS) |
1312 		    (cy1 << (BIG_CHUNK_SIZE / 2));
1313 	}
1314 	retcy = cy1 >> (BIG_CHUNK_SIZE / 2);
1315 
1316 	cy1 = r[0] & BIG_CHUNK_LOWHALFBITS;
1317 	for (i = 0; i < len - 1; i++) {
1318 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1319 		    dhigh * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1320 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1321 		r[i] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1322 		    (cy << (BIG_CHUNK_SIZE / 2));
1323 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1324 		    dhigh * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1325 		    (r[i + 1] & BIG_CHUNK_LOWHALFBITS);
1326 	}
1327 	cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1328 	    dhigh * (a[len - 1] & BIG_CHUNK_LOWHALFBITS) +
1329 	    (r[len - 1] >> (BIG_CHUNK_SIZE / 2));
1330 	r[len - 1] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1331 	    (cy << (BIG_CHUNK_SIZE / 2));
1332 	retcy = (cy >> (BIG_CHUNK_SIZE / 2)) +
1333 	    dhigh * (a[len - 1] >> (BIG_CHUNK_SIZE / 2)) + retcy;
1334 
1335 	return (retcy);
1336 }
1337 
1338 
1339 /*
1340  * r = a * digit, r and a are vectors of length len
1341  * returns the carry digit
1342  */
1343 BIG_CHUNK_TYPE
1344 big_mul_set_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1345     BIG_CHUNK_TYPE digit)
1346 {
1347 	int	i;
1348 
1349 	ASSERT(r != a);
1350 	for (i = 0; i < len; i++) {
1351 		r[i] = 0;
1352 	}
1353 	return (big_mul_add_vec(r, a, len, digit));
1354 }
1355 
1356 void
1357 big_sqr_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len)
1358 {
1359 	int i;
1360 
1361 	ASSERT(r != a);
1362 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1363 	for (i = 1; i < len; ++i)
1364 		r[len + i] = big_mul_add_vec(r+i, a, len, a[i]);
1365 }
1366 
1367 #endif /* BIG_CHUNK_SIZE == 32/64 */
1368 
1369 #else /* ! UMUL64 */
1370 
1371 #if (BIG_CHUNK_SIZE != 32)
1372 #error Don't use 64-bit chunks without defining UMUL64
1373 #endif
1374 
1375 
1376 /*
1377  * r = r + a * digit, r and a are vectors of length len
1378  * returns the carry digit
1379  */
1380 uint32_t
1381 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1382 {
1383 	uint32_t cy, cy1, retcy, dlow, dhigh;
1384 	int i;
1385 
1386 	cy1 = 0;
1387 	dlow = digit & 0xffff;
1388 	dhigh = digit >> 16;
1389 	for (i = 0; i < len; i++) {
1390 		cy = (cy1 >> 16) + dlow * (a[i] & 0xffff) + (r[i] & 0xffff);
1391 		cy1 = (cy >> 16) + dlow * (a[i]>>16) + (r[i] >> 16);
1392 		r[i] = (cy & 0xffff) | (cy1 << 16);
1393 	}
1394 	retcy = cy1 >> 16;
1395 
1396 	cy1 = r[0] & 0xffff;
1397 	for (i = 0; i < len - 1; i++) {
1398 		cy = (cy1 >> 16) + dhigh * (a[i] & 0xffff) + (r[i] >> 16);
1399 		r[i] = (cy1 & 0xffff) | (cy << 16);
1400 		cy1 = (cy >> 16) + dhigh * (a[i] >> 16) + (r[i + 1] & 0xffff);
1401 	}
1402 	cy = (cy1 >> 16) + dhigh * (a[len - 1] & 0xffff) + (r[len - 1] >> 16);
1403 	r[len - 1] = (cy1 & 0xffff) | (cy << 16);
1404 	retcy = (cy >> 16) + dhigh * (a[len - 1] >> 16) + retcy;
1405 
1406 	return (retcy);
1407 }
1408 
1409 
1410 /*
1411  * r = a * digit, r and a are vectors of length len
1412  * returns the carry digit
1413  */
1414 uint32_t
1415 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1416 {
1417 	int	i;
1418 
1419 	ASSERT(r != a);
1420 	for (i = 0; i < len; i++) {
1421 		r[i] = 0;
1422 	}
1423 
1424 	return (big_mul_add_vec(r, a, len, digit));
1425 }
1426 
1427 void
1428 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1429 {
1430 	int i;
1431 
1432 	ASSERT(r != a);
1433 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1434 	for (i = 1; i < len; ++i)
1435 		r[len + i] = big_mul_add_vec(r+i, a, len, a[i]);
1436 }
1437 
1438 #endif /* UMUL64 */
1439 
1440 void
1441 big_mul_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int alen,
1442     BIG_CHUNK_TYPE *b, int blen)
1443 {
1444 	int i;
1445 
1446 	r[alen] = big_mul_set_vec(r, a, alen, b[0]);
1447 	for (i = 1; i < blen; ++i)
1448 		r[alen + i] = big_mul_add_vec(r+i, a, alen, b[i]);
1449 }
1450 
1451 
1452 #endif /* ! PSR_MUL */
1453 
1454 
1455 /*
1456  * result = aa * bb  result->value should be big enough to hold the result
1457  *
1458  * Implementation: Standard grammar school algorithm
1459  *
1460  */
1461 BIG_ERR_CODE
1462 big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
1463 {
1464 	BIGNUM		tmp1;
1465 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1466 	BIG_CHUNK_TYPE	*r, *t, *a, *b;
1467 	BIG_ERR_CODE	err;
1468 	int		i, alen, blen, rsize, sign, diff;
1469 
1470 	if (aa == bb) {
1471 		diff = 0;
1472 	} else {
1473 		diff = big_cmp_abs(aa, bb);
1474 		if (diff < 0) {
1475 			BIGNUM *tt;
1476 			tt = aa;
1477 			aa = bb;
1478 			bb = tt;
1479 		}
1480 	}
1481 	a = aa->value;
1482 	b = bb->value;
1483 	alen = aa->len;
1484 	blen = bb->len;
1485 	while ((alen > 1) && (a[alen - 1] == 0)) {
1486 		alen--;
1487 	}
1488 	aa->len = alen;
1489 	while ((blen > 1) && (b[blen - 1] == 0)) {
1490 		blen--;
1491 	}
1492 	bb->len = blen;
1493 
1494 	rsize = alen + blen;
1495 	if (result->size < rsize) {
1496 		err = big_extend(result, rsize);
1497 		if (err != BIG_OK) {
1498 			return (err);
1499 		}
1500 		/* aa or bb might be an alias to result */
1501 		a = aa->value;
1502 		b = bb->value;
1503 	}
1504 	r = result->value;
1505 
1506 	if (((alen == 1) && (a[0] == 0)) || ((blen == 1) && (b[0] == 0))) {
1507 		result->len = 1;
1508 		result->sign = 1;
1509 		r[0] = 0;
1510 		return (BIG_OK);
1511 	}
1512 	sign = aa->sign * bb->sign;
1513 	if ((alen == 1) && (a[0] == 1)) {
1514 		for (i = 0; i < blen; i++) {
1515 			r[i] = b[i];
1516 		}
1517 		result->len = blen;
1518 		result->sign = sign;
1519 		return (BIG_OK);
1520 	}
1521 	if ((blen == 1) && (b[0] == 1)) {
1522 		for (i = 0; i < alen; i++) {
1523 			r[i] = a[i];
1524 		}
1525 		result->len = alen;
1526 		result->sign = sign;
1527 		return (BIG_OK);
1528 	}
1529 
1530 	if ((err = big_init1(&tmp1, rsize,
1531 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1532 		return (err);
1533 	}
1534 	(void) big_copy(&tmp1, aa);
1535 	t = tmp1.value;
1536 
1537 	for (i = 0; i < rsize; i++) {
1538 		t[i] = 0;
1539 	}
1540 
1541 	if (diff == 0 && alen > 2) {
1542 		BIG_SQR_VEC(t, a, alen);
1543 	} else if (blen > 0) {
1544 		BIG_MUL_VEC(t, a, alen, b, blen);
1545 	}
1546 
1547 	if (t[rsize - 1] == 0) {
1548 		tmp1.len = rsize - 1;
1549 	} else {
1550 		tmp1.len = rsize;
1551 	}
1552 	if ((err = big_copy(result, &tmp1)) != BIG_OK) {
1553 		return (err);
1554 	}
1555 	result->sign = sign;
1556 
1557 	big_finish(&tmp1);
1558 
1559 	return (BIG_OK);
1560 }
1561 
1562 
1563 /*
1564  * caller must ensure that  a < n,  b < n  and  ret->size >=  2 * n->len + 1
1565  * and that ret is not n
1566  */
1567 BIG_ERR_CODE
1568 big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
1569 {
1570 	int	i, j, nlen, needsubtract;
1571 	BIG_CHUNK_TYPE	*nn, *rr;
1572 	BIG_CHUNK_TYPE	digit, c;
1573 	BIG_ERR_CODE	err;
1574 
1575 	nlen = n->len;
1576 	nn = n->value;
1577 
1578 	rr = ret->value;
1579 
1580 	if ((err = big_mul(ret, a, b)) != BIG_OK) {
1581 		return (err);
1582 	}
1583 
1584 	rr = ret->value;
1585 	for (i = ret->len; i < 2 * nlen + 1; i++) {
1586 		rr[i] = 0;
1587 	}
1588 	for (i = 0; i < nlen; i++) {
1589 		digit = rr[i];
1590 		digit = digit * n0;
1591 
1592 		c = BIG_MUL_ADD_VEC(rr + i, nn, nlen, digit);
1593 		j = i + nlen;
1594 		rr[j] += c;
1595 		while (rr[j] < c) {
1596 			rr[j + 1] += 1;
1597 			j++;
1598 			c = 1;
1599 		}
1600 	}
1601 
1602 	needsubtract = 0;
1603 	if ((rr[2 * nlen]  != 0))
1604 		needsubtract = 1;
1605 	else {
1606 		for (i = 2 * nlen - 1; i >= nlen; i--) {
1607 			if (rr[i] > nn[i - nlen]) {
1608 				needsubtract = 1;
1609 				break;
1610 			} else if (rr[i] < nn[i - nlen]) {
1611 				break;
1612 			}
1613 		}
1614 	}
1615 	if (needsubtract)
1616 		big_sub_vec(rr, rr + nlen, nn, nlen);
1617 	else {
1618 		for (i = 0; i < nlen; i++) {
1619 			rr[i] = rr[i + nlen];
1620 		}
1621 	}
1622 	for (i = nlen - 1; (i >= 0) && (rr[i] == 0); i--)
1623 		;
1624 	ret->len = i+1;
1625 
1626 	return (BIG_OK);
1627 }
1628 
1629 
1630 BIG_CHUNK_TYPE
1631 big_n0(BIG_CHUNK_TYPE n)
1632 {
1633 	int		i;
1634 	BIG_CHUNK_TYPE	result, tmp;
1635 
1636 	result = 0;
1637 	tmp = BIG_CHUNK_ALLBITS;
1638 	for (i = 0; i < BIG_CHUNK_SIZE; i++) {
1639 		if ((tmp & 1) == 1) {
1640 			result = (result >> 1) | BIG_CHUNK_HIGHBIT;
1641 			tmp = tmp - n;
1642 		} else {
1643 			result = (result >> 1);
1644 		}
1645 		tmp = tmp >> 1;
1646 	}
1647 
1648 	return (result);
1649 }
1650 
1651 
1652 int
1653 big_numbits(BIGNUM *n)
1654 {
1655 	int		i, j;
1656 	BIG_CHUNK_TYPE	t;
1657 
1658 	for (i = n->len - 1; i > 0; i--) {
1659 		if (n->value[i] != 0) {
1660 			break;
1661 		}
1662 	}
1663 	t = n->value[i];
1664 	for (j = BIG_CHUNK_SIZE; j > 0; j--) {
1665 		if ((t & BIG_CHUNK_HIGHBIT) == 0) {
1666 			t = t << 1;
1667 		} else {
1668 			return (BIG_CHUNK_SIZE * i + j);
1669 		}
1670 	}
1671 	return (0);
1672 }
1673 
1674 
1675 /* caller must make sure that a < n */
1676 BIG_ERR_CODE
1677 big_mont_rr(BIGNUM *result, BIGNUM *n)
1678 {
1679 	BIGNUM		rr;
1680 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1681 	int		len, i;
1682 	BIG_ERR_CODE	err;
1683 
1684 	rr.malloced = 0;
1685 	len = n->len;
1686 
1687 	if ((err = big_init1(&rr, 2 * len + 1,
1688 	    rrvalue, arraysize(rrvalue))) != BIG_OK) {
1689 		return (err);
1690 	}
1691 
1692 	for (i = 0; i < 2 * len; i++) {
1693 		rr.value[i] = 0;
1694 	}
1695 	rr.value[2 * len] = 1;
1696 	rr.len = 2 * len + 1;
1697 	if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1698 		goto ret;
1699 	}
1700 	err = big_copy(result, &rr);
1701 ret:
1702 	big_finish(&rr);
1703 	return (err);
1704 }
1705 
1706 
1707 /* caller must make sure that a < n */
1708 BIG_ERR_CODE
1709 big_mont_conv(BIGNUM *result, BIGNUM *a, BIGNUM *n, BIG_CHUNK_TYPE n0,
1710     BIGNUM *n_rr)
1711 {
1712 	BIGNUM		rr;
1713 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1714 	int		len, i;
1715 	BIG_ERR_CODE	err;
1716 
1717 	rr.malloced = 0;
1718 	len = n->len;
1719 
1720 	if ((err = big_init1(&rr, 2 * len + 1, rrvalue, arraysize(rrvalue)))
1721 	    != BIG_OK) {
1722 		return (err);
1723 	}
1724 
1725 	if (n_rr == NULL) {
1726 		for (i = 0; i < 2 * len; i++) {
1727 			rr.value[i] = 0;
1728 		}
1729 		rr.value[2 * len] = 1;
1730 		rr.len = 2 * len + 1;
1731 		if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1732 			goto ret;
1733 		}
1734 		n_rr = &rr;
1735 	}
1736 
1737 	if ((err = big_mont_mul(&rr, n_rr, a, n, n0)) != BIG_OK) {
1738 		goto ret;
1739 	}
1740 	err = big_copy(result, &rr);
1741 
1742 ret:
1743 	big_finish(&rr);
1744 	return (err);
1745 }
1746 
1747 
1748 #ifdef	USE_FLOATING_POINT
1749 #define	big_modexp_ncp_float	big_modexp_ncp_sw
1750 #else
1751 #define	big_modexp_ncp_int	big_modexp_ncp_sw
1752 #endif
1753 
1754 #define	MAX_EXP_BIT_GROUP_SIZE 6
1755 #define	APOWERS_MAX_SIZE (1 << (MAX_EXP_BIT_GROUP_SIZE - 1))
1756 
1757 /* ARGSUSED */
1758 static BIG_ERR_CODE
1759 big_modexp_ncp_int(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
1760     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
1761 
1762 {
1763 	BIGNUM		apowers[APOWERS_MAX_SIZE];
1764 	BIGNUM		tmp1;
1765 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1766 	int		i, j, k, l, m, p;
1767 	int		bit, bitind, bitcount, groupbits, apowerssize;
1768 	int		nbits;
1769 	BIG_ERR_CODE	err;
1770 
1771 	nbits = big_numbits(e);
1772 	if (nbits < 50) {
1773 		groupbits = 1;
1774 		apowerssize = 1;
1775 	} else {
1776 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
1777 		apowerssize = 1 << (groupbits - 1);
1778 	}
1779 
1780 
1781 	if ((err = big_init1(&tmp1, 2 * n->len + 1,
1782 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1783 		return (err);
1784 	}
1785 
1786 	/* set the malloced bit to help cleanup */
1787 	for (i = 0; i < apowerssize; i++) {
1788 		apowers[i].malloced = 0;
1789 	}
1790 	for (i = 0; i < apowerssize; i++) {
1791 		if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
1792 		    BIG_OK) {
1793 			goto ret;
1794 		}
1795 	}
1796 
1797 	(void) big_copy(&(apowers[0]), ma);
1798 
1799 	if ((err = big_mont_mul(&tmp1, ma, ma, n, n0)) != BIG_OK) {
1800 		goto ret;
1801 	}
1802 	(void) big_copy(ma, &tmp1);
1803 
1804 	for (i = 1; i < apowerssize; i++) {
1805 		if ((err = big_mont_mul(&tmp1, ma,
1806 		    &(apowers[i-1]), n, n0)) != BIG_OK) {
1807 			goto ret;
1808 		}
1809 		(void) big_copy(&apowers[i], &tmp1);
1810 	}
1811 
1812 	bitind = nbits % BIG_CHUNK_SIZE;
1813 	k = 0;
1814 	l = 0;
1815 	p = 0;
1816 	bitcount = 0;
1817 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
1818 		for (j = bitind - 1; j >= 0; j--) {
1819 			bit = (e->value[i] >> j) & 1;
1820 			if ((bitcount == 0) && (bit == 0)) {
1821 				if ((err = big_mont_mul(tmp,
1822 				    tmp, tmp, n, n0)) != BIG_OK) {
1823 					goto ret;
1824 				}
1825 			} else {
1826 				bitcount++;
1827 				p = p * 2 + bit;
1828 				if (bit == 1) {
1829 					k = k + l + 1;
1830 					l = 0;
1831 				} else {
1832 					l++;
1833 				}
1834 				if (bitcount == groupbits) {
1835 					for (m = 0; m < k; m++) {
1836 						if ((err = big_mont_mul(tmp,
1837 						    tmp, tmp, n, n0)) !=
1838 						    BIG_OK) {
1839 							goto ret;
1840 						}
1841 					}
1842 					if ((err = big_mont_mul(tmp, tmp,
1843 					    &(apowers[p >> (l + 1)]),
1844 					    n, n0)) != BIG_OK) {
1845 						goto ret;
1846 					}
1847 					for (m = 0; m < l; m++) {
1848 						if ((err = big_mont_mul(tmp,
1849 						    tmp, tmp, n, n0)) !=
1850 						    BIG_OK) {
1851 							goto ret;
1852 						}
1853 					}
1854 					k = 0;
1855 					l = 0;
1856 					p = 0;
1857 					bitcount = 0;
1858 				}
1859 			}
1860 		}
1861 		bitind = BIG_CHUNK_SIZE;
1862 	}
1863 
1864 	for (m = 0; m < k; m++) {
1865 		if ((err = big_mont_mul(tmp, tmp, tmp, n, n0)) != BIG_OK) {
1866 			goto ret;
1867 		}
1868 	}
1869 	if (p != 0) {
1870 		if ((err = big_mont_mul(tmp, tmp,
1871 		    &(apowers[p >> (l + 1)]), n, n0)) != BIG_OK) {
1872 			goto ret;
1873 		}
1874 	}
1875 	for (m = 0; m < l; m++) {
1876 		if ((err = big_mont_mul(result, tmp, tmp, n, n0)) != BIG_OK) {
1877 			goto ret;
1878 		}
1879 	}
1880 
1881 ret:
1882 	for (i = apowerssize - 1; i >= 0; i--) {
1883 		big_finish(&(apowers[i]));
1884 	}
1885 	big_finish(&tmp1);
1886 
1887 	return (err);
1888 }
1889 
1890 
1891 #ifdef USE_FLOATING_POINT
1892 
1893 #ifdef _KERNEL
1894 
1895 #include <sys/sysmacros.h>
1896 #include <sys/regset.h>
1897 #include <sys/fpu/fpusystm.h>
1898 
1899 /* the alignment for block stores to save fp registers */
1900 #define	FPR_ALIGN	(64)
1901 
1902 extern void big_savefp(kfpu_t *);
1903 extern void big_restorefp(kfpu_t *);
1904 
1905 #endif /* _KERNEL */
1906 
1907 /*
1908  * This version makes use of floating point for performance
1909  */
1910 static BIG_ERR_CODE
1911 big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
1912     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
1913 {
1914 
1915 	int		i, j, k, l, m, p, bit, bitind, bitcount, nlen;
1916 	double		dn0;
1917 	double		*dn, *dt, *d16r, *d32r;
1918 	uint32_t	*nint, *prod;
1919 	double		*apowers[APOWERS_MAX_SIZE];
1920 	int		nbits, groupbits, apowerssize;
1921 	BIG_ERR_CODE	err = BIG_OK;
1922 
1923 #ifdef _KERNEL
1924 	uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
1925 	kfpu_t *fpu;
1926 
1927 #ifdef DEBUG
1928 	if (!fpu_exists)
1929 		return (BIG_GENERAL_ERR);
1930 #endif
1931 
1932 	fpu =  (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, FPR_ALIGN);
1933 	big_savefp(fpu);
1934 
1935 #endif /* _KERNEL */
1936 
1937 	nbits = big_numbits(e);
1938 	if (nbits < 50) {
1939 		groupbits = 1;
1940 		apowerssize = 1;
1941 	} else {
1942 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
1943 		apowerssize = 1 << (groupbits - 1);
1944 	}
1945 
1946 	nlen = (BIG_CHUNK_SIZE / 32) * n->len;
1947 	dn0 = (double)(n0 & 0xffff);
1948 
1949 	dn = dt = d16r = d32r = NULL;
1950 	nint = prod = NULL;
1951 	for (i = 0; i < apowerssize; i++) {
1952 		apowers[i] = NULL;
1953 	}
1954 
1955 	if ((dn = big_malloc(nlen * sizeof (double))) == NULL) {
1956 		err = BIG_NO_MEM;
1957 		goto ret;
1958 	}
1959 	if ((dt = big_malloc((4 * nlen + 2) * sizeof (double))) == NULL) {
1960 		err = BIG_NO_MEM;
1961 		goto ret;
1962 	}
1963 	if ((nint = big_malloc(nlen * sizeof (uint32_t))) == NULL) {
1964 		err = BIG_NO_MEM;
1965 		goto ret;
1966 	}
1967 	if ((prod = big_malloc((nlen + 1) * sizeof (uint32_t))) == NULL) {
1968 		err = BIG_NO_MEM;
1969 		goto ret;
1970 	}
1971 	if ((d16r = big_malloc((2 * nlen + 1) * sizeof (double))) == NULL) {
1972 		err = BIG_NO_MEM;
1973 		goto ret;
1974 	}
1975 	if ((d32r = big_malloc(nlen * sizeof (double))) == NULL) {
1976 		err = BIG_NO_MEM;
1977 		goto ret;
1978 	}
1979 	for (i = 0; i < apowerssize; i++) {
1980 		if ((apowers[i] = big_malloc((2 * nlen + 1) *
1981 		    sizeof (double))) == NULL) {
1982 			err = BIG_NO_MEM;
1983 			goto ret;
1984 		}
1985 	}
1986 
1987 #if (BIG_CHUNK_SIZE == 32)
1988 	for (i = 0; i < ma->len; i++) {
1989 		nint[i] = ma->value[i];
1990 	}
1991 	for (; i < nlen; i++) {
1992 		nint[i] = 0;
1993 	}
1994 #else
1995 	for (i = 0; i < ma->len; i++) {
1996 		nint[2 * i] = (uint32_t)(ma->value[i] & 0xffffffffULL);
1997 		nint[2 * i + 1] = (uint32_t)(ma->value[i] >> 32);
1998 	}
1999 	for (i = ma->len * 2; i < nlen; i++) {
2000 		nint[i] = 0;
2001 	}
2002 #endif
2003 	conv_i32_to_d32_and_d16(d32r, apowers[0], nint, nlen);
2004 
2005 #if (BIG_CHUNK_SIZE == 32)
2006 	for (i = 0; i < n->len; i++) {
2007 		nint[i] = n->value[i];
2008 	}
2009 	for (; i < nlen; i++) {
2010 		nint[i] = 0;
2011 	}
2012 #else
2013 	for (i = 0; i < n->len; i++) {
2014 		nint[2 * i] = (uint32_t)(n->value[i] & 0xffffffffULL);
2015 		nint[2 * i + 1] = (uint32_t)(n->value[i] >> 32);
2016 	}
2017 	for (i = n->len * 2; i < nlen; i++) {
2018 		nint[i] = 0;
2019 	}
2020 #endif
2021 	conv_i32_to_d32(dn, nint, nlen);
2022 
2023 	mont_mulf_noconv(prod, d32r, apowers[0], dt, dn, nint, nlen, dn0);
2024 	conv_i32_to_d32(d32r, prod, nlen);
2025 	for (i = 1; i < apowerssize; i++) {
2026 		mont_mulf_noconv(prod, d32r, apowers[i - 1],
2027 		    dt, dn, nint, nlen, dn0);
2028 		conv_i32_to_d16(apowers[i], prod, nlen);
2029 	}
2030 
2031 #if (BIG_CHUNK_SIZE == 32)
2032 	for (i = 0; i < tmp->len; i++) {
2033 		prod[i] = tmp->value[i];
2034 	}
2035 	for (; i < nlen + 1; i++) {
2036 		prod[i] = 0;
2037 	}
2038 #else
2039 	for (i = 0; i < tmp->len; i++) {
2040 		prod[2 * i] = (uint32_t)(tmp->value[i] & 0xffffffffULL);
2041 		prod[2 * i + 1] = (uint32_t)(tmp->value[i] >> 32);
2042 	}
2043 	for (i = tmp->len * 2; i < nlen + 1; i++) {
2044 		prod[i] = 0;
2045 	}
2046 #endif
2047 
2048 	bitind = nbits % BIG_CHUNK_SIZE;
2049 	k = 0;
2050 	l = 0;
2051 	p = 0;
2052 	bitcount = 0;
2053 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
2054 		for (j = bitind - 1; j >= 0; j--) {
2055 			bit = (e->value[i] >> j) & 1;
2056 			if ((bitcount == 0) && (bit == 0)) {
2057 				conv_i32_to_d32_and_d16(d32r, d16r,
2058 				    prod, nlen);
2059 				mont_mulf_noconv(prod, d32r, d16r,
2060 				    dt, dn, nint, nlen, dn0);
2061 			} else {
2062 				bitcount++;
2063 				p = p * 2 + bit;
2064 				if (bit == 1) {
2065 					k = k + l + 1;
2066 					l = 0;
2067 				} else {
2068 					l++;
2069 				}
2070 				if (bitcount == groupbits) {
2071 					for (m = 0; m < k; m++) {
2072 						conv_i32_to_d32_and_d16(d32r,
2073 						    d16r, prod, nlen);
2074 						mont_mulf_noconv(prod, d32r,
2075 						    d16r, dt, dn, nint,
2076 						    nlen, dn0);
2077 					}
2078 					conv_i32_to_d32(d32r, prod, nlen);
2079 					mont_mulf_noconv(prod, d32r,
2080 					    apowers[p >> (l+1)],
2081 					    dt, dn, nint, nlen, dn0);
2082 					for (m = 0; m < l; m++) {
2083 						conv_i32_to_d32_and_d16(d32r,
2084 						    d16r, prod, nlen);
2085 						mont_mulf_noconv(prod, d32r,
2086 						    d16r, dt, dn, nint,
2087 						    nlen, dn0);
2088 					}
2089 					k = 0;
2090 					l = 0;
2091 					p = 0;
2092 					bitcount = 0;
2093 				}
2094 			}
2095 		}
2096 		bitind = BIG_CHUNK_SIZE;
2097 	}
2098 
2099 	for (m = 0; m < k; m++) {
2100 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2101 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2102 	}
2103 	if (p != 0) {
2104 		conv_i32_to_d32(d32r, prod, nlen);
2105 		mont_mulf_noconv(prod, d32r, apowers[p >> (l + 1)],
2106 		    dt, dn, nint, nlen, dn0);
2107 	}
2108 	for (m = 0; m < l; m++) {
2109 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2110 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2111 	}
2112 
2113 #if (BIG_CHUNK_SIZE == 32)
2114 	for (i = 0; i < nlen; i++) {
2115 		result->value[i] = prod[i];
2116 	}
2117 	for (i = nlen - 1; (i > 0) && (prod[i] == 0); i--)
2118 		;
2119 #else
2120 	for (i = 0; i < nlen / 2; i++) {
2121 		result->value[i] = (uint64_t)(prod[2 * i]) +
2122 		    (((uint64_t)(prod[2 * i + 1])) << 32);
2123 	}
2124 	for (i = nlen / 2 - 1; (i > 0) && (result->value[i] == 0); i--)
2125 		;
2126 #endif
2127 	result->len = i + 1;
2128 
2129 ret:
2130 	for (i = apowerssize - 1; i >= 0; i--) {
2131 		if (apowers[i] != NULL)
2132 			big_free(apowers[i], (2 * nlen + 1) * sizeof (double));
2133 	}
2134 	if (d32r != NULL) {
2135 		big_free(d32r, nlen * sizeof (double));
2136 	}
2137 	if (d16r != NULL) {
2138 		big_free(d16r, (2 * nlen + 1) * sizeof (double));
2139 	}
2140 	if (prod != NULL) {
2141 		big_free(prod, (nlen + 1) * sizeof (uint32_t));
2142 	}
2143 	if (nint != NULL) {
2144 		big_free(nint, nlen * sizeof (uint32_t));
2145 	}
2146 	if (dt != NULL) {
2147 		big_free(dt, (4 * nlen + 2) * sizeof (double));
2148 	}
2149 	if (dn != NULL) {
2150 		big_free(dn, nlen * sizeof (double));
2151 	}
2152 
2153 #ifdef _KERNEL
2154 	big_restorefp(fpu);
2155 #endif
2156 
2157 	return (err);
2158 }
2159 
2160 #endif /* USE_FLOATING_POINT */
2161 
2162 
2163 BIG_ERR_CODE
2164 big_modexp_ext(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr,
2165     big_modexp_ncp_info_t *info)
2166 {
2167 	BIGNUM		ma, tmp, rr;
2168 	BIG_CHUNK_TYPE	mavalue[BIGTMPSIZE];
2169 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2170 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
2171 	BIG_ERR_CODE	err;
2172 	BIG_CHUNK_TYPE	n0;
2173 
2174 	if ((err = big_init1(&ma, n->len, mavalue, arraysize(mavalue)))	!=
2175 	    BIG_OK) {
2176 		return (err);
2177 	}
2178 	ma.len = 1;
2179 	ma.value[0] = 0;
2180 
2181 	if ((err = big_init1(&tmp, 2 * n->len + 1,
2182 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2183 		goto ret1;
2184 	}
2185 
2186 	/* set the malloced bit to help cleanup */
2187 	rr.malloced = 0;
2188 	if (n_rr == NULL) {
2189 		if ((err = big_init1(&rr, 2 * n->len + 1,
2190 		    rrvalue, arraysize(rrvalue))) != BIG_OK) {
2191 			goto ret2;
2192 		}
2193 		if (big_mont_rr(&rr, n) != BIG_OK) {
2194 			goto ret;
2195 		}
2196 		n_rr = &rr;
2197 	}
2198 
2199 	n0 = big_n0(n->value[0]);
2200 
2201 	if (big_cmp_abs(a, n) > 0) {
2202 		if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) {
2203 			goto ret;
2204 		}
2205 		err = big_mont_conv(&ma, &ma, n, n0, n_rr);
2206 	} else {
2207 		err = big_mont_conv(&ma, a, n, n0, n_rr);
2208 	}
2209 	if (err != BIG_OK) {
2210 		goto ret;
2211 	}
2212 
2213 	tmp.len = 1;
2214 	tmp.value[0] = 1;
2215 	if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) {
2216 		goto ret;
2217 	}
2218 
2219 	if ((info != NULL) && (info->func != NULL)) {
2220 		err = (*(info->func))(&tmp, &ma, e, n, &tmp, n0,
2221 		    info->ncp, info->reqp);
2222 	} else {
2223 		err = big_modexp_ncp_sw(&tmp, &ma, e, n, &tmp, n0);
2224 	}
2225 	if (err != BIG_OK) {
2226 		goto ret;
2227 	}
2228 
2229 	ma.value[0] = 1;
2230 	ma.len = 1;
2231 	if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) {
2232 		goto ret;
2233 	}
2234 	err = big_copy(result, &tmp);
2235 
2236 ret:
2237 	if (rr.malloced) {
2238 		big_finish(&rr);
2239 	}
2240 ret2:
2241 	big_finish(&tmp);
2242 ret1:
2243 	big_finish(&ma);
2244 
2245 	return (err);
2246 }
2247 
2248 BIG_ERR_CODE
2249 big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr)
2250 {
2251 	return (big_modexp_ext(result, a, e, n, n_rr, NULL));
2252 }
2253 
2254 
2255 BIG_ERR_CODE
2256 big_modexp_crt_ext(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2257     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2258     BIGNUM *p_rr, BIGNUM *q_rr, big_modexp_ncp_info_t *info)
2259 {
2260 	BIGNUM		ap, aq, tmp;
2261 	int		alen, biglen, sign;
2262 	BIG_ERR_CODE	err;
2263 
2264 	if (p->len > q->len) {
2265 		biglen = p->len;
2266 	} else {
2267 		biglen = q->len;
2268 	}
2269 
2270 	if ((err = big_init1(&ap, p->len, NULL, 0)) != BIG_OK) {
2271 		return (err);
2272 	}
2273 	if ((err = big_init1(&aq, q->len, NULL, 0)) != BIG_OK) {
2274 		goto ret1;
2275 	}
2276 	if ((err = big_init1(&tmp, biglen + q->len + 1, NULL, 0)) != BIG_OK) {
2277 		goto ret2;
2278 	}
2279 
2280 	/*
2281 	 * check whether a is too short - to avoid timing attacks
2282 	 */
2283 	alen = a->len;
2284 	while ((alen > p->len) && (a->value[alen - 1] == 0)) {
2285 		alen--;
2286 	}
2287 	if (alen < p->len + q->len) {
2288 		/*
2289 		 * a is too short, add p*q to it before
2290 		 * taking it modulo p and q
2291 		 * this will also affect timing, but this difference
2292 		 * does not depend on p or q, only on a
2293 		 * (in "normal" operation, this path will never be
2294 		 * taken, so it is not a performance penalty
2295 		 */
2296 		if ((err = big_mul(&tmp, p, q)) != BIG_OK) {
2297 			goto ret;
2298 		}
2299 		if ((err = big_add(&tmp, &tmp, a)) != BIG_OK) {
2300 			goto ret;
2301 		}
2302 		if ((err = big_div_pos(NULL, &ap, &tmp, p)) != BIG_OK) {
2303 			goto ret;
2304 		}
2305 		if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2306 			goto ret;
2307 		}
2308 	} else {
2309 		if ((err = big_div_pos(NULL, &ap, a, p)) != BIG_OK) {
2310 			goto ret;
2311 		}
2312 		if ((err = big_div_pos(NULL, &aq, a, q)) != BIG_OK) {
2313 			goto ret;
2314 		}
2315 	}
2316 
2317 	if ((err = big_modexp_ext(&ap, &ap, dmodpminus1, p, p_rr, info)) !=
2318 	    BIG_OK) {
2319 		goto ret;
2320 	}
2321 	if ((err = big_modexp_ext(&aq, &aq, dmodqminus1, q, q_rr, info)) !=
2322 	    BIG_OK) {
2323 		goto ret;
2324 	}
2325 	if ((err = big_sub(&tmp, &aq, &ap)) != BIG_OK) {
2326 		goto ret;
2327 	}
2328 	if ((err = big_mul(&tmp, &tmp, pinvmodq)) != BIG_OK) {
2329 		goto ret;
2330 	}
2331 	sign = tmp.sign;
2332 	tmp.sign = 1;
2333 	if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2334 		goto ret;
2335 	}
2336 	if ((sign == -1) && (!big_is_zero(&aq))) {
2337 		(void) big_sub_pos(&aq, q, &aq);
2338 	}
2339 	if ((err = big_mul(&tmp, &aq, p)) != BIG_OK) {
2340 		goto ret;
2341 	}
2342 	err = big_add_abs(result, &ap, &tmp);
2343 
2344 ret:
2345 	big_finish(&tmp);
2346 ret2:
2347 	big_finish(&aq);
2348 ret1:
2349 	big_finish(&ap);
2350 
2351 	return (err);
2352 }
2353 
2354 
2355 BIG_ERR_CODE
2356 big_modexp_crt(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2357     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2358     BIGNUM *p_rr, BIGNUM *q_rr)
2359 {
2360 	return (big_modexp_crt_ext(result, a, dmodpminus1, dmodqminus1,
2361 	    p, q, pinvmodq, p_rr, q_rr, NULL));
2362 }
2363 
2364 
2365 static BIG_CHUNK_TYPE onearr[1] = {(BIG_CHUNK_TYPE)1};
2366 BIGNUM big_One = {1, 1, 1, 0, onearr};
2367 
2368 static BIG_CHUNK_TYPE twoarr[1] = {(BIG_CHUNK_TYPE)2};
2369 BIGNUM big_Two = {1, 1, 1, 0, twoarr};
2370 
2371 static BIG_CHUNK_TYPE fourarr[1] = {(BIG_CHUNK_TYPE)4};
2372 static BIGNUM big_Four = {1, 1, 1, 0, fourarr};
2373 
2374 
2375 BIG_ERR_CODE
2376 big_sqrt_pos(BIGNUM *result, BIGNUM *n)
2377 {
2378 	BIGNUM		*high, *low, *mid, *t;
2379 	BIGNUM		t1, t2, t3, prod;
2380 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2381 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2382 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2383 	BIG_CHUNK_TYPE	prodvalue[BIGTMPSIZE];
2384 	int		i, nbits, diff, nrootbits, highbits;
2385 	BIG_ERR_CODE	err;
2386 
2387 	nbits = big_numbits(n);
2388 
2389 	if ((err = big_init1(&t1, n->len + 1,
2390 	    t1value, arraysize(t1value))) != BIG_OK)
2391 		return (err);
2392 	if ((err = big_init1(&t2, n->len + 1,
2393 	    t2value, arraysize(t2value))) != BIG_OK)
2394 		goto ret1;
2395 	if ((err = big_init1(&t3, n->len + 1,
2396 	    t3value, arraysize(t3value))) != BIG_OK)
2397 		goto ret2;
2398 	if ((err = big_init1(&prod, n->len + 1,
2399 	    prodvalue, arraysize(prodvalue))) != BIG_OK)
2400 		goto ret3;
2401 
2402 	nrootbits = (nbits + 1) / 2;
2403 	t1.len = t2.len = t3.len = (nrootbits - 1) / BIG_CHUNK_SIZE + 1;
2404 	for (i = 0; i < t1.len; i++) {
2405 		t1.value[i] = 0;
2406 		t2.value[i] = BIG_CHUNK_ALLBITS;
2407 	}
2408 	highbits = nrootbits - BIG_CHUNK_SIZE * (t1.len - 1);
2409 	if (highbits == BIG_CHUNK_SIZE) {
2410 		t1.value[t1.len - 1] = BIG_CHUNK_HIGHBIT;
2411 		t2.value[t2.len - 1] = BIG_CHUNK_ALLBITS;
2412 	} else {
2413 		t1.value[t1.len - 1] = (BIG_CHUNK_TYPE)1 << (highbits - 1);
2414 		t2.value[t2.len - 1] = 2 * t1.value[t1.len - 1] - 1;
2415 	}
2416 
2417 	high = &t2;
2418 	low = &t1;
2419 	mid = &t3;
2420 
2421 	if ((err = big_mul(&prod, high, high)) != BIG_OK) {
2422 		goto ret;
2423 	}
2424 	diff = big_cmp_abs(&prod, n);
2425 	if (diff <= 0) {
2426 		err = big_copy(result, high);
2427 		goto ret;
2428 	}
2429 
2430 	(void) big_sub_pos(mid, high, low);
2431 	while (big_cmp_abs(&big_One, mid) != 0) {
2432 		(void) big_add_abs(mid, high, low);
2433 		(void) big_half_pos(mid, mid);
2434 		if ((err = big_mul(&prod, mid, mid)) != BIG_OK)
2435 			goto ret;
2436 		diff = big_cmp_abs(&prod, n);
2437 		if (diff > 0) {
2438 			t = high;
2439 			high = mid;
2440 			mid = t;
2441 		} else if (diff < 0) {
2442 			t = low;
2443 			low = mid;
2444 			mid = t;
2445 		} else {
2446 			err = big_copy(result, low);
2447 			goto ret;
2448 		}
2449 		(void) big_sub_pos(mid, high, low);
2450 	}
2451 
2452 	err = big_copy(result, low);
2453 ret:
2454 	if (prod.malloced) big_finish(&prod);
2455 ret3:
2456 	if (t3.malloced) big_finish(&t3);
2457 ret2:
2458 	if (t2.malloced) big_finish(&t2);
2459 ret1:
2460 	if (t1.malloced) big_finish(&t1);
2461 
2462 	return (err);
2463 }
2464 
2465 
2466 BIG_ERR_CODE
2467 big_Jacobi_pos(int *jac, BIGNUM *nn, BIGNUM *mm)
2468 {
2469 	BIGNUM		*t, *tmp2, *m, *n;
2470 	BIGNUM		t1, t2, t3;
2471 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2472 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2473 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2474 	int		len, err;
2475 
2476 	if (big_is_zero(nn) ||
2477 	    (((nn->value[0] & 1) | (mm->value[0] & 1)) == 0)) {
2478 		*jac = 0;
2479 		return (BIG_OK);
2480 	}
2481 
2482 	if (nn->len > mm->len) {
2483 		len = nn->len;
2484 	} else {
2485 		len = mm->len;
2486 	}
2487 
2488 	if ((err = big_init1(&t1, len,
2489 	    t1value, arraysize(t1value))) != BIG_OK) {
2490 		return (err);
2491 	}
2492 	if ((err = big_init1(&t2, len,
2493 	    t2value, arraysize(t2value))) != BIG_OK) {
2494 		goto ret1;
2495 	}
2496 	if ((err = big_init1(&t3, len,
2497 	    t3value, arraysize(t3value))) != BIG_OK) {
2498 		goto ret2;
2499 	}
2500 
2501 	n = &t1;
2502 	m = &t2;
2503 	tmp2 = &t3;
2504 
2505 	(void) big_copy(n, nn);
2506 	(void) big_copy(m, mm);
2507 
2508 	*jac = 1;
2509 	while (big_cmp_abs(&big_One, m) != 0) {
2510 		if (big_is_zero(n)) {
2511 			*jac = 0;
2512 			goto ret;
2513 		}
2514 		if ((m->value[0] & 1) == 0) {
2515 			if (((n->value[0] & 7) == 3) ||
2516 			    ((n->value[0] & 7) == 5))
2517 				*jac = -*jac;
2518 			(void) big_half_pos(m, m);
2519 		} else if ((n->value[0] & 1) == 0) {
2520 			if (((m->value[0] & 7) == 3) ||
2521 			    ((m->value[0] & 7) == 5))
2522 				*jac = -*jac;
2523 			(void) big_half_pos(n, n);
2524 		} else {
2525 			if (((m->value[0] & 3) == 3) &&
2526 			    ((n->value[0] & 3) == 3)) {
2527 				*jac = -*jac;
2528 			}
2529 			if ((err = big_div_pos(NULL, tmp2, m, n)) != BIG_OK) {
2530 				goto ret;
2531 			}
2532 			t = tmp2;
2533 			tmp2 = m;
2534 			m = n;
2535 			n = t;
2536 		}
2537 	}
2538 	err = BIG_OK;
2539 
2540 ret:
2541 	if (t3.malloced) big_finish(&t3);
2542 ret2:
2543 	if (t2.malloced) big_finish(&t2);
2544 ret1:
2545 	if (t1.malloced) big_finish(&t1);
2546 
2547 	return (err);
2548 }
2549 
2550 
2551 BIG_ERR_CODE
2552 big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
2553 {
2554 	int		m, w, i;
2555 	BIG_CHUNK_TYPE	bit;
2556 	BIGNUM		ki, tmp, tmp2;
2557 	BIG_CHUNK_TYPE	kivalue[BIGTMPSIZE];
2558 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2559 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
2560 	BIG_ERR_CODE	err;
2561 
2562 	if (big_cmp_abs(k, &big_One) == 0) {
2563 		(void) big_copy(Lk, p);
2564 		(void) big_copy(Lkminus1, &big_Two);
2565 		return (BIG_OK);
2566 	}
2567 
2568 	if ((err = big_init1(&ki, k->len + 1,
2569 	    kivalue, arraysize(kivalue))) != BIG_OK)
2570 		return (err);
2571 
2572 	if ((err = big_init1(&tmp, 2 * n->len +1,
2573 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK)
2574 		goto ret1;
2575 
2576 	if ((err = big_init1(&tmp2, n->len,
2577 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
2578 		goto ret2;
2579 
2580 	m = big_numbits(k);
2581 	ki.len = (m - 1) / BIG_CHUNK_SIZE + 1;
2582 	w = (m - 1) / BIG_CHUNK_SIZE;
2583 	bit = (BIG_CHUNK_TYPE)1 << ((m - 1) % BIG_CHUNK_SIZE);
2584 	for (i = 0; i < ki.len; i++) {
2585 		ki.value[i] = 0;
2586 	}
2587 	ki.value[ki.len - 1] = bit;
2588 	if (big_cmp_abs(k, &ki) != 0) {
2589 		(void) big_double(&ki, &ki);
2590 	}
2591 	(void) big_sub_pos(&ki, &ki, k);
2592 
2593 	(void) big_copy(Lk, p);
2594 	(void) big_copy(Lkminus1, &big_Two);
2595 
2596 	for (i = 0; i < m; i++) {
2597 		if ((err = big_mul(&tmp, Lk, Lkminus1)) != BIG_OK) {
2598 			goto ret;
2599 		}
2600 		(void) big_add_abs(&tmp, &tmp, n);
2601 		(void) big_sub_pos(&tmp, &tmp, p);
2602 		if ((err = big_div_pos(NULL, &tmp2, &tmp, n)) != BIG_OK) {
2603 			goto ret;
2604 		}
2605 		if ((ki.value[w] & bit) != 0) {
2606 			if ((err = big_mul(&tmp, Lkminus1, Lkminus1)) !=
2607 			    BIG_OK) {
2608 				goto ret;
2609 			}
2610 			(void) big_add_abs(&tmp, &tmp, n);
2611 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2612 			if ((err = big_div_pos(NULL, Lkminus1, &tmp, n)) !=
2613 			    BIG_OK) {
2614 				goto ret;
2615 			}
2616 			(void) big_copy(Lk, &tmp2);
2617 		} else {
2618 			if ((err = big_mul(&tmp, Lk, Lk)) != BIG_OK) {
2619 				goto ret;
2620 			}
2621 			(void) big_add_abs(&tmp, &tmp, n);
2622 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2623 			if ((err = big_div_pos(NULL, Lk, &tmp, n)) != BIG_OK) {
2624 				goto ret;
2625 			}
2626 			(void) big_copy(Lkminus1, &tmp2);
2627 		}
2628 		bit = bit >> 1;
2629 		if (bit == 0) {
2630 			bit = BIG_CHUNK_HIGHBIT;
2631 			w--;
2632 		}
2633 	}
2634 
2635 	err = BIG_OK;
2636 
2637 ret:
2638 	if (tmp2.malloced) big_finish(&tmp2);
2639 ret2:
2640 	if (tmp.malloced) big_finish(&tmp);
2641 ret1:
2642 	if (ki.malloced) big_finish(&ki);
2643 
2644 	return (err);
2645 }
2646 
2647 
2648 BIG_ERR_CODE
2649 big_isprime_pos_ext(BIGNUM *n, big_modexp_ncp_info_t *info)
2650 {
2651 	BIGNUM		o, nminus1, tmp, Lkminus1, Lk;
2652 	BIG_CHUNK_TYPE	ovalue[BIGTMPSIZE];
2653 	BIG_CHUNK_TYPE	nminus1value[BIGTMPSIZE];
2654 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2655 	BIG_CHUNK_TYPE	Lkminus1value[BIGTMPSIZE];
2656 	BIG_CHUNK_TYPE	Lkvalue[BIGTMPSIZE];
2657 	BIG_ERR_CODE	err;
2658 	int		e, i, jac;
2659 
2660 	if (big_cmp_abs(n, &big_One) == 0) {
2661 		return (BIG_FALSE);
2662 	}
2663 	if (big_cmp_abs(n, &big_Two) == 0) {
2664 		return (BIG_TRUE);
2665 	}
2666 	if ((n->value[0] & 1) == 0) {
2667 		return (BIG_FALSE);
2668 	}
2669 
2670 	if ((err = big_init1(&o, n->len, ovalue, arraysize(ovalue))) !=
2671 	    BIG_OK) {
2672 		return (err);
2673 	}
2674 
2675 	if ((err = big_init1(&nminus1, n->len,
2676 	    nminus1value, arraysize(nminus1value))) != BIG_OK) {
2677 		goto ret1;
2678 	}
2679 
2680 	if ((err = big_init1(&tmp, 2 * n->len,
2681 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2682 		goto ret2;
2683 	}
2684 
2685 	if ((err = big_init1(&Lkminus1, n->len,
2686 	    Lkminus1value, arraysize(Lkminus1value))) != BIG_OK) {
2687 		goto ret3;
2688 	}
2689 
2690 	if ((err = big_init1(&Lk, n->len,
2691 	    Lkvalue, arraysize(Lkvalue))) != BIG_OK) {
2692 		goto ret4;
2693 	}
2694 
2695 	(void) big_sub_pos(&o, n, &big_One); 	/* cannot fail */
2696 	(void) big_copy(&nminus1, &o);		/* cannot fail */
2697 	e = 0;
2698 	while ((o.value[0] & 1) == 0) {
2699 		e++;
2700 		(void) big_half_pos(&o, &o);  /* cannot fail */
2701 	}
2702 	if ((err = big_modexp_ext(&tmp, &big_Two, &o, n, NULL, info)) !=
2703 	    BIG_OK) {
2704 		goto ret;
2705 	}
2706 
2707 	i = 0;
2708 	while ((i < e) &&
2709 	    (big_cmp_abs(&tmp, &big_One) != 0) &&
2710 	    (big_cmp_abs(&tmp, &nminus1) != 0)) {
2711 		if ((err =
2712 		    big_modexp_ext(&tmp, &tmp, &big_Two, n, NULL, info)) !=
2713 		    BIG_OK)
2714 			goto ret;
2715 		i++;
2716 	}
2717 
2718 	if (!((big_cmp_abs(&tmp, &nminus1) == 0) ||
2719 	    ((i == 0) && (big_cmp_abs(&tmp, &big_One) == 0)))) {
2720 		err = BIG_FALSE;
2721 		goto ret;
2722 	}
2723 
2724 	if ((err = big_sqrt_pos(&tmp, n)) != BIG_OK) {
2725 		goto ret;
2726 	}
2727 
2728 	if ((err = big_mul(&tmp, &tmp, &tmp)) != BIG_OK) {
2729 		goto ret;
2730 	}
2731 	if (big_cmp_abs(&tmp, n) == 0) {
2732 		err = BIG_FALSE;
2733 		goto ret;
2734 	}
2735 
2736 	(void) big_copy(&o, &big_Two);
2737 	do {
2738 		(void) big_add_abs(&o, &o, &big_One);
2739 		if ((err = big_mul(&tmp, &o, &o)) != BIG_OK) {
2740 			goto ret;
2741 		}
2742 		(void) big_sub_pos(&tmp, &tmp, &big_Four);
2743 		if ((err = big_Jacobi_pos(&jac, &tmp, n)) != BIG_OK) {
2744 			goto ret;
2745 		}
2746 	} while (jac != -1);
2747 
2748 	(void) big_add_abs(&tmp, n, &big_One);
2749 	if ((err = big_Lucas(&Lkminus1, &Lk, &o, &tmp, n)) != BIG_OK) {
2750 		goto ret;
2751 	}
2752 
2753 	if ((big_cmp_abs(&Lkminus1, &o) == 0) &&
2754 	    (big_cmp_abs(&Lk, &big_Two) == 0)) {
2755 		err = BIG_TRUE;
2756 	} else {
2757 		err = BIG_FALSE;
2758 	}
2759 
2760 ret:
2761 	if (Lk.malloced) big_finish(&Lk);
2762 ret4:
2763 	if (Lkminus1.malloced) big_finish(&Lkminus1);
2764 ret3:
2765 	if (tmp.malloced) big_finish(&tmp);
2766 ret2:
2767 	if (nminus1.malloced) big_finish(&nminus1);
2768 ret1:
2769 	if (o.malloced) big_finish(&o);
2770 
2771 	return (err);
2772 }
2773 
2774 
2775 BIG_ERR_CODE
2776 big_isprime_pos(BIGNUM *n)
2777 {
2778 	return (big_isprime_pos_ext(n, NULL));
2779 }
2780 
2781 
2782 #define	SIEVESIZE 1000
2783 
2784 uint32_t smallprimes[] =
2785 {
2786 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
2787 51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97
2788 };
2789 
2790 
2791 BIG_ERR_CODE
2792 big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
2793 {
2794 	BIG_ERR_CODE	err;
2795 	int		sieve[SIEVESIZE];
2796 	int		i;
2797 	uint32_t	off, p;
2798 
2799 	if ((err = big_copy(result, n)) != BIG_OK) {
2800 		return (err);
2801 	}
2802 	result->value[0] |= 1;
2803 	/* CONSTCOND */
2804 	while (1) {
2805 		for (i = 0; i < SIEVESIZE; i++) sieve[i] = 0;
2806 		for (i = 0;
2807 		    i < sizeof (smallprimes) / sizeof (smallprimes[0]); i++) {
2808 			p = smallprimes[i];
2809 			off = big_modhalf_pos(result, p);
2810 			off = p - off;
2811 			if ((off % 2) == 1) {
2812 				off = off + p;
2813 			}
2814 			off = off / 2;
2815 			while (off < SIEVESIZE) {
2816 				sieve[off] = 1;
2817 				off = off + p;
2818 			}
2819 		}
2820 
2821 		for (i = 0; i < SIEVESIZE; i++) {
2822 			if (sieve[i] == 0) {
2823 				err = big_isprime_pos_ext(result, info);
2824 				if (err != BIG_FALSE) {
2825 					if (err != BIG_TRUE) {
2826 						return (err);
2827 					} else {
2828 						goto out;
2829 					}
2830 				}
2831 
2832 			}
2833 			if ((err = big_add_abs(result, result, &big_Two)) !=
2834 			    BIG_OK) {
2835 				return (err);
2836 			}
2837 		}
2838 	}
2839 
2840 out:
2841 	return (BIG_OK);
2842 }
2843 
2844 
2845 BIG_ERR_CODE
2846 big_nextprime_pos(BIGNUM *result, BIGNUM *n)
2847 {
2848 	return (big_nextprime_pos_ext(result, n, NULL));
2849 }
2850 
2851 
2852 BIG_ERR_CODE
2853 big_nextprime_pos_slow(BIGNUM *result, BIGNUM *n)
2854 {
2855 	BIG_ERR_CODE err;
2856 
2857 
2858 	if ((err = big_copy(result, n)) != BIG_OK)
2859 		return (err);
2860 	result->value[0] |= 1;
2861 	while ((err = big_isprime_pos_ext(result, NULL)) != BIG_TRUE) {
2862 		if (err != BIG_FALSE)
2863 			return (err);
2864 		if ((err = big_add_abs(result, result, &big_Two)) != BIG_OK)
2865 			return (err);
2866 	}
2867 	return (BIG_OK);
2868 }
2869 
2870 
2871 /*
2872  * given m and e, computes the rest in the equation
2873  * gcd(m, e) = cm * m + ce * e
2874  */
2875 BIG_ERR_CODE
2876 big_ext_gcd_pos(BIGNUM *gcd, BIGNUM *cm, BIGNUM *ce, BIGNUM *m, BIGNUM *e)
2877 {
2878 	BIGNUM		*xi, *ri, *riminus1, *riminus2, *t;
2879 	BIGNUM		*vmi, *vei, *vmiminus1, *veiminus1;
2880 	BIGNUM		t1, t2, t3, t4, t5, t6, t7, t8, tmp;
2881 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2882 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2883 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2884 	BIG_CHUNK_TYPE	t4value[BIGTMPSIZE];
2885 	BIG_CHUNK_TYPE	t5value[BIGTMPSIZE];
2886 	BIG_CHUNK_TYPE	t6value[BIGTMPSIZE];
2887 	BIG_CHUNK_TYPE	t7value[BIGTMPSIZE];
2888 	BIG_CHUNK_TYPE	t8value[BIGTMPSIZE];
2889 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2890 	BIG_ERR_CODE	err;
2891 	int		len;
2892 
2893 	if (big_cmp_abs(m, e) >= 0) {
2894 		len = m->len;
2895 	} else {
2896 		len = e->len;
2897 	}
2898 
2899 	if ((err = big_init1(&t1, len,
2900 	    t1value, arraysize(t1value))) != BIG_OK) {
2901 		return (err);
2902 	}
2903 	if ((err = big_init1(&t2, len,
2904 	    t2value, arraysize(t2value))) != BIG_OK) {
2905 		goto ret1;
2906 	}
2907 	if ((err = big_init1(&t3, len,
2908 	    t3value, arraysize(t3value))) != BIG_OK) {
2909 		goto ret2;
2910 	}
2911 	if ((err = big_init1(&t4, len,
2912 	    t4value, arraysize(t3value))) != BIG_OK) {
2913 		goto ret3;
2914 	}
2915 	if ((err = big_init1(&t5, len,
2916 	    t5value, arraysize(t5value))) != BIG_OK) {
2917 		goto ret4;
2918 	}
2919 	if ((err = big_init1(&t6, len,
2920 	    t6value, arraysize(t6value))) != BIG_OK) {
2921 		goto ret5;
2922 	}
2923 	if ((err = big_init1(&t7, len,
2924 	    t7value, arraysize(t7value))) != BIG_OK) {
2925 		goto ret6;
2926 	}
2927 	if ((err = big_init1(&t8, len,
2928 	    t8value, arraysize(t8value))) != BIG_OK) {
2929 		goto ret7;
2930 	}
2931 
2932 	if ((err = big_init1(&tmp, 2 * len,
2933 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2934 		goto ret8;
2935 	}
2936 
2937 	ri = &t1;
2938 	ri->value[0] = 1;
2939 	ri->len = 1;
2940 	xi = &t2;
2941 	riminus1 = &t3;
2942 	riminus2 = &t4;
2943 	vmi = &t5;
2944 	vei = &t6;
2945 	vmiminus1 = &t7;
2946 	veiminus1 = &t8;
2947 
2948 	(void) big_copy(vmiminus1, &big_One);
2949 	(void) big_copy(vmi, &big_One);
2950 	(void) big_copy(veiminus1, &big_One);
2951 	(void) big_copy(xi, &big_One);
2952 	vei->len = 1;
2953 	vei->value[0] = 0;
2954 
2955 	(void) big_copy(riminus1, m);
2956 	(void) big_copy(ri, e);
2957 
2958 	while (!big_is_zero(ri)) {
2959 		t = riminus2;
2960 		riminus2 = riminus1;
2961 		riminus1 = ri;
2962 		ri = t;
2963 		if ((err = big_mul(&tmp, vmi, xi)) != BIG_OK) {
2964 			goto ret;
2965 		}
2966 		if ((err = big_sub(vmiminus1, vmiminus1, &tmp)) != BIG_OK) {
2967 			goto ret;
2968 		}
2969 		t = vmiminus1;
2970 		vmiminus1 = vmi;
2971 		vmi = t;
2972 		if ((err = big_mul(&tmp, vei, xi)) != BIG_OK) {
2973 			goto ret;
2974 		}
2975 		if ((err = big_sub(veiminus1, veiminus1, &tmp)) != BIG_OK) {
2976 			goto ret;
2977 		}
2978 		t = veiminus1;
2979 		veiminus1 = vei;
2980 		vei = t;
2981 		if ((err = big_div_pos(xi, ri, riminus2, riminus1)) !=
2982 		    BIG_OK) {
2983 			goto ret;
2984 		}
2985 	}
2986 	if ((gcd != NULL) && ((err = big_copy(gcd, riminus1)) != BIG_OK)) {
2987 		goto ret;
2988 	}
2989 	if ((cm != NULL) && ((err = big_copy(cm, vmi)) != BIG_OK)) {
2990 		goto ret;
2991 	}
2992 	if (ce != NULL) {
2993 		err = big_copy(ce, vei);
2994 	}
2995 ret:
2996 	if (tmp.malloced) big_finish(&tmp);
2997 ret8:
2998 	if (t8.malloced) big_finish(&t8);
2999 ret7:
3000 	if (t7.malloced) big_finish(&t7);
3001 ret6:
3002 	if (t6.malloced) big_finish(&t6);
3003 ret5:
3004 	if (t5.malloced) big_finish(&t5);
3005 ret4:
3006 	if (t4.malloced) big_finish(&t4);
3007 ret3:
3008 	if (t3.malloced) big_finish(&t3);
3009 ret2:
3010 	if (t2.malloced) big_finish(&t2);
3011 ret1:
3012 	if (t1.malloced) big_finish(&t1);
3013 
3014 	return (err);
3015 }
3016