xref: /titanic_51/usr/src/common/bignum/bignumimpl.c (revision 40e7ce05bde825a43b0db32ded62db8cc5b6ec9c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Configuration guide
28  * -------------------
29  *
30  * There are 4 preprocessor symbols used to configure the bignum
31  * implementation.  This file contains no logic to configure based on
32  * processor; we leave that to the Makefiles to specify.
33  *
34  * USE_FLOATING_POINT
35  *   Meaning: There is support for a fast floating-point implementation of
36  *   Montgomery multiply.
37  *
38  * PSR_MUL
39  *   Meaning: There are processor-specific versions of the low level
40  *   functions to implement big_mul.  Those functions are: big_mul_set_vec,
41  *   big_mul_add_vec, big_mul_vec, and big_sqr_vec.  PSR_MUL implies support
42  *   for all 4 functions.  You cannot pick and choose which subset of these
43  *   functions to support; that would lead to a rat's nest of #ifdefs.
44  *
45  * HWCAP
46  *   Meaning: Call multiply support functions through a function pointer.
47  *   On x86, there are multiple implementations for different hardware
48  *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
49  *   a function is first used.  So, the support functions are called through
50  *   a function pointer.  There is no need for that on Sparc, because there
51  *   is only one implementation; support functions are called directly.
52  *   Later, if there were some new VIS instruction, or something, and a
53  *   run-time test were needed, rather than variant kernel modules and
54  *   libraries, then HWCAP would be defined for Sparc, as well.
55  *
56  * UMUL64
57  *   Meaning: It is safe to use generic C code that assumes the existence
58  *   of a 32 x 32 --> 64 bit unsigned multiply.  If this is not defined,
59  *   then the generic code for big_mul_add_vec() must necessarily be very slow,
60  *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
61  *
62  */
63 
64 
65 #include <sys/types.h>
66 #include "bignum.h"
67 
68 #ifdef	_KERNEL
69 #include <sys/ddi.h>
70 #include <sys/mdesc.h>
71 #include <sys/crypto/common.h>
72 
73 #include <sys/kmem.h>
74 #include <sys/param.h>
75 #include <sys/sunddi.h>
76 
77 #else
78 #include <stdlib.h>
79 #include <stdio.h>
80 #include <assert.h>
81 #define	ASSERT	assert
82 #endif	/* _KERNEL */
83 
84 #ifdef __amd64
85 #ifdef _KERNEL
86 #include <sys/x86_archext.h>	/* cpuid_getvendor() */
87 #include <sys/cpuvar.h>
88 #else
89 #include <sys/auxv.h>		/* getisax() */
90 #endif  /* _KERNEL */
91 #endif  /* __amd64 */
92 
93 
94 #ifdef	_LP64 /* truncate 64-bit size_t to 32-bits */
95 #define	UI32(ui)	((uint32_t)ui)
96 #else /* size_t already 32-bits */
97 #define	UI32(ui)	(ui)
98 #endif
99 
100 
101 #ifdef	_KERNEL
102 #define	big_malloc(size)	kmem_alloc(size, KM_NOSLEEP)
103 #define	big_free(ptr, size)	kmem_free(ptr, size)
104 
105 /*
106  * big_realloc()
107  * Allocate memory of newsize bytes and copy oldsize bytes
108  * to the newly-allocated memory, then free the
109  * previously-allocated memory.
110  * Note: newsize must be > oldsize
111  */
112 void *
113 big_realloc(void *from, size_t oldsize, size_t newsize)
114 {
115 	void *rv;
116 
117 	rv = kmem_alloc(newsize, KM_NOSLEEP);
118 	if (rv != NULL)
119 		bcopy(from, rv, oldsize);
120 	kmem_free(from, oldsize);
121 	return (rv);
122 }
123 
124 #else	/* _KERNEL */
125 
126 #ifndef MALLOC_DEBUG
127 
128 #define	big_malloc(size)	malloc(size)
129 #define	big_free(ptr, size)	free(ptr)
130 
131 #else
132 
133 void
134 big_free(void *ptr, size_t size)
135 {
136 	printf("freed %d bytes at %p\n", size, ptr);
137 	free(ptr);
138 }
139 
140 void *
141 big_malloc(size_t size)
142 {
143 	void *rv;
144 	rv = malloc(size);
145 	printf("malloced %d bytes, addr:%p\n", size, rv);
146 	return (rv);
147 }
148 #endif /* MALLOC_DEBUG */
149 
150 #define	big_realloc(x, y, z) realloc((x), (z))
151 
152 
153 /*
154  * printbignum()
155  * Print a BIGNUM type to stdout.
156  */
157 void
158 printbignum(char *aname, BIGNUM *a)
159 {
160 	int i;
161 
162 	(void) printf("\n%s\n%d\n", aname, a->sign*a->len);
163 	for (i = a->len - 1; i >= 0; i--) {
164 #ifdef BIGNUM_CHUNK_32
165 		(void) printf("%08x ", a->value[i]);
166 		if (((i & (BITSINBYTE - 1)) == 0) && (i != 0)) {
167 			(void) printf("\n");
168 		}
169 #else
170 		(void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
171 		    (uint32_t)((a->value[i]) & 0xffffffff));
172 		if (((i & 3) == 0) && (i != 0)) { /* end of this chunk */
173 			(void) printf("\n");
174 		}
175 #endif
176 	}
177 	(void) printf("\n");
178 }
179 
180 #endif	/* _KERNEL */
181 
182 
183 #ifdef  __amd64
184 /*
185  * Return 1 if executing on Intel, otherwise 0 (e.g., AMD64).
186  */
187 static int
188 bignum_on_intel(void)
189 {
190 #ifdef _KERNEL
191 	return (cpuid_getvendor(CPU) == X86_VENDOR_Intel);
192 #else
193 	uint_t  ui;
194 	(void) getisax(&ui, 1);
195 	return ((ui & AV_386_AMD_MMX) == 0);
196 #endif  /* _KERNEL */
197 }
198 #endif  /* __amd64 */
199 
200 
201 /*
202  * big_init()
203  * Initialize and allocate memory for a BIGNUM type.
204  *
205  * big_init(number, size) is equivalent to big_init1(number, size, NULL, 0)
206  *
207  * Note: call big_finish() to free memory allocated by big_init().
208  *
209  * Input:
210  * number	Uninitialized memory for BIGNUM
211  * size		Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
212  *
213  * Output:
214  * number	Initialized BIGNUM
215  *
216  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
217  */
218 BIG_ERR_CODE
219 big_init(BIGNUM *number, int size)
220 {
221 	number->value = big_malloc(BIGNUM_WORDSIZE * size);
222 	if (number->value == NULL) {
223 		return (BIG_NO_MEM);
224 	}
225 	number->size = size;
226 	number->len = 0;
227 	number->sign = 1;
228 	number->malloced = 1;
229 	return (BIG_OK);
230 }
231 
232 
233 /*
234  * big_init1()
235  * Initialize and, if needed, allocate memory for a BIGNUM type.
236  * Use the buffer passed, buf, if any, instad of allocating memory
237  * if it's at least "size" bytes.
238  *
239  * Note: call big_finish() to free memory allocated by big_init().
240  *
241  * Input:
242  * number	Uninitialized memory for BIGNUM
243  * size		Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
244  * buf		Buffer for storing a BIGNUM.
245  *		If NULL, big_init1() will allocate a buffer
246  * bufsize	Size, in BIG_CHUNK_SIZE_bit words, of buf
247  *
248  * Output:
249  * number	Initialized BIGNUM
250  *
251  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
252  */
253 BIG_ERR_CODE
254 big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
255 {
256 	if ((buf == NULL) || (size > bufsize)) {
257 		number->value = big_malloc(BIGNUM_WORDSIZE * size);
258 		if (number->value == NULL) {
259 			return (BIG_NO_MEM);
260 		}
261 		number->size = size;
262 		number->malloced = 1;
263 	} else {
264 		number->value = buf;
265 		number->size = bufsize;
266 		number->malloced = 0;
267 	}
268 	number->len = 0;
269 	number->sign = 1;
270 
271 	return (BIG_OK);
272 }
273 
274 
275 /*
276  * big_finish()
277  * Free memory, if any, allocated by big_init() or big_init1().
278  */
279 void
280 big_finish(BIGNUM *number)
281 {
282 	if (number->malloced == 1) {
283 		big_free(number->value, BIGNUM_WORDSIZE * number->size);
284 		number->malloced = 0;
285 	}
286 }
287 
288 
289 /*
290  * bn->size should be at least
291  * (len + BIGNUM_WORDSIZE - 1) / BIGNUM_WORDSIZE bytes
292  * converts from byte-big-endian format to bignum format (words in
293  * little endian order, but bytes within the words big endian)
294  */
295 void
296 bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
297 {
298 	int		i, j;
299 	uint32_t	offs;
300 	const uint32_t	slen = UI32(len);
301 	BIG_CHUNK_TYPE	word;
302 	uchar_t		*knwordp;
303 
304 	if (slen == 0) {
305 		bn->len = 1;
306 		bn->value[0] = 0;
307 		return;
308 	}
309 
310 	offs = slen % BIGNUM_WORDSIZE;
311 	bn->len = slen / BIGNUM_WORDSIZE;
312 
313 	for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
314 		knwordp = &(kn[slen - BIGNUM_WORDSIZE * (i + 1)]);
315 		word = knwordp[0];
316 		for (j = 1; j < BIGNUM_WORDSIZE; j++) {
317 			word = (word << BITSINBYTE) + knwordp[j];
318 		}
319 		bn->value[i] = word;
320 	}
321 	if (offs > 0) {
322 		word = kn[0];
323 		for (i = 1; i < offs; i++) word = (word << BITSINBYTE) + kn[i];
324 		bn->value[bn->len++] = word;
325 	}
326 	while ((bn->len > 1) && (bn->value[bn->len - 1] == 0)) {
327 		bn->len --;
328 	}
329 }
330 
331 
332 /*
333  * copies the least significant len bytes if
334  * len < bn->len * BIGNUM_WORDSIZE
335  * converts from bignum format to byte-big-endian format.
336  * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
337  */
338 void
339 bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
340 {
341 	int		i, j;
342 	uint32_t	offs;
343 	const uint32_t	slen = UI32(len);
344 	BIG_CHUNK_TYPE	word;
345 
346 	if (len < BIGNUM_WORDSIZE * bn->len) {
347 		for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
348 			word = bn->value[i];
349 			for (j = 0; j < BIGNUM_WORDSIZE; j++) {
350 				kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
351 				    word & 0xff;
352 				word = word >> BITSINBYTE;
353 			}
354 		}
355 		offs = slen % BIGNUM_WORDSIZE;
356 		if (offs > 0) {
357 			word = bn->value[slen / BIGNUM_WORDSIZE];
358 			for (i =  slen % BIGNUM_WORDSIZE; i > 0; i --) {
359 				kn[i - 1] = word & 0xff;
360 				word = word >> BITSINBYTE;
361 			}
362 		}
363 	} else {
364 		for (i = 0; i < bn->len; i++) {
365 			word = bn->value[i];
366 			for (j = 0; j < BIGNUM_WORDSIZE; j++) {
367 				kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
368 				    word & 0xff;
369 				word = word >> BITSINBYTE;
370 			}
371 		}
372 		for (i = 0; i < slen - BIGNUM_WORDSIZE * bn->len; i++) {
373 			kn[i] = 0;
374 		}
375 	}
376 }
377 
378 
379 int
380 big_bitlength(BIGNUM *a)
381 {
382 	int		l = 0, b = 0;
383 	BIG_CHUNK_TYPE	c;
384 
385 	l = a->len - 1;
386 	while ((l > 0) && (a->value[l] == 0)) {
387 		l--;
388 	}
389 	b = BIG_CHUNK_SIZE;
390 	c = a->value[l];
391 	while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
392 		c = c << 1;
393 		b--;
394 	}
395 
396 	return (l * BIG_CHUNK_SIZE + b);
397 }
398 
399 
400 /*
401  * big_copy()
402  * Copy BIGNUM src to dest, allocating memory if needed.
403  */
404 BIG_ERR_CODE
405 big_copy(BIGNUM *dest, BIGNUM *src)
406 {
407 	BIG_CHUNK_TYPE	*newptr;
408 	int		i, len;
409 
410 	len = src->len;
411 	while ((len > 1) && (src->value[len - 1] == 0)) {
412 		len--;
413 	}
414 	src->len = len;
415 	if (dest->size < len) {
416 		if (dest->malloced == 1) {
417 			newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
418 			    BIGNUM_WORDSIZE * dest->size,
419 			    BIGNUM_WORDSIZE * len);
420 		} else {
421 			newptr = (BIG_CHUNK_TYPE *)
422 			    big_malloc(BIGNUM_WORDSIZE * len);
423 			if (newptr != NULL) {
424 				dest->malloced = 1;
425 			}
426 		}
427 		if (newptr == NULL) {
428 			return (BIG_NO_MEM);
429 		}
430 		dest->value = newptr;
431 		dest->size = len;
432 	}
433 	dest->len = len;
434 	dest->sign = src->sign;
435 	for (i = 0; i < len; i++) {
436 		dest->value[i] = src->value[i];
437 	}
438 
439 	return (BIG_OK);
440 }
441 
442 
443 /*
444  * big_extend()
445  * Allocate memory to extend BIGNUM number to size bignum chunks,
446  * if not at least that size already.
447  */
448 BIG_ERR_CODE
449 big_extend(BIGNUM *number, int size)
450 {
451 	BIG_CHUNK_TYPE	*newptr;
452 	int		i;
453 
454 	if (number->size >= size)
455 		return (BIG_OK);
456 	if (number->malloced) {
457 		number->value = big_realloc(number->value,
458 		    BIGNUM_WORDSIZE * number->size,
459 		    BIGNUM_WORDSIZE * size);
460 	} else {
461 		newptr = big_malloc(BIGNUM_WORDSIZE * size);
462 		if (newptr != NULL) {
463 			for (i = 0; i < number->size; i++) {
464 				newptr[i] = number->value[i];
465 			}
466 		}
467 		number->value = newptr;
468 	}
469 
470 	if (number->value == NULL) {
471 		return (BIG_NO_MEM);
472 	}
473 
474 	number->size = size;
475 	number->malloced = 1;
476 	return (BIG_OK);
477 }
478 
479 
480 /* returns 1 if n == 0 */
481 int
482 big_is_zero(BIGNUM *n)
483 {
484 	int	i, result;
485 
486 	result = 1;
487 	for (i = 0; i < n->len; i++) {
488 		if (n->value[i] != 0) {
489 			result = 0;
490 		}
491 	}
492 	return (result);
493 }
494 
495 
496 BIG_ERR_CODE
497 big_add_abs(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
498 {
499 	int		i, shorter, longer;
500 	BIG_CHUNK_TYPE	cy, ai;
501 	BIG_CHUNK_TYPE	*r, *a, *b, *c;
502 	BIG_ERR_CODE	err;
503 	BIGNUM		*longerarg;
504 
505 	if (aa->len > bb->len) {
506 		shorter = bb->len;
507 		longer = aa->len;
508 		longerarg = aa;
509 	} else {
510 		shorter = aa->len;
511 		longer = bb->len;
512 		longerarg = bb;
513 	}
514 	if (result->size < longer + 1) {
515 		err = big_extend(result, longer + 1);
516 		if (err != BIG_OK) {
517 			return (err);
518 		}
519 	}
520 
521 	r = result->value;
522 	a = aa->value;
523 	b = bb->value;
524 	c = longerarg->value;
525 	cy = 0;
526 	for (i = 0; i < shorter; i++) {
527 		ai = a[i];
528 		r[i] = ai + b[i] + cy;
529 		if (r[i] > ai) {
530 			cy = 0;
531 		} else if (r[i] < ai) {
532 			cy = 1;
533 		}
534 	}
535 	for (; i < longer; i++) {
536 		ai = c[i];
537 		r[i] = ai + cy;
538 		if (r[i] >= ai) {
539 			cy = 0;
540 		}
541 	}
542 	if (cy == 1) {
543 		r[i] = cy;
544 		result->len = longer + 1;
545 	} else {
546 		result->len = longer;
547 	}
548 	result->sign = 1;
549 	return (BIG_OK);
550 }
551 
552 
553 /* caller must make sure that result has at least len words allocated */
554 void
555 big_sub_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, BIG_CHUNK_TYPE *b, int len)
556 {
557 	int		i;
558 	BIG_CHUNK_TYPE	cy, ai;
559 
560 	cy = 1;
561 	for (i = 0; i < len; i++) {
562 		ai = a[i];
563 		r[i] = ai + (~b[i]) + cy;
564 		if (r[i] > ai) {
565 			cy = 0;
566 		} else if (r[i] < ai) {
567 			cy = 1;
568 		}
569 	}
570 }
571 
572 
573 /* result=aa-bb  it is assumed that aa>=bb */
574 BIG_ERR_CODE
575 big_sub_pos(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
576 {
577 	int		i, shorter;
578 	BIG_CHUNK_TYPE	cy = 1, ai;
579 	BIG_CHUNK_TYPE	*r, *a, *b;
580 	BIG_ERR_CODE	err = BIG_OK;
581 
582 	if (aa->len > bb->len) {
583 		shorter = bb->len;
584 	} else {
585 		shorter = aa->len;
586 	}
587 	if (result->size < aa->len) {
588 		err = big_extend(result, aa->len);
589 		if (err != BIG_OK) {
590 			return (err);
591 		}
592 	}
593 
594 	r = result->value;
595 	a = aa->value;
596 	b = bb->value;
597 	result->len = aa->len;
598 	cy = 1;
599 	for (i = 0; i < shorter; i++) {
600 		ai = a[i];
601 		r[i] = ai + (~b[i]) + cy;
602 		if (r[i] > ai) {
603 			cy = 0;
604 		} else if (r[i] < ai) {
605 			cy = 1;
606 		}
607 	}
608 	for (; i < aa->len; i++) {
609 		ai = a[i];
610 		r[i] = ai + (~0) + cy;
611 		if (r[i] < ai) {
612 			cy = 1;
613 		}
614 	}
615 	result->sign = 1;
616 
617 	if (cy == 0) {
618 		return (BIG_INVALID_ARGS);
619 	} else {
620 		return (BIG_OK);
621 	}
622 }
623 
624 
625 /* returns -1 if |aa|<|bb|, 0 if |aa|==|bb| 1 if |aa|>|bb| */
626 int
627 big_cmp_abs(BIGNUM *aa, BIGNUM *bb)
628 {
629 	int	i;
630 
631 	if (aa->len > bb->len) {
632 		for (i = aa->len - 1; i > bb->len - 1; i--) {
633 			if (aa->value[i] > 0) {
634 				return (1);
635 			}
636 		}
637 	} else if (aa->len < bb->len) {
638 		for (i = bb->len - 1; i > aa->len - 1; i--) {
639 			if (bb->value[i] > 0) {
640 				return (-1);
641 			}
642 		}
643 	} else {
644 		i = aa->len - 1;
645 	}
646 	for (; i >= 0; i--) {
647 		if (aa->value[i] > bb->value[i]) {
648 			return (1);
649 		} else if (aa->value[i] < bb->value[i]) {
650 			return (-1);
651 		}
652 	}
653 
654 	return (0);
655 }
656 
657 
658 BIG_ERR_CODE
659 big_sub(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
660 {
661 	BIG_ERR_CODE	err;
662 
663 	if ((bb->sign == -1) && (aa->sign == 1)) {
664 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
665 			return (err);
666 		}
667 		result->sign = 1;
668 	} else if ((aa->sign == -1) && (bb->sign == 1)) {
669 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
670 			return (err);
671 		}
672 		result->sign = -1;
673 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
674 		if (big_cmp_abs(aa, bb) >= 0) {
675 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
676 				return (err);
677 			}
678 			result->sign = 1;
679 		} else {
680 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
681 				return (err);
682 			}
683 			result->sign = -1;
684 		}
685 	} else {
686 		if (big_cmp_abs(aa, bb) >= 0) {
687 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
688 				return (err);
689 			}
690 			result->sign = -1;
691 		} else {
692 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
693 				return (err);
694 			}
695 			result->sign = 1;
696 		}
697 	}
698 	return (BIG_OK);
699 }
700 
701 
702 BIG_ERR_CODE
703 big_add(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
704 {
705 	BIG_ERR_CODE	err;
706 
707 	if ((bb->sign == -1) && (aa->sign == -1)) {
708 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
709 			return (err);
710 		}
711 		result->sign = -1;
712 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
713 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
714 			return (err);
715 		}
716 		result->sign = 1;
717 	} else if ((aa->sign == 1) && (bb->sign == -1)) {
718 		if (big_cmp_abs(aa, bb) >= 0) {
719 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
720 				return (err);
721 			}
722 			result->sign = 1;
723 		} else {
724 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
725 				return (err);
726 			}
727 			result->sign = -1;
728 		}
729 	} else {
730 		if (big_cmp_abs(aa, bb) >= 0) {
731 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
732 				return (err);
733 			}
734 			result->sign = -1;
735 		} else {
736 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
737 				return (err);
738 			}
739 			result->sign = 1;
740 		}
741 	}
742 	return (BIG_OK);
743 }
744 
745 
746 /* result = aa/2 */
747 BIG_ERR_CODE
748 big_half_pos(BIGNUM *result, BIGNUM *aa)
749 {
750 	BIG_ERR_CODE	err;
751 	int		i;
752 	BIG_CHUNK_TYPE	cy, cy1;
753 	BIG_CHUNK_TYPE	*a, *r;
754 
755 	if (result->size < aa->len) {
756 		err = big_extend(result, aa->len);
757 		if (err != BIG_OK) {
758 			return (err);
759 		}
760 	}
761 
762 	result->len = aa->len;
763 	a = aa->value;
764 	r = result->value;
765 	cy = 0;
766 	for (i = aa->len - 1; i >= 0; i--) {
767 		cy1 = a[i] << (BIG_CHUNK_SIZE - 1);
768 		r[i] = (cy | (a[i] >> 1));
769 		cy = cy1;
770 	}
771 	if (r[result->len - 1] == 0) {
772 		result->len--;
773 	}
774 
775 	return (BIG_OK);
776 }
777 
778 /* result  =  aa*2 */
779 BIG_ERR_CODE
780 big_double(BIGNUM *result, BIGNUM *aa)
781 {
782 	BIG_ERR_CODE	err;
783 	int		i, rsize;
784 	BIG_CHUNK_TYPE	cy, cy1;
785 	BIG_CHUNK_TYPE	*a, *r;
786 
787 	if ((aa->len > 0) &&
788 	    ((aa->value[aa->len - 1] & BIG_CHUNK_HIGHBIT) != 0)) {
789 		rsize = aa->len + 1;
790 	} else {
791 		rsize = aa->len;
792 	}
793 
794 	if (result->size < rsize) {
795 		err = big_extend(result, rsize);
796 		if (err != BIG_OK)
797 			return (err);
798 	}
799 
800 	a = aa->value;
801 	r = result->value;
802 	if (rsize == aa->len + 1) {
803 		r[rsize - 1] = 1;
804 	}
805 	cy = 0;
806 	for (i = 0; i < aa->len; i++) {
807 		cy1 = a[i] >> (BIG_CHUNK_SIZE - 1);
808 		r[i] = (cy | (a[i] << 1));
809 		cy = cy1;
810 	}
811 	result->len = rsize;
812 	return (BIG_OK);
813 }
814 
815 
816 /*
817  * returns aa mod b, aa must be nonneg, b must be a max
818  * (BIG_CHUNK_SIZE / 2)-bit integer
819  */
820 static uint32_t
821 big_modhalf_pos(BIGNUM *aa, uint32_t b)
822 {
823 	int		i;
824 	BIG_CHUNK_TYPE	rem;
825 
826 	if (aa->len == 0) {
827 		return (0);
828 	}
829 	rem = aa->value[aa->len - 1] % b;
830 	for (i = aa->len - 2; i >= 0; i--) {
831 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
832 		    (aa->value[i] >> (BIG_CHUNK_SIZE / 2))) % b;
833 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
834 		    (aa->value[i] & BIG_CHUNK_LOWHALFBITS)) % b;
835 	}
836 
837 	return ((uint32_t)rem);
838 }
839 
840 
841 /*
842  * result = aa - (2^BIG_CHUNK_SIZE)^lendiff * bb
843  * result->size should be at least aa->len at entry
844  * aa, bb, and result should be positive
845  */
846 void
847 big_sub_pos_high(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
848 {
849 	int i, lendiff;
850 	BIGNUM res1, aa1;
851 
852 	lendiff = aa->len - bb->len;
853 	res1.size = result->size - lendiff;
854 	res1.malloced = 0;
855 	res1.value = result->value + lendiff;
856 	aa1.size = aa->size - lendiff;
857 	aa1.value = aa->value + lendiff;
858 	aa1.len = bb->len;
859 	aa1.sign = 1;
860 	(void) big_sub_pos(&res1, &aa1, bb);
861 	if (result->value != aa->value) {
862 		for (i = 0; i < lendiff; i++) {
863 			result->value[i] = aa->value[i];
864 		}
865 	}
866 	result->len = aa->len;
867 }
868 
869 
870 /*
871  * returns 1, 0, or -1 depending on whether |aa| > , ==, or <
872  *					(2^BIG_CHUNK_SIZE)^lendiff * |bb|
873  * aa->len should be >= bb->len
874  */
875 int
876 big_cmp_abs_high(BIGNUM *aa, BIGNUM *bb)
877 {
878 	int		lendiff;
879 	BIGNUM		aa1;
880 
881 	lendiff = aa->len - bb->len;
882 	aa1.len = bb->len;
883 	aa1.size = aa->size - lendiff;
884 	aa1.malloced = 0;
885 	aa1.value = aa->value + lendiff;
886 	return (big_cmp_abs(&aa1, bb));
887 }
888 
889 
890 /*
891  * result = aa * b where b is a max. (BIG_CHUNK_SIZE / 2)-bit positive integer.
892  * result should have enough space allocated.
893  */
894 static void
895 big_mulhalf_low(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
896 {
897 	int		i;
898 	BIG_CHUNK_TYPE	t1, t2, ai, cy;
899 	BIG_CHUNK_TYPE	*a, *r;
900 
901 	a = aa->value;
902 	r = result->value;
903 	cy = 0;
904 	for (i = 0; i < aa->len; i++) {
905 		ai = a[i];
906 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
907 		t2 = (ai >> (BIG_CHUNK_SIZE / 2)) * b +
908 		    (t1 >> (BIG_CHUNK_SIZE / 2));
909 		r[i] = (t1 & BIG_CHUNK_LOWHALFBITS) |
910 		    (t2 << (BIG_CHUNK_SIZE / 2));
911 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
912 	}
913 	r[i] = cy;
914 	result->len = aa->len + 1;
915 	result->sign = aa->sign;
916 }
917 
918 
919 /*
920  * result = aa * b * 2^(BIG_CHUNK_SIZE / 2) where b is a max.
921  * (BIG_CHUNK_SIZE / 2)-bit positive integer.
922  * result should have enough space allocated.
923  */
924 static void
925 big_mulhalf_high(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
926 {
927 	int		i;
928 	BIG_CHUNK_TYPE	t1, t2, ai, cy, ri;
929 	BIG_CHUNK_TYPE	*a, *r;
930 
931 	a = aa->value;
932 	r = result->value;
933 	cy = 0;
934 	ri = 0;
935 	for (i = 0; i < aa->len; i++) {
936 		ai = a[i];
937 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
938 		t2 = (ai >>  (BIG_CHUNK_SIZE / 2)) * b +
939 		    (t1 >>  (BIG_CHUNK_SIZE / 2));
940 		r[i] = (t1 <<  (BIG_CHUNK_SIZE / 2)) + ri;
941 		ri = t2 & BIG_CHUNK_LOWHALFBITS;
942 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
943 	}
944 	r[i] = (cy <<  (BIG_CHUNK_SIZE / 2)) + ri;
945 	result->len = aa->len + 1;
946 	result->sign = aa->sign;
947 }
948 
949 
950 /* it is assumed that result->size is big enough */
951 void
952 big_shiftleft(BIGNUM *result, BIGNUM *aa, int offs)
953 {
954 	int		i;
955 	BIG_CHUNK_TYPE	cy, ai;
956 
957 	if (offs == 0) {
958 		if (result != aa) {
959 			(void) big_copy(result, aa);
960 		}
961 		return;
962 	}
963 	cy = 0;
964 	for (i = 0; i < aa->len; i++) {
965 		ai = aa->value[i];
966 		result->value[i] = (ai << offs) | cy;
967 		cy = ai >> (BIG_CHUNK_SIZE - offs);
968 	}
969 	if (cy != 0) {
970 		result->len = aa->len + 1;
971 		result->value[result->len - 1] = cy;
972 	} else {
973 		result->len = aa->len;
974 	}
975 	result->sign = aa->sign;
976 }
977 
978 
979 /* it is assumed that result->size is big enough */
980 void
981 big_shiftright(BIGNUM *result, BIGNUM *aa, int offs)
982 {
983 	int		 i;
984 	BIG_CHUNK_TYPE	cy, ai;
985 
986 	if (offs == 0) {
987 		if (result != aa) {
988 			(void) big_copy(result, aa);
989 		}
990 		return;
991 	}
992 	cy = aa->value[0] >> offs;
993 	for (i = 1; i < aa->len; i++) {
994 		ai = aa->value[i];
995 		result->value[i - 1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
996 		cy = ai >> offs;
997 	}
998 	result->len = aa->len;
999 	result->value[result->len - 1] = cy;
1000 	result->sign = aa->sign;
1001 }
1002 
1003 
1004 /*
1005  * result = aa/bb   remainder = aa mod bb
1006  * it is assumed that aa and bb are positive
1007  */
1008 BIG_ERR_CODE
1009 big_div_pos(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
1010 {
1011 	BIG_ERR_CODE	err = BIG_OK;
1012 	int		i, alen, blen, tlen, rlen, offs;
1013 	BIG_CHUNK_TYPE	higha, highb, coeff;
1014 	BIG_CHUNK_TYPE	*a, *b;
1015 	BIGNUM		bbhigh, bblow, tresult, tmp1, tmp2;
1016 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1017 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
1018 	BIG_CHUNK_TYPE	tresultvalue[BIGTMPSIZE];
1019 	BIG_CHUNK_TYPE	bblowvalue[BIGTMPSIZE];
1020 	BIG_CHUNK_TYPE	bbhighvalue[BIGTMPSIZE];
1021 
1022 	a = aa->value;
1023 	b = bb->value;
1024 	alen = aa->len;
1025 	blen = bb->len;
1026 	while ((alen > 1) && (a[alen - 1] == 0)) {
1027 		alen = alen - 1;
1028 	}
1029 	aa->len = alen;
1030 	while ((blen > 1) && (b[blen - 1] == 0)) {
1031 		blen = blen - 1;
1032 	}
1033 	bb->len = blen;
1034 	if ((blen == 1) && (b[0] == 0)) {
1035 		return (BIG_DIV_BY_0);
1036 	}
1037 
1038 	if (big_cmp_abs(aa, bb) < 0) {
1039 		if ((remainder != NULL) &&
1040 		    ((err = big_copy(remainder, aa)) != BIG_OK)) {
1041 			return (err);
1042 		}
1043 		if (result != NULL) {
1044 			result->len = 1;
1045 			result->sign = 1;
1046 			result->value[0] = 0;
1047 		}
1048 		return (BIG_OK);
1049 	}
1050 
1051 	if ((err = big_init1(&bblow, blen + 1,
1052 	    bblowvalue, arraysize(bblowvalue))) != BIG_OK)
1053 		return (err);
1054 
1055 	if ((err = big_init1(&bbhigh, blen + 1,
1056 	    bbhighvalue, arraysize(bbhighvalue))) != BIG_OK)
1057 		goto ret1;
1058 
1059 	if ((err = big_init1(&tmp1, alen + 2,
1060 	    tmp1value, arraysize(tmp1value))) != BIG_OK)
1061 		goto ret2;
1062 
1063 	if ((err = big_init1(&tmp2, blen + 2,
1064 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
1065 		goto ret3;
1066 
1067 	if ((err = big_init1(&tresult, alen - blen + 2,
1068 	    tresultvalue, arraysize(tresultvalue))) != BIG_OK)
1069 		goto ret4;
1070 
1071 	offs = 0;
1072 	highb = b[blen - 1];
1073 	if (highb >= (BIG_CHUNK_HALF_HIGHBIT << 1)) {
1074 		highb = highb >> (BIG_CHUNK_SIZE / 2);
1075 		offs = (BIG_CHUNK_SIZE / 2);
1076 	}
1077 	while ((highb & BIG_CHUNK_HALF_HIGHBIT) == 0) {
1078 		highb = highb << 1;
1079 		offs++;
1080 	}
1081 
1082 	big_shiftleft(&bblow, bb, offs);
1083 
1084 	if (offs <= (BIG_CHUNK_SIZE / 2 - 1)) {
1085 		big_shiftleft(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1086 	} else {
1087 		big_shiftright(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1088 	}
1089 	if (bbhigh.value[bbhigh.len - 1] == 0) {
1090 		bbhigh.len--;
1091 	} else {
1092 		bbhigh.value[bbhigh.len] = 0;
1093 	}
1094 
1095 	highb = bblow.value[bblow.len - 1];
1096 
1097 	big_shiftleft(&tmp1, aa, offs);
1098 	rlen = tmp1.len - bblow.len + 1;
1099 	tresult.len = rlen;
1100 
1101 	tmp1.len++;
1102 	tlen = tmp1.len;
1103 	tmp1.value[tmp1.len - 1] = 0;
1104 	for (i = 0; i < rlen; i++) {
1105 		higha = (tmp1.value[tlen - 1] << (BIG_CHUNK_SIZE / 2)) +
1106 		    (tmp1.value[tlen - 2] >> (BIG_CHUNK_SIZE / 2));
1107 		coeff = higha / (highb + 1);
1108 		big_mulhalf_high(&tmp2, &bblow, coeff);
1109 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1110 		bbhigh.len++;
1111 		while (tmp1.value[tlen - 1] > 0) {
1112 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1113 			coeff++;
1114 		}
1115 		bbhigh.len--;
1116 		tlen--;
1117 		tmp1.len--;
1118 		while (big_cmp_abs_high(&tmp1, &bbhigh) >= 0) {
1119 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1120 			coeff++;
1121 		}
1122 		tresult.value[rlen - i - 1] = coeff << (BIG_CHUNK_SIZE / 2);
1123 		higha = tmp1.value[tlen - 1];
1124 		coeff = higha / (highb + 1);
1125 		big_mulhalf_low(&tmp2, &bblow, coeff);
1126 		tmp2.len--;
1127 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1128 		while (big_cmp_abs_high(&tmp1, &bblow) >= 0) {
1129 			big_sub_pos_high(&tmp1, &tmp1, &bblow);
1130 			coeff++;
1131 		}
1132 		tresult.value[rlen - i - 1] =
1133 		    tresult.value[rlen - i - 1] + coeff;
1134 	}
1135 
1136 	big_shiftright(&tmp1, &tmp1, offs);
1137 
1138 	err = BIG_OK;
1139 
1140 	if ((remainder != NULL) &&
1141 	    ((err = big_copy(remainder, &tmp1)) != BIG_OK))
1142 		goto ret;
1143 
1144 	if (result != NULL)
1145 		err = big_copy(result, &tresult);
1146 
1147 ret:
1148 	big_finish(&tresult);
1149 ret4:
1150 	big_finish(&tmp1);
1151 ret3:
1152 	big_finish(&tmp2);
1153 ret2:
1154 	big_finish(&bbhigh);
1155 ret1:
1156 	big_finish(&bblow);
1157 	return (err);
1158 }
1159 
1160 
1161 /*
1162  * If there is no processor-specific integer implementation of
1163  * the lower level multiply functions, then this code is provided
1164  * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
1165  * big_sqr_vec().
1166  *
1167  * There are two generic implementations.  One that assumes that
1168  * there is hardware and C compiler support for a 32 x 32 --> 64
1169  * bit unsigned multiply, but otherwise is not specific to any
1170  * processor, platform, or ISA.
1171  *
1172  * The other makes very few assumptions about hardware capabilities.
1173  * It does not even assume that there is any implementation of a
1174  * 32 x 32 --> 64 bit multiply that is accessible to C code and
1175  * appropriate to use.  It falls constructs 32 x 32 --> 64 bit
1176  * multiplies from 16 x 16 --> 32 bit multiplies.
1177  *
1178  */
1179 
1180 #if !defined(PSR_MUL)
1181 
1182 #ifdef UMUL64
1183 
1184 #if (BIG_CHUNK_SIZE == 32)
1185 
1186 #define	UNROLL8
1187 
1188 #define	MUL_SET_VEC_ROUND_PREFETCH(R) \
1189 	p = pf * d; \
1190 	pf = (uint64_t)a[R + 1]; \
1191 	t = p + cy; \
1192 	r[R] = (uint32_t)t; \
1193 	cy = t >> 32
1194 
1195 #define	MUL_SET_VEC_ROUND_NOPREFETCH(R) \
1196 	p = pf * d; \
1197 	t = p + cy; \
1198 	r[R] = (uint32_t)t; \
1199 	cy = t >> 32
1200 
1201 #define	MUL_ADD_VEC_ROUND_PREFETCH(R) \
1202 	t = (uint64_t)r[R]; \
1203 	p = pf * d; \
1204 	pf = (uint64_t)a[R + 1]; \
1205 	t = p + t + cy; \
1206 	r[R] = (uint32_t)t; \
1207 	cy = t >> 32
1208 
1209 #define	MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
1210 	t = (uint64_t)r[R]; \
1211 	p = pf * d; \
1212 	t = p + t + cy; \
1213 	r[R] = (uint32_t)t; \
1214 	cy = t >> 32
1215 
1216 #ifdef UNROLL8
1217 
1218 #define	UNROLL 8
1219 
1220 /*
1221  * r = a * b
1222  * where r and a are vectors; b is a single 32-bit digit
1223  */
1224 
1225 uint32_t
1226 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1227 {
1228 	uint64_t d, pf, p, t, cy;
1229 
1230 	if (len == 0)
1231 		return (0);
1232 	cy = 0;
1233 	d = (uint64_t)b;
1234 	pf = (uint64_t)a[0];
1235 	while (len > UNROLL) {
1236 		MUL_SET_VEC_ROUND_PREFETCH(0);
1237 		MUL_SET_VEC_ROUND_PREFETCH(1);
1238 		MUL_SET_VEC_ROUND_PREFETCH(2);
1239 		MUL_SET_VEC_ROUND_PREFETCH(3);
1240 		MUL_SET_VEC_ROUND_PREFETCH(4);
1241 		MUL_SET_VEC_ROUND_PREFETCH(5);
1242 		MUL_SET_VEC_ROUND_PREFETCH(6);
1243 		MUL_SET_VEC_ROUND_PREFETCH(7);
1244 		r += UNROLL;
1245 		a += UNROLL;
1246 		len -= UNROLL;
1247 	}
1248 	if (len == UNROLL) {
1249 		MUL_SET_VEC_ROUND_PREFETCH(0);
1250 		MUL_SET_VEC_ROUND_PREFETCH(1);
1251 		MUL_SET_VEC_ROUND_PREFETCH(2);
1252 		MUL_SET_VEC_ROUND_PREFETCH(3);
1253 		MUL_SET_VEC_ROUND_PREFETCH(4);
1254 		MUL_SET_VEC_ROUND_PREFETCH(5);
1255 		MUL_SET_VEC_ROUND_PREFETCH(6);
1256 		MUL_SET_VEC_ROUND_NOPREFETCH(7);
1257 		return ((uint32_t)cy);
1258 	}
1259 	while (len > 1) {
1260 		MUL_SET_VEC_ROUND_PREFETCH(0);
1261 		++r;
1262 		++a;
1263 		--len;
1264 	}
1265 	if (len > 0) {
1266 		MUL_SET_VEC_ROUND_NOPREFETCH(0);
1267 	}
1268 	return ((uint32_t)cy);
1269 }
1270 
1271 /*
1272  * r += a * b
1273  * where r and a are vectors; b is a single 32-bit digit
1274  */
1275 
1276 uint32_t
1277 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1278 {
1279 	uint64_t d, pf, p, t, cy;
1280 
1281 	if (len == 0)
1282 		return (0);
1283 	cy = 0;
1284 	d = (uint64_t)b;
1285 	pf = (uint64_t)a[0];
1286 	while (len > 8) {
1287 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1288 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1289 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1290 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1291 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1292 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1293 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1294 		MUL_ADD_VEC_ROUND_PREFETCH(7);
1295 		r += 8;
1296 		a += 8;
1297 		len -= 8;
1298 	}
1299 	if (len == 8) {
1300 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1301 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1302 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1303 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1304 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1305 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1306 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1307 		MUL_ADD_VEC_ROUND_NOPREFETCH(7);
1308 		return ((uint32_t)cy);
1309 	}
1310 	while (len > 1) {
1311 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1312 		++r;
1313 		++a;
1314 		--len;
1315 	}
1316 	if (len > 0) {
1317 		MUL_ADD_VEC_ROUND_NOPREFETCH(0);
1318 	}
1319 	return ((uint32_t)cy);
1320 }
1321 #endif /* UNROLL8 */
1322 
1323 void
1324 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1325 {
1326 	uint32_t	*tr, *ta;
1327 	int		tlen, row, col;
1328 	uint64_t	p, s, t, t2, cy;
1329 	uint32_t	d;
1330 
1331 	tr = r + 1;
1332 	ta = a;
1333 	tlen = len - 1;
1334 	tr[tlen] = big_mul_set_vec(tr, ta + 1, tlen, ta[0]);
1335 	while (--tlen > 0) {
1336 		tr += 2;
1337 		++ta;
1338 		tr[tlen] = big_mul_add_vec(tr, ta + 1, tlen, ta[0]);
1339 	}
1340 	s = (uint64_t)a[0];
1341 	s = s * s;
1342 	r[0] = (uint32_t)s;
1343 	cy = s >> 32;
1344 	p = ((uint64_t)r[1] << 1) + cy;
1345 	r[1] = (uint32_t)p;
1346 	cy = p >> 32;
1347 	row = 1;
1348 	col = 2;
1349 	while (row < len) {
1350 		s = (uint64_t)a[row];
1351 		s = s * s;
1352 		p = (uint64_t)r[col] << 1;
1353 		t = p + s;
1354 		d = (uint32_t)t;
1355 		t2 = (uint64_t)d + cy;
1356 		r[col] = (uint32_t)t2;
1357 		cy = (t >> 32) + (t2 >> 32);
1358 		if (row == len - 1)
1359 			break;
1360 		p = ((uint64_t)r[col + 1] << 1) + cy;
1361 		r[col + 1] = (uint32_t)p;
1362 		cy = p >> 32;
1363 		++row;
1364 		col += 2;
1365 	}
1366 	r[col + 1] = (uint32_t)cy;
1367 }
1368 
1369 #else /* BIG_CHUNK_SIZE == 64 */
1370 
1371 /*
1372  * r = r + a * digit, r and a are vectors of length len
1373  * returns the carry digit
1374  */
1375 BIG_CHUNK_TYPE
1376 big_mul_add_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1377     BIG_CHUNK_TYPE digit)
1378 {
1379 	BIG_CHUNK_TYPE	cy, cy1, retcy, dlow, dhigh;
1380 	int		i;
1381 
1382 	cy1 = 0;
1383 	dlow = digit & BIG_CHUNK_LOWHALFBITS;
1384 	dhigh = digit >> (BIG_CHUNK_SIZE / 2);
1385 	for (i = 0; i < len; i++) {
1386 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1387 		    dlow * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1388 		    (r[i] & BIG_CHUNK_LOWHALFBITS);
1389 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1390 		    dlow * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1391 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1392 		r[i] = (cy & BIG_CHUNK_LOWHALFBITS) |
1393 		    (cy1 << (BIG_CHUNK_SIZE / 2));
1394 	}
1395 	retcy = cy1 >> (BIG_CHUNK_SIZE / 2);
1396 
1397 	cy1 = r[0] & BIG_CHUNK_LOWHALFBITS;
1398 	for (i = 0; i < len - 1; i++) {
1399 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1400 		    dhigh * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1401 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1402 		r[i] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1403 		    (cy << (BIG_CHUNK_SIZE / 2));
1404 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1405 		    dhigh * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1406 		    (r[i + 1] & BIG_CHUNK_LOWHALFBITS);
1407 	}
1408 	cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1409 	    dhigh * (a[len - 1] & BIG_CHUNK_LOWHALFBITS) +
1410 	    (r[len - 1] >> (BIG_CHUNK_SIZE / 2));
1411 	r[len - 1] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1412 	    (cy << (BIG_CHUNK_SIZE / 2));
1413 	retcy = (cy >> (BIG_CHUNK_SIZE / 2)) +
1414 	    dhigh * (a[len - 1] >> (BIG_CHUNK_SIZE / 2)) + retcy;
1415 
1416 	return (retcy);
1417 }
1418 
1419 
1420 /*
1421  * r = a * digit, r and a are vectors of length len
1422  * returns the carry digit
1423  */
1424 BIG_CHUNK_TYPE
1425 big_mul_set_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1426     BIG_CHUNK_TYPE digit)
1427 {
1428 	int	i;
1429 
1430 	ASSERT(r != a);
1431 	for (i = 0; i < len; i++) {
1432 		r[i] = 0;
1433 	}
1434 	return (big_mul_add_vec(r, a, len, digit));
1435 }
1436 
1437 void
1438 big_sqr_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len)
1439 {
1440 	int i;
1441 
1442 	ASSERT(r != a);
1443 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1444 	for (i = 1; i < len; ++i)
1445 		r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1446 }
1447 
1448 #endif /* BIG_CHUNK_SIZE == 32/64 */
1449 
1450 
1451 #else /* ! UMUL64 */
1452 
1453 #if (BIG_CHUNK_SIZE != 32)
1454 #error Don't use 64-bit chunks without defining UMUL64
1455 #endif
1456 
1457 
1458 /*
1459  * r = r + a * digit, r and a are vectors of length len
1460  * returns the carry digit
1461  */
1462 uint32_t
1463 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1464 {
1465 	uint32_t cy, cy1, retcy, dlow, dhigh;
1466 	int i;
1467 
1468 	cy1 = 0;
1469 	dlow = digit & 0xffff;
1470 	dhigh = digit >> 16;
1471 	for (i = 0; i < len; i++) {
1472 		cy = (cy1 >> 16) + dlow * (a[i] & 0xffff) + (r[i] & 0xffff);
1473 		cy1 = (cy >> 16) + dlow * (a[i]>>16) + (r[i] >> 16);
1474 		r[i] = (cy & 0xffff) | (cy1 << 16);
1475 	}
1476 	retcy = cy1 >> 16;
1477 
1478 	cy1 = r[0] & 0xffff;
1479 	for (i = 0; i < len - 1; i++) {
1480 		cy = (cy1 >> 16) + dhigh * (a[i] & 0xffff) + (r[i] >> 16);
1481 		r[i] = (cy1 & 0xffff) | (cy << 16);
1482 		cy1 = (cy >> 16) + dhigh * (a[i] >> 16) + (r[i + 1] & 0xffff);
1483 	}
1484 	cy = (cy1 >> 16) + dhigh * (a[len - 1] & 0xffff) + (r[len - 1] >> 16);
1485 	r[len - 1] = (cy1 & 0xffff) | (cy << 16);
1486 	retcy = (cy >> 16) + dhigh * (a[len - 1] >> 16) + retcy;
1487 
1488 	return (retcy);
1489 }
1490 
1491 
1492 /*
1493  * r = a * digit, r and a are vectors of length len
1494  * returns the carry digit
1495  */
1496 uint32_t
1497 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1498 {
1499 	int	i;
1500 
1501 	ASSERT(r != a);
1502 	for (i = 0; i < len; i++) {
1503 		r[i] = 0;
1504 	}
1505 
1506 	return (big_mul_add_vec(r, a, len, digit));
1507 }
1508 
1509 void
1510 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1511 {
1512 	int i;
1513 
1514 	ASSERT(r != a);
1515 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1516 	for (i = 1; i < len; ++i)
1517 		r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1518 }
1519 
1520 #endif /* UMUL64 */
1521 
1522 void
1523 big_mul_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int alen,
1524     BIG_CHUNK_TYPE *b, int blen)
1525 {
1526 	int i;
1527 
1528 	r[alen] = big_mul_set_vec(r, a, alen, b[0]);
1529 	for (i = 1; i < blen; ++i)
1530 		r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
1531 }
1532 
1533 
1534 #endif /* ! PSR_MUL */
1535 
1536 
1537 /*
1538  * result = aa * bb  result->value should be big enough to hold the result
1539  *
1540  * Implementation: Standard grammar school algorithm
1541  *
1542  */
1543 BIG_ERR_CODE
1544 big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
1545 {
1546 	BIGNUM		tmp1;
1547 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1548 	BIG_CHUNK_TYPE	*r, *t, *a, *b;
1549 	BIG_ERR_CODE	err;
1550 	int		i, alen, blen, rsize, sign, diff;
1551 
1552 	if (aa == bb) {
1553 		diff = 0;
1554 	} else {
1555 		diff = big_cmp_abs(aa, bb);
1556 		if (diff < 0) {
1557 			BIGNUM *tt;
1558 			tt = aa;
1559 			aa = bb;
1560 			bb = tt;
1561 		}
1562 	}
1563 	a = aa->value;
1564 	b = bb->value;
1565 	alen = aa->len;
1566 	blen = bb->len;
1567 	while ((alen > 1) && (a[alen - 1] == 0)) {
1568 		alen--;
1569 	}
1570 	aa->len = alen;
1571 	while ((blen > 1) && (b[blen - 1] == 0)) {
1572 		blen--;
1573 	}
1574 	bb->len = blen;
1575 
1576 	rsize = alen + blen;
1577 	ASSERT(rsize > 0);
1578 	if (result->size < rsize) {
1579 		err = big_extend(result, rsize);
1580 		if (err != BIG_OK) {
1581 			return (err);
1582 		}
1583 		/* aa or bb might be an alias to result */
1584 		a = aa->value;
1585 		b = bb->value;
1586 	}
1587 	r = result->value;
1588 
1589 	if (((alen == 1) && (a[0] == 0)) || ((blen == 1) && (b[0] == 0))) {
1590 		result->len = 1;
1591 		result->sign = 1;
1592 		r[0] = 0;
1593 		return (BIG_OK);
1594 	}
1595 	sign = aa->sign * bb->sign;
1596 	if ((alen == 1) && (a[0] == 1)) {
1597 		for (i = 0; i < blen; i++) {
1598 			r[i] = b[i];
1599 		}
1600 		result->len = blen;
1601 		result->sign = sign;
1602 		return (BIG_OK);
1603 	}
1604 	if ((blen == 1) && (b[0] == 1)) {
1605 		for (i = 0; i < alen; i++) {
1606 			r[i] = a[i];
1607 		}
1608 		result->len = alen;
1609 		result->sign = sign;
1610 		return (BIG_OK);
1611 	}
1612 
1613 	if ((err = big_init1(&tmp1, rsize,
1614 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1615 		return (err);
1616 	}
1617 	t = tmp1.value;
1618 
1619 	for (i = 0; i < rsize; i++) {
1620 		t[i] = 0;
1621 	}
1622 
1623 	if (diff == 0 && alen > 2) {
1624 		BIG_SQR_VEC(t, a, alen);
1625 	} else if (blen > 0) {
1626 		BIG_MUL_VEC(t, a, alen, b, blen);
1627 	}
1628 
1629 	if (t[rsize - 1] == 0) {
1630 		tmp1.len = rsize - 1;
1631 	} else {
1632 		tmp1.len = rsize;
1633 	}
1634 
1635 	err = big_copy(result, &tmp1);
1636 
1637 	result->sign = sign;
1638 
1639 	big_finish(&tmp1);
1640 
1641 	return (err);
1642 }
1643 
1644 
1645 /*
1646  * big_mont_mul()
1647  * Montgomery multiplication.
1648  *
1649  * Caller must ensure that  a < n,  b < n,  ret->size >=  2 * n->len + 1,
1650  * and that ret is not n.
1651  */
1652 BIG_ERR_CODE
1653 big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
1654 {
1655 	int		i, j, nlen, needsubtract;
1656 	BIG_CHUNK_TYPE	*nn, *rr, *rrplusi;
1657 	BIG_CHUNK_TYPE	digit, c;
1658 	BIG_ERR_CODE	err;
1659 #ifdef	__amd64
1660 #define	BIG_CPU_UNKNOWN	0
1661 #define	BIG_CPU_AMD	1
1662 #define	BIG_CPU_INTEL	2
1663 	static int	big_cpu = BIG_CPU_UNKNOWN;
1664 	BIG_CHUNK_TYPE	carry[BIGTMPSIZE];
1665 
1666 	if (big_cpu == BIG_CPU_UNKNOWN) {
1667 		big_cpu = 1 + bignum_on_intel();
1668 	}
1669 #endif	/* __amd64 */
1670 
1671 	nlen = n->len;
1672 	nn = n->value;
1673 
1674 	rr = ret->value;
1675 
1676 	if ((err = big_mul(ret, a, b)) != BIG_OK) {
1677 		return (err);
1678 	}
1679 
1680 	rr = ret->value;
1681 	for (i = ret->len; i < 2 * nlen + 1; i++) {
1682 		rr[i] = 0;
1683 	}
1684 
1685 #ifdef	__amd64	/* pipelining optimization for Intel 64, but not AMD64 */
1686 	if ((big_cpu == BIG_CPU_INTEL) && (nlen <= BIGTMPSIZE)) {
1687 		/*
1688 		 * Perform the following in two for loops to reduce the
1689 		 * dependency between computing the carryover bits with
1690 		 * BIG_MUL_ADD_VEC() and adding them, thus improving pipelining.
1691 		 */
1692 		for (i = 0; i < nlen; i++) {
1693 			rrplusi = rr + i;
1694 			digit = *rrplusi * n0;
1695 			carry[i] = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
1696 		}
1697 		for (i = 0; i < nlen; i++) {
1698 			j = i + nlen;
1699 			rr[j] += carry[i];
1700 			while (rr[j] < carry[i]) {
1701 				rr[++j] += 1;
1702 				carry[i] = 1;
1703 			}
1704 		}
1705 	} else
1706 #endif	/* __amd64 */
1707 	{ /* no pipelining optimization */
1708 		for (i = 0; i < nlen; i++) {
1709 			rrplusi = rr + i;
1710 			digit = *rrplusi * n0;
1711 			c = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
1712 			j = i + nlen;
1713 			rr[j] += c;
1714 			while (rr[j] < c) {
1715 				rr[++j] += 1;
1716 				c = 1;
1717 			}
1718 		}
1719 	}
1720 
1721 	needsubtract = 0;
1722 	if ((rr[2 * nlen]  != 0))
1723 		needsubtract = 1;
1724 	else {
1725 		for (i = 2 * nlen - 1; i >= nlen; i--) {
1726 			if (rr[i] > nn[i - nlen]) {
1727 				needsubtract = 1;
1728 				break;
1729 			} else if (rr[i] < nn[i - nlen]) {
1730 				break;
1731 			}
1732 		}
1733 	}
1734 	if (needsubtract)
1735 		big_sub_vec(rr, rr + nlen, nn, nlen);
1736 	else {
1737 		for (i = 0; i < nlen; i++) {
1738 			rr[i] = rr[i + nlen];
1739 		}
1740 	}
1741 
1742 	/* Remove leading zeros, but keep at least 1 digit: */
1743 	for (i = nlen - 1; (i > 0) && (rr[i] == 0); i--)
1744 		;
1745 	ret->len = i + 1;
1746 
1747 	return (BIG_OK);
1748 }
1749 
1750 
1751 BIG_CHUNK_TYPE
1752 big_n0(BIG_CHUNK_TYPE n)
1753 {
1754 	int		i;
1755 	BIG_CHUNK_TYPE	result, tmp;
1756 
1757 	result = 0;
1758 	tmp = BIG_CHUNK_ALLBITS;
1759 	for (i = 0; i < BIG_CHUNK_SIZE; i++) {
1760 		if ((tmp & 1) == 1) {
1761 			result = (result >> 1) | BIG_CHUNK_HIGHBIT;
1762 			tmp = tmp - n;
1763 		} else {
1764 			result = (result >> 1);
1765 		}
1766 		tmp = tmp >> 1;
1767 	}
1768 
1769 	return (result);
1770 }
1771 
1772 
1773 int
1774 big_numbits(BIGNUM *n)
1775 {
1776 	int		i, j;
1777 	BIG_CHUNK_TYPE	t;
1778 
1779 	for (i = n->len - 1; i > 0; i--) {
1780 		if (n->value[i] != 0) {
1781 			break;
1782 		}
1783 	}
1784 	t = n->value[i];
1785 	for (j = BIG_CHUNK_SIZE; j > 0; j--) {
1786 		if ((t & BIG_CHUNK_HIGHBIT) == 0) {
1787 			t = t << 1;
1788 		} else {
1789 			return (BIG_CHUNK_SIZE * i + j);
1790 		}
1791 	}
1792 	return (0);
1793 }
1794 
1795 
1796 /* caller must make sure that a < n */
1797 BIG_ERR_CODE
1798 big_mont_rr(BIGNUM *result, BIGNUM *n)
1799 {
1800 	BIGNUM		rr;
1801 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1802 	int		len, i;
1803 	BIG_ERR_CODE	err;
1804 
1805 	rr.malloced = 0;
1806 	len = n->len;
1807 
1808 	if ((err = big_init1(&rr, 2 * len + 1,
1809 	    rrvalue, arraysize(rrvalue))) != BIG_OK) {
1810 		return (err);
1811 	}
1812 
1813 	for (i = 0; i < 2 * len; i++) {
1814 		rr.value[i] = 0;
1815 	}
1816 	rr.value[2 * len] = 1;
1817 	rr.len = 2 * len + 1;
1818 	if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1819 		goto ret;
1820 	}
1821 	err = big_copy(result, &rr);
1822 ret:
1823 	big_finish(&rr);
1824 	return (err);
1825 }
1826 
1827 
1828 /* caller must make sure that a < n */
1829 BIG_ERR_CODE
1830 big_mont_conv(BIGNUM *result, BIGNUM *a, BIGNUM *n, BIG_CHUNK_TYPE n0,
1831     BIGNUM *n_rr)
1832 {
1833 	BIGNUM		rr;
1834 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1835 	int		len, i;
1836 	BIG_ERR_CODE	err;
1837 
1838 	rr.malloced = 0;
1839 	len = n->len;
1840 
1841 	if ((err = big_init1(&rr, 2 * len + 1, rrvalue, arraysize(rrvalue)))
1842 	    != BIG_OK) {
1843 		return (err);
1844 	}
1845 
1846 	if (n_rr == NULL) {
1847 		for (i = 0; i < 2 * len; i++) {
1848 			rr.value[i] = 0;
1849 		}
1850 		rr.value[2 * len] = 1;
1851 		rr.len = 2 * len + 1;
1852 		if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1853 			goto ret;
1854 		}
1855 		n_rr = &rr;
1856 	}
1857 
1858 	if ((err = big_mont_mul(&rr, n_rr, a, n, n0)) != BIG_OK) {
1859 		goto ret;
1860 	}
1861 	err = big_copy(result, &rr);
1862 
1863 ret:
1864 	big_finish(&rr);
1865 	return (err);
1866 }
1867 
1868 
1869 #ifdef	USE_FLOATING_POINT
1870 #define	big_modexp_ncp_float	big_modexp_ncp_sw
1871 #else
1872 #define	big_modexp_ncp_int	big_modexp_ncp_sw
1873 #endif
1874 
1875 #define	MAX_EXP_BIT_GROUP_SIZE 6
1876 #define	APOWERS_MAX_SIZE (1 << (MAX_EXP_BIT_GROUP_SIZE - 1))
1877 
1878 /* ARGSUSED */
1879 static BIG_ERR_CODE
1880 big_modexp_ncp_int(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
1881     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
1882 
1883 {
1884 	BIGNUM		apowers[APOWERS_MAX_SIZE];
1885 	BIGNUM		tmp1;
1886 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1887 	int		i, j, k, l, m, p;
1888 	uint32_t	bit, bitind, bitcount, groupbits, apowerssize;
1889 	uint32_t	nbits;
1890 	BIG_ERR_CODE	err;
1891 
1892 	nbits = big_numbits(e);
1893 	if (nbits < 50) {
1894 		groupbits = 1;
1895 		apowerssize = 1;
1896 	} else {
1897 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
1898 		apowerssize = 1 << (groupbits - 1);
1899 	}
1900 
1901 
1902 	if ((err = big_init1(&tmp1, 2 * n->len + 1,
1903 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1904 		return (err);
1905 	}
1906 
1907 	/* clear the malloced bit to help cleanup */
1908 	for (i = 0; i < apowerssize; i++) {
1909 		apowers[i].malloced = 0;
1910 	}
1911 
1912 	for (i = 0; i < apowerssize; i++) {
1913 		if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
1914 		    BIG_OK) {
1915 			goto ret;
1916 		}
1917 	}
1918 
1919 	(void) big_copy(&(apowers[0]), ma);
1920 
1921 	if ((err = big_mont_mul(&tmp1, ma, ma, n, n0)) != BIG_OK) {
1922 		goto ret;
1923 	}
1924 	(void) big_copy(ma, &tmp1);
1925 
1926 	for (i = 1; i < apowerssize; i++) {
1927 		if ((err = big_mont_mul(&tmp1, ma,
1928 		    &(apowers[i - 1]), n, n0)) != BIG_OK) {
1929 			goto ret;
1930 		}
1931 		(void) big_copy(&apowers[i], &tmp1);
1932 	}
1933 
1934 	bitind = nbits % BIG_CHUNK_SIZE;
1935 	k = 0;
1936 	l = 0;
1937 	p = 0;
1938 	bitcount = 0;
1939 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
1940 		for (j = bitind - 1; j >= 0; j--) {
1941 			bit = (e->value[i] >> j) & 1;
1942 			if ((bitcount == 0) && (bit == 0)) {
1943 				if ((err = big_mont_mul(tmp,
1944 				    tmp, tmp, n, n0)) != BIG_OK) {
1945 					goto ret;
1946 				}
1947 			} else {
1948 				bitcount++;
1949 				p = p * 2 + bit;
1950 				if (bit == 1) {
1951 					k = k + l + 1;
1952 					l = 0;
1953 				} else {
1954 					l++;
1955 				}
1956 				if (bitcount == groupbits) {
1957 					for (m = 0; m < k; m++) {
1958 						if ((err = big_mont_mul(tmp,
1959 						    tmp, tmp, n, n0)) !=
1960 						    BIG_OK) {
1961 							goto ret;
1962 						}
1963 					}
1964 					if ((err = big_mont_mul(tmp, tmp,
1965 					    &(apowers[p >> (l + 1)]),
1966 					    n, n0)) != BIG_OK) {
1967 						goto ret;
1968 					}
1969 					for (m = 0; m < l; m++) {
1970 						if ((err = big_mont_mul(tmp,
1971 						    tmp, tmp, n, n0)) !=
1972 						    BIG_OK) {
1973 							goto ret;
1974 						}
1975 					}
1976 					k = 0;
1977 					l = 0;
1978 					p = 0;
1979 					bitcount = 0;
1980 				}
1981 			}
1982 		}
1983 		bitind = BIG_CHUNK_SIZE;
1984 	}
1985 
1986 	for (m = 0; m < k; m++) {
1987 		if ((err = big_mont_mul(tmp, tmp, tmp, n, n0)) != BIG_OK) {
1988 			goto ret;
1989 		}
1990 	}
1991 	if (p != 0) {
1992 		if ((err = big_mont_mul(tmp, tmp,
1993 		    &(apowers[p >> (l + 1)]), n, n0)) != BIG_OK) {
1994 			goto ret;
1995 		}
1996 	}
1997 	for (m = 0; m < l; m++) {
1998 		if ((err = big_mont_mul(result, tmp, tmp, n, n0)) != BIG_OK) {
1999 			goto ret;
2000 		}
2001 	}
2002 
2003 ret:
2004 	for (i = apowerssize - 1; i >= 0; i--) {
2005 		big_finish(&(apowers[i]));
2006 	}
2007 	big_finish(&tmp1);
2008 
2009 	return (err);
2010 }
2011 
2012 
2013 #ifdef USE_FLOATING_POINT
2014 
2015 #ifdef _KERNEL
2016 
2017 #include <sys/sysmacros.h>
2018 #include <sys/regset.h>
2019 #include <sys/fpu/fpusystm.h>
2020 
2021 /* the alignment for block stores to save fp registers */
2022 #define	FPR_ALIGN	(64)
2023 
2024 extern void big_savefp(kfpu_t *);
2025 extern void big_restorefp(kfpu_t *);
2026 
2027 #endif /* _KERNEL */
2028 
2029 /*
2030  * This version makes use of floating point for performance
2031  */
2032 static BIG_ERR_CODE
2033 big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
2034     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
2035 {
2036 
2037 	int		i, j, k, l, m, p;
2038 	uint32_t	bit, bitind, bitcount, nlen;
2039 	double		dn0;
2040 	double		*dn, *dt, *d16r, *d32r;
2041 	uint32_t	*nint, *prod;
2042 	double		*apowers[APOWERS_MAX_SIZE];
2043 	uint32_t	nbits, groupbits, apowerssize;
2044 	BIG_ERR_CODE	err = BIG_OK;
2045 
2046 #ifdef _KERNEL
2047 	uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
2048 	kfpu_t *fpu;
2049 
2050 #ifdef DEBUG
2051 	if (!fpu_exists)
2052 		return (BIG_GENERAL_ERR);
2053 #endif
2054 
2055 	fpu =  (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, FPR_ALIGN);
2056 	big_savefp(fpu);
2057 
2058 #endif /* _KERNEL */
2059 
2060 	nbits = big_numbits(e);
2061 	if (nbits < 50) {
2062 		groupbits = 1;
2063 		apowerssize = 1;
2064 	} else {
2065 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
2066 		apowerssize = 1 << (groupbits - 1);
2067 	}
2068 
2069 	nlen = (BIG_CHUNK_SIZE / 32) * n->len;
2070 	dn0 = (double)(n0 & 0xffff);
2071 
2072 	dn = dt = d16r = d32r = NULL;
2073 	nint = prod = NULL;
2074 	for (i = 0; i < apowerssize; i++) {
2075 		apowers[i] = NULL;
2076 	}
2077 
2078 	if ((dn = big_malloc(nlen * sizeof (double))) == NULL) {
2079 		err = BIG_NO_MEM;
2080 		goto ret;
2081 	}
2082 	if ((dt = big_malloc((4 * nlen + 2) * sizeof (double))) == NULL) {
2083 		err = BIG_NO_MEM;
2084 		goto ret;
2085 	}
2086 	if ((nint = big_malloc(nlen * sizeof (uint32_t))) == NULL) {
2087 		err = BIG_NO_MEM;
2088 		goto ret;
2089 	}
2090 	if ((prod = big_malloc((nlen + 1) * sizeof (uint32_t))) == NULL) {
2091 		err = BIG_NO_MEM;
2092 		goto ret;
2093 	}
2094 	if ((d16r = big_malloc((2 * nlen + 1) * sizeof (double))) == NULL) {
2095 		err = BIG_NO_MEM;
2096 		goto ret;
2097 	}
2098 	if ((d32r = big_malloc(nlen * sizeof (double))) == NULL) {
2099 		err = BIG_NO_MEM;
2100 		goto ret;
2101 	}
2102 	for (i = 0; i < apowerssize; i++) {
2103 		if ((apowers[i] = big_malloc((2 * nlen + 1) *
2104 		    sizeof (double))) == NULL) {
2105 			err = BIG_NO_MEM;
2106 			goto ret;
2107 		}
2108 	}
2109 
2110 #if (BIG_CHUNK_SIZE == 32)
2111 	for (i = 0; i < ma->len; i++) {
2112 		nint[i] = ma->value[i];
2113 	}
2114 	for (; i < nlen; i++) {
2115 		nint[i] = 0;
2116 	}
2117 #else
2118 	for (i = 0; i < ma->len; i++) {
2119 		nint[2 * i] = (uint32_t)(ma->value[i] & 0xffffffffULL);
2120 		nint[2 * i + 1] = (uint32_t)(ma->value[i] >> 32);
2121 	}
2122 	for (i = ma->len * 2; i < nlen; i++) {
2123 		nint[i] = 0;
2124 	}
2125 #endif
2126 	conv_i32_to_d32_and_d16(d32r, apowers[0], nint, nlen);
2127 
2128 #if (BIG_CHUNK_SIZE == 32)
2129 	for (i = 0; i < n->len; i++) {
2130 		nint[i] = n->value[i];
2131 	}
2132 	for (; i < nlen; i++) {
2133 		nint[i] = 0;
2134 	}
2135 #else
2136 	for (i = 0; i < n->len; i++) {
2137 		nint[2 * i] = (uint32_t)(n->value[i] & 0xffffffffULL);
2138 		nint[2 * i + 1] = (uint32_t)(n->value[i] >> 32);
2139 	}
2140 	for (i = n->len * 2; i < nlen; i++) {
2141 		nint[i] = 0;
2142 	}
2143 #endif
2144 	conv_i32_to_d32(dn, nint, nlen);
2145 
2146 	mont_mulf_noconv(prod, d32r, apowers[0], dt, dn, nint, nlen, dn0);
2147 	conv_i32_to_d32(d32r, prod, nlen);
2148 	for (i = 1; i < apowerssize; i++) {
2149 		mont_mulf_noconv(prod, d32r, apowers[i - 1],
2150 		    dt, dn, nint, nlen, dn0);
2151 		conv_i32_to_d16(apowers[i], prod, nlen);
2152 	}
2153 
2154 #if (BIG_CHUNK_SIZE == 32)
2155 	for (i = 0; i < tmp->len; i++) {
2156 		prod[i] = tmp->value[i];
2157 	}
2158 	for (; i < nlen + 1; i++) {
2159 		prod[i] = 0;
2160 	}
2161 #else
2162 	for (i = 0; i < tmp->len; i++) {
2163 		prod[2 * i] = (uint32_t)(tmp->value[i] & 0xffffffffULL);
2164 		prod[2 * i + 1] = (uint32_t)(tmp->value[i] >> 32);
2165 	}
2166 	for (i = tmp->len * 2; i < nlen + 1; i++) {
2167 		prod[i] = 0;
2168 	}
2169 #endif
2170 
2171 	bitind = nbits % BIG_CHUNK_SIZE;
2172 	k = 0;
2173 	l = 0;
2174 	p = 0;
2175 	bitcount = 0;
2176 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
2177 		for (j = bitind - 1; j >= 0; j--) {
2178 			bit = (e->value[i] >> j) & 1;
2179 			if ((bitcount == 0) && (bit == 0)) {
2180 				conv_i32_to_d32_and_d16(d32r, d16r,
2181 				    prod, nlen);
2182 				mont_mulf_noconv(prod, d32r, d16r,
2183 				    dt, dn, nint, nlen, dn0);
2184 			} else {
2185 				bitcount++;
2186 				p = p * 2 + bit;
2187 				if (bit == 1) {
2188 					k = k + l + 1;
2189 					l = 0;
2190 				} else {
2191 					l++;
2192 				}
2193 				if (bitcount == groupbits) {
2194 					for (m = 0; m < k; m++) {
2195 						conv_i32_to_d32_and_d16(d32r,
2196 						    d16r, prod, nlen);
2197 						mont_mulf_noconv(prod, d32r,
2198 						    d16r, dt, dn, nint,
2199 						    nlen, dn0);
2200 					}
2201 					conv_i32_to_d32(d32r, prod, nlen);
2202 					mont_mulf_noconv(prod, d32r,
2203 					    apowers[p >> (l + 1)],
2204 					    dt, dn, nint, nlen, dn0);
2205 					for (m = 0; m < l; m++) {
2206 						conv_i32_to_d32_and_d16(d32r,
2207 						    d16r, prod, nlen);
2208 						mont_mulf_noconv(prod, d32r,
2209 						    d16r, dt, dn, nint,
2210 						    nlen, dn0);
2211 					}
2212 					k = 0;
2213 					l = 0;
2214 					p = 0;
2215 					bitcount = 0;
2216 				}
2217 			}
2218 		}
2219 		bitind = BIG_CHUNK_SIZE;
2220 	}
2221 
2222 	for (m = 0; m < k; m++) {
2223 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2224 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2225 	}
2226 	if (p != 0) {
2227 		conv_i32_to_d32(d32r, prod, nlen);
2228 		mont_mulf_noconv(prod, d32r, apowers[p >> (l + 1)],
2229 		    dt, dn, nint, nlen, dn0);
2230 	}
2231 	for (m = 0; m < l; m++) {
2232 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2233 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2234 	}
2235 
2236 #if (BIG_CHUNK_SIZE == 32)
2237 	for (i = 0; i < nlen; i++) {
2238 		result->value[i] = prod[i];
2239 	}
2240 	for (i = nlen - 1; (i > 0) && (prod[i] == 0); i--)
2241 		;
2242 #else
2243 	for (i = 0; i < nlen / 2; i++) {
2244 		result->value[i] = (uint64_t)(prod[2 * i]) +
2245 		    (((uint64_t)(prod[2 * i + 1])) << 32);
2246 	}
2247 	for (i = nlen / 2 - 1; (i > 0) && (result->value[i] == 0); i--)
2248 		;
2249 #endif
2250 	result->len = i + 1;
2251 
2252 ret:
2253 	for (i = apowerssize - 1; i >= 0; i--) {
2254 		if (apowers[i] != NULL)
2255 			big_free(apowers[i], (2 * nlen + 1) * sizeof (double));
2256 	}
2257 	if (d32r != NULL) {
2258 		big_free(d32r, nlen * sizeof (double));
2259 	}
2260 	if (d16r != NULL) {
2261 		big_free(d16r, (2 * nlen + 1) * sizeof (double));
2262 	}
2263 	if (prod != NULL) {
2264 		big_free(prod, (nlen + 1) * sizeof (uint32_t));
2265 	}
2266 	if (nint != NULL) {
2267 		big_free(nint, nlen * sizeof (uint32_t));
2268 	}
2269 	if (dt != NULL) {
2270 		big_free(dt, (4 * nlen + 2) * sizeof (double));
2271 	}
2272 	if (dn != NULL) {
2273 		big_free(dn, nlen * sizeof (double));
2274 	}
2275 
2276 #ifdef _KERNEL
2277 	big_restorefp(fpu);
2278 #endif
2279 
2280 	return (err);
2281 }
2282 
2283 #endif /* USE_FLOATING_POINT */
2284 
2285 
2286 BIG_ERR_CODE
2287 big_modexp_ext(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr,
2288     big_modexp_ncp_info_t *info)
2289 {
2290 	BIGNUM		ma, tmp, rr;
2291 	BIG_CHUNK_TYPE	mavalue[BIGTMPSIZE];
2292 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2293 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
2294 	BIG_ERR_CODE	err;
2295 	BIG_CHUNK_TYPE	n0;
2296 
2297 	if ((err = big_init1(&ma, n->len, mavalue, arraysize(mavalue)))	!=
2298 	    BIG_OK) {
2299 		return (err);
2300 	}
2301 	ma.len = 1;
2302 	ma.value[0] = 0;
2303 
2304 	if ((err = big_init1(&tmp, 2 * n->len + 1,
2305 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2306 		goto ret1;
2307 	}
2308 
2309 	/* clear the malloced bit to help cleanup */
2310 	rr.malloced = 0;
2311 
2312 	if (n_rr == NULL) {
2313 		if ((err = big_init1(&rr, 2 * n->len + 1,
2314 		    rrvalue, arraysize(rrvalue))) != BIG_OK) {
2315 			goto ret2;
2316 		}
2317 		if (big_mont_rr(&rr, n) != BIG_OK) {
2318 			goto ret;
2319 		}
2320 		n_rr = &rr;
2321 	}
2322 
2323 	n0 = big_n0(n->value[0]);
2324 
2325 	if (big_cmp_abs(a, n) > 0) {
2326 		if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) {
2327 			goto ret;
2328 		}
2329 		err = big_mont_conv(&ma, &ma, n, n0, n_rr);
2330 	} else {
2331 		err = big_mont_conv(&ma, a, n, n0, n_rr);
2332 	}
2333 	if (err != BIG_OK) {
2334 		goto ret;
2335 	}
2336 
2337 	tmp.len = 1;
2338 	tmp.value[0] = 1;
2339 	if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) {
2340 		goto ret;
2341 	}
2342 
2343 	if ((info != NULL) && (info->func != NULL)) {
2344 		err = (*(info->func))(&tmp, &ma, e, n, &tmp, n0,
2345 		    info->ncp, info->reqp);
2346 	} else {
2347 		err = big_modexp_ncp_sw(&tmp, &ma, e, n, &tmp, n0);
2348 	}
2349 	if (err != BIG_OK) {
2350 		goto ret;
2351 	}
2352 
2353 	ma.value[0] = 1;
2354 	ma.len = 1;
2355 	if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) {
2356 		goto ret;
2357 	}
2358 	err = big_copy(result, &tmp);
2359 
2360 ret:
2361 	if (rr.malloced) {
2362 		big_finish(&rr);
2363 	}
2364 ret2:
2365 	big_finish(&tmp);
2366 ret1:
2367 	big_finish(&ma);
2368 
2369 	return (err);
2370 }
2371 
2372 BIG_ERR_CODE
2373 big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr)
2374 {
2375 	return (big_modexp_ext(result, a, e, n, n_rr, NULL));
2376 }
2377 
2378 
2379 BIG_ERR_CODE
2380 big_modexp_crt_ext(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2381     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2382     BIGNUM *p_rr, BIGNUM *q_rr, big_modexp_ncp_info_t *info)
2383 {
2384 	BIGNUM		ap, aq, tmp;
2385 	int		alen, biglen, sign;
2386 	BIG_ERR_CODE	err;
2387 
2388 	if (p->len > q->len) {
2389 		biglen = p->len;
2390 	} else {
2391 		biglen = q->len;
2392 	}
2393 
2394 	if ((err = big_init1(&ap, p->len, NULL, 0)) != BIG_OK) {
2395 		return (err);
2396 	}
2397 	if ((err = big_init1(&aq, q->len, NULL, 0)) != BIG_OK) {
2398 		goto ret1;
2399 	}
2400 	if ((err = big_init1(&tmp, biglen + q->len + 1, NULL, 0)) != BIG_OK) {
2401 		goto ret2;
2402 	}
2403 
2404 	/*
2405 	 * check whether a is too short - to avoid timing attacks
2406 	 */
2407 	alen = a->len;
2408 	while ((alen > p->len) && (a->value[alen - 1] == 0)) {
2409 		alen--;
2410 	}
2411 	if (alen < p->len + q->len) {
2412 		/*
2413 		 * a is too short, add p*q to it before
2414 		 * taking it modulo p and q
2415 		 * this will also affect timing, but this difference
2416 		 * does not depend on p or q, only on a
2417 		 * (in "normal" operation, this path will never be
2418 		 * taken, so it is not a performance penalty
2419 		 */
2420 		if ((err = big_mul(&tmp, p, q)) != BIG_OK) {
2421 			goto ret;
2422 		}
2423 		if ((err = big_add(&tmp, &tmp, a)) != BIG_OK) {
2424 			goto ret;
2425 		}
2426 		if ((err = big_div_pos(NULL, &ap, &tmp, p)) != BIG_OK) {
2427 			goto ret;
2428 		}
2429 		if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2430 			goto ret;
2431 		}
2432 	} else {
2433 		if ((err = big_div_pos(NULL, &ap, a, p)) != BIG_OK) {
2434 			goto ret;
2435 		}
2436 		if ((err = big_div_pos(NULL, &aq, a, q)) != BIG_OK) {
2437 			goto ret;
2438 		}
2439 	}
2440 
2441 	if ((err = big_modexp_ext(&ap, &ap, dmodpminus1, p, p_rr, info)) !=
2442 	    BIG_OK) {
2443 		goto ret;
2444 	}
2445 	if ((err = big_modexp_ext(&aq, &aq, dmodqminus1, q, q_rr, info)) !=
2446 	    BIG_OK) {
2447 		goto ret;
2448 	}
2449 	if ((err = big_sub(&tmp, &aq, &ap)) != BIG_OK) {
2450 		goto ret;
2451 	}
2452 	if ((err = big_mul(&tmp, &tmp, pinvmodq)) != BIG_OK) {
2453 		goto ret;
2454 	}
2455 	sign = tmp.sign;
2456 	tmp.sign = 1;
2457 	if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2458 		goto ret;
2459 	}
2460 	if ((sign == -1) && (!big_is_zero(&aq))) {
2461 		(void) big_sub_pos(&aq, q, &aq);
2462 	}
2463 	if ((err = big_mul(&tmp, &aq, p)) != BIG_OK) {
2464 		goto ret;
2465 	}
2466 	err = big_add_abs(result, &ap, &tmp);
2467 
2468 ret:
2469 	big_finish(&tmp);
2470 ret2:
2471 	big_finish(&aq);
2472 ret1:
2473 	big_finish(&ap);
2474 
2475 	return (err);
2476 }
2477 
2478 
2479 BIG_ERR_CODE
2480 big_modexp_crt(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2481     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2482     BIGNUM *p_rr, BIGNUM *q_rr)
2483 {
2484 	return (big_modexp_crt_ext(result, a, dmodpminus1, dmodqminus1,
2485 	    p, q, pinvmodq, p_rr, q_rr, NULL));
2486 }
2487 
2488 
2489 static BIG_CHUNK_TYPE onearr[1] = {(BIG_CHUNK_TYPE)1};
2490 BIGNUM big_One = {1, 1, 1, 0, onearr};
2491 
2492 static BIG_CHUNK_TYPE twoarr[1] = {(BIG_CHUNK_TYPE)2};
2493 BIGNUM big_Two = {1, 1, 1, 0, twoarr};
2494 
2495 static BIG_CHUNK_TYPE fourarr[1] = {(BIG_CHUNK_TYPE)4};
2496 static BIGNUM big_Four = {1, 1, 1, 0, fourarr};
2497 
2498 
2499 BIG_ERR_CODE
2500 big_sqrt_pos(BIGNUM *result, BIGNUM *n)
2501 {
2502 	BIGNUM		*high, *low, *mid, *t;
2503 	BIGNUM		t1, t2, t3, prod;
2504 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2505 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2506 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2507 	BIG_CHUNK_TYPE	prodvalue[BIGTMPSIZE];
2508 	int		i, diff;
2509 	uint32_t	nbits, nrootbits, highbits;
2510 	BIG_ERR_CODE	err;
2511 
2512 	nbits = big_numbits(n);
2513 
2514 	if ((err = big_init1(&t1, n->len + 1,
2515 	    t1value, arraysize(t1value))) != BIG_OK)
2516 		return (err);
2517 	if ((err = big_init1(&t2, n->len + 1,
2518 	    t2value, arraysize(t2value))) != BIG_OK)
2519 		goto ret1;
2520 	if ((err = big_init1(&t3, n->len + 1,
2521 	    t3value, arraysize(t3value))) != BIG_OK)
2522 		goto ret2;
2523 	if ((err = big_init1(&prod, n->len + 1,
2524 	    prodvalue, arraysize(prodvalue))) != BIG_OK)
2525 		goto ret3;
2526 
2527 	nrootbits = (nbits + 1) / 2;
2528 	t1.len = t2.len = t3.len = (nrootbits - 1) / BIG_CHUNK_SIZE + 1;
2529 	for (i = 0; i < t1.len; i++) {
2530 		t1.value[i] = 0;
2531 		t2.value[i] = BIG_CHUNK_ALLBITS;
2532 	}
2533 	highbits = nrootbits - BIG_CHUNK_SIZE * (t1.len - 1);
2534 	if (highbits == BIG_CHUNK_SIZE) {
2535 		t1.value[t1.len - 1] = BIG_CHUNK_HIGHBIT;
2536 		t2.value[t2.len - 1] = BIG_CHUNK_ALLBITS;
2537 	} else {
2538 		t1.value[t1.len - 1] = (BIG_CHUNK_TYPE)1 << (highbits - 1);
2539 		t2.value[t2.len - 1] = 2 * t1.value[t1.len - 1] - 1;
2540 	}
2541 
2542 	high = &t2;
2543 	low = &t1;
2544 	mid = &t3;
2545 
2546 	if ((err = big_mul(&prod, high, high)) != BIG_OK) {
2547 		goto ret;
2548 	}
2549 	diff = big_cmp_abs(&prod, n);
2550 	if (diff <= 0) {
2551 		err = big_copy(result, high);
2552 		goto ret;
2553 	}
2554 
2555 	(void) big_sub_pos(mid, high, low);
2556 	while (big_cmp_abs(&big_One, mid) != 0) {
2557 		(void) big_add_abs(mid, high, low);
2558 		(void) big_half_pos(mid, mid);
2559 		if ((err = big_mul(&prod, mid, mid)) != BIG_OK)
2560 			goto ret;
2561 		diff = big_cmp_abs(&prod, n);
2562 		if (diff > 0) {
2563 			t = high;
2564 			high = mid;
2565 			mid = t;
2566 		} else if (diff < 0) {
2567 			t = low;
2568 			low = mid;
2569 			mid = t;
2570 		} else {
2571 			err = big_copy(result, low);
2572 			goto ret;
2573 		}
2574 		(void) big_sub_pos(mid, high, low);
2575 	}
2576 
2577 	err = big_copy(result, low);
2578 ret:
2579 	if (prod.malloced) big_finish(&prod);
2580 ret3:
2581 	if (t3.malloced) big_finish(&t3);
2582 ret2:
2583 	if (t2.malloced) big_finish(&t2);
2584 ret1:
2585 	if (t1.malloced) big_finish(&t1);
2586 
2587 	return (err);
2588 }
2589 
2590 
2591 BIG_ERR_CODE
2592 big_Jacobi_pos(int *jac, BIGNUM *nn, BIGNUM *mm)
2593 {
2594 	BIGNUM		*t, *tmp2, *m, *n;
2595 	BIGNUM		t1, t2, t3;
2596 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2597 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2598 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2599 	int		len, err;
2600 
2601 	if (big_is_zero(nn) ||
2602 	    (((nn->value[0] & 1) | (mm->value[0] & 1)) == 0)) {
2603 		*jac = 0;
2604 		return (BIG_OK);
2605 	}
2606 
2607 	if (nn->len > mm->len) {
2608 		len = nn->len;
2609 	} else {
2610 		len = mm->len;
2611 	}
2612 
2613 	if ((err = big_init1(&t1, len,
2614 	    t1value, arraysize(t1value))) != BIG_OK) {
2615 		return (err);
2616 	}
2617 	if ((err = big_init1(&t2, len,
2618 	    t2value, arraysize(t2value))) != BIG_OK) {
2619 		goto ret1;
2620 	}
2621 	if ((err = big_init1(&t3, len,
2622 	    t3value, arraysize(t3value))) != BIG_OK) {
2623 		goto ret2;
2624 	}
2625 
2626 	n = &t1;
2627 	m = &t2;
2628 	tmp2 = &t3;
2629 
2630 	(void) big_copy(n, nn);
2631 	(void) big_copy(m, mm);
2632 
2633 	*jac = 1;
2634 	while (big_cmp_abs(&big_One, m) != 0) {
2635 		if (big_is_zero(n)) {
2636 			*jac = 0;
2637 			goto ret;
2638 		}
2639 		if ((m->value[0] & 1) == 0) {
2640 			if (((n->value[0] & 7) == 3) ||
2641 			    ((n->value[0] & 7) == 5))
2642 				*jac = -*jac;
2643 			(void) big_half_pos(m, m);
2644 		} else if ((n->value[0] & 1) == 0) {
2645 			if (((m->value[0] & 7) == 3) ||
2646 			    ((m->value[0] & 7) == 5))
2647 				*jac = -*jac;
2648 			(void) big_half_pos(n, n);
2649 		} else {
2650 			if (((m->value[0] & 3) == 3) &&
2651 			    ((n->value[0] & 3) == 3)) {
2652 				*jac = -*jac;
2653 			}
2654 			if ((err = big_div_pos(NULL, tmp2, m, n)) != BIG_OK) {
2655 				goto ret;
2656 			}
2657 			t = tmp2;
2658 			tmp2 = m;
2659 			m = n;
2660 			n = t;
2661 		}
2662 	}
2663 	err = BIG_OK;
2664 
2665 ret:
2666 	if (t3.malloced) big_finish(&t3);
2667 ret2:
2668 	if (t2.malloced) big_finish(&t2);
2669 ret1:
2670 	if (t1.malloced) big_finish(&t1);
2671 
2672 	return (err);
2673 }
2674 
2675 
2676 BIG_ERR_CODE
2677 big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
2678 {
2679 	int		i;
2680 	uint32_t	m, w;
2681 	BIG_CHUNK_TYPE	bit;
2682 	BIGNUM		ki, tmp, tmp2;
2683 	BIG_CHUNK_TYPE	kivalue[BIGTMPSIZE];
2684 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2685 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
2686 	BIG_ERR_CODE	err;
2687 
2688 	if (big_cmp_abs(k, &big_One) == 0) {
2689 		(void) big_copy(Lk, p);
2690 		(void) big_copy(Lkminus1, &big_Two);
2691 		return (BIG_OK);
2692 	}
2693 
2694 	if ((err = big_init1(&ki, k->len + 1,
2695 	    kivalue, arraysize(kivalue))) != BIG_OK)
2696 		return (err);
2697 
2698 	if ((err = big_init1(&tmp, 2 * n->len + 1,
2699 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK)
2700 		goto ret1;
2701 
2702 	if ((err = big_init1(&tmp2, n->len,
2703 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
2704 		goto ret2;
2705 
2706 	m = big_numbits(k);
2707 	ki.len = (m - 1) / BIG_CHUNK_SIZE + 1;
2708 	w = (m - 1) / BIG_CHUNK_SIZE;
2709 	bit = (BIG_CHUNK_TYPE)1 << ((m - 1) % BIG_CHUNK_SIZE);
2710 	for (i = 0; i < ki.len; i++) {
2711 		ki.value[i] = 0;
2712 	}
2713 	ki.value[ki.len - 1] = bit;
2714 	if (big_cmp_abs(k, &ki) != 0) {
2715 		(void) big_double(&ki, &ki);
2716 	}
2717 	(void) big_sub_pos(&ki, &ki, k);
2718 
2719 	(void) big_copy(Lk, p);
2720 	(void) big_copy(Lkminus1, &big_Two);
2721 
2722 	for (i = 0; i < m; i++) {
2723 		if ((err = big_mul(&tmp, Lk, Lkminus1)) != BIG_OK) {
2724 			goto ret;
2725 		}
2726 		(void) big_add_abs(&tmp, &tmp, n);
2727 		(void) big_sub_pos(&tmp, &tmp, p);
2728 		if ((err = big_div_pos(NULL, &tmp2, &tmp, n)) != BIG_OK) {
2729 			goto ret;
2730 		}
2731 		if ((ki.value[w] & bit) != 0) {
2732 			if ((err = big_mul(&tmp, Lkminus1, Lkminus1)) !=
2733 			    BIG_OK) {
2734 				goto ret;
2735 			}
2736 			(void) big_add_abs(&tmp, &tmp, n);
2737 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2738 			if ((err = big_div_pos(NULL, Lkminus1, &tmp, n)) !=
2739 			    BIG_OK) {
2740 				goto ret;
2741 			}
2742 			(void) big_copy(Lk, &tmp2);
2743 		} else {
2744 			if ((err = big_mul(&tmp, Lk, Lk)) != BIG_OK) {
2745 				goto ret;
2746 			}
2747 			(void) big_add_abs(&tmp, &tmp, n);
2748 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2749 			if ((err = big_div_pos(NULL, Lk, &tmp, n)) != BIG_OK) {
2750 				goto ret;
2751 			}
2752 			(void) big_copy(Lkminus1, &tmp2);
2753 		}
2754 		bit = bit >> 1;
2755 		if (bit == 0) {
2756 			bit = BIG_CHUNK_HIGHBIT;
2757 			w--;
2758 		}
2759 	}
2760 
2761 	err = BIG_OK;
2762 
2763 ret:
2764 	if (tmp2.malloced) big_finish(&tmp2);
2765 ret2:
2766 	if (tmp.malloced) big_finish(&tmp);
2767 ret1:
2768 	if (ki.malloced) big_finish(&ki);
2769 
2770 	return (err);
2771 }
2772 
2773 
2774 BIG_ERR_CODE
2775 big_isprime_pos_ext(BIGNUM *n, big_modexp_ncp_info_t *info)
2776 {
2777 	BIGNUM		o, nminus1, tmp, Lkminus1, Lk;
2778 	BIG_CHUNK_TYPE	ovalue[BIGTMPSIZE];
2779 	BIG_CHUNK_TYPE	nminus1value[BIGTMPSIZE];
2780 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2781 	BIG_CHUNK_TYPE	Lkminus1value[BIGTMPSIZE];
2782 	BIG_CHUNK_TYPE	Lkvalue[BIGTMPSIZE];
2783 	BIG_ERR_CODE	err;
2784 	int		e, i, jac;
2785 
2786 	if (big_cmp_abs(n, &big_One) == 0) {
2787 		return (BIG_FALSE);
2788 	}
2789 	if (big_cmp_abs(n, &big_Two) == 0) {
2790 		return (BIG_TRUE);
2791 	}
2792 	if ((n->value[0] & 1) == 0) {
2793 		return (BIG_FALSE);
2794 	}
2795 
2796 	if ((err = big_init1(&o, n->len, ovalue, arraysize(ovalue))) !=
2797 	    BIG_OK) {
2798 		return (err);
2799 	}
2800 
2801 	if ((err = big_init1(&nminus1, n->len,
2802 	    nminus1value, arraysize(nminus1value))) != BIG_OK) {
2803 		goto ret1;
2804 	}
2805 
2806 	if ((err = big_init1(&tmp, 2 * n->len,
2807 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2808 		goto ret2;
2809 	}
2810 
2811 	if ((err = big_init1(&Lkminus1, n->len,
2812 	    Lkminus1value, arraysize(Lkminus1value))) != BIG_OK) {
2813 		goto ret3;
2814 	}
2815 
2816 	if ((err = big_init1(&Lk, n->len,
2817 	    Lkvalue, arraysize(Lkvalue))) != BIG_OK) {
2818 		goto ret4;
2819 	}
2820 
2821 	(void) big_sub_pos(&o, n, &big_One);	/* cannot fail */
2822 	(void) big_copy(&nminus1, &o);		/* cannot fail */
2823 	e = 0;
2824 	while ((o.value[0] & 1) == 0) {
2825 		e++;
2826 		(void) big_half_pos(&o, &o);  /* cannot fail */
2827 	}
2828 	if ((err = big_modexp_ext(&tmp, &big_Two, &o, n, NULL, info)) !=
2829 	    BIG_OK) {
2830 		goto ret;
2831 	}
2832 
2833 	i = 0;
2834 	while ((i < e) &&
2835 	    (big_cmp_abs(&tmp, &big_One) != 0) &&
2836 	    (big_cmp_abs(&tmp, &nminus1) != 0)) {
2837 		if ((err =
2838 		    big_modexp_ext(&tmp, &tmp, &big_Two, n, NULL, info)) !=
2839 		    BIG_OK)
2840 			goto ret;
2841 		i++;
2842 	}
2843 
2844 	if (!((big_cmp_abs(&tmp, &nminus1) == 0) ||
2845 	    ((i == 0) && (big_cmp_abs(&tmp, &big_One) == 0)))) {
2846 		err = BIG_FALSE;
2847 		goto ret;
2848 	}
2849 
2850 	if ((err = big_sqrt_pos(&tmp, n)) != BIG_OK) {
2851 		goto ret;
2852 	}
2853 
2854 	if ((err = big_mul(&tmp, &tmp, &tmp)) != BIG_OK) {
2855 		goto ret;
2856 	}
2857 	if (big_cmp_abs(&tmp, n) == 0) {
2858 		err = BIG_FALSE;
2859 		goto ret;
2860 	}
2861 
2862 	(void) big_copy(&o, &big_Two);
2863 	do {
2864 		(void) big_add_abs(&o, &o, &big_One);
2865 		if ((err = big_mul(&tmp, &o, &o)) != BIG_OK) {
2866 			goto ret;
2867 		}
2868 		(void) big_sub_pos(&tmp, &tmp, &big_Four);
2869 		if ((err = big_Jacobi_pos(&jac, &tmp, n)) != BIG_OK) {
2870 			goto ret;
2871 		}
2872 	} while (jac != -1);
2873 
2874 	(void) big_add_abs(&tmp, n, &big_One);
2875 	if ((err = big_Lucas(&Lkminus1, &Lk, &o, &tmp, n)) != BIG_OK) {
2876 		goto ret;
2877 	}
2878 
2879 	if ((big_cmp_abs(&Lkminus1, &o) == 0) &&
2880 	    (big_cmp_abs(&Lk, &big_Two) == 0)) {
2881 		err = BIG_TRUE;
2882 	} else {
2883 		err = BIG_FALSE;
2884 	}
2885 
2886 ret:
2887 	if (Lk.malloced) big_finish(&Lk);
2888 ret4:
2889 	if (Lkminus1.malloced) big_finish(&Lkminus1);
2890 ret3:
2891 	if (tmp.malloced) big_finish(&tmp);
2892 ret2:
2893 	if (nminus1.malloced) big_finish(&nminus1);
2894 ret1:
2895 	if (o.malloced) big_finish(&o);
2896 
2897 	return (err);
2898 }
2899 
2900 
2901 BIG_ERR_CODE
2902 big_isprime_pos(BIGNUM *n)
2903 {
2904 	return (big_isprime_pos_ext(n, NULL));
2905 }
2906 
2907 
2908 #define	SIEVESIZE 1000
2909 
2910 
2911 BIG_ERR_CODE
2912 big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
2913 {
2914 	static const uint32_t smallprimes[] = {
2915 	    3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
2916 	    51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97 };
2917 	BIG_ERR_CODE	err;
2918 	int		sieve[SIEVESIZE];
2919 	int		i;
2920 	uint32_t	off, p;
2921 
2922 	if ((err = big_copy(result, n)) != BIG_OK) {
2923 		return (err);
2924 	}
2925 	result->value[0] |= 1;
2926 	/* CONSTCOND */
2927 	while (1) {
2928 		for (i = 0; i < SIEVESIZE; i++) sieve[i] = 0;
2929 		for (i = 0;
2930 		    i < sizeof (smallprimes) / sizeof (smallprimes[0]); i++) {
2931 			p = smallprimes[i];
2932 			off = big_modhalf_pos(result, p);
2933 			off = p - off;
2934 			if ((off % 2) == 1) {
2935 				off = off + p;
2936 			}
2937 			off = off / 2;
2938 			while (off < SIEVESIZE) {
2939 				sieve[off] = 1;
2940 				off = off + p;
2941 			}
2942 		}
2943 
2944 		for (i = 0; i < SIEVESIZE; i++) {
2945 			if (sieve[i] == 0) {
2946 				err = big_isprime_pos_ext(result, info);
2947 				if (err != BIG_FALSE) {
2948 					if (err != BIG_TRUE) {
2949 						return (err);
2950 					} else {
2951 						goto out;
2952 					}
2953 				}
2954 
2955 			}
2956 			if ((err = big_add_abs(result, result, &big_Two)) !=
2957 			    BIG_OK) {
2958 				return (err);
2959 			}
2960 		}
2961 	}
2962 
2963 out:
2964 	return (BIG_OK);
2965 }
2966 
2967 
2968 BIG_ERR_CODE
2969 big_nextprime_pos(BIGNUM *result, BIGNUM *n)
2970 {
2971 	return (big_nextprime_pos_ext(result, n, NULL));
2972 }
2973 
2974 
2975 BIG_ERR_CODE
2976 big_nextprime_pos_slow(BIGNUM *result, BIGNUM *n)
2977 {
2978 	BIG_ERR_CODE err;
2979 
2980 
2981 	if ((err = big_copy(result, n)) != BIG_OK)
2982 		return (err);
2983 	result->value[0] |= 1;
2984 	while ((err = big_isprime_pos_ext(result, NULL)) != BIG_TRUE) {
2985 		if (err != BIG_FALSE)
2986 			return (err);
2987 		if ((err = big_add_abs(result, result, &big_Two)) != BIG_OK)
2988 			return (err);
2989 	}
2990 	return (BIG_OK);
2991 }
2992 
2993 
2994 /*
2995  * given m and e, computes the rest in the equation
2996  * gcd(m, e) = cm * m + ce * e
2997  */
2998 BIG_ERR_CODE
2999 big_ext_gcd_pos(BIGNUM *gcd, BIGNUM *cm, BIGNUM *ce, BIGNUM *m, BIGNUM *e)
3000 {
3001 	BIGNUM		*xi, *ri, *riminus1, *riminus2, *t;
3002 	BIGNUM		*vmi, *vei, *vmiminus1, *veiminus1;
3003 	BIGNUM		t1, t2, t3, t4, t5, t6, t7, t8, tmp;
3004 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
3005 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
3006 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
3007 	BIG_CHUNK_TYPE	t4value[BIGTMPSIZE];
3008 	BIG_CHUNK_TYPE	t5value[BIGTMPSIZE];
3009 	BIG_CHUNK_TYPE	t6value[BIGTMPSIZE];
3010 	BIG_CHUNK_TYPE	t7value[BIGTMPSIZE];
3011 	BIG_CHUNK_TYPE	t8value[BIGTMPSIZE];
3012 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
3013 	BIG_ERR_CODE	err;
3014 	int		len;
3015 
3016 	if (big_cmp_abs(m, e) >= 0) {
3017 		len = m->len;
3018 	} else {
3019 		len = e->len;
3020 	}
3021 
3022 	if ((err = big_init1(&t1, len,
3023 	    t1value, arraysize(t1value))) != BIG_OK) {
3024 		return (err);
3025 	}
3026 	if ((err = big_init1(&t2, len,
3027 	    t2value, arraysize(t2value))) != BIG_OK) {
3028 		goto ret1;
3029 	}
3030 	if ((err = big_init1(&t3, len,
3031 	    t3value, arraysize(t3value))) != BIG_OK) {
3032 		goto ret2;
3033 	}
3034 	if ((err = big_init1(&t4, len,
3035 	    t4value, arraysize(t3value))) != BIG_OK) {
3036 		goto ret3;
3037 	}
3038 	if ((err = big_init1(&t5, len,
3039 	    t5value, arraysize(t5value))) != BIG_OK) {
3040 		goto ret4;
3041 	}
3042 	if ((err = big_init1(&t6, len,
3043 	    t6value, arraysize(t6value))) != BIG_OK) {
3044 		goto ret5;
3045 	}
3046 	if ((err = big_init1(&t7, len,
3047 	    t7value, arraysize(t7value))) != BIG_OK) {
3048 		goto ret6;
3049 	}
3050 	if ((err = big_init1(&t8, len,
3051 	    t8value, arraysize(t8value))) != BIG_OK) {
3052 		goto ret7;
3053 	}
3054 
3055 	if ((err = big_init1(&tmp, 2 * len,
3056 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
3057 		goto ret8;
3058 	}
3059 
3060 	ri = &t1;
3061 	ri->value[0] = 1;
3062 	ri->len = 1;
3063 	xi = &t2;
3064 	riminus1 = &t3;
3065 	riminus2 = &t4;
3066 	vmi = &t5;
3067 	vei = &t6;
3068 	vmiminus1 = &t7;
3069 	veiminus1 = &t8;
3070 
3071 	(void) big_copy(vmiminus1, &big_One);
3072 	(void) big_copy(vmi, &big_One);
3073 	(void) big_copy(veiminus1, &big_One);
3074 	(void) big_copy(xi, &big_One);
3075 	vei->len = 1;
3076 	vei->value[0] = 0;
3077 
3078 	(void) big_copy(riminus1, m);
3079 	(void) big_copy(ri, e);
3080 
3081 	while (!big_is_zero(ri)) {
3082 		t = riminus2;
3083 		riminus2 = riminus1;
3084 		riminus1 = ri;
3085 		ri = t;
3086 		if ((err = big_mul(&tmp, vmi, xi)) != BIG_OK) {
3087 			goto ret;
3088 		}
3089 		if ((err = big_sub(vmiminus1, vmiminus1, &tmp)) != BIG_OK) {
3090 			goto ret;
3091 		}
3092 		t = vmiminus1;
3093 		vmiminus1 = vmi;
3094 		vmi = t;
3095 		if ((err = big_mul(&tmp, vei, xi)) != BIG_OK) {
3096 			goto ret;
3097 		}
3098 		if ((err = big_sub(veiminus1, veiminus1, &tmp)) != BIG_OK) {
3099 			goto ret;
3100 		}
3101 		t = veiminus1;
3102 		veiminus1 = vei;
3103 		vei = t;
3104 		if ((err = big_div_pos(xi, ri, riminus2, riminus1)) !=
3105 		    BIG_OK) {
3106 			goto ret;
3107 		}
3108 	}
3109 	if ((gcd != NULL) && ((err = big_copy(gcd, riminus1)) != BIG_OK)) {
3110 		goto ret;
3111 	}
3112 	if ((cm != NULL) && ((err = big_copy(cm, vmi)) != BIG_OK)) {
3113 		goto ret;
3114 	}
3115 	if (ce != NULL) {
3116 		err = big_copy(ce, vei);
3117 	}
3118 ret:
3119 	if (tmp.malloced) big_finish(&tmp);
3120 ret8:
3121 	if (t8.malloced) big_finish(&t8);
3122 ret7:
3123 	if (t7.malloced) big_finish(&t7);
3124 ret6:
3125 	if (t6.malloced) big_finish(&t6);
3126 ret5:
3127 	if (t5.malloced) big_finish(&t5);
3128 ret4:
3129 	if (t4.malloced) big_finish(&t4);
3130 ret3:
3131 	if (t3.malloced) big_finish(&t3);
3132 ret2:
3133 	if (t2.malloced) big_finish(&t2);
3134 ret1:
3135 	if (t1.malloced) big_finish(&t1);
3136 
3137 	return (err);
3138 }
3139