xref: /titanic_41/usr/src/common/bignum/i386/bignum_i386_asm.s (revision 6a634c9dca3093f3922e4b7ab826d7bdf17bf78e)
17c478bd9Sstevel@tonic-gate/*
27c478bd9Sstevel@tonic-gate * CDDL HEADER START
37c478bd9Sstevel@tonic-gate *
47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5*7417cfdeSKuriakose Kuruvilla * Common Development and Distribution License (the "License").
6*7417cfdeSKuriakose Kuruvilla * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate *
87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate * and limitations under the License.
127c478bd9Sstevel@tonic-gate *
137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate *
197c478bd9Sstevel@tonic-gate * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate/*
22*7417cfdeSKuriakose Kuruvilla * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
237c478bd9Sstevel@tonic-gate */
247c478bd9Sstevel@tonic-gate
257c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
267c478bd9Sstevel@tonic-gate#include <sys/x86_archext.h>
277c478bd9Sstevel@tonic-gate#include <sys/controlregs.h>
287c478bd9Sstevel@tonic-gate
297c478bd9Sstevel@tonic-gate#if defined(__lint)
307c478bd9Sstevel@tonic-gate
317c478bd9Sstevel@tonic-gate#include <sys/types.h>
327c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub for the assembly routine bignum_use_sse2().
 * The stub always returns 0; the real routine is in the non-lint
 * half of this file.
 */
337c478bd9Sstevel@tonic-gateuint32_t
347c478bd9Sstevel@tonic-gatebignum_use_sse2()
357c478bd9Sstevel@tonic-gate{ return (0); }
367c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub for big_mul_set_vec_sse2_r().  The assembly routine
 * takes its arguments in registers, which is why the stub has no
 * parameter list.
 */
377c478bd9Sstevel@tonic-gate/* Not to be called by C code */
387c478bd9Sstevel@tonic-gate/* ARGSUSED */
397c478bd9Sstevel@tonic-gateuint32_t
407c478bd9Sstevel@tonic-gatebig_mul_set_vec_sse2_r()
417c478bd9Sstevel@tonic-gate{ return (0); }
427c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub for big_mul_add_vec_sse2_r().  The assembly routine
 * takes its arguments in registers, which is why the stub has no
 * parameter list.
 */
437c478bd9Sstevel@tonic-gate/* Not to be called by C code */
447c478bd9Sstevel@tonic-gate/* ARGSUSED */
457c478bd9Sstevel@tonic-gateuint32_t
467c478bd9Sstevel@tonic-gatebig_mul_add_vec_sse2_r()
477c478bd9Sstevel@tonic-gate{ return (0); }
487c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub.  The assembly routine computes r = a * digit over
 * len digits and returns the carry digit; this stub just returns 0.
 */
497c478bd9Sstevel@tonic-gate/* ARGSUSED */
507c478bd9Sstevel@tonic-gateuint32_t
517c478bd9Sstevel@tonic-gatebig_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
527c478bd9Sstevel@tonic-gate{ return (0); }
537c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub.  The assembly routine computes r = r + a * digit
 * over len digits and returns the carry digit; this stub just
 * returns 0.
 */
547c478bd9Sstevel@tonic-gate/* ARGSUSED */
557c478bd9Sstevel@tonic-gateuint32_t
567c478bd9Sstevel@tonic-gatebig_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
577c478bd9Sstevel@tonic-gate{ return (0); }
587c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub with an empty body; it exists so lint sees the
 * prototype of the SSE2 vector-multiply routine.
 */
597c478bd9Sstevel@tonic-gate/* ARGSUSED */
607c478bd9Sstevel@tonic-gatevoid
617c478bd9Sstevel@tonic-gatebig_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
627c478bd9Sstevel@tonic-gate{}
637c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub with an empty body; it exists so lint sees the
 * prototype of the SSE2 vector-square routine.
 */
647c478bd9Sstevel@tonic-gate/* ARGSUSED */
657c478bd9Sstevel@tonic-gatevoid
667c478bd9Sstevel@tonic-gatebig_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
677c478bd9Sstevel@tonic-gate{}
687c478bd9Sstevel@tonic-gate
697c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
707c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub for the "_nsv" variant of big_mul_set_vec_sse2(),
 * which is only built when MMX_MANAGE is defined.  Always returns 0.
 */
717c478bd9Sstevel@tonic-gate/* ARGSUSED */
727c478bd9Sstevel@tonic-gateuint32_t
737c478bd9Sstevel@tonic-gatebig_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
747c478bd9Sstevel@tonic-gate{ return (0); }
757c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub for the "_nsv" variant of big_mul_add_vec_sse2(),
 * which is only built when MMX_MANAGE is defined.  Always returns 0.
 */
767c478bd9Sstevel@tonic-gate/* ARGSUSED */
777c478bd9Sstevel@tonic-gateuint32_t
787c478bd9Sstevel@tonic-gatebig_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
797c478bd9Sstevel@tonic-gate{ return (0); }
807c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub with an empty body for big_sqr_vec_sse2_fc(), which
 * is only built when MMX_MANAGE is defined.
 */
817c478bd9Sstevel@tonic-gate/* Not to be called by C code */
827c478bd9Sstevel@tonic-gate/* ARGSUSED */
837c478bd9Sstevel@tonic-gatevoid
847c478bd9Sstevel@tonic-gatebig_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
857c478bd9Sstevel@tonic-gate{}
867c478bd9Sstevel@tonic-gate
877c478bd9Sstevel@tonic-gate#endif	/* MMX_MANAGE */
887c478bd9Sstevel@tonic-gate
897c478bd9Sstevel@tonic-gate/*
907c478bd9Sstevel@tonic-gate * UMUL
917c478bd9Sstevel@tonic-gate *
927c478bd9Sstevel@tonic-gate */
937c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub for the MUL-instruction (non-SSE2) flavour of the
 * "set" vector multiply.  Always returns 0.
 */
947c478bd9Sstevel@tonic-gate/* ARGSUSED */
957c478bd9Sstevel@tonic-gateuint32_t
967c478bd9Sstevel@tonic-gatebig_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
977c478bd9Sstevel@tonic-gate{ return (0); }
987c478bd9Sstevel@tonic-gate
/*
 * Lint-only stub for the MUL-instruction (non-SSE2) flavour of the
 * "add" vector multiply.  Always returns 0.
 */
997c478bd9Sstevel@tonic-gate/* ARGSUSED */
1007c478bd9Sstevel@tonic-gateuint32_t
1017c478bd9Sstevel@tonic-gatebig_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1027c478bd9Sstevel@tonic-gate{ return (0); }
1037c478bd9Sstevel@tonic-gate
1047c478bd9Sstevel@tonic-gate#else	/* __lint */
1057c478bd9Sstevel@tonic-gate
1067c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
1077c478bd9Sstevel@tonic-gate
/*
 * Preemption and CR0.TS helpers used by the MMX_MANAGE wrappers.
 *
 * Kernel build:
 *   KPREEMPT_DISABLE / KPREEMPT_ENABLE expand to calls to kpr_disable
 *   and kpr_enable.
 *   TEST_TS(reg) copies %cr0 into reg, executes clts, and then tests the
 *   CR0_TS bit of the saved copy.  ZF is clear afterwards iff TS was set
 *   on entry; reg still holds the original %cr0 value.
 *
 * Non-kernel build:
 *   The preemption macros expand to nothing.
 *   TEST_TS(reg) loads 0 into reg, so the test always leaves ZF set and
 *   a following jnz is never taken.
 */
1087c478bd9Sstevel@tonic-gate#if defined(_KERNEL)
1097c478bd9Sstevel@tonic-gate
1107c478bd9Sstevel@tonic-gate#define	KPREEMPT_DISABLE call kpr_disable
1117c478bd9Sstevel@tonic-gate#define	KPREEMPT_ENABLE call kpr_enable
1127c478bd9Sstevel@tonic-gate#define	TEST_TS(reg)					\
1137c478bd9Sstevel@tonic-gate	movl	%cr0, reg;				\
1147c478bd9Sstevel@tonic-gate	clts;						\
1157c478bd9Sstevel@tonic-gate	testl	$CR0_TS, reg
1167c478bd9Sstevel@tonic-gate
1177c478bd9Sstevel@tonic-gate#else	/* _KERNEL */
1187c478bd9Sstevel@tonic-gate
1197c478bd9Sstevel@tonic-gate#define	KPREEMPT_DISABLE
1207c478bd9Sstevel@tonic-gate#define	KPREEMPT_ENABLE
1217c478bd9Sstevel@tonic-gate
1227c478bd9Sstevel@tonic-gate#define	TEST_TS(reg)					\
1237c478bd9Sstevel@tonic-gate	movl	$0, reg;				\
1247c478bd9Sstevel@tonic-gate	testl	$CR0_TS, reg
1257c478bd9Sstevel@tonic-gate
1267c478bd9Sstevel@tonic-gate#endif	/* _KERNEL */
1277c478bd9Sstevel@tonic-gate
/*
 * Stack scratch area for saving MMX registers.
 *
 * SAVE_MMX_PROLOG(sreg, nreg) lowers %esp by
 * _MUL(MMX_SIZE, nreg + MMX_ALIGN) bytes, then sets sreg to
 * %esp + MMX_ALIGN masked with the expression -1![MMX_ALIGN-1].
 * RSTOR_MMX_EPILOG(nreg) adds the same byte count back to %esp, so the
 * two must always be used with the same nreg.
 *
 * NOTE(review): the mask relies on the assembler's "!" operator;
 * presumably it yields ~(MMX_ALIGN-1) so that sreg is MMX_ALIGN-aligned
 * -- confirm against the assembler in use.
 * NOTE(review): _MUL is defined elsewhere.  If it is a plain product,
 * the reserved size is MMX_SIZE * (nreg + MMX_ALIGN) rather than
 * MMX_SIZE * nreg + MMX_ALIGN; that only over-reserves stack, but
 * confirm the intent.
 */
1287c478bd9Sstevel@tonic-gate#define	MMX_SIZE 8
1297c478bd9Sstevel@tonic-gate#define	MMX_ALIGN 8
1307c478bd9Sstevel@tonic-gate
1317c478bd9Sstevel@tonic-gate#define	SAVE_MMX_PROLOG(sreg, nreg)			\
1329e1a718fSdarrenm	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
1337c478bd9Sstevel@tonic-gate	movl	%esp, sreg;				\
1347c478bd9Sstevel@tonic-gate	addl	$MMX_ALIGN, sreg;			\
1357c478bd9Sstevel@tonic-gate	andl	$-1![MMX_ALIGN-1], sreg;
1367c478bd9Sstevel@tonic-gate
1377c478bd9Sstevel@tonic-gate#define	RSTOR_MMX_EPILOG(nreg)				\
1389e1a718fSdarrenm	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;
1397c478bd9Sstevel@tonic-gate
/*
 * SAVE_MMX_0TO4(sreg) reserves the scratch area via SAVE_MMX_PROLOG
 * and stores %mm0..%mm4 at offsets 0, 8, 16, 24 and 32 from sreg.
 * RSTOR_MMX_0TO4(sreg) reloads the five registers from the same offsets
 * and then releases the scratch area via RSTOR_MMX_EPILOG.
 * sreg must hold the same value at restore time as it did after the
 * save; %esp must also be back at its post-save value.
 */
1407c478bd9Sstevel@tonic-gate#define	SAVE_MMX_0TO4(sreg)			\
1417c478bd9Sstevel@tonic-gate	SAVE_MMX_PROLOG(sreg, 5);		\
1427c478bd9Sstevel@tonic-gate	movq	%mm0, 0(sreg);			\
1437c478bd9Sstevel@tonic-gate	movq	%mm1, 8(sreg);			\
1447c478bd9Sstevel@tonic-gate	movq	%mm2, 16(sreg);			\
1457c478bd9Sstevel@tonic-gate	movq	%mm3, 24(sreg);			\
1467c478bd9Sstevel@tonic-gate	movq	%mm4, 32(sreg)
1477c478bd9Sstevel@tonic-gate
1487c478bd9Sstevel@tonic-gate#define	RSTOR_MMX_0TO4(sreg)			\
1497c478bd9Sstevel@tonic-gate	movq	0(sreg), %mm0;			\
1507c478bd9Sstevel@tonic-gate	movq	8(sreg), %mm1;			\
1517c478bd9Sstevel@tonic-gate	movq	16(sreg), %mm2;			\
1527c478bd9Sstevel@tonic-gate	movq	24(sreg), %mm3;			\
1537c478bd9Sstevel@tonic-gate	movq	32(sreg), %mm4;			\
1547c478bd9Sstevel@tonic-gate	RSTOR_MMX_EPILOG(5)
1557c478bd9Sstevel@tonic-gate
1567c478bd9Sstevel@tonic-gate#endif	/* MMX_MANAGE */
1577c478bd9Sstevel@tonic-gate
1587c478bd9Sstevel@tonic-gate/ Note: this file contains implementations for
1597c478bd9Sstevel@tonic-gate/	big_mul_set_vec()
1607c478bd9Sstevel@tonic-gate/	big_mul_add_vec()
1617c478bd9Sstevel@tonic-gate/	big_mul_vec()
1627c478bd9Sstevel@tonic-gate/	big_sqr_vec()
1637c478bd9Sstevel@tonic-gate/ One set of implementations is for SSE2-capable models.
1647c478bd9Sstevel@tonic-gate/ The other uses no MMX, SSE, or SSE2 instructions, only
1657c478bd9Sstevel@tonic-gate/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
1667c478bd9Sstevel@tonic-gate/
1677c478bd9Sstevel@tonic-gate/ The code for the implementations is grouped by SSE2 vs UMUL,
1687c478bd9Sstevel@tonic-gate/ rather than grouping pairs of implementations for each function.
1697c478bd9Sstevel@tonic-gate/ This is because the bignum implementation gets "imprinted"
1707c478bd9Sstevel@tonic-gate/ on the correct implementation, at the time of first use,
1717c478bd9Sstevel@tonic-gate/ so none of the code for the other implementations is ever
1727c478bd9Sstevel@tonic-gate/ executed.  So, it is a no-brainer to lay out the code to minimize
1737c478bd9Sstevel@tonic-gate/ the "footprint" of executed code.
1747c478bd9Sstevel@tonic-gate
1757c478bd9Sstevel@tonic-gate/ Can we use SSE2 instructions?  Return value is non-zero
1767c478bd9Sstevel@tonic-gate/ if we can.
1777c478bd9Sstevel@tonic-gate/
1787c478bd9Sstevel@tonic-gate/ Note:
1797c478bd9Sstevel@tonic-gate/   Using the cpuid instruction directly would work equally
1807c478bd9Sstevel@tonic-gate/   well in userland and in the kernel, but we do not use the
181*7417cfdeSKuriakose Kuruvilla/   cpuid instruction in the kernel, we use x86_featureset,
182*7417cfdeSKuriakose Kuruvilla/   instead.  This means we honor any decisions the kernel
183*7417cfdeSKuriakose Kuruvilla/   startup code may have made in setting this variable,
184*7417cfdeSKuriakose Kuruvilla/   including disabling SSE2.  It might even be a good idea
185*7417cfdeSKuriakose Kuruvilla/   to honor this kind of setting in userland, as well, but
186*7417cfdeSKuriakose Kuruvilla/   the variable, x86_featureset is not readily available to
187*7417cfdeSKuriakose Kuruvilla/   userland processes.
1887c478bd9Sstevel@tonic-gate/
1897c478bd9Sstevel@tonic-gate/ uint32_t
1907c478bd9Sstevel@tonic-gate/ bignum_use_sse2()
1917c478bd9Sstevel@tonic-gate
/ bignum_use_sse2: returns non-zero in %eax when SSE2 may be used.
/
/ Kernel build: %eax is cleared, bt copies the X86FSET_SSE2 bit of
/ x86_featureset into CF, and adc %eax, %eax turns CF into 0 or 1.
/ NOTE(review): with an immediate bit index, bt takes the index modulo
/ the operand size, so this presumably relies on X86FSET_SSE2 being
/ smaller than the operand width -- confirm.
/
/ Non-kernel build: executes cpuid with %eax = 1 and returns the %edx
/ feature word masked with CPUID_INTC_EDX_SSE2.  %ebx is pushed and
/ popped around cpuid because cpuid overwrites it.
1927c478bd9Sstevel@tonic-gate	ENTRY(bignum_use_sse2)
1937c478bd9Sstevel@tonic-gate#if defined(_KERNEL)
194*7417cfdeSKuriakose Kuruvilla	xor	%eax, %eax
195*7417cfdeSKuriakose Kuruvilla	bt	$X86FSET_SSE2, x86_featureset
196*7417cfdeSKuriakose Kuruvilla	adc     %eax, %eax
1977c478bd9Sstevel@tonic-gate#else	/* _KERNEL */
1987c478bd9Sstevel@tonic-gate	pushl	%ebx
1997c478bd9Sstevel@tonic-gate	movl	$1, %eax		/ Get feature information
2007c478bd9Sstevel@tonic-gate	cpuid
2017c478bd9Sstevel@tonic-gate	movl	%edx, %eax		/ set return value
2027c478bd9Sstevel@tonic-gate	popl	%ebx
2037c478bd9Sstevel@tonic-gate	andl	$CPUID_INTC_EDX_SSE2, %eax
2047c478bd9Sstevel@tonic-gate#endif	/* _KERNEL */
2057c478bd9Sstevel@tonic-gate	ret
2067c478bd9Sstevel@tonic-gate	SET_SIZE(bignum_use_sse2)
2077c478bd9Sstevel@tonic-gate
2087c478bd9Sstevel@tonic-gate
2097c478bd9Sstevel@tonic-gate/ ------------------------------------------------------------------------
2107c478bd9Sstevel@tonic-gate/		SSE2 Implementations
2117c478bd9Sstevel@tonic-gate/ ------------------------------------------------------------------------
2127c478bd9Sstevel@tonic-gate
2137c478bd9Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len
2147c478bd9Sstevel@tonic-gate/ returns the carry digit
2157c478bd9Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
2167c478bd9Sstevel@tonic-gate/
2177c478bd9Sstevel@tonic-gate/ uint32_t
2187c478bd9Sstevel@tonic-gate/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
2197c478bd9Sstevel@tonic-gate/
2207c478bd9Sstevel@tonic-gate/ r	%edx
2217c478bd9Sstevel@tonic-gate/ a	%ebx
2227c478bd9Sstevel@tonic-gate/ len	%ecx
2237c478bd9Sstevel@tonic-gate/ digit	%mm3
2247c478bd9Sstevel@tonic-gate/
2257c478bd9Sstevel@tonic-gate/ Does not touch the following registers: %esi, %edi, %mm4
2267c478bd9Sstevel@tonic-gate/
2277c478bd9Sstevel@tonic-gate/ N.B.:
2287c478bd9Sstevel@tonic-gate/   This is strictly for internal use.
2297c478bd9Sstevel@tonic-gate/   The interface is very light-weight.
2307c478bd9Sstevel@tonic-gate/   All parameters are passed in registers.
2317c478bd9Sstevel@tonic-gate/   It does not conform to the SYSV x86 ABI.
2327c478bd9Sstevel@tonic-gate/   So, don't even think about calling this function directly from C code.
2337c478bd9Sstevel@tonic-gate/
2347c478bd9Sstevel@tonic-gate/ The basic multiply digit loop is unrolled 8 times.
2357c478bd9Sstevel@tonic-gate/ Each comment is preceded by an instance number.
2367c478bd9Sstevel@tonic-gate/ Instructions that have been moved retain their original, "natural"
2377c478bd9Sstevel@tonic-gate/ instance number.  It should be easier this way to follow
2387c478bd9Sstevel@tonic-gate/ the step-wise refinement process that went into constructing
2397c478bd9Sstevel@tonic-gate/ the final code.
2407c478bd9Sstevel@tonic-gate
/ UNROLL is the number of digits handled per pass of the unrolled loops.
/ UNROLL32 is the matching byte stride (UNROLL digits of 4 bytes each)
/ used to advance the a and r pointers.
2417c478bd9Sstevel@tonic-gate#define	UNROLL		8
2427c478bd9Sstevel@tonic-gate#define	UNROLL32	32
2437c478bd9Sstevel@tonic-gate
/ big_mul_set_vec_sse2_r
/
/ Computes r[i] = low32(a[i] * digit + cy) for i in [0, len) and
/ returns the final carry in %eax.
/
/ Inputs (register-based, not ABI conformant):
/	%edx	r
/	%ebx	a
/	%ecx	len
/	%mm3	digit
/ Modifies %eax, %ebx, %ecx, %edx, %mm0 and %mm1.
/ No emms is issued here; that is left to the caller.
/
/ The carry register %mm0 is cleared before the length test, so that
/ a call with len == 0 returns 0.  Previously the len == 0 path jumped
/ to .L17 with %mm0 never initialised, and the movd there replaced the
/ zero in %eax with whatever the caller had left in %mm0.

	ENTRY(big_mul_set_vec_sse2_r)
	pxor	%mm0, %mm0	/ cy = 0 (also the len == 0 return value)
	testl	%ecx, %ecx	/ if (len == 0) return (cy);
	jz	.L17

.L15:
	cmpl	$UNROLL, %ecx
	jl	.L16
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L17
	jmp	.L15

	/ Tail: between 1 and UNROLL - 1 digits remain.  Each step exits
	/ as soon as len reaches zero; the seventh step needs no test
	/ because it falls straight through to .L17.
.L16:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)
3687c478bd9Sstevel@tonic-gate
3697c478bd9Sstevel@tonic-gate
3707c478bd9Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len
3717c478bd9Sstevel@tonic-gate/ returns the carry digit
3727c478bd9Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
3737c478bd9Sstevel@tonic-gate/
3747c478bd9Sstevel@tonic-gate/ r		 8(%ebp)	%edx
3757c478bd9Sstevel@tonic-gate/ a		12(%ebp)	%ebx
3767c478bd9Sstevel@tonic-gate/ len		16(%ebp)	%ecx
3777c478bd9Sstevel@tonic-gate/ digit		20(%ebp)	%mm3
3787c478bd9Sstevel@tonic-gate/
3797c478bd9Sstevel@tonic-gate/ In userland, there is just the one function, big_mul_set_vec_sse2().
3807c478bd9Sstevel@tonic-gate/ But in the kernel, there are two variations:
3817c478bd9Sstevel@tonic-gate/    1. big_mul_set_vec_sse2() which does what is necessary to save and
3827c478bd9Sstevel@tonic-gate/       restore state, if necessary, and to ensure that preemption is
3837c478bd9Sstevel@tonic-gate/       disabled.
3847c478bd9Sstevel@tonic-gate/    2. big_mul_set_vec_sse2_nsv() which just does the work;
3857c478bd9Sstevel@tonic-gate/       it is the caller's responsibility to ensure that MMX state
3867c478bd9Sstevel@tonic-gate/       does not need to be saved and restored and that preemption
3877c478bd9Sstevel@tonic-gate/       is already disabled.
3887c478bd9Sstevel@tonic-gate
3897c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
/ big_mul_set_vec_sse2 (MMX_MANAGE build): C-callable wrapper around
/ big_mul_set_vec_sse2_r that manages preemption, CR0.TS and MMX state.
/
/ Sequence:
/   1. Build a frame, save %ebx and %esi, run KPREEMPT_DISABLE.
/   2. TEST_TS(%ebx) leaves the original %cr0 value in %ebx; it is
/      pushed so it can be written back on exit.  pushl does not change
/      the flags, so the jnz still sees the result of the test.
/   3. If TS was set, branch to .setvec_no_save and skip the MMX save.
/      Otherwise save %edi, save %mm0..%mm4 in a stack scratch area
/      addressed by %edi, and restore both after the call.
/   4. Load the stack arguments into %edx, %ebx, %ecx and %mm3, call
/      the worker, and keep its carry result in %esi.
/   5. .setvec_rtn: emms, write the saved value back to %cr0, run
/      KPREEMPT_ENABLE, move the carry to %eax and unwind.
/
/ NOTE(review): the %cr0 write at .setvec_rtn is not guarded by
/ _KERNEL; presumably MMX_MANAGE is only defined for kernel builds --
/ confirm.
3907c478bd9Sstevel@tonic-gate	ENTRY(big_mul_set_vec_sse2)
3917c478bd9Sstevel@tonic-gate	pushl	%ebp
3927c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
3937c478bd9Sstevel@tonic-gate	pushl	%ebx
3947c478bd9Sstevel@tonic-gate	pushl	%esi
3957c478bd9Sstevel@tonic-gate	KPREEMPT_DISABLE
3967c478bd9Sstevel@tonic-gate	TEST_TS(%ebx)
3977c478bd9Sstevel@tonic-gate	pushl	%ebx
3987c478bd9Sstevel@tonic-gate	jnz	.setvec_no_save
3997c478bd9Sstevel@tonic-gate	pushl	%edi
4007c478bd9Sstevel@tonic-gate	SAVE_MMX_0TO4(%edi)
4017c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
4027c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
4037c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
4047c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
4057c478bd9Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4067c478bd9Sstevel@tonic-gate	movl	%eax, %esi
4077c478bd9Sstevel@tonic-gate	RSTOR_MMX_0TO4(%edi)
4087c478bd9Sstevel@tonic-gate	popl	%edi
4097c478bd9Sstevel@tonic-gate	jmp	.setvec_rtn
4107c478bd9Sstevel@tonic-gate
4117c478bd9Sstevel@tonic-gate.setvec_no_save:
4127c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
4137c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
4147c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
4157c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
4167c478bd9Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4177c478bd9Sstevel@tonic-gate	movl	%eax, %esi
4187c478bd9Sstevel@tonic-gate
4197c478bd9Sstevel@tonic-gate.setvec_rtn:
4207c478bd9Sstevel@tonic-gate	emms
4217c478bd9Sstevel@tonic-gate	popl	%ebx
4227c478bd9Sstevel@tonic-gate	movl	%ebx, %cr0
4237c478bd9Sstevel@tonic-gate	KPREEMPT_ENABLE
4247c478bd9Sstevel@tonic-gate	movl	%esi, %eax
4257c478bd9Sstevel@tonic-gate	popl	%esi
4267c478bd9Sstevel@tonic-gate	popl	%ebx
4277c478bd9Sstevel@tonic-gate	leave
4287c478bd9Sstevel@tonic-gate	ret
4297c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_set_vec_sse2)
4307c478bd9Sstevel@tonic-gate
/ big_mul_set_vec_sse2_nsv: C-callable wrapper that only marshals the
/ stack arguments (r, a, len, digit) into %edx, %ebx, %ecx and %mm3 and
/ calls big_mul_set_vec_sse2_r.  The carry comes back in %eax untouched.
/ %ebx is saved and restored because the worker advances it.
/ This variant does no MMX save/restore, no preemption control and no
/ emms; all of that is left to the caller.
4317c478bd9Sstevel@tonic-gate	ENTRY(big_mul_set_vec_sse2_nsv)
4327c478bd9Sstevel@tonic-gate	pushl	%ebp
4337c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
4347c478bd9Sstevel@tonic-gate	pushl	%ebx
4357c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
4367c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
4377c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
4387c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
4397c478bd9Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4407c478bd9Sstevel@tonic-gate	popl	%ebx
4417c478bd9Sstevel@tonic-gate	leave
4427c478bd9Sstevel@tonic-gate	ret
4437c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_set_vec_sse2_nsv)
4447c478bd9Sstevel@tonic-gate
4457c478bd9Sstevel@tonic-gate#else	/* !defined(MMX_MANAGE) */
4467c478bd9Sstevel@tonic-gate
4477c478bd9Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len
4487c478bd9Sstevel@tonic-gate/ returns the carry digit
4497c478bd9Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
4507c478bd9Sstevel@tonic-gate/
4517c478bd9Sstevel@tonic-gate/ r		 8(%ebp)	%edx
4527c478bd9Sstevel@tonic-gate/ a		12(%ebp)	%ebx
4537c478bd9Sstevel@tonic-gate/ len		16(%ebp)	%ecx
4547c478bd9Sstevel@tonic-gate/ digit		20(%ebp)	%mm3
4557c478bd9Sstevel@tonic-gate
/ big_mul_set_vec_sse2 (build without MMX_MANAGE): C-callable wrapper
/ that loads the stack arguments into the registers expected by
/ big_mul_set_vec_sse2_r, calls it, and returns its carry in %eax.
/ %ebx is saved and restored because the worker advances it.
/ emms is issued before returning, since the worker leaves that to
/ its caller.
4567c478bd9Sstevel@tonic-gate	ENTRY(big_mul_set_vec_sse2)
4577c478bd9Sstevel@tonic-gate	pushl	%ebp
4587c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
4597c478bd9Sstevel@tonic-gate	pushl	%ebx
4607c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
4617c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
4627c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
4637c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
4647c478bd9Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4657c478bd9Sstevel@tonic-gate	popl	%ebx
4667c478bd9Sstevel@tonic-gate	emms
4677c478bd9Sstevel@tonic-gate	leave
4687c478bd9Sstevel@tonic-gate	ret
4697c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_set_vec_sse2)
4707c478bd9Sstevel@tonic-gate
4717c478bd9Sstevel@tonic-gate#endif	/* MMX_MANAGE */
4727c478bd9Sstevel@tonic-gate
4737c478bd9Sstevel@tonic-gate
4747c478bd9Sstevel@tonic-gate/ r = r + a * digit, r and a are vectors of length len
4757c478bd9Sstevel@tonic-gate/ returns the carry digit
4767c478bd9Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
4777c478bd9Sstevel@tonic-gate/
4787c478bd9Sstevel@tonic-gate/ uint32_t
4797c478bd9Sstevel@tonic-gate/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
4807c478bd9Sstevel@tonic-gate/
4817c478bd9Sstevel@tonic-gate/ r	%edx
4827c478bd9Sstevel@tonic-gate/ a	%ebx
4837c478bd9Sstevel@tonic-gate/ len	%ecx
4847c478bd9Sstevel@tonic-gate/ digit	%mm3
4857c478bd9Sstevel@tonic-gate/
4867c478bd9Sstevel@tonic-gate/ N.B.:
4877c478bd9Sstevel@tonic-gate/   This is strictly for internal use.
4887c478bd9Sstevel@tonic-gate/   The interface is very light-weight.
4897c478bd9Sstevel@tonic-gate/   All parameters are passed in registers.
4907c478bd9Sstevel@tonic-gate/   It does not conform to the SYSV x86 ABI.
4917c478bd9Sstevel@tonic-gate/   So, don't even think about calling this function directly from C code.
4927c478bd9Sstevel@tonic-gate/
4937c478bd9Sstevel@tonic-gate/ The basic multiply digit loop is unrolled 8 times.
4947c478bd9Sstevel@tonic-gate/ Each comment is preceded by an instance number.
4957c478bd9Sstevel@tonic-gate/ Instructions that have been moved retain their original, "natural"
4967c478bd9Sstevel@tonic-gate/ instance number.  It should be easier this way to follow
4977c478bd9Sstevel@tonic-gate/ the step-wise refinement process that went into constructing
4987c478bd9Sstevel@tonic-gate/ the final code.
4997c478bd9Sstevel@tonic-gate
5007c478bd9Sstevel@tonic-gate	ENTRY(big_mul_add_vec_sse2_r)
5017c478bd9Sstevel@tonic-gate	xorl	%eax, %eax
5027c478bd9Sstevel@tonic-gate	testl	%ecx, %ecx
5037c478bd9Sstevel@tonic-gate	jz	.L27
5047c478bd9Sstevel@tonic-gate
5057c478bd9Sstevel@tonic-gate	pxor	%mm0, %mm0	/ cy = 0
5067c478bd9Sstevel@tonic-gate
5077c478bd9Sstevel@tonic-gate.L25:
5087c478bd9Sstevel@tonic-gate	cmpl	$UNROLL, %ecx
5097c478bd9Sstevel@tonic-gate	jl	.L26
5107c478bd9Sstevel@tonic-gate	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
5117c478bd9Sstevel@tonic-gate	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
5127c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
5137c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
5147c478bd9Sstevel@tonic-gate	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
5157c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
5167c478bd9Sstevel@tonic-gate	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
5177c478bd9Sstevel@tonic-gate	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
5187c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 1: cy = product[63..32]
5197c478bd9Sstevel@tonic-gate
5207c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
5217c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
5227c478bd9Sstevel@tonic-gate	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
5237c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
5247c478bd9Sstevel@tonic-gate	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
5257c478bd9Sstevel@tonic-gate	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
5267c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 2: cy = product[63..32]
5277c478bd9Sstevel@tonic-gate
5287c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
5297c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
5307c478bd9Sstevel@tonic-gate	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
5317c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
5327c478bd9Sstevel@tonic-gate	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
5337c478bd9Sstevel@tonic-gate	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
5347c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 3: cy = product[63..32]
5357c478bd9Sstevel@tonic-gate
5367c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
5377c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
5387c478bd9Sstevel@tonic-gate	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
5397c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
5407c478bd9Sstevel@tonic-gate	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
5417c478bd9Sstevel@tonic-gate	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
5427c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 4: cy = product[63..32]
5437c478bd9Sstevel@tonic-gate
5447c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
5457c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
5467c478bd9Sstevel@tonic-gate	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
5477c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
5487c478bd9Sstevel@tonic-gate	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
5497c478bd9Sstevel@tonic-gate	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
5507c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 5: cy = product[63..32]
5517c478bd9Sstevel@tonic-gate
5527c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
5537c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
5547c478bd9Sstevel@tonic-gate	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
5557c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
5567c478bd9Sstevel@tonic-gate	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
5577c478bd9Sstevel@tonic-gate	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
5587c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 6: cy = product[63..32]
5597c478bd9Sstevel@tonic-gate
5607c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
5617c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
5627c478bd9Sstevel@tonic-gate	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
5637c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
5647c478bd9Sstevel@tonic-gate	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
5657c478bd9Sstevel@tonic-gate	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
5667c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 7: cy = product[63..32]
5677c478bd9Sstevel@tonic-gate
5687c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
5697c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
5707c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
5717c478bd9Sstevel@tonic-gate	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
5727c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 8: cy = product[63..32]
5737c478bd9Sstevel@tonic-gate
5747c478bd9Sstevel@tonic-gate	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
5757c478bd9Sstevel@tonic-gate	leal	UNROLL32(%edx), %edx	/ r += UNROLL
5767c478bd9Sstevel@tonic-gate	subl	$UNROLL, %ecx		/ len -= UNROLL
5777c478bd9Sstevel@tonic-gate	jz	.L27
5787c478bd9Sstevel@tonic-gate	jmp	.L25
5797c478bd9Sstevel@tonic-gate
5807c478bd9Sstevel@tonic-gate.L26:
5817c478bd9Sstevel@tonic-gate	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
5827c478bd9Sstevel@tonic-gate	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
5837c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
5847c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
5857c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
5867c478bd9Sstevel@tonic-gate	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
5877c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 1: cy = product[63..32]
5887c478bd9Sstevel@tonic-gate	subl	$1, %ecx
5897c478bd9Sstevel@tonic-gate	jz	.L27
5907c478bd9Sstevel@tonic-gate
5917c478bd9Sstevel@tonic-gate	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
5927c478bd9Sstevel@tonic-gate	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
5937c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
5947c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
5957c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
5967c478bd9Sstevel@tonic-gate	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
5977c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 2: cy = product[63..32]
5987c478bd9Sstevel@tonic-gate	subl	$1, %ecx
5997c478bd9Sstevel@tonic-gate	jz	.L27
6007c478bd9Sstevel@tonic-gate
6017c478bd9Sstevel@tonic-gate	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
6027c478bd9Sstevel@tonic-gate	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
6037c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
6047c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
6057c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
6067c478bd9Sstevel@tonic-gate	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
6077c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 3: cy = product[63..32]
6087c478bd9Sstevel@tonic-gate	subl	$1, %ecx
6097c478bd9Sstevel@tonic-gate	jz	.L27
6107c478bd9Sstevel@tonic-gate
6117c478bd9Sstevel@tonic-gate	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
6127c478bd9Sstevel@tonic-gate	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
6137c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
6147c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
6157c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
6167c478bd9Sstevel@tonic-gate	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
6177c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 4: cy = product[63..32]
6187c478bd9Sstevel@tonic-gate	subl	$1, %ecx
6197c478bd9Sstevel@tonic-gate	jz	.L27
6207c478bd9Sstevel@tonic-gate
6217c478bd9Sstevel@tonic-gate	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
6227c478bd9Sstevel@tonic-gate	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
6237c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
6247c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
6257c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
6267c478bd9Sstevel@tonic-gate	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
6277c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 5: cy = product[63..32]
6287c478bd9Sstevel@tonic-gate	subl	$1, %ecx
6297c478bd9Sstevel@tonic-gate	jz	.L27
6307c478bd9Sstevel@tonic-gate
6317c478bd9Sstevel@tonic-gate	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
6327c478bd9Sstevel@tonic-gate	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
6337c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
6347c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
6357c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
6367c478bd9Sstevel@tonic-gate	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
6377c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 6: cy = product[63..32]
6387c478bd9Sstevel@tonic-gate	subl	$1, %ecx
6397c478bd9Sstevel@tonic-gate	jz	.L27
6407c478bd9Sstevel@tonic-gate
6417c478bd9Sstevel@tonic-gate	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
6427c478bd9Sstevel@tonic-gate	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
6437c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
6447c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
6457c478bd9Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
6467c478bd9Sstevel@tonic-gate	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
6477c478bd9Sstevel@tonic-gate	psrlq	$32, %mm0	/ 7: cy = product[63..32]
6487c478bd9Sstevel@tonic-gate
6497c478bd9Sstevel@tonic-gate.L27:
6507c478bd9Sstevel@tonic-gate	movd	%mm0, %eax
6517c478bd9Sstevel@tonic-gate	/ no emms.  caller is responsible for emms
6527c478bd9Sstevel@tonic-gate	ret
6537c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_add_vec_sse2_r)
6547c478bd9Sstevel@tonic-gate
6557c478bd9Sstevel@tonic-gate
6567c478bd9Sstevel@tonic-gate/ r = r + a * digit, r and a are vectors of length len
6577c478bd9Sstevel@tonic-gate/ returns the carry digit
6587c478bd9Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
6597c478bd9Sstevel@tonic-gate/
6607c478bd9Sstevel@tonic-gate/ r		 8(%ebp)	%edx
6617c478bd9Sstevel@tonic-gate/ a		12(%ebp)	%ebx
6627c478bd9Sstevel@tonic-gate/ len		16(%ebp)	%ecx
6637c478bd9Sstevel@tonic-gate/ digit		20(%ebp)	%mm3
6647c478bd9Sstevel@tonic-gate/
6657c478bd9Sstevel@tonic-gate/ In userland, there is just the one function, big_mul_add_vec_sse2().
6667c478bd9Sstevel@tonic-gate/ But in the kernel, there are two variations:
6677c478bd9Sstevel@tonic-gate/    1. big_mul_add_vec_sse2() which does what is necessary to save and
6687c478bd9Sstevel@tonic-gate/       restore state, if necessary, and to ensure that preemption is
6697c478bd9Sstevel@tonic-gate/       disabled.
6707c478bd9Sstevel@tonic-gate/    2. big_mul_add_vec_sse2_nsv() which just does the work;
6717c478bd9Sstevel@tonic-gate/       it is the caller's responsibility to ensure that MMX state
6727c478bd9Sstevel@tonic-gate/       does not need to be saved and restored and that preemption
6737c478bd9Sstevel@tonic-gate/       is already disabled.
6747c478bd9Sstevel@tonic-gate
6757c478bd9Sstevel@tonic-gate
6767c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
6777c478bd9Sstevel@tonic-gate
/*
 * big_mul_add_vec_sse2(r, a, len, digit) -- MMX_MANAGE variant.
 *
 * Computes r = r + a * digit over len digits by calling
 * big_mul_add_vec_sse2_r, and returns the carry that routine leaves
 * in %eax.
 *
 * Around the call it brackets the work with KPREEMPT_DISABLE /
 * KPREEMPT_ENABLE, keeps the value TEST_TS leaves in %ebx on the stack so
 * that it can be written back to %cr0 on exit, and -- on the path where
 * the jnz below is not taken -- saves and restores MMX state through the
 * SAVE_MMX_0TO4 / RSTOR_MMX_0TO4 macros.  Those macros are defined outside
 * this block; by their names they cover %mm0 through %mm4 (TODO confirm).
 */
6787c478bd9Sstevel@tonic-gate	ENTRY(big_mul_add_vec_sse2)
6797c478bd9Sstevel@tonic-gate	pushl	%ebp
6807c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
/* %ebx and %esi are overwritten below, so keep the caller's copies. */
6817c478bd9Sstevel@tonic-gate	pushl	%ebx
6827c478bd9Sstevel@tonic-gate	pushl	%esi
6837c478bd9Sstevel@tonic-gate	KPREEMPT_DISABLE
6847c478bd9Sstevel@tonic-gate	TEST_TS(%ebx)
/*
 * pushl does not modify EFLAGS, so the jnz still acts on the result of
 * TEST_TS.  The pushed value is popped and written to %cr0 at .addvec_rtn.
 */
6857c478bd9Sstevel@tonic-gate	pushl	%ebx
6867c478bd9Sstevel@tonic-gate	jnz	.addvec_no_save
/* Save path: %edi is handed to the save/restore macros, so preserve it. */
6877c478bd9Sstevel@tonic-gate	pushl	%edi
6887c478bd9Sstevel@tonic-gate	SAVE_MMX_0TO4(%edi)
/* Marshal the stack arguments into the registers the worker expects. */
6897c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
6907c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
6917c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
6927c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
6937c478bd9Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
/* Hold the returned carry in %esi until the exit path moves it to %eax. */
6947c478bd9Sstevel@tonic-gate	movl	%eax, %esi
6957c478bd9Sstevel@tonic-gate	RSTOR_MMX_0TO4(%edi)
6967c478bd9Sstevel@tonic-gate	popl	%edi
6977c478bd9Sstevel@tonic-gate	jmp	.addvec_rtn
6987c478bd9Sstevel@tonic-gate
/* No-save path: same call sequence without the MMX save/restore. */
6997c478bd9Sstevel@tonic-gate.addvec_no_save:
7007c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
7017c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
7027c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
7037c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
7047c478bd9Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
7057c478bd9Sstevel@tonic-gate	movl	%eax, %esi
7067c478bd9Sstevel@tonic-gate
/*
 * Common exit: the worker does not execute emms, so do it here; then pop
 * the value pushed after TEST_TS and write it back to %cr0, re-enable
 * preemption, and unwind in the reverse order of the pushes at entry.
 */
7077c478bd9Sstevel@tonic-gate.addvec_rtn:
7087c478bd9Sstevel@tonic-gate	emms
7097c478bd9Sstevel@tonic-gate	popl	%ebx
7107c478bd9Sstevel@tonic-gate	movl	%ebx, %cr0
7117c478bd9Sstevel@tonic-gate	KPREEMPT_ENABLE
7127c478bd9Sstevel@tonic-gate	movl	%esi, %eax
7137c478bd9Sstevel@tonic-gate	popl	%esi
7147c478bd9Sstevel@tonic-gate	popl	%ebx
7157c478bd9Sstevel@tonic-gate	leave
7167c478bd9Sstevel@tonic-gate	ret
7177c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_add_vec_sse2)
7187c478bd9Sstevel@tonic-gate
/*
 * big_mul_add_vec_sse2_nsv(r, a, len, digit) -- "no save" variant.
 *
 * Loads r, a, len and digit from 8/12/16/20(%ebp) into %edx, %ebx, %ecx
 * and %mm3, calls big_mul_add_vec_sse2_r, and returns with the worker's
 * result still in %eax.  Only %ebx is preserved for the caller.  No MMX
 * state is saved or restored and no emms is executed here; both are left
 * to the caller.
 */
7197c478bd9Sstevel@tonic-gate	ENTRY(big_mul_add_vec_sse2_nsv)
7207c478bd9Sstevel@tonic-gate	pushl	%ebp
7217c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
/* %ebx is about to carry the "a" pointer, so keep the caller's value. */
7227c478bd9Sstevel@tonic-gate	pushl	%ebx
7237c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
7247c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
7257c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
7267c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
7277c478bd9Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
/* popl and leave do not touch %eax, so the carry is returned as is. */
7287c478bd9Sstevel@tonic-gate	popl	%ebx
7297c478bd9Sstevel@tonic-gate	leave
7307c478bd9Sstevel@tonic-gate	ret
7317c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_add_vec_sse2_nsv)
7327c478bd9Sstevel@tonic-gate
7337c478bd9Sstevel@tonic-gate
7347c478bd9Sstevel@tonic-gate#else	/* !defined(MMX_MANAGE) */
7357c478bd9Sstevel@tonic-gate
/*
 * big_mul_add_vec_sse2(r, a, len, digit) -- variant built when
 * MMX_MANAGE is not defined.
 *
 * Loads r, a, len and digit from 8/12/16/20(%ebp) into %edx, %ebx, %ecx
 * and %mm3, calls big_mul_add_vec_sse2_r, executes emms, and returns the
 * worker's result in %eax.  No preemption control or MMX save/restore is
 * done in this variant.
 */
7367c478bd9Sstevel@tonic-gate	ENTRY(big_mul_add_vec_sse2)
7377c478bd9Sstevel@tonic-gate	pushl	%ebp
7387c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
/* %ebx is about to carry the "a" pointer, so keep the caller's value. */
7397c478bd9Sstevel@tonic-gate	pushl	%ebx
7407c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx
7417c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx
7427c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
7437c478bd9Sstevel@tonic-gate	movd	20(%ebp), %mm3
7447c478bd9Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
7457c478bd9Sstevel@tonic-gate	popl	%ebx
/* The worker leaves emms to its caller; none of popl/emms/leave alter %eax. */
7467c478bd9Sstevel@tonic-gate	emms
7477c478bd9Sstevel@tonic-gate	leave
7487c478bd9Sstevel@tonic-gate	ret
7497c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_add_vec_sse2)
7507c478bd9Sstevel@tonic-gate
7517c478bd9Sstevel@tonic-gate#endif	/* MMX_MANAGE */
7527c478bd9Sstevel@tonic-gate
7537c478bd9Sstevel@tonic-gate
7547c478bd9Sstevel@tonic-gate/ void
7557c478bd9Sstevel@tonic-gate/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
7567c478bd9Sstevel@tonic-gate/ {
7577c478bd9Sstevel@tonic-gate/ 	int i;
7587c478bd9Sstevel@tonic-gate/
7597c478bd9Sstevel@tonic-gate/ 	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
7607c478bd9Sstevel@tonic-gate/ 	for (i = 1; i < blen; ++i)
7617c478bd9Sstevel@tonic-gate/ 		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
7627c478bd9Sstevel@tonic-gate/ }
7637c478bd9Sstevel@tonic-gate
7647c478bd9Sstevel@tonic-gate
/*
 * big_mul_vec_sse2_fc (MMX_MANAGE build) / big_mul_vec_sse2 (otherwise):
 * r = a * b, structured like the C pseudocode that precedes this block:
 * one "set" pass for b[0], then one "add" pass for each remaining digit
 * of b, storing every returned carry at r[alen + i].
 *
 * No frame pointer is set up; everything is addressed off %esp.  After
 * the subl and the four pushes below the layout is:
 *	16(%esp)  local: blen
 *	20(%esp)  local: b
 *	24(%esp)  return address
 *	28(%esp)  r	32(%esp)  a	36(%esp)  alen
 *	40(%esp)  b	44(%esp)  blen
 * Register use across the calls: %ebx = r, %esi = a, %edi = alen,
 * %ebp = i.  The code depends on the called routines leaving these four
 * registers intact.
 */
7657c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
7667c478bd9Sstevel@tonic-gate	ENTRY(big_mul_vec_sse2_fc)
7677c478bd9Sstevel@tonic-gate#else
7687c478bd9Sstevel@tonic-gate	ENTRY(big_mul_vec_sse2)
7697c478bd9Sstevel@tonic-gate#endif
7707c478bd9Sstevel@tonic-gate	subl	$0x8, %esp
7717c478bd9Sstevel@tonic-gate	pushl	%ebx
7727c478bd9Sstevel@tonic-gate	pushl	%ebp
7737c478bd9Sstevel@tonic-gate	pushl	%esi
7747c478bd9Sstevel@tonic-gate	pushl	%edi
/* Copy b into the local slot at 20(%esp). */
7757c478bd9Sstevel@tonic-gate	movl	40(%esp), %eax
7767c478bd9Sstevel@tonic-gate	movl	%eax, 20(%esp)
/*
 * Build the argument list (r, a, alen, b[0]) right to left.  Each pushl
 * lowers %esp by 4, so the same displacement 40(%esp) reaches the next
 * lower incoming argument each time: b, then alen, then a, then r.
 */
7777c478bd9Sstevel@tonic-gate	pushl	(%eax)
7787c478bd9Sstevel@tonic-gate	movl	40(%esp), %edi
7797c478bd9Sstevel@tonic-gate	pushl	%edi
7807c478bd9Sstevel@tonic-gate	movl	40(%esp), %esi
7817c478bd9Sstevel@tonic-gate	pushl	%esi
7827c478bd9Sstevel@tonic-gate	movl	40(%esp), %ebx
7837c478bd9Sstevel@tonic-gate	pushl	%ebx
7847c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
7857c478bd9Sstevel@tonic-gate	call	big_mul_set_vec_sse2_nsv
7867c478bd9Sstevel@tonic-gate#else
7877c478bd9Sstevel@tonic-gate	call	big_mul_set_vec_sse2
7887c478bd9Sstevel@tonic-gate#endif
/* Drop the four arguments, then r[alen] = returned carry. */
7897c478bd9Sstevel@tonic-gate	addl	$0x10, %esp
7907c478bd9Sstevel@tonic-gate	movl	%eax, (%ebx,%edi,4)
/* Copy blen into the local slot at 16(%esp); nothing more to do if blen <= 1. */
7917c478bd9Sstevel@tonic-gate	movl	44(%esp), %eax
7927c478bd9Sstevel@tonic-gate	movl	%eax, 16(%esp)
7937c478bd9Sstevel@tonic-gate	cmpl	$0x1, %eax
7947c478bd9Sstevel@tonic-gate	jle	.mulvec_rtn
/* i = 1 */
7957c478bd9Sstevel@tonic-gate	movl	$0x1, %ebp
7967c478bd9Sstevel@tonic-gate
7979e1a718fSdarrenm	.align 16
/* Loop body: r[alen + i] = add_vec(&r[i], a, alen, b[i]); ++i. */
7987c478bd9Sstevel@tonic-gate.mulvec_add:
/* 20(%esp) is read before any push, so it still addresses the local b. */
7997c478bd9Sstevel@tonic-gate	movl	20(%esp), %eax
8007c478bd9Sstevel@tonic-gate	pushl	(%eax,%ebp,4)
8017c478bd9Sstevel@tonic-gate	pushl	%edi
8027c478bd9Sstevel@tonic-gate	pushl	%esi
8037c478bd9Sstevel@tonic-gate	leal	(%ebx,%ebp,4), %eax
8047c478bd9Sstevel@tonic-gate	pushl	%eax
8057c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
8067c478bd9Sstevel@tonic-gate	call	big_mul_add_vec_sse2_nsv
8077c478bd9Sstevel@tonic-gate#else
8087c478bd9Sstevel@tonic-gate	call	big_mul_add_vec_sse2
8097c478bd9Sstevel@tonic-gate#endif
8107c478bd9Sstevel@tonic-gate	addl	$0x10, %esp
/* %ecx = i + alen, used as the index for the carry store. */
8117c478bd9Sstevel@tonic-gate	leal	(%ebp,%edi), %ecx
8127c478bd9Sstevel@tonic-gate	movl	%eax, (%ebx,%ecx,4)
8137c478bd9Sstevel@tonic-gate	incl	%ebp
/* Signed compare of i against the local blen. */
8147c478bd9Sstevel@tonic-gate	cmpl	16(%esp), %ebp
8157c478bd9Sstevel@tonic-gate	jl	.mulvec_add
8167c478bd9Sstevel@tonic-gate.mulvec_rtn:
/*
 * In the MMX_MANAGE build the add helper called above
 * (big_mul_add_vec_sse2_nsv) executes no emms, so it is issued once here.
 */
8177c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
8187c478bd9Sstevel@tonic-gate	emms
8197c478bd9Sstevel@tonic-gate#endif
/* Unwind in the reverse order of the entry sequence. */
8207c478bd9Sstevel@tonic-gate	popl	%edi
8217c478bd9Sstevel@tonic-gate	popl	%esi
8227c478bd9Sstevel@tonic-gate	popl	%ebp
8237c478bd9Sstevel@tonic-gate	popl	%ebx
8247c478bd9Sstevel@tonic-gate	addl	$0x8, %esp
8257c478bd9Sstevel@tonic-gate	ret
8267c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
8277c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_vec_sse2_fc)
8287c478bd9Sstevel@tonic-gate#else
8297c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_vec_sse2)
8307c478bd9Sstevel@tonic-gate#endif
8317c478bd9Sstevel@tonic-gate
8327c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
8337c478bd9Sstevel@tonic-gate
/*
 * big_mul_vec_sse2(r, a, alen, b, blen) -- MMX_MANAGE variant.
 *
 * Wrapper that brackets big_mul_vec_sse2_fc with KPREEMPT_DISABLE /
 * KPREEMPT_ENABLE and a conditional MMX save/restore, then writes the
 * value obtained from TEST_TS back to %cr0.
 *
 * Frame locals:
 *	-8(%ebp)  value TEST_TS left in %eax
 *	-4(%ebp)  %edi as left by SAVE_MMX_0TO4 (written on the save path only)
 *
 * SAVE_MMX_0TO4 / RSTOR_MMX_0TO4 are defined outside this block; they
 * presumably manage a save area addressed by the register argument
 * (TODO confirm against the macro definitions).
 */
8347c478bd9Sstevel@tonic-gate	ENTRY(big_mul_vec_sse2)
8357c478bd9Sstevel@tonic-gate	pushl	%ebp
8367c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
/* Two 4-byte locals, then preserve %edi, which the macros below use. */
8377c478bd9Sstevel@tonic-gate	subl	$8, %esp
8387c478bd9Sstevel@tonic-gate	pushl	%edi
8397c478bd9Sstevel@tonic-gate	KPREEMPT_DISABLE
8407c478bd9Sstevel@tonic-gate	TEST_TS(%eax)
/* movl does not modify EFLAGS, so the jnz still acts on TEST_TS. */
8417c478bd9Sstevel@tonic-gate	movl	%eax, -8(%ebp)
8427c478bd9Sstevel@tonic-gate	jnz	.mulvec_no_save
8437c478bd9Sstevel@tonic-gate	SAVE_MMX_0TO4(%edi)
8447c478bd9Sstevel@tonic-gate	movl	%edi, -4(%ebp)
8457c478bd9Sstevel@tonic-gate.mulvec_no_save:
/* Re-push the five incoming arguments, last first, for the real worker. */
8467c478bd9Sstevel@tonic-gate	movl	24(%ebp), %eax		/ blen
8477c478bd9Sstevel@tonic-gate	pushl	%eax
8487c478bd9Sstevel@tonic-gate	movl	20(%ebp), %eax		/ b
8497c478bd9Sstevel@tonic-gate	pushl	%eax
8507c478bd9Sstevel@tonic-gate	movl	16(%ebp), %eax		/ alen
8517c478bd9Sstevel@tonic-gate	pushl	%eax
8527c478bd9Sstevel@tonic-gate	movl	12(%ebp), %eax		/ a
8537c478bd9Sstevel@tonic-gate	pushl	%eax
8547c478bd9Sstevel@tonic-gate	movl	8(%ebp), %eax		/ r
8557c478bd9Sstevel@tonic-gate	pushl	%eax
8567c478bd9Sstevel@tonic-gate	call	big_mul_vec_sse2_fc
/* Drop the five 4-byte arguments. */
8577c478bd9Sstevel@tonic-gate	addl	$20, %esp
/*
 * Reload the value saved at entry and skip the restore when its CR0_TS
 * bit is set -- presumably the same condition that skipped the save.
 */
8587c478bd9Sstevel@tonic-gate	movl	-8(%ebp), %eax
8597c478bd9Sstevel@tonic-gate	testl	$CR0_TS, %eax
8607c478bd9Sstevel@tonic-gate	jnz	.mulvec_no_rstr
8617c478bd9Sstevel@tonic-gate	movl	-4(%ebp), %edi
8627c478bd9Sstevel@tonic-gate	RSTOR_MMX_0TO4(%edi)
8637c478bd9Sstevel@tonic-gate.mulvec_no_rstr:
/* %eax still holds the value reloaded from -8(%ebp). */
8647c478bd9Sstevel@tonic-gate	movl	%eax, %cr0
8657c478bd9Sstevel@tonic-gate	KPREEMPT_ENABLE
8667c478bd9Sstevel@tonic-gate	popl	%edi
8677c478bd9Sstevel@tonic-gate	leave
8687c478bd9Sstevel@tonic-gate	ret
8697c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_vec_sse2)
8707c478bd9Sstevel@tonic-gate
8717c478bd9Sstevel@tonic-gate#endif	/* MMX_MANAGE */
8727c478bd9Sstevel@tonic-gate
8737c478bd9Sstevel@tonic-gate
8747c478bd9Sstevel@tonic-gate
8757c478bd9Sstevel@tonic-gate#undef UNROLL
8767c478bd9Sstevel@tonic-gate#undef UNROLL32
8777c478bd9Sstevel@tonic-gate
8787c478bd9Sstevel@tonic-gate
8797c478bd9Sstevel@tonic-gate/ r = a * a, r and a are vectors of length len
8807c478bd9Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
8817c478bd9Sstevel@tonic-gate/
8827c478bd9Sstevel@tonic-gate/ This function is not suitable for a truly general-purpose multiprecision
8837c478bd9Sstevel@tonic-gate/ arithmetic library, because it does not work for "small" numbers, that is
8847c478bd9Sstevel@tonic-gate/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
8857c478bd9Sstevel@tonic-gate/ for any small numbers.
8867c478bd9Sstevel@tonic-gate
/*
 * big_sqr_vec_sse2_fc (MMX_MANAGE build) / big_sqr_vec_sse2 (otherwise):
 * r = a * a for an alen-digit a; the arguments r, a and alen are read
 * from 8/12/16(%ebp).
 *
 * In the MMX_MANAGE build this entry point sets up no frame of its own
 * and returns with a bare ret, so %ebp must already address the argument
 * frame when it is called.
 *
 * Outline:
 *   1. Off-diagonal products: a[0] * a[1..] through
 *      big_mul_set_vec_sse2_r, then a[i] * a[i+1..] accumulated through
 *      big_mul_add_vec_sse2_r (.L31), with the final single product done
 *      inline (.L32).
 *   2. Double that partial result while adding in the squares a[i]**2:
 *      low-order corner, the .L34 loop, a trailing doubling step, and the
 *      high-order corner.
 *
 * %mm4 carries the loop count across the helper calls and %edi / %esi
 * carry the r and a cursors, so the helpers must leave all three intact.
 */
8877c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
8887c478bd9Sstevel@tonic-gate	ENTRY(big_sqr_vec_sse2_fc)
8897c478bd9Sstevel@tonic-gate#else
8907c478bd9Sstevel@tonic-gate	ENTRY(big_sqr_vec_sse2)
8917c478bd9Sstevel@tonic-gate	pushl	%ebp
8927c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
8937c478bd9Sstevel@tonic-gate#endif
8947c478bd9Sstevel@tonic-gate
8957c478bd9Sstevel@tonic-gate	pushl	%ebx
8967c478bd9Sstevel@tonic-gate	pushl	%edi
8977c478bd9Sstevel@tonic-gate	pushl	%esi
8987c478bd9Sstevel@tonic-gate
8997c478bd9Sstevel@tonic-gate	/ r[1..alen] = a[0] * a[1..alen-1]
9007c478bd9Sstevel@tonic-gate
9017c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edi		/ r = arg(r)
9027c478bd9Sstevel@tonic-gate	movl	12(%ebp), %esi		/ a = arg(a)
9037c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx		/ cnt = arg(alen)
9047c478bd9Sstevel@tonic-gate	movd	%ecx, %mm4		/ save_cnt = arg(alen)
9057c478bd9Sstevel@tonic-gate	leal	4(%edi), %edx		/ dst = &r[1]
9067c478bd9Sstevel@tonic-gate	movl	%esi, %ebx		/ src = a
9077c478bd9Sstevel@tonic-gate	movd	0(%ebx), %mm3		/ mm3 = a[0]
9087c478bd9Sstevel@tonic-gate	leal	4(%ebx), %ebx		/ src = &a[1]
9097c478bd9Sstevel@tonic-gate	subl	$1, %ecx		/ --cnt
9107c478bd9Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
9117c478bd9Sstevel@tonic-gate	movl	%edi, %edx		/ dst = r
9127c478bd9Sstevel@tonic-gate	movl	%esi, %ebx		/ src = a
9137c478bd9Sstevel@tonic-gate	movd	%mm4, %ecx		/ cnt = save_cnt
9147c478bd9Sstevel@tonic-gate	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy
9157c478bd9Sstevel@tonic-gate
9167c478bd9Sstevel@tonic-gate/	/* High-level vector C pseudocode */
9177c478bd9Sstevel@tonic-gate/	for (i = 1; i < alen-1; ++i)
9187c478bd9Sstevel@tonic-gate/		r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
9197c478bd9Sstevel@tonic-gate/
9207c478bd9Sstevel@tonic-gate/	/* Same thing, but slightly lower level C-like pseudocode */
9217c478bd9Sstevel@tonic-gate/	i = 1;
9227c478bd9Sstevel@tonic-gate/	r = &arg_r[2*i + 1];
9237c478bd9Sstevel@tonic-gate/	a = &arg_a[i + 1];
9247c478bd9Sstevel@tonic-gate/	digit = arg_a[i];
9257c478bd9Sstevel@tonic-gate/	cnt = alen - 3;
9267c478bd9Sstevel@tonic-gate/	while (cnt != 0) {
9277c478bd9Sstevel@tonic-gate/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
9287c478bd9Sstevel@tonic-gate/		r += 2;
9297c478bd9Sstevel@tonic-gate/		++a;
9307c478bd9Sstevel@tonic-gate/		--cnt;
9317c478bd9Sstevel@tonic-gate/	}
9327c478bd9Sstevel@tonic-gate/
9337c478bd9Sstevel@tonic-gate/	/* Same thing, but even lower level
9347c478bd9Sstevel@tonic-gate/	 * For example, pointers are raw pointers,
9357c478bd9Sstevel@tonic-gate/	 * with no scaling by object size.
9367c478bd9Sstevel@tonic-gate/	 */
9377c478bd9Sstevel@tonic-gate/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3;  4*3 == 12; */
9387c478bd9Sstevel@tonic-gate/	a = arg_a + 8;
9397c478bd9Sstevel@tonic-gate/	digit = *(arg_a + 4);
9407c478bd9Sstevel@tonic-gate/	cnt = alen - 3;
9417c478bd9Sstevel@tonic-gate/	while (cnt != 0) {
9427c478bd9Sstevel@tonic-gate/		cy = big_mul_add_vec_sse2_r();
9437c478bd9Sstevel@tonic-gate/		*(r + 4 * cnt) = cy;
9447c478bd9Sstevel@tonic-gate/		r += 8;
9457c478bd9Sstevel@tonic-gate/		a += 4;
9467c478bd9Sstevel@tonic-gate/		--cnt;
9477c478bd9Sstevel@tonic-gate/	}
9487c478bd9Sstevel@tonic-gate
/*
 * NOTE(review): the pseudocode above starts cnt at alen - 3, whereas the
 * code below starts it at alen - 2 and diverts the cnt == 1 iteration to
 * the inline single-product code at .L32 instead of calling the helper.
 */
9497c478bd9Sstevel@tonic-gate	leal	4(%edi), %edi		/ r += 4; r = &r[1]
9507c478bd9Sstevel@tonic-gate	leal	4(%esi), %esi		/ a += 4; a = &a[1]
9517c478bd9Sstevel@tonic-gate	movd	%mm4, %ecx		/ cnt = save
9527c478bd9Sstevel@tonic-gate	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
9537c478bd9Sstevel@tonic-gate	movd	%ecx, %mm4		/ save_cnt
9547c478bd9Sstevel@tonic-gate	jecxz	.L32			/ while (cnt != 0) {
9557c478bd9Sstevel@tonic-gate.L31:
9567c478bd9Sstevel@tonic-gate	movd	0(%esi), %mm3		/ digit = a[i]
9577c478bd9Sstevel@tonic-gate	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
9587c478bd9Sstevel@tonic-gate	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
9597c478bd9Sstevel@tonic-gate	movl	%edi, %edx		/ edx = r
9607c478bd9Sstevel@tonic-gate	movl	%esi, %ebx		/ ebx = a
9617c478bd9Sstevel@tonic-gate	cmp	$1, %ecx		/ The last triangle term is special
9627c478bd9Sstevel@tonic-gate	jz	.L32
9637c478bd9Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
9647c478bd9Sstevel@tonic-gate	movd	%mm4, %ecx		/ cnt = save_cnt
9657c478bd9Sstevel@tonic-gate	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
9667c478bd9Sstevel@tonic-gate	subl	$1, %ecx		/ --cnt
9677c478bd9Sstevel@tonic-gate	movd	%ecx, %mm4		/ save_cnt = cnt
9687c478bd9Sstevel@tonic-gate	jmp	.L31			/ }
9697c478bd9Sstevel@tonic-gate
/*
 * Last triangle term, done inline as one multiply-add.  When reached from
 * the jz in .L31, %mm3 = digit, %ebx = &a[i + 1] and %edx = &r[2 * i + 1]
 * as set at the top of that iteration.  The digit after the carry is
 * stored as zero.
 */
9707c478bd9Sstevel@tonic-gate.L32:
9717c478bd9Sstevel@tonic-gate	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
9727c478bd9Sstevel@tonic-gate	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
9737c478bd9Sstevel@tonic-gate	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
9747c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
9757c478bd9Sstevel@tonic-gate	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
9767c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy
9777c478bd9Sstevel@tonic-gate	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
9787c478bd9Sstevel@tonic-gate	pxor	%mm2, %mm2
9797c478bd9Sstevel@tonic-gate	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0
9807c478bd9Sstevel@tonic-gate
/* Second phase: restart from the original arguments. */
9817c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edx		/ r = arg(r)
9827c478bd9Sstevel@tonic-gate	movl	12(%ebp), %ebx		/ a = arg(a)
9837c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx		/ cnt = arg(alen)
9847c478bd9Sstevel@tonic-gate
9857c478bd9Sstevel@tonic-gate	/ compute low-order corner
9867c478bd9Sstevel@tonic-gate	/ p = a[0]**2
9877c478bd9Sstevel@tonic-gate	/ r[0] = lo32(p)
9887c478bd9Sstevel@tonic-gate	/ cy   = hi32(p)
9897c478bd9Sstevel@tonic-gate	movd	0(%ebx), %mm2		/ mm2 = a[0]
9907c478bd9Sstevel@tonic-gate	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
9917c478bd9Sstevel@tonic-gate	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
9927c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)
9937c478bd9Sstevel@tonic-gate
9947c478bd9Sstevel@tonic-gate	/ p = 2 * r[1]
9957c478bd9Sstevel@tonic-gate	/ t = p + cy
9967c478bd9Sstevel@tonic-gate	/ r[1] = lo32(t)
9977c478bd9Sstevel@tonic-gate	/ cy   = hi32(t)
9987c478bd9Sstevel@tonic-gate	movd	4(%edx), %mm1		/ mm1 = r[1]
9997c478bd9Sstevel@tonic-gate	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
10007c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2		/ mm2 = t = p + cy
10017c478bd9Sstevel@tonic-gate	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
10027c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)
10037c478bd9Sstevel@tonic-gate
10047c478bd9Sstevel@tonic-gate	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
10057c478bd9Sstevel@tonic-gate	subl	$2, %ecx		/ cnt = alen - 2
/*
 * Each pass consumes one diagonal element (4(%ebx)) and rewrites two
 * result digits (8(%edx) and 12(%edx)); %mm2 carries cy between passes.
 */
10067c478bd9Sstevel@tonic-gate.L34:
10077c478bd9Sstevel@tonic-gate	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
10087c478bd9Sstevel@tonic-gate	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
10097c478bd9Sstevel@tonic-gate	paddq	%mm0, %mm2		/ mm2 = t = p + cy
/*
 * movd into an MMX register zero-extends, so the round trip through %eax
 * leaves only the low 32 bits of t in %mm1.
 */
10107c478bd9Sstevel@tonic-gate	movd	%mm2, %eax
10117c478bd9Sstevel@tonic-gate	movd	%eax, %mm1		/ mm1 = lo32(t)
10127c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = hi32(t)
10137c478bd9Sstevel@tonic-gate
10147c478bd9Sstevel@tonic-gate	movd	8(%edx), %mm3		/ mm3 = r[2*i]
10157c478bd9Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
10167c478bd9Sstevel@tonic-gate	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
10177c478bd9Sstevel@tonic-gate	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
/* Fold the carry out of the low digit into the running high part. */
10187c478bd9Sstevel@tonic-gate	psrlq	$32, %mm1
10197c478bd9Sstevel@tonic-gate	paddq	%mm1, %mm2
10207c478bd9Sstevel@tonic-gate
10217c478bd9Sstevel@tonic-gate	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
10227c478bd9Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
10237c478bd9Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
10247c478bd9Sstevel@tonic-gate	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
10257c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy
10267c478bd9Sstevel@tonic-gate	leal	8(%edx), %edx		/ r += 2
10277c478bd9Sstevel@tonic-gate	leal	4(%ebx), %ebx		/ ++a
10287c478bd9Sstevel@tonic-gate	subl	$1, %ecx		/ --cnt
10297c478bd9Sstevel@tonic-gate	jnz	.L34
10307c478bd9Sstevel@tonic-gate
10317c478bd9Sstevel@tonic-gate	/ Carry from last triangle term must participate in doubling,
10327c478bd9Sstevel@tonic-gate	/ but this step isn't paired up with a squaring the elements
10337c478bd9Sstevel@tonic-gate	/ of the inner diagonal.
10347c478bd9Sstevel@tonic-gate	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
10357c478bd9Sstevel@tonic-gate	movd	8(%edx), %mm3		/ mm3 = r[2*i]
10367c478bd9Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
10377c478bd9Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
10387c478bd9Sstevel@tonic-gate	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
10397c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)
10407c478bd9Sstevel@tonic-gate
10417c478bd9Sstevel@tonic-gate	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
10427c478bd9Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
10437c478bd9Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
10447c478bd9Sstevel@tonic-gate	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
10457c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy
10467c478bd9Sstevel@tonic-gate
10477c478bd9Sstevel@tonic-gate	/ compute high-order corner and add it in
10487c478bd9Sstevel@tonic-gate	/ p = a[alen - 1]**2
10497c478bd9Sstevel@tonic-gate	/ t = p + cy
10507c478bd9Sstevel@tonic-gate	/ r[alen + alen - 2] += lo32(t)
10517c478bd9Sstevel@tonic-gate	/ cy = hi32(t)
10527c478bd9Sstevel@tonic-gate	/ r[alen + alen - 1] = cy
10537c478bd9Sstevel@tonic-gate	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
10547c478bd9Sstevel@tonic-gate	movd	8(%edx), %mm3		/ mm3 = r[$-2]
10557c478bd9Sstevel@tonic-gate	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
10567c478bd9Sstevel@tonic-gate	paddq	%mm0, %mm2		/ mm2 = t = p + cy
10577c478bd9Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
10587c478bd9Sstevel@tonic-gate	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
10597c478bd9Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
10607c478bd9Sstevel@tonic-gate	movd	12(%edx), %mm3
10617c478bd9Sstevel@tonic-gate	paddq	%mm3, %mm2
10627c478bd9Sstevel@tonic-gate	movd	%mm2, 12(%edx)		/ r[$-1] += cy
10637c478bd9Sstevel@tonic-gate
/* Common exit: emms, then pop in the reverse order of the entry pushes. */
10647c478bd9Sstevel@tonic-gate.L35:
10657c478bd9Sstevel@tonic-gate	emms
10667c478bd9Sstevel@tonic-gate	popl	%esi
10677c478bd9Sstevel@tonic-gate	popl	%edi
10687c478bd9Sstevel@tonic-gate	popl	%ebx
10697c478bd9Sstevel@tonic-gate
/* Only the non-MMX_MANAGE entry point built a frame, so only it uses leave. */
10707c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
10717c478bd9Sstevel@tonic-gate	ret
10727c478bd9Sstevel@tonic-gate	SET_SIZE(big_sqr_vec_sse2_fc)
10737c478bd9Sstevel@tonic-gate#else
10747c478bd9Sstevel@tonic-gate	leave
10757c478bd9Sstevel@tonic-gate	ret
10767c478bd9Sstevel@tonic-gate	SET_SIZE(big_sqr_vec_sse2)
10777c478bd9Sstevel@tonic-gate#endif
10787c478bd9Sstevel@tonic-gate
10797c478bd9Sstevel@tonic-gate
10807c478bd9Sstevel@tonic-gate#if defined(MMX_MANAGE)
/*
 * big_sqr_vec_sse2(r, a, alen) -- MMX_MANAGE variant.
 *
 * Wrapper that brackets big_sqr_vec_sse2_fc with KPREEMPT_DISABLE /
 * KPREEMPT_ENABLE and a conditional MMX save/restore, then writes the
 * value obtained from TEST_TS back to %cr0.
 *
 * big_sqr_vec_sse2_fc builds no frame of its own in this configuration:
 * it reads r, a and alen through the %ebp established here, so the extra
 * stack slots used by this wrapper do not affect it.
 *
 * %ebx is a callee-saved register but is used as scratch for the %cr0
 * value, so the caller's %ebx is pushed on entry and restored on exit,
 * in the same way big_mul_add_vec_sse2 does.
 */
10817c478bd9Sstevel@tonic-gate	ENTRY(big_sqr_vec_sse2)
10827c478bd9Sstevel@tonic-gate	pushl	%ebp
10837c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
	pushl	%ebx			/* preserve the caller's %ebx */
10847c478bd9Sstevel@tonic-gate	KPREEMPT_DISABLE
10857c478bd9Sstevel@tonic-gate	TEST_TS(%ebx)
/*
 * pushl does not modify EFLAGS, so the jnz still acts on the result of
 * TEST_TS.  The pushed value is popped and written to %cr0 at .sqr_rtn.
 */
10867c478bd9Sstevel@tonic-gate	pushl	%ebx
10877c478bd9Sstevel@tonic-gate	jnz	.sqr_no_save
/* Save path: %edi is handed to the save/restore macros, so preserve it. */
10887c478bd9Sstevel@tonic-gate	pushl	%edi
10897c478bd9Sstevel@tonic-gate	SAVE_MMX_0TO4(%edi)
10907c478bd9Sstevel@tonic-gate	call	big_sqr_vec_sse2_fc
10917c478bd9Sstevel@tonic-gate	RSTOR_MMX_0TO4(%edi)
10927c478bd9Sstevel@tonic-gate	popl	%edi
10937c478bd9Sstevel@tonic-gate	jmp	.sqr_rtn
10947c478bd9Sstevel@tonic-gate
10957c478bd9Sstevel@tonic-gate.sqr_no_save:
10967c478bd9Sstevel@tonic-gate	call	big_sqr_vec_sse2_fc
10977c478bd9Sstevel@tonic-gate
/* Common exit: restore %cr0, re-enable preemption, restore %ebx. */
10987c478bd9Sstevel@tonic-gate.sqr_rtn:
10997c478bd9Sstevel@tonic-gate	popl	%ebx
11007c478bd9Sstevel@tonic-gate	movl	%ebx, %cr0
11017c478bd9Sstevel@tonic-gate	KPREEMPT_ENABLE
	popl	%ebx			/* restore the caller's %ebx */
11027c478bd9Sstevel@tonic-gate	leave
11037c478bd9Sstevel@tonic-gate	ret
11047c478bd9Sstevel@tonic-gate	SET_SIZE(big_sqr_vec_sse2)
11057c478bd9Sstevel@tonic-gate
11067c478bd9Sstevel@tonic-gate#endif	/* MMX_MANAGE */
11077c478bd9Sstevel@tonic-gate
11087c478bd9Sstevel@tonic-gate/ ------------------------------------------------------------------------
11097c478bd9Sstevel@tonic-gate/		UMUL Implementations
11107c478bd9Sstevel@tonic-gate/ ------------------------------------------------------------------------
11117c478bd9Sstevel@tonic-gate
11127c478bd9Sstevel@tonic-gate
11137c478bd9Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len
11147c478bd9Sstevel@tonic-gate/ returns the carry digit
11157c478bd9Sstevel@tonic-gate/ Does not use any MMX, SSE, or SSE2 instructions.
11167c478bd9Sstevel@tonic-gate/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
11177c478bd9Sstevel@tonic-gate/ This is a fall-back implementation for x86 models that do not support
11187c478bd9Sstevel@tonic-gate/ the PMULUDQ instruction.
11197c478bd9Sstevel@tonic-gate/
11207c478bd9Sstevel@tonic-gate/ uint32_t
11217c478bd9Sstevel@tonic-gate/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
11227c478bd9Sstevel@tonic-gate/
11237c478bd9Sstevel@tonic-gate/ r		 8(%ebp)	%edi
11247c478bd9Sstevel@tonic-gate/ a		12(%ebp)	%esi
11257c478bd9Sstevel@tonic-gate/ len		16(%ebp)	%ecx
11267c478bd9Sstevel@tonic-gate/ digit		20(%ebp)	(memory operand of mull)
11277c478bd9Sstevel@tonic-gate
/*
 * big_mul_set_vec_umul(r, a, len, digit)
 *
 * For each of the len digits: r[i] = low 32 bits of (a[i] * digit + cy),
 * with cy becoming the high 32 bits; cy starts at 0 and the final cy is
 * returned in %eax.  When len is 0 nothing is stored and 0 is returned.
 *
 * Register use: %edi = r cursor, %esi = a cursor, %ecx = remaining len,
 * %ebx = cy, %edx:%eax = 64-bit product.  digit is never loaded into a
 * register; mull reads it from 20(%ebp) on every iteration.
 */
11287c478bd9Sstevel@tonic-gate	ENTRY(big_mul_set_vec_umul)
11297c478bd9Sstevel@tonic-gate	pushl	%ebp
11307c478bd9Sstevel@tonic-gate	movl	%esp, %ebp
11317c478bd9Sstevel@tonic-gate	pushl	%esi
11327c478bd9Sstevel@tonic-gate	pushl	%edi
11337c478bd9Sstevel@tonic-gate	pushl	%ebx
11347c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx
11357c478bd9Sstevel@tonic-gate	xorl	%ebx, %ebx	/ cy = 0
/*
 * The two movl instructions between testl and je do not modify EFLAGS,
 * so the je below still tests len == 0.
 */
11367c478bd9Sstevel@tonic-gate	testl	%ecx, %ecx
11377c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edi
11387c478bd9Sstevel@tonic-gate	movl	12(%ebp), %esi
11397c478bd9Sstevel@tonic-gate	je	.L57
11407c478bd9Sstevel@tonic-gate
11417c478bd9Sstevel@tonic-gate.L55:
11427c478bd9Sstevel@tonic-gate	movl	(%esi), %eax	/ eax = a[i]
11437c478bd9Sstevel@tonic-gate	leal	4(%esi), %esi	/ ++a
11447c478bd9Sstevel@tonic-gate	mull	20(%ebp)	/ edx:eax = a[i] * digit
/* Add cy to the low word; adcl carries any overflow into the high word. */
11457c478bd9Sstevel@tonic-gate	addl	%ebx, %eax
11467c478bd9Sstevel@tonic-gate	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
11477c478bd9Sstevel@tonic-gate	movl	%eax, (%edi)	/ r[i] = product[31..0]
11487c478bd9Sstevel@tonic-gate	movl	%edx, %ebx	/ cy = product[63..32]
11497c478bd9Sstevel@tonic-gate	leal	4(%edi), %edi	/ ++r
11507c478bd9Sstevel@tonic-gate	decl	%ecx		/ --len
11517c478bd9Sstevel@tonic-gate	jnz	.L55		/ while (len != 0)
11527c478bd9Sstevel@tonic-gate.L57:
/* Return cy, then restore the saved registers in reverse push order. */
11537c478bd9Sstevel@tonic-gate	movl	%ebx, %eax
11547c478bd9Sstevel@tonic-gate	popl	%ebx
11557c478bd9Sstevel@tonic-gate	popl	%edi
11567c478bd9Sstevel@tonic-gate	popl	%esi
11577c478bd9Sstevel@tonic-gate	leave
11587c478bd9Sstevel@tonic-gate	ret
11597c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_set_vec_umul)
11607c478bd9Sstevel@tonic-gate
11617c478bd9Sstevel@tonic-gate
11627c478bd9Sstevel@tonic-gate/ r = r + a * digit, r and a are vectors of length len
11637c478bd9Sstevel@tonic-gate/ returns the carry digit (cy, accumulated in %ebx and moved to %eax)
11647c478bd9Sstevel@tonic-gate/ Does not use any MMX, SSE, or SSE2 instructions.
11657c478bd9Sstevel@tonic-gate/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
11667c478bd9Sstevel@tonic-gate/ This is a fall-back implementation for x86 models that do not support
11677c478bd9Sstevel@tonic-gate/ the PMULUDQ instruction.
11687c478bd9Sstevel@tonic-gate/
11697c478bd9Sstevel@tonic-gate/ uint32_t
11707c478bd9Sstevel@tonic-gate/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
11717c478bd9Sstevel@tonic-gate/ Argument locations (stack slot, then where the loop keeps it):
11727c478bd9Sstevel@tonic-gate/ r		 8(%ebp)	%edi
11737c478bd9Sstevel@tonic-gate/ a		12(%ebp)	%esi
11747c478bd9Sstevel@tonic-gate/ len		16(%ebp)	%ecx
11757c478bd9Sstevel@tonic-gate/ digit		20(%ebp)	(stays in memory, used as the MUL operand)
11767c478bd9Sstevel@tonic-gate
11777c478bd9Sstevel@tonic-gate	ENTRY(big_mul_add_vec_umul)
11787c478bd9Sstevel@tonic-gate	pushl	%ebp
11797c478bd9Sstevel@tonic-gate	movl	%esp, %ebp	/ set up the frame, arguments start at 8(%ebp)
11807c478bd9Sstevel@tonic-gate	pushl	%esi		/ save the three registers overwritten below
11817c478bd9Sstevel@tonic-gate	pushl	%edi
11827c478bd9Sstevel@tonic-gate	pushl	%ebx
11837c478bd9Sstevel@tonic-gate	movl	16(%ebp), %ecx	/ ecx = len
11847c478bd9Sstevel@tonic-gate	xorl	%ebx, %ebx	/ cy = 0
11857c478bd9Sstevel@tonic-gate	testl	%ecx, %ecx	/ ZF = (len == 0), consumed by the je below
11867c478bd9Sstevel@tonic-gate	movl	8(%ebp), %edi	/ edi = r (movl leaves the flags untouched)
11877c478bd9Sstevel@tonic-gate	movl	12(%ebp), %esi	/ esi = a
11887c478bd9Sstevel@tonic-gate	je	.L67		/ len == 0: skip the loop and return cy = 0
11897c478bd9Sstevel@tonic-gate	.align 4
11907c478bd9Sstevel@tonic-gate.L65:
11917c478bd9Sstevel@tonic-gate	movl	(%esi), %eax	/ eax = a[i]
11927c478bd9Sstevel@tonic-gate	leal	4(%esi), %esi	/ ++a
11937c478bd9Sstevel@tonic-gate	mull	20(%ebp)	/ edx:eax = a[i] * digit
11947c478bd9Sstevel@tonic-gate	addl	(%edi), %eax	/ low half += r[i]
11957c478bd9Sstevel@tonic-gate	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i]
11967c478bd9Sstevel@tonic-gate	addl	%ebx, %eax	/ low half += cy from the previous digit
11977c478bd9Sstevel@tonic-gate	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
11987c478bd9Sstevel@tonic-gate	movl	%eax, (%edi)	/ r[i] = product[31..0]
11997c478bd9Sstevel@tonic-gate	movl	%edx, %ebx	/ cy = product[63..32]
12007c478bd9Sstevel@tonic-gate	leal	4(%edi), %edi	/ ++r
12017c478bd9Sstevel@tonic-gate	decl	%ecx		/ --len
12027c478bd9Sstevel@tonic-gate	jnz	.L65		/ while (len != 0)
12037c478bd9Sstevel@tonic-gate.L67:
12047c478bd9Sstevel@tonic-gate	movl	%ebx, %eax	/ return value = cy
12057c478bd9Sstevel@tonic-gate	popl	%ebx		/ restore saved registers in reverse push order
12067c478bd9Sstevel@tonic-gate	popl	%edi
12077c478bd9Sstevel@tonic-gate	popl	%esi
12087c478bd9Sstevel@tonic-gate	leave			/ restore %esp and %ebp
12097c478bd9Sstevel@tonic-gate	ret
12107c478bd9Sstevel@tonic-gate	SET_SIZE(big_mul_add_vec_umul)
12117c478bd9Sstevel@tonic-gate
12127c478bd9Sstevel@tonic-gate#endif	/* __lint */
1213