xref: /titanic_50/usr/src/common/bignum/i386/bignum_i386_asm.s (revision 3eae19d9cf3390cf5b75e10c9c1945fd36ad856a)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/asm_linkage.h>
30#include <sys/x86_archext.h>
31#include <sys/controlregs.h>
32
33#if defined(__lint)
34
35#include <sys/types.h>
36
/*
 * Lint-only C stand-ins for the assembly entry points defined in the
 * non-lint half of this file.  Every stub ignores its arguments; the
 * ones with a return value return 0.
 */

uint32_t
bignum_use_sse2()
{
	return (0);
}

/*
 * Register-argument worker.  Not to be called by C code.
 */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{
	return (0);
}

/*
 * Register-argument worker.  Not to be called by C code.
 */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{
	return (0);
}

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{
}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{
}

#if defined(MMX_MANAGE)

/*
 * Variants that exist only when this file manages MMX state itself.
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}

/*
 * Not to be called by C code.
 */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{
}

#endif	/* MMX_MANAGE */

/*
 * UMUL implementations (integer MUL only).
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}
107
108#else	/* __lint */
109
#if defined(MMX_MANAGE)

/*
 * Helper macros for the entry points that manage MMX state themselves.
 *
 * KPREEMPT_DISABLE / KPREEMPT_ENABLE
 *	Kernel: calls to kpr_disable / kpr_enable.  Userland: empty.
 *
 * TEST_TS(reg)
 *	Kernel: copies %cr0 into reg, executes clts, then tests the CR0_TS
 *	bit of the copy, so ZF is clear when TS was set on entry.
 *	Userland: loads 0 into reg, so the test always leaves ZF set.
 */

#if !defined(_KERNEL)

#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

#define	TEST_TS(reg)					\
	movl	$0, reg;				\
	testl	$CR0_TS, reg

#else	/* _KERNEL */

#define	KPREEMPT_DISABLE call kpr_disable
#define	KPREEMPT_ENABLE call kpr_enable

#define	TEST_TS(reg)					\
	movl	%cr0, reg;				\
	clts;						\
	testl	$CR0_TS, reg

#endif	/* _KERNEL */

/* Size in bytes of one MMX register, and the alignment of the save area. */
#define	MMX_SIZE 8
#define	MMX_ALIGN 8

/*
 * SAVE_MMX_PROLOG(sreg, nreg)
 *	Carves a save area for nreg MMX registers out of the stack and
 *	leaves its address in sreg.  The andl presumably rounds sreg to an
 *	MMX_ALIGN boundary; the "!" operator is assembler-specific, and the
 *	exact byte count depends on how _MUL from <sys/asm_linkage.h>
 *	groups its operands -- TODO confirm both.
 *
 * RSTOR_MMX_EPILOG(nreg)
 *	Releases the stack space reserved by SAVE_MMX_PROLOG; it uses the
 *	identical size expression, so the two always balance.
 */
#define	SAVE_MMX_PROLOG(sreg, nreg)			\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	movl	%esp, sreg;				\
	addl	$MMX_ALIGN, sreg;			\
	andl	$-1![MMX_ALIGN-1], sreg;

#define	RSTOR_MMX_EPILOG(nreg)				\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

/*
 * SAVE_MMX_0TO4(sreg)
 *	Reserves a save area and stores %mm0 .. %mm4 into it; sreg is left
 *	pointing at the area and must be preserved until the restore.
 *
 * RSTOR_MMX_0TO4(sreg)
 *	Reloads %mm0 .. %mm4 from the area at sreg and releases the stack
 *	space.
 */
#define	SAVE_MMX_0TO4(sreg)			\
	SAVE_MMX_PROLOG(sreg, 5);		\
	movq	%mm0, 0(sreg);			\
	movq	%mm1, 8(sreg);			\
	movq	%mm2, 16(sreg);			\
	movq	%mm3, 24(sreg);			\
	movq	%mm4, 32(sreg)

#define	RSTOR_MMX_0TO4(sreg)			\
	movq	0(sreg), %mm0;			\
	movq	8(sreg), %mm1;			\
	movq	16(sreg), %mm2;			\
	movq	24(sreg), %mm3;			\
	movq	32(sreg), %mm4;			\
	RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */
161
162/ Note: this file contains implementations for
163/	big_mul_set_vec()
164/	big_mul_add_vec()
165/	big_mul_vec()
166/	big_sqr_vec()
167/ One set of implementations is for SSE2-capable models.
168/ The other uses no MMX, SSE, or SSE2 instructions, only
169/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
170/
171/ The code for the implementations is grouped by SSE2 vs UMUL,
172/ rather than grouping pairs of implementations for each function.
173/ This is because the bignum implementation gets "imprinted"
174/ on the correct implementation, at the time of first use,
175/ so none of the code for the other implementations is ever
176/ executed.  So, it is a no-brainer to layout the code to minimize
177/ the "footprint" of executed code.
178
/ uint32_t
/ bignum_use_sse2()
/
/ Can the SSE2 implementations be used?  The return value is non-zero
/ if they can.
/
/ Note:
/   Executing cpuid directly would work equally well in userland and in
/   the kernel, but the kernel build consults the x86_feature variable
/   instead.  That way any decision the kernel startup code made when
/   setting the variable is honored, including SSE2 having been disabled
/   through settings in /etc/system.  Honoring such a setting in userland
/   might be a good idea as well, but x86_feature is not readily
/   available to userland processes.

	ENTRY(bignum_use_sse2)
#if !defined(_KERNEL)
	pushl	%ebx			/ cpuid overwrites callee-saved %ebx
	movl	$1, %eax		/ request the feature information leaf
	cpuid
	popl	%ebx
	movl	%edx, %eax		/ feature bits come back in %edx
	andl	$CPUID_INTC_EDX_SSE2, %eax	/ keep only the SSE2 bit
#else	/* _KERNEL */
	movl	x86_feature, %eax
	andl	$X86_SSE2, %eax		/ keep only the SSE2 bit
#endif	/* _KERNEL */
	ret
	SET_SIZE(bignum_use_sse2)
210
211
212/ ------------------------------------------------------------------------
213/		SSE2 Implementations
214/ ------------------------------------------------------------------------
215
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r = a * digit, where r and a are vectors of len 32-bit digits.
/ Returns the carry digit in %eax; returns 0 when len is 0.
/ Suitable only for x86 models that support the SSE2 instruction set
/ extensions.
/
/ Register interface:
/	r	%edx
/	a	%ebx
/	len	%ecx
/	digit	%mm3
/
/ Modifies %eax, %ebx, %ecx, %edx, %mm0 (running carry) and %mm1.
/ Does not touch the following registers: %esi, %edi, %mm4
/ Does not execute emms; the caller is responsible for that.
/
/ N.B.:
/   This is strictly for internal use.  All parameters are passed in
/   registers, so the routine does not conform to the SYSV x86 ABI and
/   must never be called directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times, and the load of
/ the next digit of a is issued ahead of the store of the current
/ digit of r.  Fewer than 8 remaining digits are handled one at a time
/ by the tail code, without that look-ahead.

#define	UNROLL		8
#define	UNROLL32	32

/*
 * One look-ahead step of the unrolled loop.  On entry %mm1 already holds
 * the digit of a that belongs at byte offset cur.  The digit at byte
 * offset nxt is fetched for the following step before the low 32 bits of
 * the sum are stored to r at offset cur; the high 32 bits remain in %mm0
 * as the carry.
 */
#define	MULSET_STEP_LD(cur, nxt)			\
	pmuludq	%mm3, %mm1;				\
	paddq	%mm1, %mm0;				\
	movd	nxt(%ebx), %mm1;			\
	movd	%mm0, cur(%edx);			\
	psrlq	$32, %mm0

/*
 * One stand-alone step: load the digit of a at byte offset cur, multiply
 * by the digit in %mm3, add the carry, store the low 32 bits to r at
 * offset cur and keep the high 32 bits in %mm0 as the carry.
 */
#define	MULSET_STEP(cur)				\
	movd	cur(%ebx), %mm1;			\
	pmuludq	%mm3, %mm1;				\
	paddq	%mm1, %mm0;				\
	movd	%mm0, cur(%edx);			\
	psrlq	$32, %mm0

	ENTRY(big_mul_set_vec_sse2_r)
	xorl	%eax, %eax	/ if (len == 0) return (0);
	testl	%ecx, %ecx
	jz	.setvec_r_ret	/ %mm0 is not a carry yet, so do not read it

	pxor	%mm0, %mm0	/ cy = 0

.setvec_r_loop:
	cmpl	$UNROLL, %ecx
	jl	.setvec_r_tail	/ fewer than 8 digits left
	movd	0(%ebx), %mm1	/ prime the pipeline with a[0]
	MULSET_STEP_LD(0, 4)	/ 1: r[0], then fetch a[1]
	MULSET_STEP_LD(4, 8)	/ 2: r[1], then fetch a[2]
	MULSET_STEP_LD(8, 12)	/ 3: r[2], then fetch a[3]
	MULSET_STEP_LD(12, 16)	/ 4: r[3], then fetch a[4]
	MULSET_STEP_LD(16, 20)	/ 5: r[4], then fetch a[5]
	MULSET_STEP_LD(20, 24)	/ 6: r[5], then fetch a[6]
	MULSET_STEP_LD(24, 28)	/ 7: r[6], then fetch a[7]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[7]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[7] + cy
	movd	%mm0, 28(%edx)	/ 8: r[7] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += 8 digits
	leal	UNROLL32(%edx), %edx	/ r += 8 digits
	subl	$UNROLL, %ecx		/ len -= 8
	jz	.setvec_r_done
	jmp	.setvec_r_loop

.setvec_r_tail:
	/ Here 0 < len < 8 for any non-negative len, so at most 7 steps run.
	MULSET_STEP(0)		/ 1
	subl	$1, %ecx
	jz	.setvec_r_done

	MULSET_STEP(4)		/ 2
	subl	$1, %ecx
	jz	.setvec_r_done

	MULSET_STEP(8)		/ 3
	subl	$1, %ecx
	jz	.setvec_r_done

	MULSET_STEP(12)		/ 4
	subl	$1, %ecx
	jz	.setvec_r_done

	MULSET_STEP(16)		/ 5
	subl	$1, %ecx
	jz	.setvec_r_done

	MULSET_STEP(20)		/ 6
	subl	$1, %ecx
	jz	.setvec_r_done

	MULSET_STEP(24)		/ 7

.setvec_r_done:
	movd	%mm0, %eax	/ return (cy)
.setvec_r_ret:
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)

#undef	MULSET_STEP_LD
#undef	MULSET_STEP
371
372
373/ r = a * digit, r and a are vectors of length len
374/ returns the carry digit
375/ Suitable only for x86 models that support SSE2 instruction set extensions
376/
377/ r		 8(%ebp)	%edx
378/ a		12(%ebp)	%ebx
379/ len		16(%ebp)	%ecx
380/ digit		20(%ebp)	%mm3
381/
382/ In userland, there is just the one function, big_mul_set_vec_sse2().
383/ But in the kernel, there are two variations:
384/    1. big_mul_set_vec_sse2() which does what is necessary to save and
385/       restore state, if necessary, and to ensure that preemption is
386/       disabled.
387/    2. big_mul_set_vec_sse2_nsv() which just does the work;
388/       it is the caller's responsibility to ensure that MMX state
389/       does not need to be saved and restored and that preemption
390/       is already disabled.
391
392#if defined(MMX_MANAGE)
/ uint32_t
/ big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ MMX_MANAGE variant: an ABI-conforming wrapper around
/ big_mul_set_vec_sse2_r that brackets the work with KPREEMPT_DISABLE
/ and KPREEMPT_ENABLE and, when the TS bit was clear on entry, saves
/ and restores %mm0 .. %mm4 around it.  Returns the carry digit.
/
/ Frame layout after the prologue:
/	-4(%ebp)	caller %ebx
/	-8(%ebp)	caller %esi
/	-12(%ebp)	value TEST_TS left in %ebx (entry %cr0 in the kernel)

	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; reused for %cr0 and a
	pushl	%esi			/ callee-saved; carries the result
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ pushl leaves the flags untouched
	jnz	.setvec_no_save		/ TS was set: no save/restore needed
	pushl	%edi			/ %edi will address the save area
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ keep cy across the epilogue calls
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.setvec_rtn

.setvec_no_save:
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ keep cy across the epilogue calls

.setvec_rtn:
	emms
	popl	%ebx			/ value saved after TEST_TS
#if defined(_KERNEL)
	/ Writing %cr0 is privileged, and outside the kernel TEST_TS never
	/ read it in the first place, so only the kernel build writes it back.
	movl	%ebx, %cr0
#endif	/* _KERNEL */
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return (cy)
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)
433
/ uint32_t
/ big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ "No save" variant: moves the stack arguments into the registers that
/ big_mul_set_vec_sse2_r expects and returns its carry digit.  No MMX
/ state is saved or restored and no emms is executed here.

	ENTRY(big_mul_set_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; the worker takes a in it
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_set_vec_sse2_r	/ %eax = cy
	popl	%ebx
	popl	%ebp			/ %esp equals %ebp again at this point
	ret
	SET_SIZE(big_mul_set_vec_sse2_nsv)
447
448#else	/* !defined(MMX_MANAGE) */
449
450/ r = a * digit, r and a are vectors of length len
451/ returns the carry digit
452/ Suitable only for x86 models that support SSE2 instruction set extensions
453/
454/ r		 8(%ebp)	%edx
455/ a		12(%ebp)	%ebx
456/ len		16(%ebp)	%ecx
457/ digit		20(%ebp)	%mm3
458
/ uint32_t
/ big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Variant built without MMX_MANAGE: moves the stack arguments into the
/ registers that big_mul_set_vec_sse2_r expects, executes emms once the
/ worker is done, and returns the carry digit.

	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; the worker takes a in it
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_set_vec_sse2_r	/ %eax = cy
	emms				/ the worker leaves emms to its caller
	popl	%ebx
	popl	%ebp			/ %esp equals %ebp again at this point
	ret
	SET_SIZE(big_mul_set_vec_sse2)
473
474#endif	/* MMX_MANAGE */
475
476
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r = r + a * digit, where r and a are vectors of len 32-bit digits.
/ Returns the carry digit in %eax; returns 0 when len is 0.
/ Suitable only for x86 models that support the SSE2 instruction set
/ extensions.
/
/ Register interface:
/	r	%edx
/	a	%ebx
/	len	%ecx
/	digit	%mm3
/
/ Modifies %eax, %ebx, %ecx, %edx, %mm0 (running carry), %mm1 and %mm2.
/ Does not execute emms; the caller is responsible for that.
/
/ N.B.:
/   This is strictly for internal use.  All parameters are passed in
/   registers, so the routine does not conform to the SYSV x86 ABI and
/   must never be called directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times, and the loads of
/ the next digits of a and r are issued ahead of the shift that ends
/ the current step.  Fewer than 8 remaining digits are handled one at
/ a time by the tail code, without that look-ahead.

/*
 * One look-ahead step of the unrolled loop.  On entry %mm1 and %mm2
 * already hold the digits of a and r that belong at byte offset cur.
 * The digits at byte offset nxt are fetched for the following step; the
 * low 32 bits of the sum are stored to r at offset cur and the high
 * 32 bits remain in %mm0 as the carry.
 */
#define	MULADD_STEP_LD(cur, nxt)			\
	pmuludq	%mm3, %mm1;				\
	paddq	%mm1, %mm2;				\
	movd	nxt(%ebx), %mm1;			\
	paddq	%mm2, %mm0;				\
	movd	%mm0, cur(%edx);			\
	movd	nxt(%edx), %mm2;			\
	psrlq	$32, %mm0

/*
 * One stand-alone step: load the digits of a and r at byte offset cur,
 * form a * digit + r + carry, store the low 32 bits back to r at offset
 * cur and keep the high 32 bits in %mm0 as the carry.
 */
#define	MULADD_STEP(cur)				\
	movd	cur(%ebx), %mm1;			\
	movd	cur(%edx), %mm2;			\
	pmuludq	%mm3, %mm1;				\
	paddq	%mm1, %mm2;				\
	paddq	%mm2, %mm0;				\
	movd	%mm0, cur(%edx);			\
	psrlq	$32, %mm0

	ENTRY(big_mul_add_vec_sse2_r)
	xorl	%eax, %eax	/ if (len == 0) return (0);
	testl	%ecx, %ecx
	jz	.addvec_r_ret	/ %mm0 is not a carry yet, so do not read it

	pxor	%mm0, %mm0	/ cy = 0

.addvec_r_loop:
	cmpl	$UNROLL, %ecx
	jl	.addvec_r_tail	/ fewer than 8 digits left
	movd	0(%ebx), %mm1	/ prime the pipeline with a[0]
	movd	0(%edx), %mm2	/ ... and r[0]
	MULADD_STEP_LD(0, 4)	/ 1: r[0], then fetch a[1], r[1]
	MULADD_STEP_LD(4, 8)	/ 2: r[1], then fetch a[2], r[2]
	MULADD_STEP_LD(8, 12)	/ 3: r[2], then fetch a[3], r[3]
	MULADD_STEP_LD(12, 16)	/ 4: r[3], then fetch a[4], r[4]
	MULADD_STEP_LD(16, 20)	/ 5: r[4], then fetch a[5], r[5]
	MULADD_STEP_LD(20, 24)	/ 6: r[5], then fetch a[6], r[6]
	MULADD_STEP_LD(24, 28)	/ 7: r[6], then fetch a[7], r[7]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[7]
	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[7] + r[7]
	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[7] + r[7] + cy
	movd	%mm0, 28(%edx)	/ 8: r[7] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += 8 digits
	leal	UNROLL32(%edx), %edx	/ r += 8 digits
	subl	$UNROLL, %ecx		/ len -= 8
	jz	.addvec_r_done
	jmp	.addvec_r_loop

.addvec_r_tail:
	/ Here 0 < len < 8 for any non-negative len, so at most 7 steps run.
	MULADD_STEP(0)		/ 1
	subl	$1, %ecx
	jz	.addvec_r_done

	MULADD_STEP(4)		/ 2
	subl	$1, %ecx
	jz	.addvec_r_done

	MULADD_STEP(8)		/ 3
	subl	$1, %ecx
	jz	.addvec_r_done

	MULADD_STEP(12)		/ 4
	subl	$1, %ecx
	jz	.addvec_r_done

	MULADD_STEP(16)		/ 5
	subl	$1, %ecx
	jz	.addvec_r_done

	MULADD_STEP(20)		/ 6
	subl	$1, %ecx
	jz	.addvec_r_done

	MULADD_STEP(24)		/ 7

.addvec_r_done:
	movd	%mm0, %eax	/ return (cy)
.addvec_r_ret:
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_add_vec_sse2_r)

#undef	MULADD_STEP_LD
#undef	MULADD_STEP
657
658
659/ r = r + a * digit, r and a are vectors of length len
660/ returns the carry digit
661/ Suitable only for x86 models that support SSE2 instruction set extensions
662/
663/ r		 8(%ebp)	%edx
664/ a		12(%ebp)	%ebx
665/ len		16(%ebp)	%ecx
666/ digit		20(%ebp)	%mm3
667/
668/ In userland, there is just the one function, big_mul_add_vec_sse2().
669/ But in the kernel, there are two variations:
670/    1. big_mul_add_vec_sse2() which does what is necessary to save and
671/       restore state, if necessary, and to ensure that preemption is
672/       disabled.
673/    2. big_mul_add_vec_sse2_nsv() which just does the work;
674/       it is the caller's responsibility to ensure that MMX state
675/       does not need to be saved and restored and that preemption
676/       is already disabled.
677
678
679#if defined(MMX_MANAGE)
680
/ uint32_t
/ big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ MMX_MANAGE variant: an ABI-conforming wrapper around
/ big_mul_add_vec_sse2_r that brackets the work with KPREEMPT_DISABLE
/ and KPREEMPT_ENABLE and, when the TS bit was clear on entry, saves
/ and restores %mm0 .. %mm4 around it.  Returns the carry digit.
/
/ Frame layout after the prologue:
/	-4(%ebp)	caller %ebx
/	-8(%ebp)	caller %esi
/	-12(%ebp)	value TEST_TS left in %ebx (entry %cr0 in the kernel)

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; reused for %cr0 and a
	pushl	%esi			/ callee-saved; carries the result
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ pushl leaves the flags untouched
	jnz	.addvec_no_save		/ TS was set: no save/restore needed
	pushl	%edi			/ %edi will address the save area
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ keep cy across the epilogue calls
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.addvec_rtn

.addvec_no_save:
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ keep cy across the epilogue calls

.addvec_rtn:
	emms
	popl	%ebx			/ value saved after TEST_TS
#if defined(_KERNEL)
	/ Writing %cr0 is privileged, and outside the kernel TEST_TS never
	/ read it in the first place, so only the kernel build writes it back.
	movl	%ebx, %cr0
#endif	/* _KERNEL */
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return (cy)
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)
721
/ uint32_t
/ big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ "No save" variant: moves the stack arguments into the registers that
/ big_mul_add_vec_sse2_r expects and returns its carry digit.  No MMX
/ state is saved or restored and no emms is executed here.

	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; the worker takes a in it
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_add_vec_sse2_r	/ %eax = cy
	popl	%ebx
	popl	%ebp			/ %esp equals %ebp again at this point
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)
735
736
737#else	/* !defined(MMX_MANAGE) */
738
/ uint32_t
/ big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Variant built without MMX_MANAGE: moves the stack arguments into the
/ registers that big_mul_add_vec_sse2_r expects, executes emms once the
/ worker is done, and returns the carry digit.

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; the worker takes a in it
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_add_vec_sse2_r	/ %eax = cy
	emms				/ the worker leaves emms to its caller
	popl	%ebx
	popl	%ebp			/ %esp equals %ebp again at this point
	ret
	SET_SIZE(big_mul_add_vec_sse2)
753
754#endif	/* MMX_MANAGE */
755
756
/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/ 	int i;
/
/ 	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/ 	for (i = 1; i < blen; ++i)
/ 		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }
/
/ With MMX_MANAGE defined this body is assembled as big_mul_vec_sse2_fc,
/ calls the _nsv variants of the two helpers and executes a single emms
/ on the way out.
/
/ Registers held across the helper calls (all callee-saved):
/	%ebx = r	%esi = a	%edi = alen	%ebp = i
/ Stack slots, relative to %esp whenever no call arguments are pushed:
/	16(%esp) = copy of blen		20(%esp) = copy of b

#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	subl	$8, %esp		/ two local words
	pushl	%ebx
	pushl	%ebp
	pushl	%esi
	pushl	%edi
	movl	40(%esp), %eax		/ %eax = b
	movl	%eax, 20(%esp)		/ local copy of b
	pushl	(%eax)			/ arg 4: b[0]
	movl	40(%esp), %edi		/ %edi = alen (offsets shift per push)
	pushl	%edi			/ arg 3: alen
	movl	40(%esp), %esi		/ %esi = a
	pushl	%esi			/ arg 2: a
	movl	40(%esp), %ebx		/ %ebx = r
	pushl	%ebx			/ arg 1: r
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$16, %esp		/ drop the four arguments
	movl	%eax, (%ebx,%edi,4)	/ r[alen] = cy
	movl	44(%esp), %eax		/ %eax = blen
	movl	%eax, 16(%esp)		/ local copy of blen
	cmpl	$1, %eax
	jle	.mulvec_done		/ blen <= 1: nothing left to add
	movl	$1, %ebp		/ i = 1

	.align 16
.mulvec_loop:
	movl	20(%esp), %eax		/ %eax = b
	pushl	(%eax,%ebp,4)		/ arg 4: b[i]
	pushl	%edi			/ arg 3: alen
	pushl	%esi			/ arg 2: a
	leal	(%ebx,%ebp,4), %eax
	pushl	%eax			/ arg 1: r + i
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$16, %esp		/ drop the four arguments
	leal	(%ebp,%edi), %ecx	/ %ecx = alen + i
	movl	%eax, (%ebx,%ecx,4)	/ r[alen + i] = cy
	incl	%ebp			/ ++i
	cmpl	16(%esp), %ebp
	jl	.mulvec_loop		/ while (i < blen)
.mulvec_done:
#if defined(MMX_MANAGE)
	emms
#endif
	popl	%edi
	popl	%esi
	popl	%ebp
	popl	%ebx
	addl	$8, %esp		/ release the two local words
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif
834
835#if defined(MMX_MANAGE)
836
/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/
/ MMX_MANAGE variant: brackets big_mul_vec_sse2_fc with KPREEMPT_DISABLE
/ and KPREEMPT_ENABLE and, when the TS bit was clear on entry, saves and
/ restores %mm0 .. %mm4 around it.
/
/ Frame layout:
/	-4(%ebp)	address of the MMX save area (save path only)
/	-8(%ebp)	value TEST_TS left in %eax (entry %cr0 in the kernel)
/	-12(%ebp)	caller %edi

	ENTRY(big_mul_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	subl	$8, %esp		/ two local words
	pushl	%edi			/ callee-saved; addresses the save area
	KPREEMPT_DISABLE
	TEST_TS(%eax)
	movl	%eax, -8(%ebp)		/ movl leaves the flags untouched
	jnz	.mulvec_no_save		/ TS was set: no save/restore needed
	SAVE_MMX_0TO4(%edi)
	movl	%edi, -4(%ebp)		/ remember where the registers went
.mulvec_no_save:
	movl	24(%ebp), %eax		/ blen
	pushl	%eax
	movl	20(%ebp), %eax		/ b
	pushl	%eax
	movl	16(%ebp), %eax		/ alen
	pushl	%eax
	movl	12(%ebp), %eax		/ a
	pushl	%eax
	movl	8(%ebp), %eax		/ r
	pushl	%eax
	call	big_mul_vec_sse2_fc
	addl	$20, %esp		/ drop the five arguments
	movl	-8(%ebp), %eax		/ value saved after TEST_TS
	testl	$CR0_TS, %eax
	jnz	.mulvec_no_rstr		/ nothing was saved on entry
	movl	-4(%ebp), %edi
	RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
#if defined(_KERNEL)
	/ Writing %cr0 is privileged, and outside the kernel TEST_TS never
	/ read it in the first place, so only the kernel build writes it back.
	movl	%eax, %cr0
#endif	/* _KERNEL */
	KPREEMPT_ENABLE
	popl	%edi
	leave
	ret
	SET_SIZE(big_mul_vec_sse2)
873
874#endif	/* MMX_MANAGE */
875
876
877
878#undef UNROLL
879#undef UNROLL32
880
881
882/ r = a * a, r and a are vectors of length len
883/ Suitable only for x86 models that support SSE2 instruction set extensions
884/
885/ This function is not suitable for a truly general-purpose multiprecision
886/ arithmetic library, because it does not work for "small" numbers, that is
887/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
888/ for any small numbers.
889
/ void
/ big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
/
/ With MMX_MANAGE defined this body is assembled as big_sqr_vec_sse2_fc
/ and sets up no frame of its own: the arguments are still read at
/ 8(%ebp), 12(%ebp) and 16(%ebp), so %ebp must already be the frame
/ pointer of the C-callable caller.
/
/ Outline:
/   1. r[1 .. alen-1] = a[0] * a[1 .. alen-1], r[alen] = carry
/   2. for i = 1 .. alen-2: add a[i] * a[i+1 .. alen-1] into r at
/      index 2*i + 1; the final single-product term is done inline
/   3. r[0] = lo32(a[0]**2), then double the triangle while adding the
/      squares of the remaining digits of a along the diagonal

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
#else
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
#endif

	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/ Step 1: first row of the triangle.

	movl	8(%ebp), %edi		/ %edi = r
	movl	12(%ebp), %esi		/ %esi = a
	movl	16(%ebp), %ecx		/ %ecx = alen
	movd	%ecx, %mm4		/ the workers leave %mm4 alone
	leal	4(%edi), %edx		/ worker r = &r[1]
	movl	%esi, %ebx
	movd	(%ebx), %mm3		/ worker digit = a[0]
	leal	4(%ebx), %ebx		/ worker a = &a[1]
	subl	$1, %ecx		/ worker len = alen - 1
	call	big_mul_set_vec_sse2_r
	movl	%edi, %edx
	movl	%esi, %ebx
	movd	%mm4, %ecx		/ %ecx = alen
	movl	%eax, (%edx, %ecx, 4)	/ r[alen] = cy

/	Step 2 in C-like pseudocode; cnt is kept in %mm4 across the call:
/
/	r = &arg_r[1];
/	a = &arg_a[1];
/	cnt = alen - 2;
/	while (cnt != 0) {
/		digit = *a;
/		++a;
/		r += 2;
/		if (cnt == 1)
/			break;		/* one product left: handled inline */
/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/		--cnt;
/	}

	leal	4(%edi), %edi		/ r = &r[1]
	leal	4(%esi), %esi		/ a = &a[1]
	movd	%mm4, %ecx
	subl	$2, %ecx		/ cnt = alen - 2
	movd	%ecx, %mm4		/ save cnt
	/ With cnt == 0 (alen == 2) the inline code below runs with the
	/ registers left over from step 1; see the header comment about
	/ small numbers.
	jecxz	.sqr_tri_last
.sqr_tri_loop:
	movd	(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a = &a[i + 1]
	leal	8(%edi), %edi		/ r = &r[2 * i + 1]
	movl	%edi, %edx		/ worker r
	movl	%esi, %ebx		/ worker a
	cmpl	$1, %ecx		/ a single product left?
	jz	.sqr_tri_last
	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = saved cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save cnt
	jmp	.sqr_tri_loop

.sqr_tri_last:
	/ Last triangle term: one product, its carry, and a zero top digit.
	movd	(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, (%edx)		/ r[2 * i + 1] = lo32(sum)
	psrlq	$32, %mm2		/ mm2 = hi32(sum)
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = carry
	pxor	%mm2, %mm2
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	/ Step 3: start over from the low end of both vectors.
	movl	8(%ebp), %edx		/ %edx = r
	movl	12(%ebp), %ebx		/ %ebx = a
	movl	16(%ebp), %ecx		/ %ecx = alen

	/ Low-order corner: r[0] = lo32(a[0]**2), cy = hi32(a[0]**2).
	movd	(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = a[0]**2
	movd	%mm2, (%edx)		/ r[0] = low half
	psrlq	$32, %mm2		/ mm2 = cy

	/ r[1] = lo32(2 * r[1] + cy), cy = hi32 of the same sum.
	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = 2 * r[1] + cy
	movd	%mm2, 4(%edx)		/ r[1] = low half
	psrlq	$32, %mm2		/ mm2 = cy

	/ Each pass squares the digit at 4(%ebx) and folds it, with the
	/ carry, into the doubled digits at 8(%edx) and 12(%edx); then r
	/ advances two digits and a one digit.  alen - 2 passes are made.
	subl	$2, %ecx		/ cnt = alen - 2
.sqr_diag_loop:
	movd	4(%ebx), %mm0		/ mm0 = diag = next digit of a
	pmuludq	%mm0, %mm0		/ mm0 = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = diag**2 + cy
	movd	%mm2, %eax
	movd	%eax, %mm1		/ mm1 = lo32(t), upper half cleared
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = even digit of r
	psllq	$1, %mm3		/ mm3 = 2 * even digit
	paddq	%mm3, %mm1		/ mm1 = 2 * even digit + lo32(t)
	movd	%mm1, 8(%edx)		/ store the low half back
	psrlq	$32, %mm1		/ mm1 = carry out of the even digit
	paddq	%mm1, %mm2		/ mm2 = hi32(t) + that carry

	movd	12(%edx), %mm3		/ mm3 = odd digit of r
	psllq	$1, %mm3		/ mm3 = 2 * odd digit
	paddq	%mm3, %mm2		/ mm2 = 2 * odd digit + carries
	movd	%mm2, 12(%edx)		/ store the low half back
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
	jnz	.sqr_diag_loop

	/ The carry from the last triangle term must take part in the
	/ doubling as well, but this step has no diagonal square paired
	/ with it: double the digits at 8(%edx) and 12(%edx), adding cy.
	movd	8(%edx), %mm3
	psllq	$1, %mm3		/ mm3 = 2 * digit
	paddq	%mm3, %mm2		/ mm2 = 2 * digit + cy
	movd	%mm2, 8(%edx)		/ store the low half back
	psrlq	$32, %mm2		/ mm2 = cy

	movd	12(%edx), %mm3
	psllq	$1, %mm3		/ mm3 = 2 * digit
	paddq	%mm3, %mm2		/ mm2 = 2 * digit + cy
	movd	%mm2, 12(%edx)		/ store the low half back
	psrlq	$32, %mm2		/ mm2 = cy

	/ High-order corner: add the square of the digit at 4(%ebx), plus
	/ cy, to the digit at 8(%edx) and carry into the one at 12(%edx).
	movd	4(%ebx), %mm0		/ mm0 = last digit of a
	movd	8(%edx), %mm3		/ mm3 = second-highest digit of r
	pmuludq	%mm0, %mm0		/ mm0 = p = square of that digit
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = t + second-highest digit
	movd	%mm2, 8(%edx)		/ store the low half back
	psrlq	$32, %mm2		/ mm2 = cy
	movd	12(%edx), %mm3
	paddq	%mm3, %mm2
	movd	%mm2, 12(%edx)		/ highest digit of r += cy

	emms
	popl	%esi
	popl	%edi
	popl	%ebx

#if defined(MMX_MANAGE)
	ret
	SET_SIZE(big_sqr_vec_sse2_fc)
#else
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
#endif
1081
1082
1083#if defined(MMX_MANAGE)
/ void
/ big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
/
/ MMX_MANAGE variant: brackets big_sqr_vec_sse2_fc with KPREEMPT_DISABLE
/ and KPREEMPT_ENABLE and, when the TS bit was clear on entry, saves and
/ restores %mm0 .. %mm4 around it.
/
/ No arguments are pushed for big_sqr_vec_sse2_fc: it builds no frame
/ of its own and reads r, a and len through the %ebp set up here.
/
/ Frame layout after the prologue:
/	-4(%ebp)	caller %ebx
/	-8(%ebp)	value TEST_TS left in %ebx (entry %cr0 in the kernel)

	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved, and TEST_TS overwrites it
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ pushl leaves the flags untouched
	jnz	.sqr_no_save		/ TS was set: no save/restore needed
	pushl	%edi			/ %edi will address the save area
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc	/ preserves %edi (pushes and pops it)
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%ebx			/ value saved after TEST_TS
#if defined(_KERNEL)
	/ Writing %cr0 is privileged, and outside the kernel TEST_TS never
	/ read it in the first place, so only the kernel build writes it back.
	movl	%ebx, %cr0
#endif	/* _KERNEL */
	KPREEMPT_ENABLE
	popl	%ebx			/ give the caller its %ebx back
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
1108
1109#endif	/* MMX_MANAGE */
1110
1111/ ------------------------------------------------------------------------
1112/		UMUL Implementations
1113/ ------------------------------------------------------------------------
1114
1115
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r = a * digit, where r and a are vectors of len 32-bit digits.
/ Returns the carry digit.
/
/ Fall-back implementation for x86 models that do not support the
/ PMULUDQ instruction: no MMX, SSE, or SSE2 instructions are used, only
/ the x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/
/ Arguments and the registers that hold them:
/	r	 8(%ebp)	%edi
/	a	12(%ebp)	%esi
/	len	16(%ebp)	%ecx
/	digit	20(%ebp)	(multiplied straight from the stack)
/ The running carry is kept in %ebx.

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi	/ r
	movl	12(%ebp), %esi	/ a
	movl	16(%ebp), %ecx	/ len
	xorl	%ebx, %ebx	/ cy = 0
	testl	%ecx, %ecx
	jz	.umul_set_done	/ len == 0: return (0)

.umul_set_loop:
	movl	(%esi), %eax	/ eax = a[i]
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	leal	4(%esi), %esi	/ ++a; leal keeps the flags out of it
	addl	%ebx, %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	leal	4(%edi), %edi	/ ++r
	movl	%edx, %ebx	/ cy = product[63..32]
	decl	%ecx		/ --len
	jnz	.umul_set_loop	/ while (len != 0)
.umul_set_done:
	movl	%ebx, %eax	/ return (cy)
	popl	%ebx
	popl	%edi
	popl	%esi
	popl	%ebp		/ %esp equals %ebp again at this point
	ret
	SET_SIZE(big_mul_set_vec_umul)
1163
1164
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r = r + a * digit, where r and a are vectors of len 32-bit digits.
/ Returns the carry digit.
/
/ Fall-back implementation for x86 models that do not support the
/ PMULUDQ instruction: no MMX, SSE, or SSE2 instructions are used, only
/ the x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/
/ Arguments and the registers that hold them:
/	r	 8(%ebp)	%edi
/	a	12(%ebp)	%esi
/	len	16(%ebp)	%ecx
/	digit	20(%ebp)	(multiplied straight from the stack)
/ The running carry is kept in %ebx.

	ENTRY(big_mul_add_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi	/ r
	movl	12(%ebp), %esi	/ a
	movl	16(%ebp), %ecx	/ len
	xorl	%ebx, %ebx	/ cy = 0
	testl	%ecx, %ecx
	jz	.umul_add_done	/ len == 0: return (0)
	.align 4
.umul_add_loop:
	movl	(%esi), %eax	/ eax = a[i]
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	leal	4(%esi), %esi	/ ++a; leal keeps the flags out of it
	addl	(%edi), %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i]
	addl	%ebx, %eax
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	leal	4(%edi), %edi	/ ++r
	movl	%edx, %ebx	/ cy = product[63..32]
	decl	%ecx		/ --len
	jnz	.umul_add_loop	/ while (len != 0)
.umul_add_done:
	movl	%ebx, %eax	/ return (cy)
	popl	%ebx
	popl	%edi
	popl	%esi
	popl	%ebp		/ %esp equals %ebp again at this point
	ret
	SET_SIZE(big_mul_add_vec_umul)
1214
1215#endif	/* __lint */
1216