1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#if !defined(__GNUC_AS__)
30
31#include <sys/asm_linkage.h>
32#include <sys/x86_archext.h>
33#include <sys/controlregs.h>
34
35#if defined(__lint)
36
37#include <sys/types.h>
38
39uint32_t
40bignum_use_sse2()
41{ return (0); }
42
43/* Not to be called by C code */
44/* ARGSUSED */
45uint32_t
46big_mul_set_vec_sse2_r()
47{ return (0); }
48
49/* Not to be called by C code */
50/* ARGSUSED */
51uint32_t
52big_mul_add_vec_sse2_r()
53{ return (0); }
54
55/* ARGSUSED */
56uint32_t
57big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
58{ return (0); }
59
60/* ARGSUSED */
61uint32_t
62big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
63{ return (0); }
64
65/* ARGSUSED */
66void
67big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
68{}
69
70/* ARGSUSED */
71void
72big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
73{}
74
75#if defined(MMX_MANAGE)
76
77/* ARGSUSED */
78uint32_t
79big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
80{ return (0); }
81
82/* ARGSUSED */
83uint32_t
84big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
85{ return (0); }
86
87/* Not to be called by C code */
88/* ARGSUSED */
89void
90big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
91{}
92
93#endif	/* MMX_MANAGE */
94
95/*
96 * UMUL
97 *
98 */
99
100/* ARGSUSED */
101uint32_t
102big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
103{ return (0); }
104
105/* ARGSUSED */
106uint32_t
107big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
108{ return (0); }
109
110#else	/* __lint */
111
112#if defined(MMX_MANAGE)
113
114#if defined(_KERNEL)
115
116#define	KPREEMPT_DISABLE call kpr_disable
117#define	KPREEMPT_ENABLE call kpr_enable
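/
/ TEST_TS(reg) saves %cr0 in reg, clears CR0.TS with clts, and leaves the
/ condition codes set from testing the saved CR0_TS bit.  The callers
/ below branch on that test: if TS was set, the MMX registers hold no
/ live state for the current context and are not saved; if TS was clear,
/ the caller saves and restores the MMX registers it clobbers.  The saved
/ %cr0 value is written back before KPREEMPT_ENABLE.
/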
118#define	TEST_TS(reg)					\
119	movl	%cr0, reg;				\
120	clts;						\
121	testl	$CR0_TS, reg
122
123#else	/* _KERNEL */
124
125#define	KPREEMPT_DISABLE
126#define	KPREEMPT_ENABLE
127
128#define	TEST_TS(reg)					\
129	movl	$0, reg;				\
130	testl	$CR0_TS, reg
131
132#endif	/* _KERNEL */
133
134#define	MMX_SIZE 8
135#define	MMX_ALIGN 8
136
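/ SAVE_MMX_PROLOG(sreg, nreg) reserves stack space for nreg 8-byte MMX
/ registers plus alignment slack and leaves in sreg a pointer into that
/ area rounded up to an MMX_ALIGN boundary.  SAVE_MMX_0TO4 and
/ RSTOR_MMX_0TO4 use it to preserve %mm0-%mm4 around the worker routines;
/ RSTOR_MMX_EPILOG releases the stack space again.
/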
137#define	SAVE_MMX_PROLOG(sreg, nreg)			\
138	subl	$[MMX_SIZE \* nreg + MMX_ALIGN], %esp;	\
139	movl	%esp, sreg;				\
140	addl	$MMX_ALIGN, sreg;			\
141	andl	$-1![MMX_ALIGN-1], sreg;
142
143#define	RSTOR_MMX_EPILOG(nreg)				\
144	addl	$[MMX_SIZE \* nreg + MMX_ALIGN], %esp;
145
146#define	SAVE_MMX_0TO4(sreg)			\
147	SAVE_MMX_PROLOG(sreg, 5);		\
148	movq	%mm0, 0(sreg);			\
149	movq	%mm1, 8(sreg);			\
150	movq	%mm2, 16(sreg);			\
151	movq	%mm3, 24(sreg);			\
152	movq	%mm4, 32(sreg)
153
154#define	RSTOR_MMX_0TO4(sreg)			\
155	movq	0(sreg), %mm0;			\
156	movq	8(sreg), %mm1;			\
157	movq	16(sreg), %mm2;			\
158	movq	24(sreg), %mm3;			\
159	movq	32(sreg), %mm4;			\
160	RSTOR_MMX_EPILOG(5)
161
162#endif	/* MMX_MANAGE */
163
164/ Note: this file contains implementations for
165/	big_mul_set_vec()
166/	big_mul_add_vec()
167/	big_mul_vec()
168/	big_sqr_vec()
169/ One set of implementations is for SSE2-capable models.
170/ The other uses no MMX, SSE, or SSE2 instructions, only
171/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
172/
173/ The code for the implementations is grouped by SSE2 vs UMUL,
174/ rather than grouping pairs of implementations for each function.
175/ This is because the bignum implementation gets "imprinted"
176/ on the correct implementation, at the time of first use,
177/ so none of the code for the other implementations is ever
178/ executed.  So, it is a no-brainer to lay out the code to minimize
179/ the "footprint" of executed code.
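/
/ Roughly, the C side of the bignum code selects an implementation once,
/ along these lines (a sketch only; the function-pointer names here are
/ illustrative, and the real selection logic lives in the common bignum
/ C code, not in this file):
/
/	extern uint32_t bignum_use_sse2(void);
/
/	static uint32_t (*mul_set_vec)(uint32_t *, uint32_t *, int, uint32_t);
/	static uint32_t (*mul_add_vec)(uint32_t *, uint32_t *, int, uint32_t);
/
/	static void
/	bignum_vec_init(void)
/	{
/		if (bignum_use_sse2() != 0) {
/			mul_set_vec = big_mul_set_vec_sse2;
/			mul_add_vec = big_mul_add_vec_sse2;
/		} else {
/			mul_set_vec = big_mul_set_vec_umul;
/			mul_add_vec = big_mul_add_vec_umul;
/		}
/	}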
180
181/ Can we use SSE2 instructions?  Return value is non-zero
182/ if we can.
183/
184/ Note:
185/   Using the cpuid instruction directly would work equally
186/   well in userland and in the kernel, but in the kernel we
187/   use the x86_feature variable rather than executing cpuid
188/   directly.  This means we honor any decisions the kernel
189/   startup code may have made in setting this variable,
190/   including disabling SSE2 because of settings in
191/   /etc/system.  It might even be a good idea to honor this
192/   kind of setting in userland as well, but the x86_feature
193/   variable is not readily available to userland processes.
194/
195/ uint32_t
196/ bignum_use_sse2()
197
198	ENTRY(bignum_use_sse2)
199#if defined(_KERNEL)
200	movl	x86_feature, %eax
201	andl	$X86_SSE2, %eax
202#else	/* _KERNEL */
203	pushl	%ebx
204	movl	$1, %eax		/ Get feature information
205	cpuid
206	movl	%edx, %eax		/ set return value
207	popl	%ebx
208	andl	$CPUID_INTC_EDX_SSE2, %eax
209#endif	/* _KERNEL */
210	ret
211	SET_SIZE(bignum_use_sse2)
212
213
214/ ------------------------------------------------------------------------
215/		SSE2 Implementations
216/ ------------------------------------------------------------------------
217
218/ r = a * digit, r and a are vectors of length len
219/ returns the carry digit
220/ Suitable only for x86 models that support SSE2 instruction set extensions
221/
222/ uint32_t
223/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
224/
225/ r	%edx
226/ a	%ebx
227/ len	%ecx
228/ digit	%mm3
229/
230/ Does not touch the following registers: %esi, %edi, %mm4
231/
232/ N.B.:
233/   This is strictly for internal use.
234/   The interface is very light-weight.
235/   All parameters are passed in registers.
236/   It does not conform to the SYSV x86 ABI.
237/   So, don't even think about calling this function directly from C code.
238/
239/ The basic multiply digit loop is unrolled 8 times.
240/ Each comment is preceded by an instance number.
241/ Instructions that have been moved retain their original, "natural"
242/ instance number.  It should be easier this way to follow
243/ the step-wise refinement process that went into constructing
244/ the final code.
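/
/ In C, ignoring the unrolling, the loop below computes the following
/ (a reference sketch only, not code that is assembled here):
/
/	uint32_t
/	big_mul_set_vec_c(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p;
/		uint32_t cy = 0;
/
/		while (len-- > 0) {
/			p = (uint64_t)digit * *a++ + cy;	/* 32 x 32 -> 64 */
/			*r++ = (uint32_t)p;			/* product[31..0] */
/			cy = (uint32_t)(p >> 32);		/* product[63..32] */
/		}
/		return (cy);
/	}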
245
246#define	UNROLL		8
247#define	UNROLL32	32
248
249	ENTRY(big_mul_set_vec_sse2_r)
250	xorl	%eax, %eax	/ if (len == 0) return (0);
251	testl	%ecx, %ecx
252	jz	.L17
253
254	pxor	%mm0, %mm0	/ cy = 0
255
256.L15:
257	cmpl	$UNROLL, %ecx
258	jl	.L16
259	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
260	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
261	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
262	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
263	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
264	psrlq	$32, %mm0	/ 1: cy = product[63..32]
265
266	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
267	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
268	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
269	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
270	psrlq	$32, %mm0	/ 2: cy = product[63..32]
271
272	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
273	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
274	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
275	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
276	psrlq	$32, %mm0	/ 3: cy = product[63..32]
277
278	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
279	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
280	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
281	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
282	psrlq	$32, %mm0	/ 4: cy = product[63..32]
283
284	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
285	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
286	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
287	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
288	psrlq	$32, %mm0	/ 5: cy = product[63..32]
289
290	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
291	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
292	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
293	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
294	psrlq	$32, %mm0	/ 6: cy = product[63..32]
295
296	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
297	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
298	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
299	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
300	psrlq	$32, %mm0	/ 7: cy = product[63..32]
301
302	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
303	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
304	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
305	psrlq	$32, %mm0	/ 8: cy = product[63..32]
306
307	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
308	leal	UNROLL32(%edx), %edx	/ r += UNROLL
309	subl	$UNROLL, %ecx		/ len -= UNROLL
310	jz	.L17
311	jmp	.L15
312
313.L16:
314	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
315	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
316	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
317	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
318	psrlq	$32, %mm0	/ 1: cy = product[63..32]
319	subl	$1, %ecx
320	jz	.L17
321
322	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
323	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
324	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
325	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
326	psrlq	$32, %mm0	/ 2: cy = product[63..32]
327	subl	$1, %ecx
328	jz	.L17
329
330	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
331	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
332	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
333	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
334	psrlq	$32, %mm0	/ 3: cy = product[63..32]
335	subl	$1, %ecx
336	jz	.L17
337
338	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
339	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
340	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
341	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
342	psrlq	$32, %mm0	/ 4: cy = product[63..32]
343	subl	$1, %ecx
344	jz	.L17
345
346	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
347	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
348	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
349	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
350	psrlq	$32, %mm0	/ 5: cy = product[63..32]
351	subl	$1, %ecx
352	jz	.L17
353
354	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
355	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
356	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
357	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
358	psrlq	$32, %mm0	/ 6: cy = product[63..32]
359	subl	$1, %ecx
360	jz	.L17
361
362	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
363	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
364	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
365	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
366	psrlq	$32, %mm0	/ 7: cy = product[63..32]
367
368.L17:
369	movd	%mm0, %eax	/ return (cy)
370	/ no emms.  caller is responsible for emms
371	ret
372	SET_SIZE(big_mul_set_vec_sse2_r)
373
374
375/ r = a * digit, r and a are vectors of length len
376/ returns the carry digit
377/ Suitable only for x86 models that support SSE2 instruction set extensions
378/
379/ r		 8(%ebp)	%edx
380/ a		12(%ebp)	%ebx
381/ len		16(%ebp)	%ecx
382/ digit		20(%ebp)	%mm3
383/
384/ In userland, there is just the one function, big_mul_set_vec_sse2().
385/ But in the kernel, there are two variations:
386/    1. big_mul_set_vec_sse2() which does what is necessary to save and
387/       restore state, if needed, and to ensure that preemption is
388/       disabled.
389/    2. big_mul_set_vec_sse2_nsv() which just does the work;
390/       it is the caller's responsibility to ensure that MMX state
391/       does not need to be saved and restored and that preemption
392/       is already disabled.
393
394#if defined(MMX_MANAGE)
395	ENTRY(big_mul_set_vec_sse2)
396	pushl	%ebp
397	movl	%esp, %ebp
398	pushl	%ebx
399	pushl	%esi
400	KPREEMPT_DISABLE
401	TEST_TS(%ebx)
402	pushl	%ebx
403	jnz	.setvec_no_save
404	pushl	%edi
405	SAVE_MMX_0TO4(%edi)
406	movl	8(%ebp), %edx
407	movl	12(%ebp), %ebx
408	movl	16(%ebp), %ecx
409	movd	20(%ebp), %mm3
410	call	big_mul_set_vec_sse2_r
411	movl	%eax, %esi
412	RSTOR_MMX_0TO4(%edi)
413	popl	%edi
414	jmp	.setvec_rtn
415
416.setvec_no_save:
417	movl	8(%ebp), %edx
418	movl	12(%ebp), %ebx
419	movl	16(%ebp), %ecx
420	movd	20(%ebp), %mm3
421	call	big_mul_set_vec_sse2_r
422	movl	%eax, %esi
423
424.setvec_rtn:
425	emms
426	popl	%ebx
427	movl	%ebx, %cr0
428	KPREEMPT_ENABLE
429	movl	%esi, %eax
430	popl	%esi
431	popl	%ebx
432	leave
433	ret
434	SET_SIZE(big_mul_set_vec_sse2)
435
436	ENTRY(big_mul_set_vec_sse2_nsv)
437	pushl	%ebp
438	movl	%esp, %ebp
439	pushl	%ebx
440	movl	8(%ebp), %edx
441	movl	12(%ebp), %ebx
442	movl	16(%ebp), %ecx
443	movd	20(%ebp), %mm3
444	call	big_mul_set_vec_sse2_r
445	popl	%ebx
446	leave
447	ret
448	SET_SIZE(big_mul_set_vec_sse2_nsv)
449
450#else	/* !defined(MMX_MANAGE) */
451
452/ r = a * digit, r and a are vectors of length len
453/ returns the carry digit
454/ Suitable only for x86 models that support SSE2 instruction set extensions
455/
456/ r		 8(%ebp)	%edx
457/ a		12(%ebp)	%ebx
458/ len		16(%ebp)	%ecx
459/ digit		20(%ebp)	%mm3
460
461	ENTRY(big_mul_set_vec_sse2)
462	pushl	%ebp
463	movl	%esp, %ebp
464	pushl	%ebx
465	movl	8(%ebp), %edx
466	movl	12(%ebp), %ebx
467	movl	16(%ebp), %ecx
468	movd	20(%ebp), %mm3
469	call	big_mul_set_vec_sse2_r
470	popl	%ebx
471	emms
472	leave
473	ret
474	SET_SIZE(big_mul_set_vec_sse2)
475
476#endif	/* MMX_MANAGE */
477
478
479/ r = r + a * digit, r and a are vectors of length len
480/ returns the carry digit
481/ Suitable only for x86 models that support SSE2 instruction set extensions
482/
483/ uint32_t
484/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
485/
486/ r	%edx
487/ a	%ebx
488/ len	%ecx
489/ digit	%mm3
490/
491/ N.B.:
492/   This is strictly for internal use.
493/   The interface is very light-weight.
494/   All parameters are passed in registers.
495/   It does not conform to the SYSV x86 ABI.
496/   So, don't even think about calling this function directly from C code.
497/
498/ The basic multiply digit loop is unrolled 8 times.
499/ Each comment is preceded by an instance number.
500/ Instructions that have been moved retain their original, "natural"
501/ instance number.  It should be easier this way to follow
502/ the step-wise refinement process that went into constructing
503/ the final code.
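/
/ In C, ignoring the unrolling, the loop below computes the following
/ (a reference sketch only; it differs from big_mul_set_vec only in
/ adding the old r[i] into each partial product):
/
/	uint32_t
/	big_mul_add_vec_c(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p;
/		uint32_t cy = 0;
/
/		while (len-- > 0) {
/			p = (uint64_t)digit * *a++ + *r + cy;
/			*r++ = (uint32_t)p;			/* product[31..0] */
/			cy = (uint32_t)(p >> 32);		/* product[63..32] */
/		}
/		return (cy);
/	}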
504
505	ENTRY(big_mul_add_vec_sse2_r)
506	xorl	%eax, %eax
507	testl	%ecx, %ecx
508	jz	.L27
509
510	pxor	%mm0, %mm0	/ cy = 0
511
512.L25:
513	cmpl	$UNROLL, %ecx
514	jl	.L26
515	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
516	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
517	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
518	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
519	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
520	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
521	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
522	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
523	psrlq	$32, %mm0	/ 1: cy = product[63..32]
524
525	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
526	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
527	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
528	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
529	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
530	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
531	psrlq	$32, %mm0	/ 2: cy = product[63..32]
532
533	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
534	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
535	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
536	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
537	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
538	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
539	psrlq	$32, %mm0	/ 3: cy = product[63..32]
540
541	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
542	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
543	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
544	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
545	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
546	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
547	psrlq	$32, %mm0	/ 4: cy = product[63..32]
548
549	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
550	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
551	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
552	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
553	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
554	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
555	psrlq	$32, %mm0	/ 5: cy = product[63..32]
556
557	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
558	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
559	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
560	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
561	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
562	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
563	psrlq	$32, %mm0	/ 6: cy = product[63..32]
564
565	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
566	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
567	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
568	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
569	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
570	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
571	psrlq	$32, %mm0	/ 7: cy = product[63..32]
572
573	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
574	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
575	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
576	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
577	psrlq	$32, %mm0	/ 8: cy = product[63..32]
578
579	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
580	leal	UNROLL32(%edx), %edx	/ r += UNROLL
581	subl	$UNROLL, %ecx		/ len -= UNROLL
582	jz	.L27
583	jmp	.L25
584
585.L26:
586	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
587	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
588	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
589	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
590	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
591	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
592	psrlq	$32, %mm0	/ 1: cy = product[63..32]
593	subl	$1, %ecx
594	jz	.L27
595
596	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
597	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
598	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
599	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
600	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
601	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
602	psrlq	$32, %mm0	/ 2: cy = product[63..32]
603	subl	$1, %ecx
604	jz	.L27
605
606	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
607	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
608	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
609	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
610	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
611	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
612	psrlq	$32, %mm0	/ 3: cy = product[63..32]
613	subl	$1, %ecx
614	jz	.L27
615
616	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
617	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
618	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
619	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
620	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
621	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
622	psrlq	$32, %mm0	/ 4: cy = product[63..32]
623	subl	$1, %ecx
624	jz	.L27
625
626	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
627	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
628	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
629	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
630	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
631	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
632	psrlq	$32, %mm0	/ 5: cy = product[63..32]
633	subl	$1, %ecx
634	jz	.L27
635
636	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
637	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
638	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
639	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
640	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
641	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
642	psrlq	$32, %mm0	/ 6: cy = product[63..32]
643	subl	$1, %ecx
644	jz	.L27
645
646	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
647	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
648	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
649	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
650	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
651	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
652	psrlq	$32, %mm0	/ 7: cy = product[63..32]
653
654.L27:
655	movd	%mm0, %eax
656	/ no emms.  caller is responsible for emms
657	ret
658	SET_SIZE(big_mul_add_vec_sse2_r)
659
660
661/ r = r + a * digit, r and a are vectors of length len
662/ returns the carry digit
663/ Suitable only for x86 models that support SSE2 instruction set extensions
664/
665/ r		 8(%ebp)	%edx
666/ a		12(%ebp)	%ebx
667/ len		16(%ebp)	%ecx
668/ digit		20(%ebp)	%mm3
669/
670/ In userland, there is just the one function, big_mul_add_vec_sse2().
671/ But in the kernel, there are two variations:
672/    1. big_mul_add_vec_sse2() which does what is necessary to save and
673/       restore state, if needed, and to ensure that preemption is
674/       disabled.
675/    2. big_mul_add_vec_sse2_nsv() which just does the work;
676/       it is the caller's responsibility to ensure that MMX state
677/       does not need to be saved and restored and that preemption
678/       is already disabled.
679
680
681#if defined(MMX_MANAGE)
682
683	ENTRY(big_mul_add_vec_sse2)
684	pushl	%ebp
685	movl	%esp, %ebp
686	pushl	%ebx
687	pushl	%esi
688	KPREEMPT_DISABLE
689	TEST_TS(%ebx)
690	pushl	%ebx
691	jnz	.addvec_no_save
692	pushl	%edi
693	SAVE_MMX_0TO4(%edi)
694	movl	8(%ebp), %edx
695	movl	12(%ebp), %ebx
696	movl	16(%ebp), %ecx
697	movd	20(%ebp), %mm3
698	call	big_mul_add_vec_sse2_r
699	movl	%eax, %esi
700	RSTOR_MMX_0TO4(%edi)
701	popl	%edi
702	jmp	.addvec_rtn
703
704.addvec_no_save:
705	movl	8(%ebp), %edx
706	movl	12(%ebp), %ebx
707	movl	16(%ebp), %ecx
708	movd	20(%ebp), %mm3
709	call	big_mul_add_vec_sse2_r
710	movl	%eax, %esi
711
712.addvec_rtn:
713	emms
714	popl	%ebx
715	movl	%ebx, %cr0
716	KPREEMPT_ENABLE
717	movl	%esi, %eax
718	popl	%esi
719	popl	%ebx
720	leave
721	ret
722	SET_SIZE(big_mul_add_vec_sse2)
723
724	ENTRY(big_mul_add_vec_sse2_nsv)
725	pushl	%ebp
726	movl	%esp, %ebp
727	pushl	%ebx
728	movl	8(%ebp), %edx
729	movl	12(%ebp), %ebx
730	movl	16(%ebp), %ecx
731	movd	20(%ebp), %mm3
732	call	big_mul_add_vec_sse2_r
733	popl	%ebx
734	leave
735	ret
736	SET_SIZE(big_mul_add_vec_sse2_nsv)
737
738
739#else	/* !defined(MMX_MANAGE) */
740
741	ENTRY(big_mul_add_vec_sse2)
742	pushl	%ebp
743	movl	%esp, %ebp
744	pushl	%ebx
745	movl	8(%ebp), %edx
746	movl	12(%ebp), %ebx
747	movl	16(%ebp), %ecx
748	movd	20(%ebp), %mm3
749	call	big_mul_add_vec_sse2_r
750	popl	%ebx
751	emms
752	leave
753	ret
754	SET_SIZE(big_mul_add_vec_sse2)
755
756#endif	/* MMX_MANAGE */
757
758
759/ void
760/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
761/ {
762/ 	int i;
763/
764/ 	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
765/ 	for (i = 1; i < blen; ++i)
766/ 		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
767/ }
768
769
770#if defined(MMX_MANAGE)
771	ENTRY(big_mul_vec_sse2_fc)
772#else
773	ENTRY(big_mul_vec_sse2)
774#endif
775	subl	$0x8, %esp
776	pushl	%ebx
777	pushl	%ebp
778	pushl	%esi
779	pushl	%edi
780	movl	40(%esp), %eax
781	movl	%eax, 20(%esp)
782	pushl	(%eax)
783	movl	40(%esp), %edi
784	pushl	%edi
785	movl	40(%esp), %esi
786	pushl	%esi
787	movl	40(%esp), %ebx
788	pushl	%ebx
789#if defined(MMX_MANAGE)
790	call	big_mul_set_vec_sse2_nsv
791#else
792	call	big_mul_set_vec_sse2
793#endif
794	addl	$0x10, %esp
795	movl	%eax, (%ebx,%edi,4)
796	movl	44(%esp), %eax
797	movl	%eax, 16(%esp)
798	cmpl	$0x1, %eax
799	jle	.mulvec_rtn
800	movl	$0x1, %ebp
801
802	.zalign 16,8
803.mulvec_add:
804	movl	20(%esp), %eax
805	pushl	(%eax,%ebp,4)
806	pushl	%edi
807	pushl	%esi
808	leal	(%ebx,%ebp,4), %eax
809	pushl	%eax
810#if defined(MMX_MANAGE)
811	call	big_mul_add_vec_sse2_nsv
812#else
813	call	big_mul_add_vec_sse2
814#endif
815	addl	$0x10, %esp
816	leal	(%ebp,%edi), %ecx
817	movl	%eax, (%ebx,%ecx,4)
818	incl	%ebp
819	cmpl	16(%esp), %ebp
820	jl	.mulvec_add
821.mulvec_rtn:
822#if defined(MMX_MANAGE)
823	emms
824#endif
825	popl	%edi
826	popl	%esi
827	popl	%ebp
828	popl	%ebx
829	addl	$0x8, %esp
830	ret
831#if defined(MMX_MANAGE)
832	SET_SIZE(big_mul_vec_sse2_fc)
833#else
834	SET_SIZE(big_mul_vec_sse2)
835#endif
836
837#if defined(MMX_MANAGE)
838
839	ENTRY(big_mul_vec_sse2)
840	pushl	%ebp
841	movl	%esp, %ebp
842	subl	$8, %esp
843	pushl	%edi
844	KPREEMPT_DISABLE
845	TEST_TS(%eax)
846	movl	%eax, -8(%ebp)
847	jnz	.mulvec_no_save
848	SAVE_MMX_0TO4(%edi)
849	movl	%edi, -4(%ebp)
850.mulvec_no_save:
851	movl	24(%ebp), %eax		/ blen
852	pushl	%eax
853	movl	20(%ebp), %eax		/ b
854	pushl	%eax
855	movl	16(%ebp), %eax		/ alen
856	pushl	%eax
857	movl	12(%ebp), %eax		/ a
858	pushl	%eax
859	movl	8(%ebp), %eax		/ r
860	pushl	%eax
861	call	big_mul_vec_sse2_fc
862	addl	$20, %esp
863	movl	-8(%ebp), %eax
864	testl	$CR0_TS, %eax
865	jnz	.mulvec_no_rstr
866	movl	-4(%ebp), %edi
867	RSTOR_MMX_0TO4(%edi)
868.mulvec_no_rstr:
869	movl	%eax, %cr0
870	KPREEMPT_ENABLE
871	popl	%edi
872	leave
873	ret
874	SET_SIZE(big_mul_vec_sse2)
875
876#endif	/* MMX_MANAGE */
877
878
879
880#undef UNROLL
881#undef UNROLL32
882
883
884/ r = a * a; a is a vector of length len, r is a vector of length 2 * len
885/ Suitable only for x86 models that support SSE2 instruction set extensions
886/
887/ This function is not suitable for a truly general-purpose multiprecision
888/ arithmetic library, because it does not work for "small" numbers, that is,
889/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
890/ for any small numbers.
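/
/ For reference, the result is the same as that of the general multiply
/ applied to (a, a) -- the sketch below relies only on that equivalence.
/ The code that follows instead computes each off-diagonal product
/ a[i] * a[j], i < j, once, doubles it, and adds in the diagonal squares
/ a[i]**2, roughly halving the number of single-digit multiplies:
/
/	void
/	big_sqr_vec_c(uint32_t *r, uint32_t *a, int len)
/	{
/		int i;
/
/		r[len] = big_mul_set_vec_sse2(r, a, len, a[0]);
/		for (i = 1; i < len; ++i)
/			r[len + i] = big_mul_add_vec_sse2(r + i, a, len, a[i]);
/	}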
891
892#if defined(MMX_MANAGE)
893	ENTRY(big_sqr_vec_sse2_fc)
894#else
895	ENTRY(big_sqr_vec_sse2)
896	pushl	%ebp
897	movl	%esp, %ebp
898#endif
899
900	pushl	%ebx
901	pushl	%edi
902	pushl	%esi
903
904	/ r[1..alen] = a[0] * a[1..alen-1]
905
906	movl	8(%ebp), %edi		/ r = arg(r)
907	movl	12(%ebp), %esi		/ a = arg(a)
908	movl	16(%ebp), %ecx		/ cnt = arg(alen)
909	movd	%ecx, %mm4		/ save_cnt = arg(alen)
910	leal	4(%edi), %edx		/ dst = &r[1]
911	movl	%esi, %ebx		/ src = a
912	movd	0(%ebx), %mm3		/ mm3 = a[0]
913	leal	4(%ebx), %ebx		/ src = &a[1]
914	subl	$1, %ecx		/ --cnt
915	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
916	movl	%edi, %edx		/ dst = r
917	movl	%esi, %ebx		/ src = a
918	movd	%mm4, %ecx		/ cnt = save_cnt
919	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy
920
921/	/* High-level vector C pseudocode */
922/	for (i = 1; i < alen-1; ++i)
923/		r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
924/
925/	/* Same thing, but slightly lower level C-like pseudocode */
926/	i = 1;
927/	r = &arg_r[2*i + 1];
928/	a = &arg_a[i + 1];
929/	digit = arg_a[i];
930/	cnt = alen - 3;
931/	while (cnt != 0) {
932/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
933/		r += 2;
934/		++a;
935/		--cnt;
936/	}
937/
938/	/* Same thing, but even lower level
939/	 * For example, pointers are raw pointers,
940/	 * with no scaling by object size.
941/	 */
942/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3;  4*3 == 12; */
943/	a = arg_a + 8;
944/	digit = *(arg_a + 4);
945/	cnt = alen - 3;
946/	while (cnt != 0) {
947/		cy = big_mul_add_vec_sse2_r();
948/		*(r + 4 * cnt) = cy;
949/		r += 8;
950/		a += 4;
951/		--cnt;
952/	}
953
954	leal	4(%edi), %edi		/ r += 4; r = &r[1]
955	leal	4(%esi), %esi		/ a += 4; a = &a[1]
956	movd	%mm4, %ecx		/ cnt = save
957	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
958	movd	%ecx, %mm4		/ save_cnt
959	jecxz	.L32			/ while (cnt != 0) {
960.L31:
961	movd	0(%esi), %mm3		/ digit = a[i]
962	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
963	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
964	movl	%edi, %edx		/ edx = r
965	movl	%esi, %ebx		/ ebx = a
966	cmp	$1, %ecx		/ The last triangle term is special
967	jz	.L32
968	call	big_mul_add_vec_sse2_r
969	movd	%mm4, %ecx		/ cnt = save_cnt
970	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
971	subl	$1, %ecx		/ --cnt
972	movd	%ecx, %mm4		/ save_cnt = cnt
973	jmp	.L31			/ }
974
975.L32:
976	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
977	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
978	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
979	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
980	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
981	psrlq	$32, %mm2		/ mm2 = cy
982	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
983	pxor	%mm2, %mm2
984	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0
985
986	movl	8(%ebp), %edx		/ r = arg(r)
987	movl	12(%ebp), %ebx		/ a = arg(a)
988	movl	16(%ebp), %ecx		/ cnt = arg(alen)
989
990	/ compute low-order corner
991	/ p = a[0]**2
992	/ r[0] = lo32(p)
993	/ cy   = hi32(p)
994	movd	0(%ebx), %mm2		/ mm2 = a[0]
995	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
996	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
997	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)
998
999	/ p = 2 * r[1]
1000	/ t = p + cy
1001	/ r[1] = lo32(t)
1002	/ cy   = hi32(t)
1003	movd	4(%edx), %mm1		/ mm1 = r[1]
1004	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
1005	paddq	%mm1, %mm2		/ mm2 = t = p + cy
1006	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
1007	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)
1008
1009	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
1010	subl	$2, %ecx		/ cnt = alen - 2
1011.L34:
1012	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
1013	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
1014	paddq	%mm0, %mm2		/ mm2 = t = p + cy
1015	movd	%mm2, %eax
1016	movd	%eax, %mm1		/ mm1 = lo32(t)
1017	psrlq	$32, %mm2		/ mm2 = hi32(t)
1018
1019	movd	8(%edx), %mm3		/ mm3 = r[2*i]
1020	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
1021	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
1022	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
1023	psrlq	$32, %mm1
1024	paddq	%mm1, %mm2
1025
1026	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
1027	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
1028	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
1029	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
1030	psrlq	$32, %mm2		/ mm2 = cy
1031	leal	8(%edx), %edx		/ r += 2
1032	leal	4(%ebx), %ebx		/ ++a
1033	subl	$1, %ecx		/ --cnt
1034	jnz	.L34
1035
1036	/ Carry from last triangle term must participate in doubling,
1037	/ but this step isn't paired up with squaring an element
1038	/ of the inner diagonal.
1039	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
1040	movd	8(%edx), %mm3		/ mm3 = r[2*i]
1041	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
1042	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
1043	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
1044	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)
1045
1046	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
1047	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
1048	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
1049	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
1050	psrlq	$32, %mm2		/ mm2 = cy
1051
1052	/ compute high-order corner and add it in
1053	/ p = a[alen - 1]**2
1054	/ t = p + cy
1055	/ r[alen + alen - 2] += lo32(t)
1056	/ cy = hi32(t)
1057	/ r[alen + alen - 1] = cy
1058	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
1059	movd	8(%edx), %mm3		/ mm3 = r[$-2]
1060	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
1061	paddq	%mm0, %mm2		/ mm2 = t = p + cy
1062	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
1063	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
1064	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
1065	movd	12(%edx), %mm3
1066	paddq	%mm3, %mm2
1067	movd	%mm2, 12(%edx)		/ r[$-1] += cy
1068
1069.L35:
1070	emms
1071	popl	%esi
1072	popl	%edi
1073	popl	%ebx
1074
1075#if defined(MMX_MANAGE)
1076	ret
1077	SET_SIZE(big_sqr_vec_sse2_fc)
1078#else
1079	leave
1080	ret
1081	SET_SIZE(big_sqr_vec_sse2)
1082#endif
1083
1084
1085#if defined(MMX_MANAGE)
1086	ENTRY(big_sqr_vec_sse2)
1087	pushl	%ebp
1088	movl	%esp, %ebp
1089	KPREEMPT_DISABLE
1090	TEST_TS(%ebx)
1091	pushl	%ebx
1092	jnz	.sqr_no_save
1093	pushl	%edi
1094	SAVE_MMX_0TO4(%edi)
1095	call	big_sqr_vec_sse2_fc
1096	RSTOR_MMX_0TO4(%edi)
1097	popl	%edi
1098	jmp	.sqr_rtn
1099
1100.sqr_no_save:
1101	call	big_sqr_vec_sse2_fc
1102
1103.sqr_rtn:
1104	popl	%ebx
1105	movl	%ebx, %cr0
1106	KPREEMPT_ENABLE
1107	leave
1108	ret
1109	SET_SIZE(big_sqr_vec_sse2)
1110
1111#endif	/* MMX_MANAGE */
1112
1113/ ------------------------------------------------------------------------
1114/		UMUL Implementations
1115/ ------------------------------------------------------------------------
1116
1117
1118/ r = a * digit, r and a are vectors of length len
1119/ returns the carry digit
1120/ Does not use any MMX, SSE, or SSE2 instructions.
1121/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1122/ This is a fall-back implementation for x86 models that do not support
1123/ the PMULUDQ instruction.
1124/
1125/ uint32_t
1126/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1127/
1128/ r		 8(%ebp)	%edx	%edi
1129/ a		12(%ebp)	%ebx	%esi
1130/ len		16(%ebp)	%ecx
1131/ digit		20(%ebp)	%esi
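/
/ The recurrence is the same one sketched in C for the SSE2 version above:
/ p = (uint64_t)digit * a[i] + cy, with MUL leaving the 64-bit product in
/ edx:eax.  big_mul_add_vec_umul() below differs only in also adding the
/ old r[i] into the product before the carry is extracted.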
1132
1133	ENTRY(big_mul_set_vec_umul)
1134	pushl	%ebp
1135	movl	%esp, %ebp
1136	pushl	%esi
1137	pushl	%edi
1138	pushl	%ebx
1139	movl	16(%ebp), %ecx
1140	xorl	%ebx, %ebx	/ cy = 0
1141	testl	%ecx, %ecx
1142	movl	8(%ebp), %edi
1143	movl	12(%ebp), %esi
1144	je	.L57
1145
1146.L55:
1147	movl	(%esi), %eax	/ eax = a[i]
1148	leal	4(%esi), %esi	/ ++a
1149	mull	20(%ebp)	/ edx:eax = a[i] * digit
1150	addl	%ebx, %eax
1151	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
1152	movl	%eax, (%edi)	/ r[i] = product[31..0]
1153	movl	%edx, %ebx	/ cy = product[63..32]
1154	leal	4(%edi), %edi	/ ++r
1155	decl	%ecx		/ --len
1156	jnz	.L55		/ while (len != 0)
1157.L57:
1158	movl	%ebx, %eax
1159	popl	%ebx
1160	popl	%edi
1161	popl	%esi
1162	leave
1163	ret
1164	SET_SIZE(big_mul_set_vec_umul)
1165
1166
1167/ r = r + a * digit, r and a are vectors of length len
1168/ returns the carry digit
1169/ Does not use any MMX, SSE, or SSE2 instructions.
1170/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1171/ This is a fall-back implementation for x86 models that do not support
1172/ the PMULUDQ instruction.
1173/
1174/ uint32_t
1175/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1176/
1177/ r		 8(%ebp)	%edx	%edi
1178/ a		12(%ebp)	%ebx	%esi
1179/ len		16(%ebp)	%ecx
1180/ digit		20(%ebp)	%esi
1181
1182	ENTRY(big_mul_add_vec_umul)
1183	pushl	%ebp
1184	movl	%esp, %ebp
1185	pushl	%esi
1186	pushl	%edi
1187	pushl	%ebx
1188	movl	16(%ebp), %ecx
1189	xorl	%ebx, %ebx	/ cy = 0
1190	testl	%ecx, %ecx
1191	movl	8(%ebp), %edi
1192	movl	12(%ebp), %esi
1193	je	.L67
1194	.align 4
1195.L65:
1196	movl	(%esi), %eax	/ eax = a[i]
1197	leal	4(%esi), %esi	/ ++a
1198	mull	20(%ebp)	/ edx:eax = a[i] * digit
1199	addl	(%edi), %eax
1200	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i]
1201	addl	%ebx, %eax
1202	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
1203	movl	%eax, (%edi)	/ r[i] = product[31..0]
1204	movl	%edx, %ebx	/ cy = product[63..32]
1205	leal	4(%edi), %edi	/ ++r
1206	decl	%ecx		/ --len
1207	jnz	.L65		/ while (len != 0)
1208.L67:
1209	movl	%ebx, %eax
1210	popl	%ebx
1211	popl	%edi
1212	popl	%esi
1213	leave
1214	ret
1215	SET_SIZE(big_mul_add_vec_umul)
1216
1217#endif	/* __lint */
1218
1219#endif	/* !__GNUC_AS__ */
1220