xref: /titanic_50/usr/src/common/bignum/amd64/bignum_amd64_asm.s (revision 8eea8e29cc4374d1ee24c25a07f45af132db3499)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/asm_linkage.h>
30
31#if defined(lint) || defined(__lint)
32
33#include <sys/types.h>
34
/* ARGSUSED */
uint64_t
big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	/* lint-only stand-in: gives lint a prototype and a constant result */
	return (0);
}
39
/* ARGSUSED */
uint64_t
big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	/* lint-only stand-in: gives lint a prototype and a constant result */
	return (0);
}
44
/* ARGSUSED */
void
big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
{
	/* lint-only stand-in: intentionally does nothing */
}
49
50#else	/* lint */
51
52/ ------------------------------------------------------------------------
53/
54/  Implementation of big_mul_set_vec which exploits
55/  the 64X64->128 bit  unsigned multiply instruction.
56/
57/  As defined in Sun's bignum library for pkcs11, bignums are
58/  composed of an array of 32-bit "digits" along with descriptive
59/  information.  The arrays of digits are only required to be
60/  aligned on 32-bit boundary.  This implementation works only
61/  when the two factors and the result happen to be 64 bit aligned
62/  and have an even number of digits.
63/
64/ ------------------------------------------------------------------------
65
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ On entry: %rdi = r, %rsi = a, %edx = len, %rcx = digit.
/ Inside:   %r8  = digits still to do
/           %r9  = cy, the running carry (returned in %rax)
/           %r11 = a[i+1], loaded ahead of the multiply
/ mulq overwrites %rdx:%rax, which is why len is moved to %r8.
/ Leaf routine: only %rax, %rdx, %rsi, %rdi, %r8, %r9, %r11 are written.
/ A len <= 0 stores nothing and returns 0.
/
	ENTRY(big_mul_set_vec64)
	xorq	%r9, %r9		/ cy = 0; .L17 returns %r9 on every path
	movslq	%edx, %r8		/ r8 = len; len is an int, so only %edx is defined
	testq	%r8, %r8
	jle	.L17			/ if (len <= 0) return (0)

.L15:
	cmpq	$8, %r8			/ fewer than 8 digits left?
	jb	.L16			/ yes: finish them one at a time
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L17			/ nothing left: return cy
	jmp	.L15

/ Tail: between 1 and 7 digits remain, handled without prefetching so
/ that no digit beyond a[len - 1] is ever read.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	/ Seventh tail digit: it is always the last one, so fall through.
	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec64)
223
224/ ------------------------------------------------------------------------
225/
226/  Implementation of big_mul_add_vec which exploits
227/  the 64X64->128 bit  unsigned multiply instruction.
228/
229/  As defined in Sun's bignum library for pkcs11, bignums are
230/  composed of an array of 32-bit "digits" along with descriptive
231/  information.  The arrays of digits are only required to be
232/  aligned on 32-bit boundary.  This implementation works only
233/  when the two factors and the result happen to be 64 bit aligned
234/  and have an even number of digits.
235/
236/ ------------------------------------------------------------------------
237
/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ On entry: %rdi = r, %rsi = a, %edx = len, %rcx = digit.
/ Inside:   %r8  = digits still to do
/           %r9  = cy, the running carry (returned in %rax)
/           %r10 = r[i], %r11 = a[i+1], both loaded ahead of use
/ mulq overwrites %rdx:%rax, which is why len is moved to %r8.
/ Leaf routine: only %rax, %rdx, %rsi, %rdi, %r8 - %r11 are written.
/ A len <= 0 stores nothing and returns 0.
/
	ENTRY(big_mul_add_vec64)
	xorq	%r9, %r9		/ cy = 0; .L27 returns %r9 on every path
	movslq	%edx, %r8		/ r8 = len; len is an int, so only %edx is defined
	testq	%r8, %r8
	jle	.L27			/ if (len <= 0) return (0)

.L25:
	cmpq	$8, %r8			/ fewer than 8 digits left?
	jb	.L26			/ yes: finish them one at a time
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L27			/ nothing left: return cy
	jmp	.L25

/ Tail: between 1 and 7 digits remain, handled without prefetching so
/ that no digit beyond a[len - 1] or r[len - 1] is ever read.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	/ Seventh tail digit: it is always the last one, so fall through.
	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec64)
440
441
/ void
/ big_sqr_vec64(uint64_t *r, uint64_t *a, int len)
/
/ r = a * a, where a has len 64-bit digits and r receives 2 * len digits.
/
/ Phase 1 builds the off-diagonal products a[i] * a[j], i < j, in
/ r[1 .. 2*len-2] with big_mul_set_vec64 and big_mul_add_vec64.
/ Phase 2 walks r once, doubling those products and adding in the
/ squares a[row]**2, propagating the carry as it goes.
/
/ On entry: %rdi = r, %rsi = a, %edx = len.
/ len <= 0 stores nothing.  len == 1 is done directly, because the
/ phase 1 loop counts tlen = len - 1 down to zero and needs len >= 2.
/
/ Callee-saved %rbx, %rbp, %r12 - %r15 are saved and restored.
/ Six register saves plus three argument saves leave %rsp 16-byte
/ aligned at both call sites.

	ENTRY(big_sqr_vec64)
	movslq	%edx, %rdx		/ len is an int; only %edx is defined
	cmpq	$1, %rdx
	jg	.L30			/ len >= 2: general case
	jl	.L35			/ len <= 0: nothing to do
	movq	0(%rsi), %rax
	mulq	%rax			/ %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(a[0]**2)
	movq	%rdx, 8(%rdi)		/ r[1] = hi64(a[0]**2)
.L35:
	ret

.L30:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	/ Phase 1.  %r13 = tr, %r14 = ta, %r15 = tlen survive the calls.
	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec64
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec64
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8  == arg3 == len

	/ Phase 2.  %r9 = cy, %r11 = row, %r12 = col, %r15 = len - 1.
	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
.L33:
	cmpq	%r8, %r11		/ len - row
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	/ Reached with row == len - 1, so col + 1 == 2 * len - 1.
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec64)
556
557#endif	/* lint */
558