xref: /titanic_52/usr/src/common/bignum/amd64/bignum_amd64_asm.s (revision bde3d612a7c090234c60e6e4578821237a5db135)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/asm_linkage.h>
27
28#if defined(lint) || defined(__lint)
29
30#include <sys/types.h>
31
/*
 * Lint-only stand-in for the assembly routine of the same name.
 * It performs no work and always reports a zero carry.
 */
/* ARGSUSED */
uint64_t
big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	return (0);
}
36
/*
 * Lint-only stand-in for the assembly routine of the same name.
 * It performs no work and always reports a zero carry.
 */
/* ARGSUSED */
uint64_t
big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	return (0);
}
41
/*
 * Lint-only stand-in for the assembly routine of the same name.
 * It performs no work and leaves both vectors untouched.
 */
/* ARGSUSED */
void
big_sqr_vec(uint64_t *r, uint64_t *a, int len)
{
}
46
47#else	/* lint */
48
49/ ------------------------------------------------------------------------
50/
51/  Implementation of big_mul_set_vec which exploits
52/  the 64X64->128 bit  unsigned multiply instruction.
53/
54/  As defined in Sun's bignum library for pkcs11, bignums are
55/  composed of an array of 64-bit "digits" or "chunks" along with
56/  descriptive information.
57/
58/ ------------------------------------------------------------------------
59
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ In:   %rdi = r, %rsi = a, %edx = len, %rcx = digit
/ Out:  %rax = carry digit (high half of the last product plus carry);
/       0 when len <= 0
/ Uses: %rax, %rdx, %rsi, %rdi, %r8, %r9, %r11 and the flags
/
/ Register roles:
/   %r8  = number of digits still to be processed
/   %r9  = cy, the carry digit handed from one product to the next
/   %r11 = the next a[] digit, loaded ahead of the multiply that uses it
/
/ len is a C int, so only %edx is meaningful on entry.  It is sign-extended
/ into %r8 before use instead of trusting the upper half of %rdx.
/ cy is cleared before the length test because every exit goes through
/ .L17, which returns %r9.
/
	ENTRY(big_mul_set_vec)
	xorq	%r9, %r9		/ cy = 0 (also the len <= 0 result)
	movslq	%edx, %r8		/ r8 = (int64_t)len; %rdx is used by mul
	testq	%r8, %r8
	jle	.L17			/ if (len <= 0) return (0)

/ Main loop: eight digits per iteration while at least eight remain.
.L15:
	cmpq	$8, %r8			/ len - 8
	jb	.L16			/ fewer than 8 left: finish in the tail

	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ preload a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ preload a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ preload a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ preload a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ preload a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ preload a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ preload a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax		/ last digit of the group: no preload
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jnz	.L15			/ more digits: go around again
	jmp	.L17			/ len was a multiple of 8: done

/ Tail: between 1 and 7 digits remain here, handled one at a time.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	/ Seventh tail digit: at most 7 remained, so this is the last one
	/ and control simply falls through to the return.
	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)
217
218
219/ ------------------------------------------------------------------------
220/
221/  Implementation of big_mul_add_vec which exploits
222/  the 64X64->128 bit  unsigned multiply instruction.
223/
224/  As defined in Sun's bignum library for pkcs11, bignums are
225/  composed of an array of 64-bit "digits" or "chunks" along with
226/  descriptive information.
227/
228/ ------------------------------------------------------------------------
229
/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ In:   %rdi = r, %rsi = a, %edx = len, %rcx = digit
/ Out:  %rax = carry digit out of the most significant position;
/       0 when len <= 0
/ Uses: %rax, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11 and the flags
/
/ Register roles:
/   %r8  = number of digits still to be processed
/   %r9  = cy, the carry digit handed from one position to the next
/   %r10 = the current r[] digit, loaded ahead of the add that uses it
/   %r11 = the next a[] digit, loaded ahead of the multiply that uses it
/
/ Each step forms a[i] * digit + r[i] + cy.  That sum is at most
/ (2^64 - 1)^2 + 2 * (2^64 - 1) = 2^128 - 1, so it always fits in
/ %rdx:%rax and the two adcq $0 steps cannot overflow %rdx.
/
/ len is a C int, so only %edx is meaningful on entry.  It is sign-extended
/ into %r8 before use instead of trusting the upper half of %rdx.
/ cy is cleared before the length test because every exit goes through
/ .L27, which returns %r9.
/
	ENTRY(big_mul_add_vec)
	xorq	%r9, %r9		/ cy = 0 (also the len <= 0 result)
	movslq	%edx, %r8		/ r8 = (int64_t)len; %rdx is used by mul
	testq	%r8, %r8
	jle	.L27			/ if (len <= 0) return (0)

/ Main loop: eight digits per iteration while at least eight remain.
.L25:
	cmpq	$8, %r8			/ len - 8
	jb	.L26			/ fewer than 8 left: finish in the tail

	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ preload a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ preload r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ preload a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ preload r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ preload a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ preload r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ preload a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ preload r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ preload a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ preload r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ preload a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ preload r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ preload a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ preload r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax		/ last digit of the group: no preloads
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jnz	.L25			/ more digits: go around again
	jmp	.L27			/ len was a multiple of 8: done

/ Tail: between 1 and 7 digits remain here, handled one at a time.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	/ Seventh tail digit: at most 7 remained, so this is the last one
	/ and control simply falls through to the return.
	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)
432
433
/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/
/ r = a * a, where a has len digits.  For len >= 2 the digits r[0] through
/ r[2 * len - 1] are written.
/
/ In:   %rdi = r, %rsi = a, %edx = len
/ Out:  nothing in registers; the result is stored through r
/ Uses: the callee-saved registers %rbx, %rbp, %r12-%r15 are saved on
/       entry to the general path and restored before returning.
/
/ Method for len >= 2:
/   Phase 1 accumulates the cross products a[i] * a[j], i < j, into
/   r[1] and above, one row per call to big_mul_set_vec (first row) or
/   big_mul_add_vec (later rows).
/   Phase 2 walks the result once, doubling each digit from phase 1 and
/   adding the squares a[row] * a[row] at the even positions.
/
/ len is a C int, so only %edx is meaningful on entry; it is sign-extended
/ before any 64-bit use.  The general path needs len >= 2, so len <= 0
/ returns without touching memory and len == 1 is done as one multiply.
/
	ENTRY(big_sqr_vec)
	movslq	%edx, %rdx		/ rdx = (int64_t)len
	cmpq	$1, %rdx
	jg	.L30			/ len >= 2: general path
	jl	.L35			/ len <= 0: nothing to do

	movq	0(%rsi), %rax		/ len == 1:
	mulq	%rax			/ %rdx:%rax = a[0] * a[0]
	movq	%rax, 0(%rdi)		/ r[0] = lo64
	movq	%rdx, 8(%rdi)		/ r[1] = hi64
.L35:
	ret

.L30:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len (already widened)
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

/ Phase 1.  tr, ta and tlen live in %r13, %r14 and %r15 because they
/ must survive the calls below.
	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ Phase 2.  No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8  == arg3 == len

	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
.L33:
	cmpq	%r8, %r11		/ row - len
	jae	.L34			/ while (row < len)

	/ Even position: r[col] = (r[col] << 1) + a[row]**2 + cy
	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = %rbx:%rcx = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = %rcx:%r9 = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break

	/ Odd position: r[col+1] = (r[col+1] << 1) + cy
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec)
548
549#endif	/* lint */
550