xref: /freebsd/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl (revision 3b8f08459569bf0faa21473e5cec2491e95c9349)
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to powers table computed in BN_mod_exp_mont_consttime.
16# In addition, a subroutine that scatters elements of the powers table
17# is implemented, so that scattering/gathering can be tuned without
18# modifying bn_exp.c.
19
# Command-line handling and output plumbing.
#
# Usage: x86_64-mont5.pl [flavour] [output]
# If the first argument contains a dot it is taken to be the output file
# name and no flavour is assumed.  $win64 selects the Win64 code paths
# (extra xmm6/xmm7 saves and SEH tables) and is set for nasm/masm/mingw64
# flavours or when the output name ends in ".asm".
20$flavour = shift;
21$output  = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
# Locate the x86_64-xlate.pl translator next to this script or in
# ../../perlasm relative to it.
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be set up instead of
# silently producing no output.
31open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
32*STDOUT=*OUT;
33
# Register assignments for bn_mul_mont_gather5.  These are plain Perl
# globals that get interpolated into the assembly heredocs below; $bp is
# reassigned to %r12 further down once the incoming %rdx has been copied.
34# int bn_mul_mont_gather5(
35$rp="%rdi";	# BN_ULONG *rp,
36$ap="%rsi";	# const BN_ULONG *ap,
37$bp="%rdx";	# const BN_ULONG *bp,
38$np="%rcx";	# const BN_ULONG *np,
39$n0="%r8";	# const BN_ULONG *n0,
40$num="%r9";	# int num,
41		# int idx);	# 0 to 2^5-1, "index" in $bp holding
42				# pre-computed powers of a', interlaced
43				# in such manner that b[0] is $bp[idx],
44				# b[1] is [2^5+idx], etc.
# Working registers of the generic (non-4x) multiplication loops.
45$lo0="%r10";
46$hi0="%r11";
47$hi1="%r13";
48$i="%r14";
49$j="%r15";
50$m0="%rbx";
51$m1="%rbp";
52
# bn_mul_mont_gather5: Montgomery multiplication where each word of the
# second operand is gathered from the interlaced powers table.
#
# Entry dispatch: when num is a multiple of 4 and at least 8 the code
# jumps to .Lmul4x_enter (the 4x unrolled variant emitted later in this
# file); every other num takes the generic path at .Lmul_enter.
#
# Generic path outline:
#   - push the six callee-saved GPRs (plus xmm6/xmm7 on Win64);
#   - carve tp[num+2] out of the stack, aligned down to 1024 bytes, and
#     stash the pre-alloca stack pointer in tp[num+1];
#   - split idx into "slot within a cache line" and "cache line number",
#     and load four masks into xmm4-xmm7 of which exactly one is all-ones;
#   - every b[i] is then fetched by reading one candidate from each of
#     the four cache lines of a table row and AND/OR-ing with the masks;
#   - .L1st handles i=0, .Louter/.Linner handle the remaining rows;
#   - .Lsub computes tp-np into rp, then .Lcopy copies either tp or the
#     already-stored rp into rp and overwrites tp;
#   - returns 1 in %rax.
53$code=<<___;
54.text
55
56.globl	bn_mul_mont_gather5
57.type	bn_mul_mont_gather5,\@function,6
58.align	64
59bn_mul_mont_gather5:
60	test	\$3,${num}d
61	jnz	.Lmul_enter
62	cmp	\$8,${num}d
63	jb	.Lmul_enter
64	jmp	.Lmul4x_enter
65
66.align	16
67.Lmul_enter:
68	mov	${num}d,${num}d
69	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
70	push	%rbx
71	push	%rbp
72	push	%r12
73	push	%r13
74	push	%r14
75	push	%r15
76___
77$code.=<<___ if ($win64);
78	lea	-0x28(%rsp),%rsp
79	movaps	%xmm6,(%rsp)
80	movaps	%xmm7,0x10(%rsp)
81.Lmul_alloca:
82___
83$code.=<<___;
84	mov	%rsp,%rax
85	lea	2($num),%r11
86	neg	%r11
87	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
88	and	\$-1024,%rsp		# minimize TLB usage
89
90	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
91.Lmul_body:
92	mov	$bp,%r12		# reassign $bp
93___
94		$bp="%r12";
95		$STRIDE=2**5*8;		# 5 is "window size"
96		$N=$STRIDE/4;		# should match cache line size
97$code.=<<___;
98	mov	%r10,%r11		# idx (7th argument)
99	shr	\$`log($N/8)/log(2)`,%r10	# which cache line of the row
100	and	\$`$N/8-1`,%r11		# slot within the cache line
101	not	%r10
102	lea	.Lmagic_masks(%rip),%rax
103	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
104	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
105	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
106	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
107	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
108	movq	24(%rax,%r10,8),%xmm7
109
110	movq	`0*$STRIDE/4-96`($bp),%xmm0
111	movq	`1*$STRIDE/4-96`($bp),%xmm1
112	pand	%xmm4,%xmm0
113	movq	`2*$STRIDE/4-96`($bp),%xmm2
114	pand	%xmm5,%xmm1
115	movq	`3*$STRIDE/4-96`($bp),%xmm3
116	pand	%xmm6,%xmm2
117	por	%xmm1,%xmm0
118	pand	%xmm7,%xmm3
119	por	%xmm2,%xmm0
120	lea	$STRIDE($bp),$bp
121	por	%xmm3,%xmm0
122
123	movq	%xmm0,$m0		# m0=bp[0]
124
125	mov	($n0),$n0		# pull n0[0] value
126	mov	($ap),%rax
127
128	xor	$i,$i			# i=0
129	xor	$j,$j			# j=0
130
131	movq	`0*$STRIDE/4-96`($bp),%xmm0
132	movq	`1*$STRIDE/4-96`($bp),%xmm1
133	pand	%xmm4,%xmm0
134	movq	`2*$STRIDE/4-96`($bp),%xmm2
135	pand	%xmm5,%xmm1
136
137	mov	$n0,$m1
138	mulq	$m0			# ap[0]*bp[0]
139	mov	%rax,$lo0
140	mov	($np),%rax
141
142	movq	`3*$STRIDE/4-96`($bp),%xmm3
143	pand	%xmm6,%xmm2
144	por	%xmm1,%xmm0
145	pand	%xmm7,%xmm3
146
147	imulq	$lo0,$m1		# "tp[0]"*n0
148	mov	%rdx,$hi0
149
150	por	%xmm2,%xmm0
151	lea	$STRIDE($bp),$bp
152	por	%xmm3,%xmm0
153
154	mulq	$m1			# np[0]*m1
155	add	%rax,$lo0		# discarded
156	mov	8($ap),%rax
157	adc	\$0,%rdx
158	mov	%rdx,$hi1
159
160	lea	1($j),$j		# j++
161	jmp	.L1st_enter
162
163.align	16
164.L1st:
165	add	%rax,$hi1
166	mov	($ap,$j,8),%rax
167	adc	\$0,%rdx
168	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
169	mov	$lo0,$hi0
170	adc	\$0,%rdx
171	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
172	mov	%rdx,$hi1
173
174.L1st_enter:
175	mulq	$m0			# ap[j]*bp[0]
176	add	%rax,$hi0
177	mov	($np,$j,8),%rax
178	adc	\$0,%rdx
179	lea	1($j),$j		# j++
180	mov	%rdx,$lo0
181
182	mulq	$m1			# np[j]*m1
183	cmp	$num,$j
184	jne	.L1st
185
186	movq	%xmm0,$m0		# bp[1]
187
188	add	%rax,$hi1
189	mov	($ap),%rax		# ap[0]
190	adc	\$0,%rdx
191	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
192	adc	\$0,%rdx
193	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
194	mov	%rdx,$hi1
195	mov	$lo0,$hi0
196
197	xor	%rdx,%rdx
198	add	$hi0,$hi1
199	adc	\$0,%rdx
200	mov	$hi1,-8(%rsp,$num,8)
201	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
202
203	lea	1($i),$i		# i++
204	jmp	.Louter
205.align	16
206.Louter:
207	xor	$j,$j			# j=0
208	mov	$n0,$m1
209	mov	(%rsp),$lo0
210
211	movq	`0*$STRIDE/4-96`($bp),%xmm0
212	movq	`1*$STRIDE/4-96`($bp),%xmm1
213	pand	%xmm4,%xmm0
214	movq	`2*$STRIDE/4-96`($bp),%xmm2
215	pand	%xmm5,%xmm1
216
217	mulq	$m0			# ap[0]*bp[i]
218	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
219	mov	($np),%rax
220	adc	\$0,%rdx
221
222	movq	`3*$STRIDE/4-96`($bp),%xmm3
223	pand	%xmm6,%xmm2
224	por	%xmm1,%xmm0
225	pand	%xmm7,%xmm3
226
227	imulq	$lo0,$m1		# tp[0]*n0
228	mov	%rdx,$hi0
229
230	por	%xmm2,%xmm0
231	lea	$STRIDE($bp),$bp
232	por	%xmm3,%xmm0
233
234	mulq	$m1			# np[0]*m1
235	add	%rax,$lo0		# discarded
236	mov	8($ap),%rax
237	adc	\$0,%rdx
238	mov	8(%rsp),$lo0		# tp[1]
239	mov	%rdx,$hi1
240
241	lea	1($j),$j		# j++
242	jmp	.Linner_enter
243
244.align	16
245.Linner:
246	add	%rax,$hi1
247	mov	($ap,$j,8),%rax
248	adc	\$0,%rdx
249	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
250	mov	(%rsp,$j,8),$lo0
251	adc	\$0,%rdx
252	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
253	mov	%rdx,$hi1
254
255.Linner_enter:
256	mulq	$m0			# ap[j]*bp[i]
257	add	%rax,$hi0
258	mov	($np,$j,8),%rax
259	adc	\$0,%rdx
260	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
261	mov	%rdx,$hi0
262	adc	\$0,$hi0
263	lea	1($j),$j		# j++
264
265	mulq	$m1			# np[j]*m1
266	cmp	$num,$j
267	jne	.Linner
268
269	movq	%xmm0,$m0		# bp[i+1]
270
271	add	%rax,$hi1
272	mov	($ap),%rax		# ap[0]
273	adc	\$0,%rdx
274	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
275	mov	(%rsp,$j,8),$lo0
276	adc	\$0,%rdx
277	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
278	mov	%rdx,$hi1
279
280	xor	%rdx,%rdx
281	add	$hi0,$hi1
282	adc	\$0,%rdx
283	add	$lo0,$hi1		# pull upmost overflow bit
284	adc	\$0,%rdx
285	mov	$hi1,-8(%rsp,$num,8)
286	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
287
288	lea	1($i),$i		# i++
289	cmp	$num,$i
290	jl	.Louter
291
292	xor	$i,$i			# i=0 and clear CF!
293	mov	(%rsp),%rax		# tp[0]
294	lea	(%rsp),$ap		# borrow ap for tp
295	mov	$num,$j			# j=num
296	jmp	.Lsub
297.align	16
298.Lsub:	sbb	($np,$i,8),%rax
299	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
300	mov	8($ap,$i,8),%rax	# tp[i+1]
301	lea	1($i),$i		# i++
302	dec	$j			# doesn't affect CF!
303	jnz	.Lsub
304
305	sbb	\$0,%rax		# handle upmost overflow bit
306	xor	$i,$i
307	and	%rax,$ap
308	not	%rax
309	mov	$rp,$np
310	and	%rax,$np
311	mov	$num,$j			# j=num
312	or	$np,$ap			# ap=borrow?tp:rp
313.align	16
314.Lcopy:					# copy or in-place refresh
315	mov	($ap,$i,8),%rax
316	mov	$i,(%rsp,$i,8)		# zap temporary vector
317	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
318	lea	1($i),$i
319	sub	\$1,$j
320	jnz	.Lcopy
321
322	mov	8(%rsp,$num,8),%rsi	# restore %rsp
323	mov	\$1,%rax
324___
325$code.=<<___ if ($win64);
326	movaps	(%rsi),%xmm6
327	movaps	0x10(%rsi),%xmm7
328	lea	0x28(%rsi),%rsi
329___
330$code.=<<___;
331	mov	(%rsi),%r15
332	mov	8(%rsi),%r14
333	mov	16(%rsi),%r13
334	mov	24(%rsi),%r12
335	mov	32(%rsi),%rbp
336	mov	40(%rsi),%rbx
337	lea	48(%rsi),%rsp
338.Lmul_epilogue:
339	ret
340.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
341___
# bn_mul4x_mont_gather5: 4x unrolled variant of the routine above.  It is
# not exported (.type only, no .globl) and is reached through the
# .Lmul4x_enter label from bn_mul_mont_gather5, i.e. with the same
# argument registers still live.
#
# Differences from the generic path visible below:
#   - tp is num+4 words; tp[num+1] keeps the saved stack pointer and
#     tp[num+2] keeps rp, because %rdi is reused as the second N
#     accumulator inside the loops;
#   - j advances by 4 per iteration of .L1st4x/.Linner4x;
#   - in .Linner4x each accumulator store is issued one step later than
#     the computation that produced it;
#   - the final subtraction (.Lsub4x) handles four words per iteration
#     and the copy/zap (.Lcopy4x) moves 32 bytes per iteration via SSE2.
342{{{
343my @A=("%r10","%r11");
344my @N=("%r13","%rdi");
345$code.=<<___;
346.type	bn_mul4x_mont_gather5,\@function,6
347.align	16
348bn_mul4x_mont_gather5:
349.Lmul4x_enter:
350	mov	${num}d,${num}d
351	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
352	push	%rbx
353	push	%rbp
354	push	%r12
355	push	%r13
356	push	%r14
357	push	%r15
358___
359$code.=<<___ if ($win64);
360	lea	-0x28(%rsp),%rsp
361	movaps	%xmm6,(%rsp)
362	movaps	%xmm7,0x10(%rsp)
363.Lmul4x_alloca:
364___
365$code.=<<___;
366	mov	%rsp,%rax
367	lea	4($num),%r11
368	neg	%r11
369	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
370	and	\$-1024,%rsp		# minimize TLB usage
371
372	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
373.Lmul4x_body:
374	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
375	mov	%rdx,%r12		# reassign $bp
376___
377		$bp="%r12";
378		$STRIDE=2**5*8;		# 5 is "window size"
379		$N=$STRIDE/4;		# should match cache line size
380$code.=<<___;
381	mov	%r10,%r11		# idx (7th argument)
382	shr	\$`log($N/8)/log(2)`,%r10	# which cache line of the row
383	and	\$`$N/8-1`,%r11		# slot within the cache line
384	not	%r10
385	lea	.Lmagic_masks(%rip),%rax
386	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
387	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
388	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
389	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
390	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
391	movq	24(%rax,%r10,8),%xmm7
392
393	movq	`0*$STRIDE/4-96`($bp),%xmm0
394	movq	`1*$STRIDE/4-96`($bp),%xmm1
395	pand	%xmm4,%xmm0
396	movq	`2*$STRIDE/4-96`($bp),%xmm2
397	pand	%xmm5,%xmm1
398	movq	`3*$STRIDE/4-96`($bp),%xmm3
399	pand	%xmm6,%xmm2
400	por	%xmm1,%xmm0
401	pand	%xmm7,%xmm3
402	por	%xmm2,%xmm0
403	lea	$STRIDE($bp),$bp
404	por	%xmm3,%xmm0
405
406	movq	%xmm0,$m0		# m0=bp[0]
407	mov	($n0),$n0		# pull n0[0] value
408	mov	($ap),%rax
409
410	xor	$i,$i			# i=0
411	xor	$j,$j			# j=0
412
413	movq	`0*$STRIDE/4-96`($bp),%xmm0
414	movq	`1*$STRIDE/4-96`($bp),%xmm1
415	pand	%xmm4,%xmm0
416	movq	`2*$STRIDE/4-96`($bp),%xmm2
417	pand	%xmm5,%xmm1
418
419	mov	$n0,$m1
420	mulq	$m0			# ap[0]*bp[0]
421	mov	%rax,$A[0]
422	mov	($np),%rax
423
424	movq	`3*$STRIDE/4-96`($bp),%xmm3
425	pand	%xmm6,%xmm2
426	por	%xmm1,%xmm0
427	pand	%xmm7,%xmm3
428
429	imulq	$A[0],$m1		# "tp[0]"*n0
430	mov	%rdx,$A[1]
431
432	por	%xmm2,%xmm0
433	lea	$STRIDE($bp),$bp
434	por	%xmm3,%xmm0
435
436	mulq	$m1			# np[0]*m1
437	add	%rax,$A[0]		# discarded
438	mov	8($ap),%rax
439	adc	\$0,%rdx
440	mov	%rdx,$N[1]
441
442	mulq	$m0
443	add	%rax,$A[1]
444	mov	8($np),%rax
445	adc	\$0,%rdx
446	mov	%rdx,$A[0]
447
448	mulq	$m1
449	add	%rax,$N[1]
450	mov	16($ap),%rax
451	adc	\$0,%rdx
452	add	$A[1],$N[1]
453	lea	4($j),$j		# j+=4
454	adc	\$0,%rdx
455	mov	$N[1],(%rsp)
456	mov	%rdx,$N[0]
457	jmp	.L1st4x
458.align	16
459.L1st4x:
460	mulq	$m0			# ap[j]*bp[0]
461	add	%rax,$A[0]
462	mov	-16($np,$j,8),%rax
463	adc	\$0,%rdx
464	mov	%rdx,$A[1]
465
466	mulq	$m1			# np[j]*m1
467	add	%rax,$N[0]
468	mov	-8($ap,$j,8),%rax
469	adc	\$0,%rdx
470	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
471	adc	\$0,%rdx
472	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
473	mov	%rdx,$N[1]
474
475	mulq	$m0			# ap[j]*bp[0]
476	add	%rax,$A[1]
477	mov	-8($np,$j,8),%rax
478	adc	\$0,%rdx
479	mov	%rdx,$A[0]
480
481	mulq	$m1			# np[j]*m1
482	add	%rax,$N[1]
483	mov	($ap,$j,8),%rax
484	adc	\$0,%rdx
485	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
486	adc	\$0,%rdx
487	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
488	mov	%rdx,$N[0]
489
490	mulq	$m0			# ap[j]*bp[0]
491	add	%rax,$A[0]
492	mov	($np,$j,8),%rax
493	adc	\$0,%rdx
494	mov	%rdx,$A[1]
495
496	mulq	$m1			# np[j]*m1
497	add	%rax,$N[0]
498	mov	8($ap,$j,8),%rax
499	adc	\$0,%rdx
500	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
501	adc	\$0,%rdx
502	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
503	mov	%rdx,$N[1]
504
505	mulq	$m0			# ap[j]*bp[0]
506	add	%rax,$A[1]
507	mov	8($np,$j,8),%rax
508	adc	\$0,%rdx
509	lea	4($j),$j		# j+=4
510	mov	%rdx,$A[0]
511
512	mulq	$m1			# np[j]*m1
513	add	%rax,$N[1]
514	mov	-16($ap,$j,8),%rax
515	adc	\$0,%rdx
516	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
517	adc	\$0,%rdx
518	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
519	mov	%rdx,$N[0]
520	cmp	$num,$j
521	jl	.L1st4x
522
523	mulq	$m0			# ap[j]*bp[0]
524	add	%rax,$A[0]
525	mov	-16($np,$j,8),%rax
526	adc	\$0,%rdx
527	mov	%rdx,$A[1]
528
529	mulq	$m1			# np[j]*m1
530	add	%rax,$N[0]
531	mov	-8($ap,$j,8),%rax
532	adc	\$0,%rdx
533	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
534	adc	\$0,%rdx
535	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
536	mov	%rdx,$N[1]
537
538	mulq	$m0			# ap[j]*bp[0]
539	add	%rax,$A[1]
540	mov	-8($np,$j,8),%rax
541	adc	\$0,%rdx
542	mov	%rdx,$A[0]
543
544	mulq	$m1			# np[j]*m1
545	add	%rax,$N[1]
546	mov	($ap),%rax		# ap[0]
547	adc	\$0,%rdx
548	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
549	adc	\$0,%rdx
550	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
551	mov	%rdx,$N[0]
552
553	movq	%xmm0,$m0		# bp[1]
554
555	xor	$N[1],$N[1]
556	add	$A[0],$N[0]
557	adc	\$0,$N[1]
558	mov	$N[0],-8(%rsp,$j,8)
559	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
560
561	lea	1($i),$i		# i++
562.align	4
563.Louter4x:
564	xor	$j,$j			# j=0
565	movq	`0*$STRIDE/4-96`($bp),%xmm0
566	movq	`1*$STRIDE/4-96`($bp),%xmm1
567	pand	%xmm4,%xmm0
568	movq	`2*$STRIDE/4-96`($bp),%xmm2
569	pand	%xmm5,%xmm1
570
571	mov	(%rsp),$A[0]
572	mov	$n0,$m1
573	mulq	$m0			# ap[0]*bp[i]
574	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
575	mov	($np),%rax
576	adc	\$0,%rdx
577
578	movq	`3*$STRIDE/4-96`($bp),%xmm3
579	pand	%xmm6,%xmm2
580	por	%xmm1,%xmm0
581	pand	%xmm7,%xmm3
582
583	imulq	$A[0],$m1		# tp[0]*n0
584	mov	%rdx,$A[1]
585
586	por	%xmm2,%xmm0
587	lea	$STRIDE($bp),$bp
588	por	%xmm3,%xmm0
589
590	mulq	$m1			# np[0]*m1
591	add	%rax,$A[0]		# "$N[0]", discarded
592	mov	8($ap),%rax
593	adc	\$0,%rdx
594	mov	%rdx,$N[1]
595
596	mulq	$m0			# ap[j]*bp[i]
597	add	%rax,$A[1]
598	mov	8($np),%rax
599	adc	\$0,%rdx
600	add	8(%rsp),$A[1]		# +tp[1]
601	adc	\$0,%rdx
602	mov	%rdx,$A[0]
603
604	mulq	$m1			# np[j]*m1
605	add	%rax,$N[1]
606	mov	16($ap),%rax
607	adc	\$0,%rdx
608	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
609	lea	4($j),$j		# j+=4
610	adc	\$0,%rdx
611	mov	%rdx,$N[0]
612	jmp	.Linner4x
613.align	16
614.Linner4x:
615	mulq	$m0			# ap[j]*bp[i]
616	add	%rax,$A[0]
617	mov	-16($np,$j,8),%rax
618	adc	\$0,%rdx
619	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
620	adc	\$0,%rdx
621	mov	%rdx,$A[1]
622
623	mulq	$m1			# np[j]*m1
624	add	%rax,$N[0]
625	mov	-8($ap,$j,8),%rax
626	adc	\$0,%rdx
627	add	$A[0],$N[0]
628	adc	\$0,%rdx
629	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
630	mov	%rdx,$N[1]
631
632	mulq	$m0			# ap[j]*bp[i]
633	add	%rax,$A[1]
634	mov	-8($np,$j,8),%rax
635	adc	\$0,%rdx
636	add	-8(%rsp,$j,8),$A[1]
637	adc	\$0,%rdx
638	mov	%rdx,$A[0]
639
640	mulq	$m1			# np[j]*m1
641	add	%rax,$N[1]
642	mov	($ap,$j,8),%rax
643	adc	\$0,%rdx
644	add	$A[1],$N[1]
645	adc	\$0,%rdx
646	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
647	mov	%rdx,$N[0]
648
649	mulq	$m0			# ap[j]*bp[i]
650	add	%rax,$A[0]
651	mov	($np,$j,8),%rax
652	adc	\$0,%rdx
653	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
654	adc	\$0,%rdx
655	mov	%rdx,$A[1]
656
657	mulq	$m1			# np[j]*m1
658	add	%rax,$N[0]
659	mov	8($ap,$j,8),%rax
660	adc	\$0,%rdx
661	add	$A[0],$N[0]
662	adc	\$0,%rdx
663	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
664	mov	%rdx,$N[1]
665
666	mulq	$m0			# ap[j]*bp[i]
667	add	%rax,$A[1]
668	mov	8($np,$j,8),%rax
669	adc	\$0,%rdx
670	add	8(%rsp,$j,8),$A[1]
671	adc	\$0,%rdx
672	lea	4($j),$j		# j+=4
673	mov	%rdx,$A[0]
674
675	mulq	$m1			# np[j]*m1
676	add	%rax,$N[1]
677	mov	-16($ap,$j,8),%rax
678	adc	\$0,%rdx
679	add	$A[1],$N[1]
680	adc	\$0,%rdx
681	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
682	mov	%rdx,$N[0]
683	cmp	$num,$j
684	jl	.Linner4x
685
686	mulq	$m0			# ap[j]*bp[i]
687	add	%rax,$A[0]
688	mov	-16($np,$j,8),%rax
689	adc	\$0,%rdx
690	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
691	adc	\$0,%rdx
692	mov	%rdx,$A[1]
693
694	mulq	$m1			# np[j]*m1
695	add	%rax,$N[0]
696	mov	-8($ap,$j,8),%rax
697	adc	\$0,%rdx
698	add	$A[0],$N[0]
699	adc	\$0,%rdx
700	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
701	mov	%rdx,$N[1]
702
703	mulq	$m0			# ap[j]*bp[i]
704	add	%rax,$A[1]
705	mov	-8($np,$j,8),%rax
706	adc	\$0,%rdx
707	add	-8(%rsp,$j,8),$A[1]
708	adc	\$0,%rdx
709	lea	1($i),$i		# i++
710	mov	%rdx,$A[0]
711
712	mulq	$m1			# np[j]*m1
713	add	%rax,$N[1]
714	mov	($ap),%rax		# ap[0]
715	adc	\$0,%rdx
716	add	$A[1],$N[1]
717	adc	\$0,%rdx
718	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
719	mov	%rdx,$N[0]
720
721	movq	%xmm0,$m0		# bp[i+1]
722	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
723
724	xor	$N[1],$N[1]
725	add	$A[0],$N[0]
726	adc	\$0,$N[1]
727	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
728	adc	\$0,$N[1]
729	mov	$N[0],-8(%rsp,$j,8)
730	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
731
732	cmp	$num,$i
733	jl	.Louter4x
734___
735{
736my @ri=("%rax","%rdx",$m0,$m1);
737$code.=<<___;
738	mov	16(%rsp,$num,8),$rp	# restore $rp
739	mov	0(%rsp),@ri[0]		# tp[0]
740	pxor	%xmm0,%xmm0
741	mov	8(%rsp),@ri[1]		# tp[1]
742	shr	\$2,$num		# num/=4
743	lea	(%rsp),$ap		# borrow ap for tp
744	xor	$i,$i			# i=0 and clear CF!
745
746	sub	0($np),@ri[0]
747	mov	16($ap),@ri[2]		# tp[2]
748	mov	24($ap),@ri[3]		# tp[3]
749	sbb	8($np),@ri[1]
750	lea	-1($num),$j		# j=num/4-1
751	jmp	.Lsub4x
752.align	16
753.Lsub4x:
754	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
755	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
756	sbb	16($np,$i,8),@ri[2]
757	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
758	mov	40($ap,$i,8),@ri[1]
759	sbb	24($np,$i,8),@ri[3]
760	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
761	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
762	sbb	32($np,$i,8),@ri[0]
763	mov	48($ap,$i,8),@ri[2]
764	mov	56($ap,$i,8),@ri[3]
765	sbb	40($np,$i,8),@ri[1]
766	lea	4($i),$i		# i+=4
767	dec	$j			# doesn't affect CF!
768	jnz	.Lsub4x
769
770	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
771	mov	32($ap,$i,8),@ri[0]	# load overflow bit
772	sbb	16($np,$i,8),@ri[2]
773	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
774	sbb	24($np,$i,8),@ri[3]
775	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
776
777	sbb	\$0,@ri[0]		# handle upmost overflow bit
778	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
779	xor	$i,$i			# i=0
780	and	@ri[0],$ap
781	not	@ri[0]
782	mov	$rp,$np
783	and	@ri[0],$np
784	lea	-1($num),$j
785	or	$np,$ap			# ap=borrow?tp:rp
786
787	movdqu	($ap),%xmm1
788	movdqa	%xmm0,(%rsp)
789	movdqu	%xmm1,($rp)
790	jmp	.Lcopy4x
791.align	16
792.Lcopy4x:					# copy or in-place refresh
793	movdqu	16($ap,$i),%xmm2
794	movdqu	32($ap,$i),%xmm1
795	movdqa	%xmm0,16(%rsp,$i)
796	movdqu	%xmm2,16($rp,$i)
797	movdqa	%xmm0,32(%rsp,$i)
798	movdqu	%xmm1,32($rp,$i)
799	lea	32($i),$i
800	dec	$j
801	jnz	.Lcopy4x
802
803	shl	\$2,$num
804	movdqu	16($ap,$i),%xmm2
805	movdqa	%xmm0,16(%rsp,$i)
806	movdqu	%xmm2,16($rp,$i)
807___
808}
809$code.=<<___;
810	mov	8(%rsp,$num,8),%rsi	# restore %rsp
811	mov	\$1,%rax
812___
813$code.=<<___ if ($win64);
814	movaps	(%rsi),%xmm6
815	movaps	0x10(%rsi),%xmm7
816	lea	0x28(%rsi),%rsi
817___
818$code.=<<___;
819	mov	(%rsi),%r15
820	mov	8(%rsi),%r14
821	mov	16(%rsi),%r13
822	mov	24(%rsi),%r12
823	mov	32(%rsi),%rbp
824	mov	40(%rsi),%rbx
825	lea	48(%rsi),%rsp
826.Lmul4x_epilogue:
827	ret
828.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
829___
830}}}
831
# bn_scatter5 / bn_gather5: helpers for writing to and reading from the
# interlaced powers table.  Both are declared abi-omnipotent, so the
# argument registers are chosen here according to $win64.
#
# bn_scatter5(inp, num, tbl, idx)
#   Copies num 64-bit words from inp into tbl, starting at word idx and
#   advancing 32*8 bytes per word.  Returns at once when num is 0.
#
# bn_gather5(out, num, tbl, idx)
#   Reads num 64-bit words back.  Each word is fetched by loading one
#   candidate from each of four locations STRIDE/4 bytes apart and
#   combining them under the xmm4-xmm7 masks, the same way as in the
#   multiplication routines.  $out aliases $inp (first argument).
#   On Win64 xmm6/xmm7 are saved and restored around the loop.
#   NOTE(review): unlike bn_scatter5 there is no num==0 guard here; the
#   loop counts down with sub/jnz, so callers presumably always pass
#   num>0 -- confirm against the callers.
832{
833my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
834				("%rdi","%rsi","%rdx","%rcx"); # Unix order
835my $out=$inp;
836my $STRIDE=2**5*8;
837my $N=$STRIDE/4;
838
839$code.=<<___;
840.globl	bn_scatter5
841.type	bn_scatter5,\@abi-omnipotent
842.align	16
843bn_scatter5:
844	cmp	\$0, $num
845	jz	.Lscatter_epilogue
846	lea	($tbl,$idx,8),$tbl
847.Lscatter:
848	mov	($inp),%rax
849	lea	8($inp),$inp
850	mov	%rax,($tbl)
851	lea	32*8($tbl),$tbl
852	sub	\$1,$num
853	jnz	.Lscatter
854.Lscatter_epilogue:
855	ret
856.size	bn_scatter5,.-bn_scatter5
857
858.globl	bn_gather5
859.type	bn_gather5,\@abi-omnipotent
860.align	16
861bn_gather5:
862___
863$code.=<<___ if ($win64);
864.LSEH_begin_bn_gather5:
865	# I can't trust assembler to use specific encoding:-(
866	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
867	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
868	.byte	0x0f,0x29,0x7c,0x24,0x10	#movaps	%xmm7,0x10(%rsp)
869___
870$code.=<<___;
871	mov	$idx,%r11
872	shr	\$`log($N/8)/log(2)`,$idx
873	and	\$`$N/8-1`,%r11
874	not	$idx
875	lea	.Lmagic_masks(%rip),%rax
876	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
877	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
878	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
879	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
880	movq	16(%rax,$idx,8),%xmm6	# denoted by the idx argument
881	movq	24(%rax,$idx,8),%xmm7
882	jmp	.Lgather
883.align	16
884.Lgather:
885	movq	`0*$STRIDE/4-96`($tbl),%xmm0
886	movq	`1*$STRIDE/4-96`($tbl),%xmm1
887	pand	%xmm4,%xmm0
888	movq	`2*$STRIDE/4-96`($tbl),%xmm2
889	pand	%xmm5,%xmm1
890	movq	`3*$STRIDE/4-96`($tbl),%xmm3
891	pand	%xmm6,%xmm2
892	por	%xmm1,%xmm0
893	pand	%xmm7,%xmm3
894	por	%xmm2,%xmm0
895	lea	$STRIDE($tbl),$tbl
896	por	%xmm3,%xmm0
897
898	movq	%xmm0,($out)		# store gathered word
899	lea	8($out),$out
900	sub	\$1,$num
901	jnz	.Lgather
902___
903$code.=<<___ if ($win64);
904	movaps	(%rsp),%xmm6
905	movaps	0x10(%rsp),%xmm7
906	lea	0x28(%rsp),%rsp
907___
908$code.=<<___;
909	ret
910.LSEH_end_bn_gather5:
911.size	bn_gather5,.-bn_gather5
912___
913}
# Mask table used by all gather sequences above.  It is eight 64-bit
# words of which only the fourth is all-ones.  The gather code loads four
# consecutive words starting at an index between 0 and 3, so exactly one
# of xmm4..xmm7 receives the all-ones word and the other three are zero.
# The trailing .asciz is an identification string.
914$code.=<<___;
915.align	64
916.Lmagic_masks:
917	.long	0,0, 0,0, 0,0, -1,-1
918	.long	0,0, 0,0, 0,0,  0,0
919.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
920___
921
# Win64 structured-exception support; emitted only when $win64 is set.
#
# mul_handler is the language-specific handler shared by both
# multiplication routines.  Its HandlerData[] holds three labels, in
# order: end of prologue (.L*_alloca), end of stack allocation
# (.L*_body) and epilogue (.L*_epilogue).  Depending on where
# context->Rip falls relative to those labels the handler:
#   - before the first label: uses context->Rax unchanged;
#   - before the second label: steps 40+48 bytes past context->Rax
#     (the xmm save area plus the six pushed registers);
#   - at or past the epilogue label: uses context->Rsp unchanged;
#   - otherwise (function body): fetches the saved stack pointer from
#     tp[num+1] and restores rbx, rbp, r12-r15, xmm6 and xmm7 into the
#     context from the save area.
# It then fixes up Rsp/Rsi/Rdi in the context, copies the context to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
# NOTE(review): the first two cases rely on context->Rax already holding
# the stack pointer value at the faulting instruction -- confirm.
#
# The .pdata/.xdata sections that follow register the handler for the
# two multiplication routines and supply raw unwind codes for bn_gather5.
922# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
923#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
924if ($win64) {
925$rec="%rcx";
926$frame="%rdx";
927$context="%r8";
928$disp="%r9";
929
930$code.=<<___;
931.extern	__imp_RtlVirtualUnwind
932.type	mul_handler,\@abi-omnipotent
933.align	16
934mul_handler:
935	push	%rsi
936	push	%rdi
937	push	%rbx
938	push	%rbp
939	push	%r12
940	push	%r13
941	push	%r14
942	push	%r15
943	pushfq
944	sub	\$64,%rsp
945
946	mov	120($context),%rax	# pull context->Rax
947	mov	248($context),%rbx	# pull context->Rip
948
949	mov	8($disp),%rsi		# disp->ImageBase
950	mov	56($disp),%r11		# disp->HandlerData
951
952	mov	0(%r11),%r10d		# HandlerData[0]
953	lea	(%rsi,%r10),%r10	# end of prologue label
954	cmp	%r10,%rbx		# context->Rip<end of prologue label
955	jb	.Lcommon_seh_tail
956
957	lea	`40+48`(%rax),%rax
958
959	mov	4(%r11),%r10d		# HandlerData[1]
960	lea	(%rsi,%r10),%r10	# end of alloca label
961	cmp	%r10,%rbx		# context->Rip<end of alloca label
962	jb	.Lcommon_seh_tail
963
964	mov	152($context),%rax	# pull context->Rsp
965
966	mov	8(%r11),%r10d		# HandlerData[2]
967	lea	(%rsi,%r10),%r10	# epilogue label
968	cmp	%r10,%rbx		# context->Rip>=epilogue label
969	jae	.Lcommon_seh_tail
970
971	mov	192($context),%r10	# pull $num
972	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
973
974	movaps	(%rax),%xmm0
975	movaps	16(%rax),%xmm1
976	lea	`40+48`(%rax),%rax
977
978	mov	-8(%rax),%rbx
979	mov	-16(%rax),%rbp
980	mov	-24(%rax),%r12
981	mov	-32(%rax),%r13
982	mov	-40(%rax),%r14
983	mov	-48(%rax),%r15
984	mov	%rbx,144($context)	# restore context->Rbx
985	mov	%rbp,160($context)	# restore context->Rbp
986	mov	%r12,216($context)	# restore context->R12
987	mov	%r13,224($context)	# restore context->R13
988	mov	%r14,232($context)	# restore context->R14
989	mov	%r15,240($context)	# restore context->R15
990	movups	%xmm0,512($context)	# restore context->Xmm6
991	movups	%xmm1,528($context)	# restore context->Xmm7
992
993.Lcommon_seh_tail:
994	mov	8(%rax),%rdi
995	mov	16(%rax),%rsi
996	mov	%rax,152($context)	# restore context->Rsp
997	mov	%rsi,168($context)	# restore context->Rsi
998	mov	%rdi,176($context)	# restore context->Rdi
999
1000	mov	40($disp),%rdi		# disp->ContextRecord
1001	mov	$context,%rsi		# context
1002	mov	\$154,%ecx		# sizeof(CONTEXT)
1003	.long	0xa548f3fc		# cld; rep movsq
1004
1005	mov	$disp,%rsi
1006	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1007	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1008	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1009	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1010	mov	40(%rsi),%r10		# disp->ContextRecord
1011	lea	56(%rsi),%r11		# &disp->HandlerData
1012	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1013	mov	%r10,32(%rsp)		# arg5
1014	mov	%r11,40(%rsp)		# arg6
1015	mov	%r12,48(%rsp)		# arg7
1016	mov	%rcx,56(%rsp)		# arg8, (NULL)
1017	call	*__imp_RtlVirtualUnwind(%rip)
1018
1019	mov	\$1,%eax		# ExceptionContinueSearch
1020	add	\$64,%rsp
1021	popfq
1022	pop	%r15
1023	pop	%r14
1024	pop	%r13
1025	pop	%r12
1026	pop	%rbp
1027	pop	%rbx
1028	pop	%rdi
1029	pop	%rsi
1030	ret
1031.size	mul_handler,.-mul_handler
1032
1033.section	.pdata
1034.align	4
1035	.rva	.LSEH_begin_bn_mul_mont_gather5
1036	.rva	.LSEH_end_bn_mul_mont_gather5
1037	.rva	.LSEH_info_bn_mul_mont_gather5
1038
1039	.rva	.LSEH_begin_bn_mul4x_mont_gather5
1040	.rva	.LSEH_end_bn_mul4x_mont_gather5
1041	.rva	.LSEH_info_bn_mul4x_mont_gather5
1042
1043	.rva	.LSEH_begin_bn_gather5
1044	.rva	.LSEH_end_bn_gather5
1045	.rva	.LSEH_info_bn_gather5
1046
1047.section	.xdata
1048.align	8
1049.LSEH_info_bn_mul_mont_gather5:
1050	.byte	9,0,0,0
1051	.rva	mul_handler
1052	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
1053.align	8
1054.LSEH_info_bn_mul4x_mont_gather5:
1055	.byte	9,0,0,0
1056	.rva	mul_handler
1057	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1058.align	8
1059.LSEH_info_bn_gather5:
1060        .byte   0x01,0x0d,0x05,0x00
1061        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
1062        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
1063        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
1064.align	8
1065___
1066}
1067
# Post-process and emit the accumulated assembly.
# Every `...` span in $code is replaced by the result of evaluating it as
# a Perl expression (this is how the computed offsets and shift counts
# above become literal numbers).
1068$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1069
1070print $code;
# STDOUT is the pipe to the translator.  Closing it waits for the
# translator to finish and reports failure if it exited with an error,
# so a failed translation no longer looks like success to the build.
1071close STDOUT or die "error closing STDOUT: $!";
1072