xref: /freebsd/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl (revision b077aed33b7b6aefca7b17ddb250cf521f938613)
#! /usr/bin/env perl
# Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# $win64 selects the 7th-argument stack offset used further down
# (56 instead of 8).  It is set for nasm/masm/mingw64 flavours or when
# the output file name ends in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything written to STDOUT from here on is piped through the
# translator, which receives the flavour and the output file name.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# $addx gates emission of the MULX/AD*X dispatch.  It is derived by
# probing the assembler that will consume the output; the probes are
# tried in order and later ones run only while $addx is still false.
# Note that the matched "major.minor" text is compared as a number.

# GNU assembler, queried through $ENV{CC}: 2.23 or later.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

# NASM (Win64 builds only): 2.10 or later.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

# MASM/ml64 (Win64 builds only): major version 12 or later.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# clang/LLVM integrated assembler: 3.3 or later.  The minor number is
# scaled by 1/100 so that e.g. 3.10 sorts above 3.3.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

6980815a77SJung-uk Kim
########################################################################
# bn_mul_mont_gather5 -- word-by-word Montgomery multiplication whose
# multiplier words b[i] are gathered from the powers table at $bp.
#
# The gather never forms an address from the index.  Compare masks for
# 0..31 against the 7th argument are computed once and parked on the
# stack; each b[i] is then obtained by AND-ing every slot of one
# $STRIDE-byte table row with its mask and OR-ing the results together.
#
# Dispatch: if the low three bits of num are all zero, control is
# transferred to .Lmul4x_enter (inside bn_mul4x_mont_gather5), with
# OPENSSL_ia32cap_P+8 preloaded into %r11d when $addx is set.  Any other
# num is handled here, starting at .Lmul_enter.
#
# Frame built at .Lmul_enter, after %rbx,%rbp,%r12-%r15 are pushed:
#	0(%rsp) ...		tp[0..num-1], temporary vector
#	8*num(%rsp)		tp[num], upmost overflow bit
#	8*num+8(%rsp)		caller's %rsp (restored in the epilogue)
#	(%rsp+8*num+24)&-16	256-byte mask area
# The frame is reached by walking the stack one page at a time, see the
# "OS-agnostic version of __chkstk" note in the code.
#
# Tail: .Lsub writes tp-np to rp, then .Lcopy selects tp[i] or the
# stored difference per word with and/or masks derived from the final
# borrow, overwriting tp[] as it goes.  The function returns 1 in %rax.
########################################################################
# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$7,${num}d
	jnz	.Lmul_enter
___
# The capability word is only needed by the 4x entry point, which tests
# it for AD*X+BMI2+BMI1 when $addx is set.
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	lea	.Linc(%rip),%r10
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:

	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		# From here on $bp names %r12, which is biased by +128;
		# that is why table displacements below carry "-128".
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
# Gather b[0]: the last four slots were masked above while their masks
# were still in registers; the remaining slots are masked with the
# copies just saved on the stack and accumulated into %xmm0/%xmm1.
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st			# note that upon exit $j==$num, so
					# they can be used interchangeably

	add	%rax,$hi1
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	and	\$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Gather b[i] for the current outer iteration: every slot of the row is
# masked with the masks saved on the stack and OR-ed into %xmm4/%xmm5.
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp

	mov	($ap),%rax		# ap[0]
	movq	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner			# note that upon exit $j==$num, so
					# they can be used interchangeably
	add	%rax,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$num,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	mov	\$-1,%rbx
	xor	%rax,%rbx
	xor	$i,$i
	mov	$num,$j			# j=num

.Lcopy:					# conditional copy
	mov	($rp,$i,8),%rcx
	mov	(%rsp,$i,8),%rdx
	and	%rbx,%rcx
	and	%rax,%rdx
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	or	%rcx,%rdx
	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax

	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
########################################################################
# bn_mul4x_mont_gather5 -- frame set-up and tear-down around
# mul4x_internal, which does the actual work.
#
# The symbol is not declared .globl here; within this file it is
# reached through .Lmul4x_enter, the target bn_mul_mont_gather5 jumps to
# when the low three bits of num are zero.  On that path %rax already
# holds the caller's %rsp and, when $addx is set, %r11d holds
# OPENSSL_ia32cap_P+8.
# NOTE(review): entering at the function label itself leaves %r11d
# unloaded before the capability test below -- confirm nothing calls
# the label directly.
#
# Steps:
#   * with $addx, branch to .Lmulx4x_enter if all bits of 0x80108 are
#     set in %r11d (that label is defined elsewhere in the file);
#   * push %rbx,%rbp,%r12-%r15 and convert num to bytes;
#   * pick a 64-byte aligned frame address that avoids aliasing with
#     the rp area modulo 4096 (see the comment in the code), then walk
#     the stack down to it one page at a time;
#   * save the caller's %rsp at 40(%rsp) and call mul4x_internal;
#   * restore the pushed registers and %rsp, return 1 in %rax.
#
# @A and @N name the accumulator register pairs used by mul4x_internal.
########################################################################
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
	.byte	0x67
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80108,%r11d
	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmul4x_prologue:

	.byte	0x67
	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
	neg	$num			# -$num

	##############################################################
	# Ensure that stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow memory disambiguation
	# logic do its magic. [Extra [num] is allocated in order
	# to align with bn_power5's frame, which is cleansed after
	# completing exponentiation. Extra 256 bytes is for power mask
	# calculated from 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rp,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmul4xsp_alt
	sub	%r11,%rbp		# align with $rp
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lmul4xsp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	neg	$num

	mov	%rax,40(%rsp)
.cfi_cfa_expression	%rsp+40,deref,+8
.Lmul4x_body:

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax

	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
5817bded2dbSJung-uk Kim
5827bded2dbSJung-uk Kim.type	mul4x_internal,\@abi-omnipotent
5837bded2dbSJung-uk Kim.align	32
5847bded2dbSJung-uk Kimmul4x_internal:
58517f01e99SJung-uk Kim.cfi_startproc
5864c6a0400SJung-uk Kim	shl	\$5,$num		# $num was in bytes
5874c6a0400SJung-uk Kim	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument, index
5884c6a0400SJung-uk Kim	lea	.Linc(%rip),%rax
5894c6a0400SJung-uk Kim	lea	128(%rdx,$num),%r13	# end of powers table (+size optimization)
5907bded2dbSJung-uk Kim	shr	\$5,$num		# restore $num
5911f13597dSJung-uk Kim___
5921f13597dSJung-uk Kim		$bp="%r12";
5931f13597dSJung-uk Kim		$STRIDE=2**5*8;		# 5 is "window size"
5941f13597dSJung-uk Kim		$N=$STRIDE/4;		# should match cache line size
5957bded2dbSJung-uk Kim		$tp=$i;
5961f13597dSJung-uk Kim$code.=<<___;
5974c6a0400SJung-uk Kim	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
5984c6a0400SJung-uk Kim	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
5994c6a0400SJung-uk Kim	lea	88-112(%rsp,$num),%r10	# place the mask after tp[num+1] (+ICache optimization)
6004c6a0400SJung-uk Kim	lea	128(%rdx),$bp		# size optimization
6011f13597dSJung-uk Kim
6024c6a0400SJung-uk Kim	pshufd	\$0,%xmm5,%xmm5		# broadcast index
6034c6a0400SJung-uk Kim	movdqa	%xmm1,%xmm4
6044c6a0400SJung-uk Kim	.byte	0x67,0x67
6054c6a0400SJung-uk Kim	movdqa	%xmm1,%xmm2
6064c6a0400SJung-uk Kim___
6074c6a0400SJung-uk Kim########################################################################
6084c6a0400SJung-uk Kim# calculate mask by comparing 0..31 to index and save result to stack
6094c6a0400SJung-uk Kim#
6104c6a0400SJung-uk Kim$code.=<<___;
6114c6a0400SJung-uk Kim	paddd	%xmm0,%xmm1
6124c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
6137bded2dbSJung-uk Kim	.byte	0x67
6144c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm3
6154c6a0400SJung-uk Kim___
6164c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) {
6174c6a0400SJung-uk Kim$code.=<<___;
6184c6a0400SJung-uk Kim	paddd	%xmm1,%xmm2
6194c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
6204c6a0400SJung-uk Kim	movdqa	%xmm0,`16*($i+0)+112`(%r10)
6214c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm0
6224c6a0400SJung-uk Kim
6234c6a0400SJung-uk Kim	paddd	%xmm2,%xmm3
6244c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
6254c6a0400SJung-uk Kim	movdqa	%xmm1,`16*($i+1)+112`(%r10)
6264c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm1
6274c6a0400SJung-uk Kim
6284c6a0400SJung-uk Kim	paddd	%xmm3,%xmm0
6294c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
6304c6a0400SJung-uk Kim	movdqa	%xmm2,`16*($i+2)+112`(%r10)
6314c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm2
6324c6a0400SJung-uk Kim
6334c6a0400SJung-uk Kim	paddd	%xmm0,%xmm1
6344c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm0
6354c6a0400SJung-uk Kim	movdqa	%xmm3,`16*($i+3)+112`(%r10)
6364c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm3
6374c6a0400SJung-uk Kim___
6384c6a0400SJung-uk Kim}
6394c6a0400SJung-uk Kim$code.=<<___;				# last iteration can be optimized
6404c6a0400SJung-uk Kim	paddd	%xmm1,%xmm2
6414c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm1
6424c6a0400SJung-uk Kim	movdqa	%xmm0,`16*($i+0)+112`(%r10)
6434c6a0400SJung-uk Kim
6444c6a0400SJung-uk Kim	paddd	%xmm2,%xmm3
6457bded2dbSJung-uk Kim	.byte	0x67
6464c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm2
6474c6a0400SJung-uk Kim	movdqa	%xmm1,`16*($i+1)+112`(%r10)
6484c6a0400SJung-uk Kim
6494c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm3
6504c6a0400SJung-uk Kim	movdqa	%xmm2,`16*($i+2)+112`(%r10)
6514c6a0400SJung-uk Kim	pand	`16*($i+0)-128`($bp),%xmm0	# while it's still in register
6524c6a0400SJung-uk Kim
6534c6a0400SJung-uk Kim	pand	`16*($i+1)-128`($bp),%xmm1
6544c6a0400SJung-uk Kim	pand	`16*($i+2)-128`($bp),%xmm2
6554c6a0400SJung-uk Kim	movdqa	%xmm3,`16*($i+3)+112`(%r10)
6564c6a0400SJung-uk Kim	pand	`16*($i+3)-128`($bp),%xmm3
6571f13597dSJung-uk Kim	por	%xmm2,%xmm0
6584c6a0400SJung-uk Kim	por	%xmm3,%xmm1
6594c6a0400SJung-uk Kim___
6604c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) {
6614c6a0400SJung-uk Kim$code.=<<___;
6624c6a0400SJung-uk Kim	movdqa	`16*($i+0)-128`($bp),%xmm4
6634c6a0400SJung-uk Kim	movdqa	`16*($i+1)-128`($bp),%xmm5
6644c6a0400SJung-uk Kim	movdqa	`16*($i+2)-128`($bp),%xmm2
6654c6a0400SJung-uk Kim	pand	`16*($i+0)+112`(%r10),%xmm4
6664c6a0400SJung-uk Kim	movdqa	`16*($i+3)-128`($bp),%xmm3
6674c6a0400SJung-uk Kim	pand	`16*($i+1)+112`(%r10),%xmm5
6684c6a0400SJung-uk Kim	por	%xmm4,%xmm0
6694c6a0400SJung-uk Kim	pand	`16*($i+2)+112`(%r10),%xmm2
6704c6a0400SJung-uk Kim	por	%xmm5,%xmm1
6714c6a0400SJung-uk Kim	pand	`16*($i+3)+112`(%r10),%xmm3
6724c6a0400SJung-uk Kim	por	%xmm2,%xmm0
6734c6a0400SJung-uk Kim	por	%xmm3,%xmm1
6744c6a0400SJung-uk Kim___
6754c6a0400SJung-uk Kim}
6764c6a0400SJung-uk Kim$code.=<<___;
6774c6a0400SJung-uk Kim	por	%xmm1,%xmm0
6784c6a0400SJung-uk Kim	pshufd	\$0x4e,%xmm0,%xmm1
6794c6a0400SJung-uk Kim	por	%xmm1,%xmm0
6804c6a0400SJung-uk Kim	lea	$STRIDE($bp),$bp
6811f13597dSJung-uk Kim	movq	%xmm0,$m0		# m0=bp[0]
6824c6a0400SJung-uk Kim
6837bded2dbSJung-uk Kim	mov	%r13,16+8(%rsp)		# save end of b[num]
6847bded2dbSJung-uk Kim	mov	$rp, 56+8(%rsp)		# save $rp
6857bded2dbSJung-uk Kim
6861f13597dSJung-uk Kim	mov	($n0),$n0		# pull n0[0] value
6871f13597dSJung-uk Kim	mov	($ap),%rax
6887bded2dbSJung-uk Kim	lea	($ap,$num),$ap		# end of a[num]
6897bded2dbSJung-uk Kim	neg	$num
6901f13597dSJung-uk Kim
6911f13597dSJung-uk Kim	mov	$n0,$m1
6921f13597dSJung-uk Kim	mulq	$m0			# ap[0]*bp[0]
6931f13597dSJung-uk Kim	mov	%rax,$A[0]
6941f13597dSJung-uk Kim	mov	($np),%rax
6951f13597dSJung-uk Kim
6961f13597dSJung-uk Kim	imulq	$A[0],$m1		# "tp[0]"*n0
6974c6a0400SJung-uk Kim	lea	64+8(%rsp),$tp
6981f13597dSJung-uk Kim	mov	%rdx,$A[1]
6991f13597dSJung-uk Kim
7001f13597dSJung-uk Kim	mulq	$m1			# np[0]*m1
7011f13597dSJung-uk Kim	add	%rax,$A[0]		# discarded
7027bded2dbSJung-uk Kim	mov	8($ap,$num),%rax
7031f13597dSJung-uk Kim	adc	\$0,%rdx
7041f13597dSJung-uk Kim	mov	%rdx,$N[1]
7051f13597dSJung-uk Kim
7061f13597dSJung-uk Kim	mulq	$m0
7071f13597dSJung-uk Kim	add	%rax,$A[1]
7084c6a0400SJung-uk Kim	mov	8*1($np),%rax
7091f13597dSJung-uk Kim	adc	\$0,%rdx
7101f13597dSJung-uk Kim	mov	%rdx,$A[0]
7111f13597dSJung-uk Kim
7121f13597dSJung-uk Kim	mulq	$m1
7131f13597dSJung-uk Kim	add	%rax,$N[1]
7147bded2dbSJung-uk Kim	mov	16($ap,$num),%rax
7151f13597dSJung-uk Kim	adc	\$0,%rdx
7161f13597dSJung-uk Kim	add	$A[1],$N[1]
7177bded2dbSJung-uk Kim	lea	4*8($num),$j		# j=4
7184c6a0400SJung-uk Kim	lea	8*4($np),$np
7191f13597dSJung-uk Kim	adc	\$0,%rdx
7207bded2dbSJung-uk Kim	mov	$N[1],($tp)
7211f13597dSJung-uk Kim	mov	%rdx,$N[0]
7221f13597dSJung-uk Kim	jmp	.L1st4x
7237bded2dbSJung-uk Kim
7247bded2dbSJung-uk Kim.align	32
7251f13597dSJung-uk Kim.L1st4x:
7261f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[0]
7271f13597dSJung-uk Kim	add	%rax,$A[0]
7284c6a0400SJung-uk Kim	mov	-8*2($np),%rax
7297bded2dbSJung-uk Kim	lea	32($tp),$tp
7301f13597dSJung-uk Kim	adc	\$0,%rdx
7311f13597dSJung-uk Kim	mov	%rdx,$A[1]
7321f13597dSJung-uk Kim
7331f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
7341f13597dSJung-uk Kim	add	%rax,$N[0]
7357bded2dbSJung-uk Kim	mov	-8($ap,$j),%rax
7361f13597dSJung-uk Kim	adc	\$0,%rdx
7371f13597dSJung-uk Kim	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
7381f13597dSJung-uk Kim	adc	\$0,%rdx
7397bded2dbSJung-uk Kim	mov	$N[0],-24($tp)		# tp[j-1]
7401f13597dSJung-uk Kim	mov	%rdx,$N[1]
7411f13597dSJung-uk Kim
7421f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[0]
7431f13597dSJung-uk Kim	add	%rax,$A[1]
7444c6a0400SJung-uk Kim	mov	-8*1($np),%rax
7451f13597dSJung-uk Kim	adc	\$0,%rdx
7461f13597dSJung-uk Kim	mov	%rdx,$A[0]
7471f13597dSJung-uk Kim
7481f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
7491f13597dSJung-uk Kim	add	%rax,$N[1]
7507bded2dbSJung-uk Kim	mov	($ap,$j),%rax
7511f13597dSJung-uk Kim	adc	\$0,%rdx
7521f13597dSJung-uk Kim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
7531f13597dSJung-uk Kim	adc	\$0,%rdx
7547bded2dbSJung-uk Kim	mov	$N[1],-16($tp)		# tp[j-1]
7551f13597dSJung-uk Kim	mov	%rdx,$N[0]
7561f13597dSJung-uk Kim
7571f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[0]
7581f13597dSJung-uk Kim	add	%rax,$A[0]
7594c6a0400SJung-uk Kim	mov	8*0($np),%rax
7601f13597dSJung-uk Kim	adc	\$0,%rdx
7611f13597dSJung-uk Kim	mov	%rdx,$A[1]
7621f13597dSJung-uk Kim
7631f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
7641f13597dSJung-uk Kim	add	%rax,$N[0]
7657bded2dbSJung-uk Kim	mov	8($ap,$j),%rax
7661f13597dSJung-uk Kim	adc	\$0,%rdx
7671f13597dSJung-uk Kim	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
7681f13597dSJung-uk Kim	adc	\$0,%rdx
7697bded2dbSJung-uk Kim	mov	$N[0],-8($tp)		# tp[j-1]
7701f13597dSJung-uk Kim	mov	%rdx,$N[1]
7711f13597dSJung-uk Kim
7721f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[0]
7731f13597dSJung-uk Kim	add	%rax,$A[1]
7744c6a0400SJung-uk Kim	mov	8*1($np),%rax
7751f13597dSJung-uk Kim	adc	\$0,%rdx
7761f13597dSJung-uk Kim	mov	%rdx,$A[0]
7771f13597dSJung-uk Kim
7781f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
7791f13597dSJung-uk Kim	add	%rax,$N[1]
7807bded2dbSJung-uk Kim	mov	16($ap,$j),%rax
7811f13597dSJung-uk Kim	adc	\$0,%rdx
7821f13597dSJung-uk Kim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
7834c6a0400SJung-uk Kim	lea	8*4($np),$np
7841f13597dSJung-uk Kim	adc	\$0,%rdx
7857bded2dbSJung-uk Kim	mov	$N[1],($tp)		# tp[j-1]
7861f13597dSJung-uk Kim	mov	%rdx,$N[0]
7877bded2dbSJung-uk Kim
7887bded2dbSJung-uk Kim	add	\$32,$j			# j+=4
7897bded2dbSJung-uk Kim	jnz	.L1st4x
7901f13597dSJung-uk Kim
7911f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[0]
7921f13597dSJung-uk Kim	add	%rax,$A[0]
7934c6a0400SJung-uk Kim	mov	-8*2($np),%rax
7947bded2dbSJung-uk Kim	lea	32($tp),$tp
7951f13597dSJung-uk Kim	adc	\$0,%rdx
7961f13597dSJung-uk Kim	mov	%rdx,$A[1]
7971f13597dSJung-uk Kim
7981f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
7991f13597dSJung-uk Kim	add	%rax,$N[0]
8007bded2dbSJung-uk Kim	mov	-8($ap),%rax
8011f13597dSJung-uk Kim	adc	\$0,%rdx
8021f13597dSJung-uk Kim	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
8031f13597dSJung-uk Kim	adc	\$0,%rdx
8047bded2dbSJung-uk Kim	mov	$N[0],-24($tp)		# tp[j-1]
8051f13597dSJung-uk Kim	mov	%rdx,$N[1]
8061f13597dSJung-uk Kim
8071f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[0]
8081f13597dSJung-uk Kim	add	%rax,$A[1]
8094c6a0400SJung-uk Kim	mov	-8*1($np),%rax
8101f13597dSJung-uk Kim	adc	\$0,%rdx
8111f13597dSJung-uk Kim	mov	%rdx,$A[0]
8121f13597dSJung-uk Kim
8131f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
8141f13597dSJung-uk Kim	add	%rax,$N[1]
8157bded2dbSJung-uk Kim	mov	($ap,$num),%rax		# ap[0]
8161f13597dSJung-uk Kim	adc	\$0,%rdx
8171f13597dSJung-uk Kim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
8181f13597dSJung-uk Kim	adc	\$0,%rdx
8197bded2dbSJung-uk Kim	mov	$N[1],-16($tp)		# tp[j-1]
8201f13597dSJung-uk Kim	mov	%rdx,$N[0]
8211f13597dSJung-uk Kim
8224c6a0400SJung-uk Kim	lea	($np,$num),$np		# rewind $np
8231f13597dSJung-uk Kim
8241f13597dSJung-uk Kim	xor	$N[1],$N[1]
8251f13597dSJung-uk Kim	add	$A[0],$N[0]
8261f13597dSJung-uk Kim	adc	\$0,$N[1]
8277bded2dbSJung-uk Kim	mov	$N[0],-8($tp)
8281f13597dSJung-uk Kim
8297bded2dbSJung-uk Kim	jmp	.Louter4x
8307bded2dbSJung-uk Kim
8317bded2dbSJung-uk Kim.align	32
8321f13597dSJung-uk Kim.Louter4x:
8334c6a0400SJung-uk Kim	lea	16+128($tp),%rdx	# where 256-byte mask is (+size optimization)
8344c6a0400SJung-uk Kim	pxor	%xmm4,%xmm4
8354c6a0400SJung-uk Kim	pxor	%xmm5,%xmm5
8364c6a0400SJung-uk Kim___
8374c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16;$i+=4) {
8384c6a0400SJung-uk Kim$code.=<<___;
8394c6a0400SJung-uk Kim	movdqa	`16*($i+0)-128`($bp),%xmm0
8404c6a0400SJung-uk Kim	movdqa	`16*($i+1)-128`($bp),%xmm1
8414c6a0400SJung-uk Kim	movdqa	`16*($i+2)-128`($bp),%xmm2
8424c6a0400SJung-uk Kim	movdqa	`16*($i+3)-128`($bp),%xmm3
8434c6a0400SJung-uk Kim	pand	`16*($i+0)-128`(%rdx),%xmm0
8444c6a0400SJung-uk Kim	pand	`16*($i+1)-128`(%rdx),%xmm1
8454c6a0400SJung-uk Kim	por	%xmm0,%xmm4
8464c6a0400SJung-uk Kim	pand	`16*($i+2)-128`(%rdx),%xmm2
8474c6a0400SJung-uk Kim	por	%xmm1,%xmm5
8484c6a0400SJung-uk Kim	pand	`16*($i+3)-128`(%rdx),%xmm3
8494c6a0400SJung-uk Kim	por	%xmm2,%xmm4
8504c6a0400SJung-uk Kim	por	%xmm3,%xmm5
8514c6a0400SJung-uk Kim___
8524c6a0400SJung-uk Kim}
8534c6a0400SJung-uk Kim$code.=<<___;
8544c6a0400SJung-uk Kim	por	%xmm5,%xmm4
8554c6a0400SJung-uk Kim	pshufd	\$0x4e,%xmm4,%xmm0
8564c6a0400SJung-uk Kim	por	%xmm4,%xmm0
8574c6a0400SJung-uk Kim	lea	$STRIDE($bp),$bp
8584c6a0400SJung-uk Kim	movq	%xmm0,$m0		# m0=bp[i]
8594c6a0400SJung-uk Kim
8607bded2dbSJung-uk Kim	mov	($tp,$num),$A[0]
8611f13597dSJung-uk Kim	mov	$n0,$m1
8621f13597dSJung-uk Kim	mulq	$m0			# ap[0]*bp[i]
8631f13597dSJung-uk Kim	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
8641f13597dSJung-uk Kim	mov	($np),%rax
8651f13597dSJung-uk Kim	adc	\$0,%rdx
8661f13597dSJung-uk Kim
8677bded2dbSJung-uk Kim	imulq	$A[0],$m1		# tp[0]*n0
8687bded2dbSJung-uk Kim	mov	%rdx,$A[1]
8697bded2dbSJung-uk Kim	mov	$N[1],($tp)		# store upmost overflow bit
8707bded2dbSJung-uk Kim
8717bded2dbSJung-uk Kim	lea	($tp,$num),$tp		# rewind $tp
8721f13597dSJung-uk Kim
8731f13597dSJung-uk Kim	mulq	$m1			# np[0]*m1
8741f13597dSJung-uk Kim	add	%rax,$A[0]		# "$N[0]", discarded
8757bded2dbSJung-uk Kim	mov	8($ap,$num),%rax
8761f13597dSJung-uk Kim	adc	\$0,%rdx
8771f13597dSJung-uk Kim	mov	%rdx,$N[1]
8781f13597dSJung-uk Kim
8791f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[i]
8801f13597dSJung-uk Kim	add	%rax,$A[1]
8814c6a0400SJung-uk Kim	mov	8*1($np),%rax
8821f13597dSJung-uk Kim	adc	\$0,%rdx
8837bded2dbSJung-uk Kim	add	8($tp),$A[1]		# +tp[1]
8841f13597dSJung-uk Kim	adc	\$0,%rdx
8851f13597dSJung-uk Kim	mov	%rdx,$A[0]
8861f13597dSJung-uk Kim
8871f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
8881f13597dSJung-uk Kim	add	%rax,$N[1]
8897bded2dbSJung-uk Kim	mov	16($ap,$num),%rax
8901f13597dSJung-uk Kim	adc	\$0,%rdx
8911f13597dSJung-uk Kim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
8927bded2dbSJung-uk Kim	lea	4*8($num),$j		# j=4
8934c6a0400SJung-uk Kim	lea	8*4($np),$np
8941f13597dSJung-uk Kim	adc	\$0,%rdx
8951f13597dSJung-uk Kim	mov	%rdx,$N[0]
8961f13597dSJung-uk Kim	jmp	.Linner4x
8977bded2dbSJung-uk Kim
8987bded2dbSJung-uk Kim.align	32
8991f13597dSJung-uk Kim.Linner4x:
9001f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[i]
9011f13597dSJung-uk Kim	add	%rax,$A[0]
9024c6a0400SJung-uk Kim	mov	-8*2($np),%rax
9031f13597dSJung-uk Kim	adc	\$0,%rdx
9047bded2dbSJung-uk Kim	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
9057bded2dbSJung-uk Kim	lea	32($tp),$tp
9061f13597dSJung-uk Kim	adc	\$0,%rdx
9071f13597dSJung-uk Kim	mov	%rdx,$A[1]
9081f13597dSJung-uk Kim
9091f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
9101f13597dSJung-uk Kim	add	%rax,$N[0]
9117bded2dbSJung-uk Kim	mov	-8($ap,$j),%rax
9121f13597dSJung-uk Kim	adc	\$0,%rdx
9131f13597dSJung-uk Kim	add	$A[0],$N[0]
9141f13597dSJung-uk Kim	adc	\$0,%rdx
9157bded2dbSJung-uk Kim	mov	$N[1],-32($tp)		# tp[j-1]
9161f13597dSJung-uk Kim	mov	%rdx,$N[1]
9171f13597dSJung-uk Kim
9181f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[i]
9191f13597dSJung-uk Kim	add	%rax,$A[1]
9204c6a0400SJung-uk Kim	mov	-8*1($np),%rax
9211f13597dSJung-uk Kim	adc	\$0,%rdx
9227bded2dbSJung-uk Kim	add	-8($tp),$A[1]
9231f13597dSJung-uk Kim	adc	\$0,%rdx
9241f13597dSJung-uk Kim	mov	%rdx,$A[0]
9251f13597dSJung-uk Kim
9261f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
9271f13597dSJung-uk Kim	add	%rax,$N[1]
9287bded2dbSJung-uk Kim	mov	($ap,$j),%rax
9291f13597dSJung-uk Kim	adc	\$0,%rdx
9301f13597dSJung-uk Kim	add	$A[1],$N[1]
9311f13597dSJung-uk Kim	adc	\$0,%rdx
9327bded2dbSJung-uk Kim	mov	$N[0],-24($tp)		# tp[j-1]
9331f13597dSJung-uk Kim	mov	%rdx,$N[0]
9341f13597dSJung-uk Kim
9351f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[i]
9361f13597dSJung-uk Kim	add	%rax,$A[0]
9374c6a0400SJung-uk Kim	mov	8*0($np),%rax
9381f13597dSJung-uk Kim	adc	\$0,%rdx
9397bded2dbSJung-uk Kim	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
9401f13597dSJung-uk Kim	adc	\$0,%rdx
9411f13597dSJung-uk Kim	mov	%rdx,$A[1]
9421f13597dSJung-uk Kim
9431f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
9441f13597dSJung-uk Kim	add	%rax,$N[0]
9457bded2dbSJung-uk Kim	mov	8($ap,$j),%rax
9461f13597dSJung-uk Kim	adc	\$0,%rdx
9471f13597dSJung-uk Kim	add	$A[0],$N[0]
9481f13597dSJung-uk Kim	adc	\$0,%rdx
9497bded2dbSJung-uk Kim	mov	$N[1],-16($tp)		# tp[j-1]
9501f13597dSJung-uk Kim	mov	%rdx,$N[1]
9511f13597dSJung-uk Kim
9521f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[i]
9531f13597dSJung-uk Kim	add	%rax,$A[1]
9544c6a0400SJung-uk Kim	mov	8*1($np),%rax
9551f13597dSJung-uk Kim	adc	\$0,%rdx
9567bded2dbSJung-uk Kim	add	8($tp),$A[1]
9571f13597dSJung-uk Kim	adc	\$0,%rdx
9581f13597dSJung-uk Kim	mov	%rdx,$A[0]
9591f13597dSJung-uk Kim
9601f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
9611f13597dSJung-uk Kim	add	%rax,$N[1]
9627bded2dbSJung-uk Kim	mov	16($ap,$j),%rax
9631f13597dSJung-uk Kim	adc	\$0,%rdx
9641f13597dSJung-uk Kim	add	$A[1],$N[1]
9654c6a0400SJung-uk Kim	lea	8*4($np),$np
9661f13597dSJung-uk Kim	adc	\$0,%rdx
9677bded2dbSJung-uk Kim	mov	$N[0],-8($tp)		# tp[j-1]
9681f13597dSJung-uk Kim	mov	%rdx,$N[0]
9697bded2dbSJung-uk Kim
9707bded2dbSJung-uk Kim	add	\$32,$j			# j+=4
9717bded2dbSJung-uk Kim	jnz	.Linner4x
9721f13597dSJung-uk Kim
9731f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[i]
9741f13597dSJung-uk Kim	add	%rax,$A[0]
9754c6a0400SJung-uk Kim	mov	-8*2($np),%rax
9761f13597dSJung-uk Kim	adc	\$0,%rdx
9777bded2dbSJung-uk Kim	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
9787bded2dbSJung-uk Kim	lea	32($tp),$tp
9791f13597dSJung-uk Kim	adc	\$0,%rdx
9801f13597dSJung-uk Kim	mov	%rdx,$A[1]
9811f13597dSJung-uk Kim
9821f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
9831f13597dSJung-uk Kim	add	%rax,$N[0]
9847bded2dbSJung-uk Kim	mov	-8($ap),%rax
9851f13597dSJung-uk Kim	adc	\$0,%rdx
9861f13597dSJung-uk Kim	add	$A[0],$N[0]
9871f13597dSJung-uk Kim	adc	\$0,%rdx
9887bded2dbSJung-uk Kim	mov	$N[1],-32($tp)		# tp[j-1]
9891f13597dSJung-uk Kim	mov	%rdx,$N[1]
9901f13597dSJung-uk Kim
9911f13597dSJung-uk Kim	mulq	$m0			# ap[j]*bp[i]
9921f13597dSJung-uk Kim	add	%rax,$A[1]
9937bded2dbSJung-uk Kim	mov	$m1,%rax
9944c6a0400SJung-uk Kim	mov	-8*1($np),$m1
9951f13597dSJung-uk Kim	adc	\$0,%rdx
9967bded2dbSJung-uk Kim	add	-8($tp),$A[1]
9971f13597dSJung-uk Kim	adc	\$0,%rdx
9981f13597dSJung-uk Kim	mov	%rdx,$A[0]
9991f13597dSJung-uk Kim
10001f13597dSJung-uk Kim	mulq	$m1			# np[j]*m1
10011f13597dSJung-uk Kim	add	%rax,$N[1]
10027bded2dbSJung-uk Kim	mov	($ap,$num),%rax		# ap[0]
10031f13597dSJung-uk Kim	adc	\$0,%rdx
10041f13597dSJung-uk Kim	add	$A[1],$N[1]
10051f13597dSJung-uk Kim	adc	\$0,%rdx
10067bded2dbSJung-uk Kim	mov	$N[0],-24($tp)		# tp[j-1]
10071f13597dSJung-uk Kim	mov	%rdx,$N[0]
10081f13597dSJung-uk Kim
10097bded2dbSJung-uk Kim	mov	$N[1],-16($tp)		# tp[j-1]
10104c6a0400SJung-uk Kim	lea	($np,$num),$np		# rewind $np
10111f13597dSJung-uk Kim
10121f13597dSJung-uk Kim	xor	$N[1],$N[1]
10131f13597dSJung-uk Kim	add	$A[0],$N[0]
10141f13597dSJung-uk Kim	adc	\$0,$N[1]
10157bded2dbSJung-uk Kim	add	($tp),$N[0]		# pull upmost overflow bit
10167bded2dbSJung-uk Kim	adc	\$0,$N[1]		# upmost overflow bit
10177bded2dbSJung-uk Kim	mov	$N[0],-8($tp)
10181f13597dSJung-uk Kim
10197bded2dbSJung-uk Kim	cmp	16+8(%rsp),$bp
10207bded2dbSJung-uk Kim	jb	.Louter4x
10211f13597dSJung-uk Kim___
10227bded2dbSJung-uk Kimif (1) {
10231f13597dSJung-uk Kim$code.=<<___;
10244c6a0400SJung-uk Kim	xor	%rax,%rax
10257bded2dbSJung-uk Kim	sub	$N[0],$m1		# compare top-most words
10267bded2dbSJung-uk Kim	adc	$j,$j			# $j is zero
10277bded2dbSJung-uk Kim	or	$j,$N[1]
10284c6a0400SJung-uk Kim	sub	$N[1],%rax		# %rax=-$N[1]
10297bded2dbSJung-uk Kim	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
10304c6a0400SJung-uk Kim	mov	($np),%r12
10314c6a0400SJung-uk Kim	lea	($np),%rbp		# nptr in .sqr4x_sub
10327bded2dbSJung-uk Kim	mov	%r9,%rcx
10334c6a0400SJung-uk Kim	sar	\$3+2,%rcx
10347bded2dbSJung-uk Kim	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
10354c6a0400SJung-uk Kim	dec	%r12			# so that after 'not' we get -n[0]
10364c6a0400SJung-uk Kim	xor	%r10,%r10
10374c6a0400SJung-uk Kim	mov	8*1(%rbp),%r13
10384c6a0400SJung-uk Kim	mov	8*2(%rbp),%r14
10394c6a0400SJung-uk Kim	mov	8*3(%rbp),%r15
10404c6a0400SJung-uk Kim	jmp	.Lsqr4x_sub_entry
10417bded2dbSJung-uk Kim___
10427bded2dbSJung-uk Kim} else {
10437bded2dbSJung-uk Kimmy @ri=("%rax",$bp,$m0,$m1);
10447bded2dbSJung-uk Kimmy $rp="%rdx";
10457bded2dbSJung-uk Kim$code.=<<___
10467bded2dbSJung-uk Kim	xor	\$1,$N[1]
10477bded2dbSJung-uk Kim	lea	($tp,$num),$tp		# rewind $tp
10487bded2dbSJung-uk Kim	sar	\$5,$num		# cf=0
10497bded2dbSJung-uk Kim	lea	($np,$N[1],8),$np
10507bded2dbSJung-uk Kim	mov	56+8(%rsp),$rp		# restore $rp
10511f13597dSJung-uk Kim	jmp	.Lsub4x
10527bded2dbSJung-uk Kim
10537bded2dbSJung-uk Kim.align	32
10541f13597dSJung-uk Kim.Lsub4x:
10557bded2dbSJung-uk Kim	.byte	0x66
10567bded2dbSJung-uk Kim	mov	8*0($tp),@ri[0]
10577bded2dbSJung-uk Kim	mov	8*1($tp),@ri[1]
10587bded2dbSJung-uk Kim	.byte	0x66
10597bded2dbSJung-uk Kim	sbb	16*0($np),@ri[0]
10607bded2dbSJung-uk Kim	mov	8*2($tp),@ri[2]
10617bded2dbSJung-uk Kim	sbb	16*1($np),@ri[1]
10627bded2dbSJung-uk Kim	mov	3*8($tp),@ri[3]
10637bded2dbSJung-uk Kim	lea	4*8($tp),$tp
10647bded2dbSJung-uk Kim	sbb	16*2($np),@ri[2]
10657bded2dbSJung-uk Kim	mov	@ri[0],8*0($rp)
10667bded2dbSJung-uk Kim	sbb	16*3($np),@ri[3]
10677bded2dbSJung-uk Kim	lea	16*4($np),$np
10687bded2dbSJung-uk Kim	mov	@ri[1],8*1($rp)
10697bded2dbSJung-uk Kim	mov	@ri[2],8*2($rp)
10707bded2dbSJung-uk Kim	mov	@ri[3],8*3($rp)
10717bded2dbSJung-uk Kim	lea	8*4($rp),$rp
10727bded2dbSJung-uk Kim
10737bded2dbSJung-uk Kim	inc	$num
10741f13597dSJung-uk Kim	jnz	.Lsub4x
10751f13597dSJung-uk Kim
10767bded2dbSJung-uk Kim	ret
10771f13597dSJung-uk Kim___
10781f13597dSJung-uk Kim}
10791f13597dSJung-uk Kim$code.=<<___;
108017f01e99SJung-uk Kim.cfi_endproc
10817bded2dbSJung-uk Kim.size	mul4x_internal,.-mul4x_internal
10827bded2dbSJung-uk Kim___
10837bded2dbSJung-uk Kim}}}
10847bded2dbSJung-uk Kim{{{
10857bded2dbSJung-uk Kim######################################################################
10867bded2dbSJung-uk Kim# void bn_power5(
10877bded2dbSJung-uk Kimmy $rptr="%rdi";	# BN_ULONG *rptr,
10887bded2dbSJung-uk Kimmy $aptr="%rsi";	# const BN_ULONG *aptr,
10897bded2dbSJung-uk Kimmy $bptr="%rdx";	# const void *table,
10907bded2dbSJung-uk Kimmy $nptr="%rcx";	# const BN_ULONG *nptr,
10917bded2dbSJung-uk Kimmy $n0  ="%r8";		# const BN_ULONG *n0,
10927bded2dbSJung-uk Kimmy $num ="%r9";		# int num, has to be divisible by 8
10937bded2dbSJung-uk Kim			# int pwr);
#
# Emits bn_power5. The generated routine builds a private stack frame,
# runs five back-to-back __bn_sqr8x_internal + __bn_post4x_internal
# rounds, finishes with a single mul4x_internal call against the table
# and returns 1 in %rax.
# NOTE(review): presumably the result is a^(2^5) multiplied by the table
# entry selected by pwr; the helper bodies are not fully visible from
# here, so confirm against them.
# pwr is the 7th argument (see the frame comment below); it is not read
# in this routine. The entry %rsp is reloaded into %rax right before the
# mul4x_internal call, presumably so the callee can locate pwr from it.
#
10947bded2dbSJung-uk Kim
# Register aliases for the squaring code that follows bn_power5.
10957bded2dbSJung-uk Kimmy ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
10967bded2dbSJung-uk Kimmy @A0=("%r10","%r11");
10977bded2dbSJung-uk Kimmy @A1=("%r12","%r13");
10987bded2dbSJung-uk Kimmy ($a0,$a1,$ai)=("%r14","%r15","%rbx");
10997bded2dbSJung-uk Kim
11007bded2dbSJung-uk Kim$code.=<<___;
11017bded2dbSJung-uk Kim.globl	bn_power5
11027bded2dbSJung-uk Kim.type	bn_power5,\@function,6
11037bded2dbSJung-uk Kim.align	32
11047bded2dbSJung-uk Kimbn_power5:
1105e71b7053SJung-uk Kim.cfi_startproc
	# Keep the entry stack pointer in %rax: it is stored at 40(%rsp)
	# once the frame exists and is used to restore everything on exit.
1106aeb5019cSJung-uk Kim	mov	%rsp,%rax
1107e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
11087bded2dbSJung-uk Kim___
# Emitted only when $addx is set: branch to .Lpowerx5_enter (defined
# outside this block) if every bit of mask 0x80108 is set in the dword
# at OPENSSL_ia32cap_P+8.
11097bded2dbSJung-uk Kim$code.=<<___ if ($addx);
11107bded2dbSJung-uk Kim	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
11114c6a0400SJung-uk Kim	and	\$0x80108,%r11d
11124c6a0400SJung-uk Kim	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
11137bded2dbSJung-uk Kim	je	.Lpowerx5_enter
11147bded2dbSJung-uk Kim___
11157bded2dbSJung-uk Kim$code.=<<___;
	# Save the callee-saved registers. Relative to the entry %rsp held
	# in %rax they land at -8 (%rbx) down to -48 (%r15); the epilogue
	# reloads them from exactly those offsets.
11167bded2dbSJung-uk Kim	push	%rbx
1117e71b7053SJung-uk Kim.cfi_push	%rbx
11187bded2dbSJung-uk Kim	push	%rbp
1119e71b7053SJung-uk Kim.cfi_push	%rbp
11207bded2dbSJung-uk Kim	push	%r12
1121e71b7053SJung-uk Kim.cfi_push	%r12
11227bded2dbSJung-uk Kim	push	%r13
1123e71b7053SJung-uk Kim.cfi_push	%r13
11247bded2dbSJung-uk Kim	push	%r14
1125e71b7053SJung-uk Kim.cfi_push	%r14
11267bded2dbSJung-uk Kim	push	%r15
1127e71b7053SJung-uk Kim.cfi_push	%r15
1128aeb5019cSJung-uk Kim.Lpower5_prologue:
11294c6a0400SJung-uk Kim
11307bded2dbSJung-uk Kim	shl	\$3,${num}d		# convert $num to bytes
11314c6a0400SJung-uk Kim	lea	($num,$num,2),%r10d	# 3*$num
	# Negated so that the scaled-index lea instructions below subtract
	# twice the byte count; it is negated back after the page walk.
11327bded2dbSJung-uk Kim	neg	$num
11337bded2dbSJung-uk Kim	mov	($n0),$n0		# *n0
11347bded2dbSJung-uk Kim
11357bded2dbSJung-uk Kim	##############################################################
11364c6a0400SJung-uk Kim	# Ensure that stack frame doesn't alias with $rptr+3*$num
11374c6a0400SJung-uk Kim	# modulo 4096, which covers ret[num], am[num] and n[num]
11384c6a0400SJung-uk Kim	# (see bn_exp.c). This is done to allow memory disambiguation
11394c6a0400SJung-uk Kim	# logic to do its magic. [Extra 256 bytes is for power mask
11404c6a0400SJung-uk Kim	# calculated from 7th argument, the index.]
11417bded2dbSJung-uk Kim	#
	# Tentative frame bottom: current stack pointer minus twice the
	# byte count minus 320. Its distance from the result pointer is
	# reduced modulo 4096 and compared (unsigned) with the 3*num value
	# in %r10; the alternative path is taken when %r10 is below it.
11424c6a0400SJung-uk Kim	lea	-320(%rsp,$num,2),%r11
1143aeb5019cSJung-uk Kim	mov	%rsp,%rbp
11444c6a0400SJung-uk Kim	sub	$rptr,%r11
11457bded2dbSJung-uk Kim	and	\$4095,%r11
11467bded2dbSJung-uk Kim	cmp	%r11,%r10
11477bded2dbSJung-uk Kim	jb	.Lpwr_sp_alt
	# NOTE(review): %r11 was derived from $rptr above, so after the
	# next two instructions %rbp is congruent to $rptr modulo 4096.
	# The trailing comment naming $aptr presumably assumes the caller
	# passes the same pointer for both -- confirm.
1148aeb5019cSJung-uk Kim	sub	%r11,%rbp		# align with $aptr
1149aeb5019cSJung-uk Kim	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
11507bded2dbSJung-uk Kim	jmp	.Lpwr_sp_done
11517bded2dbSJung-uk Kim
11527bded2dbSJung-uk Kim.align	32
11537bded2dbSJung-uk Kim.Lpwr_sp_alt:
	# Lower the frame by a further max(%r11 - (4096-320-2*num bytes), 0).
	# The clamp to zero relies on the carry produced by the sub, which
	# is why %r10 is cleared with mov (flags untouched) before cmovc.
11544c6a0400SJung-uk Kim	lea	4096-320(,$num,2),%r10
1155aeb5019cSJung-uk Kim	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
11567bded2dbSJung-uk Kim	sub	%r10,%r11
11577bded2dbSJung-uk Kim	mov	\$0,%r10
11587bded2dbSJung-uk Kim	cmovc	%r10,%r11
1159aeb5019cSJung-uk Kim	sub	%r11,%rbp
11607bded2dbSJung-uk Kim.Lpwr_sp_done:
	# Round the new frame bottom down to a 64-byte boundary, then place
	# the stack pointer a whole number of 4096-byte steps above it and
	# walk it down one step at a time, reading one word per step, until
	# it equals %rbp. The loaded value is discarded (%r10 is rewritten
	# right after the loop); presumably the reads exist to touch every
	# intervening stack page in order.
1161aeb5019cSJung-uk Kim	and	\$-64,%rbp
1162aeb5019cSJung-uk Kim	mov	%rsp,%r11
1163aeb5019cSJung-uk Kim	sub	%rbp,%r11
1164b8721c16SJung-uk Kim	and	\$-4096,%r11
1165aeb5019cSJung-uk Kim	lea	(%rbp,%r11),%rsp
1166aeb5019cSJung-uk Kim	mov	(%rsp),%r10
1167aeb5019cSJung-uk Kim	cmp	%rbp,%rsp
1168aeb5019cSJung-uk Kim	ja	.Lpwr_page_walk
1169aeb5019cSJung-uk Kim	jmp	.Lpwr_page_walk_done
1170aeb5019cSJung-uk Kim
1171b8721c16SJung-uk Kim.Lpwr_page_walk:
1172aeb5019cSJung-uk Kim	lea	-4096(%rsp),%rsp
1173aeb5019cSJung-uk Kim	mov	(%rsp),%r10
1174aeb5019cSJung-uk Kim	cmp	%rbp,%rsp
1175aeb5019cSJung-uk Kim	ja	.Lpwr_page_walk
1176aeb5019cSJung-uk Kim.Lpwr_page_walk_done:
1177b8721c16SJung-uk Kim
	# %r10 keeps the negated byte count (stashed in %xmm3 below), while
	# the count register itself becomes positive again.
11787bded2dbSJung-uk Kim	mov	$num,%r10
11797bded2dbSJung-uk Kim	neg	$num
11807bded2dbSJung-uk Kim
11817bded2dbSJung-uk Kim	##############################################################
11827bded2dbSJung-uk Kim	# Stack layout
11837bded2dbSJung-uk Kim	#
11847bded2dbSJung-uk Kim	# +0	saved $num, used in reduction section
11857bded2dbSJung-uk Kim	# +8	&t[2*$num], used in reduction section
11867bded2dbSJung-uk Kim	# +32	saved *n0
11877bded2dbSJung-uk Kim	# +40	saved %rsp
11887bded2dbSJung-uk Kim	# +48	t[2*$num]
11897bded2dbSJung-uk Kim	#
11907bded2dbSJung-uk Kim	mov	$n0,  32(%rsp)
11917bded2dbSJung-uk Kim	mov	%rax, 40(%rsp)		# save original %rsp
1192e71b7053SJung-uk Kim.cfi_cfa_expression	%rsp+40,deref,+8
11937bded2dbSJung-uk Kim.Lpower5_body:
	# Park values in xmm registers across the helper calls; the modulus
	# and table pointers are read back from there before the multiply.
11944c6a0400SJung-uk Kim	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
11957bded2dbSJung-uk Kim	movq	$nptr,%xmm2		# save $nptr
11964c6a0400SJung-uk Kim	movq	%r10, %xmm3		# -$num, used in sqr8x
11977bded2dbSJung-uk Kim	movq	$bptr,%xmm4
11987bded2dbSJung-uk Kim
	# Five identical rounds: __bn_sqr8x_internal, then
	# __bn_post4x_internal.
11997bded2dbSJung-uk Kim	call	__bn_sqr8x_internal
12004c6a0400SJung-uk Kim	call	__bn_post4x_internal
12017bded2dbSJung-uk Kim	call	__bn_sqr8x_internal
12024c6a0400SJung-uk Kim	call	__bn_post4x_internal
12037bded2dbSJung-uk Kim	call	__bn_sqr8x_internal
12044c6a0400SJung-uk Kim	call	__bn_post4x_internal
12057bded2dbSJung-uk Kim	call	__bn_sqr8x_internal
12064c6a0400SJung-uk Kim	call	__bn_post4x_internal
12077bded2dbSJung-uk Kim	call	__bn_sqr8x_internal
12084c6a0400SJung-uk Kim	call	__bn_post4x_internal
12097bded2dbSJung-uk Kim
	# Set up registers for mul4x_internal: modulus and table pointers
	# come back from xmm, %rax gets the entry stack pointer again, and
	# the n0 register now points at the saved *n0 slot at 32(%rsp)
	# instead of holding the value.
	# NOTE(review): the copy into the result-pointer register relies on
	# what the helpers above leave in the source-pointer register --
	# confirm against __bn_post4x_internal.
12107bded2dbSJung-uk Kim	movq	%xmm2,$nptr
12117bded2dbSJung-uk Kim	movq	%xmm4,$bptr
12127bded2dbSJung-uk Kim	mov	$aptr,$rptr
12137bded2dbSJung-uk Kim	mov	40(%rsp),%rax
12147bded2dbSJung-uk Kim	lea	32(%rsp),$n0
12157bded2dbSJung-uk Kim
12167bded2dbSJung-uk Kim	call	mul4x_internal
12177bded2dbSJung-uk Kim
	# Epilogue: fetch the entry stack pointer, set the return value to
	# 1, reload the six callee-saved registers from just below the
	# entry stack pointer and switch back to the caller's stack.
12187bded2dbSJung-uk Kim	mov	40(%rsp),%rsi		# restore %rsp
1219e71b7053SJung-uk Kim.cfi_def_cfa	%rsi,8
12207bded2dbSJung-uk Kim	mov	\$1,%rax
12217bded2dbSJung-uk Kim	mov	-48(%rsi),%r15
1222e71b7053SJung-uk Kim.cfi_restore	%r15
12237bded2dbSJung-uk Kim	mov	-40(%rsi),%r14
1224e71b7053SJung-uk Kim.cfi_restore	%r14
12257bded2dbSJung-uk Kim	mov	-32(%rsi),%r13
1226e71b7053SJung-uk Kim.cfi_restore	%r13
12277bded2dbSJung-uk Kim	mov	-24(%rsi),%r12
1228e71b7053SJung-uk Kim.cfi_restore	%r12
12297bded2dbSJung-uk Kim	mov	-16(%rsi),%rbp
1230e71b7053SJung-uk Kim.cfi_restore	%rbp
12317bded2dbSJung-uk Kim	mov	-8(%rsi),%rbx
1232e71b7053SJung-uk Kim.cfi_restore	%rbx
12337bded2dbSJung-uk Kim	lea	(%rsi),%rsp
1234e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
12357bded2dbSJung-uk Kim.Lpower5_epilogue:
12367bded2dbSJung-uk Kim	ret
1237e71b7053SJung-uk Kim.cfi_endproc
12387bded2dbSJung-uk Kim.size	bn_power5,.-bn_power5
12397bded2dbSJung-uk Kim
12407bded2dbSJung-uk Kim.globl	bn_sqr8x_internal
12417bded2dbSJung-uk Kim.hidden	bn_sqr8x_internal
12427bded2dbSJung-uk Kim.type	bn_sqr8x_internal,\@abi-omnipotent
12437bded2dbSJung-uk Kim.align	32
12447bded2dbSJung-uk Kimbn_sqr8x_internal:
12457bded2dbSJung-uk Kim__bn_sqr8x_internal:
124617f01e99SJung-uk Kim.cfi_startproc
12477bded2dbSJung-uk Kim	##############################################################
12487bded2dbSJung-uk Kim	# Squaring part:
12497bded2dbSJung-uk Kim	#
12507bded2dbSJung-uk Kim	# a) multiply-n-add everything but a[i]*a[i];
12517bded2dbSJung-uk Kim	# b) shift result of a) by 1 to the left and accumulate
12527bded2dbSJung-uk Kim	#    a[i]*a[i] products;
12537bded2dbSJung-uk Kim	#
12547bded2dbSJung-uk Kim	##############################################################
12557bded2dbSJung-uk Kim	#                                                     a[1]a[0]
12567bded2dbSJung-uk Kim	#                                                 a[2]a[0]
12577bded2dbSJung-uk Kim	#                                             a[3]a[0]
12587bded2dbSJung-uk Kim	#                                             a[2]a[1]
12597bded2dbSJung-uk Kim	#                                         a[4]a[0]
12607bded2dbSJung-uk Kim	#                                         a[3]a[1]
12617bded2dbSJung-uk Kim	#                                     a[5]a[0]
12627bded2dbSJung-uk Kim	#                                     a[4]a[1]
12637bded2dbSJung-uk Kim	#                                     a[3]a[2]
12647bded2dbSJung-uk Kim	#                                 a[6]a[0]
12657bded2dbSJung-uk Kim	#                                 a[5]a[1]
12667bded2dbSJung-uk Kim	#                                 a[4]a[2]
12677bded2dbSJung-uk Kim	#                             a[7]a[0]
12687bded2dbSJung-uk Kim	#                             a[6]a[1]
12697bded2dbSJung-uk Kim	#                             a[5]a[2]
12707bded2dbSJung-uk Kim	#                             a[4]a[3]
12717bded2dbSJung-uk Kim	#                         a[7]a[1]
12727bded2dbSJung-uk Kim	#                         a[6]a[2]
12737bded2dbSJung-uk Kim	#                         a[5]a[3]
12747bded2dbSJung-uk Kim	#                     a[7]a[2]
12757bded2dbSJung-uk Kim	#                     a[6]a[3]
12767bded2dbSJung-uk Kim	#                     a[5]a[4]
12777bded2dbSJung-uk Kim	#                 a[7]a[3]
12787bded2dbSJung-uk Kim	#                 a[6]a[4]
12797bded2dbSJung-uk Kim	#             a[7]a[4]
12807bded2dbSJung-uk Kim	#             a[6]a[5]
12817bded2dbSJung-uk Kim	#         a[7]a[5]
12827bded2dbSJung-uk Kim	#     a[7]a[6]
12837bded2dbSJung-uk Kim	#                                                     a[1]a[0]
12847bded2dbSJung-uk Kim	#                                                 a[2]a[0]
12857bded2dbSJung-uk Kim	#                                             a[3]a[0]
12867bded2dbSJung-uk Kim	#                                         a[4]a[0]
12877bded2dbSJung-uk Kim	#                                     a[5]a[0]
12887bded2dbSJung-uk Kim	#                                 a[6]a[0]
12897bded2dbSJung-uk Kim	#                             a[7]a[0]
12907bded2dbSJung-uk Kim	#                                             a[2]a[1]
12917bded2dbSJung-uk Kim	#                                         a[3]a[1]
12927bded2dbSJung-uk Kim	#                                     a[4]a[1]
12937bded2dbSJung-uk Kim	#                                 a[5]a[1]
12947bded2dbSJung-uk Kim	#                             a[6]a[1]
12957bded2dbSJung-uk Kim	#                         a[7]a[1]
12967bded2dbSJung-uk Kim	#                                     a[3]a[2]
12977bded2dbSJung-uk Kim	#                                 a[4]a[2]
12987bded2dbSJung-uk Kim	#                             a[5]a[2]
12997bded2dbSJung-uk Kim	#                         a[6]a[2]
13007bded2dbSJung-uk Kim	#                     a[7]a[2]
13017bded2dbSJung-uk Kim	#                             a[4]a[3]
13027bded2dbSJung-uk Kim	#                         a[5]a[3]
13037bded2dbSJung-uk Kim	#                     a[6]a[3]
13047bded2dbSJung-uk Kim	#                 a[7]a[3]
13057bded2dbSJung-uk Kim	#                     a[5]a[4]
13067bded2dbSJung-uk Kim	#                 a[6]a[4]
13077bded2dbSJung-uk Kim	#             a[7]a[4]
13087bded2dbSJung-uk Kim	#             a[6]a[5]
13097bded2dbSJung-uk Kim	#         a[7]a[5]
13107bded2dbSJung-uk Kim	#     a[7]a[6]
13117bded2dbSJung-uk Kim	#                                                         a[0]a[0]
13127bded2dbSJung-uk Kim	#                                                 a[1]a[1]
13137bded2dbSJung-uk Kim	#                                         a[2]a[2]
13147bded2dbSJung-uk Kim	#                                 a[3]a[3]
13157bded2dbSJung-uk Kim	#                         a[4]a[4]
13167bded2dbSJung-uk Kim	#                 a[5]a[5]
13177bded2dbSJung-uk Kim	#         a[6]a[6]
13187bded2dbSJung-uk Kim	# a[7]a[7]
13197bded2dbSJung-uk Kim
13207bded2dbSJung-uk Kim	lea	32(%r10),$i		# $i=-($num-32)
13217bded2dbSJung-uk Kim	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
13227bded2dbSJung-uk Kim
13237bded2dbSJung-uk Kim	mov	$num,$j			# $j=$num
13247bded2dbSJung-uk Kim
13257bded2dbSJung-uk Kim					# comments apply to $num==8 case
13267bded2dbSJung-uk Kim	mov	-32($aptr,$i),$a0	# a[0]
13277bded2dbSJung-uk Kim	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
13287bded2dbSJung-uk Kim	mov	-24($aptr,$i),%rax	# a[1]
13297bded2dbSJung-uk Kim	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
13307bded2dbSJung-uk Kim	mov	-16($aptr,$i),$ai	# a[2]
13317bded2dbSJung-uk Kim	mov	%rax,$a1
13327bded2dbSJung-uk Kim
13337bded2dbSJung-uk Kim	mul	$a0			# a[1]*a[0]
13347bded2dbSJung-uk Kim	mov	%rax,$A0[0]		# a[1]*a[0]
13357bded2dbSJung-uk Kim	 mov	$ai,%rax		# a[2]
13367bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
13377bded2dbSJung-uk Kim	mov	$A0[0],-24($tptr,$i)	# t[1]
13387bded2dbSJung-uk Kim
13397bded2dbSJung-uk Kim	mul	$a0			# a[2]*a[0]
13407bded2dbSJung-uk Kim	add	%rax,$A0[1]
13417bded2dbSJung-uk Kim	 mov	$ai,%rax
13427bded2dbSJung-uk Kim	adc	\$0,%rdx
13437bded2dbSJung-uk Kim	mov	$A0[1],-16($tptr,$i)	# t[2]
13447bded2dbSJung-uk Kim	mov	%rdx,$A0[0]
13457bded2dbSJung-uk Kim
13467bded2dbSJung-uk Kim
13477bded2dbSJung-uk Kim	 mov	-8($aptr,$i),$ai	# a[3]
13487bded2dbSJung-uk Kim	mul	$a1			# a[2]*a[1]
13497bded2dbSJung-uk Kim	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
13507bded2dbSJung-uk Kim	 mov	$ai,%rax
13517bded2dbSJung-uk Kim	mov	%rdx,$A1[1]
13527bded2dbSJung-uk Kim
13537bded2dbSJung-uk Kim	 lea	($i),$j
13547bded2dbSJung-uk Kim	mul	$a0			# a[3]*a[0]
13557bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
13567bded2dbSJung-uk Kim	 mov	$ai,%rax
13577bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
13587bded2dbSJung-uk Kim	adc	\$0,$A0[1]
13597bded2dbSJung-uk Kim	add	$A1[0],$A0[0]
13607bded2dbSJung-uk Kim	adc	\$0,$A0[1]
13617bded2dbSJung-uk Kim	mov	$A0[0],-8($tptr,$j)	# t[3]
13627bded2dbSJung-uk Kim	jmp	.Lsqr4x_1st
13637bded2dbSJung-uk Kim
13647bded2dbSJung-uk Kim.align	32
13657bded2dbSJung-uk Kim.Lsqr4x_1st:
13667bded2dbSJung-uk Kim	 mov	($aptr,$j),$ai		# a[4]
13677bded2dbSJung-uk Kim	mul	$a1			# a[3]*a[1]
13687bded2dbSJung-uk Kim	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
13697bded2dbSJung-uk Kim	 mov	$ai,%rax
13707bded2dbSJung-uk Kim	mov	%rdx,$A1[0]
13717bded2dbSJung-uk Kim	adc	\$0,$A1[0]
13727bded2dbSJung-uk Kim
13737bded2dbSJung-uk Kim	mul	$a0			# a[4]*a[0]
13747bded2dbSJung-uk Kim	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
13757bded2dbSJung-uk Kim	 mov	$ai,%rax		# a[3]
13767bded2dbSJung-uk Kim	 mov	8($aptr,$j),$ai		# a[5]
13777bded2dbSJung-uk Kim	mov	%rdx,$A0[0]
13787bded2dbSJung-uk Kim	adc	\$0,$A0[0]
13797bded2dbSJung-uk Kim	add	$A1[1],$A0[1]
13807bded2dbSJung-uk Kim	adc	\$0,$A0[0]
13817bded2dbSJung-uk Kim
13827bded2dbSJung-uk Kim
13837bded2dbSJung-uk Kim	mul	$a1			# a[4]*a[3]
13847bded2dbSJung-uk Kim	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
13857bded2dbSJung-uk Kim	 mov	$ai,%rax
13867bded2dbSJung-uk Kim	 mov	$A0[1],($tptr,$j)	# t[4]
13877bded2dbSJung-uk Kim	mov	%rdx,$A1[1]
13887bded2dbSJung-uk Kim	adc	\$0,$A1[1]
13897bded2dbSJung-uk Kim
13907bded2dbSJung-uk Kim	mul	$a0			# a[5]*a[2]
13917bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
13927bded2dbSJung-uk Kim	 mov	$ai,%rax
13937bded2dbSJung-uk Kim	 mov	16($aptr,$j),$ai	# a[6]
13947bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
13957bded2dbSJung-uk Kim	adc	\$0,$A0[1]
13967bded2dbSJung-uk Kim	add	$A1[0],$A0[0]
13977bded2dbSJung-uk Kim	adc	\$0,$A0[1]
13987bded2dbSJung-uk Kim
13997bded2dbSJung-uk Kim	mul	$a1			# a[5]*a[3]
14007bded2dbSJung-uk Kim	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
14017bded2dbSJung-uk Kim	 mov	$ai,%rax
14027bded2dbSJung-uk Kim	 mov	$A0[0],8($tptr,$j)	# t[5]
14037bded2dbSJung-uk Kim	mov	%rdx,$A1[0]
14047bded2dbSJung-uk Kim	adc	\$0,$A1[0]
14057bded2dbSJung-uk Kim
14067bded2dbSJung-uk Kim	mul	$a0			# a[6]*a[2]
14077bded2dbSJung-uk Kim	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
14087bded2dbSJung-uk Kim	 mov	$ai,%rax		# a[3]
14097bded2dbSJung-uk Kim	 mov	24($aptr,$j),$ai	# a[7]
14107bded2dbSJung-uk Kim	mov	%rdx,$A0[0]
14117bded2dbSJung-uk Kim	adc	\$0,$A0[0]
14127bded2dbSJung-uk Kim	add	$A1[1],$A0[1]
14137bded2dbSJung-uk Kim	adc	\$0,$A0[0]
14147bded2dbSJung-uk Kim
14157bded2dbSJung-uk Kim
14167bded2dbSJung-uk Kim	mul	$a1			# a[6]*a[5]
14177bded2dbSJung-uk Kim	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
14187bded2dbSJung-uk Kim	 mov	$ai,%rax
14197bded2dbSJung-uk Kim	 mov	$A0[1],16($tptr,$j)	# t[6]
14207bded2dbSJung-uk Kim	mov	%rdx,$A1[1]
14217bded2dbSJung-uk Kim	adc	\$0,$A1[1]
14227bded2dbSJung-uk Kim	 lea	32($j),$j
14237bded2dbSJung-uk Kim
14247bded2dbSJung-uk Kim	mul	$a0			# a[7]*a[4]
14257bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
14267bded2dbSJung-uk Kim	 mov	$ai,%rax
14277bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
14287bded2dbSJung-uk Kim	adc	\$0,$A0[1]
14297bded2dbSJung-uk Kim	add	$A1[0],$A0[0]
14307bded2dbSJung-uk Kim	adc	\$0,$A0[1]
14317bded2dbSJung-uk Kim	mov	$A0[0],-8($tptr,$j)	# t[7]
14327bded2dbSJung-uk Kim
14337bded2dbSJung-uk Kim	cmp	\$0,$j
14347bded2dbSJung-uk Kim	jne	.Lsqr4x_1st
14357bded2dbSJung-uk Kim
14367bded2dbSJung-uk Kim	mul	$a1			# a[7]*a[5]
14377bded2dbSJung-uk Kim	add	%rax,$A1[1]
14387bded2dbSJung-uk Kim	lea	16($i),$i
14397bded2dbSJung-uk Kim	adc	\$0,%rdx
14407bded2dbSJung-uk Kim	add	$A0[1],$A1[1]
14417bded2dbSJung-uk Kim	adc	\$0,%rdx
14427bded2dbSJung-uk Kim
14437bded2dbSJung-uk Kim	mov	$A1[1],($tptr)		# t[8]
14447bded2dbSJung-uk Kim	mov	%rdx,$A1[0]
14457bded2dbSJung-uk Kim	mov	%rdx,8($tptr)		# t[9]
14467bded2dbSJung-uk Kim	jmp	.Lsqr4x_outer
14477bded2dbSJung-uk Kim
14487bded2dbSJung-uk Kim.align	32
14497bded2dbSJung-uk Kim.Lsqr4x_outer:				# comments apply to $num==6 case
14507bded2dbSJung-uk Kim	mov	-32($aptr,$i),$a0	# a[0]
14517bded2dbSJung-uk Kim	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
14527bded2dbSJung-uk Kim	mov	-24($aptr,$i),%rax	# a[1]
14537bded2dbSJung-uk Kim	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
14547bded2dbSJung-uk Kim	mov	-16($aptr,$i),$ai	# a[2]
14557bded2dbSJung-uk Kim	mov	%rax,$a1
14567bded2dbSJung-uk Kim
14577bded2dbSJung-uk Kim	mul	$a0			# a[1]*a[0]
14587bded2dbSJung-uk Kim	mov	-24($tptr,$i),$A0[0]	# t[1]
14597bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
14607bded2dbSJung-uk Kim	 mov	$ai,%rax		# a[2]
14617bded2dbSJung-uk Kim	adc	\$0,%rdx
14627bded2dbSJung-uk Kim	mov	$A0[0],-24($tptr,$i)	# t[1]
14637bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
14647bded2dbSJung-uk Kim
14657bded2dbSJung-uk Kim	mul	$a0			# a[2]*a[0]
14667bded2dbSJung-uk Kim	add	%rax,$A0[1]
14677bded2dbSJung-uk Kim	 mov	$ai,%rax
14687bded2dbSJung-uk Kim	adc	\$0,%rdx
14697bded2dbSJung-uk Kim	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
14707bded2dbSJung-uk Kim	mov	%rdx,$A0[0]
14717bded2dbSJung-uk Kim	adc	\$0,$A0[0]
14727bded2dbSJung-uk Kim	mov	$A0[1],-16($tptr,$i)	# t[2]
14737bded2dbSJung-uk Kim
14747bded2dbSJung-uk Kim	xor	$A1[0],$A1[0]
14757bded2dbSJung-uk Kim
14767bded2dbSJung-uk Kim	 mov	-8($aptr,$i),$ai	# a[3]
14777bded2dbSJung-uk Kim	mul	$a1			# a[2]*a[1]
14787bded2dbSJung-uk Kim	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
14797bded2dbSJung-uk Kim	 mov	$ai,%rax
14807bded2dbSJung-uk Kim	adc	\$0,%rdx
14817bded2dbSJung-uk Kim	add	-8($tptr,$i),$A1[0]
14827bded2dbSJung-uk Kim	mov	%rdx,$A1[1]
14837bded2dbSJung-uk Kim	adc	\$0,$A1[1]
14847bded2dbSJung-uk Kim
14857bded2dbSJung-uk Kim	mul	$a0			# a[3]*a[0]
14867bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
14877bded2dbSJung-uk Kim	 mov	$ai,%rax
14887bded2dbSJung-uk Kim	adc	\$0,%rdx
14897bded2dbSJung-uk Kim	add	$A1[0],$A0[0]
14907bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
14917bded2dbSJung-uk Kim	adc	\$0,$A0[1]
14927bded2dbSJung-uk Kim	mov	$A0[0],-8($tptr,$i)	# t[3]
14937bded2dbSJung-uk Kim
14947bded2dbSJung-uk Kim	lea	($i),$j
14957bded2dbSJung-uk Kim	jmp	.Lsqr4x_inner
14967bded2dbSJung-uk Kim
14977bded2dbSJung-uk Kim.align	32
14987bded2dbSJung-uk Kim.Lsqr4x_inner:
14997bded2dbSJung-uk Kim	 mov	($aptr,$j),$ai		# a[4]
15007bded2dbSJung-uk Kim	mul	$a1			# a[3]*a[1]
15017bded2dbSJung-uk Kim	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
15027bded2dbSJung-uk Kim	 mov	$ai,%rax
15037bded2dbSJung-uk Kim	mov	%rdx,$A1[0]
15047bded2dbSJung-uk Kim	adc	\$0,$A1[0]
15057bded2dbSJung-uk Kim	add	($tptr,$j),$A1[1]
15067bded2dbSJung-uk Kim	adc	\$0,$A1[0]
15077bded2dbSJung-uk Kim
15087bded2dbSJung-uk Kim	.byte	0x67
15097bded2dbSJung-uk Kim	mul	$a0			# a[4]*a[0]
15107bded2dbSJung-uk Kim	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
15117bded2dbSJung-uk Kim	 mov	$ai,%rax		# a[3]
15127bded2dbSJung-uk Kim	 mov	8($aptr,$j),$ai		# a[5]
15137bded2dbSJung-uk Kim	mov	%rdx,$A0[0]
15147bded2dbSJung-uk Kim	adc	\$0,$A0[0]
15157bded2dbSJung-uk Kim	add	$A1[1],$A0[1]
15167bded2dbSJung-uk Kim	adc	\$0,$A0[0]
15177bded2dbSJung-uk Kim
15187bded2dbSJung-uk Kim	mul	$a1			# a[4]*a[3]
15197bded2dbSJung-uk Kim	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
15207bded2dbSJung-uk Kim	mov	$A0[1],($tptr,$j)	# t[4]
15217bded2dbSJung-uk Kim	 mov	$ai,%rax
15227bded2dbSJung-uk Kim	mov	%rdx,$A1[1]
15237bded2dbSJung-uk Kim	adc	\$0,$A1[1]
15247bded2dbSJung-uk Kim	add	8($tptr,$j),$A1[0]
15257bded2dbSJung-uk Kim	lea	16($j),$j		# j++
15267bded2dbSJung-uk Kim	adc	\$0,$A1[1]
15277bded2dbSJung-uk Kim
15287bded2dbSJung-uk Kim	mul	$a0			# a[5]*a[2]
15297bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
15307bded2dbSJung-uk Kim	 mov	$ai,%rax
15317bded2dbSJung-uk Kim	adc	\$0,%rdx
15327bded2dbSJung-uk Kim	add	$A1[0],$A0[0]
15337bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
15347bded2dbSJung-uk Kim	adc	\$0,$A0[1]
15357bded2dbSJung-uk Kim	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
15367bded2dbSJung-uk Kim
15377bded2dbSJung-uk Kim	cmp	\$0,$j
15387bded2dbSJung-uk Kim	jne	.Lsqr4x_inner
15397bded2dbSJung-uk Kim
15407bded2dbSJung-uk Kim	.byte	0x67
15417bded2dbSJung-uk Kim	mul	$a1			# a[5]*a[3]
15427bded2dbSJung-uk Kim	add	%rax,$A1[1]
15437bded2dbSJung-uk Kim	adc	\$0,%rdx
15447bded2dbSJung-uk Kim	add	$A0[1],$A1[1]
15457bded2dbSJung-uk Kim	adc	\$0,%rdx
15467bded2dbSJung-uk Kim
15477bded2dbSJung-uk Kim	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
15487bded2dbSJung-uk Kim	mov	%rdx,$A1[0]
15497bded2dbSJung-uk Kim	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
15507bded2dbSJung-uk Kim
15517bded2dbSJung-uk Kim	add	\$16,$i
15527bded2dbSJung-uk Kim	jnz	.Lsqr4x_outer
15537bded2dbSJung-uk Kim
15547bded2dbSJung-uk Kim					# comments apply to $num==4 case
15557bded2dbSJung-uk Kim	mov	-32($aptr),$a0		# a[0]
15567bded2dbSJung-uk Kim	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
15577bded2dbSJung-uk Kim	mov	-24($aptr),%rax		# a[1]
15587bded2dbSJung-uk Kim	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
15597bded2dbSJung-uk Kim	mov	-16($aptr),$ai		# a[2]
15607bded2dbSJung-uk Kim	mov	%rax,$a1
15617bded2dbSJung-uk Kim
15627bded2dbSJung-uk Kim	mul	$a0			# a[1]*a[0]
15637bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
15647bded2dbSJung-uk Kim	 mov	$ai,%rax		# a[2]
15657bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
15667bded2dbSJung-uk Kim	adc	\$0,$A0[1]
15677bded2dbSJung-uk Kim
15687bded2dbSJung-uk Kim	mul	$a0			# a[2]*a[0]
15697bded2dbSJung-uk Kim	add	%rax,$A0[1]
15707bded2dbSJung-uk Kim	 mov	$ai,%rax
15717bded2dbSJung-uk Kim	 mov	$A0[0],-24($tptr)	# t[1]
15727bded2dbSJung-uk Kim	mov	%rdx,$A0[0]
15737bded2dbSJung-uk Kim	adc	\$0,$A0[0]
15747bded2dbSJung-uk Kim	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
15757bded2dbSJung-uk Kim	 mov	-8($aptr),$ai		# a[3]
15767bded2dbSJung-uk Kim	adc	\$0,$A0[0]
15777bded2dbSJung-uk Kim
15787bded2dbSJung-uk Kim	mul	$a1			# a[2]*a[1]
15797bded2dbSJung-uk Kim	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
15807bded2dbSJung-uk Kim	 mov	$ai,%rax
15817bded2dbSJung-uk Kim	 mov	$A0[1],-16($tptr)	# t[2]
15827bded2dbSJung-uk Kim	mov	%rdx,$A1[1]
15837bded2dbSJung-uk Kim	adc	\$0,$A1[1]
15847bded2dbSJung-uk Kim
15857bded2dbSJung-uk Kim	mul	$a0			# a[3]*a[0]
15867bded2dbSJung-uk Kim	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
15877bded2dbSJung-uk Kim	 mov	$ai,%rax
15887bded2dbSJung-uk Kim	mov	%rdx,$A0[1]
15897bded2dbSJung-uk Kim	adc	\$0,$A0[1]
15907bded2dbSJung-uk Kim	add	$A1[0],$A0[0]
15917bded2dbSJung-uk Kim	adc	\$0,$A0[1]
15927bded2dbSJung-uk Kim	mov	$A0[0],-8($tptr)	# t[3]
15937bded2dbSJung-uk Kim
15947bded2dbSJung-uk Kim	mul	$a1			# a[3]*a[1]
15957bded2dbSJung-uk Kim	add	%rax,$A1[1]
15967bded2dbSJung-uk Kim	 mov	-16($aptr),%rax		# a[2]
15977bded2dbSJung-uk Kim	adc	\$0,%rdx
15987bded2dbSJung-uk Kim	add	$A0[1],$A1[1]
15997bded2dbSJung-uk Kim	adc	\$0,%rdx
16007bded2dbSJung-uk Kim
16017bded2dbSJung-uk Kim	mov	$A1[1],($tptr)		# t[4]
16027bded2dbSJung-uk Kim	mov	%rdx,$A1[0]
16037bded2dbSJung-uk Kim	mov	%rdx,8($tptr)		# t[5]
16047bded2dbSJung-uk Kim
16057bded2dbSJung-uk Kim	mul	$ai			# a[2]*a[3]
16067bded2dbSJung-uk Kim___
16077bded2dbSJung-uk Kim{
# Shift-and-add phase of the squaring.  The code emitted before this
# block accumulated the off-diagonal products a[i]*a[j] into t[]; here
# every t[] word is doubled (shifted left by one bit across word
# boundaries) and the diagonal squares a[i]*a[i] are added in.
#
# $shift carries the bit shifted out of the previous t[] word and $carry
# holds the saved carry flag as 0/-1; they reuse the registers of $a0
# and $a1.  @S names the four scratch registers that receive the
# shifted words.
16087bded2dbSJung-uk Kimmy ($shift,$carry)=($a0,$a1);
16097bded2dbSJung-uk Kimmy @S=(@A1,$ai,$n0);
16107bded2dbSJung-uk Kim$code.=<<___;
	# rax:rdx still hold the product of the mul issued just before this
	# heredoc; fold it into the top of t[] and zero the top-most word.
16117bded2dbSJung-uk Kim	 add	\$16,$i
16127bded2dbSJung-uk Kim	 xor	$shift,$shift
16137bded2dbSJung-uk Kim	 sub	$num,$i			# $i=16-$num
16147bded2dbSJung-uk Kim	 xor	$carry,$carry
16157bded2dbSJung-uk Kim
16167bded2dbSJung-uk Kim	add	$A1[0],%rax		# t[5]
16177bded2dbSJung-uk Kim	adc	\$0,%rdx
16187bded2dbSJung-uk Kim	mov	%rax,8($tptr)		# t[5]
16197bded2dbSJung-uk Kim	mov	%rdx,16($tptr)		# t[6]
16207bded2dbSJung-uk Kim	mov	$carry,24($tptr)	# t[7]
16217bded2dbSJung-uk Kim
	# Restart from the bottom of t[].  t[0] is taken as zero rather than
	# loaded from memory.
16227bded2dbSJung-uk Kim	 mov	-16($aptr,$i),%rax	# a[0]
16237bded2dbSJung-uk Kim	lea	48+8(%rsp),$tptr
16247bded2dbSJung-uk Kim	 xor	$A0[0],$A0[0]		# t[0]
16257bded2dbSJung-uk Kim	 mov	8($tptr),$A0[1]		# t[1]
16267bded2dbSJung-uk Kim
	# One step, repeated throughout this block: double a pair of t[]
	# words with lea, propagate the shifted-out bits, square one a[]
	# word with mul, then add the 128-bit square.  "neg carry" turns the
	# saved 0/-1 back into CF and "sbb carry,carry" saves CF again, so
	# the carry chain survives the flag-clobbering shr/or/mul between
	# steps.
	# NOTE(review): the odd-word lea uses $j as its base register;
	# presumably $j is zero here (the product loops above exit once it
	# reaches 0) - confirm.
16277bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
16287bded2dbSJung-uk Kim	shr	\$63,$A0[0]
16297bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
16307bded2dbSJung-uk Kim	shr	\$63,$A0[1]
16317bded2dbSJung-uk Kim	or	$A0[0],$S[1]		# | t[2*i]>>63
16327bded2dbSJung-uk Kim	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
16337bded2dbSJung-uk Kim	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
16347bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
16357bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
16367bded2dbSJung-uk Kim	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
16377bded2dbSJung-uk Kim	adc	%rax,$S[0]
16387bded2dbSJung-uk Kim	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
16397bded2dbSJung-uk Kim	mov	$S[0],($tptr)
16407bded2dbSJung-uk Kim	adc	%rdx,$S[1]
16417bded2dbSJung-uk Kim
16427bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
16437bded2dbSJung-uk Kim	 mov	$S[1],8($tptr)
16447bded2dbSJung-uk Kim	 sbb	$carry,$carry		# mov cf,$carry
16457bded2dbSJung-uk Kim	shr	\$63,$A0[0]
16467bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
16477bded2dbSJung-uk Kim	shr	\$63,$A0[1]
16487bded2dbSJung-uk Kim	or	$A0[0],$S[3]		# | t[2*i]>>63
16497bded2dbSJung-uk Kim	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
16507bded2dbSJung-uk Kim	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
16517bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
16527bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
16537bded2dbSJung-uk Kim	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
16547bded2dbSJung-uk Kim	adc	%rax,$S[2]
16557bded2dbSJung-uk Kim	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
16567bded2dbSJung-uk Kim	mov	$S[2],16($tptr)
16577bded2dbSJung-uk Kim	adc	%rdx,$S[3]
16587bded2dbSJung-uk Kim	lea	16($i),$i
16597bded2dbSJung-uk Kim	mov	$S[3],24($tptr)
16607bded2dbSJung-uk Kim	sbb	$carry,$carry		# mov cf,$carry
16617bded2dbSJung-uk Kim	lea	64($tptr),$tptr
16627bded2dbSJung-uk Kim	jmp	.Lsqr4x_shift_n_add
16637bded2dbSJung-uk Kim
	# Main loop: four a[] words (eight t[] words) per iteration.  The
	# t[] pointer is pre-advanced by 64 bytes, hence the negative store
	# offsets in the first half.  The loop ends when the negative index
	# $i, stepped by 32 bytes, reaches zero.
16647bded2dbSJung-uk Kim.align	32
16657bded2dbSJung-uk Kim.Lsqr4x_shift_n_add:
16667bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
16677bded2dbSJung-uk Kim	shr	\$63,$A0[0]
16687bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
16697bded2dbSJung-uk Kim	shr	\$63,$A0[1]
16707bded2dbSJung-uk Kim	or	$A0[0],$S[1]		# | t[2*i]>>63
16717bded2dbSJung-uk Kim	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
16727bded2dbSJung-uk Kim	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
16737bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
16747bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
16757bded2dbSJung-uk Kim	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
16767bded2dbSJung-uk Kim	adc	%rax,$S[0]
16777bded2dbSJung-uk Kim	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
16787bded2dbSJung-uk Kim	mov	$S[0],-32($tptr)
16797bded2dbSJung-uk Kim	adc	%rdx,$S[1]
16807bded2dbSJung-uk Kim
16817bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
16827bded2dbSJung-uk Kim	 mov	$S[1],-24($tptr)
16837bded2dbSJung-uk Kim	 sbb	$carry,$carry		# mov cf,$carry
16847bded2dbSJung-uk Kim	shr	\$63,$A0[0]
16857bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
16867bded2dbSJung-uk Kim	shr	\$63,$A0[1]
16877bded2dbSJung-uk Kim	or	$A0[0],$S[3]		# | t[2*i]>>63
16887bded2dbSJung-uk Kim	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
16897bded2dbSJung-uk Kim	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
16907bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
16917bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
16927bded2dbSJung-uk Kim	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
16937bded2dbSJung-uk Kim	adc	%rax,$S[2]
16947bded2dbSJung-uk Kim	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
16957bded2dbSJung-uk Kim	mov	$S[2],-16($tptr)
16967bded2dbSJung-uk Kim	adc	%rdx,$S[3]
16977bded2dbSJung-uk Kim
16987bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
16997bded2dbSJung-uk Kim	 mov	$S[3],-8($tptr)
17007bded2dbSJung-uk Kim	 sbb	$carry,$carry		# mov cf,$carry
17017bded2dbSJung-uk Kim	shr	\$63,$A0[0]
17027bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
17037bded2dbSJung-uk Kim	shr	\$63,$A0[1]
17047bded2dbSJung-uk Kim	or	$A0[0],$S[1]		# | t[2*i]>>63
17057bded2dbSJung-uk Kim	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
17067bded2dbSJung-uk Kim	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
17077bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
17087bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
17097bded2dbSJung-uk Kim	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
17107bded2dbSJung-uk Kim	adc	%rax,$S[0]
17117bded2dbSJung-uk Kim	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
17127bded2dbSJung-uk Kim	mov	$S[0],0($tptr)
17137bded2dbSJung-uk Kim	adc	%rdx,$S[1]
17147bded2dbSJung-uk Kim
17157bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
17167bded2dbSJung-uk Kim	 mov	$S[1],8($tptr)
17177bded2dbSJung-uk Kim	 sbb	$carry,$carry		# mov cf,$carry
17187bded2dbSJung-uk Kim	shr	\$63,$A0[0]
17197bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
17207bded2dbSJung-uk Kim	shr	\$63,$A0[1]
17217bded2dbSJung-uk Kim	or	$A0[0],$S[3]		# | t[2*i]>>63
17227bded2dbSJung-uk Kim	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
17237bded2dbSJung-uk Kim	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
17247bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
17257bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
17267bded2dbSJung-uk Kim	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
17277bded2dbSJung-uk Kim	adc	%rax,$S[2]
17287bded2dbSJung-uk Kim	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
17297bded2dbSJung-uk Kim	mov	$S[2],16($tptr)
17307bded2dbSJung-uk Kim	adc	%rdx,$S[3]
17317bded2dbSJung-uk Kim	mov	$S[3],24($tptr)
17327bded2dbSJung-uk Kim	sbb	$carry,$carry		# mov cf,$carry
17337bded2dbSJung-uk Kim	lea	64($tptr),$tptr
17347bded2dbSJung-uk Kim	add	\$32,$i
17357bded2dbSJung-uk Kim	jnz	.Lsqr4x_shift_n_add
17367bded2dbSJung-uk Kim
	# Tail: the last two a[] words.  Nothing follows, so the final step
	# neither prefetches further t[] words nor saves the carry.
	# NOTE(review): the .byte 0x67 below emits a lone prefix byte ahead
	# of the next instruction; presumably code-size padding - confirm.
17377bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
17387bded2dbSJung-uk Kim	.byte	0x67
17397bded2dbSJung-uk Kim	shr	\$63,$A0[0]
17407bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
17417bded2dbSJung-uk Kim	shr	\$63,$A0[1]
17427bded2dbSJung-uk Kim	or	$A0[0],$S[1]		# | t[2*i]>>63
17437bded2dbSJung-uk Kim	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
17447bded2dbSJung-uk Kim	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
17457bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
17467bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
17477bded2dbSJung-uk Kim	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
17487bded2dbSJung-uk Kim	adc	%rax,$S[0]
17497bded2dbSJung-uk Kim	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
17507bded2dbSJung-uk Kim	mov	$S[0],-32($tptr)
17517bded2dbSJung-uk Kim	adc	%rdx,$S[1]
17527bded2dbSJung-uk Kim
17537bded2dbSJung-uk Kim	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
17547bded2dbSJung-uk Kim	 mov	$S[1],-24($tptr)
17557bded2dbSJung-uk Kim	 sbb	$carry,$carry		# mov cf,$carry
17567bded2dbSJung-uk Kim	shr	\$63,$A0[0]
17577bded2dbSJung-uk Kim	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
17587bded2dbSJung-uk Kim	shr	\$63,$A0[1]
17597bded2dbSJung-uk Kim	or	$A0[0],$S[3]		# | t[2*i]>>63
17607bded2dbSJung-uk Kim	mul	%rax			# a[i]*a[i]
17617bded2dbSJung-uk Kim	neg	$carry			# mov $carry,cf
17627bded2dbSJung-uk Kim	adc	%rax,$S[2]
17637bded2dbSJung-uk Kim	adc	%rdx,$S[3]
17647bded2dbSJung-uk Kim	mov	$S[2],-16($tptr)
17657bded2dbSJung-uk Kim	mov	$S[3],-8($tptr)
17667bded2dbSJung-uk Kim___
17677bded2dbSJung-uk Kim}
17687bded2dbSJung-uk Kim######################################################################
17697bded2dbSJung-uk Kim# Montgomery reduction part, "word-by-word" algorithm.
17707bded2dbSJung-uk Kim#
17717bded2dbSJung-uk Kim# This new path is inspired by multiple submissions from Intel, by
17727bded2dbSJung-uk Kim# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
17737bded2dbSJung-uk Kim# Vinodh Gopal...
17747bded2dbSJung-uk Kim{
# Emits __bn_sqr8x_reduction: reduces the double-width value in the
# stack-resident t[] buffer by the modulus n[], eight 64-bit words at a
# time.
#
# Registers: $nptr (%rbp) walks n[], $tptr (%rdi) walks t[], $m0 (%rbx)
# is the current per-word multiplier, and $carry (%rsi) doubles as a
# temporary for the next multiplier and as the saved carry (0/-1)
# between passes.  %r8-%r15 hold the current eight-word window.
#
# Stack slots used below (offsets as written in the code):
#   0+8(%rsp)   end of n[]                   (stored here)
#   8+8(%rsp)   end of t[]                   (stored here)
#   32+8(%rsp)  n0                           (read here, stored elsewhere)
#   48+8(%rsp)  start of t[]; its first eight words are reused to park
#               the eight multipliers computed by .L8x_reduce
#
# On return from the emitted routine %rax holds the top-most carry and
# $tptr points at the end of t[].
17757bded2dbSJung-uk Kimmy ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
17767bded2dbSJung-uk Kim
17777bded2dbSJung-uk Kim$code.=<<___;
17787bded2dbSJung-uk Kim	movq	%xmm2,$nptr
17794c6a0400SJung-uk Kim__bn_sqr8x_reduction:
17807bded2dbSJung-uk Kim	xor	%rax,%rax
17814c6a0400SJung-uk Kim	lea	($nptr,$num),%rcx	# end of n[]
17827bded2dbSJung-uk Kim	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
17837bded2dbSJung-uk Kim	mov	%rcx,0+8(%rsp)
17847bded2dbSJung-uk Kim	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
17857bded2dbSJung-uk Kim	mov	%rdx,8+8(%rsp)
17867bded2dbSJung-uk Kim	neg	$num
17877bded2dbSJung-uk Kim	jmp	.L8x_reduction_loop
17887bded2dbSJung-uk Kim
	# Outer loop: one pass per eight-word window of t[].  rax carries
	# the top-most carry of the previous pass and is parked in the word
	# at the end of t[] (rdx) until .L8x_tail_done picks it up.
	# NOTE(review): the .byte 0x66/0x67 lines emit lone prefix bytes
	# ahead of the next instruction; presumably code-size padding -
	# confirm.
17897bded2dbSJung-uk Kim.align	32
17907bded2dbSJung-uk Kim.L8x_reduction_loop:
17917bded2dbSJung-uk Kim	lea	($tptr,$num),$tptr	# start of current t[] window
17927bded2dbSJung-uk Kim	.byte	0x66
17937bded2dbSJung-uk Kim	mov	8*0($tptr),$m0
17947bded2dbSJung-uk Kim	mov	8*1($tptr),%r9
17957bded2dbSJung-uk Kim	mov	8*2($tptr),%r10
17967bded2dbSJung-uk Kim	mov	8*3($tptr),%r11
17977bded2dbSJung-uk Kim	mov	8*4($tptr),%r12
17987bded2dbSJung-uk Kim	mov	8*5($tptr),%r13
17997bded2dbSJung-uk Kim	mov	8*6($tptr),%r14
18007bded2dbSJung-uk Kim	mov	8*7($tptr),%r15
18017bded2dbSJung-uk Kim	mov	%rax,(%rdx)		# store top-most carry bit
18027bded2dbSJung-uk Kim	lea	8*8($tptr),$tptr
18037bded2dbSJung-uk Kim
18047bded2dbSJung-uk Kim	.byte	0x67
18057bded2dbSJung-uk Kim	mov	$m0,%r8
18067bded2dbSJung-uk Kim	imulq	32+8(%rsp),$m0		# n0*a[0]
18074c6a0400SJung-uk Kim	mov	8*0($nptr),%rax		# n[0]
18087bded2dbSJung-uk Kim	mov	\$8,%ecx
18097bded2dbSJung-uk Kim	jmp	.L8x_reduce
18107bded2dbSJung-uk Kim
	# Eight iterations (ecx counts 8 down to 1).  Each one multiplies
	# n[0..7] by the current multiplier, adds the products to r8-r15 and
	# moves the window down one word, so the new r8 is the old r9 plus
	# carries.  The multiplier is parked on the stack for the tail loop
	# and the next one (n0 times the new r8) is computed part-way
	# through the iteration.
	# NOTE(review): the low word of the first product is never added;
	# "neg r8" sets CF when r8 is nonzero and that CF is folded into the
	# high word.  Presumably this stands in for the carry out of a low
	# word that cancels to zero by construction of the multiplier -
	# confirm.
18117bded2dbSJung-uk Kim.align	32
18127bded2dbSJung-uk Kim.L8x_reduce:
18137bded2dbSJung-uk Kim	mulq	$m0
18144c6a0400SJung-uk Kim	 mov	8*1($nptr),%rax		# n[1]
18157bded2dbSJung-uk Kim	neg	%r8
18167bded2dbSJung-uk Kim	mov	%rdx,%r8
18177bded2dbSJung-uk Kim	adc	\$0,%r8
18187bded2dbSJung-uk Kim
18197bded2dbSJung-uk Kim	mulq	$m0
18207bded2dbSJung-uk Kim	add	%rax,%r9
18214c6a0400SJung-uk Kim	 mov	8*2($nptr),%rax
18227bded2dbSJung-uk Kim	adc	\$0,%rdx
18237bded2dbSJung-uk Kim	add	%r9,%r8
18247bded2dbSJung-uk Kim	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
18257bded2dbSJung-uk Kim	mov	%rdx,%r9
18267bded2dbSJung-uk Kim	adc	\$0,%r9
18277bded2dbSJung-uk Kim
18287bded2dbSJung-uk Kim	mulq	$m0
18297bded2dbSJung-uk Kim	add	%rax,%r10
18304c6a0400SJung-uk Kim	 mov	8*3($nptr),%rax
18317bded2dbSJung-uk Kim	adc	\$0,%rdx
18327bded2dbSJung-uk Kim	add	%r10,%r9
18337bded2dbSJung-uk Kim	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
18347bded2dbSJung-uk Kim	mov	%rdx,%r10
18357bded2dbSJung-uk Kim	adc	\$0,%r10
18367bded2dbSJung-uk Kim
18377bded2dbSJung-uk Kim	mulq	$m0
18387bded2dbSJung-uk Kim	add	%rax,%r11
18394c6a0400SJung-uk Kim	 mov	8*4($nptr),%rax
18407bded2dbSJung-uk Kim	adc	\$0,%rdx
18417bded2dbSJung-uk Kim	 imulq	%r8,$carry		# modulo-scheduled
18427bded2dbSJung-uk Kim	add	%r11,%r10
18437bded2dbSJung-uk Kim	mov	%rdx,%r11
18447bded2dbSJung-uk Kim	adc	\$0,%r11
18457bded2dbSJung-uk Kim
18467bded2dbSJung-uk Kim	mulq	$m0
18477bded2dbSJung-uk Kim	add	%rax,%r12
18484c6a0400SJung-uk Kim	 mov	8*5($nptr),%rax
18497bded2dbSJung-uk Kim	adc	\$0,%rdx
18507bded2dbSJung-uk Kim	add	%r12,%r11
18517bded2dbSJung-uk Kim	mov	%rdx,%r12
18527bded2dbSJung-uk Kim	adc	\$0,%r12
18537bded2dbSJung-uk Kim
18547bded2dbSJung-uk Kim	mulq	$m0
18557bded2dbSJung-uk Kim	add	%rax,%r13
18564c6a0400SJung-uk Kim	 mov	8*6($nptr),%rax
18577bded2dbSJung-uk Kim	adc	\$0,%rdx
18587bded2dbSJung-uk Kim	add	%r13,%r12
18597bded2dbSJung-uk Kim	mov	%rdx,%r13
18607bded2dbSJung-uk Kim	adc	\$0,%r13
18617bded2dbSJung-uk Kim
18627bded2dbSJung-uk Kim	mulq	$m0
18637bded2dbSJung-uk Kim	add	%rax,%r14
18644c6a0400SJung-uk Kim	 mov	8*7($nptr),%rax
18657bded2dbSJung-uk Kim	adc	\$0,%rdx
18667bded2dbSJung-uk Kim	add	%r14,%r13
18677bded2dbSJung-uk Kim	mov	%rdx,%r14
18687bded2dbSJung-uk Kim	adc	\$0,%r14
18697bded2dbSJung-uk Kim
18707bded2dbSJung-uk Kim	mulq	$m0
18717bded2dbSJung-uk Kim	 mov	$carry,$m0		# n0*a[i]
18727bded2dbSJung-uk Kim	add	%rax,%r15
18734c6a0400SJung-uk Kim	 mov	8*0($nptr),%rax		# n[0]
18747bded2dbSJung-uk Kim	adc	\$0,%rdx
18757bded2dbSJung-uk Kim	add	%r15,%r14
18767bded2dbSJung-uk Kim	mov	%rdx,%r15
18777bded2dbSJung-uk Kim	adc	\$0,%r15
18787bded2dbSJung-uk Kim
18797bded2dbSJung-uk Kim	dec	%ecx
18807bded2dbSJung-uk Kim	jnz	.L8x_reduce
18817bded2dbSJung-uk Kim
	# Advance to the next eight words of n[].  If n[] is exhausted
	# (modulus of exactly eight words) skip the tail loop; the xor
	# leaves rax and CF clear for .L8x_no_tail, since lea, mov and jae
	# do not modify them afterwards.
18824c6a0400SJung-uk Kim	lea	8*8($nptr),$nptr
18837bded2dbSJung-uk Kim	xor	%rax,%rax
18847bded2dbSJung-uk Kim	mov	8+8(%rsp),%rdx		# pull end of t[]
18857bded2dbSJung-uk Kim	cmp	0+8(%rsp),$nptr		# end of n[]?
18867bded2dbSJung-uk Kim	jae	.L8x_no_tail
18877bded2dbSJung-uk Kim
	# Fold the next eight t[] words into the window and remember the
	# carry out as 0/-1.
18887bded2dbSJung-uk Kim	.byte	0x66
18897bded2dbSJung-uk Kim	add	8*0($tptr),%r8
18907bded2dbSJung-uk Kim	adc	8*1($tptr),%r9
18917bded2dbSJung-uk Kim	adc	8*2($tptr),%r10
18927bded2dbSJung-uk Kim	adc	8*3($tptr),%r11
18937bded2dbSJung-uk Kim	adc	8*4($tptr),%r12
18947bded2dbSJung-uk Kim	adc	8*5($tptr),%r13
18957bded2dbSJung-uk Kim	adc	8*6($tptr),%r14
18967bded2dbSJung-uk Kim	adc	8*7($tptr),%r15
18977bded2dbSJung-uk Kim	sbb	$carry,$carry		# top carry
18987bded2dbSJung-uk Kim
18997bded2dbSJung-uk Kim	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
19007bded2dbSJung-uk Kim	mov	\$8,%ecx
19014c6a0400SJung-uk Kim	mov	8*0($nptr),%rax
19027bded2dbSJung-uk Kim	jmp	.L8x_tail
19037bded2dbSJung-uk Kim
	# Tail loop: replays the eight parked multipliers against the
	# current eight words of n[].  Each iteration stores one finished
	# word through the t[] pointer and advances it by one word.
19047bded2dbSJung-uk Kim.align	32
19057bded2dbSJung-uk Kim.L8x_tail:
19067bded2dbSJung-uk Kim	mulq	$m0
19077bded2dbSJung-uk Kim	add	%rax,%r8
19084c6a0400SJung-uk Kim	 mov	8*1($nptr),%rax
19097bded2dbSJung-uk Kim	 mov	%r8,($tptr)		# save result
19107bded2dbSJung-uk Kim	mov	%rdx,%r8
19117bded2dbSJung-uk Kim	adc	\$0,%r8
19127bded2dbSJung-uk Kim
19137bded2dbSJung-uk Kim	mulq	$m0
19147bded2dbSJung-uk Kim	add	%rax,%r9
19154c6a0400SJung-uk Kim	 mov	8*2($nptr),%rax
19167bded2dbSJung-uk Kim	adc	\$0,%rdx
19177bded2dbSJung-uk Kim	add	%r9,%r8
19187bded2dbSJung-uk Kim	 lea	8($tptr),$tptr		# $tptr++
19197bded2dbSJung-uk Kim	mov	%rdx,%r9
19207bded2dbSJung-uk Kim	adc	\$0,%r9
19217bded2dbSJung-uk Kim
19227bded2dbSJung-uk Kim	mulq	$m0
19237bded2dbSJung-uk Kim	add	%rax,%r10
19244c6a0400SJung-uk Kim	 mov	8*3($nptr),%rax
19257bded2dbSJung-uk Kim	adc	\$0,%rdx
19267bded2dbSJung-uk Kim	add	%r10,%r9
19277bded2dbSJung-uk Kim	mov	%rdx,%r10
19287bded2dbSJung-uk Kim	adc	\$0,%r10
19297bded2dbSJung-uk Kim
19307bded2dbSJung-uk Kim	mulq	$m0
19317bded2dbSJung-uk Kim	add	%rax,%r11
19324c6a0400SJung-uk Kim	 mov	8*4($nptr),%rax
19337bded2dbSJung-uk Kim	adc	\$0,%rdx
19347bded2dbSJung-uk Kim	add	%r11,%r10
19357bded2dbSJung-uk Kim	mov	%rdx,%r11
19367bded2dbSJung-uk Kim	adc	\$0,%r11
19377bded2dbSJung-uk Kim
19387bded2dbSJung-uk Kim	mulq	$m0
19397bded2dbSJung-uk Kim	add	%rax,%r12
19404c6a0400SJung-uk Kim	 mov	8*5($nptr),%rax
19417bded2dbSJung-uk Kim	adc	\$0,%rdx
19427bded2dbSJung-uk Kim	add	%r12,%r11
19437bded2dbSJung-uk Kim	mov	%rdx,%r12
19447bded2dbSJung-uk Kim	adc	\$0,%r12
19457bded2dbSJung-uk Kim
19467bded2dbSJung-uk Kim	mulq	$m0
19477bded2dbSJung-uk Kim	add	%rax,%r13
19484c6a0400SJung-uk Kim	 mov	8*6($nptr),%rax
19497bded2dbSJung-uk Kim	adc	\$0,%rdx
19507bded2dbSJung-uk Kim	add	%r13,%r12
19517bded2dbSJung-uk Kim	mov	%rdx,%r13
19527bded2dbSJung-uk Kim	adc	\$0,%r13
19537bded2dbSJung-uk Kim
19547bded2dbSJung-uk Kim	mulq	$m0
19557bded2dbSJung-uk Kim	add	%rax,%r14
19564c6a0400SJung-uk Kim	 mov	8*7($nptr),%rax
19577bded2dbSJung-uk Kim	adc	\$0,%rdx
19587bded2dbSJung-uk Kim	add	%r14,%r13
19597bded2dbSJung-uk Kim	mov	%rdx,%r14
19607bded2dbSJung-uk Kim	adc	\$0,%r14
19617bded2dbSJung-uk Kim
19627bded2dbSJung-uk Kim	mulq	$m0
19637bded2dbSJung-uk Kim	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
19647bded2dbSJung-uk Kim	add	%rax,%r15
19657bded2dbSJung-uk Kim	adc	\$0,%rdx
19667bded2dbSJung-uk Kim	add	%r15,%r14
19674c6a0400SJung-uk Kim	 mov	8*0($nptr),%rax		# pull n[0]
19687bded2dbSJung-uk Kim	mov	%rdx,%r15
19697bded2dbSJung-uk Kim	adc	\$0,%r15
19707bded2dbSJung-uk Kim
19717bded2dbSJung-uk Kim	dec	%ecx
19727bded2dbSJung-uk Kim	jnz	.L8x_tail
19737bded2dbSJung-uk Kim
19744c6a0400SJung-uk Kim	lea	8*8($nptr),$nptr
19757bded2dbSJung-uk Kim	mov	8+8(%rsp),%rdx		# pull end of t[]
19767bded2dbSJung-uk Kim	cmp	0+8(%rsp),$nptr		# end of n[]?
19777bded2dbSJung-uk Kim	jae	.L8x_tail_done		# break out of loop
19787bded2dbSJung-uk Kim
	# More n[] words remain: restore the saved carry into CF, fold in
	# the next eight t[] words and run the tail loop again from the
	# first parked multiplier.
19797bded2dbSJung-uk Kim	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
19807bded2dbSJung-uk Kim	neg	$carry
19817bded2dbSJung-uk Kim	 mov	8*0($nptr),%rax		# pull n[0]
19827bded2dbSJung-uk Kim	adc	8*0($tptr),%r8
19837bded2dbSJung-uk Kim	adc	8*1($tptr),%r9
19847bded2dbSJung-uk Kim	adc	8*2($tptr),%r10
19857bded2dbSJung-uk Kim	adc	8*3($tptr),%r11
19867bded2dbSJung-uk Kim	adc	8*4($tptr),%r12
19877bded2dbSJung-uk Kim	adc	8*5($tptr),%r13
19887bded2dbSJung-uk Kim	adc	8*6($tptr),%r14
19897bded2dbSJung-uk Kim	adc	8*7($tptr),%r15
19907bded2dbSJung-uk Kim	sbb	$carry,$carry		# top carry
19917bded2dbSJung-uk Kim
19927bded2dbSJung-uk Kim	mov	\$8,%ecx
19937bded2dbSJung-uk Kim	jmp	.L8x_tail
19947bded2dbSJung-uk Kim
	# n[] exhausted: add the carry word parked at the end of t[] by
	# .L8x_reduction_loop, rippling through all eight window words and
	# into rax, then restore the saved carry into CF for the final adds.
19957bded2dbSJung-uk Kim.align	32
19967bded2dbSJung-uk Kim.L8x_tail_done:
19976cf8931aSJung-uk Kim	xor	%rax,%rax
19987bded2dbSJung-uk Kim	add	(%rdx),%r8		# can this overflow?
199980815a77SJung-uk Kim	adc	\$0,%r9
200080815a77SJung-uk Kim	adc	\$0,%r10
200180815a77SJung-uk Kim	adc	\$0,%r11
200280815a77SJung-uk Kim	adc	\$0,%r12
200380815a77SJung-uk Kim	adc	\$0,%r13
200480815a77SJung-uk Kim	adc	\$0,%r14
20056cf8931aSJung-uk Kim	adc	\$0,%r15
20066cf8931aSJung-uk Kim	adc	\$0,%rax
20077bded2dbSJung-uk Kim
20087bded2dbSJung-uk Kim	neg	$carry
	# Add the eight t[] words above the window, collect the top-most
	# carry in rax and store the window back.  Loop while the t[]
	# pointer is still below the end of t[] held in rdx.
20097bded2dbSJung-uk Kim.L8x_no_tail:
20107bded2dbSJung-uk Kim	adc	8*0($tptr),%r8
20117bded2dbSJung-uk Kim	adc	8*1($tptr),%r9
20127bded2dbSJung-uk Kim	adc	8*2($tptr),%r10
20137bded2dbSJung-uk Kim	adc	8*3($tptr),%r11
20147bded2dbSJung-uk Kim	adc	8*4($tptr),%r12
20157bded2dbSJung-uk Kim	adc	8*5($tptr),%r13
20167bded2dbSJung-uk Kim	adc	8*6($tptr),%r14
20177bded2dbSJung-uk Kim	adc	8*7($tptr),%r15
20187bded2dbSJung-uk Kim	adc	\$0,%rax		# top-most carry
20194c6a0400SJung-uk Kim	 mov	-8($nptr),%rcx		# np[num-1]
20207bded2dbSJung-uk Kim	 xor	$carry,$carry
20217bded2dbSJung-uk Kim
20227bded2dbSJung-uk Kim	movq	%xmm2,$nptr		# restore $nptr
20237bded2dbSJung-uk Kim
20247bded2dbSJung-uk Kim	mov	%r8,8*0($tptr)		# store top 512 bits
20257bded2dbSJung-uk Kim	mov	%r9,8*1($tptr)
20267bded2dbSJung-uk Kim	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
20277bded2dbSJung-uk Kim	mov	%r10,8*2($tptr)
20287bded2dbSJung-uk Kim	mov	%r11,8*3($tptr)
20297bded2dbSJung-uk Kim	mov	%r12,8*4($tptr)
20307bded2dbSJung-uk Kim	mov	%r13,8*5($tptr)
20317bded2dbSJung-uk Kim	mov	%r14,8*6($tptr)
20327bded2dbSJung-uk Kim	mov	%r15,8*7($tptr)
20337bded2dbSJung-uk Kim	lea	8*8($tptr),$tptr
20347bded2dbSJung-uk Kim
20357bded2dbSJung-uk Kim	cmp	%rdx,$tptr		# end of t[]?
20367bded2dbSJung-uk Kim	jb	.L8x_reduction_loop
20374c6a0400SJung-uk Kim	ret
203817f01e99SJung-uk Kim.cfi_endproc
20394c6a0400SJung-uk Kim.size	bn_sqr8x_internal,.-bn_sqr8x_internal
20407bded2dbSJung-uk Kim___
20417bded2dbSJung-uk Kim}
20427bded2dbSJung-uk Kim##############################################################
20437bded2dbSJung-uk Kim# Post-condition, 4x unrolled
20447bded2dbSJung-uk Kim#
20457bded2dbSJung-uk Kim{
# Emits __bn_post4x_internal: writes t[] + ((~n[] + 1) & mask) to the
# result buffer, four 64-bit words per iteration.  The mask is -%rax:
# with %rax == 1 on entry it is all ones and the routine stores t - n;
# with %rax == 0 it stores t unchanged.  Both cases execute the same
# instruction sequence.
#
# Inputs as read by the code: %rax (selector), %rdi (end of the t[]
# region to copy from), $nptr (%rbp, start of n[]), $num (byte count),
# %xmm1 (result pointer).
# NOTE(review): $num is expected to be negative on entry - the loop
# counter derived from it is incremented up to zero - and %rax is
# presumably the 0/1 top-most carry left by the reduction; confirm
# against the callers.
#
# On exit $rptr points past the stored result, $aptr holds the original
# result pointer, %r10 holds the entry value of $num and $num is
# negated.
20467bded2dbSJung-uk Kimmy ($tptr,$nptr)=("%rbx","%rbp");
20477bded2dbSJung-uk Kim$code.=<<___;
20484c6a0400SJung-uk Kim.type	__bn_post4x_internal,\@abi-omnipotent
20497bded2dbSJung-uk Kim.align	32
20504c6a0400SJung-uk Kim__bn_post4x_internal:
205117f01e99SJung-uk Kim.cfi_startproc
	# Setup for the first group.  rcx becomes the byte count divided by
	# 32, one unit per four-word group.  The two's complement of n[] is
	# formed as ~n plus one: the plus one is applied by decrementing
	# n[0] before the not, and r10 (the saved carry) starts at zero.
20524c6a0400SJung-uk Kim	mov	8*0($nptr),%r12
20534c6a0400SJung-uk Kim	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
20544c6a0400SJung-uk Kim	mov	$num,%rcx
20554c6a0400SJung-uk Kim	movq	%xmm1,$rptr		# restore $rptr
20564c6a0400SJung-uk Kim	neg	%rax
20574c6a0400SJung-uk Kim	movq	%xmm1,$aptr		# prepare for back-to-back call
20584c6a0400SJung-uk Kim	sar	\$3+2,%rcx
20594c6a0400SJung-uk Kim	dec	%r12			# so that after 'not' we get -n[0]
20604c6a0400SJung-uk Kim	xor	%r10,%r10
20614c6a0400SJung-uk Kim	mov	8*1($nptr),%r13
20624c6a0400SJung-uk Kim	mov	8*2($nptr),%r14
20634c6a0400SJung-uk Kim	mov	8*3($nptr),%r15
20644c6a0400SJung-uk Kim	jmp	.Lsqr4x_sub_entry
20654c6a0400SJung-uk Kim
	# Per group: load four n[] words, invert and mask them, add them to
	# four t[] words with the carry chained through r10 as 0/-1, and
	# store the sums to the result.
20664c6a0400SJung-uk Kim.align	16
20677bded2dbSJung-uk Kim.Lsqr4x_sub:
20684c6a0400SJung-uk Kim	mov	8*0($nptr),%r12
20694c6a0400SJung-uk Kim	mov	8*1($nptr),%r13
20704c6a0400SJung-uk Kim	mov	8*2($nptr),%r14
20714c6a0400SJung-uk Kim	mov	8*3($nptr),%r15
20724c6a0400SJung-uk Kim.Lsqr4x_sub_entry:
20734c6a0400SJung-uk Kim	lea	8*4($nptr),$nptr
20744c6a0400SJung-uk Kim	not	%r12
20754c6a0400SJung-uk Kim	not	%r13
20764c6a0400SJung-uk Kim	not	%r14
20774c6a0400SJung-uk Kim	not	%r15
20784c6a0400SJung-uk Kim	and	%rax,%r12
20794c6a0400SJung-uk Kim	and	%rax,%r13
20804c6a0400SJung-uk Kim	and	%rax,%r14
20814c6a0400SJung-uk Kim	and	%rax,%r15
20824c6a0400SJung-uk Kim
20834c6a0400SJung-uk Kim	neg	%r10			# mov %r10,%cf
20844c6a0400SJung-uk Kim	adc	8*0($tptr),%r12
20854c6a0400SJung-uk Kim	adc	8*1($tptr),%r13
20864c6a0400SJung-uk Kim	adc	8*2($tptr),%r14
20874c6a0400SJung-uk Kim	adc	8*3($tptr),%r15
20887bded2dbSJung-uk Kim	mov	%r12,8*0($rptr)
20894c6a0400SJung-uk Kim	lea	8*4($tptr),$tptr
20907bded2dbSJung-uk Kim	mov	%r13,8*1($rptr)
20914c6a0400SJung-uk Kim	sbb	%r10,%r10		# mov %cf,%r10
20927bded2dbSJung-uk Kim	mov	%r14,8*2($rptr)
20937bded2dbSJung-uk Kim	mov	%r15,8*3($rptr)
20947bded2dbSJung-uk Kim	lea	8*4($rptr),$rptr
20957bded2dbSJung-uk Kim
20967bded2dbSJung-uk Kim	inc	%rcx			# pass %cf
20977bded2dbSJung-uk Kim	jnz	.Lsqr4x_sub
20984c6a0400SJung-uk Kim
20997bded2dbSJung-uk Kim	mov	$num,%r10		# prepare for back-to-back call
21007bded2dbSJung-uk Kim	neg	$num			# restore $num
21017bded2dbSJung-uk Kim	ret
210217f01e99SJung-uk Kim.cfi_endproc
21034c6a0400SJung-uk Kim.size	__bn_post4x_internal,.-__bn_post4x_internal
21047bded2dbSJung-uk Kim___
21054c6a0400SJung-uk Kim}
21067bded2dbSJung-uk Kim}}}
21077bded2dbSJung-uk Kim
21087bded2dbSJung-uk Kimif ($addx) {{{
21097bded2dbSJung-uk Kimmy $bp="%rdx";	# restore original value
21107bded2dbSJung-uk Kim
21117bded2dbSJung-uk Kim$code.=<<___;
# bn_mulx4x_mont_gather5: public entry of the MULX/ADX code path.
# Pushes %rbx, %rbp and %r12-%r15, carves a 64-byte aligned scratch
# frame below them, parks the n0 word and the original stack pointer in
# that frame, calls mulx4x_internal, then reloads the pushed registers
# and returns 1 in %rax.
# %rax keeps the entry value of %rsp from the first instruction until
# the call; mulx4x_internal loads the 7th argument relative to it.
21127bded2dbSJung-uk Kim.type	bn_mulx4x_mont_gather5,\@function,6
21137bded2dbSJung-uk Kim.align	32
21147bded2dbSJung-uk Kimbn_mulx4x_mont_gather5:
2115e71b7053SJung-uk Kim.cfi_startproc
21167bded2dbSJung-uk Kim	mov	%rsp,%rax
2117e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
2118aeb5019cSJung-uk Kim.Lmulx4x_enter:
21197bded2dbSJung-uk Kim	push	%rbx
2120e71b7053SJung-uk Kim.cfi_push	%rbx
21217bded2dbSJung-uk Kim	push	%rbp
2122e71b7053SJung-uk Kim.cfi_push	%rbp
21237bded2dbSJung-uk Kim	push	%r12
2124e71b7053SJung-uk Kim.cfi_push	%r12
21257bded2dbSJung-uk Kim	push	%r13
2126e71b7053SJung-uk Kim.cfi_push	%r13
21277bded2dbSJung-uk Kim	push	%r14
2128e71b7053SJung-uk Kim.cfi_push	%r14
21297bded2dbSJung-uk Kim	push	%r15
2130e71b7053SJung-uk Kim.cfi_push	%r15
2131aeb5019cSJung-uk Kim.Lmulx4x_prologue:
21324c6a0400SJung-uk Kim
21337bded2dbSJung-uk Kim	shl	\$3,${num}d		# convert $num to bytes
21344c6a0400SJung-uk Kim	lea	($num,$num,2),%r10	# 3*$num in bytes
21357bded2dbSJung-uk Kim	neg	$num			# -$num
21367bded2dbSJung-uk Kim	mov	($n0),$n0		# *n0
21377bded2dbSJung-uk Kim
21387bded2dbSJung-uk Kim	##############################################################
21394c6a0400SJung-uk Kim	# Ensure that stack frame doesn't alias with $rptr+3*$num
21404c6a0400SJung-uk Kim	# modulo 4096, which covers ret[num], am[num] and n[num]
21414c6a0400SJung-uk Kim	# (see bn_exp.c). This is done to allow memory disambiguation
21424c6a0400SJung-uk Kim	# logic to do its magic. [Extra [num] is allocated in order
21434c6a0400SJung-uk Kim	# to align with bn_power5's frame, which is cleansed after
21444c6a0400SJung-uk Kim	# completing exponentiation. Extra 256 bytes is for power mask
21454c6a0400SJung-uk Kim	# calculated from 7th argument, the index.]
21467bded2dbSJung-uk Kim	#
21474c6a0400SJung-uk Kim	lea	-320(%rsp,$num,2),%r11
2148aeb5019cSJung-uk Kim	mov	%rsp,%rbp
21494c6a0400SJung-uk Kim	sub	$rp,%r11
21507bded2dbSJung-uk Kim	and	\$4095,%r11
21517bded2dbSJung-uk Kim	cmp	%r11,%r10
21527bded2dbSJung-uk Kim	jb	.Lmulx4xsp_alt
2153aeb5019cSJung-uk Kim	sub	%r11,%rbp		# align with $aptr
2154aeb5019cSJung-uk Kim	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
21557bded2dbSJung-uk Kim	jmp	.Lmulx4xsp_done
21567bded2dbSJung-uk Kim
21577bded2dbSJung-uk Kim.Lmulx4xsp_alt:
21584c6a0400SJung-uk Kim	lea	4096-320(,$num,2),%r10
2159aeb5019cSJung-uk Kim	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
21607bded2dbSJung-uk Kim	sub	%r10,%r11
	# mov rather than xor: the borrow produced by the sub above has
	# to survive until cmovc reads it
21617bded2dbSJung-uk Kim	mov	\$0,%r10
21627bded2dbSJung-uk Kim	cmovc	%r10,%r11
2163aeb5019cSJung-uk Kim	sub	%r11,%rbp
21647bded2dbSJung-uk Kim.Lmulx4xsp_done:
2165aeb5019cSJung-uk Kim	and	\$-64,%rbp		# ensure alignment
	# Lower %rsp to %rbp in steps of at most 4096 bytes, loading one
	# word from every page on the way (presumably so that stack guard
	# pages are touched in order - confirm against OS requirements).
2166aeb5019cSJung-uk Kim	mov	%rsp,%r11
2167aeb5019cSJung-uk Kim	sub	%rbp,%r11
2168b8721c16SJung-uk Kim	and	\$-4096,%r11
2169aeb5019cSJung-uk Kim	lea	(%rbp,%r11),%rsp
2170aeb5019cSJung-uk Kim	mov	(%rsp),%r10
2171aeb5019cSJung-uk Kim	cmp	%rbp,%rsp
2172aeb5019cSJung-uk Kim	ja	.Lmulx4x_page_walk
2173aeb5019cSJung-uk Kim	jmp	.Lmulx4x_page_walk_done
2174aeb5019cSJung-uk Kim
2175b8721c16SJung-uk Kim.Lmulx4x_page_walk:
2176aeb5019cSJung-uk Kim	lea	-4096(%rsp),%rsp
2177aeb5019cSJung-uk Kim	mov	(%rsp),%r10
2178aeb5019cSJung-uk Kim	cmp	%rbp,%rsp
2179aeb5019cSJung-uk Kim	ja	.Lmulx4x_page_walk
2180aeb5019cSJung-uk Kim.Lmulx4x_page_walk_done:
2181b8721c16SJung-uk Kim
21827bded2dbSJung-uk Kim	##############################################################
21837bded2dbSJung-uk Kim	# Stack layout
21847bded2dbSJung-uk Kim	# +0	-num
21857bded2dbSJung-uk Kim	# +8	off-loaded &b[i]
21867bded2dbSJung-uk Kim	# +16	end of b[num]
21877bded2dbSJung-uk Kim	# +24	inner counter
21887bded2dbSJung-uk Kim	# +32	saved n0
21897bded2dbSJung-uk Kim	# +40	saved %rsp
21907bded2dbSJung-uk Kim	# +48
21917bded2dbSJung-uk Kim	# +56	saved rp
21927bded2dbSJung-uk Kim	# +64	tmp[num+1]
21937bded2dbSJung-uk Kim	#
21947bded2dbSJung-uk Kim	mov	$n0, 32(%rsp)		# save *n0
21957bded2dbSJung-uk Kim	mov	%rax,40(%rsp)		# save original %rsp
2196e71b7053SJung-uk Kim.cfi_cfa_expression	%rsp+40,deref,+8
21977bded2dbSJung-uk Kim.Lmulx4x_body:
21987bded2dbSJung-uk Kim	call	mulx4x_internal
21997bded2dbSJung-uk Kim
22007bded2dbSJung-uk Kim	mov	40(%rsp),%rsi		# restore %rsp
2201e71b7053SJung-uk Kim.cfi_def_cfa	%rsi,8
22021f13597dSJung-uk Kim	mov	\$1,%rax
22034c6a0400SJung-uk Kim
	# %rsi is the original %rsp; the six pushed registers sit right
	# below it, last pushed (%r15) at the lowest address
22047bded2dbSJung-uk Kim	mov	-48(%rsi),%r15
2205e71b7053SJung-uk Kim.cfi_restore	%r15
22067bded2dbSJung-uk Kim	mov	-40(%rsi),%r14
2207e71b7053SJung-uk Kim.cfi_restore	%r14
22087bded2dbSJung-uk Kim	mov	-32(%rsi),%r13
2209e71b7053SJung-uk Kim.cfi_restore	%r13
22107bded2dbSJung-uk Kim	mov	-24(%rsi),%r12
2211e71b7053SJung-uk Kim.cfi_restore	%r12
22127bded2dbSJung-uk Kim	mov	-16(%rsi),%rbp
2213e71b7053SJung-uk Kim.cfi_restore	%rbp
22147bded2dbSJung-uk Kim	mov	-8(%rsi),%rbx
2215e71b7053SJung-uk Kim.cfi_restore	%rbx
22167bded2dbSJung-uk Kim	lea	(%rsi),%rsp
2217e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
22187bded2dbSJung-uk Kim.Lmulx4x_epilogue:
22191f13597dSJung-uk Kim	ret
2220e71b7053SJung-uk Kim.cfi_endproc
22217bded2dbSJung-uk Kim.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
22227bded2dbSJung-uk Kim
# mulx4x_internal: body shared by bn_mulx4x_mont_gather5 and bn_powerx5.
# Both reach it with a call, so the caller's frame slots are addressed
# here with an extra +8 for the return address (e.g. the saved n0 word
# is read at 32+8 from the stack pointer).
# On entry %rax must hold the caller's original stack pointer: the 7th
# argument is loaded relative to it.
# The routine has no ret of its own; it finishes by jumping to
# .Lsqrx4x_sub_entry, which is defined outside this routine.
22237bded2dbSJung-uk Kim.type	mulx4x_internal,\@abi-omnipotent
22247bded2dbSJung-uk Kim.align	32
22257bded2dbSJung-uk Kimmulx4x_internal:
222617f01e99SJung-uk Kim.cfi_startproc
22274c6a0400SJung-uk Kim	mov	$num,8(%rsp)		# save -$num (it was in bytes)
22284c6a0400SJung-uk Kim	mov	$num,%r10
22297bded2dbSJung-uk Kim	neg	$num			# restore $num
22307bded2dbSJung-uk Kim	shl	\$5,$num
22314c6a0400SJung-uk Kim	neg	%r10			# restore $num
22324c6a0400SJung-uk Kim	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
22337bded2dbSJung-uk Kim	shr	\$5+5,$num
22344c6a0400SJung-uk Kim	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
22357bded2dbSJung-uk Kim	sub	\$1,$num
22364c6a0400SJung-uk Kim	lea	.Linc(%rip),%rax
22377bded2dbSJung-uk Kim	mov	%r13,16+8(%rsp)		# end of b[num]
22387bded2dbSJung-uk Kim	mov	$num,24+8(%rsp)		# inner counter
22397bded2dbSJung-uk Kim	mov	$rp, 56+8(%rsp)		# save $rp
22407bded2dbSJung-uk Kim___
# Re-bind the symbolic register names for the loops below: num now
# names %rax, and rptr is an alias of bptr (%rdi).
22417bded2dbSJung-uk Kimmy ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
22427bded2dbSJung-uk Kim   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
22437bded2dbSJung-uk Kimmy $rptr=$bptr;
22447bded2dbSJung-uk Kimmy $STRIDE=2**5*8;		# 5 is "window size"
22457bded2dbSJung-uk Kimmy $N=$STRIDE/4;		# should match cache line size
22467bded2dbSJung-uk Kim$code.=<<___;
22474c6a0400SJung-uk Kim	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
22484c6a0400SJung-uk Kim	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
2249e71b7053SJung-uk Kim	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
22504c6a0400SJung-uk Kim	lea	128($bp),$bptr		# size optimization
22517bded2dbSJung-uk Kim
22524c6a0400SJung-uk Kim	pshufd	\$0,%xmm5,%xmm5		# broadcast index
22534c6a0400SJung-uk Kim	movdqa	%xmm1,%xmm4
22544c6a0400SJung-uk Kim	.byte	0x67
22554c6a0400SJung-uk Kim	movdqa	%xmm1,%xmm2
22564c6a0400SJung-uk Kim___
22574c6a0400SJung-uk Kim########################################################################
22584c6a0400SJung-uk Kim# calculate mask by comparing 0..31 to index and save result to stack
22597bded2dbSJung-uk Kim#
22604c6a0400SJung-uk Kim$code.=<<___;
22614c6a0400SJung-uk Kim	.byte	0x67
22624c6a0400SJung-uk Kim	paddd	%xmm0,%xmm1
22634c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
22644c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm3
22654c6a0400SJung-uk Kim___
# STRIDE/16 is 16 mask vectors; this loop emits the stores for the
# first 12 (i = 0, 4, 8). It leaves i at 12, and the unrolled code that
# follows relies on that value for the last four.
22664c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) {
22674c6a0400SJung-uk Kim$code.=<<___;
22684c6a0400SJung-uk Kim	paddd	%xmm1,%xmm2
22694c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
22704c6a0400SJung-uk Kim	movdqa	%xmm0,`16*($i+0)+112`(%r10)
22714c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm0
22724c6a0400SJung-uk Kim
22734c6a0400SJung-uk Kim	paddd	%xmm2,%xmm3
22744c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
22754c6a0400SJung-uk Kim	movdqa	%xmm1,`16*($i+1)+112`(%r10)
22764c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm1
22774c6a0400SJung-uk Kim
22784c6a0400SJung-uk Kim	paddd	%xmm3,%xmm0
22794c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
22804c6a0400SJung-uk Kim	movdqa	%xmm2,`16*($i+2)+112`(%r10)
22814c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm2
22824c6a0400SJung-uk Kim
22834c6a0400SJung-uk Kim	paddd	%xmm0,%xmm1
22844c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm0
22854c6a0400SJung-uk Kim	movdqa	%xmm3,`16*($i+3)+112`(%r10)
22864c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm3
22874c6a0400SJung-uk Kim___
22884c6a0400SJung-uk Kim}
22894c6a0400SJung-uk Kim$code.=<<___;				# last iteration can be optimized
22904c6a0400SJung-uk Kim	.byte	0x67
22914c6a0400SJung-uk Kim	paddd	%xmm1,%xmm2
22924c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm1
22934c6a0400SJung-uk Kim	movdqa	%xmm0,`16*($i+0)+112`(%r10)
22944c6a0400SJung-uk Kim
22954c6a0400SJung-uk Kim	paddd	%xmm2,%xmm3
22964c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm2
22974c6a0400SJung-uk Kim	movdqa	%xmm1,`16*($i+1)+112`(%r10)
22984c6a0400SJung-uk Kim
22994c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm3
23004c6a0400SJung-uk Kim	movdqa	%xmm2,`16*($i+2)+112`(%r10)
23014c6a0400SJung-uk Kim
23024c6a0400SJung-uk Kim	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
23034c6a0400SJung-uk Kim	pand	`16*($i+1)-128`($bptr),%xmm1
23044c6a0400SJung-uk Kim	pand	`16*($i+2)-128`($bptr),%xmm2
23054c6a0400SJung-uk Kim	movdqa	%xmm3,`16*($i+3)+112`(%r10)
23064c6a0400SJung-uk Kim	pand	`16*($i+3)-128`($bptr),%xmm3
23074c6a0400SJung-uk Kim	por	%xmm2,%xmm0
23084c6a0400SJung-uk Kim	por	%xmm3,%xmm1
23094c6a0400SJung-uk Kim___
# The last four table vectors were already masked above while their
# masks were still in registers; this loop restarts i at 0 and handles
# the other 12 using the masks saved on the stack.
23104c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) {
23114c6a0400SJung-uk Kim$code.=<<___;
23124c6a0400SJung-uk Kim	movdqa	`16*($i+0)-128`($bptr),%xmm4
23134c6a0400SJung-uk Kim	movdqa	`16*($i+1)-128`($bptr),%xmm5
23144c6a0400SJung-uk Kim	movdqa	`16*($i+2)-128`($bptr),%xmm2
23154c6a0400SJung-uk Kim	pand	`16*($i+0)+112`(%r10),%xmm4
23164c6a0400SJung-uk Kim	movdqa	`16*($i+3)-128`($bptr),%xmm3
23174c6a0400SJung-uk Kim	pand	`16*($i+1)+112`(%r10),%xmm5
23184c6a0400SJung-uk Kim	por	%xmm4,%xmm0
23194c6a0400SJung-uk Kim	pand	`16*($i+2)+112`(%r10),%xmm2
23204c6a0400SJung-uk Kim	por	%xmm5,%xmm1
23214c6a0400SJung-uk Kim	pand	`16*($i+3)+112`(%r10),%xmm3
23224c6a0400SJung-uk Kim	por	%xmm2,%xmm0
23234c6a0400SJung-uk Kim	por	%xmm3,%xmm1
23244c6a0400SJung-uk Kim___
23254c6a0400SJung-uk Kim}
23264c6a0400SJung-uk Kim$code.=<<___;
23274c6a0400SJung-uk Kim	pxor	%xmm1,%xmm0
	# pshufd 0x4e swaps the two 64-bit halves, so after the por the
	# low half (the one movq extracts) holds the combination of both
23284c6a0400SJung-uk Kim	pshufd	\$0x4e,%xmm0,%xmm1
23294c6a0400SJung-uk Kim	por	%xmm1,%xmm0
23304c6a0400SJung-uk Kim	lea	$STRIDE($bptr),$bptr
23314c6a0400SJung-uk Kim	movq	%xmm0,%rdx		# bp[0]
23324c6a0400SJung-uk Kim	lea	64+8*4+8(%rsp),$tptr
23337bded2dbSJung-uk Kim
23347bded2dbSJung-uk Kim	mov	%rdx,$bi
23357bded2dbSJung-uk Kim	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
23367bded2dbSJung-uk Kim	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
23377bded2dbSJung-uk Kim	add	%rax,%r11
23387bded2dbSJung-uk Kim	mulx	2*8($aptr),%rax,%r13	# ...
23397bded2dbSJung-uk Kim	adc	%rax,%r12
23407bded2dbSJung-uk Kim	adc	\$0,%r13
23417bded2dbSJung-uk Kim	mulx	3*8($aptr),%rax,%r14
23427bded2dbSJung-uk Kim
23437bded2dbSJung-uk Kim	mov	$mi,%r15
23447bded2dbSJung-uk Kim	imulq	32+8(%rsp),$mi		# "t[0]"*n0
23457bded2dbSJung-uk Kim	xor	$zero,$zero		# cf=0, of=0
23467bded2dbSJung-uk Kim	mov	$mi,%rdx
23477bded2dbSJung-uk Kim
23487bded2dbSJung-uk Kim	mov	$bptr,8+8(%rsp)		# off-load &b[i]
23497bded2dbSJung-uk Kim
23504c6a0400SJung-uk Kim	lea	4*8($aptr),$aptr
23517bded2dbSJung-uk Kim	adcx	%rax,%r13
23527bded2dbSJung-uk Kim	adcx	$zero,%r14		# cf=0
23537bded2dbSJung-uk Kim
23544c6a0400SJung-uk Kim	mulx	0*8($nptr),%rax,%r10
23557bded2dbSJung-uk Kim	adcx	%rax,%r15		# discarded
23567bded2dbSJung-uk Kim	adox	%r11,%r10
23574c6a0400SJung-uk Kim	mulx	1*8($nptr),%rax,%r11
23587bded2dbSJung-uk Kim	adcx	%rax,%r10
23597bded2dbSJung-uk Kim	adox	%r12,%r11
23604c6a0400SJung-uk Kim	mulx	2*8($nptr),%rax,%r12
23617bded2dbSJung-uk Kim	mov	24+8(%rsp),$bptr	# counter value
23627bded2dbSJung-uk Kim	mov	%r10,-8*4($tptr)
23637bded2dbSJung-uk Kim	adcx	%rax,%r11
23647bded2dbSJung-uk Kim	adox	%r13,%r12
23654c6a0400SJung-uk Kim	mulx	3*8($nptr),%rax,%r15
23667bded2dbSJung-uk Kim	 mov	$bi,%rdx
23677bded2dbSJung-uk Kim	mov	%r11,-8*3($tptr)
23687bded2dbSJung-uk Kim	adcx	%rax,%r12
23697bded2dbSJung-uk Kim	adox	$zero,%r15		# of=0
23704c6a0400SJung-uk Kim	lea	4*8($nptr),$nptr
23717bded2dbSJung-uk Kim	mov	%r12,-8*2($tptr)
23724c6a0400SJung-uk Kim	jmp	.Lmulx4x_1st
23737bded2dbSJung-uk Kim
23747bded2dbSJung-uk Kim.align	32
23757bded2dbSJung-uk Kim.Lmulx4x_1st:
23767bded2dbSJung-uk Kim	adcx	$zero,%r15		# cf=0, modulo-scheduled
23777bded2dbSJung-uk Kim	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
23787bded2dbSJung-uk Kim	adcx	%r14,%r10
23797bded2dbSJung-uk Kim	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
23807bded2dbSJung-uk Kim	adcx	%rax,%r11
23817bded2dbSJung-uk Kim	mulx	2*8($aptr),%r12,%rax	# ...
23827bded2dbSJung-uk Kim	adcx	%r14,%r12
23837bded2dbSJung-uk Kim	mulx	3*8($aptr),%r13,%r14
23847bded2dbSJung-uk Kim	 .byte	0x67,0x67
23857bded2dbSJung-uk Kim	 mov	$mi,%rdx
23867bded2dbSJung-uk Kim	adcx	%rax,%r13
23877bded2dbSJung-uk Kim	adcx	$zero,%r14		# cf=0
23887bded2dbSJung-uk Kim	lea	4*8($aptr),$aptr
23897bded2dbSJung-uk Kim	lea	4*8($tptr),$tptr
23907bded2dbSJung-uk Kim
23917bded2dbSJung-uk Kim	adox	%r15,%r10
23924c6a0400SJung-uk Kim	mulx	0*8($nptr),%rax,%r15
23937bded2dbSJung-uk Kim	adcx	%rax,%r10
23947bded2dbSJung-uk Kim	adox	%r15,%r11
23954c6a0400SJung-uk Kim	mulx	1*8($nptr),%rax,%r15
23967bded2dbSJung-uk Kim	adcx	%rax,%r11
23977bded2dbSJung-uk Kim	adox	%r15,%r12
23984c6a0400SJung-uk Kim	mulx	2*8($nptr),%rax,%r15
23997bded2dbSJung-uk Kim	mov	%r10,-5*8($tptr)
24007bded2dbSJung-uk Kim	adcx	%rax,%r12
24017bded2dbSJung-uk Kim	mov	%r11,-4*8($tptr)
24027bded2dbSJung-uk Kim	adox	%r15,%r13
24034c6a0400SJung-uk Kim	mulx	3*8($nptr),%rax,%r15
24047bded2dbSJung-uk Kim	 mov	$bi,%rdx
24057bded2dbSJung-uk Kim	mov	%r12,-3*8($tptr)
24067bded2dbSJung-uk Kim	adcx	%rax,%r13
24077bded2dbSJung-uk Kim	adox	$zero,%r15
24084c6a0400SJung-uk Kim	lea	4*8($nptr),$nptr
24097bded2dbSJung-uk Kim	mov	%r13,-2*8($tptr)
24107bded2dbSJung-uk Kim
24117bded2dbSJung-uk Kim	dec	$bptr			# of=0, pass cf
24127bded2dbSJung-uk Kim	jnz	.Lmulx4x_1st
24137bded2dbSJung-uk Kim
24147bded2dbSJung-uk Kim	mov	8(%rsp),$num		# load -num
24157bded2dbSJung-uk Kim	adc	$zero,%r15		# modulo-scheduled
24167bded2dbSJung-uk Kim	lea	($aptr,$num),$aptr	# rewind $aptr
24177bded2dbSJung-uk Kim	add	%r15,%r14
24187bded2dbSJung-uk Kim	mov	8+8(%rsp),$bptr		# re-load &b[i]
24197bded2dbSJung-uk Kim	adc	$zero,$zero		# top-most carry
24207bded2dbSJung-uk Kim	mov	%r14,-1*8($tptr)
24217bded2dbSJung-uk Kim	jmp	.Lmulx4x_outer
24227bded2dbSJung-uk Kim
24237bded2dbSJung-uk Kim.align	32
24247bded2dbSJung-uk Kim.Lmulx4x_outer:
24254c6a0400SJung-uk Kim	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
24264c6a0400SJung-uk Kim	pxor	%xmm4,%xmm4
24274c6a0400SJung-uk Kim	.byte	0x67,0x67
24284c6a0400SJung-uk Kim	pxor	%xmm5,%xmm5
24294c6a0400SJung-uk Kim___
# Unlike the first gather, this loop covers all STRIDE/16 = 16 table
# vectors (i = 0, 4, 8, 12), each ANDed with its mask from the stack.
24304c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16;$i+=4) {
24314c6a0400SJung-uk Kim$code.=<<___;
24324c6a0400SJung-uk Kim	movdqa	`16*($i+0)-128`($bptr),%xmm0
24334c6a0400SJung-uk Kim	movdqa	`16*($i+1)-128`($bptr),%xmm1
24344c6a0400SJung-uk Kim	movdqa	`16*($i+2)-128`($bptr),%xmm2
24354c6a0400SJung-uk Kim	pand	`16*($i+0)+256`(%r10),%xmm0
24364c6a0400SJung-uk Kim	movdqa	`16*($i+3)-128`($bptr),%xmm3
24374c6a0400SJung-uk Kim	pand	`16*($i+1)+256`(%r10),%xmm1
24384c6a0400SJung-uk Kim	por	%xmm0,%xmm4
24394c6a0400SJung-uk Kim	pand	`16*($i+2)+256`(%r10),%xmm2
24404c6a0400SJung-uk Kim	por	%xmm1,%xmm5
24414c6a0400SJung-uk Kim	pand	`16*($i+3)+256`(%r10),%xmm3
24424c6a0400SJung-uk Kim	por	%xmm2,%xmm4
24434c6a0400SJung-uk Kim	por	%xmm3,%xmm5
24444c6a0400SJung-uk Kim___
24454c6a0400SJung-uk Kim}
24464c6a0400SJung-uk Kim$code.=<<___;
24474c6a0400SJung-uk Kim	por	%xmm5,%xmm4
24484c6a0400SJung-uk Kim	pshufd	\$0x4e,%xmm4,%xmm0
24494c6a0400SJung-uk Kim	por	%xmm4,%xmm0
24504c6a0400SJung-uk Kim	lea	$STRIDE($bptr),$bptr
24514c6a0400SJung-uk Kim	movq	%xmm0,%rdx		# m0=bp[i]
24524c6a0400SJung-uk Kim
24537bded2dbSJung-uk Kim	mov	$zero,($tptr)		# save top-most carry
24547bded2dbSJung-uk Kim	lea	4*8($tptr,$num),$tptr	# rewind $tptr
24557bded2dbSJung-uk Kim	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
24567bded2dbSJung-uk Kim	xor	$zero,$zero		# cf=0, of=0
24577bded2dbSJung-uk Kim	mov	%rdx,$bi
24587bded2dbSJung-uk Kim	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
24597bded2dbSJung-uk Kim	adox	-4*8($tptr),$mi		# +t[0]
24607bded2dbSJung-uk Kim	adcx	%r14,%r11
24617bded2dbSJung-uk Kim	mulx	2*8($aptr),%r15,%r13	# ...
24627bded2dbSJung-uk Kim	adox	-3*8($tptr),%r11
24637bded2dbSJung-uk Kim	adcx	%r15,%r12
24647bded2dbSJung-uk Kim	mulx	3*8($aptr),%rdx,%r14
24657bded2dbSJung-uk Kim	adox	-2*8($tptr),%r12
24667bded2dbSJung-uk Kim	adcx	%rdx,%r13
24674c6a0400SJung-uk Kim	lea	($nptr,$num),$nptr	# rewind $nptr
24687bded2dbSJung-uk Kim	lea	4*8($aptr),$aptr
24697bded2dbSJung-uk Kim	adox	-1*8($tptr),%r13
24707bded2dbSJung-uk Kim	adcx	$zero,%r14
24717bded2dbSJung-uk Kim	adox	$zero,%r14
24727bded2dbSJung-uk Kim
24737bded2dbSJung-uk Kim	mov	$mi,%r15
24747bded2dbSJung-uk Kim	imulq	32+8(%rsp),$mi		# "t[0]"*n0
24757bded2dbSJung-uk Kim
24767bded2dbSJung-uk Kim	mov	$mi,%rdx
24777bded2dbSJung-uk Kim	xor	$zero,$zero		# cf=0, of=0
24787bded2dbSJung-uk Kim	mov	$bptr,8+8(%rsp)		# off-load &b[i]
24797bded2dbSJung-uk Kim
24804c6a0400SJung-uk Kim	mulx	0*8($nptr),%rax,%r10
24817bded2dbSJung-uk Kim	adcx	%rax,%r15		# discarded
24827bded2dbSJung-uk Kim	adox	%r11,%r10
24834c6a0400SJung-uk Kim	mulx	1*8($nptr),%rax,%r11
24847bded2dbSJung-uk Kim	adcx	%rax,%r10
24857bded2dbSJung-uk Kim	adox	%r12,%r11
24864c6a0400SJung-uk Kim	mulx	2*8($nptr),%rax,%r12
24877bded2dbSJung-uk Kim	adcx	%rax,%r11
24887bded2dbSJung-uk Kim	adox	%r13,%r12
24894c6a0400SJung-uk Kim	mulx	3*8($nptr),%rax,%r15
24907bded2dbSJung-uk Kim	 mov	$bi,%rdx
24917bded2dbSJung-uk Kim	mov	24+8(%rsp),$bptr	# counter value
24927bded2dbSJung-uk Kim	mov	%r10,-8*4($tptr)
24937bded2dbSJung-uk Kim	adcx	%rax,%r12
24947bded2dbSJung-uk Kim	mov	%r11,-8*3($tptr)
24957bded2dbSJung-uk Kim	adox	$zero,%r15		# of=0
24967bded2dbSJung-uk Kim	mov	%r12,-8*2($tptr)
24974c6a0400SJung-uk Kim	lea	4*8($nptr),$nptr
24987bded2dbSJung-uk Kim	jmp	.Lmulx4x_inner
24997bded2dbSJung-uk Kim
25007bded2dbSJung-uk Kim.align	32
25017bded2dbSJung-uk Kim.Lmulx4x_inner:
25027bded2dbSJung-uk Kim	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
25037bded2dbSJung-uk Kim	adcx	$zero,%r15		# cf=0, modulo-scheduled
25047bded2dbSJung-uk Kim	adox	%r14,%r10
25057bded2dbSJung-uk Kim	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
25067bded2dbSJung-uk Kim	adcx	0*8($tptr),%r10
25077bded2dbSJung-uk Kim	adox	%rax,%r11
25087bded2dbSJung-uk Kim	mulx	2*8($aptr),%r12,%rax	# ...
25097bded2dbSJung-uk Kim	adcx	1*8($tptr),%r11
25107bded2dbSJung-uk Kim	adox	%r14,%r12
25117bded2dbSJung-uk Kim	mulx	3*8($aptr),%r13,%r14
25127bded2dbSJung-uk Kim	 mov	$mi,%rdx
25137bded2dbSJung-uk Kim	adcx	2*8($tptr),%r12
25147bded2dbSJung-uk Kim	adox	%rax,%r13
25157bded2dbSJung-uk Kim	adcx	3*8($tptr),%r13
25167bded2dbSJung-uk Kim	adox	$zero,%r14		# of=0
25177bded2dbSJung-uk Kim	lea	4*8($aptr),$aptr
25187bded2dbSJung-uk Kim	lea	4*8($tptr),$tptr
25197bded2dbSJung-uk Kim	adcx	$zero,%r14		# cf=0
25207bded2dbSJung-uk Kim
25217bded2dbSJung-uk Kim	adox	%r15,%r10
25224c6a0400SJung-uk Kim	mulx	0*8($nptr),%rax,%r15
25237bded2dbSJung-uk Kim	adcx	%rax,%r10
25247bded2dbSJung-uk Kim	adox	%r15,%r11
25254c6a0400SJung-uk Kim	mulx	1*8($nptr),%rax,%r15
25267bded2dbSJung-uk Kim	adcx	%rax,%r11
25277bded2dbSJung-uk Kim	adox	%r15,%r12
25284c6a0400SJung-uk Kim	mulx	2*8($nptr),%rax,%r15
25297bded2dbSJung-uk Kim	mov	%r10,-5*8($tptr)
25307bded2dbSJung-uk Kim	adcx	%rax,%r12
25317bded2dbSJung-uk Kim	adox	%r15,%r13
25327bded2dbSJung-uk Kim	mov	%r11,-4*8($tptr)
25334c6a0400SJung-uk Kim	mulx	3*8($nptr),%rax,%r15
25347bded2dbSJung-uk Kim	 mov	$bi,%rdx
25354c6a0400SJung-uk Kim	lea	4*8($nptr),$nptr
25367bded2dbSJung-uk Kim	mov	%r12,-3*8($tptr)
25377bded2dbSJung-uk Kim	adcx	%rax,%r13
25387bded2dbSJung-uk Kim	adox	$zero,%r15
25397bded2dbSJung-uk Kim	mov	%r13,-2*8($tptr)
25407bded2dbSJung-uk Kim
25417bded2dbSJung-uk Kim	dec	$bptr			# of=0, pass cf
25427bded2dbSJung-uk Kim	jnz	.Lmulx4x_inner
25437bded2dbSJung-uk Kim
25447bded2dbSJung-uk Kim	mov	0+8(%rsp),$num		# load -num
25457bded2dbSJung-uk Kim	adc	$zero,%r15		# modulo-scheduled
25467bded2dbSJung-uk Kim	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
25477bded2dbSJung-uk Kim	mov	8+8(%rsp),$bptr		# re-load &b[i]
25487bded2dbSJung-uk Kim	mov	16+8(%rsp),%r10
25497bded2dbSJung-uk Kim	adc	%r15,%r14
25507bded2dbSJung-uk Kim	lea	($aptr,$num),$aptr	# rewind $aptr
25517bded2dbSJung-uk Kim	adc	$zero,$zero		# top-most carry
25527bded2dbSJung-uk Kim	mov	%r14,-1*8($tptr)
25537bded2dbSJung-uk Kim
25547bded2dbSJung-uk Kim	cmp	%r10,$bptr
25557bded2dbSJung-uk Kim	jb	.Lmulx4x_outer
25567bded2dbSJung-uk Kim
	# The b[i] pointer has reached the end-of-table value saved in
	# the frame: load the registers the shared tail expects and jump
	# to it instead of returning from here.
25574c6a0400SJung-uk Kim	mov	-8($nptr),%r10
25584c6a0400SJung-uk Kim	mov	$zero,%r8
25594c6a0400SJung-uk Kim	mov	($nptr,$num),%r12
25604c6a0400SJung-uk Kim	lea	($nptr,$num),%rbp	# rewind $nptr
25614c6a0400SJung-uk Kim	mov	$num,%rcx
25624c6a0400SJung-uk Kim	lea	($tptr,$num),%rdi	# rewind $tptr
25634c6a0400SJung-uk Kim	xor	%eax,%eax
25647bded2dbSJung-uk Kim	xor	%r15,%r15
25657bded2dbSJung-uk Kim	sub	%r14,%r10		# compare top-most words
25667bded2dbSJung-uk Kim	adc	%r15,%r15
25674c6a0400SJung-uk Kim	or	%r15,%r8
25684c6a0400SJung-uk Kim	sar	\$3+2,%rcx
25694c6a0400SJung-uk Kim	sub	%r8,%rax		# %rax=-%r8
25707bded2dbSJung-uk Kim	mov	56+8(%rsp),%rdx		# restore rp
25714c6a0400SJung-uk Kim	dec	%r12			# so that after 'not' we get -n[0]
25724c6a0400SJung-uk Kim	mov	8*1(%rbp),%r13
25734c6a0400SJung-uk Kim	xor	%r8,%r8
25744c6a0400SJung-uk Kim	mov	8*2(%rbp),%r14
25754c6a0400SJung-uk Kim	mov	8*3(%rbp),%r15
25764c6a0400SJung-uk Kim	jmp	.Lsqrx4x_sub_entry	# common post-condition
257717f01e99SJung-uk Kim.cfi_endproc
25787bded2dbSJung-uk Kim.size	mulx4x_internal,.-mulx4x_internal
25797bded2dbSJung-uk Kim___
25807bded2dbSJung-uk Kim}{
25817bded2dbSJung-uk Kim######################################################################
25827bded2dbSJung-uk Kim# void bn_powerx5(
25837bded2dbSJung-uk Kimmy $rptr="%rdi";	# BN_ULONG *rptr,
25847bded2dbSJung-uk Kimmy $aptr="%rsi";	# const BN_ULONG *aptr,
25857bded2dbSJung-uk Kimmy $bptr="%rdx";	# const void *table,
25867bded2dbSJung-uk Kimmy $nptr="%rcx";	# const BN_ULONG *nptr,
25877bded2dbSJung-uk Kimmy $n0  ="%r8";		# const BN_ULONG *n0,
25887bded2dbSJung-uk Kimmy $num ="%r9";		# int num, has to be divisible by 8
25897bded2dbSJung-uk Kim			# int pwr);
25907bded2dbSJung-uk Kim
# Scratch-register names used by the code generated below; tptr is an
# alias of rptr (%rdi).
25917bded2dbSJung-uk Kimmy ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
25927bded2dbSJung-uk Kimmy @A0=("%r10","%r11");
25937bded2dbSJung-uk Kimmy @A1=("%r12","%r13");
25947bded2dbSJung-uk Kimmy ($a0,$a1,$ai)=("%r14","%r15","%rbx");
25957bded2dbSJung-uk Kim
25967bded2dbSJung-uk Kim$code.=<<___;
# bn_powerx5: MULX/ADX entry point taking rptr, aptr, table, nptr, n0,
# num and (as 7th argument) pwr.
# Pushes %rbx, %rbp and %r12-%r15, carves a 64-byte aligned scratch
# frame, then runs five __bn_sqrx8x_internal / __bn_postx4x_internal
# pairs followed by one mulx4x_internal call, restores the pushed
# registers and returns 1 in %rax.
# NOTE(review): presumably this amounts to five modular squarings and
# one multiplication by the table entry selected by pwr - confirm
# against the caller in bn_exp.c.
25977bded2dbSJung-uk Kim.type	bn_powerx5,\@function,6
25987bded2dbSJung-uk Kim.align	32
25997bded2dbSJung-uk Kimbn_powerx5:
2600e71b7053SJung-uk Kim.cfi_startproc
26017bded2dbSJung-uk Kim	mov	%rsp,%rax
2602e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
2603aeb5019cSJung-uk Kim.Lpowerx5_enter:
26047bded2dbSJung-uk Kim	push	%rbx
2605e71b7053SJung-uk Kim.cfi_push	%rbx
26067bded2dbSJung-uk Kim	push	%rbp
2607e71b7053SJung-uk Kim.cfi_push	%rbp
26087bded2dbSJung-uk Kim	push	%r12
2609e71b7053SJung-uk Kim.cfi_push	%r12
26107bded2dbSJung-uk Kim	push	%r13
2611e71b7053SJung-uk Kim.cfi_push	%r13
26127bded2dbSJung-uk Kim	push	%r14
2613e71b7053SJung-uk Kim.cfi_push	%r14
26147bded2dbSJung-uk Kim	push	%r15
2615e71b7053SJung-uk Kim.cfi_push	%r15
2616aeb5019cSJung-uk Kim.Lpowerx5_prologue:
26174c6a0400SJung-uk Kim
26187bded2dbSJung-uk Kim	shl	\$3,${num}d		# convert $num to bytes
26194c6a0400SJung-uk Kim	lea	($num,$num,2),%r10	# 3*$num in bytes
26207bded2dbSJung-uk Kim	neg	$num
26217bded2dbSJung-uk Kim	mov	($n0),$n0		# *n0
26227bded2dbSJung-uk Kim
26237bded2dbSJung-uk Kim	##############################################################
26244c6a0400SJung-uk Kim	# Ensure that stack frame doesn't alias with $rptr+3*$num
26254c6a0400SJung-uk Kim	# modulo 4096, which covers ret[num], am[num] and n[num]
26264c6a0400SJung-uk Kim	# (see bn_exp.c). This is done to allow memory disambiguation
26274c6a0400SJung-uk Kim	# logic to do its magic. [Extra 256 bytes is for power mask
26284c6a0400SJung-uk Kim	# calculated from 7th argument, the index.]
26297bded2dbSJung-uk Kim	#
26304c6a0400SJung-uk Kim	lea	-320(%rsp,$num,2),%r11
2631aeb5019cSJung-uk Kim	mov	%rsp,%rbp
26324c6a0400SJung-uk Kim	sub	$rptr,%r11
26337bded2dbSJung-uk Kim	and	\$4095,%r11
26347bded2dbSJung-uk Kim	cmp	%r11,%r10
26357bded2dbSJung-uk Kim	jb	.Lpwrx_sp_alt
2636aeb5019cSJung-uk Kim	sub	%r11,%rbp		# align with $aptr
2637aeb5019cSJung-uk Kim	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
26387bded2dbSJung-uk Kim	jmp	.Lpwrx_sp_done
26397bded2dbSJung-uk Kim
26407bded2dbSJung-uk Kim.align	32
26417bded2dbSJung-uk Kim.Lpwrx_sp_alt:
26424c6a0400SJung-uk Kim	lea	4096-320(,$num,2),%r10
2643aeb5019cSJung-uk Kim	lea	-320(%rbp,$num,2),%rbp	# alloca(frame+2*$num*8+256)
26447bded2dbSJung-uk Kim	sub	%r10,%r11
	# mov rather than xor: the borrow produced by the sub above has
	# to survive until cmovc reads it
26457bded2dbSJung-uk Kim	mov	\$0,%r10
26467bded2dbSJung-uk Kim	cmovc	%r10,%r11
2647aeb5019cSJung-uk Kim	sub	%r11,%rbp
26487bded2dbSJung-uk Kim.Lpwrx_sp_done:
2649aeb5019cSJung-uk Kim	and	\$-64,%rbp
	# Lower %rsp to %rbp in steps of at most 4096 bytes, loading one
	# word from every page on the way (presumably so that stack guard
	# pages are touched in order - confirm against OS requirements).
2650aeb5019cSJung-uk Kim	mov	%rsp,%r11
2651aeb5019cSJung-uk Kim	sub	%rbp,%r11
2652b8721c16SJung-uk Kim	and	\$-4096,%r11
2653aeb5019cSJung-uk Kim	lea	(%rbp,%r11),%rsp
2654aeb5019cSJung-uk Kim	mov	(%rsp),%r10
2655aeb5019cSJung-uk Kim	cmp	%rbp,%rsp
2656aeb5019cSJung-uk Kim	ja	.Lpwrx_page_walk
2657aeb5019cSJung-uk Kim	jmp	.Lpwrx_page_walk_done
2658aeb5019cSJung-uk Kim
2659b8721c16SJung-uk Kim.Lpwrx_page_walk:
2660aeb5019cSJung-uk Kim	lea	-4096(%rsp),%rsp
2661aeb5019cSJung-uk Kim	mov	(%rsp),%r10
2662aeb5019cSJung-uk Kim	cmp	%rbp,%rsp
2663aeb5019cSJung-uk Kim	ja	.Lpwrx_page_walk
2664aeb5019cSJung-uk Kim.Lpwrx_page_walk_done:
2665b8721c16SJung-uk Kim
26667bded2dbSJung-uk Kim	mov	$num,%r10
26677bded2dbSJung-uk Kim	neg	$num
26687bded2dbSJung-uk Kim
26697bded2dbSJung-uk Kim	##############################################################
26707bded2dbSJung-uk Kim	# Stack layout
26717bded2dbSJung-uk Kim	#
26727bded2dbSJung-uk Kim	# +0	saved $num, used in reduction section
26737bded2dbSJung-uk Kim	# +8	&t[2*$num], used in reduction section
26747bded2dbSJung-uk Kim	# +16	intermediate carry bit
26757bded2dbSJung-uk Kim	# +24	top-most carry bit, used in reduction section
26767bded2dbSJung-uk Kim	# +32	saved *n0
26777bded2dbSJung-uk Kim	# +40	saved %rsp
26787bded2dbSJung-uk Kim	# +48	t[2*$num]
26797bded2dbSJung-uk Kim	#
26807bded2dbSJung-uk Kim	pxor	%xmm0,%xmm0
26817bded2dbSJung-uk Kim	movq	$rptr,%xmm1		# save $rptr
26827bded2dbSJung-uk Kim	movq	$nptr,%xmm2		# save $nptr
26837bded2dbSJung-uk Kim	movq	%r10, %xmm3		# -$num
26847bded2dbSJung-uk Kim	movq	$bptr,%xmm4
26857bded2dbSJung-uk Kim	mov	$n0,  32(%rsp)
26867bded2dbSJung-uk Kim	mov	%rax, 40(%rsp)		# save original %rsp
2687e71b7053SJung-uk Kim.cfi_cfa_expression	%rsp+40,deref,+8
26887bded2dbSJung-uk Kim.Lpowerx5_body:
26897bded2dbSJung-uk Kim
26907bded2dbSJung-uk Kim	call	__bn_sqrx8x_internal
26914c6a0400SJung-uk Kim	call	__bn_postx4x_internal
26927bded2dbSJung-uk Kim	call	__bn_sqrx8x_internal
26934c6a0400SJung-uk Kim	call	__bn_postx4x_internal
26947bded2dbSJung-uk Kim	call	__bn_sqrx8x_internal
26954c6a0400SJung-uk Kim	call	__bn_postx4x_internal
26967bded2dbSJung-uk Kim	call	__bn_sqrx8x_internal
26974c6a0400SJung-uk Kim	call	__bn_postx4x_internal
26987bded2dbSJung-uk Kim	call	__bn_sqrx8x_internal
26994c6a0400SJung-uk Kim	call	__bn_postx4x_internal
27007bded2dbSJung-uk Kim
27017bded2dbSJung-uk Kim	mov	%r10,$num		# -num
27027bded2dbSJung-uk Kim	mov	$aptr,$rptr
27037bded2dbSJung-uk Kim	movq	%xmm2,$nptr
27047bded2dbSJung-uk Kim	movq	%xmm4,$bptr
	# %rax = original %rsp: mulx4x_internal loads the 7th argument
	# relative to it
27057bded2dbSJung-uk Kim	mov	40(%rsp),%rax
27067bded2dbSJung-uk Kim
27077bded2dbSJung-uk Kim	call	mulx4x_internal
27087bded2dbSJung-uk Kim
27097bded2dbSJung-uk Kim	mov	40(%rsp),%rsi		# restore %rsp
2710e71b7053SJung-uk Kim.cfi_def_cfa	%rsi,8
27117bded2dbSJung-uk Kim	mov	\$1,%rax
27124c6a0400SJung-uk Kim
	# %rsi is the original %rsp; the six pushed registers sit right
	# below it, last pushed (%r15) at the lowest address
27137bded2dbSJung-uk Kim	mov	-48(%rsi),%r15
2714e71b7053SJung-uk Kim.cfi_restore	%r15
27157bded2dbSJung-uk Kim	mov	-40(%rsi),%r14
2716e71b7053SJung-uk Kim.cfi_restore	%r14
27177bded2dbSJung-uk Kim	mov	-32(%rsi),%r13
2718e71b7053SJung-uk Kim.cfi_restore	%r13
27197bded2dbSJung-uk Kim	mov	-24(%rsi),%r12
2720e71b7053SJung-uk Kim.cfi_restore	%r12
27217bded2dbSJung-uk Kim	mov	-16(%rsi),%rbp
2722e71b7053SJung-uk Kim.cfi_restore	%rbp
27237bded2dbSJung-uk Kim	mov	-8(%rsi),%rbx
2724e71b7053SJung-uk Kim.cfi_restore	%rbx
27257bded2dbSJung-uk Kim	lea	(%rsi),%rsp
2726e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
27277bded2dbSJung-uk Kim.Lpowerx5_epilogue:
27287bded2dbSJung-uk Kim	ret
2729e71b7053SJung-uk Kim.cfi_endproc
27307bded2dbSJung-uk Kim.size	bn_powerx5,.-bn_powerx5
27317bded2dbSJung-uk Kim
27327bded2dbSJung-uk Kim.globl	bn_sqrx8x_internal
27337bded2dbSJung-uk Kim.hidden	bn_sqrx8x_internal
27347bded2dbSJung-uk Kim.type	bn_sqrx8x_internal,\@abi-omnipotent
27357bded2dbSJung-uk Kim.align	32
27367bded2dbSJung-uk Kimbn_sqrx8x_internal:
27377bded2dbSJung-uk Kim__bn_sqrx8x_internal:
27386935a639SJung-uk Kim.cfi_startproc
27397bded2dbSJung-uk Kim	##################################################################
27407bded2dbSJung-uk Kim	# Squaring part:
27417bded2dbSJung-uk Kim	#
27427bded2dbSJung-uk Kim	# a) multiply-n-add everything but a[i]*a[i];
27437bded2dbSJung-uk Kim	# b) shift result of a) by 1 to the left and accumulate
27447bded2dbSJung-uk Kim	#    a[i]*a[i] products;
27457bded2dbSJung-uk Kim	#
27467bded2dbSJung-uk Kim	##################################################################
27477bded2dbSJung-uk Kim	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
27487bded2dbSJung-uk Kim	#                                                     a[1]a[0]
27497bded2dbSJung-uk Kim	#                                                 a[2]a[0]
27507bded2dbSJung-uk Kim	#                                             a[3]a[0]
27517bded2dbSJung-uk Kim	#                                             a[2]a[1]
27527bded2dbSJung-uk Kim	#                                         a[3]a[1]
27537bded2dbSJung-uk Kim	#                                     a[3]a[2]
27547bded2dbSJung-uk Kim	#
27557bded2dbSJung-uk Kim	#                                         a[4]a[0]
27567bded2dbSJung-uk Kim	#                                     a[5]a[0]
27577bded2dbSJung-uk Kim	#                                 a[6]a[0]
27587bded2dbSJung-uk Kim	#                             a[7]a[0]
27597bded2dbSJung-uk Kim	#                                     a[4]a[1]
27607bded2dbSJung-uk Kim	#                                 a[5]a[1]
27617bded2dbSJung-uk Kim	#                             a[6]a[1]
27627bded2dbSJung-uk Kim	#                         a[7]a[1]
27637bded2dbSJung-uk Kim	#                                 a[4]a[2]
27647bded2dbSJung-uk Kim	#                             a[5]a[2]
27657bded2dbSJung-uk Kim	#                         a[6]a[2]
27667bded2dbSJung-uk Kim	#                     a[7]a[2]
27677bded2dbSJung-uk Kim	#                             a[4]a[3]
27687bded2dbSJung-uk Kim	#                         a[5]a[3]
27697bded2dbSJung-uk Kim	#                     a[6]a[3]
27707bded2dbSJung-uk Kim	#                 a[7]a[3]
27717bded2dbSJung-uk Kim	#
27727bded2dbSJung-uk Kim	#                     a[5]a[4]
27737bded2dbSJung-uk Kim	#                 a[6]a[4]
27747bded2dbSJung-uk Kim	#             a[7]a[4]
27757bded2dbSJung-uk Kim	#             a[6]a[5]
27767bded2dbSJung-uk Kim	#         a[7]a[5]
27777bded2dbSJung-uk Kim	#     a[7]a[6]
27787bded2dbSJung-uk Kim	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
27797bded2dbSJung-uk Kim___
27807bded2dbSJung-uk Kim{
27817bded2dbSJung-uk Kimmy ($zero,$carry)=("%rbp","%rcx");
27827bded2dbSJung-uk Kimmy $aaptr=$zero;
27837bded2dbSJung-uk Kim$code.=<<___;
27847bded2dbSJung-uk Kim	lea	48+8(%rsp),$tptr
27857bded2dbSJung-uk Kim	lea	($aptr,$num),$aaptr
27867bded2dbSJung-uk Kim	mov	$num,0+8(%rsp)			# save $num
27877bded2dbSJung-uk Kim	mov	$aaptr,8+8(%rsp)		# save end of $aptr
27887bded2dbSJung-uk Kim	jmp	.Lsqr8x_zero_start
27897bded2dbSJung-uk Kim
27907bded2dbSJung-uk Kim.align	32
27917bded2dbSJung-uk Kim.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
27927bded2dbSJung-uk Kim.Lsqrx8x_zero:
27937bded2dbSJung-uk Kim	.byte	0x3e
27947bded2dbSJung-uk Kim	movdqa	%xmm0,0*8($tptr)
27957bded2dbSJung-uk Kim	movdqa	%xmm0,2*8($tptr)
27967bded2dbSJung-uk Kim	movdqa	%xmm0,4*8($tptr)
27977bded2dbSJung-uk Kim	movdqa	%xmm0,6*8($tptr)
27987bded2dbSJung-uk Kim.Lsqr8x_zero_start:			# aligned at 32
27997bded2dbSJung-uk Kim	movdqa	%xmm0,8*8($tptr)
28007bded2dbSJung-uk Kim	movdqa	%xmm0,10*8($tptr)
28017bded2dbSJung-uk Kim	movdqa	%xmm0,12*8($tptr)
28027bded2dbSJung-uk Kim	movdqa	%xmm0,14*8($tptr)
28037bded2dbSJung-uk Kim	lea	16*8($tptr),$tptr
28047bded2dbSJung-uk Kim	sub	\$64,$num
28057bded2dbSJung-uk Kim	jnz	.Lsqrx8x_zero
28067bded2dbSJung-uk Kim
28077bded2dbSJung-uk Kim	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
28087bded2dbSJung-uk Kim	#xor	%r9,%r9			# t[1], ex-$num, zero already
28097bded2dbSJung-uk Kim	xor	%r10,%r10
28107bded2dbSJung-uk Kim	xor	%r11,%r11
28117bded2dbSJung-uk Kim	xor	%r12,%r12
28127bded2dbSJung-uk Kim	xor	%r13,%r13
28137bded2dbSJung-uk Kim	xor	%r14,%r14
28147bded2dbSJung-uk Kim	xor	%r15,%r15
28157bded2dbSJung-uk Kim	lea	48+8(%rsp),$tptr
28167bded2dbSJung-uk Kim	xor	$zero,$zero		# cf=0, of=0
28177bded2dbSJung-uk Kim	jmp	.Lsqrx8x_outer_loop
28187bded2dbSJung-uk Kim
28197bded2dbSJung-uk Kim.align	32
28207bded2dbSJung-uk Kim.Lsqrx8x_outer_loop:
28217bded2dbSJung-uk Kim	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
28227bded2dbSJung-uk Kim	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
28237bded2dbSJung-uk Kim	adox	%rax,%r10
28247bded2dbSJung-uk Kim	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
28257bded2dbSJung-uk Kim	adcx	%r10,%r9
28267bded2dbSJung-uk Kim	adox	%rax,%r11
28277bded2dbSJung-uk Kim	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
28287bded2dbSJung-uk Kim	adcx	%r11,%r10
28297bded2dbSJung-uk Kim	adox	%rax,%r12
28307bded2dbSJung-uk Kim	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
28317bded2dbSJung-uk Kim	adcx	%r12,%r11
28327bded2dbSJung-uk Kim	adox	%rax,%r13
28337bded2dbSJung-uk Kim	mulx	5*8($aptr),%r12,%rax
28347bded2dbSJung-uk Kim	adcx	%r13,%r12
28357bded2dbSJung-uk Kim	adox	%rax,%r14
28367bded2dbSJung-uk Kim	mulx	6*8($aptr),%r13,%rax
28377bded2dbSJung-uk Kim	adcx	%r14,%r13
28387bded2dbSJung-uk Kim	adox	%r15,%rax
28397bded2dbSJung-uk Kim	mulx	7*8($aptr),%r14,%r15
28407bded2dbSJung-uk Kim	 mov	1*8($aptr),%rdx		# a[1]
28417bded2dbSJung-uk Kim	adcx	%rax,%r14
28427bded2dbSJung-uk Kim	adox	$zero,%r15
28437bded2dbSJung-uk Kim	adc	8*8($tptr),%r15
28447bded2dbSJung-uk Kim	mov	%r8,1*8($tptr)		# t[1]
28457bded2dbSJung-uk Kim	mov	%r9,2*8($tptr)		# t[2]
28467bded2dbSJung-uk Kim	sbb	$carry,$carry		# mov %cf,$carry
28477bded2dbSJung-uk Kim	xor	$zero,$zero		# cf=0, of=0
28487bded2dbSJung-uk Kim
28497bded2dbSJung-uk Kim
28507bded2dbSJung-uk Kim	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
28517bded2dbSJung-uk Kim	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
28527bded2dbSJung-uk Kim	adcx	%r10,%r8
28537bded2dbSJung-uk Kim	adox	%rbx,%r9
28547bded2dbSJung-uk Kim	mulx	4*8($aptr),%r10,%rbx	# ...
28557bded2dbSJung-uk Kim	adcx	%r11,%r9
28567bded2dbSJung-uk Kim	adox	%rax,%r10
28577bded2dbSJung-uk Kim	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
28587bded2dbSJung-uk Kim	adcx	%r12,%r10
28597bded2dbSJung-uk Kim	adox	%rbx,%r11
28607bded2dbSJung-uk Kim	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
28617bded2dbSJung-uk Kim	adcx	%r13,%r11
28627bded2dbSJung-uk Kim	adox	%r14,%r12
28637bded2dbSJung-uk Kim	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
28647bded2dbSJung-uk Kim	 mov	2*8($aptr),%rdx		# a[2]
28657bded2dbSJung-uk Kim	adcx	%rax,%r12
28667bded2dbSJung-uk Kim	adox	%rbx,%r13
28677bded2dbSJung-uk Kim	adcx	%r15,%r13
28687bded2dbSJung-uk Kim	adox	$zero,%r14		# of=0
28697bded2dbSJung-uk Kim	adcx	$zero,%r14		# cf=0
28707bded2dbSJung-uk Kim
28717bded2dbSJung-uk Kim	mov	%r8,3*8($tptr)		# t[3]
28727bded2dbSJung-uk Kim	mov	%r9,4*8($tptr)		# t[4]
28737bded2dbSJung-uk Kim
28747bded2dbSJung-uk Kim	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
28757bded2dbSJung-uk Kim	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
28767bded2dbSJung-uk Kim	adcx	%r10,%r8
28777bded2dbSJung-uk Kim	adox	%rbx,%r9
28787bded2dbSJung-uk Kim	mulx	5*8($aptr),%r10,%rbx	# ...
28797bded2dbSJung-uk Kim	adcx	%r11,%r9
28807bded2dbSJung-uk Kim	adox	%rax,%r10
28817bded2dbSJung-uk Kim	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
28827bded2dbSJung-uk Kim	adcx	%r12,%r10
28837bded2dbSJung-uk Kim	adox	%r13,%r11
28847bded2dbSJung-uk Kim	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
28857bded2dbSJung-uk Kim	.byte	0x3e
28867bded2dbSJung-uk Kim	 mov	3*8($aptr),%rdx		# a[3]
28877bded2dbSJung-uk Kim	adcx	%rbx,%r11
28887bded2dbSJung-uk Kim	adox	%rax,%r12
28897bded2dbSJung-uk Kim	adcx	%r14,%r12
28907bded2dbSJung-uk Kim	mov	%r8,5*8($tptr)		# t[5]
28917bded2dbSJung-uk Kim	mov	%r9,6*8($tptr)		# t[6]
28927bded2dbSJung-uk Kim	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
28937bded2dbSJung-uk Kim	adox	$zero,%r13		# of=0
28947bded2dbSJung-uk Kim	adcx	$zero,%r13		# cf=0
28957bded2dbSJung-uk Kim
28967bded2dbSJung-uk Kim	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
28977bded2dbSJung-uk Kim	adcx	%r10,%r8
28987bded2dbSJung-uk Kim	adox	%rax,%r9
28997bded2dbSJung-uk Kim	mulx	6*8($aptr),%r10,%rax	# ...
29007bded2dbSJung-uk Kim	adcx	%r11,%r9
29017bded2dbSJung-uk Kim	adox	%r12,%r10
29027bded2dbSJung-uk Kim	mulx	7*8($aptr),%r11,%r12
29037bded2dbSJung-uk Kim	 mov	4*8($aptr),%rdx		# a[4]
29047bded2dbSJung-uk Kim	 mov	5*8($aptr),%r14		# a[5]
29057bded2dbSJung-uk Kim	adcx	%rbx,%r10
29067bded2dbSJung-uk Kim	adox	%rax,%r11
29077bded2dbSJung-uk Kim	 mov	6*8($aptr),%r15		# a[6]
29087bded2dbSJung-uk Kim	adcx	%r13,%r11
29097bded2dbSJung-uk Kim	adox	$zero,%r12		# of=0
29107bded2dbSJung-uk Kim	adcx	$zero,%r12		# cf=0
29117bded2dbSJung-uk Kim
29127bded2dbSJung-uk Kim	mov	%r8,7*8($tptr)		# t[7]
29137bded2dbSJung-uk Kim	mov	%r9,8*8($tptr)		# t[8]
29147bded2dbSJung-uk Kim
29157bded2dbSJung-uk Kim	mulx	%r14,%r9,%rax		# a[5]*a[4]
29167bded2dbSJung-uk Kim	 mov	7*8($aptr),%r8		# a[7]
29177bded2dbSJung-uk Kim	adcx	%r10,%r9
29187bded2dbSJung-uk Kim	mulx	%r15,%r10,%rbx		# a[6]*a[4]
29197bded2dbSJung-uk Kim	adox	%rax,%r10
29207bded2dbSJung-uk Kim	adcx	%r11,%r10
29217bded2dbSJung-uk Kim	mulx	%r8,%r11,%rax		# a[7]*a[4]
29227bded2dbSJung-uk Kim	 mov	%r14,%rdx		# a[5]
29237bded2dbSJung-uk Kim	adox	%rbx,%r11
29247bded2dbSJung-uk Kim	adcx	%r12,%r11
29257bded2dbSJung-uk Kim	#adox	$zero,%rax		# of=0
29267bded2dbSJung-uk Kim	adcx	$zero,%rax		# cf=0
29277bded2dbSJung-uk Kim
29287bded2dbSJung-uk Kim	mulx	%r15,%r14,%rbx		# a[6]*a[5]
29297bded2dbSJung-uk Kim	mulx	%r8,%r12,%r13		# a[7]*a[5]
29307bded2dbSJung-uk Kim	 mov	%r15,%rdx		# a[6]
29317bded2dbSJung-uk Kim	 lea	8*8($aptr),$aptr
29327bded2dbSJung-uk Kim	adcx	%r14,%r11
29337bded2dbSJung-uk Kim	adox	%rbx,%r12
29347bded2dbSJung-uk Kim	adcx	%rax,%r12
29357bded2dbSJung-uk Kim	adox	$zero,%r13
29367bded2dbSJung-uk Kim
29377bded2dbSJung-uk Kim	.byte	0x67,0x67
29387bded2dbSJung-uk Kim	mulx	%r8,%r8,%r14		# a[7]*a[6]
29397bded2dbSJung-uk Kim	adcx	%r8,%r13
29407bded2dbSJung-uk Kim	adcx	$zero,%r14
29417bded2dbSJung-uk Kim
29427bded2dbSJung-uk Kim	cmp	8+8(%rsp),$aptr
29437bded2dbSJung-uk Kim	je	.Lsqrx8x_outer_break
29447bded2dbSJung-uk Kim
29457bded2dbSJung-uk Kim	neg	$carry			# mov $carry,%cf
29467bded2dbSJung-uk Kim	mov	\$-8,%rcx
29477bded2dbSJung-uk Kim	mov	$zero,%r15
29487bded2dbSJung-uk Kim	mov	8*8($tptr),%r8
29497bded2dbSJung-uk Kim	adcx	9*8($tptr),%r9		# +=t[9]
29507bded2dbSJung-uk Kim	adcx	10*8($tptr),%r10	# ...
29517bded2dbSJung-uk Kim	adcx	11*8($tptr),%r11
29527bded2dbSJung-uk Kim	adc	12*8($tptr),%r12
29537bded2dbSJung-uk Kim	adc	13*8($tptr),%r13
29547bded2dbSJung-uk Kim	adc	14*8($tptr),%r14
29557bded2dbSJung-uk Kim	adc	15*8($tptr),%r15
29567bded2dbSJung-uk Kim	lea	($aptr),$aaptr
29577bded2dbSJung-uk Kim	lea	2*64($tptr),$tptr
29587bded2dbSJung-uk Kim	sbb	%rax,%rax		# mov %cf,$carry
29597bded2dbSJung-uk Kim
29607bded2dbSJung-uk Kim	mov	-64($aptr),%rdx		# a[0]
29617bded2dbSJung-uk Kim	mov	%rax,16+8(%rsp)		# offload $carry
29627bded2dbSJung-uk Kim	mov	$tptr,24+8(%rsp)
29637bded2dbSJung-uk Kim
29647bded2dbSJung-uk Kim	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
29657bded2dbSJung-uk Kim	xor	%eax,%eax		# cf=0, of=0
29667bded2dbSJung-uk Kim	jmp	.Lsqrx8x_loop
29677bded2dbSJung-uk Kim
29687bded2dbSJung-uk Kim.align	32
29697bded2dbSJung-uk Kim.Lsqrx8x_loop:
29707bded2dbSJung-uk Kim	mov	%r8,%rbx
29717bded2dbSJung-uk Kim	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
29727bded2dbSJung-uk Kim	adcx	%rax,%rbx		# +=t[8]
29737bded2dbSJung-uk Kim	adox	%r9,%r8
29747bded2dbSJung-uk Kim
29757bded2dbSJung-uk Kim	mulx	1*8($aaptr),%rax,%r9	# ...
29767bded2dbSJung-uk Kim	adcx	%rax,%r8
29777bded2dbSJung-uk Kim	adox	%r10,%r9
29787bded2dbSJung-uk Kim
29797bded2dbSJung-uk Kim	mulx	2*8($aaptr),%rax,%r10
29807bded2dbSJung-uk Kim	adcx	%rax,%r9
29817bded2dbSJung-uk Kim	adox	%r11,%r10
29827bded2dbSJung-uk Kim
29837bded2dbSJung-uk Kim	mulx	3*8($aaptr),%rax,%r11
29847bded2dbSJung-uk Kim	adcx	%rax,%r10
29857bded2dbSJung-uk Kim	adox	%r12,%r11
29867bded2dbSJung-uk Kim
29877bded2dbSJung-uk Kim	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
29887bded2dbSJung-uk Kim	adcx	%rax,%r11
29897bded2dbSJung-uk Kim	adox	%r13,%r12
29907bded2dbSJung-uk Kim
29917bded2dbSJung-uk Kim	mulx	5*8($aaptr),%rax,%r13
29927bded2dbSJung-uk Kim	adcx	%rax,%r12
29937bded2dbSJung-uk Kim	adox	%r14,%r13
29947bded2dbSJung-uk Kim
29957bded2dbSJung-uk Kim	mulx	6*8($aaptr),%rax,%r14
29967bded2dbSJung-uk Kim	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
29977bded2dbSJung-uk Kim	 mov	\$0,%ebx
29987bded2dbSJung-uk Kim	adcx	%rax,%r13
29997bded2dbSJung-uk Kim	adox	%r15,%r14
30007bded2dbSJung-uk Kim
30017bded2dbSJung-uk Kim	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
30027bded2dbSJung-uk Kim	 mov	8($aptr,%rcx,8),%rdx	# a[i]
30037bded2dbSJung-uk Kim	adcx	%rax,%r14
30047bded2dbSJung-uk Kim	adox	%rbx,%r15		# %rbx is 0, of=0
30057bded2dbSJung-uk Kim	adcx	%rbx,%r15		# cf=0
30067bded2dbSJung-uk Kim
30077bded2dbSJung-uk Kim	.byte	0x67
30087bded2dbSJung-uk Kim	inc	%rcx			# of=0
30097bded2dbSJung-uk Kim	jnz	.Lsqrx8x_loop
30107bded2dbSJung-uk Kim
30117bded2dbSJung-uk Kim	lea	8*8($aaptr),$aaptr
30127bded2dbSJung-uk Kim	mov	\$-8,%rcx
30137bded2dbSJung-uk Kim	cmp	8+8(%rsp),$aaptr	# done?
30147bded2dbSJung-uk Kim	je	.Lsqrx8x_break
30157bded2dbSJung-uk Kim
30167bded2dbSJung-uk Kim	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
30177bded2dbSJung-uk Kim	.byte	0x66
30187bded2dbSJung-uk Kim	mov	-64($aptr),%rdx
30197bded2dbSJung-uk Kim	adcx	0*8($tptr),%r8
30207bded2dbSJung-uk Kim	adcx	1*8($tptr),%r9
30217bded2dbSJung-uk Kim	adc	2*8($tptr),%r10
30227bded2dbSJung-uk Kim	adc	3*8($tptr),%r11
30237bded2dbSJung-uk Kim	adc	4*8($tptr),%r12
30247bded2dbSJung-uk Kim	adc	5*8($tptr),%r13
30257bded2dbSJung-uk Kim	adc	6*8($tptr),%r14
30267bded2dbSJung-uk Kim	adc	7*8($tptr),%r15
30277bded2dbSJung-uk Kim	lea	8*8($tptr),$tptr
30287bded2dbSJung-uk Kim	.byte	0x67
30297bded2dbSJung-uk Kim	sbb	%rax,%rax		# mov %cf,%rax
30307bded2dbSJung-uk Kim	xor	%ebx,%ebx		# cf=0, of=0
30317bded2dbSJung-uk Kim	mov	%rax,16+8(%rsp)		# offload carry
30327bded2dbSJung-uk Kim	jmp	.Lsqrx8x_loop
30337bded2dbSJung-uk Kim
30347bded2dbSJung-uk Kim.align	32
30357bded2dbSJung-uk Kim.Lsqrx8x_break:
303647902a71SJung-uk Kim	xor	$zero,$zero
303747902a71SJung-uk Kim	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
303847902a71SJung-uk Kim	adcx	$zero,%r8
30397bded2dbSJung-uk Kim	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
304047902a71SJung-uk Kim	adcx	$zero,%r9
30417bded2dbSJung-uk Kim	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
304247902a71SJung-uk Kim	adc	\$0,%r10
30437bded2dbSJung-uk Kim	mov	%r8,0*8($tptr)
304447902a71SJung-uk Kim	adc	\$0,%r11
304547902a71SJung-uk Kim	adc	\$0,%r12
304647902a71SJung-uk Kim	adc	\$0,%r13
304747902a71SJung-uk Kim	adc	\$0,%r14
304847902a71SJung-uk Kim	adc	\$0,%r15
30497bded2dbSJung-uk Kim	cmp	$carry,$tptr		# cf=0, of=0
30507bded2dbSJung-uk Kim	je	.Lsqrx8x_outer_loop
30517bded2dbSJung-uk Kim
30527bded2dbSJung-uk Kim	mov	%r9,1*8($tptr)
30537bded2dbSJung-uk Kim	 mov	1*8($carry),%r9
30547bded2dbSJung-uk Kim	mov	%r10,2*8($tptr)
30557bded2dbSJung-uk Kim	 mov	2*8($carry),%r10
30567bded2dbSJung-uk Kim	mov	%r11,3*8($tptr)
30577bded2dbSJung-uk Kim	 mov	3*8($carry),%r11
30587bded2dbSJung-uk Kim	mov	%r12,4*8($tptr)
30597bded2dbSJung-uk Kim	 mov	4*8($carry),%r12
30607bded2dbSJung-uk Kim	mov	%r13,5*8($tptr)
30617bded2dbSJung-uk Kim	 mov	5*8($carry),%r13
30627bded2dbSJung-uk Kim	mov	%r14,6*8($tptr)
30637bded2dbSJung-uk Kim	 mov	6*8($carry),%r14
30647bded2dbSJung-uk Kim	mov	%r15,7*8($tptr)
30657bded2dbSJung-uk Kim	 mov	7*8($carry),%r15
30667bded2dbSJung-uk Kim	mov	$carry,$tptr
30677bded2dbSJung-uk Kim	jmp	.Lsqrx8x_outer_loop
30687bded2dbSJung-uk Kim
30697bded2dbSJung-uk Kim.align	32
30707bded2dbSJung-uk Kim.Lsqrx8x_outer_break:
30717bded2dbSJung-uk Kim	mov	%r9,9*8($tptr)		# t[9]
30727bded2dbSJung-uk Kim	 movq	%xmm3,%rcx		# -$num
30737bded2dbSJung-uk Kim	mov	%r10,10*8($tptr)	# ...
30747bded2dbSJung-uk Kim	mov	%r11,11*8($tptr)
30757bded2dbSJung-uk Kim	mov	%r12,12*8($tptr)
30767bded2dbSJung-uk Kim	mov	%r13,13*8($tptr)
30777bded2dbSJung-uk Kim	mov	%r14,14*8($tptr)
30787bded2dbSJung-uk Kim___
30797bded2dbSJung-uk Kim}{
30807bded2dbSJung-uk Kimmy $i="%rcx";
30817bded2dbSJung-uk Kim$code.=<<___;
30827bded2dbSJung-uk Kim	lea	48+8(%rsp),$tptr
30837bded2dbSJung-uk Kim	mov	($aptr,$i),%rdx		# a[0]
30847bded2dbSJung-uk Kim
30857bded2dbSJung-uk Kim	mov	8($tptr),$A0[1]		# t[1]
30867bded2dbSJung-uk Kim	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
30877bded2dbSJung-uk Kim	mov	0+8(%rsp),$num		# restore $num
30887bded2dbSJung-uk Kim	adox	$A0[1],$A0[1]
30897bded2dbSJung-uk Kim	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
30907bded2dbSJung-uk Kim	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
30917bded2dbSJung-uk Kim	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
30927bded2dbSJung-uk Kim
30937bded2dbSJung-uk Kim.align	32
30947bded2dbSJung-uk Kim.Lsqrx4x_shift_n_add:
30957bded2dbSJung-uk Kim	mulx	%rdx,%rax,%rbx
30967bded2dbSJung-uk Kim	 adox	$A1[0],$A1[0]
30977bded2dbSJung-uk Kim	adcx	$A0[0],%rax
30987bded2dbSJung-uk Kim	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
30997bded2dbSJung-uk Kim	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
31007bded2dbSJung-uk Kim	 adox	$A1[1],$A1[1]
31017bded2dbSJung-uk Kim	adcx	$A0[1],%rbx
31027bded2dbSJung-uk Kim	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
31037bded2dbSJung-uk Kim	mov	%rax,0($tptr)
31047bded2dbSJung-uk Kim	mov	%rbx,8($tptr)
31057bded2dbSJung-uk Kim
31067bded2dbSJung-uk Kim	mulx	%rdx,%rax,%rbx
31077bded2dbSJung-uk Kim	 adox	$A0[0],$A0[0]
31087bded2dbSJung-uk Kim	adcx	$A1[0],%rax
31097bded2dbSJung-uk Kim	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
31107bded2dbSJung-uk Kim	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
31117bded2dbSJung-uk Kim	 adox	$A0[1],$A0[1]
31127bded2dbSJung-uk Kim	adcx	$A1[1],%rbx
31137bded2dbSJung-uk Kim	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
31147bded2dbSJung-uk Kim	mov	%rax,16($tptr)
31157bded2dbSJung-uk Kim	mov	%rbx,24($tptr)
31167bded2dbSJung-uk Kim
31177bded2dbSJung-uk Kim	mulx	%rdx,%rax,%rbx
31187bded2dbSJung-uk Kim	 adox	$A1[0],$A1[0]
31197bded2dbSJung-uk Kim	adcx	$A0[0],%rax
31207bded2dbSJung-uk Kim	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
31217bded2dbSJung-uk Kim	 lea	32($i),$i
31227bded2dbSJung-uk Kim	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
31237bded2dbSJung-uk Kim	 adox	$A1[1],$A1[1]
31247bded2dbSJung-uk Kim	adcx	$A0[1],%rbx
31257bded2dbSJung-uk Kim	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
31267bded2dbSJung-uk Kim	mov	%rax,32($tptr)
31277bded2dbSJung-uk Kim	mov	%rbx,40($tptr)
31287bded2dbSJung-uk Kim
31297bded2dbSJung-uk Kim	mulx	%rdx,%rax,%rbx
31307bded2dbSJung-uk Kim	 adox	$A0[0],$A0[0]
31317bded2dbSJung-uk Kim	adcx	$A1[0],%rax
31327bded2dbSJung-uk Kim	jrcxz	.Lsqrx4x_shift_n_add_break
31337bded2dbSJung-uk Kim	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
31347bded2dbSJung-uk Kim	 adox	$A0[1],$A0[1]
31357bded2dbSJung-uk Kim	adcx	$A1[1],%rbx
31367bded2dbSJung-uk Kim	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
31377bded2dbSJung-uk Kim	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
31387bded2dbSJung-uk Kim	mov	%rax,48($tptr)
31397bded2dbSJung-uk Kim	mov	%rbx,56($tptr)
31407bded2dbSJung-uk Kim	lea	64($tptr),$tptr
31417bded2dbSJung-uk Kim	nop
31427bded2dbSJung-uk Kim	jmp	.Lsqrx4x_shift_n_add
31437bded2dbSJung-uk Kim
31447bded2dbSJung-uk Kim.align	32
31457bded2dbSJung-uk Kim.Lsqrx4x_shift_n_add_break:
31467bded2dbSJung-uk Kim	adcx	$A1[1],%rbx
31477bded2dbSJung-uk Kim	mov	%rax,48($tptr)
31487bded2dbSJung-uk Kim	mov	%rbx,56($tptr)
31497bded2dbSJung-uk Kim	lea	64($tptr),$tptr		# end of t[] buffer
31507bded2dbSJung-uk Kim___
31517bded2dbSJung-uk Kim}
31527bded2dbSJung-uk Kim######################################################################
31537bded2dbSJung-uk Kim# Montgomery reduction part, "word-by-word" algorithm.
31547bded2dbSJung-uk Kim#
31557bded2dbSJung-uk Kim# This new path is inspired by multiple submissions from Intel, by
31567bded2dbSJung-uk Kim# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
31577bded2dbSJung-uk Kim# Vinodh Gopal...
31587bded2dbSJung-uk Kim{
31597bded2dbSJung-uk Kimmy ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
31607bded2dbSJung-uk Kim
# Word-by-word Montgomery reduction of t[] by n[], eight words of t[]
# retired per outer iteration (.Lsqrx8x_reduction_loop).
#
# Stack slots as used by the code below:
#   0+8(%rsp)   end of n[] less one 8-word block (loop bound for $nptr)
#   8+8(%rsp)   end of t[] ($tptr as it is on entry)
#   16+8(%rsp)  carry saved between 8-word tail passes (0 or -1)
#   24+8(%rsp)  top-most carry produced by the previous outer iteration
#   32+8(%rsp)  n0
#   48+8(%rsp)  start of t[]; its first eight words are overwritten with
#               the eight multipliers computed in .Lsqrx8x_reduce so that
#               .Lsqrx8x_tail can pull them back for the rest of n[]
#
# Inside the loops %r8-%r15 hold the current 8-word window, %rdx holds
# the current multiplier for mulx, %rcx counts from -8 up to 0, and
# $carry is kept at zero so it can terminate both the adcx and the adox
# chain.
#
# On return %rax holds the top-most carry and $tptr/%rcx are as left by
# the last outer iteration.
31617bded2dbSJung-uk Kim$code.=<<___;
31627bded2dbSJung-uk Kim	movq	%xmm2,$nptr
31634c6a0400SJung-uk Kim__bn_sqrx8x_reduction:
31647bded2dbSJung-uk Kim	xor	%eax,%eax		# initial top-most carry bit
31657bded2dbSJung-uk Kim	mov	32+8(%rsp),%rbx		# n0
31667bded2dbSJung-uk Kim	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
31674c6a0400SJung-uk Kim	lea	-8*8($nptr,$num),%rcx	# end of n[]
31687bded2dbSJung-uk Kim	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
31697bded2dbSJung-uk Kim	mov	%rcx, 0+8(%rsp)		# save end of n[]
31707bded2dbSJung-uk Kim	mov	$tptr,8+8(%rsp)		# save end of t[]
31717bded2dbSJung-uk Kim
31727bded2dbSJung-uk Kim	lea	48+8(%rsp),$tptr		# initial t[] window
31737bded2dbSJung-uk Kim	jmp	.Lsqrx8x_reduction_loop
31747bded2dbSJung-uk Kim
31757bded2dbSJung-uk Kim.align	32
31767bded2dbSJung-uk Kim.Lsqrx8x_reduction_loop:
	# load the 8-word window; %rdx already holds its first word
31777bded2dbSJung-uk Kim	mov	8*1($tptr),%r9
31787bded2dbSJung-uk Kim	mov	8*2($tptr),%r10
31797bded2dbSJung-uk Kim	mov	8*3($tptr),%r11
31807bded2dbSJung-uk Kim	mov	8*4($tptr),%r12
31817bded2dbSJung-uk Kim	mov	%rdx,%r8
31827bded2dbSJung-uk Kim	imulq	%rbx,%rdx		# n0*a[i]
31837bded2dbSJung-uk Kim	mov	8*5($tptr),%r13
31847bded2dbSJung-uk Kim	mov	8*6($tptr),%r14
31857bded2dbSJung-uk Kim	mov	8*7($tptr),%r15
31867bded2dbSJung-uk Kim	mov	%rax,24+8(%rsp)		# store top-most carry bit
31877bded2dbSJung-uk Kim
31887bded2dbSJung-uk Kim	lea	8*8($tptr),$tptr
31897bded2dbSJung-uk Kim	xor	$carry,$carry		# cf=0,of=0
31907bded2dbSJung-uk Kim	mov	\$-8,%rcx
31917bded2dbSJung-uk Kim	jmp	.Lsqrx8x_reduce
31927bded2dbSJung-uk Kim
31937bded2dbSJung-uk Kim.align	32
31947bded2dbSJung-uk Kim.Lsqrx8x_reduce:
	# one pass: window += multiplier * n[0..7], shifted down one word;
	# the multiplier for the next pass is computed in the middle
31957bded2dbSJung-uk Kim	mov	%r8, %rbx
31964c6a0400SJung-uk Kim	mulx	8*0($nptr),%rax,%r8	# n[0]
31977bded2dbSJung-uk Kim	adcx	%rbx,%rax		# discarded
31987bded2dbSJung-uk Kim	adox	%r9,%r8
31997bded2dbSJung-uk Kim
32004c6a0400SJung-uk Kim	mulx	8*1($nptr),%rbx,%r9	# n[1]
32017bded2dbSJung-uk Kim	adcx	%rbx,%r8
32027bded2dbSJung-uk Kim	adox	%r10,%r9
32037bded2dbSJung-uk Kim
32044c6a0400SJung-uk Kim	mulx	8*2($nptr),%rbx,%r10
32057bded2dbSJung-uk Kim	adcx	%rbx,%r9
32067bded2dbSJung-uk Kim	adox	%r11,%r10
32077bded2dbSJung-uk Kim
32084c6a0400SJung-uk Kim	mulx	8*3($nptr),%rbx,%r11
32097bded2dbSJung-uk Kim	adcx	%rbx,%r10
32107bded2dbSJung-uk Kim	adox	%r12,%r11
32117bded2dbSJung-uk Kim
32124c6a0400SJung-uk Kim	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
32137bded2dbSJung-uk Kim	 mov	%rdx,%rax		# park current multiplier
32147bded2dbSJung-uk Kim	 mov	%r8,%rdx		# new low word feeds the next multiplier
32157bded2dbSJung-uk Kim	adcx	%rbx,%r11
32167bded2dbSJung-uk Kim	adox	%r13,%r12
32177bded2dbSJung-uk Kim
32187bded2dbSJung-uk Kim	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
32197bded2dbSJung-uk Kim	 mov	%rax,%rdx		# back to current multiplier
32207bded2dbSJung-uk Kim	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
32217bded2dbSJung-uk Kim
32224c6a0400SJung-uk Kim	mulx	8*5($nptr),%rax,%r13
32237bded2dbSJung-uk Kim	adcx	%rax,%r12
32247bded2dbSJung-uk Kim	adox	%r14,%r13
32257bded2dbSJung-uk Kim
32264c6a0400SJung-uk Kim	mulx	8*6($nptr),%rax,%r14
32277bded2dbSJung-uk Kim	adcx	%rax,%r13
32287bded2dbSJung-uk Kim	adox	%r15,%r14
32297bded2dbSJung-uk Kim
32304c6a0400SJung-uk Kim	mulx	8*7($nptr),%rax,%r15
32317bded2dbSJung-uk Kim	 mov	%rbx,%rdx		# switch to the next multiplier
32327bded2dbSJung-uk Kim	adcx	%rax,%r14
32337bded2dbSJung-uk Kim	adox	$carry,%r15		# $carry is 0
32347bded2dbSJung-uk Kim	adcx	$carry,%r15		# cf=0
32357bded2dbSJung-uk Kim
32367bded2dbSJung-uk Kim	.byte	0x67,0x67,0x67
32377bded2dbSJung-uk Kim	inc	%rcx			# of=0
32387bded2dbSJung-uk Kim	jnz	.Lsqrx8x_reduce
32397bded2dbSJung-uk Kim
32407bded2dbSJung-uk Kim	mov	$carry,%rax		# xor	%rax,%rax
32417bded2dbSJung-uk Kim	cmp	0+8(%rsp),$nptr		# end of n[]?
32427bded2dbSJung-uk Kim	jae	.Lsqrx8x_no_tail
32437bded2dbSJung-uk Kim
32447bded2dbSJung-uk Kim	mov	48+8(%rsp),%rdx		# pull n0*a[0]
32457bded2dbSJung-uk Kim	add	8*0($tptr),%r8
32464c6a0400SJung-uk Kim	lea	8*8($nptr),$nptr
32477bded2dbSJung-uk Kim	mov	\$-8,%rcx
32487bded2dbSJung-uk Kim	adcx	8*1($tptr),%r9
32497bded2dbSJung-uk Kim	adcx	8*2($tptr),%r10
32507bded2dbSJung-uk Kim	adc	8*3($tptr),%r11
32517bded2dbSJung-uk Kim	adc	8*4($tptr),%r12
32527bded2dbSJung-uk Kim	adc	8*5($tptr),%r13
32537bded2dbSJung-uk Kim	adc	8*6($tptr),%r14
32547bded2dbSJung-uk Kim	adc	8*7($tptr),%r15
32557bded2dbSJung-uk Kim	lea	8*8($tptr),$tptr
32567bded2dbSJung-uk Kim	sbb	%rax,%rax		# top carry
32577bded2dbSJung-uk Kim
32587bded2dbSJung-uk Kim	xor	$carry,$carry		# of=0, cf=0
32597bded2dbSJung-uk Kim	mov	%rax,16+8(%rsp)
32607bded2dbSJung-uk Kim	jmp	.Lsqrx8x_tail
32617bded2dbSJung-uk Kim
32627bded2dbSJung-uk Kim.align	32
32637bded2dbSJung-uk Kim.Lsqrx8x_tail:
	# apply the eight saved multipliers to the next 8-word block of n[]
32647bded2dbSJung-uk Kim	mov	%r8,%rbx
32654c6a0400SJung-uk Kim	mulx	8*0($nptr),%rax,%r8
32667bded2dbSJung-uk Kim	adcx	%rax,%rbx
32677bded2dbSJung-uk Kim	adox	%r9,%r8
32687bded2dbSJung-uk Kim
32694c6a0400SJung-uk Kim	mulx	8*1($nptr),%rax,%r9
32707bded2dbSJung-uk Kim	adcx	%rax,%r8
32717bded2dbSJung-uk Kim	adox	%r10,%r9
32727bded2dbSJung-uk Kim
32734c6a0400SJung-uk Kim	mulx	8*2($nptr),%rax,%r10
32747bded2dbSJung-uk Kim	adcx	%rax,%r9
32757bded2dbSJung-uk Kim	adox	%r11,%r10
32767bded2dbSJung-uk Kim
32774c6a0400SJung-uk Kim	mulx	8*3($nptr),%rax,%r11
32787bded2dbSJung-uk Kim	adcx	%rax,%r10
32797bded2dbSJung-uk Kim	adox	%r12,%r11
32807bded2dbSJung-uk Kim
32814c6a0400SJung-uk Kim	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
32827bded2dbSJung-uk Kim	adcx	%rax,%r11
32837bded2dbSJung-uk Kim	adox	%r13,%r12
32847bded2dbSJung-uk Kim
32854c6a0400SJung-uk Kim	mulx	8*5($nptr),%rax,%r13
32867bded2dbSJung-uk Kim	adcx	%rax,%r12
32877bded2dbSJung-uk Kim	adox	%r14,%r13
32887bded2dbSJung-uk Kim
32894c6a0400SJung-uk Kim	mulx	8*6($nptr),%rax,%r14
32907bded2dbSJung-uk Kim	adcx	%rax,%r13
32917bded2dbSJung-uk Kim	adox	%r15,%r14
32927bded2dbSJung-uk Kim
32934c6a0400SJung-uk Kim	mulx	8*7($nptr),%rax,%r15
32947bded2dbSJung-uk Kim	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
32957bded2dbSJung-uk Kim	adcx	%rax,%r14
32967bded2dbSJung-uk Kim	adox	$carry,%r15
32977bded2dbSJung-uk Kim	 mov	%rbx,($tptr,%rcx,8)	# save result
32987bded2dbSJung-uk Kim	 mov	%r8,%rbx
32997bded2dbSJung-uk Kim	adcx	$carry,%r15		# cf=0
33007bded2dbSJung-uk Kim
33017bded2dbSJung-uk Kim	inc	%rcx			# of=0
33027bded2dbSJung-uk Kim	jnz	.Lsqrx8x_tail
33037bded2dbSJung-uk Kim
33047bded2dbSJung-uk Kim	cmp	0+8(%rsp),$nptr		# end of n[]?
33057bded2dbSJung-uk Kim	jae	.Lsqrx8x_tail_done	# break out of loop
33067bded2dbSJung-uk Kim
33077bded2dbSJung-uk Kim	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
33087bded2dbSJung-uk Kim	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
33094c6a0400SJung-uk Kim	 lea	8*8($nptr),$nptr
33107bded2dbSJung-uk Kim	adc	8*0($tptr),%r8
33117bded2dbSJung-uk Kim	adc	8*1($tptr),%r9
33127bded2dbSJung-uk Kim	adc	8*2($tptr),%r10
33137bded2dbSJung-uk Kim	adc	8*3($tptr),%r11
33147bded2dbSJung-uk Kim	adc	8*4($tptr),%r12
33157bded2dbSJung-uk Kim	adc	8*5($tptr),%r13
33167bded2dbSJung-uk Kim	adc	8*6($tptr),%r14
33177bded2dbSJung-uk Kim	adc	8*7($tptr),%r15
33187bded2dbSJung-uk Kim	lea	8*8($tptr),$tptr
33197bded2dbSJung-uk Kim	sbb	%rax,%rax
33207bded2dbSJung-uk Kim	sub	\$8,%rcx		# mov	\$-8,%rcx
33217bded2dbSJung-uk Kim
33227bded2dbSJung-uk Kim	xor	$carry,$carry		# of=0, cf=0
33237bded2dbSJung-uk Kim	mov	%rax,16+8(%rsp)
33247bded2dbSJung-uk Kim	jmp	.Lsqrx8x_tail
33257bded2dbSJung-uk Kim
33267bded2dbSJung-uk Kim.align	32
33277bded2dbSJung-uk Kim.Lsqrx8x_tail_done:
	# fold in the top-most carry saved at 24+8(%rsp), rippling it through
	# the whole window and catching any carry-out in %rax
33286cf8931aSJung-uk Kim	xor	%rax,%rax
33297bded2dbSJung-uk Kim	add	24+8(%rsp),%r8		# can this overflow?
333080815a77SJung-uk Kim	adc	\$0,%r9
333180815a77SJung-uk Kim	adc	\$0,%r10
333280815a77SJung-uk Kim	adc	\$0,%r11
333380815a77SJung-uk Kim	adc	\$0,%r12
333480815a77SJung-uk Kim	adc	\$0,%r13
333580815a77SJung-uk Kim	adc	\$0,%r14
33366cf8931aSJung-uk Kim	adc	\$0,%r15
33376cf8931aSJung-uk Kim	adc	\$0,%rax
33387bded2dbSJung-uk Kim
33397bded2dbSJung-uk Kim	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
33407bded2dbSJung-uk Kim.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
33417bded2dbSJung-uk Kim	adc	8*0($tptr),%r8
33427bded2dbSJung-uk Kim	 movq	%xmm3,%rcx
33437bded2dbSJung-uk Kim	adc	8*1($tptr),%r9
33444c6a0400SJung-uk Kim	 mov	8*7($nptr),$carry
33457bded2dbSJung-uk Kim	 movq	%xmm2,$nptr		# restore $nptr
33467bded2dbSJung-uk Kim	adc	8*2($tptr),%r10
33477bded2dbSJung-uk Kim	adc	8*3($tptr),%r11
33487bded2dbSJung-uk Kim	adc	8*4($tptr),%r12
33497bded2dbSJung-uk Kim	adc	8*5($tptr),%r13
33507bded2dbSJung-uk Kim	adc	8*6($tptr),%r14
33517bded2dbSJung-uk Kim	adc	8*7($tptr),%r15
33526cf8931aSJung-uk Kim	adc	\$0,%rax		# top-most carry
33537bded2dbSJung-uk Kim
33547bded2dbSJung-uk Kim	mov	32+8(%rsp),%rbx		# n0
33557bded2dbSJung-uk Kim	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
33567bded2dbSJung-uk Kim
33577bded2dbSJung-uk Kim	mov	%r8,8*0($tptr)		# store top 512 bits
33587bded2dbSJung-uk Kim	 lea	8*8($tptr),%r8		# borrow %r8
33597bded2dbSJung-uk Kim	mov	%r9,8*1($tptr)
33607bded2dbSJung-uk Kim	mov	%r10,8*2($tptr)
33617bded2dbSJung-uk Kim	mov	%r11,8*3($tptr)
33627bded2dbSJung-uk Kim	mov	%r12,8*4($tptr)
33637bded2dbSJung-uk Kim	mov	%r13,8*5($tptr)
33647bded2dbSJung-uk Kim	mov	%r14,8*6($tptr)
33657bded2dbSJung-uk Kim	mov	%r15,8*7($tptr)
33667bded2dbSJung-uk Kim
33677bded2dbSJung-uk Kim	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
33687bded2dbSJung-uk Kim	cmp	8+8(%rsp),%r8		# end of t[]?
33697bded2dbSJung-uk Kim	jb	.Lsqrx8x_reduction_loop
33704c6a0400SJung-uk Kim	ret
33716935a639SJung-uk Kim.cfi_endproc
33724c6a0400SJung-uk Kim.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
33737bded2dbSJung-uk Kim___
33747bded2dbSJung-uk Kim}
33757bded2dbSJung-uk Kim##############################################################
33767bded2dbSJung-uk Kim# Post-condition, 4x unrolled
33777bded2dbSJung-uk Kim#
33787bded2dbSJung-uk Kim{
33797bded2dbSJung-uk Kimmy ($rptr,$nptr)=("%rdx","%rbp");
# __bn_postx4x_internal: branch-free conditional subtraction of n[] from
# t[], four words per iteration, with the result written to $rptr.
#
# Inputs as read by the code below:
#   %rax   negated on entry and used as an AND mask over the complemented
#          modulus words: 0 leaves t[] unchanged, 1 (mask of all ones)
#          subtracts n[].  Presumably this is the top-most carry left in
#          %rax by __bn_sqrx8x_reduction; confirm against the callers.
#   %rcx   -$num in bytes; copied to %r9 and %r10, then turned into a
#          negative count of 4-word groups
#   %xmm1  result pointer, moved into both $rptr and $aptr
#   $nptr  modulus n[]
#   $tptr  value to be corrected, as set up by the caller
#
# The subtraction is done as t + ~n + 1: the "+1" is folded into the
# first word by decrementing n[0] before it is complemented, so the first
# word contributes -n[0] and every later word contributes ~n[i].  The
# carry between 4-word groups travels in %r8 as 0 or -1.
#
# On return %r9 holds $num again and %rcx is zero.
33807bded2dbSJung-uk Kim$code.=<<___;
33814c6a0400SJung-uk Kim.align	32
33824c6a0400SJung-uk Kim__bn_postx4x_internal:
338317f01e99SJung-uk Kim.cfi_startproc
33844c6a0400SJung-uk Kim	mov	8*0($nptr),%r12
33857bded2dbSJung-uk Kim	mov	%rcx,%r10		# -$num
33867bded2dbSJung-uk Kim	mov	%rcx,%r9		# -$num
33874c6a0400SJung-uk Kim	neg	%rax			# 0 stays 0, 1 becomes all ones
33884c6a0400SJung-uk Kim	sar	\$3+2,%rcx		# -$num/32, i.e. minus the number of 4-word groups
33897bded2dbSJung-uk Kim	#lea	48+8(%rsp,%r9),$tptr
33907bded2dbSJung-uk Kim	movq	%xmm1,$rptr		# restore $rptr
33917bded2dbSJung-uk Kim	movq	%xmm1,$aptr		# prepare for back-to-back call
33924c6a0400SJung-uk Kim	dec	%r12			# so that after 'not' we get -n[0]
33934c6a0400SJung-uk Kim	mov	8*1($nptr),%r13
33944c6a0400SJung-uk Kim	xor	%r8,%r8			# no carry into the first group
33954c6a0400SJung-uk Kim	mov	8*2($nptr),%r14
33964c6a0400SJung-uk Kim	mov	8*3($nptr),%r15
33974c6a0400SJung-uk Kim	jmp	.Lsqrx4x_sub_entry
33987bded2dbSJung-uk Kim
33994c6a0400SJung-uk Kim.align	16
34007bded2dbSJung-uk Kim.Lsqrx4x_sub:
34014c6a0400SJung-uk Kim	mov	8*0($nptr),%r12
34024c6a0400SJung-uk Kim	mov	8*1($nptr),%r13
34034c6a0400SJung-uk Kim	mov	8*2($nptr),%r14
34044c6a0400SJung-uk Kim	mov	8*3($nptr),%r15
34054c6a0400SJung-uk Kim.Lsqrx4x_sub_entry:
34064c6a0400SJung-uk Kim	andn	%rax,%r12,%r12		# complement the modulus word, then apply the mask
34074c6a0400SJung-uk Kim	lea	8*4($nptr),$nptr
34084c6a0400SJung-uk Kim	andn	%rax,%r13,%r13
34094c6a0400SJung-uk Kim	andn	%rax,%r14,%r14
34104c6a0400SJung-uk Kim	andn	%rax,%r15,%r15
34114c6a0400SJung-uk Kim
34124c6a0400SJung-uk Kim	neg	%r8			# mov %r8,%cf
34134c6a0400SJung-uk Kim	adc	8*0($tptr),%r12
34144c6a0400SJung-uk Kim	adc	8*1($tptr),%r13
34154c6a0400SJung-uk Kim	adc	8*2($tptr),%r14
34164c6a0400SJung-uk Kim	adc	8*3($tptr),%r15
34177bded2dbSJung-uk Kim	mov	%r12,8*0($rptr)
34184c6a0400SJung-uk Kim	lea	8*4($tptr),$tptr
34197bded2dbSJung-uk Kim	mov	%r13,8*1($rptr)
34204c6a0400SJung-uk Kim	sbb	%r8,%r8			# mov %cf,%r8
34217bded2dbSJung-uk Kim	mov	%r14,8*2($rptr)
34227bded2dbSJung-uk Kim	mov	%r15,8*3($rptr)
34237bded2dbSJung-uk Kim	lea	8*4($rptr),$rptr
34247bded2dbSJung-uk Kim
34257bded2dbSJung-uk Kim	inc	%rcx
34267bded2dbSJung-uk Kim	jnz	.Lsqrx4x_sub
34274c6a0400SJung-uk Kim
34287bded2dbSJung-uk Kim	neg	%r9			# restore $num
34297bded2dbSJung-uk Kim
34307bded2dbSJung-uk Kim	ret
343117f01e99SJung-uk Kim.cfi_endproc
34324c6a0400SJung-uk Kim.size	__bn_postx4x_internal,.-__bn_postx4x_internal
34331f13597dSJung-uk Kim___
34344c6a0400SJung-uk Kim}
34351f13597dSJung-uk Kim}}}
34361f13597dSJung-uk Kim{
34377bded2dbSJung-uk Kimmy ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
34387bded2dbSJung-uk Kim				("%rdi","%esi","%rdx","%ecx");  # Unix order
34391f13597dSJung-uk Kimmy $out=$inp;
34401f13597dSJung-uk Kimmy $STRIDE=2**5*8;
34411f13597dSJung-uk Kimmy $N=$STRIDE/4;
34421f13597dSJung-uk Kim
34431f13597dSJung-uk Kim$code.=<<___;
.globl	bn_get_bits5
.type	bn_get_bits5,\@abi-omnipotent
.align	16
# bn_get_bits5: return in %eax the five bits that start at bit offset
# $num of the little-endian bit string at $inp.  No branches are taken.
#
# The offset is split into a 16-bit word index (offset/16) and a bit
# position inside that word (offset%16).  Positions 12..15 would run past
# the end of the 16-bit word, so for those the load address is moved up
# by one byte and the position is reduced by eight.
bn_get_bits5:
.cfi_startproc
	mov	$inp,%r10		# load base when the field fits in the word
	lea	1($inp),%r11		# load base moved up by one byte
	mov	$num,%ecx
	and	\$15,%ecx		# bit position inside the 16-bit word
	shr	\$4,$num		# index of the 16-bit word
	cmp	\$11,%ecx		# above 11: field straddles the word
	lea	-8(%ecx),%eax		# position relative to the moved base; flags kept
	cmova	%r11,%r10
	cmova	%eax,%ecx
	movzw	(%r10,$num,2),%eax
	shr	%cl,%eax
	and	\$31,%eax		# keep five bits
	ret
.cfi_endproc
.size	bn_get_bits5,.-bn_get_bits5
34647bded2dbSJung-uk Kim
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
# bn_scatter5: copy $num 64-bit words from $inp into column $idx of the
# table at $tbl.  Consecutive words land 32*8 bytes apart, i.e. one table
# row of 32 eight-byte slots per word.  A zero $num stores nothing.
bn_scatter5:
.cfi_startproc
	test	$num,$num		# nothing to copy?
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl	# slot $idx of the first row
.Lscatter:
	mov	($inp),%rax
	mov	%rax,($tbl)
	lea	8($inp),$inp		# next source word
	lea	32*8($tbl),$tbl		# same slot, next row
	dec	$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5
34841f13597dSJung-uk Kim
# bn_gather5: write num 64-bit words to out, taking from each table row
# the 8-byte entry selected by idx.
#
# The selection never uses idx as an address.  A set of 16-byte masks is
# first built on the stack (each mask covers two neighbouring entries and
# is all-ones only in the half whose entry number equals idx).  The gather
# loop then reads every 16-byte chunk of the row at fixed offsets, ANDs it
# with its mask and ORs the results together, so the memory access pattern
# is the same for every idx.
#
# Stack use: the entry value of rsp is kept in r10, 0x108 bytes are
# reserved and rsp is rounded down to a multiple of 16; rsp is restored
# from r10 before returning.  The two prologue instructions are emitted as
# raw bytes because the Win64 unwind record .LSEH_info_bn_gather5 encodes
# their byte offsets (0x04 and 0x0b).
# NOTE(review): the movdqa loads from the table imply tbl is 16-byte
# aligned; num is assumed non-zero since the loop body runs before the
# count is tested - confirm against callers.
34851f13597dSJung-uk Kim.globl	bn_gather5
34861f13597dSJung-uk Kim.type	bn_gather5,\@abi-omnipotent
34874c6a0400SJung-uk Kim.align	32
34881f13597dSJung-uk Kimbn_gather5:
34894c6a0400SJung-uk Kim.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
349017f01e99SJung-uk Kim.cfi_startproc
34911f13597dSJung-uk Kim	# I can't trust assembler to use specific encoding:-(
34924c6a0400SJung-uk Kim	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
34934c6a0400SJung-uk Kim	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
34944c6a0400SJung-uk Kim	lea	.Linc(%rip),%rax
34954c6a0400SJung-uk Kim	and	\$-16,%rsp		# shouldn't be formally required
34964c6a0400SJung-uk Kim
	# xmm5 = idx in all four dwords, xmm0 = entry numbers 0,0,1,1,
	# xmm1 = 2,2,2,2; r11 and rax are biased by 128 so that the
	# offsets used below fit in a signed byte
34974c6a0400SJung-uk Kim	movd	$idx,%xmm5
34984c6a0400SJung-uk Kim	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
34994c6a0400SJung-uk Kim	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
35004c6a0400SJung-uk Kim	lea	128($tbl),%r11		# size optimization
35014c6a0400SJung-uk Kim	lea	128(%rsp),%rax		# size optimization
35024c6a0400SJung-uk Kim
35034c6a0400SJung-uk Kim	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	# xmm4 keeps the constant 2,2,2,2 for the whole mask phase
35044c6a0400SJung-uk Kim	movdqa	%xmm1,%xmm4
35054c6a0400SJung-uk Kim	movdqa	%xmm1,%xmm2
35064c6a0400SJung-uk Kim___
35074c6a0400SJung-uk Kim########################################################################
35084c6a0400SJung-uk Kim# calculate mask by comparing 0..31 to $idx and save result to stack
35094c6a0400SJung-uk Kim#
# Each pass emits code for four masks.  xmm0..xmm3 rotate between holding
# a pair of entry numbers and holding the resulting compare mask; after a
# register's mask has been stored it is reloaded with 2,2,2,2 (xmm4) and
# the paddd that follows turns it into the next pair of entry numbers.
# The store of the fourth mask (xmm3) is deferred: it is emitted at the top
# of the following pass (hence the "if ($i)" guard, which skips it on the
# first pass when there is nothing to store yet) and once more after the
# loop for the final pass.
# NOTE(review): $STRIDE is defined outside this chunk; the "0..31" above
# and the 32*8 row step in bn_scatter5 suggest 32*8 bytes - confirm.
35104c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16;$i+=4) {
35114c6a0400SJung-uk Kim$code.=<<___;
35124c6a0400SJung-uk Kim	paddd	%xmm0,%xmm1
35134c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
35144c6a0400SJung-uk Kim___
35154c6a0400SJung-uk Kim$code.=<<___	if ($i);
35164c6a0400SJung-uk Kim	movdqa	%xmm3,`16*($i-1)-128`(%rax)
35171f13597dSJung-uk Kim___
35181f13597dSJung-uk Kim$code.=<<___;
35194c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm3
35201f13597dSJung-uk Kim
35214c6a0400SJung-uk Kim	paddd	%xmm1,%xmm2
35224c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
35234c6a0400SJung-uk Kim	movdqa	%xmm0,`16*($i+0)-128`(%rax)
35244c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm0
35254c6a0400SJung-uk Kim
35264c6a0400SJung-uk Kim	paddd	%xmm2,%xmm3
35274c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
35284c6a0400SJung-uk Kim	movdqa	%xmm1,`16*($i+1)-128`(%rax)
35294c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm1
35304c6a0400SJung-uk Kim
35314c6a0400SJung-uk Kim	paddd	%xmm3,%xmm0
35324c6a0400SJung-uk Kim	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
35334c6a0400SJung-uk Kim	movdqa	%xmm2,`16*($i+2)-128`(%rax)
35344c6a0400SJung-uk Kim	movdqa	%xmm4,%xmm2
35354c6a0400SJung-uk Kim___
35364c6a0400SJung-uk Kim}
# Store the last deferred mask, then enter the 32-byte aligned gather loop
# (the jmp skips the alignment padding).
35374c6a0400SJung-uk Kim$code.=<<___;
35384c6a0400SJung-uk Kim	movdqa	%xmm3,`16*($i-1)-128`(%rax)
35394c6a0400SJung-uk Kim	jmp	.Lgather
35404c6a0400SJung-uk Kim
35414c6a0400SJung-uk Kim.align	32
35424c6a0400SJung-uk Kim.Lgather:
	# xmm4 and xmm5 are two independent OR accumulators for this row
35434c6a0400SJung-uk Kim	pxor	%xmm4,%xmm4
35444c6a0400SJung-uk Kim	pxor	%xmm5,%xmm5
35454c6a0400SJung-uk Kim___
# One pass per 64 bytes of the row: load four 16-byte chunks, AND each with
# the mask stored at the same offset on the stack, OR into the accumulators.
35464c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16;$i+=4) {
35474c6a0400SJung-uk Kim$code.=<<___;
35484c6a0400SJung-uk Kim	movdqa	`16*($i+0)-128`(%r11),%xmm0
35494c6a0400SJung-uk Kim	movdqa	`16*($i+1)-128`(%r11),%xmm1
35504c6a0400SJung-uk Kim	movdqa	`16*($i+2)-128`(%r11),%xmm2
35514c6a0400SJung-uk Kim	pand	`16*($i+0)-128`(%rax),%xmm0
35524c6a0400SJung-uk Kim	movdqa	`16*($i+3)-128`(%r11),%xmm3
35534c6a0400SJung-uk Kim	pand	`16*($i+1)-128`(%rax),%xmm1
35544c6a0400SJung-uk Kim	por	%xmm0,%xmm4
35554c6a0400SJung-uk Kim	pand	`16*($i+2)-128`(%rax),%xmm2
35564c6a0400SJung-uk Kim	por	%xmm1,%xmm5
35574c6a0400SJung-uk Kim	pand	`16*($i+3)-128`(%rax),%xmm3
35584c6a0400SJung-uk Kim	por	%xmm2,%xmm4
35594c6a0400SJung-uk Kim	por	%xmm3,%xmm5
35604c6a0400SJung-uk Kim___
35614c6a0400SJung-uk Kim}
# Merge the accumulators, fold the two 64-bit halves together (pshufd 0x4e
# swaps them), store the selected word and advance to the next row.
35624c6a0400SJung-uk Kim$code.=<<___;
35634c6a0400SJung-uk Kim	por	%xmm5,%xmm4
35644c6a0400SJung-uk Kim	lea	$STRIDE(%r11),%r11
35654c6a0400SJung-uk Kim	pshufd	\$0x4e,%xmm4,%xmm0
35664c6a0400SJung-uk Kim	por	%xmm4,%xmm0
35671f13597dSJung-uk Kim	movq	%xmm0,($out)		# m0=bp[0]
35681f13597dSJung-uk Kim	lea	8($out),$out
35691f13597dSJung-uk Kim	sub	\$1,$num
35701f13597dSJung-uk Kim	jnz	.Lgather
35714c6a0400SJung-uk Kim
	# restore the stack pointer saved in r10 at entry
35724c6a0400SJung-uk Kim	lea	(%r10),%rsp
35731f13597dSJung-uk Kim	ret
35741f13597dSJung-uk Kim.LSEH_end_bn_gather5:
357517f01e99SJung-uk Kim.cfi_endproc
35761f13597dSJung-uk Kim.size	bn_gather5,.-bn_gather5
35771f13597dSJung-uk Kim___
35781f13597dSJung-uk Kim}
# Read-only constants, 64-byte aligned.  .Linc is two 16-byte vectors of
# 32-bit lanes: the starting entry numbers {0,0,1,1} and the step
# {2,2,2,2}.  bn_gather5 loads them with movdqa from offsets 0 and 16 to
# build its selection masks.  The .asciz line embeds an identification
# string in the object file.
35791f13597dSJung-uk Kim$code.=<<___;
35801f13597dSJung-uk Kim.align	64
35814c6a0400SJung-uk Kim.Linc:
35824c6a0400SJung-uk Kim	.long	0,0, 1,1
35834c6a0400SJung-uk Kim	.long	2,2, 2,2
35841f13597dSJung-uk Kim.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
35851f13597dSJung-uk Kim___
35861f13597dSJung-uk Kim
35871f13597dSJung-uk Kim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
35881f13597dSJung-uk Kim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# mul_handler is the Win64 language-specific exception handler named by the
# .xdata records below for the Montgomery multiplication/power routines.
# It repairs the CONTEXT so that it looks as if the faulting routine had
# already returned to its prologue-entry state, then lets the system
# continue unwinding.
#
# Each .xdata record supplies three image-relative addresses
# (HandlerData[0..2]).  With Rip = context->Rip the handler chooses:
#   Rip <  HandlerData[0]  frame base = context->Rax; restore only
#                          Rsp/Rsi/Rdi              (.Lcommon_seh_tail)
#   Rip <  HandlerData[1]  frame base = context->Rax; also restore
#                          rbx,rbp,r12-r15          (.Lcommon_pop_regs)
#   Rip >= HandlerData[2]  frame base = context->Rsp; restore only
#                          Rsp/Rsi/Rdi              (.Lcommon_seh_tail)
#   otherwise (in the body) the saved stack pointer is fetched from the
#                          routine's frame: at 8(Rsp + 8*num) when Rip is
#                          not above .Lmul_epilogue, else at 40(Rsp).
# .Lcommon_pop_regs reads the six callee-saved registers from the 48 bytes
# just below the frame base; .Lcommon_seh_tail reads rdi and rsi from
# offsets 8 and 16 above it.
# NOTE(review): this relies on the covered routines (defined outside this
# chunk) keeping the entry stack pointer in rax during their prologues and
# storing it at the frame offsets above - confirm against those prologues.
#
# Finally the patched CONTEXT is copied to disp->ContextRecord,
# RtlVirtualUnwind is called with UNW_FLAG_NHANDLER, and the handler
# returns 1 (ExceptionContinueSearch).
35891f13597dSJung-uk Kimif ($win64) {
35901f13597dSJung-uk Kim$rec="%rcx";
35911f13597dSJung-uk Kim$frame="%rdx";
35921f13597dSJung-uk Kim$context="%r8";
35931f13597dSJung-uk Kim$disp="%r9";
35941f13597dSJung-uk Kim
35951f13597dSJung-uk Kim$code.=<<___;
35961f13597dSJung-uk Kim.extern	__imp_RtlVirtualUnwind
35971f13597dSJung-uk Kim.type	mul_handler,\@abi-omnipotent
35981f13597dSJung-uk Kim.align	16
35991f13597dSJung-uk Kimmul_handler:
	# save the registers this handler overwrites, plus flags (cld below),
	# and reserve 64 bytes for the outgoing call's arguments
36001f13597dSJung-uk Kim	push	%rsi
36011f13597dSJung-uk Kim	push	%rdi
36021f13597dSJung-uk Kim	push	%rbx
36031f13597dSJung-uk Kim	push	%rbp
36041f13597dSJung-uk Kim	push	%r12
36051f13597dSJung-uk Kim	push	%r13
36061f13597dSJung-uk Kim	push	%r14
36071f13597dSJung-uk Kim	push	%r15
36081f13597dSJung-uk Kim	pushfq
36091f13597dSJung-uk Kim	sub	\$64,%rsp
36101f13597dSJung-uk Kim
36111f13597dSJung-uk Kim	mov	120($context),%rax	# pull context->Rax
36121f13597dSJung-uk Kim	mov	248($context),%rbx	# pull context->Rip
36131f13597dSJung-uk Kim
36141f13597dSJung-uk Kim	mov	8($disp),%rsi		# disp->ImageBase
36151f13597dSJung-uk Kim	mov	56($disp),%r11		# disp->HandlerData
36161f13597dSJung-uk Kim
	# HandlerData entries are 32-bit image-relative; add ImageBase
36171f13597dSJung-uk Kim	mov	0(%r11),%r10d		# HandlerData[0]
36181f13597dSJung-uk Kim	lea	(%rsi,%r10),%r10	# end of prologue label
36191f13597dSJung-uk Kim	cmp	%r10,%rbx		# context->Rip<end of prologue label
36201f13597dSJung-uk Kim	jb	.Lcommon_seh_tail
36211f13597dSJung-uk Kim
3622aeb5019cSJung-uk Kim	mov	4(%r11),%r10d		# HandlerData[1]
3623e71b7053SJung-uk Kim	lea	(%rsi,%r10),%r10	# beginning of body label
3624e71b7053SJung-uk Kim	cmp	%r10,%rbx		# context->Rip<body label
3625aeb5019cSJung-uk Kim	jb	.Lcommon_pop_regs
3626aeb5019cSJung-uk Kim
	# from here on the frame base is derived from the faulting Rsp
36271f13597dSJung-uk Kim	mov	152($context),%rax	# pull context->Rsp
36281f13597dSJung-uk Kim
3629aeb5019cSJung-uk Kim	mov	8(%r11),%r10d		# HandlerData[2]
36301f13597dSJung-uk Kim	lea	(%rsi,%r10),%r10	# epilogue label
36311f13597dSJung-uk Kim	cmp	%r10,%rbx		# context->Rip>=epilogue label
36321f13597dSJung-uk Kim	jae	.Lcommon_seh_tail
36331f13597dSJung-uk Kim
	# Rip above .Lmul_epilogue: the fault is in one of the routines
	# placed after it, which keep the saved stack pointer at 40(rsp)
36347bded2dbSJung-uk Kim	lea	.Lmul_epilogue(%rip),%r10
36357bded2dbSJung-uk Kim	cmp	%r10,%rbx
36364c6a0400SJung-uk Kim	ja	.Lbody_40
36377bded2dbSJung-uk Kim
	# otherwise the slot sits above num 8-byte words on the stack;
	# context offset 192 is R9 in the Win64 CONTEXT layout
36381f13597dSJung-uk Kim	mov	192($context),%r10	# pull $num
36391f13597dSJung-uk Kim	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
36404c6a0400SJung-uk Kim
3641aeb5019cSJung-uk Kim	jmp	.Lcommon_pop_regs
36421f13597dSJung-uk Kim
36437bded2dbSJung-uk Kim.Lbody_40:
36447bded2dbSJung-uk Kim	mov	40(%rax),%rax		# pull saved stack pointer
3645aeb5019cSJung-uk Kim.Lcommon_pop_regs:
	# rax = frame base; the six saved registers lie just below it
36461f13597dSJung-uk Kim	mov	-8(%rax),%rbx
36471f13597dSJung-uk Kim	mov	-16(%rax),%rbp
36481f13597dSJung-uk Kim	mov	-24(%rax),%r12
36491f13597dSJung-uk Kim	mov	-32(%rax),%r13
36501f13597dSJung-uk Kim	mov	-40(%rax),%r14
36511f13597dSJung-uk Kim	mov	-48(%rax),%r15
36521f13597dSJung-uk Kim	mov	%rbx,144($context)	# restore context->Rbx
36531f13597dSJung-uk Kim	mov	%rbp,160($context)	# restore context->Rbp
36541f13597dSJung-uk Kim	mov	%r12,216($context)	# restore context->R12
36551f13597dSJung-uk Kim	mov	%r13,224($context)	# restore context->R13
36561f13597dSJung-uk Kim	mov	%r14,232($context)	# restore context->R14
36571f13597dSJung-uk Kim	mov	%r15,240($context)	# restore context->R15
36581f13597dSJung-uk Kim
36591f13597dSJung-uk Kim.Lcommon_seh_tail:
	# rdi/rsi are fetched from offsets 8 and 16 above the frame base
	# NOTE(review): presumably where the covered routines park them
36601f13597dSJung-uk Kim	mov	8(%rax),%rdi
36611f13597dSJung-uk Kim	mov	16(%rax),%rsi
36621f13597dSJung-uk Kim	mov	%rax,152($context)	# restore context->Rsp
36631f13597dSJung-uk Kim	mov	%rsi,168($context)	# restore context->Rsi
36641f13597dSJung-uk Kim	mov	%rdi,176($context)	# restore context->Rdi
36651f13597dSJung-uk Kim
	# copy the patched CONTEXT over disp->ContextRecord; the count in
	# ecx is in quadwords for rep movsq (154*8 = 1232 bytes)
36661f13597dSJung-uk Kim	mov	40($disp),%rdi		# disp->ContextRecord
36671f13597dSJung-uk Kim	mov	$context,%rsi		# context
36681f13597dSJung-uk Kim	mov	\$154,%ecx		# sizeof(CONTEXT)
36691f13597dSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
36701f13597dSJung-uk Kim
	# RtlVirtualUnwind: four register arguments, arg5-arg8 on the
	# stack starting at 32(rsp), above the 32 bytes of shadow space
36711f13597dSJung-uk Kim	mov	$disp,%rsi
36721f13597dSJung-uk Kim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
36731f13597dSJung-uk Kim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
36741f13597dSJung-uk Kim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
36751f13597dSJung-uk Kim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
36761f13597dSJung-uk Kim	mov	40(%rsi),%r10		# disp->ContextRecord
36771f13597dSJung-uk Kim	lea	56(%rsi),%r11		# &disp->HandlerData
36781f13597dSJung-uk Kim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
36791f13597dSJung-uk Kim	mov	%r10,32(%rsp)		# arg5
36801f13597dSJung-uk Kim	mov	%r11,40(%rsp)		# arg6
36811f13597dSJung-uk Kim	mov	%r12,48(%rsp)		# arg7
36821f13597dSJung-uk Kim	mov	%rcx,56(%rsp)		# arg8, (NULL)
36831f13597dSJung-uk Kim	call	*__imp_RtlVirtualUnwind(%rip)
36841f13597dSJung-uk Kim
	# undo the entry pushes in reverse order and report "keep searching"
36851f13597dSJung-uk Kim	mov	\$1,%eax		# ExceptionContinueSearch
36861f13597dSJung-uk Kim	add	\$64,%rsp
36871f13597dSJung-uk Kim	popfq
36881f13597dSJung-uk Kim	pop	%r15
36891f13597dSJung-uk Kim	pop	%r14
36901f13597dSJung-uk Kim	pop	%r13
36911f13597dSJung-uk Kim	pop	%r12
36921f13597dSJung-uk Kim	pop	%rbp
36931f13597dSJung-uk Kim	pop	%rbx
36941f13597dSJung-uk Kim	pop	%rdi
36951f13597dSJung-uk Kim	pop	%rsi
36961f13597dSJung-uk Kim	ret
36971f13597dSJung-uk Kim.size	mul_handler,.-mul_handler
36981f13597dSJung-uk Kim
# Win64 function table.  Each routine contributes three image-relative
# addresses: its .LSEH_begin_ label, its .LSEH_end_ label and the
# .LSEH_info_ record for it in the .xdata section.  The mulx4x/powerx5
# entries are emitted only when the generator was run with addx support.
36991f13597dSJung-uk Kim.section	.pdata
37001f13597dSJung-uk Kim.align	4
37011f13597dSJung-uk Kim	.rva	.LSEH_begin_bn_mul_mont_gather5
37021f13597dSJung-uk Kim	.rva	.LSEH_end_bn_mul_mont_gather5
37031f13597dSJung-uk Kim	.rva	.LSEH_info_bn_mul_mont_gather5
37041f13597dSJung-uk Kim
37051f13597dSJung-uk Kim	.rva	.LSEH_begin_bn_mul4x_mont_gather5
37061f13597dSJung-uk Kim	.rva	.LSEH_end_bn_mul4x_mont_gather5
37071f13597dSJung-uk Kim	.rva	.LSEH_info_bn_mul4x_mont_gather5
37081f13597dSJung-uk Kim
37097bded2dbSJung-uk Kim	.rva	.LSEH_begin_bn_power5
37107bded2dbSJung-uk Kim	.rva	.LSEH_end_bn_power5
37117bded2dbSJung-uk Kim	.rva	.LSEH_info_bn_power5
37127bded2dbSJung-uk Kim___
37137bded2dbSJung-uk Kim$code.=<<___ if ($addx);
37147bded2dbSJung-uk Kim	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
37157bded2dbSJung-uk Kim	.rva	.LSEH_end_bn_mulx4x_mont_gather5
37167bded2dbSJung-uk Kim	.rva	.LSEH_info_bn_mulx4x_mont_gather5
37177bded2dbSJung-uk Kim
37187bded2dbSJung-uk Kim	.rva	.LSEH_begin_bn_powerx5
37197bded2dbSJung-uk Kim	.rva	.LSEH_end_bn_powerx5
37207bded2dbSJung-uk Kim	.rva	.LSEH_info_bn_powerx5
37217bded2dbSJung-uk Kim___
37227bded2dbSJung-uk Kim$code.=<<___;
37231f13597dSJung-uk Kim	.rva	.LSEH_begin_bn_gather5
37241f13597dSJung-uk Kim	.rva	.LSEH_end_bn_gather5
37251f13597dSJung-uk Kim	.rva	.LSEH_info_bn_gather5
37261f13597dSJung-uk Kim
# Win64 unwind records referenced from .pdata.
#
# Records that name mul_handler start with the header bytes 9,0,0,0 and
# carry no unwind codes; per the Win64 UNWIND_INFO layout the first byte
# is version 1 with the exception-handler flag set.  The three addresses
# that follow are what mul_handler reads as HandlerData[0], [1] and [2]:
# the limit below which only Rsp/Rsi/Rdi are restored, the start of the
# body, and the epilogue.  For bn_mul_mont_gather5 the first two are the
# same label.
37271f13597dSJung-uk Kim.section	.xdata
37281f13597dSJung-uk Kim.align	8
37291f13597dSJung-uk Kim.LSEH_info_bn_mul_mont_gather5:
37301f13597dSJung-uk Kim	.byte	9,0,0,0
37311f13597dSJung-uk Kim	.rva	mul_handler
3732aeb5019cSJung-uk Kim	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
37331f13597dSJung-uk Kim.align	8
37341f13597dSJung-uk Kim.LSEH_info_bn_mul4x_mont_gather5:
37351f13597dSJung-uk Kim	.byte	9,0,0,0
37361f13597dSJung-uk Kim	.rva	mul_handler
3737aeb5019cSJung-uk Kim	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
37387bded2dbSJung-uk Kim.align	8
37397bded2dbSJung-uk Kim.LSEH_info_bn_power5:
37407bded2dbSJung-uk Kim	.byte	9,0,0,0
37417bded2dbSJung-uk Kim	.rva	mul_handler
3742aeb5019cSJung-uk Kim	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
37437bded2dbSJung-uk Kim___
37447bded2dbSJung-uk Kim$code.=<<___ if ($addx);
37457bded2dbSJung-uk Kim.align	8
37467bded2dbSJung-uk Kim.LSEH_info_bn_mulx4x_mont_gather5:
37477bded2dbSJung-uk Kim	.byte	9,0,0,0
37487bded2dbSJung-uk Kim	.rva	mul_handler
3749aeb5019cSJung-uk Kim	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
37507bded2dbSJung-uk Kim.align	8
37517bded2dbSJung-uk Kim.LSEH_info_bn_powerx5:
37527bded2dbSJung-uk Kim	.byte	9,0,0,0
37537bded2dbSJung-uk Kim	.rva	mul_handler
3754aeb5019cSJung-uk Kim	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
37557bded2dbSJung-uk Kim___
37567bded2dbSJung-uk Kim$code.=<<___;
37571f13597dSJung-uk Kim.align	8
37581f13597dSJung-uk Kim.LSEH_info_bn_gather5:
	# bn_gather5 has no handler; its prologue is described by unwind
	# codes.  Header (per the UNWIND_INFO layout): version 1 with no
	# flags, prologue length 0x0b bytes, 3 unwind-code slots, frame
	# register 0x0a (r10).  The code offsets 0x0b and 0x04 are the
	# byte positions just after the 7-byte sub and the 4-byte lea that
	# bn_gather5 emits as raw bytes, so those encodings must not change.
	# Allocation size is stored divided by 8: 0x0021*8 = 0x108.
37594c6a0400SJung-uk Kim	.byte	0x01,0x0b,0x03,0x0a
37604c6a0400SJung-uk Kim	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
37614c6a0400SJung-uk Kim	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
37621f13597dSJung-uk Kim.align	8
37631f13597dSJung-uk Kim___
37641f13597dSJung-uk Kim}
37651f13597dSJung-uk Kim
# Post-process and emit the accumulated assembly text.
# Every `...` span in $code is replaced by the result of evaluating its
# contents as a Perl expression; this is how the computed displacements in
# the templates above (e.g. 16*($i+0)-128, already interpolated to numbers)
# become plain integers in the output.
37661f13597dSJung-uk Kim$code =~ s/\`([^\`]*)\`/eval($1)/gem;
37671f13597dSJung-uk Kim
# Write the result and fail loudly if the output could not be flushed.
37681f13597dSJung-uk Kimprint $code;
376917f01e99SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!";
3770