sha2-armv8.pl - OpenGrok cross reference for /linux/lib/crypto/arm64/sha2-armv8.pl

Lines Matching +full:1 +full:- +full:9 +full:a +full:- +full:e
2 # SPDX-License-Identifier: GPL-2.0
12 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
15 # this file except in compliance with the License.  You can obtain a copy
31 #		SHA256-hw	SHA256(*)	SHA512
32 # Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
33 # Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
34 # Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
36 # X-Gene			20.0 (+100%)	12.8 (+300%(***))
41 # (**)	The result is a trade-off: it's possible to improve it by
42 #	10% (or by 1 cycle per round), but at the cost of 20% loss
43 #	on Cortex-A53 (or by 4 cycles per round).
44 # (***)	Super-impressive coefficients over gcc-generated code are
46 #	generated with -mgeneral-regs-only is significantly faster
47 #	and the gap is only 40-90%.
52 # version of SHA256 for 64-bit processors. This is because performance
53 # improvement on the most widespread Cortex-A5x processors was observed
54 # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
55 # observed that 32-bit NEON SHA256 performs significantly better than
56 # 64-bit scalar version on *some* of the more recent processors. As a
57 # result, a 64-bit NEON version of SHA256 was added to provide the best
58 # all-round performance. For example it executes ~30% faster on X-Gene
60 # deliver much less improvement, likely *negative* on Cortex-A5x.
67     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
69     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
70     die "can't locate arm-xlate.pl";
83 	@sigma0=(1,  8, 7);
103 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
107 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
108 my $j=($i+1)&15;
109 my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
117 $code.=<<___	if ($i<13 && ($i&1));
118 	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
124 	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
127 	add	$a,$a,$t1			// h+=Sigma0(a)
130 	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
132 # While ARMv8 specifies merged rotate-n-logical operation such as
137 # Cortex-A5x handles merged instructions much better than disjoint
140 	ror	$t0,$e,#$Sigma1[0]
142 	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
143 	and	$t1,$f,$e
144 	bic	$t2,$g,$e
146 	orr	$t1,$t1,$t2			// Ch(e,f,g)
147 	eor	$t2,$a,$b			// a^b, b^c in next round
148 	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
149 	ror	$T0,$a,#$Sigma0[0]
150 	add	$h,$h,$t1			// h+=Ch(e,f,g)
151 	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
152 	add	$h,$h,$t0			// h+=Sigma1(e)
153 	and	$t3,$t3,$t2			// (b^c)&=(a^b)
155 	eor	$t3,$t3,$b			// Maj(a,b,c)
156 	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
157 	add	$h,$h,$t3			// h+=Maj(a,b,c)
159 	//add	$h,$h,$t1			// h+=Sigma0(a)
162 	ror	$t0,$e,#$Sigma1[0]
164 	ror	$T1,@X[($j+1)&15],#$sigma0[0]
165 	and	$t1,$f,$e
167 	bic	$t2,$g,$e
168 	ror	$T0,$a,#$Sigma0[0]
170 	eor	$t0,$t0,$e,ror#$Sigma1[1]
171 	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
172 	orr	$t1,$t1,$t2			// Ch(e,f,g)
173 	eor	$t2,$a,$b			// a^b, b^c in next round
174 	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
175 	eor	$T0,$T0,$a,ror#$Sigma0[1]
176 	add	$h,$h,$t1			// h+=Ch(e,f,g)
177 	and	$t3,$t3,$t2			// (b^c)&=(a^b)
178 	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
179 	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
180 	add	$h,$h,$t0			// h+=Sigma1(e)
181 	eor	$t3,$t3,$b			// Maj(a,b,c)
182 	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
184 	add	@X[$j],@X[$j],@X[($j+9)&15]
186 	add	$h,$h,$t3			// h+=Maj(a,b,c)
189 	add	$h,$h,$t1			// h+=Sigma0(a)
225 	stp	x29,x30,[sp,#-128]!
235 	ldp	$A,$B,[$ctx]				// load context
237 	ldp	$E,$F,[$ctx,#4*$SZ]
244 	ldp	@X[0],@X[1],[$inp],#2*$SZ
257 	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
259 	ldp	@X[0],@X[1],[$ctx]
263 	add	$A,$A,@X[0]
265 	add	$B,$B,@X[1]
268 	stp	$A,$B,[$ctx]
269 	add	$E,$E,@X[4]
275 	stp	$E,$F,[$ctx,#4*$SZ]
287 .size	$func,.-$func
356 .size	.LK$BITS,.-.LK$BITS
361 	.long	OPENSSL_armcap_P-.
363 	.quad	OPENSSL_armcap_P-.
384 	stp		x29,x30,[sp,#-16]!
391 	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
392 	sub		$num,$num,#1
395 	rev32		@MSG[1],@MSG[1]
405 	sha256su0	@MSG[0],@MSG[1]
421 	add.i32		$W1,$W1,@MSG[1]
428 	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
447 .size	sha256_block_armv8,.-sha256_block_armv8
453 # You'll surely note a lot of similarities with sha256-armv4 module,
454 # and of course it's not a coincidence. sha256-armv4 was used as
456 # extensively re-tuned for all-round performance.
458 my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
466 sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
469     $arg = "#$arg" if ($arg*1 eq $arg);
473 sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
474 sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
475 sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
481   my ($a,$b,$c,$d,$e,$f,$g,$h);
483 	&ext_8		($T0,@X[0],@X[1],4);	# X[1..4]
487 	&ext_8		($T3,@X[2],@X[3],4);	# X[9..12]
497 	&add_32 	(@X[0],@X[0],$T3);	# X[0..3] += X[9..12]
499 	&sli_32		($T2,$T0,32-$sigma0[0]);
502 	&ushr_32	($T3,$T0,$sigma0[1]);
508 	&sli_32		($T3,$T0,32-$sigma0[1]);
514 	&eor_8		($T1,$T1,$T3);		# sigma0(X[1..4])
517 	  &sli_32	($T4,$T7,32-$sigma1[0]);
523 	  &ushr_32	($T3,$T7,$sigma1[1]);
526 	&add_32		(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
529 	  &sli_u32	($T3,$T7,32-$sigma1[1]);
540 	&add_32		(@X[0],@X[0],$T5);	# X[0..1] += sigma1(X[14..15])
549 	  &sli_32	($T6,@X[0],32-$sigma1[0]);
551 	  &ushr_32	($T5,@X[0],$sigma1[1]);
557 	  &sli_32	($T5,@X[0],32-$sigma1[1]);
577 	 while($#insns>=1) { eval(shift(@insns)); }
588   my ($a,$b,$c,$d,$e,$f,$g,$h);
614 	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
616 	'&add	($a,$a,$t4);'.			# h+=Sigma0(a) from the past
617 	'&and	($t1,$f,$e)',
618 	'&bic	($t4,$g,$e)',
619 	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
620 	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
621 	'&orr	($t1,$t1,$t4)',			# Ch(e,f,g)
622 	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
623 	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
624 	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
626 	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
627 	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
628 	'&add	($h,$h,$t0)',			# h+=Sigma1(e)
629 	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
631 	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
634 	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
647 	stp	x29, x30, [sp, #-16]!
655 	ld1.8	{@X[1]},[$inp], #16
663 	rev32	@X[1],@X[1]		// big-endian
668 	add.32	$T1,$T1,@X[1]
670 	st1.32	{$T0-$T1},[$Xfer], #32
672 	st1.32	{$T2-$T3},[$Xfer]
675 	ldp	$A,$B,[$ctx]
677 	ldp	$E,$F,[$ctx,#16]
710 	add	$A,$A,$t4			// h+=Sigma0(a) from the past
712 	add	$A,$A,$t2			// h+=Maj(a,b,c) from the past
714 	add	$A,$A,$t0			// accumulate
720 	add	$E,$E,$t0
723 	stp	$A,$B,[$ctx,#0]
728 	stp	$E,$F,[$ctx,#16]
738 .size	sha256_block_neon,.-sha256_block_neon
755 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
758 			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
773 	s/\`([^\`]*)\`/eval($1)/ge;
775 	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
777 	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
779 	s/\.[ui]?8(\s)/$1/;
781 	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;