xref: /linux/arch/arm64/crypto/sha512-armv8.pl (revision e3b9f1e81de2083f359bacd2a94bf1c024f2ede0)
1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14#
15# Permission to use under GPLv2 terms is granted.
16# ====================================================================
17#
18# SHA256/512 for ARMv8.
19#
20# Performance in cycles per processed byte and improvement coefficient
21# over code generated with "default" compiler:
22#
23#		SHA256-hw	SHA256(*)	SHA512
24# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
25# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
26# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
27# Denver	2.01		10.5 (+26%)	6.70 (+8%)
28# X-Gene			20.0 (+100%)	12.8 (+300%(***))
29# Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
30#
31# (*)	Software SHA256 results are of lesser relevance, presented
32#	mostly for informational purposes.
33# (**)	The result is a trade-off: it's possible to improve it by
34#	10% (or by 1 cycle per round), but at the cost of 20% loss
35#	on Cortex-A53 (or by 4 cycles per round).
36# (***)	Super-impressive coefficients over gcc-generated code are
37#	indication of some compiler "pathology", most notably code
38#	generated with -mgeneral-regs-only is significanty faster
39#	and the gap is only 40-90%.
40#
41# October 2016.
42#
43# Originally it was reckoned that it makes no sense to implement NEON
44# version of SHA256 for 64-bit processors. This is because performance
45# improvement on most wide-spread Cortex-A5x processors was observed
46# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
47# observed that 32-bit NEON SHA256 performs significantly better than
48# 64-bit scalar version on *some* of the more recent processors. As
49# result 64-bit NEON version of SHA256 was added to provide best
50# all-round performance. For example it executes ~30% faster on X-Gene
51# and Mongoose. [For reference, NEON version of SHA512 is bound to
52# deliver much less improvement, likely *negative* on Cortex-A5x.
53# Which is why NEON support is limited to SHA256.]
54
55$output=pop;
56$flavour=pop;
57
58if ($flavour && $flavour ne "void") {
59    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
61    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
62    die "can't locate arm-xlate.pl";
63
64    open OUT,"| \"$^X\" $xlate $flavour $output";
65    *STDOUT=*OUT;
66} else {
67    open STDOUT,">$output";
68}
69
70if ($output =~ /512/) {
71	$BITS=512;
72	$SZ=8;
73	@Sigma0=(28,34,39);
74	@Sigma1=(14,18,41);
75	@sigma0=(1,  8, 7);
76	@sigma1=(19,61, 6);
77	$rounds=80;
78	$reg_t="x";
79} else {
80	$BITS=256;
81	$SZ=4;
82	@Sigma0=( 2,13,22);
83	@Sigma1=( 6,11,25);
84	@sigma0=( 7,18, 3);
85	@sigma1=(17,19,10);
86	$rounds=64;
87	$reg_t="w";
88}
89
90$func="sha${BITS}_block_data_order";
91
92($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
93
94@X=map("$reg_t$_",(3..15,0..2));
95@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
96($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
97
98sub BODY_00_xx {
99my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
100my $j=($i+1)&15;
101my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
102   $T0=@X[$i+3] if ($i<11);
103
104$code.=<<___	if ($i<16);
105#ifndef	__AARCH64EB__
106	rev	@X[$i],@X[$i]			// $i
107#endif
108___
109$code.=<<___	if ($i<13 && ($i&1));
110	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
111___
112$code.=<<___	if ($i==13);
113	ldp	@X[14],@X[15],[$inp]
114___
115$code.=<<___	if ($i>=14);
116	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
117___
118$code.=<<___	if ($i>0 && $i<16);
119	add	$a,$a,$t1			// h+=Sigma0(a)
120___
121$code.=<<___	if ($i>=11);
122	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
123___
124# While ARMv8 specifies merged rotate-n-logical operation such as
125# 'eor x,y,z,ror#n', it was found to negatively affect performance
126# on Apple A7. The reason seems to be that it requires even 'y' to
127# be available earlier. This means that such merged instruction is
128# not necessarily best choice on critical path... On the other hand
129# Cortex-A5x handles merged instructions much better than disjoint
130# rotate and logical... See (**) footnote above.
131$code.=<<___	if ($i<15);
132	ror	$t0,$e,#$Sigma1[0]
133	add	$h,$h,$t2			// h+=K[i]
134	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
135	and	$t1,$f,$e
136	bic	$t2,$g,$e
137	add	$h,$h,@X[$i&15]			// h+=X[i]
138	orr	$t1,$t1,$t2			// Ch(e,f,g)
139	eor	$t2,$a,$b			// a^b, b^c in next round
140	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
141	ror	$T0,$a,#$Sigma0[0]
142	add	$h,$h,$t1			// h+=Ch(e,f,g)
143	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
144	add	$h,$h,$t0			// h+=Sigma1(e)
145	and	$t3,$t3,$t2			// (b^c)&=(a^b)
146	add	$d,$d,$h			// d+=h
147	eor	$t3,$t3,$b			// Maj(a,b,c)
148	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
149	add	$h,$h,$t3			// h+=Maj(a,b,c)
150	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
151	//add	$h,$h,$t1			// h+=Sigma0(a)
152___
153$code.=<<___	if ($i>=15);
154	ror	$t0,$e,#$Sigma1[0]
155	add	$h,$h,$t2			// h+=K[i]
156	ror	$T1,@X[($j+1)&15],#$sigma0[0]
157	and	$t1,$f,$e
158	ror	$T2,@X[($j+14)&15],#$sigma1[0]
159	bic	$t2,$g,$e
160	ror	$T0,$a,#$Sigma0[0]
161	add	$h,$h,@X[$i&15]			// h+=X[i]
162	eor	$t0,$t0,$e,ror#$Sigma1[1]
163	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
164	orr	$t1,$t1,$t2			// Ch(e,f,g)
165	eor	$t2,$a,$b			// a^b, b^c in next round
166	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
167	eor	$T0,$T0,$a,ror#$Sigma0[1]
168	add	$h,$h,$t1			// h+=Ch(e,f,g)
169	and	$t3,$t3,$t2			// (b^c)&=(a^b)
170	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
171	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
172	add	$h,$h,$t0			// h+=Sigma1(e)
173	eor	$t3,$t3,$b			// Maj(a,b,c)
174	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
175	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
176	add	@X[$j],@X[$j],@X[($j+9)&15]
177	add	$d,$d,$h			// d+=h
178	add	$h,$h,$t3			// h+=Maj(a,b,c)
179	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
180	add	@X[$j],@X[$j],$T1
181	add	$h,$h,$t1			// h+=Sigma0(a)
182	add	@X[$j],@X[$j],$T2
183___
184	($t2,$t3)=($t3,$t2);
185}
186
187$code.=<<___;
188#ifndef	__KERNEL__
189# include "arm_arch.h"
190#endif
191
192.text
193
194.extern	OPENSSL_armcap_P
195.globl	$func
196.type	$func,%function
197.align	6
198$func:
199___
200$code.=<<___	if ($SZ==4);
201#ifndef	__KERNEL__
202# ifdef	__ILP32__
203	ldrsw	x16,.LOPENSSL_armcap_P
204# else
205	ldr	x16,.LOPENSSL_armcap_P
206# endif
207	adr	x17,.LOPENSSL_armcap_P
208	add	x16,x16,x17
209	ldr	w16,[x16]
210	tst	w16,#ARMV8_SHA256
211	b.ne	.Lv8_entry
212	tst	w16,#ARMV7_NEON
213	b.ne	.Lneon_entry
214#endif
215___
216$code.=<<___;
217	stp	x29,x30,[sp,#-128]!
218	add	x29,sp,#0
219
220	stp	x19,x20,[sp,#16]
221	stp	x21,x22,[sp,#32]
222	stp	x23,x24,[sp,#48]
223	stp	x25,x26,[sp,#64]
224	stp	x27,x28,[sp,#80]
225	sub	sp,sp,#4*$SZ
226
227	ldp	$A,$B,[$ctx]				// load context
228	ldp	$C,$D,[$ctx,#2*$SZ]
229	ldp	$E,$F,[$ctx,#4*$SZ]
230	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
231	ldp	$G,$H,[$ctx,#6*$SZ]
232	adr	$Ktbl,.LK$BITS
233	stp	$ctx,$num,[x29,#96]
234
235.Loop:
236	ldp	@X[0],@X[1],[$inp],#2*$SZ
237	ldr	$t2,[$Ktbl],#$SZ			// *K++
238	eor	$t3,$B,$C				// magic seed
239	str	$inp,[x29,#112]
240___
241for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
242$code.=".Loop_16_xx:\n";
243for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
244$code.=<<___;
245	cbnz	$t2,.Loop_16_xx
246
247	ldp	$ctx,$num,[x29,#96]
248	ldr	$inp,[x29,#112]
249	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
250
251	ldp	@X[0],@X[1],[$ctx]
252	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
253	add	$inp,$inp,#14*$SZ			// advance input pointer
254	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
255	add	$A,$A,@X[0]
256	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
257	add	$B,$B,@X[1]
258	add	$C,$C,@X[2]
259	add	$D,$D,@X[3]
260	stp	$A,$B,[$ctx]
261	add	$E,$E,@X[4]
262	add	$F,$F,@X[5]
263	stp	$C,$D,[$ctx,#2*$SZ]
264	add	$G,$G,@X[6]
265	add	$H,$H,@X[7]
266	cmp	$inp,$num
267	stp	$E,$F,[$ctx,#4*$SZ]
268	stp	$G,$H,[$ctx,#6*$SZ]
269	b.ne	.Loop
270
271	ldp	x19,x20,[x29,#16]
272	add	sp,sp,#4*$SZ
273	ldp	x21,x22,[x29,#32]
274	ldp	x23,x24,[x29,#48]
275	ldp	x25,x26,[x29,#64]
276	ldp	x27,x28,[x29,#80]
277	ldp	x29,x30,[sp],#128
278	ret
279.size	$func,.-$func
280
281.align	6
282.type	.LK$BITS,%object
283.LK$BITS:
284___
285$code.=<<___ if ($SZ==8);
286	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
287	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
288	.quad	0x3956c25bf348b538,0x59f111f1b605d019
289	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
290	.quad	0xd807aa98a3030242,0x12835b0145706fbe
291	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
292	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
293	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
294	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
295	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
296	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
297	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
298	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
299	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
300	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
301	.quad	0x06ca6351e003826f,0x142929670a0e6e70
302	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
303	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
304	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
305	.quad	0x81c2c92e47edaee6,0x92722c851482353b
306	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
307	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
308	.quad	0xd192e819d6ef5218,0xd69906245565a910
309	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
310	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
311	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
312	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
313	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
314	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
315	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
316	.quad	0x90befffa23631e28,0xa4506cebde82bde9
317	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
318	.quad	0xca273eceea26619c,0xd186b8c721c0c207
319	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
320	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
321	.quad	0x113f9804bef90dae,0x1b710b35131c471b
322	.quad	0x28db77f523047d84,0x32caab7b40c72493
323	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
324	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
325	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
326	.quad	0	// terminator
327___
328$code.=<<___ if ($SZ==4);
329	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
330	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
331	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
332	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
333	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
334	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
335	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
336	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
337	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
338	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
339	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
340	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
341	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
342	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
343	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
344	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
345	.long	0	//terminator
346___
347$code.=<<___;
348.size	.LK$BITS,.-.LK$BITS
349#ifndef	__KERNEL__
350.align	3
351.LOPENSSL_armcap_P:
352# ifdef	__ILP32__
353	.long	OPENSSL_armcap_P-.
354# else
355	.quad	OPENSSL_armcap_P-.
356# endif
357#endif
358.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
359.align	2
360___
361
362if ($SZ==4) {
363my $Ktbl="x3";
364
365my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
366my @MSG=map("v$_.16b",(4..7));
367my ($W0,$W1)=("v16.4s","v17.4s");
368my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
369
370$code.=<<___;
371#ifndef	__KERNEL__
372.type	sha256_block_armv8,%function
373.align	6
374sha256_block_armv8:
375.Lv8_entry:
376	stp		x29,x30,[sp,#-16]!
377	add		x29,sp,#0
378
379	ld1.32		{$ABCD,$EFGH},[$ctx]
380	adr		$Ktbl,.LK256
381
382.Loop_hw:
383	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
384	sub		$num,$num,#1
385	ld1.32		{$W0},[$Ktbl],#16
386	rev32		@MSG[0],@MSG[0]
387	rev32		@MSG[1],@MSG[1]
388	rev32		@MSG[2],@MSG[2]
389	rev32		@MSG[3],@MSG[3]
390	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
391	orr		$EFGH_SAVE,$EFGH,$EFGH
392___
393for($i=0;$i<12;$i++) {
394$code.=<<___;
395	ld1.32		{$W1},[$Ktbl],#16
396	add.i32		$W0,$W0,@MSG[0]
397	sha256su0	@MSG[0],@MSG[1]
398	orr		$abcd,$ABCD,$ABCD
399	sha256h		$ABCD,$EFGH,$W0
400	sha256h2	$EFGH,$abcd,$W0
401	sha256su1	@MSG[0],@MSG[2],@MSG[3]
402___
403	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
404}
405$code.=<<___;
406	ld1.32		{$W1},[$Ktbl],#16
407	add.i32		$W0,$W0,@MSG[0]
408	orr		$abcd,$ABCD,$ABCD
409	sha256h		$ABCD,$EFGH,$W0
410	sha256h2	$EFGH,$abcd,$W0
411
412	ld1.32		{$W0},[$Ktbl],#16
413	add.i32		$W1,$W1,@MSG[1]
414	orr		$abcd,$ABCD,$ABCD
415	sha256h		$ABCD,$EFGH,$W1
416	sha256h2	$EFGH,$abcd,$W1
417
418	ld1.32		{$W1},[$Ktbl]
419	add.i32		$W0,$W0,@MSG[2]
420	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
421	orr		$abcd,$ABCD,$ABCD
422	sha256h		$ABCD,$EFGH,$W0
423	sha256h2	$EFGH,$abcd,$W0
424
425	add.i32		$W1,$W1,@MSG[3]
426	orr		$abcd,$ABCD,$ABCD
427	sha256h		$ABCD,$EFGH,$W1
428	sha256h2	$EFGH,$abcd,$W1
429
430	add.i32		$ABCD,$ABCD,$ABCD_SAVE
431	add.i32		$EFGH,$EFGH,$EFGH_SAVE
432
433	cbnz		$num,.Loop_hw
434
435	st1.32		{$ABCD,$EFGH},[$ctx]
436
437	ldr		x29,[sp],#16
438	ret
439.size	sha256_block_armv8,.-sha256_block_armv8
440#endif
441___
442}
443
444if ($SZ==4) {	######################################### NEON stuff #
445# You'll surely note a lot of similarities with sha256-armv4 module,
446# and of course it's not a coincidence. sha256-armv4 was used as
447# initial template, but was adapted for ARMv8 instruction set and
448# extensively re-tuned for all-round performance.
449
450my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
451my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
452my $Ktbl="x16";
453my $Xfer="x17";
454my @X = map("q$_",(0..3));
455my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
456my $j=0;
457
458sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
459{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
460  my $arg = pop;
461    $arg = "#$arg" if ($arg*1 eq $arg);
462    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
463}
464
465sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
466sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
467sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
468
469sub Xupdate()
470{ use integer;
471  my $body = shift;
472  my @insns = (&$body,&$body,&$body,&$body);
473  my ($a,$b,$c,$d,$e,$f,$g,$h);
474
475	&ext_8		($T0,@X[0],@X[1],4);	# X[1..4]
476	 eval(shift(@insns));
477	 eval(shift(@insns));
478	 eval(shift(@insns));
479	&ext_8		($T3,@X[2],@X[3],4);	# X[9..12]
480	 eval(shift(@insns));
481	 eval(shift(@insns));
482	&mov		(&Dscalar($T7),&Dhi(@X[3]));	# X[14..15]
483	 eval(shift(@insns));
484	 eval(shift(@insns));
485	&ushr_32	($T2,$T0,$sigma0[0]);
486	 eval(shift(@insns));
487	&ushr_32	($T1,$T0,$sigma0[2]);
488	 eval(shift(@insns));
489	&add_32 	(@X[0],@X[0],$T3);	# X[0..3] += X[9..12]
490	 eval(shift(@insns));
491	&sli_32		($T2,$T0,32-$sigma0[0]);
492	 eval(shift(@insns));
493	 eval(shift(@insns));
494	&ushr_32	($T3,$T0,$sigma0[1]);
495	 eval(shift(@insns));
496	 eval(shift(@insns));
497	&eor_8		($T1,$T1,$T2);
498	 eval(shift(@insns));
499	 eval(shift(@insns));
500	&sli_32		($T3,$T0,32-$sigma0[1]);
501	 eval(shift(@insns));
502	 eval(shift(@insns));
503	  &ushr_32	($T4,$T7,$sigma1[0]);
504	 eval(shift(@insns));
505	 eval(shift(@insns));
506	&eor_8		($T1,$T1,$T3);		# sigma0(X[1..4])
507	 eval(shift(@insns));
508	 eval(shift(@insns));
509	  &sli_32	($T4,$T7,32-$sigma1[0]);
510	 eval(shift(@insns));
511	 eval(shift(@insns));
512	  &ushr_32	($T5,$T7,$sigma1[2]);
513	 eval(shift(@insns));
514	 eval(shift(@insns));
515	  &ushr_32	($T3,$T7,$sigma1[1]);
516	 eval(shift(@insns));
517	 eval(shift(@insns));
518	&add_32		(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
519	 eval(shift(@insns));
520	 eval(shift(@insns));
521	  &sli_u32	($T3,$T7,32-$sigma1[1]);
522	 eval(shift(@insns));
523	 eval(shift(@insns));
524	  &eor_8	($T5,$T5,$T4);
525	 eval(shift(@insns));
526	 eval(shift(@insns));
527	 eval(shift(@insns));
528	  &eor_8	($T5,$T5,$T3);		# sigma1(X[14..15])
529	 eval(shift(@insns));
530	 eval(shift(@insns));
531	 eval(shift(@insns));
532	&add_32		(@X[0],@X[0],$T5);	# X[0..1] += sigma1(X[14..15])
533	 eval(shift(@insns));
534	 eval(shift(@insns));
535	 eval(shift(@insns));
536	  &ushr_32	($T6,@X[0],$sigma1[0]);
537	 eval(shift(@insns));
538	  &ushr_32	($T7,@X[0],$sigma1[2]);
539	 eval(shift(@insns));
540	 eval(shift(@insns));
541	  &sli_32	($T6,@X[0],32-$sigma1[0]);
542	 eval(shift(@insns));
543	  &ushr_32	($T5,@X[0],$sigma1[1]);
544	 eval(shift(@insns));
545	 eval(shift(@insns));
546	  &eor_8	($T7,$T7,$T6);
547	 eval(shift(@insns));
548	 eval(shift(@insns));
549	  &sli_32	($T5,@X[0],32-$sigma1[1]);
550	 eval(shift(@insns));
551	 eval(shift(@insns));
552	&ld1_32		("{$T0}","[$Ktbl], #16");
553	 eval(shift(@insns));
554	  &eor_8	($T7,$T7,$T5);		# sigma1(X[16..17])
555	 eval(shift(@insns));
556	 eval(shift(@insns));
557	&eor_8		($T5,$T5,$T5);
558	 eval(shift(@insns));
559	 eval(shift(@insns));
560	&mov		(&Dhi($T5), &Dlo($T7));
561	 eval(shift(@insns));
562	 eval(shift(@insns));
563	 eval(shift(@insns));
564	&add_32		(@X[0],@X[0],$T5);	# X[2..3] += sigma1(X[16..17])
565	 eval(shift(@insns));
566	 eval(shift(@insns));
567	 eval(shift(@insns));
568	&add_32		($T0,$T0,@X[0]);
569	 while($#insns>=1) { eval(shift(@insns)); }
570	&st1_32		("{$T0}","[$Xfer], #16");
571	 eval(shift(@insns));
572
573	push(@X,shift(@X));		# "rotate" X[]
574}
575
576sub Xpreload()
577{ use integer;
578  my $body = shift;
579  my @insns = (&$body,&$body,&$body,&$body);
580  my ($a,$b,$c,$d,$e,$f,$g,$h);
581
582	 eval(shift(@insns));
583	 eval(shift(@insns));
584	&ld1_8		("{@X[0]}","[$inp],#16");
585	 eval(shift(@insns));
586	 eval(shift(@insns));
587	&ld1_32		("{$T0}","[$Ktbl],#16");
588	 eval(shift(@insns));
589	 eval(shift(@insns));
590	 eval(shift(@insns));
591	 eval(shift(@insns));
592	&rev32		(@X[0],@X[0]);
593	 eval(shift(@insns));
594	 eval(shift(@insns));
595	 eval(shift(@insns));
596	 eval(shift(@insns));
597	&add_32		($T0,$T0,@X[0]);
598	 foreach (@insns) { eval; }	# remaining instructions
599	&st1_32		("{$T0}","[$Xfer], #16");
600
601	push(@X,shift(@X));		# "rotate" X[]
602}
603
604sub body_00_15 () {
605	(
606	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
607	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
608	'&add	($a,$a,$t4);'.			# h+=Sigma0(a) from the past
609	'&and	($t1,$f,$e)',
610	'&bic	($t4,$g,$e)',
611	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
612	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
613	'&orr	($t1,$t1,$t4)',			# Ch(e,f,g)
614	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
615	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
616	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
617	'&ror	($t0,$t0,"#$Sigma1[0]")',
618	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
619	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
620	'&add	($h,$h,$t0)',			# h+=Sigma1(e)
621	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
622	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
623	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
624	'&ror	($t4,$t4,"#$Sigma0[0]")',
625	'&add	($d,$d,$h)',			# d+=h
626	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
627	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
628	)
629}
630
631$code.=<<___;
632#ifdef	__KERNEL__
633.globl	sha256_block_neon
634#endif
635.type	sha256_block_neon,%function
636.align	4
637sha256_block_neon:
638.Lneon_entry:
639	stp	x29, x30, [sp, #-16]!
640	mov	x29, sp
641	sub	sp,sp,#16*4
642
643	adr	$Ktbl,.LK256
644	add	$num,$inp,$num,lsl#6	// len to point at the end of inp
645
646	ld1.8	{@X[0]},[$inp], #16
647	ld1.8	{@X[1]},[$inp], #16
648	ld1.8	{@X[2]},[$inp], #16
649	ld1.8	{@X[3]},[$inp], #16
650	ld1.32	{$T0},[$Ktbl], #16
651	ld1.32	{$T1},[$Ktbl], #16
652	ld1.32	{$T2},[$Ktbl], #16
653	ld1.32	{$T3},[$Ktbl], #16
654	rev32	@X[0],@X[0]		// yes, even on
655	rev32	@X[1],@X[1]		// big-endian
656	rev32	@X[2],@X[2]
657	rev32	@X[3],@X[3]
658	mov	$Xfer,sp
659	add.32	$T0,$T0,@X[0]
660	add.32	$T1,$T1,@X[1]
661	add.32	$T2,$T2,@X[2]
662	st1.32	{$T0-$T1},[$Xfer], #32
663	add.32	$T3,$T3,@X[3]
664	st1.32	{$T2-$T3},[$Xfer]
665	sub	$Xfer,$Xfer,#32
666
667	ldp	$A,$B,[$ctx]
668	ldp	$C,$D,[$ctx,#8]
669	ldp	$E,$F,[$ctx,#16]
670	ldp	$G,$H,[$ctx,#24]
671	ldr	$t1,[sp,#0]
672	mov	$t2,wzr
673	eor	$t3,$B,$C
674	mov	$t4,wzr
675	b	.L_00_48
676
677.align	4
678.L_00_48:
679___
680	&Xupdate(\&body_00_15);
681	&Xupdate(\&body_00_15);
682	&Xupdate(\&body_00_15);
683	&Xupdate(\&body_00_15);
684$code.=<<___;
685	cmp	$t1,#0				// check for K256 terminator
686	ldr	$t1,[sp,#0]
687	sub	$Xfer,$Xfer,#64
688	bne	.L_00_48
689
690	sub	$Ktbl,$Ktbl,#256		// rewind $Ktbl
691	cmp	$inp,$num
692	mov	$Xfer, #64
693	csel	$Xfer, $Xfer, xzr, eq
694	sub	$inp,$inp,$Xfer			// avoid SEGV
695	mov	$Xfer,sp
696___
697	&Xpreload(\&body_00_15);
698	&Xpreload(\&body_00_15);
699	&Xpreload(\&body_00_15);
700	&Xpreload(\&body_00_15);
701$code.=<<___;
702	add	$A,$A,$t4			// h+=Sigma0(a) from the past
703	ldp	$t0,$t1,[$ctx,#0]
704	add	$A,$A,$t2			// h+=Maj(a,b,c) from the past
705	ldp	$t2,$t3,[$ctx,#8]
706	add	$A,$A,$t0			// accumulate
707	add	$B,$B,$t1
708	ldp	$t0,$t1,[$ctx,#16]
709	add	$C,$C,$t2
710	add	$D,$D,$t3
711	ldp	$t2,$t3,[$ctx,#24]
712	add	$E,$E,$t0
713	add	$F,$F,$t1
714	 ldr	$t1,[sp,#0]
715	stp	$A,$B,[$ctx,#0]
716	add	$G,$G,$t2
717	 mov	$t2,wzr
718	stp	$C,$D,[$ctx,#8]
719	add	$H,$H,$t3
720	stp	$E,$F,[$ctx,#16]
721	 eor	$t3,$B,$C
722	stp	$G,$H,[$ctx,#24]
723	 mov	$t4,wzr
724	 mov	$Xfer,sp
725	b.ne	.L_00_48
726
727	ldr	x29,[x29]
728	add	sp,sp,#16*4+16
729	ret
730.size	sha256_block_neon,.-sha256_block_neon
731___
732}
733
734$code.=<<___;
735#ifndef	__KERNEL__
736.comm	OPENSSL_armcap_P,4,4
737#endif
738___
739
740{   my  %opcode = (
741	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
742	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
743
744    sub unsha256 {
745	my ($mnemonic,$arg)=@_;
746
747	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
748	&&
749	sprintf ".inst\t0x%08x\t//%s %s",
750			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
751			$mnemonic,$arg;
752    }
753}
754
755open SELF,$0;
756while(<SELF>) {
757        next if (/^#!/);
758        last if (!s/^#/\/\// and !/^$/);
759        print;
760}
761close SELF;
762
763foreach(split("\n",$code)) {
764
765	s/\`([^\`]*)\`/eval($1)/ge;
766
767	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
768
769	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
770
771	s/\.[ui]?8(\s)/$1/;
772	s/\.\w?32\b//		and s/\.16b/\.4s/g;
773	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;
774
775	print $_,"\n";
776}
777
778close STDOUT;
779