xref: /linux/arch/arm/crypto/sha256-armv4.pl (revision 6fdcba32711044c35c0e1b094cbd8f3f0b4472c9)
1#!/usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from the OpenSSL project but the author (Andy Polyakov)
5# has relicensed it under the GPLv2. Therefore this program is free software;
6# you can redistribute it and/or modify it under the terms of the GNU General
7# Public License version 2 as published by the Free Software Foundation.
8#
9# The original headers, including the original license headers, are
10# included below for completeness.
11
12# ====================================================================
13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14# project. The module is, however, dual licensed under OpenSSL and
15# CRYPTOGAMS licenses depending on where you obtain it. For further
16# details see http://www.openssl.org/~appro/cryptogams/.
17# ====================================================================
18
19# SHA256 block procedure for ARMv4. May 2007.
20
21# Performance is ~2x better than gcc 3.4 generated code and in "abso-
22# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23# byte [on single-issue Xscale PXA250 core].
24
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 22% improvement on
28# Cortex A8 core and ~20 cycles per processed byte.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 16%
33# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
34
35# September 2013.
36#
37# Add NEON implementation. On Cortex A8 it was measured to process one
38# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40# code (meaning that the latter performs sub-optimally; nothing was done
41# about it).
42
43# May 2014.
44#
45# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
46
# Pick the output file name off the command line: arguments are consumed until
# one looks like a plain "name.ext" file name or the argument list runs out.
# When no such argument was given, $output is false and the generated assembly
# simply goes to the STDOUT this script was started with (e.g. a shell
# redirection), so no open is attempted in that case.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if ($output) {
	# Fail loudly rather than silently producing no (or misplaced) output.
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
49
# Register map.  Each Perl variable holds the name of an ARM register that is
# interpolated into the generated assembly.  Several names deliberately share
# one register ($ctx/$t0, $inp/$t4, $len/$t1, $T1/$t3): the generated code
# saves ctx, inp and the end pointer on the stack and then reuses r0-r3 as
# temporaries.
50$ctx="r0";	$t0="r0";
51$inp="r1";	$t4="r1";
52$len="r2";	$t1="r2";
53$T1="r3";	$t3="r3";
# Registers for the eight working variables a..h.
54$A="r4";
55$B="r5";
56$C="r6";
57$D="r7";
58$E="r8";
59$F="r9";
60$G="r10";
61$H="r11";
# @V lists a..h in their current order; the round generators rotate this list
# by one position after every emitted round instead of moving register values.
62@V=($A,$B,$C,$D,$E,$F,$G,$H);
63$t2="r12";
# Pointer that walks the K256 constant table.
64$Ktbl="r14";
65
# Shift amounts.  @Sigma0/@Sigma1 are used as rotate (ror) amounts in the
# round body; for @sigma0/@sigma1 the first two entries are rotate amounts and
# the third is used as a logical right shift (lsr / vshr) in the message
# schedule code.
66@Sigma0=( 2,13,22);
67@Sigma1=( 6,11,25);
68@sigma0=( 7,18, 3);
69@sigma1=(17,19,10);
70
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one integer-only round to $code.  $i is the round
# index, $a..$h are the register names currently playing the roles a..h.
#
# For $i<16 the first heredoc is emitted as well: it finishes fetching the
# input word into $t1 (a single ldr plus rev on ARMv7+, four byte loads
# combined with orr otherwise) and starts Sigma1(e).  For $i>=16 the caller
# (BODY_16_XX) has already produced X[i] in $t1 and the partial Sigma1(e) in
# $t0, so only the second heredoc is emitted.
#
# Maj(a,b,c) is not added to h inside the round that computes it.  It is left
# in $t3 and added at the top of the following round ("from the past"), which
# is why $t2 and $t3 are swapped on the Perl side after every round.
#
# Lines such as "#if $i==15" are written to the output with $i already
# interpolated; they are preprocessor conditionals in the generated file, not
# Perl conditions.  Back-quoted expressions are evaluated by the final
# post-processing loop of this script.
71sub BODY_00_15 {
72my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
73
74$code.=<<___ if ($i<16);
75#if __ARM_ARCH__>=7
76	@ ldr	$t1,[$inp],#4			@ $i
77# if $i==15
78	str	$inp,[sp,#17*4]			@ make room for $t4
79# endif
80	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
81	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
82	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
83# ifndef __ARMEB__
84	rev	$t1,$t1
85# endif
86#else
87	@ ldrb	$t1,[$inp,#3]			@ $i
88	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
89	ldrb	$t2,[$inp,#2]
90	ldrb	$t0,[$inp,#1]
91	orr	$t1,$t1,$t2,lsl#8
92	ldrb	$t2,[$inp],#4
93	orr	$t1,$t1,$t0,lsl#16
94# if $i==15
95	str	$inp,[sp,#17*4]			@ make room for $t4
96# endif
97	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
98	orr	$t1,$t1,$t2,lsl#24
99	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
100#endif
101___
# Part common to all rounds: store X[i] into the 16-word ring buffer on the
# stack, accumulate h += X[i] + Sigma1(e) + K256[i] + Ch(e,f,g), then d += h
# and h += Sigma0(a).  On round 31 the low byte of the K256 word just loaded is
# compared with 0xf2 (the low byte of the table's last entry); the flags are
# consumed by the code emitted after the round loop.  For $i<15 the next input
# word is prefetched, otherwise two X[] entries needed by the next
# BODY_16_XX round are loaded.
102$code.=<<___;
103	ldr	$t2,[$Ktbl],#4			@ *K256++
104	add	$h,$h,$t1			@ h+=X[i]
105	str	$t1,[sp,#`$i%16`*4]
106	eor	$t1,$f,$g
107	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
108	and	$t1,$t1,$e
109	add	$h,$h,$t2			@ h+=K256[i]
110	eor	$t1,$t1,$g			@ Ch(e,f,g)
111	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
112	add	$h,$h,$t1			@ h+=Ch(e,f,g)
113#if $i==31
114	and	$t2,$t2,#0xff
115	cmp	$t2,#0xf2			@ done?
116#endif
117#if $i<15
118# if __ARM_ARCH__>=7
119	ldr	$t1,[$inp],#4			@ prefetch
120# else
121	ldrb	$t1,[$inp,#3]
122# endif
123	eor	$t2,$a,$b			@ a^b, b^c in next round
124#else
125	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
126	eor	$t2,$a,$b			@ a^b, b^c in next round
127	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
128#endif
129	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
130	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
131	add	$d,$d,$h			@ d+=h
132	eor	$t3,$t3,$b			@ Maj(a,b,c)
133	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
134	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
135___
	# Swap the names, not the registers: the register that now holds
	# Maj(a,b,c) becomes next round's $t2 (added "from the past"), and the
	# one holding a^b becomes next round's $t3 (its b^c).
136	($t2,$t3)=($t3,$t2);
137}
138
# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one round with $i>=16 to $code.  Takes the same
# arguments as BODY_00_15 and forwards them unchanged.
#
# The heredoc below computes the message schedule word from the 16-word ring
# buffer on the stack (all indices are taken modulo 16):
#   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]
# It expects X[i+1] in $t1 and X[i+14] in $t4; the previous round loaded them
# (the commented-out ldr lines show what those loads were).  It also starts
# Sigma1(e) in $t0 exactly as the $i<16 prologue of BODY_00_15 does, then
# hands over to the common round body.
139sub BODY_16_XX {
140my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
141
142$code.=<<___;
143	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
144	@ ldr	$t4,[sp,#`($i+14)%16`*4]
145	mov	$t0,$t1,ror#$sigma0[0]
146	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
147	mov	$t2,$t4,ror#$sigma1[0]
148	eor	$t0,$t0,$t1,ror#$sigma0[1]
149	eor	$t2,$t2,$t4,ror#$sigma1[1]
150	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
151	ldr	$t1,[sp,#`($i+0)%16`*4]
152	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
153	ldr	$t4,[sp,#`($i+9)%16`*4]
154
155	add	$t2,$t2,$t0
156	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
157	add	$t1,$t1,$t2
158	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
159	add	$t1,$t1,$t4			@ X[i]
160___
	# $i>=16 here, so BODY_00_15 emits only its common round body.
161	&BODY_00_15(@_);
162}
163
# Start of the generated file: architecture/instruction-set selection, the
# K256 round-constant table (64 words followed by a zero terminator word) and
# the entry of the integer-only sha256_block_data_order routine.
#
# Stack frame used by the integer code, as set up below: the stmdb pushes
# ctx, inp and the end-of-input pointer (plus r4-r11,lr), then 16 words are
# reserved for X[].  Hence X[0..15] live at [sp,#0..15*4], ctx at [sp,#16*4],
# inp at [sp,#17*4] and the end pointer at [sp,#18*4].
164$code=<<___;
165#ifndef __KERNEL__
166# include "arm_arch.h"
167#else
168# define __ARM_ARCH__ __LINUX_ARM_ARCH__
169# define __ARM_MAX_ARCH__ 7
170#endif
171
172.text
173#if __ARM_ARCH__<7
174.code	32
175#else
176.syntax unified
177# ifdef __thumb2__
178#  define adrl adr
179.thumb
180# else
181.code   32
182# endif
183#endif
184
185.type	K256,%object
186.align	5
187K256:
188.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
189.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
190.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
191.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
192.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
193.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
194.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
195.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
196.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
197.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
198.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
199.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
200.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
201.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
202.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
203.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
204.size	K256,.-K256
205.word	0				@ terminator
206#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
207.LOPENSSL_armcap:
208.word	OPENSSL_armcap_P-sha256_block_data_order
209#endif
210.align	5
211
212.global	sha256_block_data_order
213.type	sha256_block_data_order,%function
214sha256_block_data_order:
215.Lsha256_block_data_order:
216#if __ARM_ARCH__<7
217	sub	r3,pc,#8		@ sha256_block_data_order
218#else
219	adr	r3,.Lsha256_block_data_order
220#endif
221#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
222	ldr	r12,.LOPENSSL_armcap
223	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
224	tst	r12,#ARMV8_SHA256
225	bne	.LARMv8
226	tst	r12,#ARMV7_NEON
227	bne	.LNEON
228#endif
229	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
230	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
231	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
232	sub	$Ktbl,r3,#256+32	@ K256
233	sub	sp,sp,#16*4		@ alloca(X[16])
234.Loop:
235# if __ARM_ARCH__>=7
236	ldr	$t1,[$inp],#4
237# else
238	ldrb	$t1,[$inp,#3]
239# endif
240	eor	$t3,$B,$C		@ magic
241	eor	$t2,$t2,$t2
242___
# Rounds 0..15 are fully unrolled.  Rounds 16..31 are emitted once after the
# .Lrounds_16_xx label; the generated code branches back to that label until
# round 31's check finds the low byte 0xf2 of the last K256 entry.  @V is
# rotated after each round so that every register takes the next role.
243for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
244$code.=".Lrounds_16_xx:\n";
245for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Loop tail and epilogue: add the still-pending Maj term, add the working
# variables into the context words and store them back, rewind the K256
# pointer, loop while inp has not reached the end pointer, then release the
# frame (16 X[] words plus the three saved argument words) and return.
246$code.=<<___;
247#if __ARM_ARCH__>=7
248	ite	eq			@ Thumb2 thing, sanity check in ARM
249#endif
250	ldreq	$t3,[sp,#16*4]		@ pull ctx
251	bne	.Lrounds_16_xx
252
253	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
254	ldr	$t0,[$t3,#0]
255	ldr	$t1,[$t3,#4]
256	ldr	$t2,[$t3,#8]
257	add	$A,$A,$t0
258	ldr	$t0,[$t3,#12]
259	add	$B,$B,$t1
260	ldr	$t1,[$t3,#16]
261	add	$C,$C,$t2
262	ldr	$t2,[$t3,#20]
263	add	$D,$D,$t0
264	ldr	$t0,[$t3,#24]
265	add	$E,$E,$t1
266	ldr	$t1,[$t3,#28]
267	add	$F,$F,$t2
268	ldr	$inp,[sp,#17*4]		@ pull inp
269	ldr	$t2,[sp,#18*4]		@ pull inp+len
270	add	$G,$G,$t0
271	add	$H,$H,$t1
272	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
273	cmp	$inp,$t2
274	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
275	bne	.Loop
276
277	add	sp,sp,#`16+3`*4	@ destroy frame
278#if __ARM_ARCH__>=5
279	ldmia	sp!,{r4-r11,pc}
280#else
281	ldmia	sp!,{r4-r11,lr}
282	tst	lr,#1
283	moveq	pc,lr			@ be binary compatible with V4, yet
284	bx	lr			@ interoperable with Thumb ISA:-)
285#endif
286.size	sha256_block_data_order,.-sha256_block_data_order
287___
288######################################################################
289# NEON stuff
290#
# Lexical scope for the NEON code generator (closed by the matching "}}}").
291{{{
# @X: the four q registers holding the 16 message words being processed.
292my @X=map("q$_",(0..3));
# Scratch registers: $T0..$T3 are q registers, $T4/$T5 are d registers.
293my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# $Xfer shares its register with $t4; the generated code uses it as the store
# pointer for the X[i]+K[i] words that the scalar rounds read back from [sp].
294my $Xfer=$t4;
# Round counter; advanced by the last snippet returned from body_00_15.
295my $j=0;
296
# Dlo("qN") / Dhi("qN"): return the name of the d register that forms the low
# (d<2N>) respectively high (d<2N+1>) half of NEON quad register qN.  An empty
# string is returned when the argument contains no q register name.
#
# The former empty prototype "()" has been dropped: these subs do take one
# argument, and the prototype only went unnoticed because every call in this
# file uses the prototype-bypassing "&name(...)" form.
sub Dlo { my $qreg = shift; return $qreg=~m|q([1]?[0-9])| ? "d".($1*2)   : ""; }
sub Dhi { my $qreg = shift; return $qreg=~m|q([1]?[0-9])| ? "d".($1*2+1) : ""; }
299
# Catch-all for calls to subs that are not defined in this file, e.g.
# &vext_8(...) or &add(...).  The called name, with any package qualifier
# removed and its first "_" turned into ".", is used as the instruction
# mnemonic.  The arguments become the comma-separated operand list; the last
# argument gets a "#" prefix when it compares equal to its own numeric value
# (i.e. it is an immediate).  The resulting line is appended to $code.
300sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
301{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
302  my $arg = pop;
303    $arg = "#$arg" if ($arg*1 eq $arg);
304    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
305}
306
# Xupdate($body)
#
# Emit the NEON message-schedule update for four words, interleaved with the
# scalar instructions of four rounds.
#
# $body is a code reference (body_00_15 in this file) that returns a list of
# strings, each holding Perl code that emits part of one scalar round.  It is
# called four times; the resulting snippets are eval'ed a couple at a time
# between the NEON instructions, which fixes the instruction order of the
# generated code.  The NEON instructions themselves (&vext_8, &vshr_u32, ...)
# are produced through AUTOLOAD.
#
# The lexicals $a..$h declared here are the variables the snippets assign
# from @V; string eval resolves them in this sub's scope.
#
# At the end the next four K256 words are added to the updated X[0] register,
# the sum is stored through $Xfer, and @X is rotated so that the next call
# works on the following four words.
307sub Xupdate()
308{ use integer;
309  my $body = shift;
310  my @insns = (&$body,&$body,&$body,&$body);
311  my ($a,$b,$c,$d,$e,$f,$g,$h);
312
313	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
314	 eval(shift(@insns));
315	 eval(shift(@insns));
316	 eval(shift(@insns));
317	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
318	 eval(shift(@insns));
319	 eval(shift(@insns));
320	 eval(shift(@insns));
321	&vshr_u32	($T2,$T0,$sigma0[0]);
322	 eval(shift(@insns));
323	 eval(shift(@insns));
324	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
325	 eval(shift(@insns));
326	 eval(shift(@insns));
327	&vshr_u32	($T1,$T0,$sigma0[2]);
328	 eval(shift(@insns));
329	 eval(shift(@insns));
330	&vsli_32	($T2,$T0,32-$sigma0[0]);
331	 eval(shift(@insns));
332	 eval(shift(@insns));
333	&vshr_u32	($T3,$T0,$sigma0[1]);
334	 eval(shift(@insns));
335	 eval(shift(@insns));
336	&veor		($T1,$T1,$T2);
337	 eval(shift(@insns));
338	 eval(shift(@insns));
339	&vsli_32	($T3,$T0,32-$sigma0[1]);
340	 eval(shift(@insns));
341	 eval(shift(@insns));
342	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
343	 eval(shift(@insns));
344	 eval(shift(@insns));
345	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
346	 eval(shift(@insns));
347	 eval(shift(@insns));
348	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
349	 eval(shift(@insns));
350	 eval(shift(@insns));
351	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
352	 eval(shift(@insns));
353	 eval(shift(@insns));
354	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
355	 eval(shift(@insns));
356	 eval(shift(@insns));
357	  &veor		($T5,$T5,$T4);
358	 eval(shift(@insns));
359	 eval(shift(@insns));
360	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
361	 eval(shift(@insns));
362	 eval(shift(@insns));
363	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
364	 eval(shift(@insns));
365	 eval(shift(@insns));
366	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
367	 eval(shift(@insns));
368	 eval(shift(@insns));
369	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
370	 eval(shift(@insns));
371	 eval(shift(@insns));
372	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
373	 eval(shift(@insns));
374	 eval(shift(@insns));
375	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
376	 eval(shift(@insns));
377	 eval(shift(@insns));
378	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
379	 eval(shift(@insns));
380	 eval(shift(@insns));
381	  &veor		($T5,$T5,$T4);
382	 eval(shift(@insns));
383	 eval(shift(@insns));
384	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
385	 eval(shift(@insns));
386	 eval(shift(@insns));
387	&vld1_32	("{$T0}","[$Ktbl,:128]!");
388	 eval(shift(@insns));
389	 eval(shift(@insns));
390	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
391	 eval(shift(@insns));
392	 eval(shift(@insns));
393	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
394	 eval(shift(@insns));
395	 eval(shift(@insns));
396	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
397	 eval(shift(@insns));
398	 eval(shift(@insns));
399	&vadd_i32	($T0,$T0,@X[0]);
	 # Emit every remaining scalar snippet except the last two, which
	 # are placed after the store below.
400	 while($#insns>=2) { eval(shift(@insns)); }
401	&vst1_32	("{$T0}","[$Xfer,:128]!");
402	 eval(shift(@insns));
403	 eval(shift(@insns));
404
405	push(@X,shift(@X));		# "rotate" X[]
406}
407
# Xpreload($body)
#
# Counterpart of Xupdate for rounds that need no message-schedule update.
# The scalar snippets of four rounds (obtained from $body exactly as in
# Xupdate) are interleaved with only three kinds of NEON work: load the next
# four K256 words, byte-reverse each 32-bit word of X[0], and store their sum
# through $Xfer.  In this file it is called after the next input block has
# been loaded into @X, so the values stored are presumably X+K for the
# following block's first rounds.  @X is rotated at the end as in Xupdate.
408sub Xpreload()
409{ use integer;
410  my $body = shift;
411  my @insns = (&$body,&$body,&$body,&$body);
412  my ($a,$b,$c,$d,$e,$f,$g,$h);
413
414	 eval(shift(@insns));
415	 eval(shift(@insns));
416	 eval(shift(@insns));
417	 eval(shift(@insns));
418	&vld1_32	("{$T0}","[$Ktbl,:128]!");
419	 eval(shift(@insns));
420	 eval(shift(@insns));
421	 eval(shift(@insns));
422	 eval(shift(@insns));
423	&vrev32_8	(@X[0],@X[0]);
424	 eval(shift(@insns));
425	 eval(shift(@insns));
426	 eval(shift(@insns));
427	 eval(shift(@insns));
428	&vadd_i32	($T0,$T0,@X[0]);
429	 foreach (@insns) { eval; }	# remaining instructions
430	&vst1_32	("{$T0}","[$Xfer,:128]!");
431
432	push(@X,shift(@X));		# "rotate" X[]
433}
434
# body_00_15()
#
# Return the scalar part of one round as a list of strings.  Each string is
# Perl code that, when eval'ed by Xupdate/Xpreload, emits one instruction (or
# a few) through AUTOLOAD.  Nothing is emitted by calling this sub itself.
#
# The first snippet binds $a..$h to the current order of @V.  Here $t1 is
# expected to hold X[i]+K[i] already (stored earlier through $Xfer and read
# back from the stack).  The ldr snippet picks the next value for $t1
# depending on the round counter $j:
#   - normally the next X+K word at [sp,#4*(($j+1)&15)]
#   - at $j==15 the word at [$Ktbl], which the emitted loop tail tests against
#     zero to detect the K256 terminator
#   - at $j==31 the word at [sp,#64], where the NEON prologue saved ctx
# The last snippet emits nothing: it advances $j, rotates @V and swaps the
# names $t2/$t3 so that the pending Maj term is added in the next round.
435sub body_00_15 () {
436	(
437	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
438	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
439	'&eor	($t1,$f,$g)',
440	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
441	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
442	'&and	($t1,$t1,$e)',
443	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
444	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
445	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
446	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
447	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
448	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
449	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
450	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
451	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
452	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
453	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
454	'&add	($d,$d,$h)',			# d+=h
455	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
456	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
457	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
458	)
459}
460
# NEON entry point and prologue.  The prologue carves a 16-byte aligned frame
# out of the stack: [sp,#0..63] is the X[i]+K[i] transfer area written through
# $Xfer, [sp,#64] holds ctx, [sp,#68] inp, [sp,#72] the end-of-input pointer
# and [sp,#76] the original sp.  The first input block is loaded, byte-swapped
# and stored pre-added with the first 16 K256 words before the round loop
# is entered.
461$code.=<<___;
462#if __ARM_MAX_ARCH__>=7
463.arch	armv7-a
464.fpu	neon
465
466.global	sha256_block_data_order_neon
467.type	sha256_block_data_order_neon,%function
468.align	4
469sha256_block_data_order_neon:
470.LNEON:
471	stmdb	sp!,{r4-r12,lr}
472
473	sub	$H,sp,#16*4+16
474	adrl	$Ktbl,K256
475	bic	$H,$H,#15		@ align for 128-bit stores
476	mov	$t2,sp
477	mov	sp,$H			@ alloca
478	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
479
480	vld1.8		{@X[0]},[$inp]!
481	vld1.8		{@X[1]},[$inp]!
482	vld1.8		{@X[2]},[$inp]!
483	vld1.8		{@X[3]},[$inp]!
484	vld1.32		{$T0},[$Ktbl,:128]!
485	vld1.32		{$T1},[$Ktbl,:128]!
486	vld1.32		{$T2},[$Ktbl,:128]!
487	vld1.32		{$T3},[$Ktbl,:128]!
488	vrev32.8	@X[0],@X[0]		@ yes, even on
489	str		$ctx,[sp,#64]
490	vrev32.8	@X[1],@X[1]		@ big-endian
491	str		$inp,[sp,#68]
492	mov		$Xfer,sp
493	vrev32.8	@X[2],@X[2]
494	str		$len,[sp,#72]
495	vrev32.8	@X[3],@X[3]
496	str		$t2,[sp,#76]		@ save original sp
497	vadd.i32	$T0,$T0,@X[0]
498	vadd.i32	$T1,$T1,@X[1]
499	vst1.32		{$T0},[$Xfer,:128]!
500	vadd.i32	$T2,$T2,@X[2]
501	vst1.32		{$T1},[$Xfer,:128]!
502	vadd.i32	$T3,$T3,@X[3]
503	vst1.32		{$T2},[$Xfer,:128]!
504	vst1.32		{$T3},[$Xfer,:128]!
505
506	ldmia		$ctx,{$A-$H}
507	sub		$Xfer,$Xfer,#64
508	ldr		$t1,[sp,#0]
509	eor		$t2,$t2,$t2
510	eor		$t3,$B,$C
511	b		.L_00_48
512
513.align	4
514.L_00_48:
515___
	# Body of the .L_00_48 loop: four Xupdate calls emit 16 rounds together
	# with the schedule update for the next 16 words.
516	&Xupdate(\&body_00_15);
517	&Xupdate(\&body_00_15);
518	&Xupdate(\&body_00_15);
519	&Xupdate(\&body_00_15);
# Loop control: $t1 was loaded from [$Ktbl] by round 15's snippet; a zero there
# is the K256 terminator and ends the loop.  Afterwards the next input block
# is loaded; when inp already equals the end pointer, inp is first moved back
# by 64 bytes so that the loads stay inside the input ("avoid SEGV"), and the
# saved inp is only updated in the other case.
520$code.=<<___;
521	teq	$t1,#0				@ check for K256 terminator
522	ldr	$t1,[sp,#0]
523	sub	$Xfer,$Xfer,#64
524	bne	.L_00_48
525
526	ldr		$inp,[sp,#68]
527	ldr		$t0,[sp,#72]
528	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
529	teq		$inp,$t0
530	it		eq
531	subeq		$inp,$inp,#64		@ avoid SEGV
532	vld1.8		{@X[0]},[$inp]!		@ load next input block
533	vld1.8		{@X[1]},[$inp]!
534	vld1.8		{@X[2]},[$inp]!
535	vld1.8		{@X[3]},[$inp]!
536	it		ne
537	strne		$inp,[sp,#68]
538	mov		$Xfer,sp
539___
	# Remaining 16 rounds, interleaved with preparing X+K for the block
	# that was just loaded.
540	&Xpreload(\&body_00_15);
541	&Xpreload(\&body_00_15);
542	&Xpreload(\&body_00_15);
543	&Xpreload(\&body_00_15);
# Block epilogue: $t1 holds the ctx pointer (loaded from [sp,#64] by round
# 31's snippet).  The working variables are added into the context and
# stored back, then the code either loops for the next block or restores the
# original sp and returns.  NOTE(review): the ne/eq conditions used here
# appear to rely on the flags of the "teq $inp,$t0" above still being intact;
# none of the instructions emitted in between carries a flag-setting suffix.
544$code.=<<___;
545	ldr	$t0,[$t1,#0]
546	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
547	ldr	$t2,[$t1,#4]
548	ldr	$t3,[$t1,#8]
549	ldr	$t4,[$t1,#12]
550	add	$A,$A,$t0			@ accumulate
551	ldr	$t0,[$t1,#16]
552	add	$B,$B,$t2
553	ldr	$t2,[$t1,#20]
554	add	$C,$C,$t3
555	ldr	$t3,[$t1,#24]
556	add	$D,$D,$t4
557	ldr	$t4,[$t1,#28]
558	add	$E,$E,$t0
559	str	$A,[$t1],#4
560	add	$F,$F,$t2
561	str	$B,[$t1],#4
562	add	$G,$G,$t3
563	str	$C,[$t1],#4
564	add	$H,$H,$t4
565	str	$D,[$t1],#4
566	stmia	$t1,{$E-$H}
567
568	ittte	ne
569	movne	$Xfer,sp
570	ldrne	$t1,[sp,#0]
571	eorne	$t2,$t2,$t2
572	ldreq	sp,[sp,#76]			@ restore original sp
573	itt	ne
574	eorne	$t3,$B,$C
575	bne	.L_00_48
576
577	ldmia	sp!,{r4-r12,pc}
578.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
579#endif
580___
581}}}
582######################################################################
583# ARMv8 stuff
584#
# Lexical scope for the ARMv8 (SHA-256 instruction) code path.
585{{{
# Register allocation: q0/q1 hold the state halves ABCD/EFGH, q2 is a scratch
# copy of ABCD, q8-q11 hold the message words, q12/q13 alternate as holders of
# the K256 words, q14/q15 keep the state from before the block.  Inside this
# scope $Ktbl is r3 rather than the file-level r14.
586my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
587my @MSG=map("q$_",(8..11));
588my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
589my $Ktbl="r3";
590
# Entry and per-block prologue.  The whole path is only emitted for
# non-kernel builds (see the #if).  The sha256* mnemonics written here are
# replaced by INST(...) byte sequences in the final post-processing loop, and
# "ret" is replaced by "bx lr" there as well.
591$code.=<<___;
592#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
593
594# ifdef __thumb2__
595#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
596# else
597#  define INST(a,b,c,d)	.byte	a,b,c,d
598# endif
599
600.type	sha256_block_data_order_armv8,%function
601.align	5
602sha256_block_data_order_armv8:
603.LARMv8:
604	vld1.32	{$ABCD,$EFGH},[$ctx]
605# ifdef __thumb2__
606	adr	$Ktbl,.LARMv8
607	sub	$Ktbl,$Ktbl,#.LARMv8-K256
608# else
609	adrl	$Ktbl,K256
610# endif
611	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
612
613.Loop_v8:
614	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
615	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
616	vld1.32		{$W0},[$Ktbl]!
617	vrev32.8	@MSG[0],@MSG[0]
618	vrev32.8	@MSG[1],@MSG[1]
619	vrev32.8	@MSG[2],@MSG[2]
620	vrev32.8	@MSG[3],@MSG[3]
621	vmov		$ABCD_SAVE,$ABCD	@ offload
622	vmov		$EFGH_SAVE,$EFGH
623	teq		$inp,$len
624___
# Twelve groups that include the message schedule update (sha256su0 /
# sha256su1).  After each group the Perl names $W0/$W1 are swapped, so the
# K256 load for the next group goes into the register not currently being
# consumed, and @MSG is rotated by one register.
625for($i=0;$i<12;$i++) {
626$code.=<<___;
627	vld1.32		{$W1},[$Ktbl]!
628	vadd.i32	$W0,$W0,@MSG[0]
629	sha256su0	@MSG[0],@MSG[1]
630	vmov		$abcd,$ABCD
631	sha256h		$ABCD,$EFGH,$W0
632	sha256h2	$EFGH,$abcd,$W0
633	sha256su1	@MSG[0],@MSG[2],@MSG[3]
634___
635	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
636}
# Four final groups without schedule update.  The last K256 load has no
# writeback, hence the rewind by 256-16 bytes.  The saved state is then added
# back and the loop repeats while the condition is "ne".
# NOTE(review): that condition presumably comes from the "teq $inp,$len" at
# the top of the loop; no flag-setting instruction is visible in between.
637$code.=<<___;
638	vld1.32		{$W1},[$Ktbl]!
639	vadd.i32	$W0,$W0,@MSG[0]
640	vmov		$abcd,$ABCD
641	sha256h		$ABCD,$EFGH,$W0
642	sha256h2	$EFGH,$abcd,$W0
643
644	vld1.32		{$W0},[$Ktbl]!
645	vadd.i32	$W1,$W1,@MSG[1]
646	vmov		$abcd,$ABCD
647	sha256h		$ABCD,$EFGH,$W1
648	sha256h2	$EFGH,$abcd,$W1
649
650	vld1.32		{$W1},[$Ktbl]
651	vadd.i32	$W0,$W0,@MSG[2]
652	sub		$Ktbl,$Ktbl,#256-16	@ rewind
653	vmov		$abcd,$ABCD
654	sha256h		$ABCD,$EFGH,$W0
655	sha256h2	$EFGH,$abcd,$W0
656
657	vadd.i32	$W1,$W1,@MSG[3]
658	vmov		$abcd,$ABCD
659	sha256h		$ABCD,$EFGH,$W1
660	sha256h2	$EFGH,$abcd,$W1
661
662	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
663	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
664	it		ne
665	bne		.Loop_v8
666
667	vst1.32		{$ABCD,$EFGH},[$ctx]
668
669	ret		@ bx lr
670.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
671#endif
672___
673}}}
# Trailer of the generated file: an identification string and, for non-kernel
# builds with __ARM_MAX_ARCH__>=7, the OPENSSL_armcap_P common symbol that the
# entry code reads to select a code path.
674$code.=<<___;
675.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
676.align	2
677#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
678.comm   OPENSSL_armcap_P,4,4
679#endif
680___
681
# Copy this script's leading comment block (license and change history) to the
# output, turning the leading "#" of each line into the assembler comment
# character "@".  The "#!" line is skipped, empty lines are passed through,
# and copying stops at the first line that is neither a comment nor empty.
#
# The script path is opened with an explicit read mode so that unusual
# characters in $0 cannot be taken for an open mode.  The header is cosmetic,
# so failing to read it produces a warning rather than aborting generation.
if (open(my $self,'<',$0)) {
	while(<$self>) {
		next if (/^#!/);
		last if (!s/^#/@/ and !/^$/);
		print;
	}
	close $self;
} else {
	warn "$0: cannot re-read script to copy its header: $!\n";
}
689
{   # Base encodings of the SHA-256 instructions; the register fields are
    # OR-ed in by unsha256().
    my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # unsha256($mnemonic,$arg)
    #
    # Translate one SHA-256 instruction into an "INST(b0,b1,b2,b3)" macro
    # invocation carrying the four bytes of its encoding (least significant
    # byte first), followed by the original text as an "@" comment.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand list of two or three q registers, e.g. "q0,q1,q12"
    #
    # Returns the replacement text.  If the mnemonic is unknown or the
    # operands are not q registers in the expected form, the instruction is
    # returned as "$mnemonic\t$arg" instead of being replaced by an empty
    # string or a bogus encoding, so the assembler gets to see it and report
    # the problem.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	return "$mnemonic\t$arg" unless (exists $opcode{$mnemonic});
	return "$mnemonic\t$arg"
	    unless ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/);

	# First, optional middle and last operand register numbers.  The
	# middle one is absent in the two-operand form and contributes
	# nothing to the encoding then.
	my ($first,$middle,$last) = ($1, defined($2) ? $2 : 0, $3);

	# Each register number is split into its low three bits and bit 3,
	# which go to separate places in the instruction word.
	my $word = $opcode{$mnemonic}|(($first&7)<<13)|(($first&8)<<19)
				     |(($middle&7)<<17)|(($middle&8)<<4)
				     |(($last&7)<<1)  |(($last&8)<<2);

	# since ARMv7 instructions are always encoded little-endian.
	# correct solution is to use .inst directive, but older
	# assemblers don't implement it:-(
	return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
711
# Post-process the generated text line by line and print it:
#  - every `...` span is replaced by the result of evaluating its content as
#    Perl (this resolves the arithmetic written in the templates)
#  - sha256* instructions with q-register operands are replaced by their
#    INST() byte encoding via unsha256()
#  - "ret" becomes "bx lr"; on lines without "ret", a literal "bx lr" is
#    replaced by its instruction word
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;

	s/\bret\b/bx	lr/g		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Closing flushes the output; checking the result is the only way to notice
# buffered write errors (e.g. a full disk) instead of leaving a truncated file.
close STDOUT or die "error closing STDOUT: $!";
725