xref: /linux/arch/arm/crypto/sha256-armv4.pl (revision c8bfe3fad4f86a029da7157bae9699c816f0c309)
1#!/usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from the OpenSSL project but the author (Andy Polyakov)
5# has relicensed it under the GPLv2. Therefore this program is free software;
6# you can redistribute it and/or modify it under the terms of the GNU General
7# Public License version 2 as published by the Free Software Foundation.
8#
9# The original headers, including the original license headers, are
10# included below for completeness.
11
12# ====================================================================
13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14# project. The module is, however, dual licensed under OpenSSL and
15# CRYPTOGAMS licenses depending on where you obtain it. For further
16# details see https://www.openssl.org/~appro/cryptogams/.
17# ====================================================================
18
19# SHA256 block procedure for ARMv4. May 2007.
20
21# Performance is ~2x better than gcc 3.4 generated code and in "abso-
22# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23# byte [on single-issue Xscale PXA250 core].
24
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 22% improvement on
28# Cortex A8 core and ~20 cycles per processed byte.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 16%
33# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
34
35# September 2013.
36#
37# Add NEON implementation. On Cortex A8 it was measured to process one
38# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40# code (meaning that latter performs sub-optimally, nothing was done
41# about it).
42
43# May 2014.
44#
45# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
46
# Select the output file from the command line: discard arguments until one
# looks like a plain "name.ext" file name.  $output is left false when no
# such argument exists (e.g. when the caller redirects stdout itself).
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually supplied, and fail
# loudly instead of silently writing the generated assembly somewhere else.
if (defined($output) && $output ne "") {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
49
# Register map for the integer-only code path.  The function arguments
# (ctx, inp, len) arrive in r0-r2; the temporaries t0/t4/t1/t3 reuse r0-r3.
($ctx,$inp,$len,$T1)=("r0","r1","r2","r3");
($t0,$t4,$t1,$t3)=("r0","r1","r2","r3");

# The eight working variables live in r4-r11; @V is rotated after each round.
($A,$B,$C,$D,$E,$F,$G,$H)=map("r$_",(4..11));
@V=($A,$B,$C,$D,$E,$F,$G,$H);

$t2="r12";
$Ktbl="r14";

# Rotate/shift amounts of the SHA-256 Sigma/sigma functions.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
70
# Append the assembly for one SHA-256 round to $code.
#
#   $i      round number; rounds below 16 also get the code that brings the
#           big-endian input word X[i] into $t1 (ldr+rev on ARMv7 and up,
#           four byte loads otherwise)
#   $a..$h  registers holding the working variables for this round
#
# Maj(a,b,c) is left in $t3 and only added to h at the start of the next
# round ("from the past"), so $t2 and $t3 trade places on return.
sub BODY_00_15 {
my $i = shift;
my ($a,$b,$c,$d,$e,$f,$g,$h) = @_;

if ($i<16) {
$code.=<<___;
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
}

# Part common to all rounds.  The "#if $i..." lines are resolved by the C
# preprocessor after Perl has substituted the round number.
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	# swap the roles of t2 and t3 for the next round
	my $was_t2 = $t2;
	$t2 = $t3;
	$t3 = $was_t2;
}
138
# Append the assembly for one round with $i >= 16: first the message
# schedule step that produces X[i] in $t1 from the 16-word circular buffer
# kept on the stack, then the ordinary round body via BODY_00_15.
#
# Takes the same arguments as BODY_00_15: ($i, $a..$h).  The Sigma1(e)
# computation normally emitted by BODY_00_15 for $i<16 is issued here
# instead, interleaved with the schedule arithmetic.
sub BODY_16_XX {
my @round = @_;
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @round;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	# the rest of the round is shared with rounds 0..15
	&BODY_00_15(@round);
}
163
# Integer-only code path: preamble, the K256 constant table and the entry
# point sha256_block_data_order, which dispatches to the NEON/ARMv8 variants
# when built outside the kernel.
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code   32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___

# Rounds 0..15 are fully unrolled; @V is rotated after every round so each
# round sees the working variables under their new names.
$i=0;
while ($i<16) {
	&BODY_00_15($i,@V);
	unshift(@V,pop(@V));
	$i++;
}

# Rounds 16..31 are emitted once; the generated code branches back to
# .Lrounds_16_xx until the low byte of the K256 word just used is 0xf2.
$code.=".Lrounds_16_xx:\n";
while ($i<32) {
	&BODY_16_XX($i,@V);
	unshift(@V,pop(@V));
	$i++;
}

# Fold the working variables into the context, advance to the next input
# block, and finally tear down the frame and return.
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
287######################################################################
288# NEON stuff
289#
290{{{
# NEON register aliases: @X holds the four message-schedule vectors,
# $T0..$T3 are quad temporaries and $T4/$T5 double-word temporaries.
my @X=("q0","q1","q2","q3");
my ($T0,$T1,$T2,$T3)=map("q$_",(8..11));
my ($T4,$T5)=("d24","d25");
my $Xfer=$t4;		# same register as the integer path's t4
my $j=0;		# round counter advanced by the body_00_15 snippets
295
# Return the name of the double-word register that forms the low half of
# the quad register named in the argument ("qN" -> "d<2N>"), or "" when the
# argument contains no quad register name.  Declared without the former
# empty prototype, which wrongly advertised a sub taking no arguments.
sub Dlo
{ my ($reg) = @_;
  return $reg=~m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Return the name of the double-word register that forms the high half of
# the quad register named in the argument ("qN" -> "d<2N+1>"), or "" when
# the argument contains no quad register name.  Declared without the former
# empty prototype, which wrongly advertised a sub taking no arguments.
sub Dhi
{ my ($reg) = @_;
  return $reg=~m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
298
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ # Any call to an undefined sub, e.g. &vext_8(...), lands here and is
  # turned into one line of assembly appended to $code.
  (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# drop the package qualifier
  $mnemonic =~ s/_/./;				# first "_" becomes ".": vext_8 -> vext.8

  my @operands = @_;
  my $last = pop(@operands);
  # a purely numeric final operand is an immediate
  if ($last*1 eq $last) { $last = "#$last"; }

  $code .= "\t$mnemonic\t".join(',',@operands,$last)."\n";
}
305
# Emit the NEON message-schedule update for the vector $X[0] (four new
# schedule words), interleaved with the scalar code of four rounds.
#
#   $body   code ref returning the list of Perl snippets for one round;
#           each snippet is eval'ed here and emits scalar instructions
#
# The table below lists every NEON instruction in emission order together
# with the number of scalar snippets issued right after it.  On return @X
# has been rotated by one position.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns;
  push(@insns,&$body()) for (1..4);
  # targets of the "($a,...,$h)=@V" assignment inside the eval'ed snippets
  my ($a,$b,$c,$d,$e,$f,$g,$h);

  my ($x0lo,$x0hi,$x3hi) = (&Dlo($X[0]),&Dhi($X[0]),&Dhi($X[3]));

  my @schedule = (
	[3,"vext_8",	$T0,$X[0],$X[1],4],		# X[1..4]
	[3,"vext_8",	$T1,$X[2],$X[3],4],		# X[9..12]
	[2,"vshr_u32",	$T2,$T0,$sigma0[0]],
	[2,"vadd_i32",	$X[0],$X[0],$T1],		# X[0..3] += X[9..12]
	[2,"vshr_u32",	$T1,$T0,$sigma0[2]],
	[2,"vsli_32",	$T2,$T0,32-$sigma0[0]],
	[2,"vshr_u32",	$T3,$T0,$sigma0[1]],
	[2,"veor",	$T1,$T1,$T2],
	[2,"vsli_32",	$T3,$T0,32-$sigma0[1]],
	[2,"vshr_u32",	$T4,$x3hi,$sigma1[0]],
	[2,"veor",	$T1,$T1,$T3],			# sigma0(X[1..4])
	[2,"vsli_32",	$T4,$x3hi,32-$sigma1[0]],
	[2,"vshr_u32",	$T5,$x3hi,$sigma1[2]],
	[2,"vadd_i32",	$X[0],$X[0],$T1],		# X[0..3] += sigma0(X[1..4])
	[2,"veor",	$T5,$T5,$T4],
	[2,"vshr_u32",	$T4,$x3hi,$sigma1[1]],
	[2,"vsli_32",	$T4,$x3hi,32-$sigma1[1]],
	[2,"veor",	$T5,$T5,$T4],			# sigma1(X[14..15])
	[2,"vadd_i32",	$x0lo,$x0lo,$T5],		# X[0..1] += sigma1(X[14..15])
	[2,"vshr_u32",	$T4,$x0lo,$sigma1[0]],
	[2,"vsli_32",	$T4,$x0lo,32-$sigma1[0]],
	[2,"vshr_u32",	$T5,$x0lo,$sigma1[2]],
	[2,"veor",	$T5,$T5,$T4],
	[2,"vshr_u32",	$T4,$x0lo,$sigma1[1]],
	[2,"vld1_32",	"{$T0}","[$Ktbl,:128]!"],
	[2,"vsli_32",	$T4,$x0lo,32-$sigma1[1]],
	[2,"veor",	$T5,$T5,$T4],			# sigma1(X[16..17])
	[2,"vadd_i32",	$x0hi,$x0hi,$T5],		# X[2..3] += sigma1(X[16..17])
  );

  foreach my $slot (@schedule) {
	my ($gap,$mnemonic,@operands) = @$slot;
	# call by name: the mnemonic is not a defined sub, so AUTOLOAD emits it
	&$mnemonic(@operands);
	for (my $k=0; $k<$gap; $k++) { eval(shift(@insns)); }
  }

	&vadd_i32	($T0,$T0,$X[0]);
	# issue everything but the last two scalar snippets before the store
	 while (scalar(@insns)>2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
406
# Emit the scalar code of four rounds interleaved with the NEON code that
# byte-reverses $X[0], adds the next four K256 words to it and stores the
# sum through $Xfer.
#
#   $body   code ref returning the list of Perl snippets for one round;
#           each snippet is eval'ed here and emits scalar instructions
#
# Four scalar snippets precede each of the first three NEON instructions;
# all remaining snippets are issued before the final store.  On return @X
# has been rotated by one position.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns;
  push(@insns,&$body()) for (1..4);
  # targets of the "($a,...,$h)=@V" assignment inside the eval'ed snippets
  my ($a,$b,$c,$d,$e,$f,$g,$h);

  my @schedule = (
	["vld1_32",	"{$T0}","[$Ktbl,:128]!"],
	["vrev32_8",	$X[0],$X[0]],
	["vadd_i32",	$T0,$T0,$X[0]],
  );

  foreach my $slot (@schedule) {
	my ($mnemonic,@operands) = @$slot;
	for (my $k=0; $k<4; $k++) { eval(shift(@insns)); }
	# call by name: the mnemonic is not a defined sub, so AUTOLOAD emits it
	&$mnemonic(@operands);
  }

	 while (@insns) { eval(shift(@insns)); }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
433
# Return the scalar code of one SHA-256 round as a list of Perl snippets.
# The caller eval's them one at a time, which lets it slot NEON
# instructions in between.  The snippets read the current register
# assignment from @V, emit instructions through the AUTOLOAD thunk, and the
# last one advances $j, rotates @V and swaps $t2/$t3 for the next round.
sub body_00_15 () {
	return (
	# bind this round's working variables, then h+=X[i]+K[i]
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;&add	($h,$h,$t1)',
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	# h+=Maj(a,b,c) from the past
	'&add	($a,$a,$t2)',
	'&and	($t1,$t1,$e)',
	# Sigma1(e)
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	# Ch(e,f,g)
	'&eor	($t1,$t1,$g)',
	# h+=Sigma1(e)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',
	# a^b, b^c in next round
	'&eor	($t2,$a,$b)',
	# Sigma0(a)
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',
	# h+=Ch(e,f,g)
	'&add	($h,$h,$t1)',
	# load for the next round: the next pre-added X[i]+K[i] word from the
	# stack, the K256 word at $Ktbl after round 15, or the saved ctx
	# pointer after round 31
	join('',
	    '&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);',
	    '&ldr	($t1,"[$Ktbl]")				if ($j==15);',
	    '&ldr	($t1,"[sp,#64]")			if ($j==31)'),
	# (b^c)&=(a^b)
	'&and	($t3,$t3,$t2)',
	# d+=h
	'&add	($d,$d,$h)',
	# h+=Sigma0(a), then Maj(a,b,c)
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");&eor	($t3,$t3,$b)',
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	);
}
459
# NEON entry point: set up a 16-byte aligned frame, load and byte-swap the
# first input block, and store X[i]+K[i] for the first 16 rounds.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,.Lsha256_block_data_order
	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___

# One pass of .L_00_48 covers 16 rounds: four schedule updates, each
# interleaved with the scalar code of four rounds.
foreach my $pass (1..4) {
	&Xupdate(\&body_00_15);
}

# Repeat until the K256 terminator word is read, then fetch the next input
# block (re-reading the last one when the input is exhausted).
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___

# Final 16 rounds of the current block, interleaved with the byte swap and
# X[i]+K[i] precomputation for the block just loaded.
foreach my $pass (1..4) {
	&Xpreload(\&body_00_15);
}

# Accumulate into the context, then either loop for the next block or
# restore the original stack pointer and return.
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
581}}}
582######################################################################
583# ARMv8 stuff
584#
585{{{
# Register aliases for the ARMv8 Crypto Extensions path.
my ($ABCD,$EFGH,$abcd)=("q0","q1","q2");
my @MSG=("q8","q9","q10","q11");
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=("q12","q13","q14","q15");
my $Ktbl="r3";

# Entry: load the state, locate K256, then per block load and byte-swap
# the four message vectors and save the state for the final addition.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___

# Twelve four-round groups that also extend the message schedule.  After
# each group the two K256 buffers swap roles and @MSG rotates by one.
$i=0;
while ($i<12) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	my $spare=$W0; $W0=$W1; $W1=$spare;
	@MSG=(@MSG[1..3],$MSG[0]);
	$i++;
}

# Last four groups need no further schedule words; then add the saved
# state, loop while input remains, and store the result.
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
672___
673}}}
# Identification string and, outside the kernel, the capability word.
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

# Copy this script's leading comment block to the output as assembler
# comments ("#" becomes "@"), skipping the #! line and stopping at the
# first line that is neither a comment nor blank.  Three-argument open
# keeps whitespace or mode characters in the script path from being
# interpreted.  The copy is best effort: if the script cannot be reread,
# only a warning is issued.
if (open(my $self,'<',$0)) {
	while (defined(my $line=<$self>)) {
		next if ($line =~ /^#!/);
		my $is_comment = ($line =~ s/^#/@/);
		last unless ($is_comment or $line =~ /^$/);
		print $line;
	}
	close($self);
} else {
	warn "can't reread $0 to copy its header comments: $!\n";
}
689
{   # Base encodings of the SHA-256 instructions, without register fields.
    my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # Translate one SHA-256 instruction into an INST(...) byte sequence for
    # assemblers that do not know the mnemonics.
    #
    #   $mnemonic   one of the keys of %opcode
    #   $arg        operand text: "qD,qM" or "qD,qN,qM"
    #
    # Returns the INST(...) line with the original instruction appended as
    # a comment.  When the mnemonic is unknown or the operands cannot be
    # parsed, the instruction text is returned unchanged, so the assembler
    # gets to see (and reject) it instead of the line silently vanishing.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if (exists($opcode{$mnemonic}) &&
	    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/) {
	    # the middle register is absent in the two-operand form
	    my ($vd,$vn,$vm) = ($1, (defined($2) ? $2 : 0), $3);
	    my $word = $opcode{$mnemonic}|(($vd&7)<<13)|(($vd&8)<<19)
					 |(($vn&7)<<17)|(($vn&8)<<4)
					 |(($vm&7)<<1) |(($vm&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}
711
# Post-process the generated text line by line and write it out.
foreach my $line (split($/,$code)) {

	# evaluate the arithmetic left between backticks by the templates
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	# hand-encode SHA-256 instructions for assemblers lacking them
	$line =~ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;

	$line =~ s/\bret\b/bx	lr/g		or
	$line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $line,"\n";
}

# Enforce flush, and report buffered write errors (e.g. a full disk) rather
# than leaving a truncated file behind with a successful exit status.
close STDOUT or die "error closing STDOUT: $!";
725