#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte [on
# single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but that is 50% faster than its
# integer-only code (meaning the latter performs sub-optimally; nothing
# was done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
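
# Typical invocation, assuming the usual 32-bit perlasm flavours
# ("linux32", "ios32", "win32"):
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S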

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
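
# The arrays above hold the rotate/shift amounts of the FIPS 180-4
# SHA-256 functions; the capital-Sigma ones are pure rotates, the
# small-sigma ones end with a logical shift right:
#
#	Sigma0(x) = ROTR(x,2)  ^ ROTR(x,13) ^ ROTR(x,22)
#	Sigma1(x) = ROTR(x,6)  ^ ROTR(x,11) ^ ROTR(x,25)
#	sigma0(x) = ROTR(x,7)  ^ ROTR(x,18) ^ (x >> 3)
#	sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)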

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
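
# Note the deferred "h+=Maj(a,b,c)": each round leaves Maj in a scratch
# register and the *next* round folds it into what is by then $a, which
# takes the addition off the critical path. The ($t2,$t3) swap above is
# what makes the a^b and (b^c)&(a^b) values alternate between the two
# scratch registers from one round to the next.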

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.text

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
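
# Dlo/Dhi map a quad register to its two double-register halves: for
# example Dlo("q1") returns "d2" and Dhi("q1") returns "d3".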

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
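
# AUTOLOAD catches the otherwise-undefined NEON "subs" used below: the
# first underscore in the name becomes a dot and a numeric last argument
# gets a "#" prefix. E.g. &vshr_u32($T2,$T0,7), with $T2="q10" and
# $T0="q8", emits "vshr.u32	q10,q8,#7".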

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
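
# Each Xupdate call produces four new message-schedule words,
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16],
# pre-adds the next four K256 constants, and stores the sums at [$Xfer]
# for the scalar rounds that are interleaved via eval(shift(@insns)).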

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
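
# body_00_15 returns the scalar round as a list of quoted snippets;
# Xupdate and Xpreload eval() them a couple at a time between NEON
# instructions, keeping the integer and NEON pipelines busy
# simultaneously.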

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	$_byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
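
# The loop above copies this script's own licence header and commentary
# into the output, rewriting '#' comments with the assembler's '@'
# comment leader and stopping at the first line that is neither a
# comment nor blank.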

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so the
	    # bytes can be emitted directly. The correct solution is to use
	    # the .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
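
# For example, with the arithmetic above "sha256h q0,q1,q12" encodes to
# the word 0xf3020c68 and is emitted as INST(0x68,0x0c,0x02,0xf3), i.e.
# with the bytes in little-endian order.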

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
743