#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# NOTE: keep the statement above as the first non-comment, non-blank line
# of this file.  The header-copy loop near the end re-reads this script and
# turns every leading '#' line into an assembler '@' comment until it hits
# the first line that is neither blank nor a comment; any comment inserted
# above the 'while' would therefore leak into the generated assembly.

# The loop above consumes arguments until one looks like "name.ext" and
# leaves it in $output.  Only redirect STDOUT when such a name was found:
# with no usable argument the script keeps writing to the STDOUT it was
# started with (e.g. "perl sha256-armv4.pl > out.S").  A failure to create
# an explicitly named output file is fatal instead of being ignored.
if (defined($output) && length($output)) {
	open(STDOUT, '>', $output) or die "can't open $output for writing: $!";
}

# Symbolic register names for the integer-only code path.  Note that
# several names share one register: $ctx/$t0 (r0), $inp/$t4 (r1),
# $len/$t1 (r2) and $T1/$t3 (r3).
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# Rotate/shift amounts plugged into the ror#/lsr# (and NEON vshr/vsli)
# operands of the generated Sigma0, Sigma1, sigma0 and sigma1 sequences.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for round $i to $code.  $a..$h are the register
# names holding the working variables for this round.  For $i<16 the
# first template also emits the code that fetches the input word (a
# word load plus 'rev' on ARMv7 unless __ARMEB__, byte loads otherwise).
# The backtick-quoted arithmetic in the templates is left in the text
# and evaluated by the post-processing loop at the end of the file.
# $t2 and $t3 are exchanged on return, so consecutive rounds alternate
# between the two registers.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the message-schedule computation for round $i (sigma0/sigma1
# of words kept in the 16-word stack buffer) to $code and then hands the
# unchanged argument list to BODY_00_15 for the round itself.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

# Integer-only code path: prologue, the K256 constant table (followed by
# a zero word used as terminator) and the entry point up to .Loop.
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
# define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Emit rounds 0..15 unrolled, then rounds 16..31 as the body of the
# .Lrounds_16_xx loop.  @V is rotated by one position after every round
# so that each round sees the working variables under their new roles.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

# The helpers below are declared with an empty prototype, so they have
# to be invoked with the '&' sigil (which bypasses prototype checking)
# whenever arguments are passed.

# Dlo/Dhi map a quad register name "qN" to "d<2N>" / "d<2N+1>"; anything
# that does not contain a q-register name yields the empty string.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

# Any call to an otherwise undefined sub, e.g. &vext_8(...), lands here
# and is turned into one instruction line appended to $code: the first
# '_' of the sub name becomes '.', and a purely numeric last argument is
# prefixed with '#' to form an immediate operand.
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Xupdate(\&body)
#
# Emits the NEON message-schedule update for X[0..3] interleaved with
# scalar round code.  $body is called four times; each call returns a
# list of Perl snippets which are eval'ed one at a time between the NEON
# instructions.  $a..$h are declared here so that the snippets, which
# assign and use them, bind to these lexicals.  @X is rotated on exit.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

# Xpreload(\&body)
#
# Same interleaving scheme as Xupdate, but instead of extending the
# message schedule it byte-reverses the freshly loaded X[0], adds the
# next four K256 words and stores the sum through $Xfer, while the
# snippets returned by four calls to $body are eval'ed in between.
# @X is rotated on exit.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

# body_00_15()
#
# Returns the scalar code of one round as a list of Perl snippets
# (strings).  Each snippet, once eval'ed by Xupdate/Xpreload, emits its
# instruction(s) through the AUTOLOAD thunk.  The first snippet loads
# $a..$h from the current @V; the last one advances the round counter
# $j, rotates @V and exchanges $t2/$t3 for the next round.
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add ($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor ($t1,$f,$g)',
	'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add ($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and ($t1,$t1,$e)',
	'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor ($t1,$t1,$g)',			# Ch(e,f,g)
	'&add ($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor ($t2,$a,$b)',			# a^b, b^c in next round
	'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add ($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
	'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
	'&ldr ($t1,"[sp,#64]") if ($j==31)',
	'&and ($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add ($d,$d,$h)',			# d+=h
	'&add ($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor ($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adrl	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
# This lexical shadows the file-level $Ktbl (r14) inside this block only.
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
# define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
# define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# Twelve identical groups that also extend the message schedule; $W0/$W1
# are exchanged and @MSG is rotated after each group.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Copy this script's leading comment block into the output, rewriting the
# leading '#' of each line to the assembler comment character '@'.  The
# shebang line is skipped and copying stops at the first line that is
# neither a comment nor blank.  The three-argument open treats $0 as a
# plain file name; if the script cannot be re-read only the header is
# lost, so that case is reported but is not fatal.
if (open(my $self, '<', $0)) {
	while(<$self>) {
		next if (/^#!/);
		last if (!s/^#/@/ and !/^$/);
		print;
	}
	close $self;
} else {
	warn "$0: can't re-read script to copy header comments: $!\n";
}

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # unsha256($mnemonic,$arg)
    #
    # Encodes one sha256* instruction with two or three q-register
    # operands ($arg, e.g. "q0,q1,q12") by OR-ing the register numbers
    # into the base opcode from %opcode, and returns it as an INST(...)
    # macro invocation listing the four bytes least-significant first,
    # followed by the original instruction as a comment.  The middle
    # operand is optional in the pattern; when absent it contributes 0.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

# Post-process the accumulated assembly line by line:
#  - evaluate every `...` expression left in the templates,
#  - replace sha256* instructions by their INST() byte encoding,
#  - turn "ret" into "bx lr"; on lines without "ret", turn "bx lr" into
#    its literal instruction word.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Buffered write errors (e.g. a full disk) are only reported when the
# handle is closed, so a failing close must not go unnoticed.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush