#! /usr/bin/env perl
# Copyright 2016-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# April 2019
#
# Replace 3xNEON+1xIALU code path with 4+1. 4+1 is actually the fastest
# option on most(*), but not all, processors, yet 6+2 is retained.
# This is because the penalties are considered tolerable in comparison
# to the improvement on processors where 6+2 helps, most notably +37%
# on ThunderX2. That is a server-oriented processor which has to serve
# as many requests as possible, while the others are mostly client
# processors, where performance doesn't have to be absolute top-notch,
# just fast enough, as the majority of time is spent "entertaining" a
# relatively slow human.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9	4xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	2.72		1.60
# Cortex-A53		8.40/+80%	4.06		4.45(*)
# Cortex-A57		8.06/+43%	4.15		4.40(*)
# Denver		4.50/+82%	2.30		2.70(*)
# X-Gene		9.50/+46%	8.20		8.90(*)
# Mongoose		8.00/+44%	2.74		3.12(*)
# Kryo			8.17/+50%	4.47		4.65(*)
# ThunderX2		7.22/+48%	5.64		4.10
#
# (*) slower than 4+1 :-(

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
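
# A hedged usage sketch (the file name below is illustrative, not taken from
# the build system): the generator is normally driven with a perlasm
# "flavour" plus an optional output path, and the flavour string is simply
# forwarded to arm-xlate.pl, e.g.
#
#	perl chacha-armv8.pl linux64 chacha-armv8.S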

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	"&ror_32	(@x[$d1],@x[$d1],16)",
	"&ror_32	(@x[$d2],@x[$d2],16)",
	"&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	"&ror_32	(@x[$b1],@x[$b1],20)",
	"&ror_32	(@x[$b2],@x[$b2],20)",
	"&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	"&ror_32	(@x[$d1],@x[$d1],24)",
	"&ror_32	(@x[$d2],@x[$d2],24)",
	"&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	"&ror_32	(@x[$b1],@x[$b1],25)",
	"&ror_32	(@x[$b2],@x[$b2],25)",
	"&ror_32	(@x[$b3],@x[$b3],25)"
    );
}
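
# For orientation, a minimal standalone sketch of the single ChaCha20
# quarter-round (RFC 8439 style, rotate-left by 16/12/8/7) that ROUND above
# emits four interleaved copies of; the scalar code uses the equivalent
# rotate-right amounts 16/20/24/25. The helper name quarter_round and the
# plain-Perl rotation below are illustrative only and are not used by this
# generator.
sub quarter_round {
    my ($x, $a, $b, $c, $d) = @_;	# $x is a reference to the 16-word state
    my $rotl = sub { my ($v, $n) = @_;
		     (($v << $n) | ($v >> (32 - $n))) & 0xffffffff };
    $x->[$a] = ($x->[$a] + $x->[$b]) & 0xffffffff;
    $x->[$d] = $rotl->($x->[$d] ^ $x->[$a], 16);
    $x->[$c] = ($x->[$c] + $x->[$d]) & 0xffffffff;
    $x->[$b] = $rotl->($x->[$b] ^ $x->[$c], 12);
    $x->[$a] = ($x->[$a] + $x->[$b]) & 0xffffffff;
    $x->[$d] = $rotl->($x->[$d] ^ $x->[$a], 8);
    $x->[$c] = ($x->[$c] + $x->[$d]) & 0xffffffff;
    $x->[$b] = $rotl->($x->[$b] ^ $x->[$c], 7);
}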

$code.=<<___;
#include "arm_arch.h"
#ifndef	__KERNEL__
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.extern	ChaCha20_ctr32_sve
#endif

.rodata

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,2,3,4
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"

.text

.globl	ChaCha20_ctr32_dflt
.type	ChaCha20_ctr32_dflt,%function
.align	5
ChaCha20_ctr32_dflt:
	AARCH64_SIGN_LINK_REGISTER
	cmp	$len,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
.Lcheck_neon:
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__AARCH64EB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_SIGN_LINK_REGISTER
	cbz	$len,.Labort
	cmp	$len,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV8_SVE
	b.eq	.Lcheck_neon
	stp	x29,x30,[sp,#-16]!
	sub	sp,sp,#16
	// SVE handling will inevitably increment the counter.  The
	// NEON/scalar code that follows to process the tail data needs
	// to use the new counter value; unfortunately, the input counter
	// buffer pointed to by ctr is meant to be read-only per the API
	// contract, so we copy it to the stack to make it writable by SVE.
	ldp	x5,x6,[$ctr]
	stp	x5,x6,[sp]
	mov	$ctr,sp
	bl	ChaCha20_ctr32_sve
	cbz	$len,1f
	bl	ChaCha20_ctr32_dflt
1:
	add	sp,sp,#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
#endif
	b	.Lshort
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my @K = map("v$_.4s",(0..3));
my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9));
my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X;

sub NEON_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("'$_'",@X);

    (
	"&add		(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&add		(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&add		(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&add		(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&eor		(@x[$d0],@x[$d0],@x[$a0])",
	"&eor		(@x[$d1],@x[$d1],@x[$a1])",
	"&eor		(@x[$d2],@x[$d2],@x[$a2])",
	"&eor		(@x[$d3],@x[$d3],@x[$a3])",
	"&rev32_16	(@x[$d0],@x[$d0])",
	"&rev32_16	(@x[$d1],@x[$d1])",
	"&rev32_16	(@x[$d2],@x[$d2])",
	"&rev32_16	(@x[$d3],@x[$d3])",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	"&add		(@x[$c1],@x[$c1],@x[$d1])",
	"&add		(@x[$c2],@x[$c2],@x[$d2])",
	"&add		(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	"&eor		('$xt1',@x[$b1],@x[$c1])",
	"&eor		('$xt2',@x[$b2],@x[$c2])",
	"&eor		('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',20)",
	"&ushr		(@x[$b1],'$xt1',20)",
	"&ushr		(@x[$b2],'$xt2',20)",
	"&ushr		(@x[$b3],'$xt3',20)",
	"&sli		(@x[$b0],'$xt0',12)",
	"&sli		(@x[$b1],'$xt1',12)",
	"&sli		(@x[$b2],'$xt2',12)",
	"&sli		(@x[$b3],'$xt3',12)",

	"&add		(@x[$a0],@x[$a0],@x[$b0])",
	"&add		(@x[$a1],@x[$a1],@x[$b1])",
	"&add		(@x[$a2],@x[$a2],@x[$b2])",
	"&add		(@x[$a3],@x[$a3],@x[$b3])",
	"&eor		('$xt0',@x[$d0],@x[$a0])",
	"&eor		('$xt1',@x[$d1],@x[$a1])",
	"&eor		('$xt2',@x[$d2],@x[$a2])",
	"&eor		('$xt3',@x[$d3],@x[$a3])",
	"&tbl		(@x[$d0],'{$xt0}','$ROT24')",
	"&tbl		(@x[$d1],'{$xt1}','$ROT24')",
	"&tbl		(@x[$d2],'{$xt2}','$ROT24')",
	"&tbl		(@x[$d3],'{$xt3}','$ROT24')",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	"&add		(@x[$c1],@x[$c1],@x[$d1])",
	"&add		(@x[$c2],@x[$c2],@x[$d2])",
	"&add		(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	"&eor		('$xt1',@x[$b1],@x[$c1])",
	"&eor		('$xt2',@x[$b2],@x[$c2])",
	"&eor		('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',25)",
	"&ushr		(@x[$b1],'$xt1',25)",
	"&ushr		(@x[$b2],'$xt2',25)",
	"&ushr		(@x[$b3],'$xt3',25)",
	"&sli		(@x[$b0],'$xt0',7)",
	"&sli		(@x[$b1],'$xt1',7)",
	"&sli		(@x[$b2],'$xt2',7)",
	"&sli		(@x[$b3],'$xt3',7)"
    );
}

$code.=<<___;

#ifdef	__KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
.LChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	stp	d8,d9,[sp]			// meet ABI requirements
	ld1	{$CTR,$ROT24},[@x[0]]
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer_neon:
	dup	$xa0,@{K[0]}[0]			// unpack key block
	mov.32	@x[0],@d[0]
	dup	$xa1,@{K[0]}[1]
	lsr	@x[1],@d[0],#32
	dup	$xa2,@{K[0]}[2]
	mov.32	@x[2],@d[1]
	dup	$xa3,@{K[0]}[3]
	lsr	@x[3],@d[1],#32
	dup	$xb0,@{K[1]}[0]
	mov.32	@x[4],@d[2]
	dup	$xb1,@{K[1]}[1]
	lsr	@x[5],@d[2],#32
	dup	$xb2,@{K[1]}[2]
	mov.32	@x[6],@d[3]
	dup	$xb3,@{K[1]}[3]
	lsr	@x[7],@d[3],#32
	dup	$xd0,@{K[3]}[0]
	mov.32	@x[8],@d[4]
	dup	$xd1,@{K[3]}[1]
	lsr	@x[9],@d[4],#32
	dup	$xd2,@{K[3]}[2]
	mov.32	@x[10],@d[5]
	dup	$xd3,@{K[3]}[3]
	lsr	@x[11],@d[5],#32
	add	$xd0,$xd0,$CTR
	mov.32	@x[12],@d[6]
	dup	$xc0,@{K[2]}[0]
	lsr	@x[13],@d[6],#32
	dup	$xc1,@{K[2]}[1]
	mov.32	@x[14],@d[7]
	dup	$xc2,@{K[2]}[2]
	lsr	@x[15],@d[7],#32
	dup	$xc3,@{K[2]}[3]

	mov	$ctr,#10
	subs	$len,$len,#320
.Loop_neon:
	sub	$ctr,$ctr,#1
___
	my @plus_one=&ROUND(0,4,8,12);
	foreach (&NEON_lane_ROUND(0,4,8,12)) { eval; eval(shift(@plus_one)); }

	@plus_one=&ROUND(0,5,10,15);
	foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); }
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add	$xd0,$xd0,$CTR

	zip1	$xt0,$xa0,$xa1			// transpose data
	zip1	$xt1,$xa2,$xa3
	zip2	$xt2,$xa0,$xa1
	zip2	$xt3,$xa2,$xa3
	zip1.64	$xa0,$xt0,$xt1
	zip2.64	$xa1,$xt0,$xt1
	zip1.64	$xa2,$xt2,$xt3
	zip2.64	$xa3,$xt2,$xt3

	zip1	$xt0,$xb0,$xb1
	zip1	$xt1,$xb2,$xb3
	zip2	$xt2,$xb0,$xb1
	zip2	$xt3,$xb2,$xb3
	zip1.64	$xb0,$xt0,$xt1
	zip2.64	$xb1,$xt0,$xt1
	zip1.64	$xb2,$xt2,$xt3
	zip2.64	$xb3,$xt2,$xt3

	zip1	$xt0,$xc0,$xc1
	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	zip1	$xt1,$xc2,$xc3
	add	@x[1],@x[1],@d[0],lsr#32
	zip2	$xt2,$xc0,$xc1
	add.32	@x[2],@x[2],@d[1]
	zip2	$xt3,$xc2,$xc3
	add	@x[3],@x[3],@d[1],lsr#32
	zip1.64	$xc0,$xt0,$xt1
	add.32	@x[4],@x[4],@d[2]
	zip2.64	$xc1,$xt0,$xt1
	add	@x[5],@x[5],@d[2],lsr#32
	zip1.64	$xc2,$xt2,$xt3
	add.32	@x[6],@x[6],@d[3]
	zip2.64	$xc3,$xt2,$xt3
	add	@x[7],@x[7],@d[3],lsr#32

	zip1	$xt0,$xd0,$xd1
	add.32	@x[8],@x[8],@d[4]
	zip1	$xt1,$xd2,$xd3
	add	@x[9],@x[9],@d[4],lsr#32
	zip2	$xt2,$xd0,$xd1
	add.32	@x[10],@x[10],@d[5]
	zip2	$xt3,$xd2,$xd3
	add	@x[11],@x[11],@d[5],lsr#32
	zip1.64	$xd0,$xt0,$xt1
	add.32	@x[12],@x[12],@d[6]
	zip2.64	$xd1,$xt0,$xt1
	add	@x[13],@x[13],@d[6],lsr#32
	zip1.64	$xd2,$xt2,$xt3
	add.32	@x[14],@x[14],@d[7]
	zip2.64	$xd3,$xt2,$xt3
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$xa0,$xa0,@K[0]			// accumulate key block
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	$xb0,$xb0,@K[1]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	$xc0,$xc0,@K[2]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$xd0,$xd0,@K[3]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	add	$xa1,$xa1,@K[0]
	eor	@x[2],@x[2],@x[3]
	add	$xb1,$xb1,@K[1]
	eor	@x[4],@x[4],@x[5]
	add	$xc1,$xc1,@K[2]
	eor	@x[6],@x[6],@x[7]
	add	$xd1,$xd1,@K[3]
	eor	@x[8],@x[8],@x[9]
	eor	$xa0,$xa0,$xt0
	movi	$xt0,#5
	eor	@x[10],@x[10],@x[11]
	eor	$xb0,$xb0,$xt1
	eor	@x[12],@x[12],@x[13]
	eor	$xc0,$xc0,$xt2
	eor	@x[14],@x[14],@x[15]
	eor	$xd0,$xd0,$xt3
	add	$CTR,$CTR,$xt0			// += 5
	ld1.8	{$xt0-$xt3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#5			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	st1.8	{$xa0-$xd0},[$out],#64
	add	$xa2,$xa2,@K[0]
	add	$xb2,$xb2,@K[1]
	add	$xc2,$xc2,@K[2]
	add	$xd2,$xd2,@K[3]
	ld1.8	{$xa0-$xd0},[$inp],#64

	eor	$xa1,$xa1,$xt0
	eor	$xb1,$xb1,$xt1
	eor	$xc1,$xc1,$xt2
	eor	$xd1,$xd1,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	add	$xa3,$xa3,@K[0]
	add	$xb3,$xb3,@K[1]
	add	$xc3,$xc3,@K[2]
	add	$xd3,$xd3,@K[3]
	ld1.8	{$xa1-$xd1},[$inp],#64

	eor	$xa2,$xa2,$xa0
	eor	$xb2,$xb2,$xb0
	eor	$xc2,$xc2,$xc0
	eor	$xd2,$xd2,$xd0
	st1.8	{$xa2-$xd2},[$out],#64

	eor	$xa3,$xa3,$xa1
	eor	$xb3,$xb3,$xb1
	eor	$xc3,$xc3,$xc1
	eor	$xd3,$xd3,$xd1
	st1.8	{$xa3-$xd3},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	d8,d9,[sp]			// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

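// .Ltail_neon below handles what is left once fewer than 320 bytes remain:
// whole 64-byte blocks are XORed with the keystream blocks already computed
// above, the final sub-64-byte piece is staged on the stack and XORed byte
// by byte in .Loop_tail_neon, and the stack copy of the keystream is wiped
// afterwards.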
.align	4
.Ltail_neon:
	add	$len,$len,#320
	ldp	d8,d9,[sp]			// meet ABI requirements
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	$xa0,$xa0,@K[0]			// accumulate key block
	stp	@x[4],@x[6],[$out,#16]
	add	$xb0,$xb0,@K[1]
	stp	@x[8],@x[10],[$out,#32]
	add	$xc0,$xc0,@K[2]
	stp	@x[12],@x[14],[$out,#48]
	add	$xd0,$xd0,@K[3]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa0,$xa0,$xt0
	eor	$xb0,$xb0,$xt1
	eor	$xc0,$xc0,$xt2
	eor	$xd0,$xd0,$xt3
	st1.8	{$xa0-$xd0},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa1,@K[0]
	add	$xb0,$xb1,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc1,@K[2]
	cmp	$len,#64
	add	$xd0,$xd1,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa1,$xa0,$xt0
	eor	$xb1,$xb0,$xt1
	eor	$xc1,$xc0,$xt2
	eor	$xd1,$xd0,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa2,@K[0]
	add	$xb0,$xb2,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc2,@K[2]
	cmp	$len,#64
	add	$xd0,$xd2,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa2,$xa0,$xt0
	eor	$xb2,$xb0,$xt1
	eor	$xc2,$xc0,$xt2
	eor	$xd2,$xd0,$xt3
	st1.8	{$xa2-$xd2},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa3,@K[0]
	add	$xb0,$xb3,@K[1]
	add	$xc0,$xc3,@K[2]
	add	$xd0,$xd3,@K[3]
	sub	$len,$len,#64

.Last_neon:
	st1.8	{$xa0-$xd0},[sp]

	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my @K = map("v$_.4s",(0..6));
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31));
my $rot24 = @K[6];
my $ONE = "v7.4s";

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&tbl		('$d','{$d}','$rot24')",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	eor	$ONE,$ONE,$ONE
	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE}[0],[@x[0]]
	add	$key,@x[0],#16			// .Lrot24
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	mov	$C5,@K[2]
	stp	@K[5],@K[6],[sp,#80]

	mov	$ctr,#5
	ld1	{$rot24},[$key]
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	ldr	@K[6],[sp,#96]
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]

#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$ONE,$ONE,#1			// 4 -> 2

	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],@K[0],[sp,#0]		// wipe off-load area
	stp	@K[0],@K[0],[sp,#32]
	stp	@K[0],@K[0],[sp,#64]

	b.eq	.Ldone_512_neon

	sub	$key,$key,#16			// .Lone
	cmp	$len,#192
	add	sp,sp,#128
	sub	@K[3],@K[3],$ONE		// -= 2
	ld1	{$CTR,$ROT24},[$key]
	b.hs	.Loop_outer_neon

	ldp	d8,d9,[sp,#0]			// meet ABI requirements
	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1))	or
	(s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# flush
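
# For reference, illustrative before/after pairs for the mnemonic translation
# performed by the loop above (register numbers follow this file's own
# assignments, but the exact lines are examples rather than captured output):
#
#	add.32	x5,x5,x9		->	add	w5,w5,w9
#	eor	v19.4s,v19.4s,v16.4s	->	eor	v19.16b,v19.16b,v16.16b
#	ld1.8	{v4.4s-v7.4s},[x1],#64	->	ld1	{v4.16b-v7.16b},[x1],#64
#	ldp	v0.4s,v1.4s,[sp,#0]	->	ldp	q0,q1,[sp,#0]
#	dup	v16.4s,v0.4s[0]		->	dup	v16.4s,v0.s[0]
#	zip1.64	v16.4s,v4.4s,v5.4s	->	zip1	v16.2d,v4.2d,v5.2d
#	rev32.16 v19.4s,v19.4s		->	rev32	v19.8h,v19.8h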