#! /usr/bin/env perl
# Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# This script generates AArch64 assembly: it builds the text in $code and
# pipes it through the arm-xlate.pl translator on STDOUT.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl either next to this script or in the shared perlasm
# directory, and route all of our output through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Any otherwise-undefined sub call becomes an assembler mnemonic appended
# to $code: underscores in the name become dots, and a purely numeric
# final argument is given a '#' immediate prefix.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

$prefix="chacha_sve";
# Function arguments: output ptr, input ptr, length, key ptr, counter ptr.
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen) = ("x5");		# 32-bit lanes per SVE vector (from cntw)
my ($counter) = ("x6");		# scratch / round counter
my ($counter_w) = ("w6");
# w7-w22 (x7-x22) carry a scalar copy of the 16-word ChaCha state; the
# ".if mixin == 1" paths update it so one extra 64-byte block is
# processed in general-purpose registers alongside the vector lanes.
my @xx=(7..22);
my @sxx=map("x$_",@xx);
my @sx=map("w$_",@xx);
# x23-x30: the input state as eight 64-bit words (two state words each);
# @KL are the corresponding low 32-bit views.
my @K=map("x$_",(23..30));
# State word i lives in z-register @elem[i] (column-first layout).
my @elem=(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
my @KL=map("w$_",(23..30));
my @mx=map("z$_",@elem);
my @vx=map("v$_",@elem);
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");		# per-lane block counter vector
my @tt=(17..24);
my @xt=map("z$_",@tt);		# scratch z-registers
my @vt=map("v$_",@tt);
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");		# tbl permutation used for rotate-left-by-8
# Backup copy of the input state for the final accumulation.  Index 12 is
# the counter vector $zctr; index 15 aliases $rot8, so it is not a usable
# backup slot on plain SVE (see SVE_ACCUM_STATES).
my @bak=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],@xt[4],@xt[5],@xt[6],@xt[7],@xt[0],@xt[1],$zctr,@xt[2],@xt[3],$rot8);
my
$debug_encoder=0;	# when 1, cross-check hand-encoded .inst words (see create_verifier)

# Each helper below appends one instruction per (x,y) index pair to $code
# and recurses over the remaining pairs.  The ".if mixin == 1" blocks emit
# the matching scalar-register operation for the extra 64-byte block.

sub SVE_ADD() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	add	@mx[$x].s,@mx[$x].s,@mx[$y].s
	.if mixin == 1
		add	@sx[$x],@sx[$x],@sx[$y]
	.endif
___
	if (@_) {
		&SVE_ADD(@_);
	}
}

sub SVE_EOR() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
	.if mixin == 1
		eor	@sx[$x],@sx[$x],@sx[$y]
	.endif
___
	if (@_) {
		&SVE_EOR(@_);
	}
}

# Shift-left into scratch registers @xt[$x..]; combined with SVE_LSR and
# SVE_ORR this synthesizes a rotate on SVE1, which has no vector rotate.
sub SVE_LSL() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	lsl	@xt[$x].s,@mx[$y].s,$bits
___
	if (@_) {
		&SVE_LSL($bits,$next,@_);
	}
}

sub SVE_LSR() {
	my $bits = shift;
	my $x = shift;

$code.=<<___;
	lsr	@mx[$x].s,@mx[$x].s,$bits
	.if mixin == 1
		ror	@sx[$x],@sx[$x],$bits
	.endif
___
	if (@_) {
		&SVE_LSR($bits,@_);
	}
}

sub SVE_ORR() {
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
	if (@_) {
		&SVE_ORR($next,@_);
	}
}

# Rotate by 16 implemented as a halfword byte-reverse within each 32-bit
# element (revh); scalar side uses a plain ror #16.
sub SVE_REV16() {
	my $x = shift;

$code.=<<___;
	revh	@mx[$x].s,p0/m,@mx[$x].s
	.if mixin == 1
		ror	@sx[$x],@sx[$x],#16
	.endif
___
	if (@_) {
		&SVE_REV16(@_);
	}
}

# Rotate left by 8 via a byte table-lookup with the $rot8 permutation;
# scalar side rotates right by 24 (equivalent).
sub SVE_ROT8() {
	my $x = shift;

$code.=<<___;
	tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
	.if mixin == 1
		ror	@sx[$x],@sx[$x],#24
	.endif
___
	if (@_) {
		&SVE_ROT8(@_);
	}
}

# SVE2 fused XOR-and-rotate.  xar rotates right, so a ChaCha rotate-left
# by $bits is emitted as xar with 32-$bits.
sub SVE2_XAR() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $rbits = 32-$bits;

$code.=<<___;
	.if mixin == 1
		eor	@sx[$x],@sx[$x],@sx[$y]
	.endif
	xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
	.if mixin == 1
		ror	@sx[$x],@sx[$x],$rbits
	.endif
___
	if (@_) {
		&SVE2_XAR($bits,@_);
	}
}

# Four interleaved ChaCha quarter-rounds using SVE2 xar.
sub SVE2_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
}

# Four interleaved ChaCha quarter-rounds for baseline SVE: the 16-bit
# rotate uses revh, the 8-bit rotate uses tbl, and the 12/7-bit rotates
# are built from lsl/lsr/orr through the @xt scratch registers.
sub SVE_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_REV16($d0,$d1,$d2,$d3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(12,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(20,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_ROT8($d0,$d1,$d2,$d3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(7,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(25,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);
}

# 20 ChaCha rounds: 10 iterations of a column round followed by a
# diagonal round (local label 10:).
sub SVE_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}

# Same round structure, but using the SVE2 xar-based quarter-round.
sub SVE2_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE2_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE2_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}

# Load consecutive whole vectors from $inp (offset counted in vector
# lengths); after the last register, advance $inp by the total.
sub load_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
	ld1w	{$reg.s},p0/z,[$inp,#$offset,MUL VL]
___
	if (@_) {
		&load_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$inp,$inp,$next_offset
___
	}
}

sub load() {
	if (@_) {
		&load_regs(0, @_);
	}
}

# Store consecutive whole vectors to $outp; mirror image of load_regs.
sub store_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
	st1w	{$reg.s},p0,[$outp,#$offset,MUL VL]
___
	if (@_) {
		&store_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$outp,$outp,$next_offset
___
	}
}

sub store() {
	if (@_) {
		&store_regs(0, @_);
	}
}

# Transpose two 4x4 matrices of 32-bit elements, one held in
# ($xa,$xb,$xc,$xd) and one in ($xa1,$xb1,$xc1,$xd1), using zip1/zip2 at
# .s then .d granularity; @xt[0..7] are clobbered as scratch.
sub transpose() {
	my $xa = shift;
	my $xb = shift;
	my $xc = shift;
	my $xd = shift;
	my $xa1 = shift;
	my $xb1 = shift;
	my $xc1 = shift;
	my $xd1 = shift;
$code.=<<___;
	zip1	@xt[0].s,$xa.s,$xb.s
	zip2	@xt[1].s,$xa.s,$xb.s
	zip1	@xt[2].s,$xc.s,$xd.s
	zip2	@xt[3].s,$xc.s,$xd.s

	zip1	@xt[4].s,$xa1.s,$xb1.s
	zip2	@xt[5].s,$xa1.s,$xb1.s
	zip1	@xt[6].s,$xc1.s,$xd1.s
	zip2	@xt[7].s,$xc1.s,$xd1.s

	zip1	$xa.d,@xt[0].d,@xt[2].d
	zip2	$xb.d,@xt[0].d,@xt[2].d
	zip1	$xc.d,@xt[1].d,@xt[3].d
	zip2	$xd.d,@xt[1].d,@xt[3].d

	zip1	$xa1.d,@xt[4].d,@xt[6].d
	zip2	$xb1.d,@xt[4].d,@xt[6].d
	zip1	$xc1.d,@xt[5].d,@xt[7].d
	zip2	$xd1.d,@xt[5].d,@xt[7].d
___
}

# Add the saved input state (from @bak, or explicit override registers)
# back onto state words idx0 and idx0+1 after the rounds; the mixin path
# also folds the two 32-bit scalar halves back into one packed 64-bit
# register.
sub ACCUM() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;
	my ($tmp,$tmpw) = ($counter,$counter_w);
	my $bk0 = @_ ? shift : @bak[$idx0];
	my $bk1 = @_ ?
shift : @bak[$idx1];

$code.=<<___;
	.if mixin == 1
		add	@sx[$idx0],@sx[$idx0],@KL[$d]
	.endif
	add	@mx[$idx0].s,@mx[$idx0].s,$bk0.s
	.if mixin == 1
		add	@sxx[$idx1],@sxx[$idx1],@K[$d],lsr #32
	.endif
	add	@mx[$idx1].s,@mx[$idx1].s,$bk1.s
	.if mixin == 1
		add	@sxx[$idx0],@sxx[$idx0],$sxx[$idx1],lsl #32	// pack
	.endif
___
}

# In the mixin path, fetch 16 bytes of input for the scalar block
# (state words idx0 and idx0+2, matching the packed 64-bit layout).
sub SCA_INP() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 2;
$code.=<<___;
	.if mixin == 1
		ldp	@sxx[$idx0],@sxx[$idx1],[$inp],#16
	.endif
___
}

# Accumulate input state into all 16 state words after the rounds.  On
# plain SVE the @bak slots 10,11,13,14 were clobbered (and slot 15 is
# $rot8), so those words are re-broadcast from @K first; word 15 uses
# @bak[0] as a stand-in backup register.
sub SVE_ACCUM_STATES() {
	my ($tmp,$tmpw) = ($counter,$counter_w);

$code.=<<___;
	lsr	$tmp,@K[5],#32
	dup	@bak[10].s,@KL[5]
	dup	@bak[11].s,$tmpw
	lsr	$tmp,@K[6],#32
	dup	@bak[13].s,$tmpw
	lsr	$tmp,@K[7],#32
___
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
$code.=<<___;
	dup	@bak[14].s,@KL[7]
	dup	@bak[0].s,$tmpw	// bak[15] not available for SVE
___
	&ACCUM(12);
	&ACCUM(14, @bak[14],@bak[0]);
	&SCA_INP(13);
}

# On SVE2 all needed @bak slots survive the rounds, so no re-broadcast.
sub SVE2_ACCUM_STATES() {
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
	&ACCUM(12);
	&ACCUM(14);
	&SCA_INP(13);
}

# XOR keystream with input for the scalar (mixin) block.
sub SCA_EOR() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
$code.=<<___;
	.if mixin == 1
		eor	@sxx[$idx0],@sxx[$idx0],@sxx[$idx1]
	.endif
___
}

# Store 16 bytes of scalar-block output.
sub SCA_SAVE() {
	my $idx0 = shift;
	my $idx1 = shift;
$code.=<<___;
	.if mixin == 1
		stp	@sxx[$idx0],@sxx[$idx1],[$outp],#16
	.endif
___
}

# Special-cased output path for 128-bit vectors (veclen == 4): the state
# is transposed and then handled with NEON-style ld1/st1 of the v-register
# aliases, interleaved with the scalar mixin load/xor/store.
sub SVE_VL128_TRANSFORMS() {
	&SCA_EOR(0);
	&SCA_EOR(2);
	&SCA_EOR(4);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(6);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
$code.=<<___;
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xb0.d,$xb0.d,@xt[1].d
	eor	$xc0.d,$xc0.d,@xt[2].d
	eor	$xd0.d,$xd0.d,@xt[3].d
	eor	$xa1.d,$xa1.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xc1.d,$xc1.d,@xt[6].d
	eor	$xd1.d,$xd1.d,@xt[7].d
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
___
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xa2.d,$xa2.d,@xt[0].d
	eor	$xb2.d,$xb2.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xd2.d,$xd2.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xa3.d,$xa3.d,@xt[4].d
	eor	$xb3.d,$xb3.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xc3.d,$xc3.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
	st1	{@vx[0].4s-@vx[12].4s},[$outp],#64
	st1	{@vx[1].4s-@vx[13].4s},[$outp],#64
	st1	{@vx[2].4s-@vx[14].4s},[$outp],#64
	st1	{@vx[3].4s-@vx[15].4s},[$outp],#64
___
}

# Final stage of one batch: byte-swap for big-endian, bump the scalar
# block count in the mixin case, then either the 128-bit fast path above
# (label 200f skipped) or the generic transpose/load/xor/store path, and
# finally advance the counter base by the vector lane count (incw).
sub SVE_TRANSFORMS() {
$code.=<<___;
#ifdef	__AARCH64EB__
	rev	@sxx[0],@sxx[0]
	revb	@mx[0].s,p0/m,@mx[0].s
	revb	@mx[1].s,p0/m,@mx[1].s
	rev	@sxx[2],@sxx[2]
	revb	@mx[2].s,p0/m,@mx[2].s
	revb	@mx[3].s,p0/m,@mx[3].s
	rev	@sxx[4],@sxx[4]
	revb	@mx[4].s,p0/m,@mx[4].s
	revb	@mx[5].s,p0/m,@mx[5].s
	rev	@sxx[6],@sxx[6]
	revb	@mx[6].s,p0/m,@mx[6].s
	revb	@mx[7].s,p0/m,@mx[7].s
	rev	@sxx[8],@sxx[8]
	revb	@mx[8].s,p0/m,@mx[8].s
	revb	@mx[9].s,p0/m,@mx[9].s
	rev	@sxx[10],@sxx[10]
	revb	@mx[10].s,p0/m,@mx[10].s
	revb	@mx[11].s,p0/m,@mx[11].s
	rev	@sxx[12],@sxx[12]
	revb	@mx[12].s,p0/m,@mx[12].s
	revb	@mx[13].s,p0/m,@mx[13].s
	rev	@sxx[14],@sxx[14]
	revb	@mx[14].s,p0/m,@mx[14].s
	revb	@mx[15].s,p0/m,@mx[15].s
#endif
	.if mixin == 1
		add	@K[6],@K[6],#1
	.endif
	cmp	$veclen,4
	b.ne	200f
___
	&SVE_VL128_TRANSFORMS();
$code.=<<___;
	b	210f
200:
___
&transpose($xa0,$xb0,$xc0,$xd0,$xa1,$xb1,$xc1,$xd1);
	&SCA_EOR(0);
	&SCA_EOR(2);
	&transpose($xa2,$xb2,$xc2,$xd2,$xa3,$xb3,$xc3,$xd3);
	&SCA_EOR(4);
	&SCA_EOR(6);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
$code.=<<___;
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xa1.d,$xa1.d,@xt[1].d
	eor	$xa2.d,$xa2.d,@xt[2].d
	eor	$xa3.d,$xa3.d,@xt[3].d
	eor	$xb0.d,$xb0.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xb2.d,$xb2.d,@xt[6].d
	eor	$xb3.d,$xb3.d,@xt[7].d
___
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xc0.d,$xc0.d,@xt[0].d
	eor	$xc1.d,$xc1.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xc3.d,$xc3.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xd0.d,$xd0.d,@xt[4].d
	eor	$xd1.d,$xd1.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xd2.d,$xd2.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
___
	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
210:
	incw	@K[6], ALL, MUL #1
___
}

# Broadcast state words idx0/idx0+1 out of the 64-bit input word @K[d]
# into the working vectors AND keep a backup copy in @bak; the mixin
# path also seeds the scalar copy of word idx0.
sub SET_STATE_BAK() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	dup	@bak[$idx0].s,@KL[$d]
	.if mixin == 1
		mov	$x0,@KL[$d]
	.endif
	dup	@mx[$idx1].s,$x1
	dup	@bak[$idx1].s,$x1
___
}

# Same as SET_STATE_BAK but without writing the @bak backup copy (used
# on plain SVE where some @bak slots are needed as scratch).
sub SET_STATE() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	.if mixin == 1
		mov	$x0,@KL[$d]
	.endif
	dup	@mx[$idx1].s,$x1
___
}

# Populate the full 16-word working state for a batch.  Words 12/13 are
# the 64-bit block counter: $zctr and @mx[12] get per-lane incrementing
# values via "index"; in the mixin case the lanes start one block later
# because the scalar block consumes counter value @KL[6].
sub SVE_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE(10);
	&SET_STATE(14);
$code.=<<___;
	.if mixin == 1
		add	@sx[13],@KL[6],#1
		mov	@sx[12],@KL[6]
		index	$zctr.s,@sx[13],1
		index	@mx[12].s,@sx[13],1
	.else
		index	$zctr.s,@KL[6],1
		index	@mx[12].s,@KL[6],1
	.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
___
}

# SVE2 variant: all words can keep a @bak backup (including 10/14 and
# the counter-high word 13).
sub SVE2_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE_BAK(10);
	&SET_STATE_BAK(14);

$code.=<<___;
	.if mixin == 1
		add	@sx[13],@KL[6],#1
		mov	@sx[12],@KL[6]
		index	$zctr.s,@sx[13],1
		index	@mx[12].s,@sx[13],1
	.else
		index	$zctr.s,@KL[6],1
		index	@mx[12].s,@KL[6],1
	.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
	dup	@bak[13].s,@sx[13]
___
}

# Main loop for baseline SVE.  Label 100: one batch of veclen 64-byte
# blocks; while at least one further whole block remains beyond the
# batch, assemble with mixin=1 so a bonus block is done in scalar
# registers.  Label 101: final batch with mixin=0; 110: done.
sub chacha20_sve() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}

# Same loop structure for SVE2 (shares SVE_TRANSFORMS for output).
sub chacha20_sve2() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}


{{{
	my ($tmp,$tmpw) = ("x6", "w6");
	my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
	my ($sve2flag) = ("x7");

# Entry point.  Constants, capability probe (SVE2 via OPENSSL_armcap_P)
# and the register-save prologue; falls back to .Lreturn (caller's
# scalar path) when less than one full vector batch of data remains, or
# on SVE1 with 128-bit vectors where the tbl-based rot8 is not set up.
$code.=<<___;
#include "arm_arch.h"

.arch	armv8-a

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.text

.rodata
.align	5
.type	_${prefix}_consts,%object
_${prefix}_consts:
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lrot8:
	.word	0x02010003,0x04040404,0x02010003,0x04040404
.size	_${prefix}_consts,.-_${prefix}_consts

.previous

.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align	5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
	cntw	$veclen, ALL, MUL #1
	cmp	$len,$veclen,lsl #6
	b.lt	.Lreturn
	mov	$sve2flag,0
	adrp	$tmp,OPENSSL_armcap_P
	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
	tst	$tmpw,#ARMV8_SVE2
	b.eq	1f
	mov	$sve2flag,1
	b	2f
1:
	cmp	$veclen,4
	b.le	.Lreturn
	adrp	$tmp,.Lrot8
	add	$tmp,$tmp,#:lo12:.Lrot8
	ldp	$tmpw0,$tmpw1,[$tmp]
	index	$rot8.s,$tmpw0,$tmpw1
2:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,-192]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x16,x17,[sp,64]
	stp	x18,x19,[sp,80]
	stp	x20,x21,[sp,96]
	stp	x22,x23,[sp,112]
	stp	x24,x25,[sp,128]
	stp	x26,x27,[sp,144]
	stp	x28,x29,[sp,160]
	str	x30,[sp,176]

	adrp	$tmp,.Lchacha20_consts
	add	$tmp,$tmp,#:lo12:.Lchacha20_consts
	ldp	@K[0],@K[1],[$tmp]
	ldp	@K[2],@K[3],[$key]
	ldp	@K[4],@K[5],[$key, 16]
	ldp	@K[6],@K[7],[$ctr]
	ptrues	p0.s,ALL
#ifdef	__AARCH64EB__
	ror	@K[2],@K[2],#32
	ror	@K[3],@K[3],#32
	ror	@K[4],@K[4],#32
	ror	@K[5],@K[5],#32
	ror	@K[6],@K[6],#32
	ror	@K[7],@K[7],#32
#endif
	cbz	$sve2flag, 1f
___
	&chacha20_sve2();
$code.=<<___;
	b	2f
1:
___
	&chacha20_sve();
$code.=<<___;
2:
	str	@KL[6],[$ctr]
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x16,x17,[sp,64]
	ldp	x18,x19,[sp,80]
	ldp	x20,x21,[sp,96]
	ldp	x22,x23,[sp,112]
	ldp	x24,x25,[sp,128]
	ldp	x26,x27,[sp,144]
	ldp	x28,x29,[sp,160]
	ldr	x30,[sp,176]
	ldp	d8,d9,[sp],192
	AARCH64_VALIDATE_LINK_REGISTER
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___

}}}

########################################
{
# Hand encoder for SVE/SVE2 mnemonics, used so the output assembles even
# with toolchains that lack SVE support: every mnemonic below is rendered
# as a raw ".inst" word.  The hash values are the base opcodes; operand
# fields are OR-ed in by sve_unpred/sve_pred/sve_other.
my %opcode_unpred = (
	"movprfx" => 0x0420BC00,
	"eor" => 0x04a03000,
	"add" => 0x04200000,
	"orr" => 0x04603000,
	"lsl" => 0x04209C00,
	"lsr" => 0x04209400,
	"incw" => 0x04B00000,
	"xar" => 0x04203400,
	"zip1" => 0x05206000,
	"zip2" => 0x05206400,
	"uzp1" => 0x05206800,
	"uzp2" => 0x05206C00,
	"index" => 0x04204C00,
	"mov" => 0x05203800,
	"dup" => 0x05203800,
	"cntw" => 0x04A0E000,
	"tbl" => 0x05203000);

my %opcode_imm_unpred = (
	"dup" => 0x2538C000,
	"index" => 0x04204400);

my %opcode_scalar_pred = (
	"mov" => 0x0528A000,
	"cpy" => 0x0528A000,
	"st4w" => 0xE5606000,
	"st1w" => 0xE5004000,
	"ld1w" => 0xA5404000);

my
%opcode_gather_pred = (
	"ld1w" => 0x85204000);

my %opcode_pred = (
	"eor" => 0x04190000,
	"add" => 0x04000000,
	"orr" => 0x04180000,
	"whilelo" => 0x25200C00,
	"whilelt" => 0x25200400,
	"cntp" => 0x25208000,
	"addvl" => 0x04205000,
	"lsl" => 0x04038000,
	"lsr" => 0x04018000,
	"sel" => 0x0520C000,
	"mov" => 0x0520C000,
	"ptrue" => 0x2518E000,
	"pfalse" => 0x2518E400,
	"ptrues" => 0x2519E000,
	"pnext" => 0x2519C400,
	"ld4w" => 0xA560E000,
	"st4w" => 0xE570E000,
	"st1w" => 0xE500E000,
	"ld1w" => 0xA540A000,
	"ld1rw" => 0x8540C000,
	"lasta" => 0x0520A000,
	"revh" => 0x05258000,
	"revb" => 0x05248000);

# Element-size field encodings (the "tsize"/size bits of SVE encodings).
my %tsize = (
	'b' => 0,
	'h' => 1,
	's' => 2,
	'd' => 3);

# 32- vs 64-bit general-register flavour bit.
my %sf = (
	"w" => 0,
	"x" => 1);

# Predicate-constraint pattern field values (ptrue/cntw etc.).
my %pattern = (
	"POW2" => 0,
	"VL1" => 1,
	"VL2" => 2,
	"VL3" => 3,
	"VL4" => 4,
	"VL5" => 5,
	"VL6" => 6,
	"VL7" => 7,
	"VL8" => 8,
	"VL16" => 9,
	"VL32" => 10,
	"VL64" => 11,
	"VL128" => 12,
	"VL256" => 13,
	"MUL4" => 29,
	"MUL3" => 30,
	"ALL" => 31);

# Debug aid ($debug_encoder == 1 only): write a shell script that
# assembles a single instruction with a real SVE-capable gcc and prints
# its encoding, so hand-encoded words can be cross-checked.
sub create_verifier {
	my $filename="./compile_sve.sh";

# NOTE: the heredoc interpolates, so shell '$' are escaped; backticks in
# the emitted script are literal text here.
my $scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
	# Lexical filehandle + core chmod instead of bareword FH and
	# shelling out; behavior is unchanged.
	open(my $fh, '>', $filename) or die $!;
	print $fh $scripts;
	close($fh);
	chmod 0755, $filename;
}

# Assemble one instruction with the helper script; returns its hex word.
sub compile_sve {
	return `./compile_sve.sh '@_'`
}

# Render the 32-bit word $code as a ".inst" directive with the original
# mnemonic as a trailing comment; under $debug_encoder, compare against
# the toolchain's own encoding first.
sub verify_inst {
	my ($code,$inst)=@_;
	my $hexcode = (sprintf "%08x", $code);

	if ($debug_encoder == 1) {
		my $expect=&compile_sve($inst);
		if ($expect ne $hexcode) {
			return (sprintf "%s // Encode Error!  expect [%s] actual [%s]", $inst, $expect, $hexcode);
		}
	}
	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}

# Map a general-register name suffix to its encoding number.
sub reg_code {
	my $code = shift;

	# "zr" (wzr/xzr) encodes as register 31.  This must be a string
	# comparison: the previous numeric `==` made "zr" and "0" both
	# numify to 0, so register 0 would have been mis-encoded as 31.
	if ($code eq "zr") {
		return "31";
	}
	return $code;
}

# Build the combined tszh:tszl/imm field for unpredicated shifts and xar;
# lsr/xar encode "esize*2 - shift", lsl encodes "esize + shift".
sub encode_size_imm() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}

# Same field layout for predicated shifts (immediate lands at bit 5).
sub encode_shift_pred() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}

# Encode an unpredicated vector instruction from its operand string.
sub sve_unpred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	# tbl-style: Zd.T, { Zn.T }, Zm.T
	if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
					$inst)
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $regs=$3;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			# Zd.T, Zn.T, #imm (shift amount must fit the element size)
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
							$inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
			# e.g. index Zd.T, Wn, Wm
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
			# e.g. index Zd.T, Wn, #imm
			return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+)/o) {
			# e.g. dup Zd.T, Wn
			return
&verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
		} else {
			# Zd.T, Zn.T [, Zm.T | #imm]; only some mnemonics carry
			# a size field in this form.
			my $encoded_size = 0;
			if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
				$encoded_size = ($tsize{$isize}<<22);
			}
			if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
				$1 == $regd) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
			} elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
			}
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
		# Zd.T, #imm (dup/index immediate forms)
		return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
					$inst)
	}
	sprintf "%s // fail to parse", $inst;
}

# Encode a predicated instruction (loads/stores, predicated ALU ops and
# predicate-register ops) from its operand string.
sub sve_pred {
	# (fixed: stray second comma in the parameter unpacking)
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	# { Zt.T }, Pg[/z], [Xn|SP ...]
	# NOTE(review): `p([0-9])+` looks like it was meant to be
	# `p([0-9]+)`; left as-is since only single-digit predicates occur.
	if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
		my $zt = $1;
		my $size = $tsize{$2};
		my $pg = $3;
		my $addr = $5;
		my $xn = 31;		# default base: SP

		if ($addr =~ m/x([0-9]+)\s*/o) {
			$xn = $1;
		}

		# ld1r* broadcasts always use a zero size field here.
		if ($mnemonic =~m/ld1r[bhwd]/o) {
			$size = 0;
		}
		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
			# [Xn, Xm{, LSL #n}] scalar-plus-scalar form
			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
			# [Xn, Zm.s, SXTW|UXTW] gather form
			my $xs = ($2 eq "SXTW") ? 1 : 0;
			return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
			# [Xn, #imm, MUL VL] form
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $pg = $3;
		my $mod = $4;
		my $regs = $5;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			# Zd.T, Pg/m, Zd.T, #imm — destructive, merging only.
			# (fixed: the guard previously tested the undeclared
			# variable $mode with numeric ==, which compared 0 == 0
			# and was therefore always true.)
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& $regd == $1
				&& $mod eq 'm'
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+)/o) {
			# cpy/mov Zd.T, Pg/m, Wn
			return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
		} elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
			if ($mnemonic eq "sel") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
			} elsif ($mnemonic eq "mov") {
				# mov is sel with Zm = Zd
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
			} elsif (length $2 > 0) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
			} else {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
			}
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
		# Predicate-generating forms: whilelo/whilelt/ptrue/pnext ...
		my $pg = $1;
		my $isize = $2;
		my $regs = $3;

		if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
			# whilelo/whilelt Pd.T, Wn|Xn, Wm|Xm
			# (fixed: "®_code" mojibake restored to &reg_code)
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
		} elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
		} else {
			# ptrue/ptrues Pd.T, <pattern>
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
	}

	sprintf "%s // fail to parse", $inst;
}

# Encode the remaining special forms: cntp/lasta, inc*/cnt* with element
# patterns, addvl, and movprfx-style Zd,Zn moves.
sub sve_other {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
		# cntp Xd, Pg, Pn.T
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
	} elsif ($arg =~ m/(x|w)([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*z([0-9]+)\.([bhsd])/o) {
		# lasta Wd|Xd, Pg, Zn.T
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$5}<<22)|$1|($3<<10)|($4<<5)|$2, $inst);
	} elsif ($mnemonic =~ /inc[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16)|0xE000, $inst);
		} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16)|0xC000, $inst);
		} elsif ($arg =~ m/x([0-9]+)/o) {
			# bare "incw Xd" — pattern ALL, MUL #1
			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16)|0xE000, $inst);
		}
	} elsif ($mnemonic =~ /cnt[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		}
	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
		# addvl Xd, Xn, #imm
		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
		# movprfx Zd, Zn
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
	}
	sprintf "%s // fail to parse", $inst;
}
}

# Copy this script's leading comment block (license header) into the
# output, converting '#' comments to assembler '//' comments.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

if ($debug_encoder == 1) {
	&create_verifier();
}

# Post-process $code line by line: expand `...` evals, then replace each
# SVE mnemonic with a hand-encoded ".inst" word via the sve_* encoders so
# the result assembles even without SVE toolchain support.  Lines that
# match none of the patterns pass through unchanged.
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
	s/\b(movprfx|lasta|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z|w).*)/sve_other($1,$2)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";