#! /usr/bin/env perl
# Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD and AESE on AARCH64
#
# Dec 2022
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4_ex";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my ($vtmp4,$vtmp5)=("v24","v25");
my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");

my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

sub rev32() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32 $dst.16b,$src.16b
#else
    mov $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32 $dst.16b,$dst.16b
#endif
___
    }
}

sub rev32_armeb() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32 $dst.16b,$src.16b
#else
    mov $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32 $dst.16b,$dst.16b
#endif
___
    }
}

sub rbit() {
    my $dst = shift;
    my $src = shift;
    my $std = shift;

    if ($src and ("$src" ne "$dst")) {
        if ($std eq "_gb") {
$code.=<<___;
    rbit $dst.16b,$src.16b
___
        } else {
$code.=<<___;
    mov $dst.16b,$src.16b
___
        }
    } else {
        if ($std eq "_gb") {
$code.=<<___;
    rbit $dst.16b,$dst.16b
___
        }
    }
}

sub transpose() {
    my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
    zip1 $vt0.4s,$dat0.4s,$dat1.4s
    zip2 $vt1.4s,$dat0.4s,$dat1.4s
    zip1 $vt2.4s,$dat2.4s,$dat3.4s
    zip2 $vt3.4s,$dat2.4s,$dat3.4s
    zip1 $dat0.2d,$vt0.2d,$vt2.2d
    zip2 $dat1.2d,$vt0.2d,$vt2.2d
    zip1 $dat2.2d,$vt1.2d,$vt3.2d
    zip2 $dat3.2d,$vt1.2d,$vt3.2d
___
}

# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
sub mul_matrix() {
    my $x = shift;
    my $higherMat = shift;
    my $lowerMat = shift;
    my $tmp = shift;
$code.=<<___;
    ushr $tmp.16b, $x.16b, 4
    and $x.16b, $x.16b, $ANDMaskV.16b
    tbl $x.16b, {$lowerMat.16b}, $x.16b
    tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
    eor $x.16b, $x.16b, $tmp.16b
___
}

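# The SM4 sbox below is evaluated through the hardware AES sbox (AESE with an
# all-zero round key) rather than a lookup table. Roughly: the tbl with $MaskV
# first permutes the bytes so that the ShiftRows step performed inside AESE is
# cancelled, mul_matrix() then applies an affine map (split into a high-nibble
# and a low-nibble 4-bit tbl lookup) taking SM4's field representation into
# AES's, AESE performs the shared inversion-based sbox core, and a second
# mul_matrix() maps the result back. The constants used live at .Lsbox_magic.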
# sbox operations for 4-lane of words
sub sbox() {
    my $dat = shift;

$code.=<<___;
    // optimize sbox using AESE instruction
    tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
    eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
    aese @vtmp[0].16b,@vtmp[1].16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
$code.=<<___;
    mov $dat.16b,@vtmp[0].16b

    // linear transformation
    ushr @vtmp[0].4s,$dat.4s,32-2
    ushr @vtmp[1].4s,$dat.4s,32-10
    ushr @vtmp[2].4s,$dat.4s,32-18
    ushr @vtmp[3].4s,$dat.4s,32-24
    sli @vtmp[0].4s,$dat.4s,2
    sli @vtmp[1].4s,$dat.4s,10
    sli @vtmp[2].4s,$dat.4s,18
    sli @vtmp[3].4s,$dat.4s,24
    eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
    eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
    eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
    eor $dat.16b,$dat.16b,$vtmp4.16b
___
}

# sbox operation for 8-lane of words
sub sbox_double() {
    my $dat = shift;
    my $datx = shift;

$code.=<<___;
    // optimize sbox using AESE instruction
    tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
    tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
    &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
    eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
    aese @vtmp[0].16b,$vtmp5.16b
    aese @vtmp[1].16b,$vtmp5.16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
    &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
$code.=<<___;
    mov $dat.16b,@vtmp[0].16b
    mov $datx.16b,@vtmp[1].16b

    // linear transformation
    ushr @vtmp[0].4s,$dat.4s,32-2
    ushr $vtmp5.4s,$datx.4s,32-2
    ushr @vtmp[1].4s,$dat.4s,32-10
    ushr @vtmp[2].4s,$dat.4s,32-18
    ushr @vtmp[3].4s,$dat.4s,32-24
    sli @vtmp[0].4s,$dat.4s,2
    sli $vtmp5.4s,$datx.4s,2
    sli @vtmp[1].4s,$dat.4s,10
    sli @vtmp[2].4s,$dat.4s,18
    sli @vtmp[3].4s,$dat.4s,24
    eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
    eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
    eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
    eor $dat.16b,$dat.16b,$vtmp4.16b
    ushr @vtmp[1].4s,$datx.4s,32-10
    ushr @vtmp[2].4s,$datx.4s,32-18
    ushr @vtmp[3].4s,$datx.4s,32-24
    sli @vtmp[1].4s,$datx.4s,10
    sli @vtmp[2].4s,$datx.4s,18
    sli @vtmp[3].4s,$datx.4s,24
    eor $vtmp4.16b,$vtmp5.16b,$datx.16b
    eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
    eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
    eor $datx.16b,$datx.16b,$vtmp4.16b
___
}

# sbox operation for one single word
sub sbox_1word () {
    my $word = shift;

$code.=<<___;
    mov @vtmp[3].s[0],$word
    // optimize sbox using AESE instruction
    tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
    eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
    aese @vtmp[0].16b,@vtmp[1].16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;

    mov $wtmp0,@vtmp[0].s[0]
    eor $word,$wtmp0,$wtmp0,ror #32-2
    eor $word,$word,$wtmp0,ror #32-10
    eor $word,$word,$wtmp0,ror #32-18
    eor $word,$word,$wtmp0,ror #32-24
___
}

# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor $tmpw,$word2,$word3
    eor $wtmp2,$wtmp0,$word1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word0,$word0,$tmpw
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor $tmpw,$word2,$word3
    eor $wtmp2,$word0,$wtmp1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor $word1,$word1,$tmpw
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor $tmpw,$word0,$word1
    eor $wtmp2,$wtmp0,$word3
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word2,$word2,$tmpw
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor $tmpw,$word0,$word1
    eor $wtmp2,$word2,$wtmp1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word3,$word3,$tmpw
___
}

# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    dup $rk0.4s,$wtmp0
    dup $rk1.4s,$wtmp1

    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor $rka.16b,@data[2].16b,@data[3].16b
    eor $rk0.16b,@data[1].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor @data[0].16b,@data[0].16b,$rk0.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor $rka.16b,$rka.16b,@data[0].16b
    eor $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor @data[1].16b,@data[1].16b,$rk1.16b

    dup $rk0.4s,$wtmp0
    dup $rk1.4s,$wtmp1

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor $rka.16b,@data[0].16b,@data[1].16b
    eor $rk0.16b,@data[3].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor @data[2].16b,@data[2].16b,$rk0.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor $rka.16b,$rka.16b,@data[2].16b
    eor $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    dup $rk0.4s,$wtmp0
    eor $rka.16b,@data[2].16b,@data[3].16b
    eor $rkb.16b,@datax[2].16b,@datax[3].16b
    eor @vtmp[0].16b,@data[1].16b,$rk0.16b
    eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,@vtmp[0].16b
    eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[0].16b,@data[0].16b,$rk0.16b
    eor @datax[0].16b,@datax[0].16b,$rk1.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    dup $rk1.4s,$wtmp1
    eor $rka.16b,$rka.16b,@data[0].16b
    eor $rkb.16b,$rkb.16b,@datax[0].16b
    eor $rk0.16b,$rka.16b,$rk1.16b
    eor $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor @data[1].16b,@data[1].16b,$rk0.16b
    eor @datax[1].16b,@datax[1].16b,$rk1.16b

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    dup $rk0.4s,$wtmp0
    eor $rka.16b,@data[0].16b,@data[1].16b
    eor $rkb.16b,@datax[0].16b,@datax[1].16b
    eor @vtmp[0].16b,@data[3].16b,$rk0.16b
    eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,@vtmp[0].16b
    eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[2].16b,@data[2].16b,$rk0.16b
    eor @datax[2].16b,@datax[2].16b,$rk1.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    dup $rk1.4s,$wtmp1
    eor $rka.16b,$rka.16b,@data[2].16b
    eor $rkb.16b,$rkb.16b,@datax[2].16b
    eor $rk0.16b,$rka.16b,$rk1.16b
    eor $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,$rk0.16b
    eor @datax[3].16b,@datax[3].16b,$rk1.16b
___
}

sub encrypt_1blk_norev() {
    my $dat = shift;

$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
    mov $word0,$dat.s[0]
    mov $word1,$dat.s[1]
    mov $word2,$dat.s[2]
    mov $word3,$dat.s[3]
10:
___
    &sm4_1blk($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
    mov $dat.s[0],$word3
    mov $dat.s[1],$word2
    mov $dat.s[2],$word1
    mov $dat.s[3],$word0
___
}

sub encrypt_1blk() {
    my $dat = shift;

    &encrypt_1blk_norev($dat);
    &rev32($dat,$dat);
}

sub encrypt_4blks() {
$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
10:
___
    &sm4_4blks($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
10:
___
    &sm4_8blks($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
    &rev32(@data[3],@datax[0]);
    &rev32(@data[2],@datax[1]);
    &rev32(@data[1],@datax[2]);
    &rev32(@data[0],@datax[3]);
}

sub load_sbox () {
    my $data = shift;

$code.=<<___;
    adrp $xtmp2, .Lsbox_magic
    ldr $MaskQ, [$xtmp2, #:lo12:.Lsbox_magic]
    ldr $TAHMatQ, [$xtmp2, #:lo12:.Lsbox_magic+16]
    ldr $TALMatQ, [$xtmp2, #:lo12:.Lsbox_magic+32]
    ldr $ATAHMatQ, [$xtmp2, #:lo12:.Lsbox_magic+48]
    ldr $ATALMatQ, [$xtmp2, #:lo12:.Lsbox_magic+64]
    ldr $ANDMaskQ, [$xtmp2, #:lo12:.Lsbox_magic+80]
___
}

sub mov_reg_to_vec() {
    my $src0 = shift;
    my $src1 = shift;
    my $desv = shift;
$code.=<<___;
    mov $desv.d[0],$src0
    mov $desv.d[1],$src1
___
    &rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
    my $srcv = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov $des0,$srcv.d[0]
    mov $des1,$srcv.d[1]
___
}

sub compute_tweak() {
    my $src0 = shift;
    my $src1 = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov $wtmp0,0x87
    extr $xtmp2,$src1,$src1,#32
    extr $des1,$src1,$src0,#63
    and $wtmp1,$wtmp0,$wtmp2,asr#31
    eor $des0,$xtmp1,$src0,lsl#1
___
}

sub compute_tweak_vec() {
    my $src = shift;
    my $des = shift;
    my $std = shift;
    &rbit(@vtmp[2],$src,$std);
$code.=<<___;
    adrp $xtmp2, .Lxts_magic
    ldr @qtmp[0], [$xtmp2, #:lo12:.Lxts_magic]
    shl $des.16b, @vtmp[2].16b, #1
    ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
    ushr @vtmp[1].16b, @vtmp[1].16b, #7
    mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
    eor $des.16b, $des.16b, @vtmp[1].16b
___
    &rbit($des,$des,$std);
}

$code=<<___;
#include "arm_arch.h"
.arch armv8-a+crypto
.text

.type _${prefix}_consts,%object
.align 7
_${prefix}_consts:
.Lck:
    .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
    .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
    .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
    .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
    .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
    .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
    .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
    .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
    .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
    .quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
    .quad 0x0101010101010187,0x0101010101010101
.Lsbox_magic:
    .quad 0x0b0e0104070a0d00,0x0306090c0f020508
    .quad 0x62185a2042387a00,0x22581a6002783a40
    .quad 0x15df62a89e54e923,0xc10bb67c4a803df7
    .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
    .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
    .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size _${prefix}_consts,.-_${prefix}_consts
___

{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type _${prefix}_set_key,%function
.align 4
_${prefix}_set_key:
    AARCH64_VALID_CALL_TARGET
    ld1 {$vkey.4s},[$key]
___
    &load_sbox();
    &rev32($vkey,$vkey);
$code.=<<___;
    adrp $pointer,.Lshuffles
    add $pointer,$pointer,#:lo12:.Lshuffles
    ld1 {$vmap.2d},[$pointer]
    adrp $pointer,.Lfk
    add $pointer,$pointer,#:lo12:.Lfk
    ld1 {$vfk.2d},[$pointer]
    eor $vkey.16b,$vkey.16b,$vfk.16b
    mov $schedules,#32
    adrp $pointer,.Lck
    add $pointer,$pointer,#:lo12:.Lck
    movi @vtmp[0].16b,#64
    cbnz $enc,1f
    add $keys,$keys,124
1:
    mov $wtmp,$vkey.s[1]
    ldr $roundkey,[$pointer],#4
    eor $roundkey,$roundkey,$wtmp
    mov $wtmp,$vkey.s[2]
    eor $roundkey,$roundkey,$wtmp
    mov $wtmp,$vkey.s[3]
    eor $roundkey,$roundkey,$wtmp
    // optimize sbox using AESE instruction
    mov @data[0].s[0],$roundkey
    tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
    eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
    aese @vtmp[0].16b,@vtmp[1].16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
    mov $wtmp,@vtmp[0].s[0]
    eor $roundkey,$wtmp,$wtmp,ror #19
    eor $roundkey,$roundkey,$wtmp,ror #9
    mov $wtmp,$vkey.s[0]
    eor $roundkey,$roundkey,$wtmp
    mov $vkey.s[0],$roundkey
    cbz $enc,2f
    str $roundkey,[$keys],#4
    b 3f
2:
    str $roundkey,[$keys],#-4
3:
    tbl $vkey.16b,{$vkey.16b},$vmap.16b
    subs $schedules,$schedules,#1
    b.ne 1b
    ret
.size _${prefix}_set_key,.-_${prefix}_set_key
___
}}}


{{{
$code.=<<___;
.type _${prefix}_enc_4blks,%function
.align 4
_${prefix}_enc_4blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_4blks();
$code.=<<___;
    ret
.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
___
}}}

{{{
$code.=<<___;
.type _${prefix}_enc_8blks,%function
.align 4
_${prefix}_enc_8blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_8blks();
$code.=<<___;
    ret
.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
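    // third argument to _${prefix}_set_key: 1 stores the round keys in
    // encryption order, 0 stores them reversed for decryption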
    mov w2,1
    bl _${prefix}_set_key
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    mov w2,0
    bl _${prefix}_set_key
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
    my $dir = shift;
    my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
    AARCH64_VALID_CALL_TARGET
    ld1 {@data[0].4s},[$inp]
___
    &load_sbox();
    &rev32(@data[0],@data[0]);
$code.=<<___;
    mov $rks,$rk
___
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1 {@data[0].4s},[$outp]
    ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

{{{
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    // convert length into blocks
    lsr x2,x2,4
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
___
    &load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
    cmp $blocks,#8
    b.lt .Lecb_4_blocks_process
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
$code.=<<___;
    bl _${prefix}_enc_8blks
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lecb_8_blocks_process
    b 100f
.Lecb_4_blocks_process:
    cmp $blocks,#4
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub $blocks,$blocks,#4
1:
    // process last block
    cmp $blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1 {@data[0].4s},[$outp]
    b 100f
1: // process last 2 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
    cmp $blocks,#2
    b.gt 1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
    b 100f
1: // process last 3 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
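    # lanes 0-2 of data0..data3 now hold the last three blocks (loaded above);
    # lane 3 is unused and is simply carried through the 4-block path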
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
    AARCH64_VALID_CALL_TARGET
    lsr $len,$len,4
___
    &load_sbox();
$code.=<<___;
    cbz $enc,.Ldec
    ld1 {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
    cmp $blocks,#4
    b.lt 1f
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    eor @data[0].16b,@data[0].16b,$ivec0.16b
___
    &rev32(@data[1],@data[1]);
    &rev32(@data[0],@data[0]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &encrypt_1blk_norev(@data[0]);
$code.=<<___;
    eor @data[1].16b,@data[1].16b,@data[0].16b
___
    &encrypt_1blk_norev(@data[1]);
    &rev32(@data[0],@data[0]);

$code.=<<___;
    eor @data[2].16b,@data[2].16b,@data[1].16b
___
    &encrypt_1blk_norev(@data[2]);
    &rev32(@data[1],@data[1]);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,@data[2].16b
___
    &encrypt_1blk_norev(@data[3]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    orr $ivec0.16b,@data[3].16b,@data[3].16b
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.ne .Lcbc_4_blocks_enc
    b 2f
1:
    subs $blocks,$blocks,#1
    b.lt 2f
    ld1 {@data[0].4s},[$inp],#16
    eor $ivec0.16b,$ivec0.16b,@data[0].16b
___
    &rev32($ivec0,$ivec0);
    &encrypt_1blk($ivec0);
$code.=<<___;
    st1 {$ivec0.4s},[$outp],#16
    b 1b
2:
    // save back IV
    st1 {$ivec0.4s},[$ivp]
    ret

.Ldec:
    // decryption mode starts
    AARCH64_SIGN_LINK_REGISTER
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
    cmp $blocks,#8
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
    add $ptr,$inp,#64
    ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],$data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],$datax[3]);
$code.=<<___;
    bl _${prefix}_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);
$code.=<<___;
    ld1 {$ivec1.4s},[$ivp]
    ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    // note ivec1 and vtmpx[3] are reusing the same register
    // care needs to be taken to avoid conflict
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
    eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
    // save back IV
    st1 {$vtmpx[3].4s}, [$ivp]
    eor @data[0].16b,@data[0].16b,$datax[3].16b
    eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
    eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
    eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lcbc_8_blocks_dec
    b.eq 100f
1:
    ld1 {$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
    cmp $blocks,#4
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],$data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    orr $ivec1.16b,@data[3].16b,@data[3].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.gt .Lcbc_4_blocks_dec
    // save back IV
    st1 {@data[3].4s}, [$ivp]
    b 100f
1: // last block
    subs $blocks,$blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp],#16
    // save back IV
    st1 {$data[0].4s}, [$ivp]
___
    &rev32(@datax[0],@data[0]);
    &encrypt_1blk(@datax[0]);
$code.=<<___;
    eor @datax[0].16b,@datax[0].16b,$ivec1.16b
    st1 {@datax[0].4s},[$outp],#16
    b 100f
1: // last two blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
    add $ptr,$inp,#16
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
    subs $blocks,$blocks,1
    b.gt 1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save back IV
    st1 {@data[1].4s}, [$ivp]
    b 100f
1: // last 3 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save back IV
    st1 {@data[2].4s}, [$ivp]
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
    AARCH64_VALID_CALL_TARGET
    ld1 {$ivec.4s},[$ivp]
___
    &rev32($ivec,$ivec);
    &load_sbox();
$code.=<<___;
    cmp $blocks,#1
    b.ne 1f
    // fast processing for one single block without
    // context saving overhead
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1 {@data[0].4s},[$inp]
    eor @data[0].16b,@data[0].16b,$ivec.16b
    st1 {@data[0].4s},[$outp]
    ret
1:
    AARCH64_SIGN_LINK_REGISTER
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
    mov $word0,$ivec.s[0]
    mov $word1,$ivec.s[1]
    mov $word2,$ivec.s[2]
    mov $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
    cmp $blocks,#4
    b.lt 1f
    dup @data[0].4s,$word0
    dup @data[1].4s,$word1
    dup @data[2].4s,$word2
    mov @data[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov $data[3].s[1],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[2],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[3],$ctr
    add $ctr,$ctr,#1
    cmp $blocks,#8
    b.ge .Lctr32_8_blocks_process
    bl _${prefix}_enc_4blks
    ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.ne .Lctr32_4_blocks_process
    b 100f
.Lctr32_8_blocks_process:
    dup @datax[0].4s,$word0
    dup @datax[1].4s,$word1
    dup @datax[2].4s,$word2
    mov @datax[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov $datax[3].s[1],$ctr
    add $ctr,$ctr,#1
    mov @datax[3].s[2],$ctr
    add $ctr,$ctr,#1
    mov @datax[3].s[3],$ctr
    add $ctr,$ctr,#1
    bl _${prefix}_enc_8blks
    ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    eor @data[0].16b,@data[0].16b,@datax[0].16b
    eor @data[1].16b,@data[1].16b,@datax[1].16b
    eor @data[2].16b,@data[2].16b,@datax[2].16b
    eor @data[3].16b,@data[3].16b,@datax[3].16b
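    // all eight blocks have been xored with the keystream; store both halves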
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.ne .Lctr32_4_blocks_process
    b 100f
1: // last block processing
    subs $blocks,$blocks,#1
    b.lt 100f
    b.gt 1f
    mov $ivec.s[0],$word0
    mov $ivec.s[1],$word1
    mov $ivec.s[2],$word2
    mov $ivec.s[3],$ctr
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1 {@data[0].4s},[$inp]
    eor @data[0].16b,@data[0].16b,$ivec.16b
    st1 {@data[0].4s},[$outp]
    b 100f
1: // last 2 blocks processing
    dup @data[0].4s,$word0
    dup @data[1].4s,$word1
    dup @data[2].4s,$word2
    mov @data[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[1],$ctr
    subs $blocks,$blocks,#1
    b.ne 1f
    bl _${prefix}_enc_4blks
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    b 100f
1: // last 3 blocks processing
    add $ctr,$ctr,#1
    mov @data[3].s[2],$ctr
    bl _${prefix}_enc_4blks
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}


{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=map("v$_",(16..23));
my $lastTweak=("v25");

sub gen_xts_cipher() {
    my $std = shift;
$code.=<<___;
.globl ${prefix}_xts_encrypt${std}
.type ${prefix}_xts_encrypt${std},%function
.align 5
${prefix}_xts_encrypt${std}:
    AARCH64_SIGN_LINK_REGISTER
    stp x15, x16, [sp, #-0x10]!
    stp x17, x18, [sp, #-0x10]!
    stp x19, x20, [sp, #-0x10]!
    stp x21, x22, [sp, #-0x10]!
    stp x23, x24, [sp, #-0x10]!
    stp x25, x26, [sp, #-0x10]!
    stp x27, x28, [sp, #-0x10]!
    stp x29, x30, [sp, #-0x10]!
    stp d8, d9, [sp, #-0x10]!
    stp d10, d11, [sp, #-0x10]!
    stp d12, d13, [sp, #-0x10]!
    stp d14, d15, [sp, #-0x10]!
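    // x3: round keys for the data blocks, x4: round keys used to encrypt
    // the initial tweak, w6: 1 for encryption, 0 for decryption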
    mov $rks1,x3
    mov $rks2,x4
    mov $enc,w6
    ld1 {@tweak[0].4s}, [$ivp]
    mov $rks,$rks2
___
    &load_sbox();
    &rev32(@tweak[0],@tweak[0]);
    &encrypt_1blk(@tweak[0]);
$code.=<<___;
    mov $rks,$rks1
    and $remain,$len,#0x0F
    // convert length into blocks
    lsr $blocks,$len,4
    cmp $blocks,#1
    b.lt .return${std}

    cmp $remain,0
    // If the encryption/decryption Length is N times of 16,
    // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
    b.eq .xts_encrypt_blocks${std}

    // If the encryption/decryption length is not N times of 16,
    // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
    // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
    subs $blocks,$blocks,#1
    b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rev32_armeb(@tweak[0],@tweak[0]);
    &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
    cmp $blocks,#8
___
    &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
    &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
    &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
    b.lt .Lxts_4_blocks_process${std}
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
    &rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
    eor @data[3].16b, @data[3].16b, @tweak[3].16b
    ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rbit(@tweak[4],@tweak[4],$std);
    &rbit(@tweak[5],@tweak[5],$std);
    &rbit(@tweak[6],@tweak[6],$std);
    &rbit(@tweak[7],@tweak[7],$std);
$code.=<<___;
    eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
    eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
    eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
    eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
    &transpose(@data,@vtmp);
    &transpose(@datax,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
    eor @data[0].16b, @data[0].16b, @tweak[4].16b
    eor @data[1].16b, @data[1].16b, @tweak[5].16b
    eor @data[2].16b, @data[2].16b, @tweak[6].16b
    eor @data[3].16b, @data[3].16b, @tweak[7].16b

    // save the last tweak
    mov $lastTweak.16b,@tweak[7].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lxts_8_blocks_process${std}
    b 100f
.Lxts_4_blocks_process${std}:
    cmp $blocks,#4
    b.lt 1f
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
    &rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
    eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub $blocks,$blocks,#4
    mov @tweak[0].16b,@tweak[4].16b
    mov @tweak[1].16b,@tweak[5].16b
    mov @tweak[2].16b,@tweak[6].16b
    // save the last tweak
    mov $lastTweak.16b,@tweak[3].16b
1:
    // process last block
    cmp $blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp],#16
___
    &rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    st1 {@data[0].4s},[$outp],#16
    // save the last tweak
    mov $lastTweak.16b,@tweak[0].16b
    b 100f
1: // process last 2 blocks
    cmp $blocks,#2
    b.gt 1f
    ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save the last tweak
    mov $lastTweak.16b,@tweak[1].16b
    b 100f
1: // process last 3 blocks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save the last tweak
    mov $lastTweak.16b,@tweak[2].16b
100:
    cmp $remain,0
    b.eq .return${std}

// This branch calculates the last two tweaks,
// while the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
___
    &rev32_armeb($lastTweak,$lastTweak);
    &compute_tweak_vec($lastTweak,@tweak[1],$std);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b .check_dec${std}


// This branch calculates the last two tweaks,
// while the encryption/decryption length is equal to 32, who only need two tweaks
.only_2blks_tweak${std}:
    mov @tweak[1].16b,@tweak[0].16b
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
    // encryption:1 decryption:0
    cmp $enc,1
    b.eq .process_last_2blks${std}
    mov @vtmp[0].16B,@tweak[1].16b
    mov @tweak[1].16B,@tweak[2].16b
    mov @tweak[2].16B,@vtmp[0].16b

.process_last_2blks${std}:
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
    ld1 {@data[0].4s},[$inp],#16
    eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[1].16b
    st1 {@data[0].4s},[$outp],#16

    sub $lastBlk,$outp,16
.loop${std}:
    subs $remain,$remain,1
    ldrb $wtmp0,[$lastBlk,$remain]
    ldrb $wtmp1,[$inp,$remain]
    strb $wtmp1,[$lastBlk,$remain]
    strb $wtmp0,[$outp,$remain]
    b.gt .loop${std}
    ld1 {@data[0].4s}, [$lastBlk]
    eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[2].16b
    st1 {@data[0].4s}, [$lastBlk]
.return${std}:
    ldp d14, d15, [sp], #0x10
    ldp d12, d13, [sp], #0x10
    ldp d10, d11, [sp], #0x10
    ldp d8, d9, [sp], #0x10
    ldp x29, x30, [sp], #0x10
    ldp x27, x28, [sp], #0x10
    ldp x25, x26, [sp], #0x10
    ldp x23, x24, [sp], #0x10
    ldp x21, x22, [sp], #0x10
    ldp x19, x20, [sp], #0x10
    ldp x17, x18, [sp], #0x10
    ldp x15, x16, [sp], #0x10
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}

########################################
open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/\/\// and !/^$/);
    print;
}
close SELF;

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";