#! /usr/bin/env perl
# Copyright (C) 2023 Intel Corporation
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# This implementation is based on the AES-XTS code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Intelligent Storage Acceleration Library Crypto Version
# (https://github.com/intel/isa-l_crypto).
#
######################################################################
# The main building block of the loop is code that encrypts/decrypts
# 8/16 blocks of data, stitched with the generation of the tweaks for
# the next 8/16 blocks, using VAES and VPCLMULQDQ instructions at the
# full width of the ZMM registers. The main loop is selected based on
# the input length:
# main_loop_run_16 encrypts/decrypts 16 blocks in parallel and is
# selected when the input length is >= 256 bytes (16 blocks or more);
# main_loop_run_8 encrypts/decrypts 8 blocks in parallel and is
# selected when 128 bytes <= input length < 256 bytes (8-15 blocks).
# Input lengths < 128 bytes (fewer than 8 blocks) are handled by
# do_n_blocks.
#
# This implementation mainly uses vpshrdq from the AVX-512 VBMI2
# extension, together with vaesenc, vaesdec and vpclmulqdq from the
# VAES and VPCLMULQDQ extensions.

$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512vaes=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512vaes = ($1>=2.30);
}

if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512vaes = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512vaes && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple clang uses a different version numbering, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512vaes = ($ver>=10.0001);
    } else {
        $avx512vaes = ($ver>=7.0);
    }
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

#======================================================================

if ($avx512vaes) {

  my $GP_STORAGE  = $win64 ? (16 * 18) : (16 * 8);  # save area for rbx (and rdi/rsi on Win64)
  my $XMM_STORAGE = $win64 ? (16 * 8) : 0;          # save area for xmm6:xmm15 (Win64 only)
  my $VARIABLE_OFFSET = $win64 ? (16*8 + 16*10 + 8*3) :
                                 (16*8 + 8*1);

  # Offsets below 0x80 (128) in this frame hold the pre-computed tweak
  # values; the register save areas ($XMM_STORAGE/$GP_STORAGE) start at
  # 0x80. All uses of rsp should go through $TW; do not shadow it with
  # any other name or use rsp directly.
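
  # Note added for readability: the scalar tweak update used by the
  # block-at-a-time tail code below (the shl/adc/cmovc/xor sequences on
  # $TEMPLOW/$TEMPHIGH) computes tweak = tweak * x in GF(2^128) with the
  # XTS polynomial 0x87, i.e. roughly
  #     carry = tweak >> 127
  #     tweak = (tweak << 1) ^ (carry ? 0x87 : 0)
  # The vectorized paths instead advance a whole ZMM of four tweaks by
  # eight block positions at once (a multiply by x^8) using the
  # vpsrldq/vpclmulqdq/vpslldq/vpxord sequences.
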
  my $TW = "%rsp";
  my $TEMPHIGH = "%rbx";
  my $TEMPLOW = "%rax";
  my $ZPOLY = "%zmm25";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Function arguments abstraction
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  my ($key2, $key1, $tweak, $length, $input, $output);

  $input = "%rdi";
  $output = "%rsi";
  $length = "%rdx";
  $key1 = "%rcx";
  $key2 = "%r8";
  $tweak = "%r9";

  # registers used for temporary values
  my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp);
  $tmp1 = "%r8";
  $gf_poly_8b = "%r10";
  $gf_poly_8b_temp = "%r11";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Helper functions
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  # Generates "random" local labels
  sub random_string() {
    my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
    my $length = 15;
    my $str;
    map { $str .= $chars[rand(33)] } 1 .. $length;
    return $str;
  }

  # Seed the RNG so the labels are generated deterministically
  srand(12345);

  sub encrypt_tweak {
    my $state_tweak = $_[0];
    my $is_128 = $_[1];

    $code.=<<___;
    vpxor ($key2), $state_tweak, $state_tweak
    vaesenc 0x10($key2), $state_tweak, $state_tweak
    vaesenc 0x20($key2), $state_tweak, $state_tweak
    vaesenc 0x30($key2), $state_tweak, $state_tweak
    vaesenc 0x40($key2), $state_tweak, $state_tweak
    vaesenc 0x50($key2), $state_tweak, $state_tweak
    vaesenc 0x60($key2), $state_tweak, $state_tweak
    vaesenc 0x70($key2), $state_tweak, $state_tweak
    vaesenc 0x80($key2), $state_tweak, $state_tweak
    vaesenc 0x90($key2), $state_tweak, $state_tweak
___

    if ($is_128) {
      $code .= "vaesenclast 0xa0($key2), $state_tweak, $state_tweak\n";
    } else {
      $code .= "vaesenc 0xa0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenc 0xb0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenc 0xc0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenc 0xd0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenclast 0xe0($key2), $state_tweak, $state_tweak\n";
    }
    $code .= "vmovdqa $state_tweak, ($TW)\n";
  }

  sub encrypt_final {
    my $st = $_[0];
    my $tw = $_[1];
    my $is_128 = $_[2];

    # xor Tweak value
    $code .= "vpxor $tw, $st, $st\n";
    $code .= "vpxor ($key1), $st, $st\n";

    my $rounds = $is_128 ?
10 : 14; 155 for (my $i = 1; $i < $rounds; $i++) { 156 $code .= "vaesenc 16*$i($key1), $st, $st\n"; 157 } 158 159 $code .=<<___; 160 vaesenclast 16*$rounds($key1), $st, $st 161 vpxor $tw, $st, $st 162___ 163 } 164 165 # decrypt initial blocks of AES 166 # 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted 167 # next 8 Tweak values are generated 168 sub decrypt_initial { 169 my @st; 170 $st[0] = $_[0]; 171 $st[1] = $_[1]; 172 $st[2] = $_[2]; 173 $st[3] = $_[3]; 174 $st[4] = $_[4]; 175 $st[5] = $_[5]; 176 $st[6] = $_[6]; 177 $st[7] = $_[7]; 178 179 my @tw; 180 $tw[0] = $_[8]; 181 $tw[1] = $_[9]; 182 $tw[2] = $_[10]; 183 $tw[3] = $_[11]; 184 $tw[4] = $_[12]; 185 $tw[5] = $_[13]; 186 $tw[6] = $_[14]; 187 my $t0 = $_[15]; 188 my $num_blocks = $_[16]; 189 my $lt128 = $_[17]; 190 my $is_128 = $_[18]; 191 192 # num_blocks blocks encrypted 193 # num_blocks can be 1, 2, 3, 4, 5, 6, 7 194 195 # xor Tweak value 196 for (my $i = 0; $i < $num_blocks; $i++) { 197 $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; 198 } 199 200 $code .= "vmovdqu ($key1), $t0\n"; 201 202 for (my $i = 0; $i < $num_blocks; $i++) { 203 $code .= "vpxor $t0, $st[$i], $st[$i]\n"; 204 } 205 206 if (0 == $lt128) { 207 $code .= <<___; 208 xor $gf_poly_8b_temp, $gf_poly_8b_temp 209 shl \$1, $TEMPLOW 210 adc $TEMPHIGH, $TEMPHIGH 211___ 212 } 213 # round 1 214 $code .= "vmovdqu 0x10($key1), $t0\n"; 215 216 for (my $i = 0; $i < $num_blocks; $i++) { 217 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 218 } 219 220 if (0 == $lt128) { 221 $code .= <<___; 222 cmovc $gf_poly_8b, $gf_poly_8b_temp 223 xor $gf_poly_8b_temp, $TEMPLOW 224 mov $TEMPLOW, ($TW) # next Tweak1 generated 225 mov $TEMPLOW, 0x08($TW) 226 xor $gf_poly_8b_temp, $gf_poly_8b_temp 227___ 228 } 229 230 # round 2 231 $code .= "vmovdqu 0x20($key1), $t0\n"; 232 233 for (my $i = 0; $i < $num_blocks; $i++) { 234 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 235 } 236 237 if (0 == $lt128) { 238 $code .= <<___; 239 shl \$1, $TEMPLOW 240 adc $TEMPHIGH, $TEMPHIGH 241 cmovc $gf_poly_8b, $gf_poly_8b_temp 242 xor $gf_poly_8b_temp, $TEMPLOW 243 mov $TEMPLOW, 0x10($TW) # next Tweak2 generated 244___ 245 } 246 247 # round 3 248 $code .= "vmovdqu 0x30($key1), $t0\n"; 249 250 for (my $i = 0; $i < $num_blocks; $i++) { 251 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 252 } 253 254 if (0 == $lt128) { 255 $code .= <<___; 256 mov $TEMPHIGH, 0x18($TW) 257 xor $gf_poly_8b_temp, $gf_poly_8b_temp 258 shl \$1, $TEMPLOW 259 adc $TEMPHIGH, $TEMPHIGH 260 cmovc $gf_poly_8b, $gf_poly_8b_temp 261___ 262 } 263 264 # round 4 265 $code .= "vmovdqu 0x40($key1), $t0\n"; 266 267 for (my $i = 0; $i < $num_blocks; $i++) { 268 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 269 } 270 271 if (0 == $lt128) { 272 $code .= <<___; 273 xor $gf_poly_8b_temp, $TEMPLOW 274 mov $TEMPLOW, 0x20($TW) # next Tweak3 generated 275 mov $TEMPHIGH, 0x28($TW) 276 xor $gf_poly_8b_temp, $gf_poly_8b_temp 277 shl \$1, $TEMPLOW 278___ 279 } 280 281 # round 5 282 $code .= "vmovdqu 0x50($key1), $t0\n"; 283 284 for (my $i = 0; $i < $num_blocks; $i++) { 285 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 286 } 287 288 if (0 == $lt128) { 289 $code .= <<___; 290 adc $TEMPHIGH, $TEMPHIGH 291 cmovc $gf_poly_8b, $gf_poly_8b_temp 292 xor $gf_poly_8b_temp, $TEMPLOW 293 mov $TEMPLOW, 0x30($TW) # next Tweak4 generated 294 mov $TEMPHIGH, 0x38($TW) 295___ 296 } 297 298 # round 6 299 $code .= "vmovdqu 0x60($key1), $t0\n"; 300 301 for (my $i = 0; $i < $num_blocks; $i++) { 302 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 303 } 304 305 if (0 == $lt128) { 306 $code .= <<___; 307 xor 
$gf_poly_8b_temp, $gf_poly_8b_temp 308 shl \$1, $TEMPLOW 309 adc $TEMPHIGH, $TEMPHIGH 310 cmovc $gf_poly_8b, $gf_poly_8b_temp 311 xor $gf_poly_8b_temp, $TEMPLOW 312 mov $TEMPLOW, 0x40($TW) # next Tweak5 generated 313 mov $TEMPHIGH, 0x48($TW) 314___ 315 } 316 317 # round 7 318 $code .= "vmovdqu 0x70($key1), $t0\n"; 319 320 for (my $i = 0; $i < $num_blocks; $i++) { 321 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 322 } 323 324 if (0 == $lt128) { 325 $code .= <<___; 326 xor $gf_poly_8b_temp, $gf_poly_8b_temp 327 shl \$1, $TEMPLOW 328 adc $TEMPHIGH, $TEMPHIGH 329 cmovc $gf_poly_8b, $gf_poly_8b_temp 330 xor $gf_poly_8b_temp, $TEMPLOW 331 mov $TEMPLOW, 0x50($TW) # next Tweak6 generated 332 mov $TEMPHIGH, 0x58($TW) 333___ 334 } 335 336 # round 8 337 $code .= "vmovdqu 0x80($key1), $t0\n"; 338 339 for (my $i = 0; $i < $num_blocks; $i++) { 340 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 341 } 342 343 if (0 == $lt128) { 344 $code .= <<___; 345 xor $gf_poly_8b_temp, $gf_poly_8b_temp 346 shl \$1, $TEMPLOW 347 adc $TEMPHIGH, $TEMPHIGH 348 cmovc $gf_poly_8b, $gf_poly_8b_temp 349 xor $gf_poly_8b_temp, $TEMPLOW 350 mov $TEMPLOW, 0x60($TW) # next Tweak7 generated 351 mov $TEMPHIGH, 0x68($TW) 352___ 353 } 354 355 # round 9 356 $code .= "vmovdqu 0x90($key1), $t0\n"; 357 358 for (my $i = 0; $i < $num_blocks; $i++) { 359 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 360 } 361 362 if (0 == $lt128) { 363 $code .= <<___; 364 xor $gf_poly_8b_temp, $gf_poly_8b_temp 365 shl \$1, $TEMPLOW 366 adc $TEMPHIGH, $TEMPHIGH 367 cmovc $gf_poly_8b, $gf_poly_8b_temp 368 xor $gf_poly_8b_temp, $TEMPLOW 369 mov $TEMPLOW, 0x70($TW) # next Tweak8 generated 370 mov $TEMPHIGH, 0x78($TW) 371___ 372 } 373 374 if ($is_128) { 375 # round 10 376 $code .= "vmovdqu 0xa0($key1), $t0\n"; 377 for (my $i = 0; $i < $num_blocks; $i++) { 378 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 379 } 380 } else { 381 # round 10 382 $code .= "vmovdqu 0xa0($key1), $t0\n"; 383 for (my $i = 0; $i < $num_blocks; $i++) { 384 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 385 } 386 387 # round 11 388 $code .= "vmovdqu 0xb0($key1), $t0\n"; 389 for (my $i = 0; $i < $num_blocks; $i++) { 390 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 391 } 392 393 # round 12 394 $code .= "vmovdqu 0xc0($key1), $t0\n"; 395 for (my $i = 0; $i < $num_blocks; $i++) { 396 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 397 } 398 399 # round 13 400 $code .= "vmovdqu 0xd0($key1), $t0\n"; 401 for (my $i = 0; $i < $num_blocks; $i++) { 402 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 403 } 404 405 # round 14 406 $code .= "vmovdqu 0xe0($key1), $t0\n"; 407 for (my $i = 0; $i < $num_blocks; $i++) { 408 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 409 } 410 } 411 412 # xor Tweak values 413 for (my $i = 0; $i < $num_blocks; $i++) { 414 $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; 415 } 416 417 if (0 == $lt128) { 418 # load next Tweak values 419 $code .= <<___; 420 vmovdqa ($TW), $tw1 421 vmovdqa 0x10($TW), $tw2 422 vmovdqa 0x20($TW), $tw3 423 vmovdqa 0x30($TW), $tw4 424 vmovdqa 0x40($TW), $tw5 425 vmovdqa 0x50($TW), $tw6 426 vmovdqa 0x60($TW), $tw7 427___ 428 } 429 } 430 431 sub initialize { 432 my @st; 433 $st[0] = $_[0]; 434 $st[1] = $_[1]; 435 $st[2] = $_[2]; 436 $st[3] = $_[3]; 437 $st[4] = $_[4]; 438 $st[5] = $_[5]; 439 $st[6] = $_[6]; 440 $st[7] = $_[7]; 441 442 my @tw; 443 $tw[0] = $_[8]; 444 $tw[1] = $_[9]; 445 $tw[2] = $_[10]; 446 $tw[3] = $_[11]; 447 $tw[4] = $_[12]; 448 $tw[5] = $_[13]; 449 $tw[6] = $_[14]; 450 my $num_initial_blocks = $_[15]; 451 452 $code .= <<___; 453 vmovdqa 0x0($TW), 
$tw[0] 454 mov 0x0($TW), $TEMPLOW 455 mov 0x08($TW), $TEMPHIGH 456 vmovdqu 0x0($input), $st[0] 457___ 458 459 if ($num_initial_blocks >= 2) { 460 for (my $i = 1; $i < $num_initial_blocks; $i++) { 461 $code .= "xor $gf_poly_8b_temp, $gf_poly_8b_temp\n"; 462 $code .= "shl \$1, $TEMPLOW\n"; 463 $code .= "adc $TEMPHIGH, $TEMPHIGH\n"; 464 $code .= "cmovc $gf_poly_8b, $gf_poly_8b_temp\n"; 465 $code .= "xor $gf_poly_8b_temp, $TEMPLOW\n"; 466 my $offset = $i * 16; 467 $code .= "mov $TEMPLOW, $offset($TW)\n"; 468 $code .= "mov $TEMPHIGH, $offset + 8($TW)\n"; 469 $code .= "vmovdqa $offset($TW), $tw[$i]\n"; 470 $code .= "vmovdqu $offset($input), $st[$i]\n"; 471 } 472 } 473 } 474 475 # Encrypt 4 blocks in parallel 476 sub encrypt_by_four { 477 my $st1 = $_[0]; # state 1 478 my $tw1 = $_[1]; # tweak 1 479 my $tmp = $_[2]; 480 my $is_128 = $_[3]; 481 482 $code .= "vbroadcasti32x4 ($key1), $tmp\n"; 483 $code .= "vpternlogq \$0x96, $tmp, $tw1, $st1\n"; 484 485 my $rounds = $is_128 ? 10 : 14; 486 for (my $i = 1; $i < $rounds; $i++) { 487 $code .= "vbroadcasti32x4 16*$i($key1), $tmp\n"; 488 $code .= "vaesenc $tmp, $st1, $st1\n"; 489 } 490 491 $code .= "vbroadcasti32x4 16*$rounds($key1), $tmp\n"; 492 $code .= "vaesenclast $tmp, $st1, $st1\n"; 493 494 $code .= "vpxorq $tw1, $st1, $st1\n"; 495 } 496 497 # Encrypt 8 blocks in parallel 498 # generate next 8 tweak values 499 sub encrypt_by_eight_zmm { 500 my $st1 = $_[0]; 501 my $st2 = $_[1]; 502 my $tw1 = $_[2]; 503 my $tw2 = $_[3]; 504 my $t0 = $_[4]; 505 my $last_eight = $_[5]; 506 my $is_128 = $_[6]; 507 508 $code .= <<___; 509 vbroadcasti32x4 ($key1), $t0 510 vpternlogq \$0x96, $t0, $tw1, $st1 511 vpternlogq \$0x96, $t0, $tw2, $st2 512___ 513 514 if (0 == $last_eight) { 515 $code .= <<___; 516 vpsrldq \$0xf, $tw1, %zmm13 517 vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 518 vpslldq \$0x1, $tw1, %zmm15 519 vpxord %zmm14, %zmm15, %zmm15 520___ 521 } 522 # round 1 523 $code .= <<___; 524 vbroadcasti32x4 0x10($key1), $t0 525 vaesenc $t0, $st1, $st1 526 vaesenc $t0, $st2, $st2 527 528 # round 2 529 vbroadcasti32x4 0x20($key1), $t0 530 vaesenc $t0, $st1, $st1 531 vaesenc $t0, $st2, $st2 532 533 # round 3 534 vbroadcasti32x4 0x30($key1), $t0 535 vaesenc $t0, $st1, $st1 536 vaesenc $t0, $st2, $st2 537___ 538 539 if (0 == $last_eight) { 540 $code .= <<___; 541 vpsrldq \$0xf, $tw2, %zmm13 542 vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 543 vpslldq \$0x1, $tw2, %zmm16 544 vpxord %zmm14, %zmm16, %zmm16 545___ 546 } 547 548 $code .= <<___; 549 # round 4 550 vbroadcasti32x4 0x40($key1), $t0 551 vaesenc $t0, $st1, $st1 552 vaesenc $t0, $st2, $st2 553 554 # round 5 555 vbroadcasti32x4 0x50($key1), $t0 556 vaesenc $t0, $st1, $st1 557 vaesenc $t0, $st2, $st2 558 559 # round 6 560 vbroadcasti32x4 0x60($key1), $t0 561 vaesenc $t0, $st1, $st1 562 vaesenc $t0, $st2, $st2 563 564 # round 7 565 vbroadcasti32x4 0x70($key1), $t0 566 vaesenc $t0, $st1, $st1 567 vaesenc $t0, $st2, $st2 568 569 # round 8 570 vbroadcasti32x4 0x80($key1), $t0 571 vaesenc $t0, $st1, $st1 572 vaesenc $t0, $st2, $st2 573 574 # round 9 575 vbroadcasti32x4 0x90($key1), $t0 576 vaesenc $t0, $st1, $st1 577 vaesenc $t0, $st2, $st2 578___ 579 580 if ($is_128) { 581 $code .= <<___; 582 # round 10 583 vbroadcasti32x4 0xa0($key1), $t0 584 vaesenclast $t0, $st1, $st1 585 vaesenclast $t0, $st2, $st2 586___ 587 } else { 588 $code .= <<___; 589 # round 10 590 vbroadcasti32x4 0xa0($key1), $t0 591 vaesenc $t0, $st1, $st1 592 vaesenc $t0, $st2, $st2 593 594 # round 11 595 vbroadcasti32x4 0xb0($key1), $t0 596 vaesenc $t0, $st1, $st1 
597 vaesenc $t0, $st2, $st2 598 599 # round 12 600 vbroadcasti32x4 0xc0($key1), $t0 601 vaesenc $t0, $st1, $st1 602 vaesenc $t0, $st2, $st2 603 604 # round 13 605 vbroadcasti32x4 0xd0($key1), $t0 606 vaesenc $t0, $st1, $st1 607 vaesenc $t0, $st2, $st2 608 609 # round 14 610 vbroadcasti32x4 0xe0($key1), $t0 611 vaesenclast $t0, $st1, $st1 612 vaesenclast $t0, $st2, $st2 613___ 614 } 615 616 # xor Tweak values 617 $code .= "vpxorq $tw1, $st1, $st1\n"; 618 $code .= "vpxorq $tw2, $st2, $st2\n"; 619 620 if (0 == $last_eight) { 621 # load next Tweak values 622 $code .= <<___; 623 vmovdqa32 %zmm15, $tw1 624 vmovdqa32 %zmm16, $tw2 625___ 626 } 627 } 628 629 # Decrypt 8 blocks in parallel 630 # generate next 8 tweak values 631 sub decrypt_by_eight_zmm { 632 my $st1 = $_[0]; 633 my $st2 = $_[1]; 634 my $tw1 = $_[2]; 635 my $tw2 = $_[3]; 636 my $t0 = $_[4]; 637 my $last_eight = $_[5]; 638 my $is_128 = $_[6]; 639 640 $code .= <<___; 641 # xor Tweak values 642 vpxorq $tw1, $st1, $st1 643 vpxorq $tw2, $st2, $st2 644 645 # ARK 646 vbroadcasti32x4 ($key1), $t0 647 vpxorq $t0, $st1, $st1 648 vpxorq $t0, $st2, $st2 649___ 650 651 if (0 == $last_eight) { 652 $code .= <<___; 653 vpsrldq \$0xf, $tw1, %zmm13 654 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 655 vpslldq \$0x1, $tw1, %zmm15 656 vpxord %zmm14, %zmm15, %zmm15 657___ 658 } 659 # round 1 660 $code .= <<___; 661 vbroadcasti32x4 0x10($key1), $t0 662 vaesdec $t0, $st1, $st1 663 vaesdec $t0, $st2, $st2 664 665 # round 2 666 vbroadcasti32x4 0x20($key1), $t0 667 vaesdec $t0, $st1, $st1 668 vaesdec $t0, $st2, $st2 669 670 # round 3 671 vbroadcasti32x4 0x30($key1), $t0 672 vaesdec $t0, $st1, $st1 673 vaesdec $t0, $st2, $st2 674___ 675 676 if (0 == $last_eight) { 677 $code .= <<___; 678 vpsrldq \$0xf, $tw2, %zmm13 679 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 680 vpslldq \$0x1, $tw2, %zmm16 681 vpxord %zmm14, %zmm16, %zmm16 682___ 683 } 684 685 $code .= <<___; 686 # round 4 687 vbroadcasti32x4 0x40($key1), $t0 688 vaesdec $t0, $st1, $st1 689 vaesdec $t0, $st2, $st2 690 691 # round 5 692 vbroadcasti32x4 0x50($key1), $t0 693 vaesdec $t0, $st1, $st1 694 vaesdec $t0, $st2, $st2 695 696 # round 6 697 vbroadcasti32x4 0x60($key1), $t0 698 vaesdec $t0, $st1, $st1 699 vaesdec $t0, $st2, $st2 700 701 # round 7 702 vbroadcasti32x4 0x70($key1), $t0 703 vaesdec $t0, $st1, $st1 704 vaesdec $t0, $st2, $st2 705 706 # round 8 707 vbroadcasti32x4 0x80($key1), $t0 708 vaesdec $t0, $st1, $st1 709 vaesdec $t0, $st2, $st2 710 711 # round 9 712 vbroadcasti32x4 0x90($key1), $t0 713 vaesdec $t0, $st1, $st1 714 vaesdec $t0, $st2, $st2 715 716___ 717 if ($is_128) { 718 $code .= <<___; 719 # round 10 720 vbroadcasti32x4 0xa0($key1), $t0 721 vaesdeclast $t0, $st1, $st1 722 vaesdeclast $t0, $st2, $st2 723___ 724 } else { 725 $code .= <<___; 726 # round 10 727 vbroadcasti32x4 0xa0($key1), $t0 728 vaesdec $t0, $st1, $st1 729 vaesdec $t0, $st2, $st2 730 731 # round 11 732 vbroadcasti32x4 0xb0($key1), $t0 733 vaesdec $t0, $st1, $st1 734 vaesdec $t0, $st2, $st2 735 736 # round 12 737 vbroadcasti32x4 0xc0($key1), $t0 738 vaesdec $t0, $st1, $st1 739 vaesdec $t0, $st2, $st2 740 741 # round 13 742 vbroadcasti32x4 0xd0($key1), $t0 743 vaesdec $t0, $st1, $st1 744 vaesdec $t0, $st2, $st2 745 746 # round 14 747 vbroadcasti32x4 0xe0($key1), $t0 748 vaesdeclast $t0, $st1, $st1 749 vaesdeclast $t0, $st2, $st2 750___ 751 } 752 753 $code .= <<___; 754 # xor Tweak values 755 vpxorq $tw1, $st1, $st1 756 vpxorq $tw2, $st2, $st2 757 758 # load next Tweak values 759 vmovdqa32 %zmm15, $tw1 760 vmovdqa32 %zmm16, $tw2 
761___ 762 } 763 764 # Encrypt 16 blocks in parallel 765 # generate next 16 tweak values 766 sub encrypt_by_16_zmm { 767 my @st; 768 $st[0] = $_[0]; 769 $st[1] = $_[1]; 770 $st[2] = $_[2]; 771 $st[3] = $_[3]; 772 773 my @tw; 774 $tw[0] = $_[4]; 775 $tw[1] = $_[5]; 776 $tw[2] = $_[6]; 777 $tw[3] = $_[7]; 778 779 my $t0 = $_[8]; 780 my $last_eight = $_[9]; 781 my $is_128 = $_[10]; 782 783 # xor Tweak values 784 for (my $i = 0; $i < 4; $i++) { 785 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 786 } 787 788 # ARK 789 $code .= "vbroadcasti32x4 ($key1), $t0\n"; 790 for (my $i = 0; $i < 4; $i++) { 791 $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; 792 } 793 794 if (0 == $last_eight) { 795 $code .= <<___; 796 vpsrldq \$0xf, $tw[2], %zmm13 797 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 798 vpslldq \$0x1, $tw[2], %zmm15 799 vpxord %zmm14, %zmm15, %zmm15 800___ 801 } 802 803 # round 1 804 $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; 805 for (my $i = 0; $i < 4; $i++) { 806 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 807 } 808 809 # round 2 810 $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; 811 for (my $i = 0; $i < 4; $i++) { 812 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 813 } 814 815 # round 3 816 $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; 817 for (my $i = 0; $i < 4; $i++) { 818 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 819 } 820 821 if (0 == $last_eight) { 822 $code .= <<___; 823 vpsrldq \$0xf, $tw[3], %zmm13 824 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 825 vpslldq \$0x1, $tw[3], %zmm16 826 vpxord %zmm14, %zmm16, %zmm16 827___ 828 } 829 # round 4 830 $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; 831 for (my $i = 0; $i < 4; $i++) { 832 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 833 } 834 835 # round 5 836 $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; 837 for (my $i = 0; $i < 4; $i++) { 838 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 839 } 840 841 # round 6 842 $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; 843 for (my $i = 0; $i < 4; $i++) { 844 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 845 } 846 847 if (0 == $last_eight) { 848 $code .= <<___; 849 vpsrldq \$0xf, %zmm15, %zmm13 850 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 851 vpslldq \$0x1, %zmm15, %zmm17 852 vpxord %zmm14, %zmm17, %zmm17 853___ 854 } 855 # round 7 856 $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; 857 for (my $i = 0; $i < 4; $i++) { 858 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 859 } 860 861 # round 8 862 $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; 863 for (my $i = 0; $i < 4; $i++) { 864 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 865 } 866 867 # round 9 868 $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; 869 for (my $i = 0; $i < 4; $i++) { 870 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 871 } 872 873 if (0 == $last_eight) { 874 $code .= <<___; 875 vpsrldq \$0xf, %zmm16, %zmm13 876 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 877 vpslldq \$0x1, %zmm16, %zmm18 878 vpxord %zmm14, %zmm18, %zmm18 879___ 880 } 881 if ($is_128) { 882 # round 10 883 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 884 for (my $i = 0; $i < 4; $i++) { 885 $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; 886 } 887 } else { 888 # round 10 889 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 890 for (my $i = 0; $i < 4; $i++) { 891 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 892 } 893 # round 11 894 $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; 895 for (my $i = 0; $i < 4; $i++) { 896 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 897 } 898 # round 12 899 $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; 900 for (my $i = 0; $i < 4; $i++) { 901 $code .= "vaesenc $t0, 
$st[$i], $st[$i]\n"; 902 } 903 # round 13 904 $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; 905 for (my $i = 0; $i < 4; $i++) { 906 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 907 } 908 # round 14 909 $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; 910 for (my $i = 0; $i < 4; $i++) { 911 $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; 912 } 913 } 914 915 # xor Tweak values 916 for (my $i = 0; $i < 4; $i++) { 917 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 918 } 919 920 $code .= <<___; 921 # load next Tweak values 922 vmovdqa32 %zmm15, $tw[0] 923 vmovdqa32 %zmm16, $tw[1] 924 vmovdqa32 %zmm17, $tw[2] 925 vmovdqa32 %zmm18, $tw[3] 926___ 927 } 928 929 # Decrypt 16 blocks in parallel 930 # generate next 8 tweak values 931 sub decrypt_by_16_zmm { 932 my @st; 933 $st[0] = $_[0]; 934 $st[1] = $_[1]; 935 $st[2] = $_[2]; 936 $st[3] = $_[3]; 937 938 my @tw; 939 $tw[0] = $_[4]; 940 $tw[1] = $_[5]; 941 $tw[2] = $_[6]; 942 $tw[3] = $_[7]; 943 944 my $t0 = $_[8]; 945 my $last_eight = $_[9]; 946 my $is_128 = $_[10]; 947 948 # xor Tweak values 949 for (my $i = 0; $i < 4; $i++) { 950 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 951 } 952 953 # ARK 954 $code .= "vbroadcasti32x4 ($key1), $t0\n"; 955 for (my $i = 0; $i < 4; $i++) { 956 $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; 957 } 958 959 if (0 == $last_eight) { 960 $code .= <<___; 961 vpsrldq \$0xf, $tw[2], %zmm13 962 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 963 vpslldq \$0x1, $tw[2], %zmm15 964 vpxord %zmm14, %zmm15, %zmm15 965___ 966 } 967 968 # round 1 969 $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; 970 for (my $i = 0; $i < 4; $i++) { 971 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 972 } 973 974 # round 2 975 $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; 976 for (my $i = 0; $i < 4; $i++) { 977 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 978 } 979 980 # round 3 981 $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; 982 for (my $i = 0; $i < 4; $i++) { 983 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 984 } 985 986 if (0 == $last_eight) { 987 $code .= <<___; 988 vpsrldq \$0xf, $tw[3], %zmm13 989 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 990 vpslldq \$0x1, $tw[3], %zmm16 991 vpxord %zmm14, %zmm16, %zmm16 992___ 993 } 994 # round 4 995 $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; 996 for (my $i = 0; $i < 4; $i++) { 997 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 998 } 999 1000 # round 5 1001 $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; 1002 for (my $i = 0; $i < 4; $i++) { 1003 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1004 } 1005 1006 # round 6 1007 $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; 1008 for (my $i = 0; $i < 4; $i++) { 1009 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1010 } 1011 1012 if (0 == $last_eight) { 1013 $code .= <<___; 1014 vpsrldq \$0xf, %zmm15, %zmm13 1015 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 1016 vpslldq \$0x1, %zmm15, %zmm17 1017 vpxord %zmm14, %zmm17, %zmm17 1018___ 1019 } 1020 # round 7 1021 $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; 1022 for (my $i = 0; $i < 4; $i++) { 1023 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1024 } 1025 1026 # round 8 1027 $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; 1028 for (my $i = 0; $i < 4; $i++) { 1029 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1030 } 1031 1032 # round 9 1033 $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; 1034 for (my $i = 0; $i < 4; $i++) { 1035 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1036 } 1037 1038 if (0 == $last_eight) { 1039 $code .= <<___; 1040 vpsrldq \$0xf, %zmm16, %zmm13 1041 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 1042 vpslldq \$0x1, %zmm16, %zmm18 
1043 vpxord %zmm14, %zmm18, %zmm18 1044___ 1045 } 1046 if ($is_128) { 1047 # round 10 1048 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 1049 for (my $i = 0; $i < 4; $i++) { 1050 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 1051 } 1052 } else { 1053 # round 10 1054 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 1055 for (my $i = 0; $i < 4; $i++) { 1056 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1057 } 1058 1059 # round 11 1060 $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; 1061 for (my $i = 0; $i < 4; $i++) { 1062 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1063 } 1064 1065 # round 12 1066 $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; 1067 for (my $i = 0; $i < 4; $i++) { 1068 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1069 } 1070 1071 # round 13 1072 $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; 1073 for (my $i = 0; $i < 4; $i++) { 1074 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1075 } 1076 1077 # round 14 1078 $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; 1079 for (my $i = 0; $i < 4; $i++) { 1080 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 1081 } 1082 } 1083 1084 # xor Tweak values 1085 for (my $i = 0; $i < 4; $i++) { 1086 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 1087 } 1088 1089 $code .= <<___; 1090 # load next Tweak values 1091 vmovdqa32 %zmm15, $tw[0] 1092 vmovdqa32 %zmm16, $tw[1] 1093 vmovdqa32 %zmm17, $tw[2] 1094 vmovdqa32 %zmm18, $tw[3] 1095___ 1096 } 1097 1098 $code .= ".text\n"; 1099 1100 { 1101 $code.=<<"___"; 1102 .extern OPENSSL_ia32cap_P 1103 .globl aesni_xts_avx512_eligible 1104 .type aesni_xts_avx512_eligible,\@abi-omnipotent 1105 .align 32 1106 aesni_xts_avx512_eligible: 1107 mov OPENSSL_ia32cap_P+8(%rip), %ecx 1108 xor %eax,%eax 1109 # 1<<31|1<<30|1<<17|1<<16 avx512vl + avx512bw + avx512dq + avx512f 1110 and \$0xc0030000, %ecx 1111 cmp \$0xc0030000, %ecx 1112 jne .L_done 1113 mov OPENSSL_ia32cap_P+12(%rip), %ecx 1114 # 1<<10|1<<9|1<<6 vaes + vpclmulqdq + vbmi2 1115 and \$0x640, %ecx 1116 cmp \$0x640, %ecx 1117 cmove %ecx,%eax 1118 .L_done: 1119 ret 1120 .size aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible 1121___ 1122 } 1123 1124 1125 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1126 # ;void aesni_xts_[128|256]_encrypt_avx512( 1127 # ; const uint8_t *in, // input data 1128 # ; uint8_t *out, // output data 1129 # ; size_t length, // sector size, in bytes 1130 # ; const AES_KEY *key1, // key used for "ECB" encryption 1131 # ; const AES_KEY *key2, // key used for tweaking 1132 # ; const uint8_t iv[16]) // initial tweak value, 16 bytes 1133 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1134 sub enc { 1135 my $is_128 = $_[0]; 1136 my $rndsuffix = &random_string(); 1137 1138 if ($is_128) { 1139 $code.=<<___; 1140 .globl aesni_xts_128_encrypt_avx512 1141 .hidden aesni_xts_128_encrypt_avx512 1142 .type aesni_xts_128_encrypt_avx512,\@function,6 1143 .align 32 1144 aesni_xts_128_encrypt_avx512: 1145 .cfi_startproc 1146 endbranch 1147___ 1148 } else { 1149 $code.=<<___; 1150 .globl aesni_xts_256_encrypt_avx512 1151 .hidden aesni_xts_256_encrypt_avx512 1152 .type aesni_xts_256_encrypt_avx512,\@function,6 1153 .align 32 1154 aesni_xts_256_encrypt_avx512: 1155 .cfi_startproc 1156 endbranch 1157___ 1158 } 1159 $code .= "push %rbp\n"; 1160 $code .= "mov $TW,%rbp\n"; 1161 $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; 1162 $code .= "and \$0xffffffffffffffc0,$TW\n"; 1163 $code .= "mov %rbx,$GP_STORAGE($TW)\n"; 1164 1165 if ($win64) { 1166 $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; 1167 $code .= "mov %rsi,$GP_STORAGE + 
8*2($TW)\n"; 1168 $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; 1169 $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; 1170 $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; 1171 $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; 1172 $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; 1173 $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; 1174 $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; 1175 $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; 1176 $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; 1177 $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; 1178 } 1179 1180 $code .= "mov \$0x87, $gf_poly_8b\n"; 1181 $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values 1182 1183 encrypt_tweak("%xmm1", $is_128); 1184 1185 if ($win64) { 1186 $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer 1187 $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer 1188 } 1189 1190 { 1191 $code.=<<___; 1192 1193 cmp \$0x80,$length 1194 jl .L_less_than_128_bytes_${rndsuffix} 1195 vpbroadcastq $gf_poly_8b,$ZPOLY 1196 cmp \$0x100,$length 1197 jge .L_start_by16_${rndsuffix} 1198 cmp \$0x80,$length 1199 jge .L_start_by8_${rndsuffix} 1200 1201 .L_do_n_blocks_${rndsuffix}: 1202 cmp \$0x0,$length 1203 je .L_ret_${rndsuffix} 1204 cmp \$0x70,$length 1205 jge .L_remaining_num_blocks_is_7_${rndsuffix} 1206 cmp \$0x60,$length 1207 jge .L_remaining_num_blocks_is_6_${rndsuffix} 1208 cmp \$0x50,$length 1209 jge .L_remaining_num_blocks_is_5_${rndsuffix} 1210 cmp \$0x40,$length 1211 jge .L_remaining_num_blocks_is_4_${rndsuffix} 1212 cmp \$0x30,$length 1213 jge .L_remaining_num_blocks_is_3_${rndsuffix} 1214 cmp \$0x20,$length 1215 jge .L_remaining_num_blocks_is_2_${rndsuffix} 1216 cmp \$0x10,$length 1217 jge .L_remaining_num_blocks_is_1_${rndsuffix} 1218 vmovdqa %xmm0,%xmm8 1219 vmovdqa %xmm9,%xmm0 1220 jmp .L_steal_cipher_${rndsuffix} 1221 1222 .L_remaining_num_blocks_is_7_${rndsuffix}: 1223 mov \$0x0000ffffffffffff,$tmp1 1224 kmovq $tmp1,%k1 1225 vmovdqu8 ($input),%zmm1 1226 vmovdqu8 0x40($input),%zmm2{%k1} 1227 add \$0x70,$input 1228___ 1229 } 1230 1231 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1232 1233 { 1234 $code .= <<___; 1235 vmovdqu8 %zmm1,($output) 1236 vmovdqu8 %zmm2,0x40($output){%k1} 1237 add \$0x70,$output 1238 vextracti32x4 \$0x2,%zmm2,%xmm8 1239 vextracti32x4 \$0x3,%zmm10,%xmm0 1240 and \$0xf,$length 1241 je .L_ret_${rndsuffix} 1242 jmp .L_steal_cipher_${rndsuffix} 1243 1244 .L_remaining_num_blocks_is_6_${rndsuffix}: 1245 vmovdqu8 ($input),%zmm1 1246 vmovdqu8 0x40($input),%ymm2 1247 add \$0x60,$input 1248___ 1249 } 1250 1251 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1252 1253 { 1254 $code .= <<___; 1255 vmovdqu8 %zmm1,($output) 1256 vmovdqu8 %ymm2,0x40($output) 1257 add \$0x60,$output 1258 vextracti32x4 \$0x1,%zmm2,%xmm8 1259 vextracti32x4 \$0x2,%zmm10,%xmm0 1260 and \$0xf,$length 1261 je .L_ret_${rndsuffix} 1262 jmp .L_steal_cipher_${rndsuffix} 1263 1264 .L_remaining_num_blocks_is_5_${rndsuffix}: 1265 vmovdqu8 ($input),%zmm1 1266 vmovdqu 0x40($input),%xmm2 1267 add \$0x50,$input 1268___ 1269 } 1270 1271 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1272 1273 { 1274 $code .= <<___; 1275 vmovdqu8 %zmm1,($output) 1276 vmovdqu %xmm2,0x40($output) 1277 add \$0x50,$output 1278 vmovdqa %xmm2,%xmm8 1279 vextracti32x4 \$0x1,%zmm10,%xmm0 1280 and \$0xf,$length 1281 je .L_ret_${rndsuffix} 1282 jmp .L_steal_cipher_${rndsuffix} 1283 1284 
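	# Note: on this path exactly four full blocks remain; a single ZMM
	# register covers them, their tweaks are already in %zmm9, and the
	# tweak for a possible stolen tail is taken from %zmm10.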
.L_remaining_num_blocks_is_4_${rndsuffix}: 1285 vmovdqu8 ($input),%zmm1 1286 add \$0x40,$input 1287___ 1288 } 1289 1290 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1291 1292 { 1293 $code .= <<___; 1294 vmovdqu8 %zmm1,($output) 1295 add \$0x40,$output 1296 vextracti32x4 \$0x3,%zmm1,%xmm8 1297 vmovdqa64 %xmm10, %xmm0 1298 and \$0xf,$length 1299 je .L_ret_${rndsuffix} 1300 jmp .L_steal_cipher_${rndsuffix} 1301___ 1302 } 1303 1304 { 1305 $code .= <<___; 1306 .L_remaining_num_blocks_is_3_${rndsuffix}: 1307 mov \$-1, $tmp1 1308 shr \$0x10, $tmp1 1309 kmovq $tmp1, %k1 1310 vmovdqu8 ($input), %zmm1{%k1} 1311 add \$0x30, $input 1312___ 1313 } 1314 1315 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1316 1317 { 1318 $code .= <<___; 1319 vmovdqu8 %zmm1, ($output){%k1} 1320 add \$0x30, $output 1321 vextracti32x4 \$0x2, %zmm1, %xmm8 1322 vextracti32x4 \$0x3, %zmm9, %xmm0 1323 and \$0xf, $length 1324 je .L_ret_${rndsuffix} 1325 jmp .L_steal_cipher_${rndsuffix} 1326___ 1327 } 1328 1329 { 1330 $code .= <<___; 1331 .L_remaining_num_blocks_is_2_${rndsuffix}: 1332 vmovdqu8 ($input), %ymm1 1333 add \$0x20, $input 1334___ 1335 } 1336 1337 encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); 1338 1339 { 1340 $code .= <<___; 1341 vmovdqu %ymm1,($output) 1342 add \$0x20,$output 1343 vextracti32x4 \$0x1, %zmm1, %xmm8 1344 vextracti32x4 \$0x2,%zmm9,%xmm0 1345 and \$0xf,$length 1346 je .L_ret_${rndsuffix} 1347 jmp .L_steal_cipher_${rndsuffix} 1348___ 1349 } 1350 1351 { 1352 $code .= <<___; 1353 .L_remaining_num_blocks_is_1_${rndsuffix}: 1354 vmovdqu ($input),%xmm1 1355 add \$0x10,$input 1356___ 1357 } 1358 1359 encrypt_final("%xmm1", "%xmm9", $is_128); 1360 1361 { 1362 $code .= <<___; 1363 vmovdqu %xmm1,($output) 1364 add \$0x10,$output 1365 vmovdqa %xmm1,%xmm8 1366 vextracti32x4 \$0x1,%zmm9,%xmm0 1367 and \$0xf,$length 1368 je .L_ret_${rndsuffix} 1369 jmp .L_steal_cipher_${rndsuffix} 1370 1371 1372 .L_start_by16_${rndsuffix}: 1373 vbroadcasti32x4 ($TW),%zmm0 1374 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 1375 mov \$0xaa,$tmp1 1376 kmovq $tmp1,%k2 1377 vpshufb %zmm8,%zmm0,%zmm1 1378 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 1379 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 1380 vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 1381 vpxorq %zmm2,%zmm4,%zmm4{%k2} 1382 vpxord %zmm4,%zmm3,%zmm9 1383 vpsllvq const_dq7654(%rip),%zmm0,%zmm5 1384 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 1385 vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 1386 vpxorq %zmm6,%zmm5,%zmm5{%k2} 1387 vpxord %zmm5,%zmm7,%zmm10 1388 vpsrldq \$0xf,%zmm9,%zmm13 1389 vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14 1390 vpslldq \$0x1,%zmm9,%zmm11 1391 vpxord %zmm14,%zmm11,%zmm11 1392 vpsrldq \$0xf,%zmm10,%zmm15 1393 vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16 1394 vpslldq \$0x1,%zmm10,%zmm12 1395 vpxord %zmm16,%zmm12,%zmm12 1396 1397 .L_main_loop_run_16_${rndsuffix}: 1398 vmovdqu8 ($input),%zmm1 1399 vmovdqu8 0x40($input),%zmm2 1400 vmovdqu8 0x80($input),%zmm3 1401 vmovdqu8 0xc0($input),%zmm4 1402 add \$0x100,$input 1403___ 1404 } 1405 1406 encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", 1407 "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128); 1408 1409 { 1410 $code .= <<___; 1411 vmovdqu8 %zmm1,($output) 1412 vmovdqu8 %zmm2,0x40($output) 1413 vmovdqu8 %zmm3,0x80($output) 1414 vmovdqu8 %zmm4,0xc0($output) 1415 add \$0x100,$output 1416 sub \$0x100,$length 1417 cmp \$0x100,$length 1418 jae .L_main_loop_run_16_${rndsuffix} 1419 cmp \$0x80,$length 1420 jae .L_main_loop_run_8_${rndsuffix} 1421 vextracti32x4 \$0x3,%zmm4,%xmm0 1422 jmp .L_do_n_blocks_${rndsuffix} 1423 1424 
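	# Note: taken when 128 <= length < 256 bytes. The first eight tweaks
	# are built into %zmm9/%zmm10 below, then the 8-blocks-per-iteration
	# main loop runs.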
.L_start_by8_${rndsuffix}: 1425 vbroadcasti32x4 ($TW),%zmm0 1426 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 1427 mov \$0xaa,$tmp1 1428 kmovq $tmp1,%k2 1429 vpshufb %zmm8,%zmm0,%zmm1 1430 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 1431 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 1432 vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 1433 vpxorq %zmm2,%zmm4,%zmm4{%k2} 1434 vpxord %zmm4,%zmm3,%zmm9 1435 vpsllvq const_dq7654(%rip),%zmm0,%zmm5 1436 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 1437 vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 1438 vpxorq %zmm6,%zmm5,%zmm5{%k2} 1439 vpxord %zmm5,%zmm7,%zmm10 1440 1441 .L_main_loop_run_8_${rndsuffix}: 1442 vmovdqu8 ($input),%zmm1 1443 vmovdqu8 0x40($input),%zmm2 1444 add \$0x80,$input 1445___ 1446 } 1447 1448 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128); 1449 1450 { 1451 $code .= <<___; 1452 vmovdqu8 %zmm1,($output) 1453 vmovdqu8 %zmm2,0x40($output) 1454 add \$0x80,$output 1455 sub \$0x80,$length 1456 cmp \$0x80,$length 1457 jae .L_main_loop_run_8_${rndsuffix} 1458 vextracti32x4 \$0x3,%zmm2,%xmm0 1459 jmp .L_do_n_blocks_${rndsuffix} 1460 1461 .L_steal_cipher_${rndsuffix}: 1462 vmovdqa %xmm8,%xmm2 1463 lea vpshufb_shf_table(%rip),$TEMPLOW 1464 vmovdqu ($TEMPLOW,$length,1),%xmm10 1465 vpshufb %xmm10,%xmm8,%xmm8 1466 vmovdqu -0x10($input,$length,1),%xmm3 1467 vmovdqu %xmm8,-0x10($output,$length,1) 1468 lea vpshufb_shf_table(%rip),$TEMPLOW 1469 add \$16, $TEMPLOW 1470 sub $length,$TEMPLOW 1471 vmovdqu ($TEMPLOW),%xmm10 1472 vpxor mask1(%rip),%xmm10,%xmm10 1473 vpshufb %xmm10,%xmm3,%xmm3 1474 vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 1475 vpxor %xmm0,%xmm3,%xmm8 1476 vpxor ($key1),%xmm8,%xmm8 1477 vaesenc 0x10($key1),%xmm8,%xmm8 1478 vaesenc 0x20($key1),%xmm8,%xmm8 1479 vaesenc 0x30($key1),%xmm8,%xmm8 1480 vaesenc 0x40($key1),%xmm8,%xmm8 1481 vaesenc 0x50($key1),%xmm8,%xmm8 1482 vaesenc 0x60($key1),%xmm8,%xmm8 1483 vaesenc 0x70($key1),%xmm8,%xmm8 1484 vaesenc 0x80($key1),%xmm8,%xmm8 1485 vaesenc 0x90($key1),%xmm8,%xmm8 1486___ 1487 if ($is_128) { 1488 $code .= "vaesenclast 0xa0($key1),%xmm8,%xmm8\n"; 1489 } else { 1490 $code .= <<___ 1491 vaesenc 0xa0($key1),%xmm8,%xmm8 1492 vaesenc 0xb0($key1),%xmm8,%xmm8 1493 vaesenc 0xc0($key1),%xmm8,%xmm8 1494 vaesenc 0xd0($key1),%xmm8,%xmm8 1495 vaesenclast 0xe0($key1),%xmm8,%xmm8 1496___ 1497 } 1498 $code .= "vpxor %xmm0,%xmm8,%xmm8\n"; 1499 $code .= "vmovdqu %xmm8,-0x10($output)\n"; 1500 } 1501 1502 { 1503 $code .= <<___; 1504 .L_ret_${rndsuffix}: 1505 mov $GP_STORAGE($TW),%rbx 1506 xor $tmp1,$tmp1 1507 mov $tmp1,$GP_STORAGE($TW) 1508 # Zero-out the whole of `%zmm0`. 1509 vpxorq %zmm0,%zmm0,%zmm0 1510___ 1511 } 1512 1513 if ($win64) { 1514 $code .= <<___; 1515 mov $GP_STORAGE + 8*1($TW),%rdi 1516 mov $tmp1,$GP_STORAGE + 8*1($TW) 1517 mov $GP_STORAGE + 8*2($TW),%rsi 1518 mov $tmp1,$GP_STORAGE + 8*2($TW) 1519 1520 vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 1521 vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 1522 vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 1523 vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 1524 1525 # Zero the 64 bytes we just restored to the xmm registers. 1526 vmovdqa64 %zmm0,$XMM_STORAGE($TW) 1527 1528 vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 1529 vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 1530 vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 1531 vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 1532 1533 # And again. 
1534 vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) 1535 1536 vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 1537 vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 1538 1539 # Last round is only 32 bytes (256-bits), so we use `%ymm` as the 1540 # source operand. 1541 vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) 1542___ 1543 } 1544 1545 { 1546 $code .= <<___; 1547 mov %rbp,$TW 1548 pop %rbp 1549 vzeroupper 1550 ret 1551 1552 .L_less_than_128_bytes_${rndsuffix}: 1553 vpbroadcastq $gf_poly_8b, $ZPOLY 1554 cmp \$0x10,$length 1555 jb .L_ret_${rndsuffix} 1556 vbroadcasti32x4 ($TW), %zmm0 1557 vbroadcasti32x4 shufb_15_7(%rip), %zmm8 1558 movl \$0xaa, %r8d 1559 kmovq %r8, %k2 1560 mov $length,$tmp1 1561 and \$0x70,$tmp1 1562 cmp \$0x60,$tmp1 1563 je .L_num_blocks_is_6_${rndsuffix} 1564 cmp \$0x50,$tmp1 1565 je .L_num_blocks_is_5_${rndsuffix} 1566 cmp \$0x40,$tmp1 1567 je .L_num_blocks_is_4_${rndsuffix} 1568 cmp \$0x30,$tmp1 1569 je .L_num_blocks_is_3_${rndsuffix} 1570 cmp \$0x20,$tmp1 1571 je .L_num_blocks_is_2_${rndsuffix} 1572 cmp \$0x10,$tmp1 1573 je .L_num_blocks_is_1_${rndsuffix} 1574 1575 .L_num_blocks_is_7_${rndsuffix}: 1576 vpshufb %zmm8, %zmm0, %zmm1 1577 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1578 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1579 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1580 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1581 vpxord %zmm4, %zmm3, %zmm9 1582 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1583 vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 1584 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1585 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1586 vpxord %zmm5, %zmm7, %zmm10 1587 mov \$0x0000ffffffffffff, $tmp1 1588 kmovq $tmp1, %k1 1589 vmovdqu8 16*0($input), %zmm1 1590 vmovdqu8 16*4($input), %zmm2{%k1} 1591 1592 add \$0x70,$input 1593___ 1594 } 1595 1596 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1597 1598 { 1599 $code .= <<___; 1600 vmovdqu8 %zmm1, 16*0($output) 1601 vmovdqu8 %zmm2, 16*4($output){%k1} 1602 add \$0x70,$output 1603 vextracti32x4 \$0x2, %zmm2, %xmm8 1604 vextracti32x4 \$0x3, %zmm10, %xmm0 1605 and \$0xf,$length 1606 je .L_ret_${rndsuffix} 1607 jmp .L_steal_cipher_${rndsuffix} 1608___ 1609 } 1610 1611 { 1612 $code .= <<___; 1613 .L_num_blocks_is_6_${rndsuffix}: 1614 vpshufb %zmm8, %zmm0, %zmm1 1615 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1616 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1617 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1618 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1619 vpxord %zmm4, %zmm3, %zmm9 1620 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1621 vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 1622 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1623 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1624 vpxord %zmm5, %zmm7, %zmm10 1625 vmovdqu8 16*0($input), %zmm1 1626 vmovdqu8 16*4($input), %ymm2 1627 add \$96, $input 1628___ 1629 } 1630 1631 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1632 1633 { 1634 $code .= <<___; 1635 vmovdqu8 %zmm1, 16*0($output) 1636 vmovdqu8 %ymm2, 16*4($output) 1637 add \$96, $output 1638 1639 vextracti32x4 \$0x1, %ymm2, %xmm8 1640 vextracti32x4 \$0x2, %zmm10, %xmm0 1641 and \$0xf,$length 1642 je .L_ret_${rndsuffix} 1643 jmp .L_steal_cipher_${rndsuffix} 1644___ 1645 } 1646 1647 { 1648 $code .= <<___; 1649 .L_num_blocks_is_5_${rndsuffix}: 1650 vpshufb %zmm8, %zmm0, %zmm1 1651 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1652 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1653 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1654 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1655 vpxord %zmm4, %zmm3, %zmm9 1656 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1657 vpsrlvq 
const_dq1234(%rip), %zmm1, %zmm6 1658 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1659 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1660 vpxord %zmm5, %zmm7, %zmm10 1661 vmovdqu8 16*0($input), %zmm1 1662 vmovdqu8 16*4($input), %xmm2 1663 add \$80, $input 1664___ 1665 } 1666 1667 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1668 1669 { 1670 $code .= <<___; 1671 vmovdqu8 %zmm1, 16*0($output) 1672 vmovdqu8 %xmm2, 16*4($output) 1673 add \$80, $output 1674 1675 vmovdqa %xmm2, %xmm8 1676 vextracti32x4 \$0x1, %zmm10, %xmm0 1677 and \$0xf,$length 1678 je .L_ret_${rndsuffix} 1679 jmp .L_steal_cipher_${rndsuffix} 1680___ 1681 } 1682 1683 { 1684 $code .= <<___; 1685 .L_num_blocks_is_4_${rndsuffix}: 1686 vpshufb %zmm8, %zmm0, %zmm1 1687 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1688 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1689 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1690 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1691 vpxord %zmm4, %zmm3, %zmm9 1692 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1693 vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 1694 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1695 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1696 vpxord %zmm5, %zmm7, %zmm10 1697 vmovdqu8 16*0($input), %zmm1 1698 add \$64, $input 1699___ 1700 } 1701 1702 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1703 1704 { 1705 $code .= <<___; 1706 vmovdqu8 %zmm1, 16*0($output) 1707 add \$64, $output 1708 vextracti32x4 \$0x3, %zmm1, %xmm8 1709 vmovdqa %xmm10, %xmm0 1710 and \$0xf,$length 1711 je .L_ret_${rndsuffix} 1712 jmp .L_steal_cipher_${rndsuffix} 1713___ 1714 } 1715 1716 { 1717 $code .= <<___; 1718 .L_num_blocks_is_3_${rndsuffix}: 1719 vpshufb %zmm8, %zmm0, %zmm1 1720 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1721 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1722 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1723 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1724 vpxord %zmm4, %zmm3, %zmm9 1725 mov \$0x0000ffffffffffff, $tmp1 1726 kmovq $tmp1, %k1 1727 vmovdqu8 16*0($input), %zmm1{%k1} 1728 add \$48, $input 1729___ 1730 } 1731 1732 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1733 1734 { 1735 $code .= <<___; 1736 vmovdqu8 %zmm1, 16*0($output){%k1} 1737 add \$48, $output 1738 vextracti32x4 \$2, %zmm1, %xmm8 1739 vextracti32x4 \$3, %zmm9, %xmm0 1740 and \$0xf,$length 1741 je .L_ret_${rndsuffix} 1742 jmp .L_steal_cipher_${rndsuffix} 1743___ 1744 } 1745 1746 { 1747 $code .= <<___; 1748 .L_num_blocks_is_2_${rndsuffix}: 1749 vpshufb %zmm8, %zmm0, %zmm1 1750 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1751 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1752 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1753 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1754 vpxord %zmm4, %zmm3, %zmm9 1755 1756 vmovdqu8 16*0($input), %ymm1 1757 add \$32, $input 1758___ 1759 } 1760 1761 encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); 1762 1763 { 1764 $code .= <<___; 1765 vmovdqu8 %ymm1, 16*0($output) 1766 add \$32, $output 1767 1768 vextracti32x4 \$1, %ymm1, %xmm8 1769 vextracti32x4 \$2, %zmm9, %xmm0 1770 and \$0xf,$length 1771 je .L_ret_${rndsuffix} 1772 jmp .L_steal_cipher_${rndsuffix} 1773___ 1774 } 1775 1776 { 1777 $code .= <<___; 1778 .L_num_blocks_is_1_${rndsuffix}: 1779 vpshufb %zmm8, %zmm0, %zmm1 1780 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1781 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1782 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1783 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1784 vpxord %zmm4, %zmm3, %zmm9 1785 1786 vmovdqu8 16*0($input), %xmm1 1787 add \$16, $input 1788___ 1789 } 1790 1791 encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); 1792 1793 { 1794 $code .= <<___; 1795 
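	# Store the single processed block; any partial tail beyond it is
	# handled by the ciphertext-stealing code at .L_steal_cipher below.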
vmovdqu8 %xmm1, 16*0($output) 1796 add \$16, $output 1797 1798 vmovdqa %xmm1, %xmm8 1799 vextracti32x4 \$1, %zmm9, %xmm0 1800 and \$0xf,$length 1801 je .L_ret_${rndsuffix} 1802 jmp .L_steal_cipher_${rndsuffix} 1803 .cfi_endproc 1804___ 1805 } 1806 } 1807 1808 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1809 # ;void aesni_xts_[128|256]_decrypt_avx512( 1810 # ; const uint8_t *in, // input data 1811 # ; uint8_t *out, // output data 1812 # ; size_t length, // sector size, in bytes 1813 # ; const AES_KEY *key1, // key used for "ECB" encryption, 16*2 bytes 1814 # ; const AES_KEY *key2, // key used for tweaking, 16*2 bytes 1815 # ; const uint8_t iv[16]) // initial tweak value, 16 bytes 1816 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1817 sub dec { 1818 my $is_128 = $_[0]; 1819 my $rndsuffix = &random_string(); 1820 1821 if ($is_128) { 1822 $code.=<<___; 1823 .globl aesni_xts_128_decrypt_avx512 1824 .hidden aesni_xts_128_decrypt_avx512 1825 .type aesni_xts_128_decrypt_avx512,\@function,6 1826 .align 32 1827 aesni_xts_128_decrypt_avx512: 1828 .cfi_startproc 1829 endbranch 1830___ 1831 } else { 1832 $code.=<<___; 1833 .globl aesni_xts_256_decrypt_avx512 1834 .hidden aesni_xts_256_decrypt_avx512 1835 .type aesni_xts_256_decrypt_avx512,\@function,6 1836 .align 32 1837 aesni_xts_256_decrypt_avx512: 1838 .cfi_startproc 1839 endbranch 1840___ 1841 } 1842 $code .= "push %rbp\n"; 1843 $code .= "mov $TW,%rbp\n"; 1844 $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; 1845 $code .= "and \$0xffffffffffffffc0,$TW\n"; 1846 $code .= "mov %rbx,$GP_STORAGE($TW)\n"; 1847 1848 if ($win64) { 1849 $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; 1850 $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; 1851 $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; 1852 $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; 1853 $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; 1854 $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; 1855 $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; 1856 $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; 1857 $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; 1858 $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; 1859 $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; 1860 $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; 1861 } 1862 1863 $code .= "mov \$0x87, $gf_poly_8b\n"; 1864 $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values 1865 1866 encrypt_tweak("%xmm1", $is_128); 1867 1868 if ($win64) { 1869 $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer 1870 $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer 1871 } 1872 1873 { 1874 $code.=<<___; 1875 1876 cmp \$0x80,$length 1877 jb .L_less_than_128_bytes_${rndsuffix} 1878 vpbroadcastq $gf_poly_8b,$ZPOLY 1879 cmp \$0x100,$length 1880 jge .L_start_by16_${rndsuffix} 1881 jmp .L_start_by8_${rndsuffix} 1882 1883 .L_do_n_blocks_${rndsuffix}: 1884 cmp \$0x0,$length 1885 je .L_ret_${rndsuffix} 1886 cmp \$0x70,$length 1887 jge .L_remaining_num_blocks_is_7_${rndsuffix} 1888 cmp \$0x60,$length 1889 jge .L_remaining_num_blocks_is_6_${rndsuffix} 1890 cmp \$0x50,$length 1891 jge .L_remaining_num_blocks_is_5_${rndsuffix} 1892 cmp \$0x40,$length 1893 jge .L_remaining_num_blocks_is_4_${rndsuffix} 1894 cmp \$0x30,$length 1895 jge .L_remaining_num_blocks_is_3_${rndsuffix} 1896 cmp \$0x20,$length 1897 jge .L_remaining_num_blocks_is_2_${rndsuffix} 1898 cmp \$0x10,$length 1899 jge .L_remaining_num_blocks_is_1_${rndsuffix} 1900 1901 # _remaining_num_blocks_is_0: 
1902 vmovdqu %xmm5, %xmm1 1903 # xmm5 contains last full block to decrypt with next teawk 1904___ 1905 } 1906 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 1907 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 1908 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); 1909 1910 { 1911 $code .= <<___; 1912 vmovdqu %xmm1, -0x10($output) 1913 vmovdqa %xmm1, %xmm8 1914 1915 # Calc previous tweak 1916 mov \$0x1,$tmp1 1917 kmovq $tmp1, %k1 1918 vpsllq \$0x3f,%xmm9,%xmm13 1919 vpsraq \$0x3f,%xmm13,%xmm14 1920 vpandq %xmm25,%xmm14,%xmm5 1921 vpxorq %xmm5,%xmm9,%xmm9{%k1} 1922 vpsrldq \$0x8,%xmm9,%xmm10 1923 .byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0 1924 vpslldq \$0x8,%xmm13,%xmm13 1925 vpxorq %xmm13,%xmm0,%xmm0 1926 jmp .L_steal_cipher_${rndsuffix} 1927 1928 .L_remaining_num_blocks_is_7_${rndsuffix}: 1929 mov \$0xffffffffffffffff,$tmp1 1930 shr \$0x10,$tmp1 1931 kmovq $tmp1,%k1 1932 vmovdqu8 ($input),%zmm1 1933 vmovdqu8 0x40($input),%zmm2{%k1} 1934 add \$0x70,$input 1935 and \$0xf,$length 1936 je .L_done_7_remain_${rndsuffix} 1937 vextracti32x4 \$0x2,%zmm10,%xmm12 1938 vextracti32x4 \$0x3,%zmm10,%xmm13 1939 vinserti32x4 \$0x2,%xmm13,%zmm10,%zmm10 1940___ 1941 } 1942 1943 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1944 1945 { 1946 $code .= <<___; 1947 vmovdqu8 %zmm1, ($output) 1948 vmovdqu8 %zmm2, 0x40($output){%k1} 1949 add \$0x70, $output 1950 vextracti32x4 \$0x2,%zmm2,%xmm8 1951 vmovdqa %xmm12,%xmm0 1952 jmp .L_steal_cipher_${rndsuffix} 1953___ 1954 } 1955 1956 $code .= "\n.L_done_7_remain_${rndsuffix}:\n"; 1957 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1958 1959 { 1960 $code .= <<___; 1961 vmovdqu8 %zmm1, ($output) 1962 vmovdqu8 %zmm2, 0x40($output){%k1} 1963 jmp .L_ret_${rndsuffix} 1964 1965 .L_remaining_num_blocks_is_6_${rndsuffix}: 1966 vmovdqu8 ($input),%zmm1 1967 vmovdqu8 0x40($input),%ymm2 1968 add \$0x60,$input 1969 and \$0xf, $length 1970 je .L_done_6_remain_${rndsuffix} 1971 vextracti32x4 \$0x1,%zmm10,%xmm12 1972 vextracti32x4 \$0x2,%zmm10,%xmm13 1973 vinserti32x4 \$0x1,%xmm13,%zmm10,%zmm10 1974___ 1975 } 1976 1977 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1978 1979 { 1980 $code .= <<___; 1981 vmovdqu8 %zmm1, ($output) 1982 vmovdqu8 %ymm2, 0x40($output) 1983 add \$0x60,$output 1984 vextracti32x4 \$0x1,%zmm2,%xmm8 1985 vmovdqa %xmm12,%xmm0 1986 jmp .L_steal_cipher_${rndsuffix} 1987___ 1988 } 1989 1990 $code .= "\n.L_done_6_remain_${rndsuffix}:\n"; 1991 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1992 1993 { 1994 $code .= <<___; 1995 vmovdqu8 %zmm1, ($output) 1996 vmovdqu8 %ymm2,0x40($output) 1997 jmp .L_ret_${rndsuffix} 1998 1999 .L_remaining_num_blocks_is_5_${rndsuffix}: 2000 vmovdqu8 ($input),%zmm1 2001 vmovdqu 0x40($input),%xmm2 2002 add \$0x50,$input 2003 and \$0xf,$length 2004 je .L_done_5_remain_${rndsuffix} 2005 vmovdqa %xmm10,%xmm12 2006 vextracti32x4 \$0x1,%zmm10,%xmm10 2007___ 2008 } 2009 2010 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2011 2012 { 2013 $code .= <<___; 2014 vmovdqu8 %zmm1, ($output) 2015 vmovdqu %xmm2, 0x40($output) 2016 add \$0x50, $output 2017 vmovdqa %xmm2,%xmm8 2018 vmovdqa %xmm12,%xmm0 2019 jmp .L_steal_cipher_${rndsuffix} 2020___ 2021 } 2022 2023 $code .= "\n.L_done_5_remain_${rndsuffix}:\n"; 2024 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2025 2026 { 2027 $code .= <<___; 2028 
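	# Length is an exact multiple of the block size on this path, so the
	# five decrypted blocks are stored as-is and no ciphertext stealing
	# is needed.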
vmovdqu8 %zmm1, ($output) 2029 vmovdqu8 %xmm2, 0x40($output) 2030 jmp .L_ret_${rndsuffix} 2031 2032 .L_remaining_num_blocks_is_4_${rndsuffix}: 2033 vmovdqu8 ($input),%zmm1 2034 add \$0x40,$input 2035 and \$0xf, $length 2036 je .L_done_4_remain_${rndsuffix} 2037 vextracti32x4 \$0x3,%zmm9,%xmm12 2038 vinserti32x4 \$0x3,%xmm10,%zmm9,%zmm9 2039___ 2040 } 2041 2042 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2043 2044 { 2045 $code .= <<___; 2046 vmovdqu8 %zmm1,($output) 2047 add \$0x40,$output 2048 vextracti32x4 \$0x3,%zmm1,%xmm8 2049 vmovdqa %xmm12,%xmm0 2050 jmp .L_steal_cipher_${rndsuffix} 2051___ 2052 } 2053 2054 $code .= "\n.L_done_4_remain_${rndsuffix}:\n"; 2055 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2056 2057 { 2058 $code .= <<___; 2059 vmovdqu8 %zmm1, ($output) 2060 jmp .L_ret_${rndsuffix} 2061 2062 .L_remaining_num_blocks_is_3_${rndsuffix}: 2063 vmovdqu ($input),%xmm1 2064 vmovdqu 0x10($input),%xmm2 2065 vmovdqu 0x20($input),%xmm3 2066 add \$0x30,$input 2067 and \$0xf,$length 2068 je .L_done_3_remain_${rndsuffix} 2069 vextracti32x4 \$0x2,%zmm9,%xmm13 2070 vextracti32x4 \$0x1,%zmm9,%xmm10 2071 vextracti32x4 \$0x3,%zmm9,%xmm11 2072___ 2073 } 2074 2075 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2076 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2077 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2078 2079 { 2080 $code .= <<___; 2081 vmovdqu %xmm1,($output) 2082 vmovdqu %xmm2,0x10($output) 2083 vmovdqu %xmm3,0x20($output) 2084 add \$0x30,$output 2085 vmovdqa %xmm3,%xmm8 2086 vmovdqa %xmm13,%xmm0 2087 jmp .L_steal_cipher_${rndsuffix} 2088___ 2089 } 2090 $code .= "\n.L_done_3_remain_${rndsuffix}:\n"; 2091 $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; 2092 $code .= "vextracti32x4 \$0x2,%zmm9,%xmm11\n"; 2093 2094 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2095 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2096 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2097 2098 { 2099 $code .= <<___; 2100 vmovdqu %xmm1,($output) 2101 vmovdqu %xmm2,0x10($output) 2102 vmovdqu %xmm3,0x20($output) 2103 jmp .L_ret_${rndsuffix} 2104 2105 .L_remaining_num_blocks_is_2_${rndsuffix}: 2106 vmovdqu ($input),%xmm1 2107 vmovdqu 0x10($input),%xmm2 2108 add \$0x20,$input 2109 and \$0xf,$length 2110 je .L_done_2_remain_${rndsuffix} 2111 vextracti32x4 \$0x2,%zmm9,%xmm10 2112 vextracti32x4 \$0x1,%zmm9,%xmm12 2113___ 2114 } 2115 2116 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2117 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2118 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2119 2120 { 2121 $code .= <<___; 2122 vmovdqu %xmm1,($output) 2123 vmovdqu %xmm2,0x10($output) 2124 add \$0x20,$output 2125 vmovdqa %xmm2,%xmm8 2126 vmovdqa %xmm12,%xmm0 2127 jmp .L_steal_cipher_${rndsuffix} 2128___ 2129 } 2130 $code .= "\n.L_done_2_remain_${rndsuffix}:\n"; 2131 $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; 2132 2133 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2134 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2135 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2136 2137 { 2138 $code .= <<___; 2139 vmovdqu %xmm1,($output) 2140 vmovdqu %xmm2,0x10($output) 2141 jmp .L_ret_${rndsuffix} 2142 2143 .L_remaining_num_blocks_is_1_${rndsuffix}: 2144 vmovdqu ($input),%xmm1 2145 add \$0x10,$input 2146 and \$0xf,$length 2147 je .L_done_1_remain_${rndsuffix} 2148 vextracti32x4 
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
  {
    $code .= <<___;
	vmovdqu	%xmm1,($output)
	add	\$0x10,$output
	vmovdqa	%xmm1,%xmm8
	vmovdqa	%xmm9,%xmm0
	jmp	.L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_1_remain_${rndsuffix}:\n";

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);

  {
    $code .= <<___;
	vmovdqu	%xmm1, ($output)
	jmp	.L_ret_${rndsuffix}

.L_start_by16_${rndsuffix}:
	vbroadcasti32x4	($TW),%zmm0
	vbroadcasti32x4	shufb_15_7(%rip),%zmm8
	mov	\$0xaa,$tmp1
	kmovq	$tmp1,%k2

	# Mult tweak by 2^{3, 2, 1, 0}
	vpshufb	%zmm8,%zmm0,%zmm1
	vpsllvq	const_dq3210(%rip),%zmm0,%zmm4
	vpsrlvq	const_dq5678(%rip),%zmm1,%zmm2
	vpclmulqdq	\$0x0,$ZPOLY,%zmm2,%zmm3
	vpxorq	%zmm2,%zmm4,%zmm4{%k2}
	vpxord	%zmm4,%zmm3,%zmm9

	# Mult tweak by 2^{7, 6, 5, 4}
	vpsllvq	const_dq7654(%rip),%zmm0,%zmm5
	vpsrlvq	const_dq1234(%rip),%zmm1,%zmm6
	vpclmulqdq	\$0x0,%zmm25,%zmm6,%zmm7
	vpxorq	%zmm6,%zmm5,%zmm5{%k2}
	vpxord	%zmm5,%zmm7,%zmm10

	# Make the next 8 tweak values by multiplying each of them by 2^8
	vpsrldq	\$0xf,%zmm9,%zmm13
	vpclmulqdq	\$0x0,%zmm25,%zmm13,%zmm14
	vpslldq	\$0x1,%zmm9,%zmm11
	vpxord	%zmm14,%zmm11,%zmm11

	vpsrldq	\$0xf,%zmm10,%zmm15
	vpclmulqdq	\$0x0,%zmm25,%zmm15,%zmm16
	vpslldq	\$0x1,%zmm10,%zmm12
	vpxord	%zmm16,%zmm12,%zmm12

.L_main_loop_run_16_${rndsuffix}:
	vmovdqu8	($input),%zmm1
	vmovdqu8	0x40($input),%zmm2
	vmovdqu8	0x80($input),%zmm3
	vmovdqu8	0xc0($input),%zmm4
	vmovdqu8	0xf0($input),%xmm5
	add	\$0x100,$input
___
  }

  decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
                    "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128);

  {
    $code .= <<___;
	vmovdqu8	%zmm1,($output)
	vmovdqu8	%zmm2,0x40($output)
	vmovdqu8	%zmm3,0x80($output)
	vmovdqu8	%zmm4,0xc0($output)
	add	\$0x100,$output
	sub	\$0x100,$length
	cmp	\$0x100,$length
	jge	.L_main_loop_run_16_${rndsuffix}

	cmp	\$0x80,$length
	jge	.L_main_loop_run_8_${rndsuffix}
	jmp	.L_do_n_blocks_${rndsuffix}

.L_start_by8_${rndsuffix}:
	# Make the first 7 tweak values
	vbroadcasti32x4	($TW),%zmm0
	vbroadcasti32x4	shufb_15_7(%rip),%zmm8
	mov	\$0xaa,$tmp1
	kmovq	$tmp1,%k2

	# Mult tweak by 2^{3, 2, 1, 0}
	vpshufb	%zmm8,%zmm0,%zmm1
	vpsllvq	const_dq3210(%rip),%zmm0,%zmm4
	vpsrlvq	const_dq5678(%rip),%zmm1,%zmm2
	vpclmulqdq	\$0x0,%zmm25,%zmm2,%zmm3
	vpxorq	%zmm2,%zmm4,%zmm4{%k2}
	vpxord	%zmm4,%zmm3,%zmm9

	# Mult tweak by 2^{7, 6, 5, 4}
	vpsllvq	const_dq7654(%rip),%zmm0,%zmm5
	vpsrlvq	const_dq1234(%rip),%zmm1,%zmm6
	vpclmulqdq	\$0x0,%zmm25,%zmm6,%zmm7
	vpxorq	%zmm6,%zmm5,%zmm5{%k2}
	vpxord	%zmm5,%zmm7,%zmm10

.L_main_loop_run_8_${rndsuffix}:
	vmovdqu8	($input),%zmm1
	vmovdqu8	0x40($input),%zmm2
	vmovdqu8	0x70($input),%xmm5
	add	\$0x80,$input
___
  }
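  # The vpsllvq/vpsrlvq/vpclmulqdq sequences emitted above multiply the tweak
  # by successive powers of 2 in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.
  # As a hedged reference for the underlying math only (this hypothetical
  # helper is not used by the generator), a single scalar doubling of a tweak
  # held in two 64-bit halves could be written as:
  #
  #   sub xts_double_tweak {
  #       my ($lo, $hi) = @_;
  #       my $carry = $hi >> 63;                 # bit shifted out of the top
  #       $hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
  #       $lo = (($lo << 1) ^ ($carry ? 0x87 : 0)) & 0xffffffffffffffff;
  #       return ($lo, $hi);                     # 0x87 = x^7 + x^2 + x + 1
  #   }
  #
  # which is the same reduction the scalar shl/adc/cmovc/xor path performs.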

  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128);

  {
    $code .= <<___;
	vmovdqu8	%zmm1,($output)
	vmovdqu8	%zmm2,0x40($output)
	add	\$0x80,$output
	sub	\$0x80,$length
	cmp	\$0x80,$length
	jge	.L_main_loop_run_8_${rndsuffix}
	jmp	.L_do_n_blocks_${rndsuffix}

.L_steal_cipher_${rndsuffix}:
	# Cipher stealing (simplified): xmm8 = last cipher block, xmm0 = next tweak
	vmovdqa	%xmm8,%xmm2

	# shift xmm8 to the left by 16-N_val bytes
	lea	vpshufb_shf_table(%rip),$TEMPLOW
	vmovdqu	($TEMPLOW,$length,1),%xmm10
	vpshufb	%xmm10,%xmm8,%xmm8


	vmovdqu	-0x10($input,$length,1),%xmm3
	vmovdqu	%xmm8,-0x10($output,$length,1)

	# shift xmm3 to the right by 16-N_val bytes
	lea	vpshufb_shf_table(%rip), $TEMPLOW
	add	\$16, $TEMPLOW
	sub	$length,$TEMPLOW
	vmovdqu	($TEMPLOW),%xmm10
	vpxor	mask1(%rip),%xmm10,%xmm10
	vpshufb	%xmm10,%xmm3,%xmm3

	vpblendvb	%xmm10,%xmm2,%xmm3,%xmm3

	# xor Tweak value
	vpxor	%xmm0,%xmm3,%xmm8

	# decrypt last block with cipher stealing
	vpxor	($key1),%xmm8,%xmm8
	vaesdec	0x10($key1),%xmm8,%xmm8
	vaesdec	0x20($key1),%xmm8,%xmm8
	vaesdec	0x30($key1),%xmm8,%xmm8
	vaesdec	0x40($key1),%xmm8,%xmm8
	vaesdec	0x50($key1),%xmm8,%xmm8
	vaesdec	0x60($key1),%xmm8,%xmm8
	vaesdec	0x70($key1),%xmm8,%xmm8
	vaesdec	0x80($key1),%xmm8,%xmm8
	vaesdec	0x90($key1),%xmm8,%xmm8
___
  if ($is_128) {
    $code .= "vaesdeclast 0xa0($key1),%xmm8,%xmm8\n";
  } else {
    $code .= <<___;
	vaesdec	0xa0($key1),%xmm8,%xmm8
	vaesdec	0xb0($key1),%xmm8,%xmm8
	vaesdec	0xc0($key1),%xmm8,%xmm8
	vaesdec	0xd0($key1),%xmm8,%xmm8
	vaesdeclast	0xe0($key1),%xmm8,%xmm8
___
  }
  $code .= <<___;
	# xor Tweak value
	vpxor	%xmm0,%xmm8,%xmm8

.L_done_${rndsuffix}:
	# store the last output block
	vmovdqu	%xmm8,-0x10($output)
___
  }

  {
    $code .= <<___;
.L_ret_${rndsuffix}:
	mov	$GP_STORAGE($TW),%rbx
	xor	$tmp1,$tmp1
	mov	$tmp1,$GP_STORAGE($TW)
	# Zero-out the whole of `%zmm0`.
	vpxorq	%zmm0,%zmm0,%zmm0
___
  }

  if ($win64) {
    $code .= <<___;
	mov	$GP_STORAGE + 8*1($TW),%rdi
	mov	$tmp1,$GP_STORAGE + 8*1($TW)
	mov	$GP_STORAGE + 8*2($TW),%rsi
	mov	$tmp1,$GP_STORAGE + 8*2($TW)

	vmovdqa	$XMM_STORAGE + 16 * 0($TW), %xmm6
	vmovdqa	$XMM_STORAGE + 16 * 1($TW), %xmm7
	vmovdqa	$XMM_STORAGE + 16 * 2($TW), %xmm8
	vmovdqa	$XMM_STORAGE + 16 * 3($TW), %xmm9

	# Zero the 64 bytes we just restored to the xmm registers.
	vmovdqa64	%zmm0,$XMM_STORAGE($TW)

	vmovdqa	$XMM_STORAGE + 16 * 4($TW), %xmm10
	vmovdqa	$XMM_STORAGE + 16 * 5($TW), %xmm11
	vmovdqa	$XMM_STORAGE + 16 * 6($TW), %xmm12
	vmovdqa	$XMM_STORAGE + 16 * 7($TW), %xmm13

	# And again.
	vmovdqa64	%zmm0,$XMM_STORAGE + 16 * 4($TW)

	vmovdqa	$XMM_STORAGE + 16 * 8($TW), %xmm14
	vmovdqa	$XMM_STORAGE + 16 * 9($TW), %xmm15

	# Last round is only 32 bytes (256 bits), so we use `%ymm` as the
	# source operand.
2377 vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) 2378___ 2379 } 2380 2381 { 2382 $code .= <<___; 2383 mov %rbp,$TW 2384 pop %rbp 2385 vzeroupper 2386 ret 2387 2388 .L_less_than_128_bytes_${rndsuffix}: 2389 cmp \$0x10,$length 2390 jb .L_ret_${rndsuffix} 2391 2392 mov $length,$tmp1 2393 and \$0x70,$tmp1 2394 cmp \$0x60,$tmp1 2395 je .L_num_blocks_is_6_${rndsuffix} 2396 cmp \$0x50,$tmp1 2397 je .L_num_blocks_is_5_${rndsuffix} 2398 cmp \$0x40,$tmp1 2399 je .L_num_blocks_is_4_${rndsuffix} 2400 cmp \$0x30,$tmp1 2401 je .L_num_blocks_is_3_${rndsuffix} 2402 cmp \$0x20,$tmp1 2403 je .L_num_blocks_is_2_${rndsuffix} 2404 cmp \$0x10,$tmp1 2405 je .L_num_blocks_is_1_${rndsuffix} 2406___ 2407 } 2408 2409 $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n"; 2410 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2411 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2412 "%xmm13", "%xmm14", "%xmm15", 7); 2413 2414 { 2415 $code .= <<___; 2416 add \$0x70,$input 2417 and \$0xf,$length 2418 je .L_done_7_${rndsuffix} 2419 2420 .L_steal_cipher_7_${rndsuffix}: 2421 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2422 shl \$1, $TEMPLOW 2423 adc $TEMPHIGH, $TEMPHIGH 2424 cmovc $gf_poly_8b, $gf_poly_8b_temp 2425 xor $gf_poly_8b_temp, $TEMPLOW 2426 mov $TEMPLOW,0x10($TW) 2427 mov $TEMPHIGH,0x18($TW) 2428 vmovdqa64 %xmm15,%xmm16 2429 vmovdqa 0x10($TW),%xmm15 2430___ 2431 } 2432 2433 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2434 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2435 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128); 2436 2437 { 2438 $code .= <<___; 2439 vmovdqu %xmm1,($output) 2440 vmovdqu %xmm2,0x10($output) 2441 vmovdqu %xmm3,0x20($output) 2442 vmovdqu %xmm4,0x30($output) 2443 vmovdqu %xmm5,0x40($output) 2444 vmovdqu %xmm6,0x50($output) 2445 add \$0x70,$output 2446 vmovdqa64 %xmm16,%xmm0 2447 vmovdqa %xmm7,%xmm8 2448 jmp .L_steal_cipher_${rndsuffix} 2449___ 2450 } 2451 2452 $code .= "\n.L_done_7_${rndsuffix}:\n"; 2453 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2454 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2455 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128); 2456 2457 { 2458 $code .= <<___; 2459 vmovdqu %xmm1,($output) 2460 vmovdqu %xmm2,0x10($output) 2461 vmovdqu %xmm3,0x20($output) 2462 vmovdqu %xmm4,0x30($output) 2463 vmovdqu %xmm5,0x40($output) 2464 vmovdqu %xmm6,0x50($output) 2465 add \$0x70,$output 2466 vmovdqa %xmm7,%xmm8 2467 jmp .L_done_${rndsuffix} 2468___ 2469 } 2470 2471 $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n"; 2472 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2473 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2474 "%xmm13", "%xmm14", "%xmm15", 6); 2475 2476 { 2477 $code .= <<___; 2478 add \$0x60,$input 2479 and \$0xf,$length 2480 je .L_done_6_${rndsuffix} 2481 2482 .L_steal_cipher_6_${rndsuffix}: 2483 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2484 shl \$1, $TEMPLOW 2485 adc $TEMPHIGH, $TEMPHIGH 2486 cmovc $gf_poly_8b, $gf_poly_8b_temp 2487 xor $gf_poly_8b_temp, $TEMPLOW 2488 mov $TEMPLOW,0x10($TW) 2489 mov $TEMPHIGH,0x18($TW) 2490 vmovdqa64 %xmm14,%xmm15 2491 vmovdqa 0x10($TW),%xmm14 2492___ 2493 } 2494 2495 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2496 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2497 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128); 2498 2499 { 2500 $code .= <<___; 2501 vmovdqu %xmm1,($output) 2502 vmovdqu %xmm2,0x10($output) 2503 vmovdqu %xmm3,0x20($output) 2504 vmovdqu %xmm4,0x30($output) 
2505 vmovdqu %xmm5,0x40($output) 2506 add \$0x60,$output 2507 vmovdqa %xmm15,%xmm0 2508 vmovdqa %xmm6,%xmm8 2509 jmp .L_steal_cipher_${rndsuffix} 2510___ 2511 } 2512 $code .= "\n.L_done_6_${rndsuffix}:\n"; 2513 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2514 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2515 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128); 2516 2517 { 2518 $code .= <<___; 2519 vmovdqu %xmm1,($output) 2520 vmovdqu %xmm2,0x10($output) 2521 vmovdqu %xmm3,0x20($output) 2522 vmovdqu %xmm4,0x30($output) 2523 vmovdqu %xmm5,0x40($output) 2524 add \$0x60,$output 2525 vmovdqa %xmm6,%xmm8 2526 jmp .L_done_${rndsuffix} 2527___ 2528 } 2529 2530 $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n"; 2531 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2532 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2533 "%xmm13", "%xmm14", "%xmm15", 5); 2534 2535 { 2536 $code .= <<___; 2537 add \$0x50,$input 2538 and \$0xf,$length 2539 je .L_done_5_${rndsuffix} 2540 2541 .L_steal_cipher_5_${rndsuffix}: 2542 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2543 shl \$1, $TEMPLOW 2544 adc $TEMPHIGH, $TEMPHIGH 2545 cmovc $gf_poly_8b, $gf_poly_8b_temp 2546 xor $gf_poly_8b_temp, $TEMPLOW 2547 mov $TEMPLOW,0x10($TW) 2548 mov $TEMPHIGH,0x18($TW) 2549 vmovdqa64 %xmm13,%xmm14 2550 vmovdqa 0x10($TW),%xmm13 2551___ 2552 } 2553 2554 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2555 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2556 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128); 2557 2558 { 2559 $code .= <<___; 2560 vmovdqu %xmm1,($output) 2561 vmovdqu %xmm2,0x10($output) 2562 vmovdqu %xmm3,0x20($output) 2563 vmovdqu %xmm4,0x30($output) 2564 add \$0x50,$output 2565 vmovdqa %xmm14,%xmm0 2566 vmovdqa %xmm5,%xmm8 2567 jmp .L_steal_cipher_${rndsuffix} 2568___ 2569 } 2570 2571 $code .= "\n.L_done_5_${rndsuffix}:\n"; 2572 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2573 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2574 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128); 2575 2576 { 2577 $code .= <<___; 2578 vmovdqu %xmm1,($output) 2579 vmovdqu %xmm2,0x10($output) 2580 vmovdqu %xmm3,0x20($output) 2581 vmovdqu %xmm4,0x30($output) 2582 add \$0x50,$output 2583 vmovdqa %xmm5,%xmm8 2584 jmp .L_done_${rndsuffix} 2585___ 2586 } 2587 2588 $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n"; 2589 2590 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2591 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2592 "%xmm13", "%xmm14", "%xmm15", 4); 2593 2594 { 2595 $code .= <<___; 2596 add \$0x40,$input 2597 and \$0xf,$length 2598 je .L_done_4_${rndsuffix} 2599 2600 .L_steal_cipher_4_${rndsuffix}: 2601 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2602 shl \$1, $TEMPLOW 2603 adc $TEMPHIGH, $TEMPHIGH 2604 cmovc $gf_poly_8b, $gf_poly_8b_temp 2605 xor $gf_poly_8b_temp, $TEMPLOW 2606 mov $TEMPLOW,0x10($TW) 2607 mov $TEMPHIGH,0x18($TW) 2608 vmovdqa64 %xmm12,%xmm13 2609 vmovdqa 0x10($TW),%xmm12 2610___ 2611 } 2612 2613 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2614 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2615 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128); 2616 2617 { 2618 $code .= <<___; 2619 vmovdqu %xmm1,($output) 2620 vmovdqu %xmm2,0x10($output) 2621 vmovdqu %xmm3,0x20($output) 2622 add \$0x40,$output 2623 vmovdqa %xmm13,%xmm0 2624 vmovdqa %xmm4,%xmm8 2625 jmp .L_steal_cipher_${rndsuffix} 2626___ 2627 } 2628 2629 $code .= 
"\n.L_done_4_${rndsuffix}:\n"; 2630 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2631 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2632 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128); 2633 2634 { 2635 $code .= <<___; 2636 vmovdqu %xmm1,($output) 2637 vmovdqu %xmm2,0x10($output) 2638 vmovdqu %xmm3,0x20($output) 2639 add \$0x40,$output 2640 vmovdqa %xmm4,%xmm8 2641 jmp .L_done_${rndsuffix} 2642___ 2643 } 2644 2645 $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n"; 2646 2647 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2648 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2649 "%xmm13", "%xmm14", "%xmm15", 3); 2650 2651 { 2652 $code .= <<___; 2653 add \$0x30,$input 2654 and \$0xf,$length 2655 je .L_done_3_${rndsuffix} 2656 2657 .L_steal_cipher_3_${rndsuffix}: 2658 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2659 shl \$1, $TEMPLOW 2660 adc $TEMPHIGH, $TEMPHIGH 2661 cmovc $gf_poly_8b, $gf_poly_8b_temp 2662 xor $gf_poly_8b_temp, $TEMPLOW 2663 mov $TEMPLOW,0x10($TW) 2664 mov $TEMPHIGH,0x18($TW) 2665 vmovdqa64 %xmm11,%xmm12 2666 vmovdqa 0x10($TW),%xmm11 2667___ 2668 } 2669 2670 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2671 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2672 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2673 2674 { 2675 $code .= <<___; 2676 vmovdqu %xmm1,($output) 2677 vmovdqu %xmm2,0x10($output) 2678 add \$0x30,$output 2679 vmovdqa %xmm12,%xmm0 2680 vmovdqa %xmm3,%xmm8 2681 jmp .L_steal_cipher_${rndsuffix} 2682___ 2683 } 2684 $code .= "\n.L_done_3_${rndsuffix}:\n"; 2685 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2686 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2687 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2688 2689 { 2690 $code .= <<___; 2691 vmovdqu %xmm1,($output) 2692 vmovdqu %xmm2,0x10($output) 2693 add \$0x30,$output 2694 vmovdqa %xmm3,%xmm8 2695 jmp .L_done_${rndsuffix} 2696___ 2697 } 2698 2699 $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n"; 2700 2701 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2702 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2703 "%xmm13", "%xmm14", "%xmm15", 2); 2704 2705 { 2706 $code .= <<___; 2707 add \$0x20,$input 2708 and \$0xf,$length 2709 je .L_done_2_${rndsuffix} 2710 2711 .L_steal_cipher_2_${rndsuffix}: 2712 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2713 shl \$1, $TEMPLOW 2714 adc $TEMPHIGH, $TEMPHIGH 2715 cmovc $gf_poly_8b, $gf_poly_8b_temp 2716 xor $gf_poly_8b_temp, $TEMPLOW 2717 mov $TEMPLOW,0x10($TW) 2718 mov $TEMPHIGH,0x18($TW) 2719 vmovdqa64 %xmm10,%xmm11 2720 vmovdqa 0x10($TW),%xmm10 2721___ 2722 } 2723 2724 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2725 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2726 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2727 2728 { 2729 $code .= <<___; 2730 vmovdqu %xmm1,($output) 2731 add \$0x20,$output 2732 vmovdqa %xmm11,%xmm0 2733 vmovdqa %xmm2,%xmm8 2734 jmp .L_steal_cipher_${rndsuffix} 2735___ 2736 } 2737 2738 $code .= "\n.L_done_2_${rndsuffix}:\n"; 2739 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2740 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2741 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2742 2743 { 2744 $code .= <<___; 2745 vmovdqu %xmm1,($output) 2746 add \$0x20,$output 2747 vmovdqa %xmm2,%xmm8 2748 jmp .L_done_${rndsuffix} 2749___ 2750 } 2751 2752 $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n"; 2753 2754 
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2755 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2756 "%xmm13", "%xmm14", "%xmm15", 1); 2757 2758 { 2759 $code .= <<___; 2760 add \$0x10,$input 2761 and \$0xf,$length 2762 je .L_done_1_${rndsuffix} 2763 2764 .L_steal_cipher_1_${rndsuffix}: 2765 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2766 shl \$1, $TEMPLOW 2767 adc $TEMPHIGH, $TEMPHIGH 2768 cmovc $gf_poly_8b, $gf_poly_8b_temp 2769 xor $gf_poly_8b_temp, $TEMPLOW 2770 mov $TEMPLOW,0x10($TW) 2771 mov $TEMPHIGH,0x18($TW) 2772 vmovdqa64 %xmm9,%xmm10 2773 vmovdqa 0x10($TW),%xmm9 2774___ 2775 } 2776 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2777 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2778 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); 2779 2780 { 2781 $code .= <<___; 2782 add \$0x10,$output 2783 vmovdqa %xmm10,%xmm0 2784 vmovdqa %xmm1,%xmm8 2785 jmp .L_steal_cipher_${rndsuffix} 2786___ 2787 } 2788 $code .= "\n.L_done_1_${rndsuffix}:\n"; 2789 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2790 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2791 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); 2792 2793 { 2794 $code .= <<___; 2795 add \$0x10,$output 2796 vmovdqa %xmm1,%xmm8 2797 jmp .L_done_${rndsuffix} 2798 .cfi_endproc 2799___ 2800 } 2801 2802 } 2803 2804 # The only difference between AES-XTS-128 and -256 is the number of rounds, 2805 # so we generate from the same perlasm base, extending to 14 rounds when 2806 # `$is_128' is 0. 2807 2808 enc(1); 2809 dec(1); 2810 2811 enc(0); 2812 dec(0); 2813 2814 $code .= <<___; 2815 .section .rodata 2816 .align 16 2817 2818 vpshufb_shf_table: 2819 .quad 0x8786858483828100, 0x8f8e8d8c8b8a8988 2820 .quad 0x0706050403020100, 0x000e0d0c0b0a0908 2821 2822 mask1: 2823 .quad 0x8080808080808080, 0x8080808080808080 2824 2825 const_dq3210: 2826 .quad 0, 0, 1, 1, 2, 2, 3, 3 2827 const_dq5678: 2828 .quad 8, 8, 7, 7, 6, 6, 5, 5 2829 const_dq7654: 2830 .quad 4, 4, 5, 5, 6, 6, 7, 7 2831 const_dq1234: 2832 .quad 4, 4, 3, 3, 2, 2, 1, 1 2833 2834 shufb_15_7: 2835 .byte 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff 2836 .byte 0xff, 0xff, 0xff, 0xff, 0xff 2837 2838.text 2839___ 2840 2841} else { 2842 $code .= <<___; 2843 .text 2844 .globl aesni_xts_128_encrypt_avx512 2845 .globl aesni_xts_128_decrypt_avx512 2846 2847 aesni_xts_128_encrypt_avx512: 2848 aesni_xts_128_decrypt_avx512: 2849 .byte 0x0f,0x0b # ud2 2850 ret 2851 2852 .globl aesni_xts_256_encrypt_avx512 2853 .globl aesni_xts_256_decrypt_avx512 2854 2855 aesni_xts_256_encrypt_avx512: 2856 aesni_xts_256_decrypt_avx512: 2857 .byte 0x0f,0x0b # ud2 2858 ret 2859 2860 .globl aesni_xts_avx512_eligible 2861 .type aesni_xts_avx512_eligible,\@abi-omnipotent 2862 aesni_xts_avx512_eligible: 2863 xor %eax,%eax 2864 ret 2865 .size aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible 2866 2867___ 2868} 2869 2870print $code; 2871 2872close STDOUT or die "error closing STDOUT: $!"; 2873