#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements the Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. The trigger for the modification was
# the observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress the
# AVX512F capability flag [at least on Skylake-X], the conversion serves
# as a kind of "investment protection". Note that the next *lake processor,
# Cannonlake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors; in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX processors
#	it beat the integer-only code only on the older Intel P4 and Core
#	parts, by 30-50%, with the gain shrinking on newer ones; on
#	contemporary processors it is slower, e.g. almost 2x slower on Atom,
#	and as the former are naturally disappearing, SSE2 is deemed
#	unnecessary;
# (***)	strangely enough, performance seems to vary from core to core;
#	the listed result is the best case;

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
	*STDOUT=*OUT;

	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}
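
# $avx gates which SIMD paths are emitted below: any non-zero value enables
# the AVX code, >1 adds AVX2, and >2/>3 the AVX512-related variants.  For the
# kernel build the toolchain probing above is skipped, $avx is pinned to 4,
# and the emitted code is wrapped in CONFIG_AS_AVX/CONFIG_AS_AVX2 ifdefs
# instead.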

sub declare_function() {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= ".align $align\n";
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}

sub end_function() {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}
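
# For readability, a minimal and unused reference model of the step that
# poly1305_iteration() emits: the 130-bit accumulator is multiplied by the
# clamped key and reduced modulo 2^130-5 (the assembly keeps the result only
# partially reduced, which is congruent to this).  It assumes Math::BigInt is
# available and exists purely as documentation; nothing in this file calls it.
sub poly1305_iteration_reference {
	use Math::BigInt;
	my ($h, $r) = @_;		# accumulator and key as BigInts
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
	return ($h * $r)->bmod($p);
}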
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor	%rax,%rax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key
___
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___ if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___ if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___ if (!$kernel && $avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	ret
___
&end_function("poly1305_init_x86_64");

&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	ret
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");

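# The emit code below (and poly1305_emit_avx further down) turns the
# partially reduced accumulator into the tag.  Sketch of the trick it uses:
# compute h+5 alongside h and check whether bit 130 becomes set; if it does,
# h was >= 2^130-5 and the incremented value, whose low 128 bits equal those
# of h mod 2^130-5, is selected instead.  The 128-bit nonce is then added
# with the final carry discarded.
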
&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
___
&end_function("poly1305_emit_x86_64");
if ($avx) {

if($kernel) {
	$code .= "#ifdef CONFIG_AS_AVX\n";
}

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are the base 2^26 digits of the powers of the multiplier key.
# There are 5 digits, but the last four are interleaved with their multiples
# of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.

my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

$code.=<<___;
.type	__poly1305_block,\@abi-omnipotent
.align	32
__poly1305_block:
	push	$ctx
___
	&poly1305_iteration();
$code.=<<___;
	pop	$ctx
	ret
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,\@abi-omnipotent
.align	32
__poly1305_init_avx:
	push	%rbp
	mov	%rsp,%rbp
	mov	$r0,$h0
	mov	$r1,$h1
	xor	$h2,$h2

	lea	48+64($ctx),$ctx	# size optimization

	mov	$r1,%rax
	call	__poly1305_block	# r^2

	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	\$0x3ffffff,%edx
	mov	$h0,$d1
	and	$h0#d,%eax
	mov	$r0,$d2
	and	$r0#d,%edx
	mov	%eax,`16*0+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*0+4-64`($ctx)
	shr	\$26,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*2+4-64`($ctx)
	shr	\$26,$d2

	mov	$h1,%rax
	mov	$r1,%rdx
	shl	\$12,%rax
	shl	\$12,%rdx
	or	$d1,%rax
	or	$d2,%rdx
	and	\$0x3ffffff,%eax
	and	\$0x3ffffff,%edx
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	$h1,$d1
	mov	%edx,`16*4+4-64`($ctx)
	mov	$r1,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	shr	\$14,$d2
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*6+4-64`($ctx)
	shr	\$26,$d2

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^3

	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+12-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+12-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+12-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+12-64`($ctx)
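
	# A note on the repeated pattern in this function: the 130-bit value
	# held in $h0:$h1:$h2 (base 2^64) is cut into five 26-bit digits at
	# bit offsets 0, 26, 52, 78 and 104; digits straddling a 64-bit limb
	# boundary are assembled with a left shift by 12 (= 64-52) or by
	# 24 (= 128-104) before masking with 0x3ffffff, and every digit but
	# the first is also stored premultiplied by 5 for the lazy reduction.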

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+12-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+12-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+12-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^4

	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+8-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+8-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+8-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+8-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+8-64`($ctx)

	lea	-48-64($ctx),$ctx	# size [de-]optimization
	pop	%rbp
	ret
.size	__poly1305_init_avx,.-__poly1305_init_avx
___
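
# The table written above holds the base 2^26 digits of r^1..r^4 interleaved
# with their 5-multiples, one power per 32-bit lane.  This is what lets the
# vector code below run two (AVX), four (AVX2) or eight (AVX512) hash lanes
# in parallel: roughly, per the schedule spelled out in .Loop_avx,
#
#	((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
#	((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
#
# so each lane only ever multiplies by a fixed power of r, and the lanes are
# summed at the very end in the "horizontal addition" sections.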

&declare_function("poly1305_blocks_avx", 32, 4);
$code.=<<___;
.cfi_startproc
	mov	20($ctx),%r8d		# is_base2_26
	cmp	\$128,$len
	jae	.Lblocks_avx
	test	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx:
	and	\$-16,$len
	jz	.Lno_data_avx

	vzeroupper

	test	%r8d,%r8d
	jz	.Lbase2_64_avx

	test	\$31,$len
	jz	.Leven_avx

	push	%rbp
.cfi_push	%rbp
	mov	%rsp,%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_avx_body:

	mov	$len,%r15		# reassign $len

	mov	0($ctx),$d1		# load hash value
	mov	8($ctx),$d2
	mov	16($ctx),$h2#d

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	################################# base 2^26 -> base 2^64
	mov	$d1#d,$h0#d
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	mov	$d2#d,$h1#d
	and	\$`-1*(1<<31)`,$d2

	shr	\$6,$d1
	shl	\$52,$r1
	add	$d1,$h0
	shr	\$12,$h1
	shr	\$18,$d2
	add	$r1,$h0
	adc	$d2,$h1

	mov	$h2,$d1
	shl	\$40,$d1
	shr	\$24,$h2
	add	$d1,$h1
	adc	\$0,$h2			# can be partially reduced...

	mov	\$-4,$d2		# ... so reduce
	mov	$h2,$d1
	and	$h2,$d2
	shr	\$2,$d1
	and	\$3,$h2
	add	$d2,$d1			# =*5
	add	$d1,$h0
	adc	\$0,$h1
	adc	\$0,$h2

	mov	$s1,$r1
	mov	$s1,%rax
	shr	\$2,$s1
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2

	call	__poly1305_block

	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format

	################################# base 2^64 -> base 2^26
	mov	$h0,%rax
	mov	$h0,%rdx
	shr	\$52,$h0
	mov	$h1,$r0
	mov	$h1,$r1
	shr	\$26,%rdx
	and	\$0x3ffffff,%rax	# h[0]
	shl	\$12,$r0
	and	\$0x3ffffff,%rdx	# h[1]
	shr	\$14,$h1
	or	$r0,$h0
	shl	\$24,$h2
	and	\$0x3ffffff,$h0		# h[2]
	shr	\$40,$r1
	and	\$0x3ffffff,$h1		# h[3]
	or	$r1,$h2			# h[4]

	sub	\$16,%r15
	jz	.Lstore_base2_26_avx

	vmovd	%rax#d,$H0
	vmovd	%rdx#d,$H1
	vmovd	$h0#d,$H2
	vmovd	$h1#d,$H3
	vmovd	$h2#d,$H4
	jmp	.Lproceed_avx

.align	32
.Lstore_base2_64_avx:
	mov	$h0,0($ctx)
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
	jmp	.Ldone_avx

.align	16
.Lstore_base2_26_avx:
	mov	%rax#d,0($ctx)		# store hash value base 2^26
	mov	%rdx#d,4($ctx)
	mov	$h0#d,8($ctx)
	mov	$h1#d,12($ctx)
	mov	$h2#d,16($ctx)
.align	16
.Ldone_avx:
	pop	%r15
.cfi_restore	%r15
	pop	%r14
.cfi_restore	%r14
	pop	%r13
.cfi_restore	%r13
	pop	%r12
.cfi_restore	%r12
	pop	%rbx
.cfi_restore	%rbx
	pop	%rbp
.cfi_restore	%rbp
.Lno_data_avx:
.Lblocks_avx_epilogue:
	ret
.cfi_endproc

.align	32
.Lbase2_64_avx:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	mov	%rsp,%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lbase2_64_avx_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2#d

	mov	$s1,$r1
	mov	$s1,%rax
	shr	\$2,$s1
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

	test	\$31,$len
	jz	.Linit_avx

	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
	sub	\$16,%r15

	call	__poly1305_block

.Linit_avx:
	################################# base 2^64 -> base 2^26
	mov	$h0,%rax
	mov	$h0,%rdx
	shr	\$52,$h0
	mov	$h1,$d1
	mov	$h1,$d2
	shr	\$26,%rdx
	and	\$0x3ffffff,%rax	# h[0]
	shl	\$12,$d1
	and	\$0x3ffffff,%rdx	# h[1]
	shr	\$14,$h1
	or	$d1,$h0
	shl	\$24,$h2
	and	\$0x3ffffff,$h0		# h[2]
	shr	\$40,$d2
	and	\$0x3ffffff,$h1		# h[3]
	or	$d2,$h2			# h[4]

	vmovd	%rax#d,$H0
	vmovd	%rdx#d,$H1
	vmovd	$h0#d,$H2
	vmovd	$h1#d,$H3
	vmovd	$h2#d,$H4
	movl	\$1,20($ctx)		# set is_base2_26

	call	__poly1305_init_avx

.Lproceed_avx:
	mov	%r15,$len
	pop	%r15
.cfi_restore	%r15
	pop	%r14
.cfi_restore	%r14
	pop	%r13
.cfi_restore	%r13
	pop	%r12
.cfi_restore	%r12
	pop	%rbx
.cfi_restore	%rbx
	pop	%rbp
.cfi_restore	%rbp
.Lbase2_64_avx_epilogue:
	jmp	.Ldo_avx
.cfi_endproc

.align	32
.Leven_avx:
.cfi_startproc
	vmovd	4*0($ctx),$H0		# load hash value
	vmovd	4*1($ctx),$H1
	vmovd	4*2($ctx),$H2
	vmovd	4*3($ctx),$H3
	vmovd	4*4($ctx),$H4

.Ldo_avx:
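	################################################################
	# Main AVX body: set up the stack frame (Win64 additionally
	# spills the non-volatile %xmm registers), copy the r^1..r^4
	# table onto the stack, then consume 64 bytes of input per
	# .Loop_avx iteration in two SIMD lanes, with the lazy
	# reduction described further down.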
856___ 857$code.=<<___ if (!$win64); 858 lea 8(%rsp),%r10 859.cfi_def_cfa_register %r10 860 and \$-32,%rsp 861 sub \$-8,%rsp 862 lea -0x58(%rsp),%r11 863 sub \$0x178,%rsp 864___ 865$code.=<<___ if ($win64); 866 lea -0xf8(%rsp),%r11 867 sub \$0x218,%rsp 868 vmovdqa %xmm6,0x50(%r11) 869 vmovdqa %xmm7,0x60(%r11) 870 vmovdqa %xmm8,0x70(%r11) 871 vmovdqa %xmm9,0x80(%r11) 872 vmovdqa %xmm10,0x90(%r11) 873 vmovdqa %xmm11,0xa0(%r11) 874 vmovdqa %xmm12,0xb0(%r11) 875 vmovdqa %xmm13,0xc0(%r11) 876 vmovdqa %xmm14,0xd0(%r11) 877 vmovdqa %xmm15,0xe0(%r11) 878.Ldo_avx_body: 879___ 880$code.=<<___; 881 sub \$64,$len 882 lea -32($inp),%rax 883 cmovc %rax,$inp 884 885 vmovdqu `16*3`($ctx),$D4 # preload r0^2 886 lea `16*3+64`($ctx),$ctx # size optimization 887 lea .Lconst(%rip),%rcx 888 889 ################################################################ 890 # load input 891 vmovdqu 16*2($inp),$T0 892 vmovdqu 16*3($inp),$T1 893 vmovdqa 64(%rcx),$MASK # .Lmask26 894 895 vpsrldq \$6,$T0,$T2 # splat input 896 vpsrldq \$6,$T1,$T3 897 vpunpckhqdq $T1,$T0,$T4 # 4 898 vpunpcklqdq $T1,$T0,$T0 # 0:1 899 vpunpcklqdq $T3,$T2,$T3 # 2:3 900 901 vpsrlq \$40,$T4,$T4 # 4 902 vpsrlq \$26,$T0,$T1 903 vpand $MASK,$T0,$T0 # 0 904 vpsrlq \$4,$T3,$T2 905 vpand $MASK,$T1,$T1 # 1 906 vpsrlq \$30,$T3,$T3 907 vpand $MASK,$T2,$T2 # 2 908 vpand $MASK,$T3,$T3 # 3 909 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 910 911 jbe .Lskip_loop_avx 912 913 # expand and copy pre-calculated table to stack 914 vmovdqu `16*1-64`($ctx),$D1 915 vmovdqu `16*2-64`($ctx),$D2 916 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 917 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 918 vmovdqa $D3,-0x90(%r11) 919 vmovdqa $D0,0x00(%rsp) 920 vpshufd \$0xEE,$D1,$D4 921 vmovdqu `16*3-64`($ctx),$D0 922 vpshufd \$0x44,$D1,$D1 923 vmovdqa $D4,-0x80(%r11) 924 vmovdqa $D1,0x10(%rsp) 925 vpshufd \$0xEE,$D2,$D3 926 vmovdqu `16*4-64`($ctx),$D1 927 vpshufd \$0x44,$D2,$D2 928 vmovdqa $D3,-0x70(%r11) 929 vmovdqa $D2,0x20(%rsp) 930 vpshufd \$0xEE,$D0,$D4 931 vmovdqu `16*5-64`($ctx),$D2 932 vpshufd \$0x44,$D0,$D0 933 vmovdqa $D4,-0x60(%r11) 934 vmovdqa $D0,0x30(%rsp) 935 vpshufd \$0xEE,$D1,$D3 936 vmovdqu `16*6-64`($ctx),$D0 937 vpshufd \$0x44,$D1,$D1 938 vmovdqa $D3,-0x50(%r11) 939 vmovdqa $D1,0x40(%rsp) 940 vpshufd \$0xEE,$D2,$D4 941 vmovdqu `16*7-64`($ctx),$D1 942 vpshufd \$0x44,$D2,$D2 943 vmovdqa $D4,-0x40(%r11) 944 vmovdqa $D2,0x50(%rsp) 945 vpshufd \$0xEE,$D0,$D3 946 vmovdqu `16*8-64`($ctx),$D2 947 vpshufd \$0x44,$D0,$D0 948 vmovdqa $D3,-0x30(%r11) 949 vmovdqa $D0,0x60(%rsp) 950 vpshufd \$0xEE,$D1,$D4 951 vpshufd \$0x44,$D1,$D1 952 vmovdqa $D4,-0x20(%r11) 953 vmovdqa $D1,0x70(%rsp) 954 vpshufd \$0xEE,$D2,$D3 955 vmovdqa 0x00(%rsp),$D4 # preload r0^2 956 vpshufd \$0x44,$D2,$D2 957 vmovdqa $D3,-0x10(%r11) 958 vmovdqa $D2,0x80(%rsp) 959 960 jmp .Loop_avx 961 962.align 32 963.Loop_avx: 964 ################################################################ 965 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 966 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 967 # \___________________/ 968 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 969 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 970 # \___________________/ \____________________/ 971 # 972 # Note that we start with inp[2:3]*r^2. This is because it 973 # doesn't depend on reduction in previous iteration. 
974 ################################################################ 975 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 976 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 977 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 978 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 979 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 980 # 981 # though note that $Tx and $Hx are "reversed" in this section, 982 # and $D4 is preloaded with r0^2... 983 984 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 985 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 986 vmovdqa $H2,0x20(%r11) # offload hash 987 vpmuludq $T2,$D4,$D2 # d3 = h2*r0 988 vmovdqa 0x10(%rsp),$H2 # r1^2 989 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 990 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 991 992 vmovdqa $H0,0x00(%r11) # 993 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 994 vmovdqa $H1,0x10(%r11) # 995 vpmuludq $T3,$H2,$H1 # h3*r1 996 vpaddq $H0,$D0,$D0 # d0 += h4*s1 997 vpaddq $H1,$D4,$D4 # d4 += h3*r1 998 vmovdqa $H3,0x30(%r11) # 999 vpmuludq $T2,$H2,$H0 # h2*r1 1000 vpmuludq $T1,$H2,$H1 # h1*r1 1001 vpaddq $H0,$D3,$D3 # d3 += h2*r1 1002 vmovdqa 0x30(%rsp),$H3 # r2^2 1003 vpaddq $H1,$D2,$D2 # d2 += h1*r1 1004 vmovdqa $H4,0x40(%r11) # 1005 vpmuludq $T0,$H2,$H2 # h0*r1 1006 vpmuludq $T2,$H3,$H0 # h2*r2 1007 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1008 1009 vmovdqa 0x40(%rsp),$H4 # s2^2 1010 vpaddq $H0,$D4,$D4 # d4 += h2*r2 1011 vpmuludq $T1,$H3,$H1 # h1*r2 1012 vpmuludq $T0,$H3,$H3 # h0*r2 1013 vpaddq $H1,$D3,$D3 # d3 += h1*r2 1014 vmovdqa 0x50(%rsp),$H2 # r3^2 1015 vpaddq $H3,$D2,$D2 # d2 += h0*r2 1016 vpmuludq $T4,$H4,$H0 # h4*s2 1017 vpmuludq $T3,$H4,$H4 # h3*s2 1018 vpaddq $H0,$D1,$D1 # d1 += h4*s2 1019 vmovdqa 0x60(%rsp),$H3 # s3^2 1020 vpaddq $H4,$D0,$D0 # d0 += h3*s2 1021 1022 vmovdqa 0x80(%rsp),$H4 # s4^2 1023 vpmuludq $T1,$H2,$H1 # h1*r3 1024 vpmuludq $T0,$H2,$H2 # h0*r3 1025 vpaddq $H1,$D4,$D4 # d4 += h1*r3 1026 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1027 vpmuludq $T4,$H3,$H0 # h4*s3 1028 vpmuludq $T3,$H3,$H1 # h3*s3 1029 vpaddq $H0,$D2,$D2 # d2 += h4*s3 1030 vmovdqu 16*0($inp),$H0 # load input 1031 vpaddq $H1,$D1,$D1 # d1 += h3*s3 1032 vpmuludq $T2,$H3,$H3 # h2*s3 1033 vpmuludq $T2,$H4,$T2 # h2*s4 1034 vpaddq $H3,$D0,$D0 # d0 += h2*s3 1035 1036 vmovdqu 16*1($inp),$H1 # 1037 vpaddq $T2,$D1,$D1 # d1 += h2*s4 1038 vpmuludq $T3,$H4,$T3 # h3*s4 1039 vpmuludq $T4,$H4,$T4 # h4*s4 1040 vpsrldq \$6,$H0,$H2 # splat input 1041 vpaddq $T3,$D2,$D2 # d2 += h3*s4 1042 vpaddq $T4,$D3,$D3 # d3 += h4*s4 1043 vpsrldq \$6,$H1,$H3 # 1044 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 1045 vpmuludq $T1,$H4,$T0 # h1*s4 1046 vpunpckhqdq $H1,$H0,$H4 # 4 1047 vpaddq $T4,$D4,$D4 # d4 += h0*r4 1048 vmovdqa -0x90(%r11),$T4 # r0^4 1049 vpaddq $T0,$D0,$D0 # d0 += h1*s4 1050 1051 vpunpcklqdq $H1,$H0,$H0 # 0:1 1052 vpunpcklqdq $H3,$H2,$H3 # 2:3 1053 1054 #vpsrlq \$40,$H4,$H4 # 4 1055 vpsrldq \$`40/8`,$H4,$H4 # 4 1056 vpsrlq \$26,$H0,$H1 1057 vpand $MASK,$H0,$H0 # 0 1058 vpsrlq \$4,$H3,$H2 1059 vpand $MASK,$H1,$H1 # 1 1060 vpand 0(%rcx),$H4,$H4 # .Lmask24 1061 vpsrlq \$30,$H3,$H3 1062 vpand $MASK,$H2,$H2 # 2 1063 vpand $MASK,$H3,$H3 # 3 1064 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1065 1066 vpaddq 0x00(%r11),$H0,$H0 # add hash value 1067 vpaddq 0x10(%r11),$H1,$H1 1068 vpaddq 0x20(%r11),$H2,$H2 1069 vpaddq 0x30(%r11),$H3,$H3 1070 vpaddq 0x40(%r11),$H4,$H4 1071 1072 lea 16*2($inp),%rax 1073 lea 16*4($inp),$inp 1074 sub \$64,$len 1075 cmovc %rax,$inp 1076 1077 ################################################################ 1078 # Now we accumulate (inp[0:1]+hash)*r^4 1079 
################################################################ 1080 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1081 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1082 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1083 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1084 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1085 1086 vpmuludq $H0,$T4,$T0 # h0*r0 1087 vpmuludq $H1,$T4,$T1 # h1*r0 1088 vpaddq $T0,$D0,$D0 1089 vpaddq $T1,$D1,$D1 1090 vmovdqa -0x80(%r11),$T2 # r1^4 1091 vpmuludq $H2,$T4,$T0 # h2*r0 1092 vpmuludq $H3,$T4,$T1 # h3*r0 1093 vpaddq $T0,$D2,$D2 1094 vpaddq $T1,$D3,$D3 1095 vpmuludq $H4,$T4,$T4 # h4*r0 1096 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1097 vpaddq $T4,$D4,$D4 1098 1099 vpaddq $T0,$D0,$D0 # d0 += h4*s1 1100 vpmuludq $H2,$T2,$T1 # h2*r1 1101 vpmuludq $H3,$T2,$T0 # h3*r1 1102 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1103 vmovdqa -0x60(%r11),$T3 # r2^4 1104 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1105 vpmuludq $H1,$T2,$T1 # h1*r1 1106 vpmuludq $H0,$T2,$T2 # h0*r1 1107 vpaddq $T1,$D2,$D2 # d2 += h1*r1 1108 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1109 1110 vmovdqa -0x50(%r11),$T4 # s2^4 1111 vpmuludq $H2,$T3,$T0 # h2*r2 1112 vpmuludq $H1,$T3,$T1 # h1*r2 1113 vpaddq $T0,$D4,$D4 # d4 += h2*r2 1114 vpaddq $T1,$D3,$D3 # d3 += h1*r2 1115 vmovdqa -0x40(%r11),$T2 # r3^4 1116 vpmuludq $H0,$T3,$T3 # h0*r2 1117 vpmuludq $H4,$T4,$T0 # h4*s2 1118 vpaddq $T3,$D2,$D2 # d2 += h0*r2 1119 vpaddq $T0,$D1,$D1 # d1 += h4*s2 1120 vmovdqa -0x30(%r11),$T3 # s3^4 1121 vpmuludq $H3,$T4,$T4 # h3*s2 1122 vpmuludq $H1,$T2,$T1 # h1*r3 1123 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1124 1125 vmovdqa -0x10(%r11),$T4 # s4^4 1126 vpaddq $T1,$D4,$D4 # d4 += h1*r3 1127 vpmuludq $H0,$T2,$T2 # h0*r3 1128 vpmuludq $H4,$T3,$T0 # h4*s3 1129 vpaddq $T2,$D3,$D3 # d3 += h0*r3 1130 vpaddq $T0,$D2,$D2 # d2 += h4*s3 1131 vmovdqu 16*2($inp),$T0 # load input 1132 vpmuludq $H3,$T3,$T2 # h3*s3 1133 vpmuludq $H2,$T3,$T3 # h2*s3 1134 vpaddq $T2,$D1,$D1 # d1 += h3*s3 1135 vmovdqu 16*3($inp),$T1 # 1136 vpaddq $T3,$D0,$D0 # d0 += h2*s3 1137 1138 vpmuludq $H2,$T4,$H2 # h2*s4 1139 vpmuludq $H3,$T4,$H3 # h3*s4 1140 vpsrldq \$6,$T0,$T2 # splat input 1141 vpaddq $H2,$D1,$D1 # d1 += h2*s4 1142 vpmuludq $H4,$T4,$H4 # h4*s4 1143 vpsrldq \$6,$T1,$T3 # 1144 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1145 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1146 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1147 vpmuludq $H1,$T4,$H0 1148 vpunpckhqdq $T1,$T0,$T4 # 4 1149 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1150 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1151 1152 vpunpcklqdq $T1,$T0,$T0 # 0:1 1153 vpunpcklqdq $T3,$T2,$T3 # 2:3 1154 1155 #vpsrlq \$40,$T4,$T4 # 4 1156 vpsrldq \$`40/8`,$T4,$T4 # 4 1157 vpsrlq \$26,$T0,$T1 1158 vmovdqa 0x00(%rsp),$D4 # preload r0^2 1159 vpand $MASK,$T0,$T0 # 0 1160 vpsrlq \$4,$T3,$T2 1161 vpand $MASK,$T1,$T1 # 1 1162 vpand 0(%rcx),$T4,$T4 # .Lmask24 1163 vpsrlq \$30,$T3,$T3 1164 vpand $MASK,$T2,$T2 # 2 1165 vpand $MASK,$T3,$T3 # 3 1166 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1167 1168 ################################################################ 1169 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1170 # and P. 
Schwabe 1171 1172 vpsrlq \$26,$H3,$D3 1173 vpand $MASK,$H3,$H3 1174 vpaddq $D3,$H4,$H4 # h3 -> h4 1175 1176 vpsrlq \$26,$H0,$D0 1177 vpand $MASK,$H0,$H0 1178 vpaddq $D0,$D1,$H1 # h0 -> h1 1179 1180 vpsrlq \$26,$H4,$D0 1181 vpand $MASK,$H4,$H4 1182 1183 vpsrlq \$26,$H1,$D1 1184 vpand $MASK,$H1,$H1 1185 vpaddq $D1,$H2,$H2 # h1 -> h2 1186 1187 vpaddq $D0,$H0,$H0 1188 vpsllq \$2,$D0,$D0 1189 vpaddq $D0,$H0,$H0 # h4 -> h0 1190 1191 vpsrlq \$26,$H2,$D2 1192 vpand $MASK,$H2,$H2 1193 vpaddq $D2,$H3,$H3 # h2 -> h3 1194 1195 vpsrlq \$26,$H0,$D0 1196 vpand $MASK,$H0,$H0 1197 vpaddq $D0,$H1,$H1 # h0 -> h1 1198 1199 vpsrlq \$26,$H3,$D3 1200 vpand $MASK,$H3,$H3 1201 vpaddq $D3,$H4,$H4 # h3 -> h4 1202 1203 ja .Loop_avx 1204 1205.Lskip_loop_avx: 1206 ################################################################ 1207 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1208 1209 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1210 add \$32,$len 1211 jnz .Long_tail_avx 1212 1213 vpaddq $H2,$T2,$T2 1214 vpaddq $H0,$T0,$T0 1215 vpaddq $H1,$T1,$T1 1216 vpaddq $H3,$T3,$T3 1217 vpaddq $H4,$T4,$T4 1218 1219.Long_tail_avx: 1220 vmovdqa $H2,0x20(%r11) 1221 vmovdqa $H0,0x00(%r11) 1222 vmovdqa $H1,0x10(%r11) 1223 vmovdqa $H3,0x30(%r11) 1224 vmovdqa $H4,0x40(%r11) 1225 1226 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1227 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1228 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1229 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1230 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1231 1232 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1233 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1234 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1235 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1236 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1237 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1238 1239 vpmuludq $T3,$H2,$H0 # h3*r1 1240 vpaddq $H0,$D4,$D4 # d4 += h3*r1 1241 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1242 vpmuludq $T2,$H2,$H1 # h2*r1 1243 vpaddq $H1,$D3,$D3 # d3 += h2*r1 1244 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1245 vpmuludq $T1,$H2,$H0 # h1*r1 1246 vpaddq $H0,$D2,$D2 # d2 += h1*r1 1247 vpmuludq $T0,$H2,$H2 # h0*r1 1248 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1249 vpmuludq $T4,$H3,$H3 # h4*s1 1250 vpaddq $H3,$D0,$D0 # d0 += h4*s1 1251 1252 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1253 vpmuludq $T2,$H4,$H1 # h2*r2 1254 vpaddq $H1,$D4,$D4 # d4 += h2*r2 1255 vpmuludq $T1,$H4,$H0 # h1*r2 1256 vpaddq $H0,$D3,$D3 # d3 += h1*r2 1257 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1258 vpmuludq $T0,$H4,$H4 # h0*r2 1259 vpaddq $H4,$D2,$D2 # d2 += h0*r2 1260 vpmuludq $T4,$H2,$H1 # h4*s2 1261 vpaddq $H1,$D1,$D1 # d1 += h4*s2 1262 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1263 vpmuludq $T3,$H2,$H2 # h3*s2 1264 vpaddq $H2,$D0,$D0 # d0 += h3*s2 1265 1266 vpmuludq $T1,$H3,$H0 # h1*r3 1267 vpaddq $H0,$D4,$D4 # d4 += h1*r3 1268 vpmuludq $T0,$H3,$H3 # h0*r3 1269 vpaddq $H3,$D3,$D3 # d3 += h0*r3 1270 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1271 vpmuludq $T4,$H4,$H1 # h4*s3 1272 vpaddq $H1,$D2,$D2 # d2 += h4*s3 1273 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1274 vpmuludq $T3,$H4,$H0 # h3*s3 1275 vpaddq $H0,$D1,$D1 # d1 += h3*s3 1276 vpmuludq $T2,$H4,$H4 # h2*s3 1277 vpaddq $H4,$D0,$D0 # d0 += h2*s3 1278 1279 vpmuludq $T0,$H2,$H2 # h0*r4 1280 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1281 vpmuludq $T4,$H3,$H1 # h4*s4 1282 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 1283 vpmuludq $T3,$H3,$H0 # h3*s4 1284 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1285 vpmuludq $T2,$H3,$H1 # h2*s4 1286 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1287 vpmuludq $T1,$H3,$H3 # h1*s4 1288 vpaddq 
$H3,$D0,$D0 # h0 = d0 + h1*s4 1289 1290 jz .Lshort_tail_avx 1291 1292 vmovdqu 16*0($inp),$H0 # load input 1293 vmovdqu 16*1($inp),$H1 1294 1295 vpsrldq \$6,$H0,$H2 # splat input 1296 vpsrldq \$6,$H1,$H3 1297 vpunpckhqdq $H1,$H0,$H4 # 4 1298 vpunpcklqdq $H1,$H0,$H0 # 0:1 1299 vpunpcklqdq $H3,$H2,$H3 # 2:3 1300 1301 vpsrlq \$40,$H4,$H4 # 4 1302 vpsrlq \$26,$H0,$H1 1303 vpand $MASK,$H0,$H0 # 0 1304 vpsrlq \$4,$H3,$H2 1305 vpand $MASK,$H1,$H1 # 1 1306 vpsrlq \$30,$H3,$H3 1307 vpand $MASK,$H2,$H2 # 2 1308 vpand $MASK,$H3,$H3 # 3 1309 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1310 1311 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1312 vpaddq 0x00(%r11),$H0,$H0 1313 vpaddq 0x10(%r11),$H1,$H1 1314 vpaddq 0x20(%r11),$H2,$H2 1315 vpaddq 0x30(%r11),$H3,$H3 1316 vpaddq 0x40(%r11),$H4,$H4 1317 1318 ################################################################ 1319 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1320 1321 vpmuludq $H0,$T4,$T0 # h0*r0 1322 vpaddq $T0,$D0,$D0 # d0 += h0*r0 1323 vpmuludq $H1,$T4,$T1 # h1*r0 1324 vpaddq $T1,$D1,$D1 # d1 += h1*r0 1325 vpmuludq $H2,$T4,$T0 # h2*r0 1326 vpaddq $T0,$D2,$D2 # d2 += h2*r0 1327 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1328 vpmuludq $H3,$T4,$T1 # h3*r0 1329 vpaddq $T1,$D3,$D3 # d3 += h3*r0 1330 vpmuludq $H4,$T4,$T4 # h4*r0 1331 vpaddq $T4,$D4,$D4 # d4 += h4*r0 1332 1333 vpmuludq $H3,$T2,$T0 # h3*r1 1334 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1335 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1336 vpmuludq $H2,$T2,$T1 # h2*r1 1337 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1338 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1339 vpmuludq $H1,$T2,$T0 # h1*r1 1340 vpaddq $T0,$D2,$D2 # d2 += h1*r1 1341 vpmuludq $H0,$T2,$T2 # h0*r1 1342 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1343 vpmuludq $H4,$T3,$T3 # h4*s1 1344 vpaddq $T3,$D0,$D0 # d0 += h4*s1 1345 1346 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1347 vpmuludq $H2,$T4,$T1 # h2*r2 1348 vpaddq $T1,$D4,$D4 # d4 += h2*r2 1349 vpmuludq $H1,$T4,$T0 # h1*r2 1350 vpaddq $T0,$D3,$D3 # d3 += h1*r2 1351 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1352 vpmuludq $H0,$T4,$T4 # h0*r2 1353 vpaddq $T4,$D2,$D2 # d2 += h0*r2 1354 vpmuludq $H4,$T2,$T1 # h4*s2 1355 vpaddq $T1,$D1,$D1 # d1 += h4*s2 1356 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1357 vpmuludq $H3,$T2,$T2 # h3*s2 1358 vpaddq $T2,$D0,$D0 # d0 += h3*s2 1359 1360 vpmuludq $H1,$T3,$T0 # h1*r3 1361 vpaddq $T0,$D4,$D4 # d4 += h1*r3 1362 vpmuludq $H0,$T3,$T3 # h0*r3 1363 vpaddq $T3,$D3,$D3 # d3 += h0*r3 1364 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1365 vpmuludq $H4,$T4,$T1 # h4*s3 1366 vpaddq $T1,$D2,$D2 # d2 += h4*s3 1367 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1368 vpmuludq $H3,$T4,$T0 # h3*s3 1369 vpaddq $T0,$D1,$D1 # d1 += h3*s3 1370 vpmuludq $H2,$T4,$T4 # h2*s3 1371 vpaddq $T4,$D0,$D0 # d0 += h2*s3 1372 1373 vpmuludq $H0,$T2,$T2 # h0*r4 1374 vpaddq $T2,$D4,$D4 # d4 += h0*r4 1375 vpmuludq $H4,$T3,$T1 # h4*s4 1376 vpaddq $T1,$D3,$D3 # d3 += h4*s4 1377 vpmuludq $H3,$T3,$T0 # h3*s4 1378 vpaddq $T0,$D2,$D2 # d2 += h3*s4 1379 vpmuludq $H2,$T3,$T1 # h2*s4 1380 vpaddq $T1,$D1,$D1 # d1 += h2*s4 1381 vpmuludq $H1,$T3,$T3 # h1*s4 1382 vpaddq $T3,$D0,$D0 # d0 += h1*s4 1383 1384.Lshort_tail_avx: 1385 ################################################################ 1386 # horizontal addition 1387 1388 vpsrldq \$8,$D4,$T4 1389 vpsrldq \$8,$D3,$T3 1390 vpsrldq \$8,$D1,$T1 1391 vpsrldq \$8,$D0,$T0 1392 vpsrldq \$8,$D2,$T2 1393 vpaddq $T3,$D3,$D3 1394 vpaddq $T4,$D4,$D4 1395 vpaddq $T0,$D0,$D0 1396 vpaddq $T1,$D1,$D1 1397 vpaddq $T2,$D2,$D2 1398 1399 
################################################################ 1400 # lazy reduction 1401 1402 vpsrlq \$26,$D3,$H3 1403 vpand $MASK,$D3,$D3 1404 vpaddq $H3,$D4,$D4 # h3 -> h4 1405 1406 vpsrlq \$26,$D0,$H0 1407 vpand $MASK,$D0,$D0 1408 vpaddq $H0,$D1,$D1 # h0 -> h1 1409 1410 vpsrlq \$26,$D4,$H4 1411 vpand $MASK,$D4,$D4 1412 1413 vpsrlq \$26,$D1,$H1 1414 vpand $MASK,$D1,$D1 1415 vpaddq $H1,$D2,$D2 # h1 -> h2 1416 1417 vpaddq $H4,$D0,$D0 1418 vpsllq \$2,$H4,$H4 1419 vpaddq $H4,$D0,$D0 # h4 -> h0 1420 1421 vpsrlq \$26,$D2,$H2 1422 vpand $MASK,$D2,$D2 1423 vpaddq $H2,$D3,$D3 # h2 -> h3 1424 1425 vpsrlq \$26,$D0,$H0 1426 vpand $MASK,$D0,$D0 1427 vpaddq $H0,$D1,$D1 # h0 -> h1 1428 1429 vpsrlq \$26,$D3,$H3 1430 vpand $MASK,$D3,$D3 1431 vpaddq $H3,$D4,$D4 # h3 -> h4 1432 1433 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 1434 vmovd $D1,`4*1-48-64`($ctx) 1435 vmovd $D2,`4*2-48-64`($ctx) 1436 vmovd $D3,`4*3-48-64`($ctx) 1437 vmovd $D4,`4*4-48-64`($ctx) 1438___ 1439$code.=<<___ if ($win64); 1440 vmovdqa 0x50(%r11),%xmm6 1441 vmovdqa 0x60(%r11),%xmm7 1442 vmovdqa 0x70(%r11),%xmm8 1443 vmovdqa 0x80(%r11),%xmm9 1444 vmovdqa 0x90(%r11),%xmm10 1445 vmovdqa 0xa0(%r11),%xmm11 1446 vmovdqa 0xb0(%r11),%xmm12 1447 vmovdqa 0xc0(%r11),%xmm13 1448 vmovdqa 0xd0(%r11),%xmm14 1449 vmovdqa 0xe0(%r11),%xmm15 1450 lea 0xf8(%r11),%rsp 1451.Ldo_avx_epilogue: 1452___ 1453$code.=<<___ if (!$win64); 1454 lea -8(%r10),%rsp 1455.cfi_def_cfa_register %rsp 1456___ 1457$code.=<<___; 1458 vzeroupper 1459 ret 1460.cfi_endproc 1461___ 1462&end_function("poly1305_blocks_avx"); 1463 1464&declare_function("poly1305_emit_avx", 32, 3); 1465$code.=<<___; 1466 cmpl \$0,20($ctx) # is_base2_26? 1467 je .Lemit 1468 1469 mov 0($ctx),%eax # load hash value base 2^26 1470 mov 4($ctx),%ecx 1471 mov 8($ctx),%r8d 1472 mov 12($ctx),%r11d 1473 mov 16($ctx),%r10d 1474 1475 shl \$26,%rcx # base 2^26 -> base 2^64 1476 mov %r8,%r9 1477 shl \$52,%r8 1478 add %rcx,%rax 1479 shr \$12,%r9 1480 add %rax,%r8 # h0 1481 adc \$0,%r9 1482 1483 shl \$14,%r11 1484 mov %r10,%rax 1485 shr \$24,%r10 1486 add %r11,%r9 1487 shl \$40,%rax 1488 add %rax,%r9 # h1 1489 adc \$0,%r10 # h2 1490 1491 mov %r10,%rax # could be partially reduced, so reduce 1492 mov %r10,%rcx 1493 and \$3,%r10 1494 shr \$2,%rax 1495 and \$-4,%rcx 1496 add %rcx,%rax 1497 add %rax,%r8 1498 adc \$0,%r9 1499 adc \$0,%r10 1500 1501 mov %r8,%rax 1502 add \$5,%r8 # compare to modulus 1503 mov %r9,%rcx 1504 adc \$0,%r9 1505 adc \$0,%r10 1506 shr \$2,%r10 # did 130-bit value overflow? 1507 cmovnz %r8,%rax 1508 cmovnz %r9,%rcx 1509 1510 add 0($nonce),%rax # accumulate nonce 1511 adc 8($nonce),%rcx 1512 mov %rax,0($mac) # write result 1513 mov %rcx,8($mac) 1514 1515 ret 1516___ 1517&end_function("poly1305_emit_avx"); 1518 1519if ($kernel) { 1520 $code .= "#endif\n"; 1521} 1522 1523if ($avx>1) { 1524 1525if ($kernel) { 1526 $code .= "#ifdef CONFIG_AS_AVX2\n"; 1527} 1528 1529my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 1530 map("%ymm$_",(0..15)); 1531my $S4=$MASK; 1532 1533sub poly1305_blocks_avxN { 1534 my ($avx512) = @_; 1535 my $suffix = $avx512 ? 
"_avx512" : ""; 1536$code.=<<___; 1537.cfi_startproc 1538 mov 20($ctx),%r8d # is_base2_26 1539 cmp \$128,$len 1540 jae .Lblocks_avx2$suffix 1541 test %r8d,%r8d 1542 jz .Lblocks 1543 1544.Lblocks_avx2$suffix: 1545 and \$-16,$len 1546 jz .Lno_data_avx2$suffix 1547 1548 vzeroupper 1549 1550 test %r8d,%r8d 1551 jz .Lbase2_64_avx2$suffix 1552 1553 test \$63,$len 1554 jz .Leven_avx2$suffix 1555 1556 push %rbp 1557.cfi_push %rbp 1558 mov %rsp,%rbp 1559 push %rbx 1560.cfi_push %rbx 1561 push %r12 1562.cfi_push %r12 1563 push %r13 1564.cfi_push %r13 1565 push %r14 1566.cfi_push %r14 1567 push %r15 1568.cfi_push %r15 1569.Lblocks_avx2_body$suffix: 1570 1571 mov $len,%r15 # reassign $len 1572 1573 mov 0($ctx),$d1 # load hash value 1574 mov 8($ctx),$d2 1575 mov 16($ctx),$h2#d 1576 1577 mov 24($ctx),$r0 # load r 1578 mov 32($ctx),$s1 1579 1580 ################################# base 2^26 -> base 2^64 1581 mov $d1#d,$h0#d 1582 and \$`-1*(1<<31)`,$d1 1583 mov $d2,$r1 # borrow $r1 1584 mov $d2#d,$h1#d 1585 and \$`-1*(1<<31)`,$d2 1586 1587 shr \$6,$d1 1588 shl \$52,$r1 1589 add $d1,$h0 1590 shr \$12,$h1 1591 shr \$18,$d2 1592 add $r1,$h0 1593 adc $d2,$h1 1594 1595 mov $h2,$d1 1596 shl \$40,$d1 1597 shr \$24,$h2 1598 add $d1,$h1 1599 adc \$0,$h2 # can be partially reduced... 1600 1601 mov \$-4,$d2 # ... so reduce 1602 mov $h2,$d1 1603 and $h2,$d2 1604 shr \$2,$d1 1605 and \$3,$h2 1606 add $d2,$d1 # =*5 1607 add $d1,$h0 1608 adc \$0,$h1 1609 adc \$0,$h2 1610 1611 mov $s1,$r1 1612 mov $s1,%rax 1613 shr \$2,$s1 1614 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1615 1616.Lbase2_26_pre_avx2$suffix: 1617 add 0($inp),$h0 # accumulate input 1618 adc 8($inp),$h1 1619 lea 16($inp),$inp 1620 adc $padbit,$h2 1621 sub \$16,%r15 1622 1623 call __poly1305_block 1624 mov $r1,%rax 1625 1626 test \$63,%r15 1627 jnz .Lbase2_26_pre_avx2$suffix 1628 1629 test $padbit,$padbit # if $padbit is zero, 1630 jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format 1631 1632 ################################# base 2^64 -> base 2^26 1633 mov $h0,%rax 1634 mov $h0,%rdx 1635 shr \$52,$h0 1636 mov $h1,$r0 1637 mov $h1,$r1 1638 shr \$26,%rdx 1639 and \$0x3ffffff,%rax # h[0] 1640 shl \$12,$r0 1641 and \$0x3ffffff,%rdx # h[1] 1642 shr \$14,$h1 1643 or $r0,$h0 1644 shl \$24,$h2 1645 and \$0x3ffffff,$h0 # h[2] 1646 shr \$40,$r1 1647 and \$0x3ffffff,$h1 # h[3] 1648 or $r1,$h2 # h[4] 1649 1650 test %r15,%r15 1651 jz .Lstore_base2_26_avx2$suffix 1652 1653 vmovd %rax#d,%x#$H0 1654 vmovd %rdx#d,%x#$H1 1655 vmovd $h0#d,%x#$H2 1656 vmovd $h1#d,%x#$H3 1657 vmovd $h2#d,%x#$H4 1658 jmp .Lproceed_avx2$suffix 1659 1660.align 32 1661.Lstore_base2_64_avx2$suffix: 1662 mov $h0,0($ctx) 1663 mov $h1,8($ctx) 1664 mov $h2,16($ctx) # note that is_base2_26 is zeroed 1665 jmp .Ldone_avx2$suffix 1666 1667.align 16 1668.Lstore_base2_26_avx2$suffix: 1669 mov %rax#d,0($ctx) # store hash value base 2^26 1670 mov %rdx#d,4($ctx) 1671 mov $h0#d,8($ctx) 1672 mov $h1#d,12($ctx) 1673 mov $h2#d,16($ctx) 1674.align 16 1675.Ldone_avx2$suffix: 1676 pop %r15 1677.cfi_restore %r15 1678 pop %r14 1679.cfi_restore %r14 1680 pop %r13 1681.cfi_restore %r13 1682 pop %r12 1683.cfi_restore %r12 1684 pop %rbx 1685.cfi_restore %rbx 1686 pop %rbp 1687.cfi_restore %rbp 1688.Lno_data_avx2$suffix: 1689.Lblocks_avx2_epilogue$suffix: 1690 ret 1691.cfi_endproc 1692 1693.align 32 1694.Lbase2_64_avx2$suffix: 1695.cfi_startproc 1696 push %rbp 1697.cfi_push %rbp 1698 mov %rsp,%rbp 1699 push %rbx 1700.cfi_push %rbx 1701 push %r12 1702.cfi_push %r12 1703 push %r13 1704.cfi_push %r13 1705 push %r14 
1706.cfi_push %r14 1707 push %r15 1708.cfi_push %r15 1709.Lbase2_64_avx2_body$suffix: 1710 1711 mov $len,%r15 # reassign $len 1712 1713 mov 24($ctx),$r0 # load r 1714 mov 32($ctx),$s1 1715 1716 mov 0($ctx),$h0 # load hash value 1717 mov 8($ctx),$h1 1718 mov 16($ctx),$h2#d 1719 1720 mov $s1,$r1 1721 mov $s1,%rax 1722 shr \$2,$s1 1723 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1724 1725 test \$63,$len 1726 jz .Linit_avx2$suffix 1727 1728.Lbase2_64_pre_avx2$suffix: 1729 add 0($inp),$h0 # accumulate input 1730 adc 8($inp),$h1 1731 lea 16($inp),$inp 1732 adc $padbit,$h2 1733 sub \$16,%r15 1734 1735 call __poly1305_block 1736 mov $r1,%rax 1737 1738 test \$63,%r15 1739 jnz .Lbase2_64_pre_avx2$suffix 1740 1741.Linit_avx2$suffix: 1742 ################################# base 2^64 -> base 2^26 1743 mov $h0,%rax 1744 mov $h0,%rdx 1745 shr \$52,$h0 1746 mov $h1,$d1 1747 mov $h1,$d2 1748 shr \$26,%rdx 1749 and \$0x3ffffff,%rax # h[0] 1750 shl \$12,$d1 1751 and \$0x3ffffff,%rdx # h[1] 1752 shr \$14,$h1 1753 or $d1,$h0 1754 shl \$24,$h2 1755 and \$0x3ffffff,$h0 # h[2] 1756 shr \$40,$d2 1757 and \$0x3ffffff,$h1 # h[3] 1758 or $d2,$h2 # h[4] 1759 1760 vmovd %rax#d,%x#$H0 1761 vmovd %rdx#d,%x#$H1 1762 vmovd $h0#d,%x#$H2 1763 vmovd $h1#d,%x#$H3 1764 vmovd $h2#d,%x#$H4 1765 movl \$1,20($ctx) # set is_base2_26 1766 1767 call __poly1305_init_avx 1768 1769.Lproceed_avx2$suffix: 1770 mov %r15,$len # restore $len 1771___ 1772$code.=<<___ if (!$kernel); 1773 mov OPENSSL_ia32cap_P+8(%rip),%r9d 1774 mov \$`(1<<31|1<<30|1<<16)`,%r11d 1775___ 1776$code.=<<___; 1777 pop %r15 1778.cfi_restore %r15 1779 pop %r14 1780.cfi_restore %r14 1781 pop %r13 1782.cfi_restore %r13 1783 pop %r12 1784.cfi_restore %r12 1785 pop %rbx 1786.cfi_restore %rbx 1787 pop %rbp 1788.cfi_restore %rbp 1789.Lbase2_64_avx2_epilogue$suffix: 1790 jmp .Ldo_avx2$suffix 1791.cfi_endproc 1792 1793.align 32 1794.Leven_avx2$suffix: 1795.cfi_startproc 1796___ 1797$code.=<<___ if (!$kernel); 1798 mov OPENSSL_ia32cap_P+8(%rip),%r9d 1799___ 1800$code.=<<___; 1801 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 1802 vmovd 4*1($ctx),%x#$H1 1803 vmovd 4*2($ctx),%x#$H2 1804 vmovd 4*3($ctx),%x#$H3 1805 vmovd 4*4($ctx),%x#$H4 1806 1807.Ldo_avx2$suffix: 1808___ 1809$code.=<<___ if (!$kernel && $avx>2); 1810 cmp \$512,$len 1811 jb .Lskip_avx512 1812 and %r11d,%r9d 1813 test \$`1<<16`,%r9d # check for AVX512F 1814 jnz .Lblocks_avx512 1815.Lskip_avx512$suffix: 1816___ 1817$code.=<<___ if ($avx > 2 && $avx512 && $kernel); 1818 cmp \$512,$len 1819 jae .Lblocks_avx512 1820___ 1821$code.=<<___ if (!$win64); 1822 lea 8(%rsp),%r10 1823.cfi_def_cfa_register %r10 1824 sub \$0x128,%rsp 1825___ 1826$code.=<<___ if ($win64); 1827 lea 8(%rsp),%r10 1828 sub \$0x1c8,%rsp 1829 vmovdqa %xmm6,-0xb0(%r10) 1830 vmovdqa %xmm7,-0xa0(%r10) 1831 vmovdqa %xmm8,-0x90(%r10) 1832 vmovdqa %xmm9,-0x80(%r10) 1833 vmovdqa %xmm10,-0x70(%r10) 1834 vmovdqa %xmm11,-0x60(%r10) 1835 vmovdqa %xmm12,-0x50(%r10) 1836 vmovdqa %xmm13,-0x40(%r10) 1837 vmovdqa %xmm14,-0x30(%r10) 1838 vmovdqa %xmm15,-0x20(%r10) 1839.Ldo_avx2_body$suffix: 1840___ 1841$code.=<<___; 1842 lea .Lconst(%rip),%rcx 1843 lea 48+64($ctx),$ctx # size optimization 1844 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 1845 1846 # expand and copy pre-calculated table to stack 1847 vmovdqu `16*0-64`($ctx),%x#$T2 1848 and \$-512,%rsp 1849 vmovdqu `16*1-64`($ctx),%x#$T3 1850 vmovdqu `16*2-64`($ctx),%x#$T4 1851 vmovdqu `16*3-64`($ctx),%x#$D0 1852 vmovdqu `16*4-64`($ctx),%x#$D1 1853 vmovdqu `16*5-64`($ctx),%x#$D2 1854 lea 0x90(%rsp),%rax # size optimization 1855 vmovdqu 
`16*6-64`($ctx),%x#$D3 1856 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 1857 vmovdqu `16*7-64`($ctx),%x#$D4 1858 vpermd $T3,$T0,$T3 1859 vmovdqu `16*8-64`($ctx),%x#$MASK 1860 vpermd $T4,$T0,$T4 1861 vmovdqa $T2,0x00(%rsp) 1862 vpermd $D0,$T0,$D0 1863 vmovdqa $T3,0x20-0x90(%rax) 1864 vpermd $D1,$T0,$D1 1865 vmovdqa $T4,0x40-0x90(%rax) 1866 vpermd $D2,$T0,$D2 1867 vmovdqa $D0,0x60-0x90(%rax) 1868 vpermd $D3,$T0,$D3 1869 vmovdqa $D1,0x80-0x90(%rax) 1870 vpermd $D4,$T0,$D4 1871 vmovdqa $D2,0xa0-0x90(%rax) 1872 vpermd $MASK,$T0,$MASK 1873 vmovdqa $D3,0xc0-0x90(%rax) 1874 vmovdqa $D4,0xe0-0x90(%rax) 1875 vmovdqa $MASK,0x100-0x90(%rax) 1876 vmovdqa 64(%rcx),$MASK # .Lmask26 1877 1878 ################################################################ 1879 # load input 1880 vmovdqu 16*0($inp),%x#$T0 1881 vmovdqu 16*1($inp),%x#$T1 1882 vinserti128 \$1,16*2($inp),$T0,$T0 1883 vinserti128 \$1,16*3($inp),$T1,$T1 1884 lea 16*4($inp),$inp 1885 1886 vpsrldq \$6,$T0,$T2 # splat input 1887 vpsrldq \$6,$T1,$T3 1888 vpunpckhqdq $T1,$T0,$T4 # 4 1889 vpunpcklqdq $T3,$T2,$T2 # 2:3 1890 vpunpcklqdq $T1,$T0,$T0 # 0:1 1891 1892 vpsrlq \$30,$T2,$T3 1893 vpsrlq \$4,$T2,$T2 1894 vpsrlq \$26,$T0,$T1 1895 vpsrlq \$40,$T4,$T4 # 4 1896 vpand $MASK,$T2,$T2 # 2 1897 vpand $MASK,$T0,$T0 # 0 1898 vpand $MASK,$T1,$T1 # 1 1899 vpand $MASK,$T3,$T3 # 3 1900 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1901 1902 vpaddq $H2,$T2,$H2 # accumulate input 1903 sub \$64,$len 1904 jz .Ltail_avx2$suffix 1905 jmp .Loop_avx2$suffix 1906 1907.align 32 1908.Loop_avx2$suffix: 1909 ################################################################ 1910 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1911 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1912 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1913 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1914 # \________/\__________/ 1915 ################################################################ 1916 #vpaddq $H2,$T2,$H2 # accumulate input 1917 vpaddq $H0,$T0,$H0 1918 vmovdqa `32*0`(%rsp),$T0 # r0^4 1919 vpaddq $H1,$T1,$H1 1920 vmovdqa `32*1`(%rsp),$T1 # r1^4 1921 vpaddq $H3,$T3,$H3 1922 vmovdqa `32*3`(%rsp),$T2 # r2^4 1923 vpaddq $H4,$T4,$H4 1924 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1925 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 1926 1927 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1928 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1929 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1930 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1931 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1932 # 1933 # however, as h2 is "chronologically" first one available pull 1934 # corresponding operations up, so it's 1935 # 1936 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1937 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1938 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1939 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1940 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1941 1942 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1943 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1944 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1945 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1946 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1947 1948 vpmuludq $H0,$T1,$T4 # h0*r1 1949 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1950 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1951 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1952 vpmuludq $H3,$T1,$T4 # h3*r1 1953 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1954 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1955 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1956 vmovdqa `32*4-0x90`(%rax),$T1 # s2 1957 1958 vpmuludq $H0,$T0,$T4 # h0*r0 1959 vpmuludq $H1,$T0,$H2 # h1*r0 1960 vpaddq 
$T4,$D0,$D0 # d0 += h0*r0 1961 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1962 vpmuludq $H3,$T0,$T4 # h3*r0 1963 vpmuludq $H4,$T0,$H2 # h4*r0 1964 vmovdqu 16*0($inp),%x#$T0 # load input 1965 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1966 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1967 vinserti128 \$1,16*2($inp),$T0,$T0 1968 1969 vpmuludq $H3,$T1,$T4 # h3*s2 1970 vpmuludq $H4,$T1,$H2 # h4*s2 1971 vmovdqu 16*1($inp),%x#$T1 1972 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1973 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1974 vmovdqa `32*5-0x90`(%rax),$H2 # r3 1975 vpmuludq $H1,$T2,$T4 # h1*r2 1976 vpmuludq $H0,$T2,$T2 # h0*r2 1977 vpaddq $T4,$D3,$D3 # d3 += h1*r2 1978 vpaddq $T2,$D2,$D2 # d2 += h0*r2 1979 vinserti128 \$1,16*3($inp),$T1,$T1 1980 lea 16*4($inp),$inp 1981 1982 vpmuludq $H1,$H2,$T4 # h1*r3 1983 vpmuludq $H0,$H2,$H2 # h0*r3 1984 vpsrldq \$6,$T0,$T2 # splat input 1985 vpaddq $T4,$D4,$D4 # d4 += h1*r3 1986 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1987 vpmuludq $H3,$T3,$T4 # h3*s3 1988 vpmuludq $H4,$T3,$H2 # h4*s3 1989 vpsrldq \$6,$T1,$T3 1990 vpaddq $T4,$D1,$D1 # d1 += h3*s3 1991 vpaddq $H2,$D2,$D2 # d2 += h4*s3 1992 vpunpckhqdq $T1,$T0,$T4 # 4 1993 1994 vpmuludq $H3,$S4,$H3 # h3*s4 1995 vpmuludq $H4,$S4,$H4 # h4*s4 1996 vpunpcklqdq $T1,$T0,$T0 # 0:1 1997 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 1998 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 1999 vpunpcklqdq $T3,$T2,$T3 # 2:3 2000 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 2001 vpmuludq $H1,$S4,$H0 # h1*s4 2002 vmovdqa 64(%rcx),$MASK # .Lmask26 2003 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2004 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2005 2006 ################################################################ 2007 # lazy reduction (interleaved with tail of input splat) 2008 2009 vpsrlq \$26,$H3,$D3 2010 vpand $MASK,$H3,$H3 2011 vpaddq $D3,$H4,$H4 # h3 -> h4 2012 2013 vpsrlq \$26,$H0,$D0 2014 vpand $MASK,$H0,$H0 2015 vpaddq $D0,$D1,$H1 # h0 -> h1 2016 2017 vpsrlq \$26,$H4,$D4 2018 vpand $MASK,$H4,$H4 2019 2020 vpsrlq \$4,$T3,$T2 2021 2022 vpsrlq \$26,$H1,$D1 2023 vpand $MASK,$H1,$H1 2024 vpaddq $D1,$H2,$H2 # h1 -> h2 2025 2026 vpaddq $D4,$H0,$H0 2027 vpsllq \$2,$D4,$D4 2028 vpaddq $D4,$H0,$H0 # h4 -> h0 2029 2030 vpand $MASK,$T2,$T2 # 2 2031 vpsrlq \$26,$T0,$T1 2032 2033 vpsrlq \$26,$H2,$D2 2034 vpand $MASK,$H2,$H2 2035 vpaddq $D2,$H3,$H3 # h2 -> h3 2036 2037 vpaddq $T2,$H2,$H2 # modulo-scheduled 2038 vpsrlq \$30,$T3,$T3 2039 2040 vpsrlq \$26,$H0,$D0 2041 vpand $MASK,$H0,$H0 2042 vpaddq $D0,$H1,$H1 # h0 -> h1 2043 2044 vpsrlq \$40,$T4,$T4 # 4 2045 2046 vpsrlq \$26,$H3,$D3 2047 vpand $MASK,$H3,$H3 2048 vpaddq $D3,$H4,$H4 # h3 -> h4 2049 2050 vpand $MASK,$T0,$T0 # 0 2051 vpand $MASK,$T1,$T1 # 1 2052 vpand $MASK,$T3,$T3 # 3 2053 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2054 2055 sub \$64,$len 2056 jnz .Loop_avx2$suffix 2057 2058 .byte 0x66,0x90 2059.Ltail_avx2$suffix: 2060 ################################################################ 2061 # while above multiplications were by r^4 in all lanes, in last 2062 # iteration we multiply least significant lane by r^4 and most 2063 # significant one by r, so copy of above except that references 2064 # to the precomputed table are displaced by 4... 
2065 2066 #vpaddq $H2,$T2,$H2 # accumulate input 2067 vpaddq $H0,$T0,$H0 2068 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 2069 vpaddq $H1,$T1,$H1 2070 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 2071 vpaddq $H3,$T3,$H3 2072 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 2073 vpaddq $H4,$T4,$H4 2074 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 2075 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 2076 2077 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 2078 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 2079 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 2080 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 2081 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2082 2083 vpmuludq $H0,$T1,$T4 # h0*r1 2084 vpmuludq $H1,$T1,$H2 # h1*r1 2085 vpaddq $T4,$D1,$D1 # d1 += h0*r1 2086 vpaddq $H2,$D2,$D2 # d2 += h1*r1 2087 vpmuludq $H3,$T1,$T4 # h3*r1 2088 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 2089 vpaddq $T4,$D4,$D4 # d4 += h3*r1 2090 vpaddq $H2,$D0,$D0 # d0 += h4*s1 2091 2092 vpmuludq $H0,$T0,$T4 # h0*r0 2093 vpmuludq $H1,$T0,$H2 # h1*r0 2094 vpaddq $T4,$D0,$D0 # d0 += h0*r0 2095 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 2096 vpaddq $H2,$D1,$D1 # d1 += h1*r0 2097 vpmuludq $H3,$T0,$T4 # h3*r0 2098 vpmuludq $H4,$T0,$H2 # h4*r0 2099 vpaddq $T4,$D3,$D3 # d3 += h3*r0 2100 vpaddq $H2,$D4,$D4 # d4 += h4*r0 2101 2102 vpmuludq $H3,$T1,$T4 # h3*s2 2103 vpmuludq $H4,$T1,$H2 # h4*s2 2104 vpaddq $T4,$D0,$D0 # d0 += h3*s2 2105 vpaddq $H2,$D1,$D1 # d1 += h4*s2 2106 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2107 vpmuludq $H1,$T2,$T4 # h1*r2 2108 vpmuludq $H0,$T2,$T2 # h0*r2 2109 vpaddq $T4,$D3,$D3 # d3 += h1*r2 2110 vpaddq $T2,$D2,$D2 # d2 += h0*r2 2111 2112 vpmuludq $H1,$H2,$T4 # h1*r3 2113 vpmuludq $H0,$H2,$H2 # h0*r3 2114 vpaddq $T4,$D4,$D4 # d4 += h1*r3 2115 vpaddq $H2,$D3,$D3 # d3 += h0*r3 2116 vpmuludq $H3,$T3,$T4 # h3*s3 2117 vpmuludq $H4,$T3,$H2 # h4*s3 2118 vpaddq $T4,$D1,$D1 # d1 += h3*s3 2119 vpaddq $H2,$D2,$D2 # d2 += h4*s3 2120 2121 vpmuludq $H3,$S4,$H3 # h3*s4 2122 vpmuludq $H4,$S4,$H4 # h4*s4 2123 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 2124 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 2125 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 2126 vpmuludq $H1,$S4,$H0 # h1*s4 2127 vmovdqa 64(%rcx),$MASK # .Lmask26 2128 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2129 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2130 2131 ################################################################ 2132 # horizontal addition 2133 2134 vpsrldq \$8,$D1,$T1 2135 vpsrldq \$8,$H2,$T2 2136 vpsrldq \$8,$H3,$T3 2137 vpsrldq \$8,$H4,$T4 2138 vpsrldq \$8,$H0,$T0 2139 vpaddq $T1,$D1,$D1 2140 vpaddq $T2,$H2,$H2 2141 vpaddq $T3,$H3,$H3 2142 vpaddq $T4,$H4,$H4 2143 vpaddq $T0,$H0,$H0 2144 2145 vpermq \$0x2,$H3,$T3 2146 vpermq \$0x2,$H4,$T4 2147 vpermq \$0x2,$H0,$T0 2148 vpermq \$0x2,$D1,$T1 2149 vpermq \$0x2,$H2,$T2 2150 vpaddq $T3,$H3,$H3 2151 vpaddq $T4,$H4,$H4 2152 vpaddq $T0,$H0,$H0 2153 vpaddq $T1,$D1,$D1 2154 vpaddq $T2,$H2,$H2 2155 2156 ################################################################ 2157 # lazy reduction 2158 2159 vpsrlq \$26,$H3,$D3 2160 vpand $MASK,$H3,$H3 2161 vpaddq $D3,$H4,$H4 # h3 -> h4 2162 2163 vpsrlq \$26,$H0,$D0 2164 vpand $MASK,$H0,$H0 2165 vpaddq $D0,$D1,$H1 # h0 -> h1 2166 2167 vpsrlq \$26,$H4,$D4 2168 vpand $MASK,$H4,$H4 2169 2170 vpsrlq \$26,$H1,$D1 2171 vpand $MASK,$H1,$H1 2172 vpaddq $D1,$H2,$H2 # h1 -> h2 2173 2174 vpaddq $D4,$H0,$H0 2175 vpsllq \$2,$D4,$D4 2176 vpaddq $D4,$H0,$H0 # h4 -> h0 2177 2178 vpsrlq \$26,$H2,$D2 2179 vpand $MASK,$H2,$H2 2180 vpaddq $D2,$H3,$H3 # h2 -> h3 2181 2182 vpsrlq \$26,$H0,$D0 2183 vpand $MASK,$H0,$H0 2184 vpaddq $D0,$H1,$H1 # h0 -> h1 2185 2186 vpsrlq \$26,$H3,$D3 2187 vpand $MASK,$H3,$H3 2188 vpaddq 
$D3,$H4,$H4 # h3 -> h4 2189 2190 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2191 vmovd %x#$H1,`4*1-48-64`($ctx) 2192 vmovd %x#$H2,`4*2-48-64`($ctx) 2193 vmovd %x#$H3,`4*3-48-64`($ctx) 2194 vmovd %x#$H4,`4*4-48-64`($ctx) 2195___ 2196$code.=<<___ if ($win64); 2197 vmovdqa -0xb0(%r10),%xmm6 2198 vmovdqa -0xa0(%r10),%xmm7 2199 vmovdqa -0x90(%r10),%xmm8 2200 vmovdqa -0x80(%r10),%xmm9 2201 vmovdqa -0x70(%r10),%xmm10 2202 vmovdqa -0x60(%r10),%xmm11 2203 vmovdqa -0x50(%r10),%xmm12 2204 vmovdqa -0x40(%r10),%xmm13 2205 vmovdqa -0x30(%r10),%xmm14 2206 vmovdqa -0x20(%r10),%xmm15 2207 lea -8(%r10),%rsp 2208.Ldo_avx2_epilogue$suffix: 2209___ 2210$code.=<<___ if (!$win64); 2211 lea -8(%r10),%rsp 2212.cfi_def_cfa_register %rsp 2213___ 2214$code.=<<___; 2215 vzeroupper 2216 ret 2217.cfi_endproc 2218___ 2219if($avx > 2 && $avx512) { 2220my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2221my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2222my $PADBIT="%zmm30"; 2223 2224map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2225map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2226map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2227map(s/%y/%z/,($MASK)); 2228 2229$code.=<<___; 2230.cfi_startproc 2231.Lblocks_avx512: 2232 mov \$15,%eax 2233 kmovw %eax,%k2 2234___ 2235$code.=<<___ if (!$win64); 2236 lea 8(%rsp),%r10 2237.cfi_def_cfa_register %r10 2238 sub \$0x128,%rsp 2239___ 2240$code.=<<___ if ($win64); 2241 lea 8(%rsp),%r10 2242 sub \$0x1c8,%rsp 2243 vmovdqa %xmm6,-0xb0(%r10) 2244 vmovdqa %xmm7,-0xa0(%r10) 2245 vmovdqa %xmm8,-0x90(%r10) 2246 vmovdqa %xmm9,-0x80(%r10) 2247 vmovdqa %xmm10,-0x70(%r10) 2248 vmovdqa %xmm11,-0x60(%r10) 2249 vmovdqa %xmm12,-0x50(%r10) 2250 vmovdqa %xmm13,-0x40(%r10) 2251 vmovdqa %xmm14,-0x30(%r10) 2252 vmovdqa %xmm15,-0x20(%r10) 2253.Ldo_avx512_body: 2254___ 2255$code.=<<___; 2256 lea .Lconst(%rip),%rcx 2257 lea 48+64($ctx),$ctx # size optimization 2258 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2259 2260 # expand pre-calculated table 2261 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2262 and \$-512,%rsp 2263 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2264 mov \$0x20,%rax 2265 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2266 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2267 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2268 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2269 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2270 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2271 vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} 2272 vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2273 vpbroadcastq 64(%rcx),$MASK # .Lmask26 2274 vpermd $D1,$T2,$R1 2275 vpermd $T0,$T2,$S1 2276 vpermd $D2,$T2,$R2 2277 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2278 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2279 vpermd $T1,$T2,$S2 2280 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2281 vpsrlq \$32,$R1,$T1 2282 vpermd $D3,$T2,$R3 2283 vmovdqa64 $S1,0x40(%rsp){%k2} 2284 vpermd $T3,$T2,$S3 2285 vpermd $D4,$T2,$R4 2286 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2287 vpermd $T4,$T2,$S4 2288 vmovdqa64 $S2,0x80(%rsp){%k2} 2289 vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2290 vmovdqa64 $S3,0xc0(%rsp){%k2} 2291 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2292 vmovdqa64 $S4,0x100(%rsp){%k2} 2293 2294 ################################################################ 2295 # calculate 5th through 8th powers of the key 2296 # 2297 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2298 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2299 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2300 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2301 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2302 2303 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2304 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2305 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2306 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2307 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 2308 vpsrlq \$32,$R2,$T2 2309 2310 vpmuludq $T1,$S4,$M0 2311 vpmuludq $T1,$R0,$M1 2312 vpmuludq $T1,$R1,$M2 2313 vpmuludq $T1,$R2,$M3 2314 vpmuludq $T1,$R3,$M4 2315 vpsrlq \$32,$R3,$T3 2316 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 2317 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 2318 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 2319 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 2320 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 2321 2322 vpmuludq $T2,$S3,$M0 2323 vpmuludq $T2,$S4,$M1 2324 vpmuludq $T2,$R1,$M3 2325 vpmuludq $T2,$R2,$M4 2326 vpmuludq $T2,$R0,$M2 2327 vpsrlq \$32,$R4,$T4 2328 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 2329 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 2330 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 2331 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 2332 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 2333 2334 vpmuludq $T3,$S2,$M0 2335 vpmuludq $T3,$R0,$M3 2336 vpmuludq $T3,$R1,$M4 2337 vpmuludq $T3,$S3,$M1 2338 vpmuludq $T3,$S4,$M2 2339 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 2340 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 2341 vpaddq $M4,$D4,$D4 # d4 += r3'*r1 2342 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 2343 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 2344 2345 vpmuludq $T4,$S4,$M3 2346 vpmuludq $T4,$R0,$M4 2347 vpmuludq $T4,$S1,$M0 2348 vpmuludq $T4,$S2,$M1 2349 vpmuludq $T4,$S3,$M2 2350 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 2351 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 2352 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 2353 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 2354 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 2355 2356 ################################################################ 2357 # load input 2358 vmovdqu64 16*0($inp),%z#$T3 2359 vmovdqu64 16*4($inp),%z#$T4 2360 lea 16*8($inp),$inp 2361 2362 ################################################################ 2363 # lazy reduction 2364 2365 vpsrlq \$26,$D3,$M3 2366 vpandq $MASK,$D3,$D3 2367 vpaddq $M3,$D4,$D4 # d3 -> d4 2368 2369 vpsrlq \$26,$D0,$M0 2370 vpandq $MASK,$D0,$D0 2371 vpaddq $M0,$D1,$D1 # d0 -> d1 2372 2373 vpsrlq \$26,$D4,$M4 2374 vpandq $MASK,$D4,$D4 2375 2376 vpsrlq \$26,$D1,$M1 2377 vpandq $MASK,$D1,$D1 2378 vpaddq $M1,$D2,$D2 # d1 -> d2 2379 2380 vpaddq $M4,$D0,$D0 2381 vpsllq \$2,$M4,$M4 2382 vpaddq $M4,$D0,$D0 # d4 -> d0 2383 2384 vpsrlq \$26,$D2,$M2 2385 vpandq $MASK,$D2,$D2 2386 vpaddq $M2,$D3,$D3 # 
d2 -> d3 2387 2388 vpsrlq \$26,$D0,$M0 2389 vpandq $MASK,$D0,$D0 2390 vpaddq $M0,$D1,$D1 # d0 -> d1 2391 2392 vpsrlq \$26,$D3,$M3 2393 vpandq $MASK,$D3,$D3 2394 vpaddq $M3,$D4,$D4 # d3 -> d4 2395 2396 ################################################################ 2397 # at this point we have 14243444 in $R0-$S4 and 05060708 in 2398 # $D0-$D4, ... 2399 2400 vpunpcklqdq $T4,$T3,$T0 # transpose input 2401 vpunpckhqdq $T4,$T3,$T4 2402 2403 # ... since input 64-bit lanes are ordered as 73625140, we could 2404 # "vperm" it to 76543210 (here and in each loop iteration), *or* 2405 # we could just flow along, hence the goal for $R0-$S4 is 2406 # 1858286838784888 ... 2407 2408 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2409 mov \$0x7777,%eax 2410 kmovw %eax,%k1 2411 2412 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2413 vpermd $R1,$M0,$R1 2414 vpermd $R2,$M0,$R2 2415 vpermd $R3,$M0,$R3 2416 vpermd $R4,$M0,$R4 2417 2418 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2419 vpermd $D1,$M0,${R1}{%k1} 2420 vpermd $D2,$M0,${R2}{%k1} 2421 vpermd $D3,$M0,${R3}{%k1} 2422 vpermd $D4,$M0,${R4}{%k1} 2423 2424 vpslld \$2,$R1,$S1 # *5 2425 vpslld \$2,$R2,$S2 2426 vpslld \$2,$R3,$S3 2427 vpslld \$2,$R4,$S4 2428 vpaddd $R1,$S1,$S1 2429 vpaddd $R2,$S2,$S2 2430 vpaddd $R3,$S3,$S3 2431 vpaddd $R4,$S4,$S4 2432 2433 vpbroadcastq 32(%rcx),$PADBIT # .L129 2434 2435 vpsrlq \$52,$T0,$T2 # splat input 2436 vpsllq \$12,$T4,$T3 2437 vporq $T3,$T2,$T2 2438 vpsrlq \$26,$T0,$T1 2439 vpsrlq \$14,$T4,$T3 2440 vpsrlq \$40,$T4,$T4 # 4 2441 vpandq $MASK,$T2,$T2 # 2 2442 vpandq $MASK,$T0,$T0 # 0 2443 #vpandq $MASK,$T1,$T1 # 1 2444 #vpandq $MASK,$T3,$T3 # 3 2445 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2446 2447 vpaddq $H2,$T2,$H2 # accumulate input 2448 sub \$192,$len 2449 jbe .Ltail_avx512 2450 jmp .Loop_avx512 2451 2452.align 32 2453.Loop_avx512: 2454 ################################################################ 2455 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2456 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2457 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2458 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2459 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2460 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2461 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2462 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2463 # \________/\___________/ 2464 ################################################################ 2465 #vpaddq $H2,$T2,$H2 # accumulate input 2466 2467 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2468 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2469 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2470 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2471 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2472 # 2473 # however, as h2 is "chronologically" first one available pull 2474 # corresponding operations up, so it's 2475 # 2476 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2477 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2478 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2479 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2480 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2481 2482 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2483 vpaddq $H0,$T0,$H0 2484 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2485 vpandq $MASK,$T1,$T1 # 1 2486 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2487 vpandq $MASK,$T3,$T3 # 3 2488 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2489 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2490 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2491 vpaddq $H1,$T1,$H1 # accumulate input 2492 vpaddq $H3,$T3,$H3 2493 vpaddq $H4,$T4,$H4 2494 2495 vmovdqu64 
16*0($inp),$T3 # load input 2496 vmovdqu64 16*4($inp),$T4 2497 lea 16*8($inp),$inp 2498 vpmuludq $H0,$R3,$M3 2499 vpmuludq $H0,$R4,$M4 2500 vpmuludq $H0,$R0,$M0 2501 vpmuludq $H0,$R1,$M1 2502 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2503 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2504 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2505 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2506 2507 vpmuludq $H1,$R2,$M3 2508 vpmuludq $H1,$R3,$M4 2509 vpmuludq $H1,$S4,$M0 2510 vpmuludq $H0,$R2,$M2 2511 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2512 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2513 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2514 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2515 2516 vpunpcklqdq $T4,$T3,$T0 # transpose input 2517 vpunpckhqdq $T4,$T3,$T4 2518 2519 vpmuludq $H3,$R0,$M3 2520 vpmuludq $H3,$R1,$M4 2521 vpmuludq $H1,$R0,$M1 2522 vpmuludq $H1,$R1,$M2 2523 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2524 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2525 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2526 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2527 2528 vpmuludq $H4,$S4,$M3 2529 vpmuludq $H4,$R0,$M4 2530 vpmuludq $H3,$S2,$M0 2531 vpmuludq $H3,$S3,$M1 2532 vpaddq $M3,$D3,$D3 # d3 += h4*s4 2533 vpmuludq $H3,$S4,$M2 2534 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2535 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2536 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2537 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2538 2539 vpmuludq $H4,$S1,$M0 2540 vpmuludq $H4,$S2,$M1 2541 vpmuludq $H4,$S3,$M2 2542 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2543 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2544 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2545 2546 ################################################################ 2547 # lazy reduction (interleaved with input splat) 2548 2549 vpsrlq \$52,$T0,$T2 # splat input 2550 vpsllq \$12,$T4,$T3 2551 2552 vpsrlq \$26,$D3,$H3 2553 vpandq $MASK,$D3,$D3 2554 vpaddq $H3,$D4,$H4 # h3 -> h4 2555 2556 vporq $T3,$T2,$T2 2557 2558 vpsrlq \$26,$H0,$D0 2559 vpandq $MASK,$H0,$H0 2560 vpaddq $D0,$H1,$H1 # h0 -> h1 2561 2562 vpandq $MASK,$T2,$T2 # 2 2563 2564 vpsrlq \$26,$H4,$D4 2565 vpandq $MASK,$H4,$H4 2566 2567 vpsrlq \$26,$H1,$D1 2568 vpandq $MASK,$H1,$H1 2569 vpaddq $D1,$H2,$H2 # h1 -> h2 2570 2571 vpaddq $D4,$H0,$H0 2572 vpsllq \$2,$D4,$D4 2573 vpaddq $D4,$H0,$H0 # h4 -> h0 2574 2575 vpaddq $T2,$H2,$H2 # modulo-scheduled 2576 vpsrlq \$26,$T0,$T1 2577 2578 vpsrlq \$26,$H2,$D2 2579 vpandq $MASK,$H2,$H2 2580 vpaddq $D2,$D3,$H3 # h2 -> h3 2581 2582 vpsrlq \$14,$T4,$T3 2583 2584 vpsrlq \$26,$H0,$D0 2585 vpandq $MASK,$H0,$H0 2586 vpaddq $D0,$H1,$H1 # h0 -> h1 2587 2588 vpsrlq \$40,$T4,$T4 # 4 2589 2590 vpsrlq \$26,$H3,$D3 2591 vpandq $MASK,$H3,$H3 2592 vpaddq $D3,$H4,$H4 # h3 -> h4 2593 2594 vpandq $MASK,$T0,$T0 # 0 2595 #vpandq $MASK,$T1,$T1 # 1 2596 #vpandq $MASK,$T3,$T3 # 3 2597 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2598 2599 sub \$128,$len 2600 ja .Loop_avx512 2601 2602.Ltail_avx512: 2603 ################################################################ 2604 # while above multiplications were by r^8 in all lanes, in last 2605 # iteration we multiply least significant lane by r^8 and most 2606 # significant one by r, that's why table gets shifted... 
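	#
	# In other words, writing the eight accumulator lanes as a0..a7
	# (a0 least significant; labels used only in this note), the tail
	# computes
	#
	#	h = a0*r^8 + a1*r^7 + ... + a6*r^2 + a7*r^1
	#
	# Each key vector keeps r^8 in the 32-bit halves consumed by
	# vpmuludq and the lane-specific power in the other halves, so
	# the 32-bit right shifts below line every lane up with its own
	# power without reloading anything from memory.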
2607 2608 vpsrlq \$32,$R0,$R0 # 0105020603070408 2609 vpsrlq \$32,$R1,$R1 2610 vpsrlq \$32,$R2,$R2 2611 vpsrlq \$32,$S3,$S3 2612 vpsrlq \$32,$S4,$S4 2613 vpsrlq \$32,$R3,$R3 2614 vpsrlq \$32,$R4,$R4 2615 vpsrlq \$32,$S1,$S1 2616 vpsrlq \$32,$S2,$S2 2617 2618 ################################################################ 2619 # load either next or last 64 byte of input 2620 lea ($inp,$len),$inp 2621 2622 #vpaddq $H2,$T2,$H2 # accumulate input 2623 vpaddq $H0,$T0,$H0 2624 2625 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2626 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2627 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2628 vpandq $MASK,$T1,$T1 # 1 2629 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2630 vpandq $MASK,$T3,$T3 # 3 2631 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2632 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2633 vpaddq $H1,$T1,$H1 # accumulate input 2634 vpaddq $H3,$T3,$H3 2635 vpaddq $H4,$T4,$H4 2636 2637 vmovdqu 16*0($inp),%x#$T0 2638 vpmuludq $H0,$R3,$M3 2639 vpmuludq $H0,$R4,$M4 2640 vpmuludq $H0,$R0,$M0 2641 vpmuludq $H0,$R1,$M1 2642 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2643 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2644 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2645 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2646 2647 vmovdqu 16*1($inp),%x#$T1 2648 vpmuludq $H1,$R2,$M3 2649 vpmuludq $H1,$R3,$M4 2650 vpmuludq $H1,$S4,$M0 2651 vpmuludq $H0,$R2,$M2 2652 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2653 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2654 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2655 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2656 2657 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2658 vpmuludq $H3,$R0,$M3 2659 vpmuludq $H3,$R1,$M4 2660 vpmuludq $H1,$R0,$M1 2661 vpmuludq $H1,$R1,$M2 2662 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2663 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2664 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2665 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2666 2667 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2668 vpmuludq $H4,$S4,$M3 2669 vpmuludq $H4,$R0,$M4 2670 vpmuludq $H3,$S2,$M0 2671 vpmuludq $H3,$S3,$M1 2672 vpmuludq $H3,$S4,$M2 2673 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2674 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2675 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2676 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2677 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2678 2679 vpmuludq $H4,$S1,$M0 2680 vpmuludq $H4,$S2,$M1 2681 vpmuludq $H4,$S3,$M2 2682 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2683 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2684 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2685 2686 ################################################################ 2687 # horizontal addition 2688 2689 mov \$1,%eax 2690 vpermq \$0xb1,$H3,$D3 2691 vpermq \$0xb1,$D4,$H4 2692 vpermq \$0xb1,$H0,$D0 2693 vpermq \$0xb1,$H1,$D1 2694 vpermq \$0xb1,$H2,$D2 2695 vpaddq $D3,$H3,$H3 2696 vpaddq $D4,$H4,$H4 2697 vpaddq $D0,$H0,$H0 2698 vpaddq $D1,$H1,$H1 2699 vpaddq $D2,$H2,$H2 2700 2701 kmovw %eax,%k3 2702 vpermq \$0x2,$H3,$D3 2703 vpermq \$0x2,$H4,$D4 2704 vpermq \$0x2,$H0,$D0 2705 vpermq \$0x2,$H1,$D1 2706 vpermq \$0x2,$H2,$D2 2707 vpaddq $D3,$H3,$H3 2708 vpaddq $D4,$H4,$H4 2709 vpaddq $D0,$H0,$H0 2710 vpaddq $D1,$H1,$H1 2711 vpaddq $D2,$H2,$H2 2712 2713 vextracti64x4 \$0x1,$H3,%y#$D3 2714 vextracti64x4 \$0x1,$H4,%y#$D4 2715 vextracti64x4 \$0x1,$H0,%y#$D0 2716 vextracti64x4 \$0x1,$H1,%y#$D1 2717 vextracti64x4 \$0x1,$H2,%y#$D2 2718 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2719 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2720 vpaddq $D0,$H0,${H0}{%k3}{z} 2721 vpaddq $D1,$H1,${H1}{%k3}{z} 2722 vpaddq $D2,$H2,${H2}{%k3}{z} 2723___ 2724map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2725map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 
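# The two map() calls above only rewrite the register names held in the
# Perl variables, e.g. "%zmm30" (the padbit vector) becomes "%ymm30";
# they emit no instructions themselves. A minimal sketch of the idiom
# (standalone illustration, not part of the generator's state):
#
#	my @regs = ("%zmm0", "%zmm7");
#	map(s/%z/%y/, @regs);	# @regs is now ("%ymm0", "%ymm7")
#
# The masked additions above keep a single qword per register, so the
# rest of this tail can safely run on %ymm registers and share
# .Ltail_avx2 with the AVX2 code path.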
2726$code.=<<___; 2727 ################################################################ 2728 # lazy reduction (interleaved with input splat) 2729 2730 vpsrlq \$26,$H3,$D3 2731 vpand $MASK,$H3,$H3 2732 vpsrldq \$6,$T0,$T2 # splat input 2733 vpsrldq \$6,$T1,$T3 2734 vpunpckhqdq $T1,$T0,$T4 # 4 2735 vpaddq $D3,$H4,$H4 # h3 -> h4 2736 2737 vpsrlq \$26,$H0,$D0 2738 vpand $MASK,$H0,$H0 2739 vpunpcklqdq $T3,$T2,$T2 # 2:3 2740 vpunpcklqdq $T1,$T0,$T0 # 0:1 2741 vpaddq $D0,$H1,$H1 # h0 -> h1 2742 2743 vpsrlq \$26,$H4,$D4 2744 vpand $MASK,$H4,$H4 2745 2746 vpsrlq \$26,$H1,$D1 2747 vpand $MASK,$H1,$H1 2748 vpsrlq \$30,$T2,$T3 2749 vpsrlq \$4,$T2,$T2 2750 vpaddq $D1,$H2,$H2 # h1 -> h2 2751 2752 vpaddq $D4,$H0,$H0 2753 vpsllq \$2,$D4,$D4 2754 vpsrlq \$26,$T0,$T1 2755 vpsrlq \$40,$T4,$T4 # 4 2756 vpaddq $D4,$H0,$H0 # h4 -> h0 2757 2758 vpsrlq \$26,$H2,$D2 2759 vpand $MASK,$H2,$H2 2760 vpand $MASK,$T2,$T2 # 2 2761 vpand $MASK,$T0,$T0 # 0 2762 vpaddq $D2,$H3,$H3 # h2 -> h3 2763 2764 vpsrlq \$26,$H0,$D0 2765 vpand $MASK,$H0,$H0 2766 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2767 vpand $MASK,$T1,$T1 # 1 2768 vpaddq $D0,$H1,$H1 # h0 -> h1 2769 2770 vpsrlq \$26,$H3,$D3 2771 vpand $MASK,$H3,$H3 2772 vpand $MASK,$T3,$T3 # 3 2773 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2774 vpaddq $D3,$H4,$H4 # h3 -> h4 2775 2776 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2777 add \$64,$len 2778 jnz .Ltail_avx2$suffix 2779 2780 vpsubq $T2,$H2,$H2 # undo input accumulation 2781 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2782 vmovd %x#$H1,`4*1-48-64`($ctx) 2783 vmovd %x#$H2,`4*2-48-64`($ctx) 2784 vmovd %x#$H3,`4*3-48-64`($ctx) 2785 vmovd %x#$H4,`4*4-48-64`($ctx) 2786 vzeroall 2787___ 2788$code.=<<___ if ($win64); 2789 movdqa -0xb0(%r10),%xmm6 2790 movdqa -0xa0(%r10),%xmm7 2791 movdqa -0x90(%r10),%xmm8 2792 movdqa -0x80(%r10),%xmm9 2793 movdqa -0x70(%r10),%xmm10 2794 movdqa -0x60(%r10),%xmm11 2795 movdqa -0x50(%r10),%xmm12 2796 movdqa -0x40(%r10),%xmm13 2797 movdqa -0x30(%r10),%xmm14 2798 movdqa -0x20(%r10),%xmm15 2799 lea -8(%r10),%rsp 2800.Ldo_avx512_epilogue: 2801___ 2802$code.=<<___ if (!$win64); 2803 lea -8(%r10),%rsp 2804.cfi_def_cfa_register %rsp 2805___ 2806$code.=<<___; 2807 ret 2808.cfi_endproc 2809___ 2810 2811} 2812 2813} 2814 2815&declare_function("poly1305_blocks_avx2", 32, 4); 2816poly1305_blocks_avxN(0); 2817&end_function("poly1305_blocks_avx2"); 2818 2819if($kernel) { 2820 $code .= "#endif\n"; 2821} 2822 2823####################################################################### 2824if ($avx>2) { 2825# On entry we have input length divisible by 64. But since inner loop 2826# processes 128 bytes per iteration, cases when length is not divisible 2827# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 2828# reason stack layout is kept identical to poly1305_blocks_avx2. If not 2829# for this tail, we wouldn't have to even allocate stack frame... 2830 2831if($kernel) { 2832 $code .= "#ifdef CONFIG_AS_AVX512\n"; 2833} 2834 2835&declare_function("poly1305_blocks_avx512", 32, 4); 2836poly1305_blocks_avxN(1); 2837&end_function("poly1305_blocks_avx512"); 2838 2839if ($kernel) { 2840 $code .= "#endif\n"; 2841} 2842 2843if (!$kernel && $avx>3) { 2844######################################################################## 2845# VPMADD52 version using 2^44 radix. 2846# 2847# One can argue that base 2^52 would be more natural. Well, even though 2848# some operations would be more natural, one has to recognize couple of 2849# things. 
Base 2^52 doesn't provide advantage over base 2^44 if you look 2850# at amount of multiply-n-accumulate operations. Secondly, it makes it 2851# impossible to pre-compute multiples of 5 [referred to as s[]/sN in 2852# reference implementations], which means that more such operations 2853# would have to be performed in inner loop, which in turn makes critical 2854# path longer. In other words, even though base 2^44 reduction might 2855# look less elegant, overall critical path is actually shorter... 2856 2857######################################################################## 2858# Layout of opaque area is following. 2859# 2860# unsigned __int64 h[3]; # current hash value base 2^44 2861# unsigned __int64 s[2]; # key value*20 base 2^44 2862# unsigned __int64 r[3]; # key value base 2^44 2863# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; 2864# # r^n positions reflect 2865# # placement in register, not 2866# # memory, R[3] is R[1]*20 2867 2868$code.=<<___; 2869.type poly1305_init_base2_44,\@function,3 2870.align 32 2871poly1305_init_base2_44: 2872 xor %rax,%rax 2873 mov %rax,0($ctx) # initialize hash value 2874 mov %rax,8($ctx) 2875 mov %rax,16($ctx) 2876 2877.Linit_base2_44: 2878 lea poly1305_blocks_vpmadd52(%rip),%r10 2879 lea poly1305_emit_base2_44(%rip),%r11 2880 2881 mov \$0x0ffffffc0fffffff,%rax 2882 mov \$0x0ffffffc0ffffffc,%rcx 2883 and 0($inp),%rax 2884 mov \$0x00000fffffffffff,%r8 2885 and 8($inp),%rcx 2886 mov \$0x00000fffffffffff,%r9 2887 and %rax,%r8 2888 shrd \$44,%rcx,%rax 2889 mov %r8,40($ctx) # r0 2890 and %r9,%rax 2891 shr \$24,%rcx 2892 mov %rax,48($ctx) # r1 2893 lea (%rax,%rax,4),%rax # *5 2894 mov %rcx,56($ctx) # r2 2895 shl \$2,%rax # magic <<2 2896 lea (%rcx,%rcx,4),%rcx # *5 2897 shl \$2,%rcx # magic <<2 2898 mov %rax,24($ctx) # s1 2899 mov %rcx,32($ctx) # s2 2900 movq \$-1,64($ctx) # write impossible value 2901___ 2902$code.=<<___ if ($flavour !~ /elf32/); 2903 mov %r10,0(%rdx) 2904 mov %r11,8(%rdx) 2905___ 2906$code.=<<___ if ($flavour =~ /elf32/); 2907 mov %r10d,0(%rdx) 2908 mov %r11d,4(%rdx) 2909___ 2910$code.=<<___; 2911 mov \$1,%eax 2912 ret 2913.size poly1305_init_base2_44,.-poly1305_init_base2_44 2914___ 2915{ 2916my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); 2917my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); 2918my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); 2919 2920$code.=<<___; 2921.type poly1305_blocks_vpmadd52,\@function,4 2922.align 32 2923poly1305_blocks_vpmadd52: 2924 shr \$4,$len 2925 jz .Lno_data_vpmadd52 # too short 2926 2927 shl \$40,$padbit 2928 mov 64($ctx),%r8 # peek on power of the key 2929 2930 # if powers of the key are not calculated yet, process up to 3 2931 # blocks with this single-block subroutine, otherwise ensure that 2932 # length is divisible by 2 blocks and pass the rest down to next 2933 # subroutine... 2934 2935 mov \$3,%rax 2936 mov \$1,%r10 2937 cmp \$4,$len # is input long 2938 cmovae %r10,%rax 2939 test %r8,%r8 # is power value impossible? 2940 cmovns %r10,%rax 2941 2942 and $len,%rax # is input of favourable length? 
2943 jz .Lblocks_vpmadd52_4x 2944 2945 sub %rax,$len 2946 mov \$7,%r10d 2947 mov \$1,%r11d 2948 kmovw %r10d,%k7 2949 lea .L2_44_inp_permd(%rip),%r10 2950 kmovw %r11d,%k1 2951 2952 vmovq $padbit,%x#$PAD 2953 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd 2954 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift 2955 vpermq \$0xcf,$PAD,$PAD 2956 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask 2957 2958 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value 2959 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys 2960 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} 2961 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} 2962 2963 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt 2964 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft 2965 2966 jmp .Loop_vpmadd52 2967 2968.align 32 2969.Loop_vpmadd52: 2970 vmovdqu32 0($inp),%x#$T0 # load input as ----3210 2971 lea 16($inp),$inp 2972 2973 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 2974 vpsrlvq $inp_shift,$T0,$T0 2975 vpandq $reduc_mask,$T0,$T0 2976 vporq $PAD,$T0,$T0 2977 2978 vpaddq $T0,$Dlo,$Dlo # accumulate input 2979 2980 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value 2981 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} 2982 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} 2983 2984 vpxord $Dlo,$Dlo,$Dlo 2985 vpxord $Dhi,$Dhi,$Dhi 2986 2987 vpmadd52luq $r2r1r0,$H0,$Dlo 2988 vpmadd52huq $r2r1r0,$H0,$Dhi 2989 2990 vpmadd52luq $r1r0s2,$H1,$Dlo 2991 vpmadd52huq $r1r0s2,$H1,$Dhi 2992 2993 vpmadd52luq $r0s2s1,$H2,$Dlo 2994 vpmadd52huq $r0s2s1,$H2,$Dhi 2995 2996 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword 2997 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword 2998 vpandq $reduc_mask,$Dlo,$Dlo 2999 3000 vpaddq $T0,$Dhi,$Dhi 3001 3002 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword 3003 3004 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) 3005 3006 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word 3007 vpandq $reduc_mask,$Dlo,$Dlo 3008 3009 vpermq \$0b10010011,$T0,$T0 3010 3011 vpaddq $T0,$Dlo,$Dlo 3012 3013 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} 3014 3015 vpaddq $T0,$Dlo,$Dlo 3016 vpsllq \$2,$T0,$T0 3017 3018 vpaddq $T0,$Dlo,$Dlo 3019 3020 dec %rax # len-=16 3021 jnz .Loop_vpmadd52 3022 3023 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value 3024 3025 test $len,$len 3026 jnz .Lblocks_vpmadd52_4x 3027 3028.Lno_data_vpmadd52: 3029 ret 3030.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 3031___ 3032} 3033{ 3034######################################################################## 3035# As implied by its name 4x subroutine processes 4 blocks in parallel 3036# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power 3037# and is handled in 256-bit %ymm registers. 3038 3039my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3040my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3041my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3042 3043$code.=<<___; 3044.type poly1305_blocks_vpmadd52_4x,\@function,4 3045.align 32 3046poly1305_blocks_vpmadd52_4x: 3047 shr \$4,$len 3048 jz .Lno_data_vpmadd52_4x # too short 3049 3050 shl \$40,$padbit 3051 mov 64($ctx),%r8 # peek on power of the key 3052 3053.Lblocks_vpmadd52_4x: 3054 vpbroadcastq $padbit,$PAD 3055 3056 vmovdqa64 .Lx_mask44(%rip),$mask44 3057 mov \$5,%eax 3058 vmovdqa64 .Lx_mask42(%rip),$mask42 3059 kmovw %eax,%k1 # used in 2x path 3060 3061 test %r8,%r8 # is power value impossible? 
3062 js .Linit_vpmadd52 # if it is, then init R[4] 3063 3064 vmovq 0($ctx),%x#$H0 # load current hash value 3065 vmovq 8($ctx),%x#$H1 3066 vmovq 16($ctx),%x#$H2 3067 3068 test \$3,$len # is length 4*n+2? 3069 jnz .Lblocks_vpmadd52_2x_do 3070 3071.Lblocks_vpmadd52_4x_do: 3072 vpbroadcastq 64($ctx),$R0 # load 4th power of the key 3073 vpbroadcastq 96($ctx),$R1 3074 vpbroadcastq 128($ctx),$R2 3075 vpbroadcastq 160($ctx),$S1 3076 3077.Lblocks_vpmadd52_4x_key_loaded: 3078 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3079 vpaddq $R2,$S2,$S2 3080 vpsllq \$2,$S2,$S2 3081 3082 test \$7,$len # is len 8*n? 3083 jz .Lblocks_vpmadd52_8x 3084 3085 vmovdqu64 16*0($inp),$T2 # load data 3086 vmovdqu64 16*2($inp),$T3 3087 lea 16*4($inp),$inp 3088 3089 vpunpcklqdq $T3,$T2,$T1 # transpose data 3090 vpunpckhqdq $T3,$T2,$T3 3091 3092 # at this point 64-bit lanes are ordered as 3-1-2-0 3093 3094 vpsrlq \$24,$T3,$T2 # splat the data 3095 vporq $PAD,$T2,$T2 3096 vpaddq $T2,$H2,$H2 # accumulate input 3097 vpandq $mask44,$T1,$T0 3098 vpsrlq \$44,$T1,$T1 3099 vpsllq \$20,$T3,$T3 3100 vporq $T3,$T1,$T1 3101 vpandq $mask44,$T1,$T1 3102 3103 sub \$4,$len 3104 jz .Ltail_vpmadd52_4x 3105 jmp .Loop_vpmadd52_4x 3106 ud2 3107 3108.align 32 3109.Linit_vpmadd52: 3110 vmovq 24($ctx),%x#$S1 # load key 3111 vmovq 56($ctx),%x#$H2 3112 vmovq 32($ctx),%x#$S2 3113 vmovq 40($ctx),%x#$R0 3114 vmovq 48($ctx),%x#$R1 3115 3116 vmovdqa $R0,$H0 3117 vmovdqa $R1,$H1 3118 vmovdqa $H2,$R2 3119 3120 mov \$2,%eax 3121 3122.Lmul_init_vpmadd52: 3123 vpxorq $D0lo,$D0lo,$D0lo 3124 vpmadd52luq $H2,$S1,$D0lo 3125 vpxorq $D0hi,$D0hi,$D0hi 3126 vpmadd52huq $H2,$S1,$D0hi 3127 vpxorq $D1lo,$D1lo,$D1lo 3128 vpmadd52luq $H2,$S2,$D1lo 3129 vpxorq $D1hi,$D1hi,$D1hi 3130 vpmadd52huq $H2,$S2,$D1hi 3131 vpxorq $D2lo,$D2lo,$D2lo 3132 vpmadd52luq $H2,$R0,$D2lo 3133 vpxorq $D2hi,$D2hi,$D2hi 3134 vpmadd52huq $H2,$R0,$D2hi 3135 3136 vpmadd52luq $H0,$R0,$D0lo 3137 vpmadd52huq $H0,$R0,$D0hi 3138 vpmadd52luq $H0,$R1,$D1lo 3139 vpmadd52huq $H0,$R1,$D1hi 3140 vpmadd52luq $H0,$R2,$D2lo 3141 vpmadd52huq $H0,$R2,$D2hi 3142 3143 vpmadd52luq $H1,$S2,$D0lo 3144 vpmadd52huq $H1,$S2,$D0hi 3145 vpmadd52luq $H1,$R0,$D1lo 3146 vpmadd52huq $H1,$R0,$D1hi 3147 vpmadd52luq $H1,$R1,$D2lo 3148 vpmadd52huq $H1,$R1,$D2hi 3149 3150 ################################################################ 3151 # partial reduction 3152 vpsrlq \$44,$D0lo,$tmp 3153 vpsllq \$8,$D0hi,$D0hi 3154 vpandq $mask44,$D0lo,$H0 3155 vpaddq $tmp,$D0hi,$D0hi 3156 3157 vpaddq $D0hi,$D1lo,$D1lo 3158 3159 vpsrlq \$44,$D1lo,$tmp 3160 vpsllq \$8,$D1hi,$D1hi 3161 vpandq $mask44,$D1lo,$H1 3162 vpaddq $tmp,$D1hi,$D1hi 3163 3164 vpaddq $D1hi,$D2lo,$D2lo 3165 3166 vpsrlq \$42,$D2lo,$tmp 3167 vpsllq \$10,$D2hi,$D2hi 3168 vpandq $mask42,$D2lo,$H2 3169 vpaddq $tmp,$D2hi,$D2hi 3170 3171 vpaddq $D2hi,$H0,$H0 3172 vpsllq \$2,$D2hi,$D2hi 3173 3174 vpaddq $D2hi,$H0,$H0 3175 3176 vpsrlq \$44,$H0,$tmp # additional step 3177 vpandq $mask44,$H0,$H0 3178 3179 vpaddq $tmp,$H1,$H1 3180 3181 dec %eax 3182 jz .Ldone_init_vpmadd52 3183 3184 vpunpcklqdq $R1,$H1,$R1 # 1,2 3185 vpbroadcastq %x#$H1,%x#$H1 # 2,2 3186 vpunpcklqdq $R2,$H2,$R2 3187 vpbroadcastq %x#$H2,%x#$H2 3188 vpunpcklqdq $R0,$H0,$R0 3189 vpbroadcastq %x#$H0,%x#$H0 3190 3191 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3192 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3193 vpaddq $R1,$S1,$S1 3194 vpaddq $R2,$S2,$S2 3195 vpsllq \$2,$S1,$S1 3196 vpsllq \$2,$S2,$S2 3197 3198 jmp .Lmul_init_vpmadd52 3199 ud2 3200 3201.align 32 3202.Ldone_init_vpmadd52: 3203 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 3204 vinserti128 
\$1,%x#$R2,$H2,$R2 3205 vinserti128 \$1,%x#$R0,$H0,$R0 3206 3207 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 3208 vpermq \$0b11011000,$R2,$R2 3209 vpermq \$0b11011000,$R0,$R0 3210 3211 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3212 vpaddq $R1,$S1,$S1 3213 vpsllq \$2,$S1,$S1 3214 3215 vmovq 0($ctx),%x#$H0 # load current hash value 3216 vmovq 8($ctx),%x#$H1 3217 vmovq 16($ctx),%x#$H2 3218 3219 test \$3,$len # is length 4*n+2? 3220 jnz .Ldone_init_vpmadd52_2x 3221 3222 vmovdqu64 $R0,64($ctx) # save key powers 3223 vpbroadcastq %x#$R0,$R0 # broadcast 4th power 3224 vmovdqu64 $R1,96($ctx) 3225 vpbroadcastq %x#$R1,$R1 3226 vmovdqu64 $R2,128($ctx) 3227 vpbroadcastq %x#$R2,$R2 3228 vmovdqu64 $S1,160($ctx) 3229 vpbroadcastq %x#$S1,$S1 3230 3231 jmp .Lblocks_vpmadd52_4x_key_loaded 3232 ud2 3233 3234.align 32 3235.Ldone_init_vpmadd52_2x: 3236 vmovdqu64 $R0,64($ctx) # save key powers 3237 vpsrldq \$8,$R0,$R0 # 0-1-0-2 3238 vmovdqu64 $R1,96($ctx) 3239 vpsrldq \$8,$R1,$R1 3240 vmovdqu64 $R2,128($ctx) 3241 vpsrldq \$8,$R2,$R2 3242 vmovdqu64 $S1,160($ctx) 3243 vpsrldq \$8,$S1,$S1 3244 jmp .Lblocks_vpmadd52_2x_key_loaded 3245 ud2 3246 3247.align 32 3248.Lblocks_vpmadd52_2x_do: 3249 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 3250 vmovdqu64 160+8($ctx),${S1}{%k1}{z} 3251 vmovdqu64 64+8($ctx),${R0}{%k1}{z} 3252 vmovdqu64 96+8($ctx),${R1}{%k1}{z} 3253 3254.Lblocks_vpmadd52_2x_key_loaded: 3255 vmovdqu64 16*0($inp),$T2 # load data 3256 vpxorq $T3,$T3,$T3 3257 lea 16*2($inp),$inp 3258 3259 vpunpcklqdq $T3,$T2,$T1 # transpose data 3260 vpunpckhqdq $T3,$T2,$T3 3261 3262 # at this point 64-bit lanes are ordered as x-1-x-0 3263 3264 vpsrlq \$24,$T3,$T2 # splat the data 3265 vporq $PAD,$T2,$T2 3266 vpaddq $T2,$H2,$H2 # accumulate input 3267 vpandq $mask44,$T1,$T0 3268 vpsrlq \$44,$T1,$T1 3269 vpsllq \$20,$T3,$T3 3270 vporq $T3,$T1,$T1 3271 vpandq $mask44,$T1,$T1 3272 3273 jmp .Ltail_vpmadd52_2x 3274 ud2 3275 3276.align 32 3277.Loop_vpmadd52_4x: 3278 #vpaddq $T2,$H2,$H2 # accumulate input 3279 vpaddq $T0,$H0,$H0 3280 vpaddq $T1,$H1,$H1 3281 3282 vpxorq $D0lo,$D0lo,$D0lo 3283 vpmadd52luq $H2,$S1,$D0lo 3284 vpxorq $D0hi,$D0hi,$D0hi 3285 vpmadd52huq $H2,$S1,$D0hi 3286 vpxorq $D1lo,$D1lo,$D1lo 3287 vpmadd52luq $H2,$S2,$D1lo 3288 vpxorq $D1hi,$D1hi,$D1hi 3289 vpmadd52huq $H2,$S2,$D1hi 3290 vpxorq $D2lo,$D2lo,$D2lo 3291 vpmadd52luq $H2,$R0,$D2lo 3292 vpxorq $D2hi,$D2hi,$D2hi 3293 vpmadd52huq $H2,$R0,$D2hi 3294 3295 vmovdqu64 16*0($inp),$T2 # load data 3296 vmovdqu64 16*2($inp),$T3 3297 lea 16*4($inp),$inp 3298 vpmadd52luq $H0,$R0,$D0lo 3299 vpmadd52huq $H0,$R0,$D0hi 3300 vpmadd52luq $H0,$R1,$D1lo 3301 vpmadd52huq $H0,$R1,$D1hi 3302 vpmadd52luq $H0,$R2,$D2lo 3303 vpmadd52huq $H0,$R2,$D2hi 3304 3305 vpunpcklqdq $T3,$T2,$T1 # transpose data 3306 vpunpckhqdq $T3,$T2,$T3 3307 vpmadd52luq $H1,$S2,$D0lo 3308 vpmadd52huq $H1,$S2,$D0hi 3309 vpmadd52luq $H1,$R0,$D1lo 3310 vpmadd52huq $H1,$R0,$D1hi 3311 vpmadd52luq $H1,$R1,$D2lo 3312 vpmadd52huq $H1,$R1,$D2hi 3313 3314 ################################################################ 3315 # partial reduction (interleaved with data splat) 3316 vpsrlq \$44,$D0lo,$tmp 3317 vpsllq \$8,$D0hi,$D0hi 3318 vpandq $mask44,$D0lo,$H0 3319 vpaddq $tmp,$D0hi,$D0hi 3320 3321 vpsrlq \$24,$T3,$T2 3322 vporq $PAD,$T2,$T2 3323 vpaddq $D0hi,$D1lo,$D1lo 3324 3325 vpsrlq \$44,$D1lo,$tmp 3326 vpsllq \$8,$D1hi,$D1hi 3327 vpandq $mask44,$D1lo,$H1 3328 vpaddq $tmp,$D1hi,$D1hi 3329 3330 vpandq $mask44,$T1,$T0 3331 vpsrlq \$44,$T1,$T1 3332 vpsllq \$20,$T3,$T3 3333 vpaddq $D1hi,$D2lo,$D2lo 3334 3335 vpsrlq 
\$42,$D2lo,$tmp 3336 vpsllq \$10,$D2hi,$D2hi 3337 vpandq $mask42,$D2lo,$H2 3338 vpaddq $tmp,$D2hi,$D2hi 3339 3340 vpaddq $T2,$H2,$H2 # accumulate input 3341 vpaddq $D2hi,$H0,$H0 3342 vpsllq \$2,$D2hi,$D2hi 3343 3344 vpaddq $D2hi,$H0,$H0 3345 vporq $T3,$T1,$T1 3346 vpandq $mask44,$T1,$T1 3347 3348 vpsrlq \$44,$H0,$tmp # additional step 3349 vpandq $mask44,$H0,$H0 3350 3351 vpaddq $tmp,$H1,$H1 3352 3353 sub \$4,$len # len-=64 3354 jnz .Loop_vpmadd52_4x 3355 3356.Ltail_vpmadd52_4x: 3357 vmovdqu64 128($ctx),$R2 # load all key powers 3358 vmovdqu64 160($ctx),$S1 3359 vmovdqu64 64($ctx),$R0 3360 vmovdqu64 96($ctx),$R1 3361 3362.Ltail_vpmadd52_2x: 3363 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3364 vpaddq $R2,$S2,$S2 3365 vpsllq \$2,$S2,$S2 3366 3367 #vpaddq $T2,$H2,$H2 # accumulate input 3368 vpaddq $T0,$H0,$H0 3369 vpaddq $T1,$H1,$H1 3370 3371 vpxorq $D0lo,$D0lo,$D0lo 3372 vpmadd52luq $H2,$S1,$D0lo 3373 vpxorq $D0hi,$D0hi,$D0hi 3374 vpmadd52huq $H2,$S1,$D0hi 3375 vpxorq $D1lo,$D1lo,$D1lo 3376 vpmadd52luq $H2,$S2,$D1lo 3377 vpxorq $D1hi,$D1hi,$D1hi 3378 vpmadd52huq $H2,$S2,$D1hi 3379 vpxorq $D2lo,$D2lo,$D2lo 3380 vpmadd52luq $H2,$R0,$D2lo 3381 vpxorq $D2hi,$D2hi,$D2hi 3382 vpmadd52huq $H2,$R0,$D2hi 3383 3384 vpmadd52luq $H0,$R0,$D0lo 3385 vpmadd52huq $H0,$R0,$D0hi 3386 vpmadd52luq $H0,$R1,$D1lo 3387 vpmadd52huq $H0,$R1,$D1hi 3388 vpmadd52luq $H0,$R2,$D2lo 3389 vpmadd52huq $H0,$R2,$D2hi 3390 3391 vpmadd52luq $H1,$S2,$D0lo 3392 vpmadd52huq $H1,$S2,$D0hi 3393 vpmadd52luq $H1,$R0,$D1lo 3394 vpmadd52huq $H1,$R0,$D1hi 3395 vpmadd52luq $H1,$R1,$D2lo 3396 vpmadd52huq $H1,$R1,$D2hi 3397 3398 ################################################################ 3399 # horizontal addition 3400 3401 mov \$1,%eax 3402 kmovw %eax,%k1 3403 vpsrldq \$8,$D0lo,$T0 3404 vpsrldq \$8,$D0hi,$H0 3405 vpsrldq \$8,$D1lo,$T1 3406 vpsrldq \$8,$D1hi,$H1 3407 vpaddq $T0,$D0lo,$D0lo 3408 vpaddq $H0,$D0hi,$D0hi 3409 vpsrldq \$8,$D2lo,$T2 3410 vpsrldq \$8,$D2hi,$H2 3411 vpaddq $T1,$D1lo,$D1lo 3412 vpaddq $H1,$D1hi,$D1hi 3413 vpermq \$0x2,$D0lo,$T0 3414 vpermq \$0x2,$D0hi,$H0 3415 vpaddq $T2,$D2lo,$D2lo 3416 vpaddq $H2,$D2hi,$D2hi 3417 3418 vpermq \$0x2,$D1lo,$T1 3419 vpermq \$0x2,$D1hi,$H1 3420 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3421 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3422 vpermq \$0x2,$D2lo,$T2 3423 vpermq \$0x2,$D2hi,$H2 3424 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3425 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3426 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3427 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3428 3429 ################################################################ 3430 # partial reduction 3431 vpsrlq \$44,$D0lo,$tmp 3432 vpsllq \$8,$D0hi,$D0hi 3433 vpandq $mask44,$D0lo,$H0 3434 vpaddq $tmp,$D0hi,$D0hi 3435 3436 vpaddq $D0hi,$D1lo,$D1lo 3437 3438 vpsrlq \$44,$D1lo,$tmp 3439 vpsllq \$8,$D1hi,$D1hi 3440 vpandq $mask44,$D1lo,$H1 3441 vpaddq $tmp,$D1hi,$D1hi 3442 3443 vpaddq $D1hi,$D2lo,$D2lo 3444 3445 vpsrlq \$42,$D2lo,$tmp 3446 vpsllq \$10,$D2hi,$D2hi 3447 vpandq $mask42,$D2lo,$H2 3448 vpaddq $tmp,$D2hi,$D2hi 3449 3450 vpaddq $D2hi,$H0,$H0 3451 vpsllq \$2,$D2hi,$D2hi 3452 3453 vpaddq $D2hi,$H0,$H0 3454 3455 vpsrlq \$44,$H0,$tmp # additional step 3456 vpandq $mask44,$H0,$H0 3457 3458 vpaddq $tmp,$H1,$H1 3459 # at this point $len is 3460 # either 4*n+2 or 0... 
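	#
	# For reference, the reduction just above is the scalar carry
	# chain (mask44 = 2^44-1, mask42 = 2^42-1; d0..d2 and c are
	# labels for this note only):
	#
	#	d1 += d0>>44;	h0 = d0 & mask44
	#	d2 += d1>>44;	h1 = d1 & mask44
	#	c   = d2>>42;	h2 = d2 & mask42
	#	h0 += c*5			# 2^130 = 5 mod 2^130-5
	#	h1 += h0>>44;	h0 &= mask44
	#
	# where the left shifts by 8 and 10 align the vpmadd52huq halves,
	# which hold bits 52 and up of each product, with the 44- and
	# 42-bit limb boundaries.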
3461 sub \$2,$len # len-=32 3462 ja .Lblocks_vpmadd52_4x_do 3463 3464 vmovq %x#$H0,0($ctx) 3465 vmovq %x#$H1,8($ctx) 3466 vmovq %x#$H2,16($ctx) 3467 vzeroall 3468 3469.Lno_data_vpmadd52_4x: 3470 ret 3471.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 3472___ 3473} 3474{ 3475######################################################################## 3476# As implied by its name 8x subroutine processes 8 blocks in parallel... 3477# This is intermediate version, as it's used only in cases when input 3478# length is either 8*n, 8*n+1 or 8*n+2... 3479 3480my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3481my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3482my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3483my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 3484 3485$code.=<<___; 3486.type poly1305_blocks_vpmadd52_8x,\@function,4 3487.align 32 3488poly1305_blocks_vpmadd52_8x: 3489 shr \$4,$len 3490 jz .Lno_data_vpmadd52_8x # too short 3491 3492 shl \$40,$padbit 3493 mov 64($ctx),%r8 # peek on power of the key 3494 3495 vmovdqa64 .Lx_mask44(%rip),$mask44 3496 vmovdqa64 .Lx_mask42(%rip),$mask42 3497 3498 test %r8,%r8 # is power value impossible? 3499 js .Linit_vpmadd52 # if it is, then init R[4] 3500 3501 vmovq 0($ctx),%x#$H0 # load current hash value 3502 vmovq 8($ctx),%x#$H1 3503 vmovq 16($ctx),%x#$H2 3504 3505.Lblocks_vpmadd52_8x: 3506 ################################################################ 3507 # fist we calculate more key powers 3508 3509 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3510 vmovdqu64 160($ctx),$S1 3511 vmovdqu64 64($ctx),$R0 3512 vmovdqu64 96($ctx),$R1 3513 3514 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3515 vpaddq $R2,$S2,$S2 3516 vpsllq \$2,$S2,$S2 3517 3518 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3519 vpbroadcastq %x#$R0,$RR0 3520 vpbroadcastq %x#$R1,$RR1 3521 3522 vpxorq $D0lo,$D0lo,$D0lo 3523 vpmadd52luq $RR2,$S1,$D0lo 3524 vpxorq $D0hi,$D0hi,$D0hi 3525 vpmadd52huq $RR2,$S1,$D0hi 3526 vpxorq $D1lo,$D1lo,$D1lo 3527 vpmadd52luq $RR2,$S2,$D1lo 3528 vpxorq $D1hi,$D1hi,$D1hi 3529 vpmadd52huq $RR2,$S2,$D1hi 3530 vpxorq $D2lo,$D2lo,$D2lo 3531 vpmadd52luq $RR2,$R0,$D2lo 3532 vpxorq $D2hi,$D2hi,$D2hi 3533 vpmadd52huq $RR2,$R0,$D2hi 3534 3535 vpmadd52luq $RR0,$R0,$D0lo 3536 vpmadd52huq $RR0,$R0,$D0hi 3537 vpmadd52luq $RR0,$R1,$D1lo 3538 vpmadd52huq $RR0,$R1,$D1hi 3539 vpmadd52luq $RR0,$R2,$D2lo 3540 vpmadd52huq $RR0,$R2,$D2hi 3541 3542 vpmadd52luq $RR1,$S2,$D0lo 3543 vpmadd52huq $RR1,$S2,$D0hi 3544 vpmadd52luq $RR1,$R0,$D1lo 3545 vpmadd52huq $RR1,$R0,$D1hi 3546 vpmadd52luq $RR1,$R1,$D2lo 3547 vpmadd52huq $RR1,$R1,$D2hi 3548 3549 ################################################################ 3550 # partial reduction 3551 vpsrlq \$44,$D0lo,$tmp 3552 vpsllq \$8,$D0hi,$D0hi 3553 vpandq $mask44,$D0lo,$RR0 3554 vpaddq $tmp,$D0hi,$D0hi 3555 3556 vpaddq $D0hi,$D1lo,$D1lo 3557 3558 vpsrlq \$44,$D1lo,$tmp 3559 vpsllq \$8,$D1hi,$D1hi 3560 vpandq $mask44,$D1lo,$RR1 3561 vpaddq $tmp,$D1hi,$D1hi 3562 3563 vpaddq $D1hi,$D2lo,$D2lo 3564 3565 vpsrlq \$42,$D2lo,$tmp 3566 vpsllq \$10,$D2hi,$D2hi 3567 vpandq $mask42,$D2lo,$RR2 3568 vpaddq $tmp,$D2hi,$D2hi 3569 3570 vpaddq $D2hi,$RR0,$RR0 3571 vpsllq \$2,$D2hi,$D2hi 3572 3573 vpaddq $D2hi,$RR0,$RR0 3574 3575 vpsrlq \$44,$RR0,$tmp # additional step 3576 vpandq $mask44,$RR0,$RR0 3577 3578 vpaddq $tmp,$RR1,$RR1 3579 3580 ################################################################ 3581 # At this point Rx holds 1324 powers, RRx - 5768, and the goal 3582 # is 
15263748, which reflects how data is loaded... 3583 3584 vpunpcklqdq $R2,$RR2,$T2 # 3748 3585 vpunpckhqdq $R2,$RR2,$R2 # 1526 3586 vpunpcklqdq $R0,$RR0,$T0 3587 vpunpckhqdq $R0,$RR0,$R0 3588 vpunpcklqdq $R1,$RR1,$T1 3589 vpunpckhqdq $R1,$RR1,$R1 3590___ 3591######## switch to %zmm 3592map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3593map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3594map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3595map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3596 3597$code.=<<___; 3598 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3599 vshufi64x2 \$0x44,$R0,$T0,$RR0 3600 vshufi64x2 \$0x44,$R1,$T1,$RR1 3601 3602 vmovdqu64 16*0($inp),$T2 # load data 3603 vmovdqu64 16*4($inp),$T3 3604 lea 16*8($inp),$inp 3605 3606 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3607 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3608 vpaddq $RR2,$SS2,$SS2 3609 vpaddq $RR1,$SS1,$SS1 3610 vpsllq \$2,$SS2,$SS2 3611 vpsllq \$2,$SS1,$SS1 3612 3613 vpbroadcastq $padbit,$PAD 3614 vpbroadcastq %x#$mask44,$mask44 3615 vpbroadcastq %x#$mask42,$mask42 3616 3617 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3618 vpbroadcastq %x#$SS2,$S2 3619 vpbroadcastq %x#$RR0,$R0 3620 vpbroadcastq %x#$RR1,$R1 3621 vpbroadcastq %x#$RR2,$R2 3622 3623 vpunpcklqdq $T3,$T2,$T1 # transpose data 3624 vpunpckhqdq $T3,$T2,$T3 3625 3626 # at this point 64-bit lanes are ordered as 73625140 3627 3628 vpsrlq \$24,$T3,$T2 # splat the data 3629 vporq $PAD,$T2,$T2 3630 vpaddq $T2,$H2,$H2 # accumulate input 3631 vpandq $mask44,$T1,$T0 3632 vpsrlq \$44,$T1,$T1 3633 vpsllq \$20,$T3,$T3 3634 vporq $T3,$T1,$T1 3635 vpandq $mask44,$T1,$T1 3636 3637 sub \$8,$len 3638 jz .Ltail_vpmadd52_8x 3639 jmp .Loop_vpmadd52_8x 3640 3641.align 32 3642.Loop_vpmadd52_8x: 3643 #vpaddq $T2,$H2,$H2 # accumulate input 3644 vpaddq $T0,$H0,$H0 3645 vpaddq $T1,$H1,$H1 3646 3647 vpxorq $D0lo,$D0lo,$D0lo 3648 vpmadd52luq $H2,$S1,$D0lo 3649 vpxorq $D0hi,$D0hi,$D0hi 3650 vpmadd52huq $H2,$S1,$D0hi 3651 vpxorq $D1lo,$D1lo,$D1lo 3652 vpmadd52luq $H2,$S2,$D1lo 3653 vpxorq $D1hi,$D1hi,$D1hi 3654 vpmadd52huq $H2,$S2,$D1hi 3655 vpxorq $D2lo,$D2lo,$D2lo 3656 vpmadd52luq $H2,$R0,$D2lo 3657 vpxorq $D2hi,$D2hi,$D2hi 3658 vpmadd52huq $H2,$R0,$D2hi 3659 3660 vmovdqu64 16*0($inp),$T2 # load data 3661 vmovdqu64 16*4($inp),$T3 3662 lea 16*8($inp),$inp 3663 vpmadd52luq $H0,$R0,$D0lo 3664 vpmadd52huq $H0,$R0,$D0hi 3665 vpmadd52luq $H0,$R1,$D1lo 3666 vpmadd52huq $H0,$R1,$D1hi 3667 vpmadd52luq $H0,$R2,$D2lo 3668 vpmadd52huq $H0,$R2,$D2hi 3669 3670 vpunpcklqdq $T3,$T2,$T1 # transpose data 3671 vpunpckhqdq $T3,$T2,$T3 3672 vpmadd52luq $H1,$S2,$D0lo 3673 vpmadd52huq $H1,$S2,$D0hi 3674 vpmadd52luq $H1,$R0,$D1lo 3675 vpmadd52huq $H1,$R0,$D1hi 3676 vpmadd52luq $H1,$R1,$D2lo 3677 vpmadd52huq $H1,$R1,$D2hi 3678 3679 ################################################################ 3680 # partial reduction (interleaved with data splat) 3681 vpsrlq \$44,$D0lo,$tmp 3682 vpsllq \$8,$D0hi,$D0hi 3683 vpandq $mask44,$D0lo,$H0 3684 vpaddq $tmp,$D0hi,$D0hi 3685 3686 vpsrlq \$24,$T3,$T2 3687 vporq $PAD,$T2,$T2 3688 vpaddq $D0hi,$D1lo,$D1lo 3689 3690 vpsrlq \$44,$D1lo,$tmp 3691 vpsllq \$8,$D1hi,$D1hi 3692 vpandq $mask44,$D1lo,$H1 3693 vpaddq $tmp,$D1hi,$D1hi 3694 3695 vpandq $mask44,$T1,$T0 3696 vpsrlq \$44,$T1,$T1 3697 vpsllq \$20,$T3,$T3 3698 vpaddq $D1hi,$D2lo,$D2lo 3699 3700 vpsrlq \$42,$D2lo,$tmp 3701 vpsllq \$10,$D2hi,$D2hi 3702 vpandq $mask42,$D2lo,$H2 3703 vpaddq $tmp,$D2hi,$D2hi 3704 3705 vpaddq $T2,$H2,$H2 # accumulate input 3706 vpaddq $D2hi,$H0,$H0 3707 vpsllq \$2,$D2hi,$D2hi 3708 3709 vpaddq 
$D2hi,$H0,$H0 3710 vporq $T3,$T1,$T1 3711 vpandq $mask44,$T1,$T1 3712 3713 vpsrlq \$44,$H0,$tmp # additional step 3714 vpandq $mask44,$H0,$H0 3715 3716 vpaddq $tmp,$H1,$H1 3717 3718 sub \$8,$len # len-=128 3719 jnz .Loop_vpmadd52_8x 3720 3721.Ltail_vpmadd52_8x: 3722 #vpaddq $T2,$H2,$H2 # accumulate input 3723 vpaddq $T0,$H0,$H0 3724 vpaddq $T1,$H1,$H1 3725 3726 vpxorq $D0lo,$D0lo,$D0lo 3727 vpmadd52luq $H2,$SS1,$D0lo 3728 vpxorq $D0hi,$D0hi,$D0hi 3729 vpmadd52huq $H2,$SS1,$D0hi 3730 vpxorq $D1lo,$D1lo,$D1lo 3731 vpmadd52luq $H2,$SS2,$D1lo 3732 vpxorq $D1hi,$D1hi,$D1hi 3733 vpmadd52huq $H2,$SS2,$D1hi 3734 vpxorq $D2lo,$D2lo,$D2lo 3735 vpmadd52luq $H2,$RR0,$D2lo 3736 vpxorq $D2hi,$D2hi,$D2hi 3737 vpmadd52huq $H2,$RR0,$D2hi 3738 3739 vpmadd52luq $H0,$RR0,$D0lo 3740 vpmadd52huq $H0,$RR0,$D0hi 3741 vpmadd52luq $H0,$RR1,$D1lo 3742 vpmadd52huq $H0,$RR1,$D1hi 3743 vpmadd52luq $H0,$RR2,$D2lo 3744 vpmadd52huq $H0,$RR2,$D2hi 3745 3746 vpmadd52luq $H1,$SS2,$D0lo 3747 vpmadd52huq $H1,$SS2,$D0hi 3748 vpmadd52luq $H1,$RR0,$D1lo 3749 vpmadd52huq $H1,$RR0,$D1hi 3750 vpmadd52luq $H1,$RR1,$D2lo 3751 vpmadd52huq $H1,$RR1,$D2hi 3752 3753 ################################################################ 3754 # horizontal addition 3755 3756 mov \$1,%eax 3757 kmovw %eax,%k1 3758 vpsrldq \$8,$D0lo,$T0 3759 vpsrldq \$8,$D0hi,$H0 3760 vpsrldq \$8,$D1lo,$T1 3761 vpsrldq \$8,$D1hi,$H1 3762 vpaddq $T0,$D0lo,$D0lo 3763 vpaddq $H0,$D0hi,$D0hi 3764 vpsrldq \$8,$D2lo,$T2 3765 vpsrldq \$8,$D2hi,$H2 3766 vpaddq $T1,$D1lo,$D1lo 3767 vpaddq $H1,$D1hi,$D1hi 3768 vpermq \$0x2,$D0lo,$T0 3769 vpermq \$0x2,$D0hi,$H0 3770 vpaddq $T2,$D2lo,$D2lo 3771 vpaddq $H2,$D2hi,$D2hi 3772 3773 vpermq \$0x2,$D1lo,$T1 3774 vpermq \$0x2,$D1hi,$H1 3775 vpaddq $T0,$D0lo,$D0lo 3776 vpaddq $H0,$D0hi,$D0hi 3777 vpermq \$0x2,$D2lo,$T2 3778 vpermq \$0x2,$D2hi,$H2 3779 vpaddq $T1,$D1lo,$D1lo 3780 vpaddq $H1,$D1hi,$D1hi 3781 vextracti64x4 \$1,$D0lo,%y#$T0 3782 vextracti64x4 \$1,$D0hi,%y#$H0 3783 vpaddq $T2,$D2lo,$D2lo 3784 vpaddq $H2,$D2hi,$D2hi 3785 3786 vextracti64x4 \$1,$D1lo,%y#$T1 3787 vextracti64x4 \$1,$D1hi,%y#$H1 3788 vextracti64x4 \$1,$D2lo,%y#$T2 3789 vextracti64x4 \$1,$D2hi,%y#$H2 3790___ 3791######## switch back to %ymm 3792map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3793map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3794map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3795 3796$code.=<<___; 3797 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3798 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3799 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3800 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3801 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3802 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3803 3804 ################################################################ 3805 # partial reduction 3806 vpsrlq \$44,$D0lo,$tmp 3807 vpsllq \$8,$D0hi,$D0hi 3808 vpandq $mask44,$D0lo,$H0 3809 vpaddq $tmp,$D0hi,$D0hi 3810 3811 vpaddq $D0hi,$D1lo,$D1lo 3812 3813 vpsrlq \$44,$D1lo,$tmp 3814 vpsllq \$8,$D1hi,$D1hi 3815 vpandq $mask44,$D1lo,$H1 3816 vpaddq $tmp,$D1hi,$D1hi 3817 3818 vpaddq $D1hi,$D2lo,$D2lo 3819 3820 vpsrlq \$42,$D2lo,$tmp 3821 vpsllq \$10,$D2hi,$D2hi 3822 vpandq $mask42,$D2lo,$H2 3823 vpaddq $tmp,$D2hi,$D2hi 3824 3825 vpaddq $D2hi,$H0,$H0 3826 vpsllq \$2,$D2hi,$D2hi 3827 3828 vpaddq $D2hi,$H0,$H0 3829 3830 vpsrlq \$44,$H0,$tmp # additional step 3831 vpandq $mask44,$H0,$H0 3832 3833 vpaddq $tmp,$H1,$H1 3834 3835 ################################################################ 3836 3837 vmovq %x#$H0,0($ctx) 3838 vmovq %x#$H1,8($ctx) 3839 vmovq %x#$H2,16($ctx) 3840 vzeroall 3841 3842.Lno_data_vpmadd52_8x: 
3843 ret 3844.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3845___ 3846} 3847$code.=<<___; 3848.type poly1305_emit_base2_44,\@function,3 3849.align 32 3850poly1305_emit_base2_44: 3851 mov 0($ctx),%r8 # load hash value 3852 mov 8($ctx),%r9 3853 mov 16($ctx),%r10 3854 3855 mov %r9,%rax 3856 shr \$20,%r9 3857 shl \$44,%rax 3858 mov %r10,%rcx 3859 shr \$40,%r10 3860 shl \$24,%rcx 3861 3862 add %rax,%r8 3863 adc %rcx,%r9 3864 adc \$0,%r10 3865 3866 mov %r8,%rax 3867 add \$5,%r8 # compare to modulus 3868 mov %r9,%rcx 3869 adc \$0,%r9 3870 adc \$0,%r10 3871 shr \$2,%r10 # did 130-bit value overflow? 3872 cmovnz %r8,%rax 3873 cmovnz %r9,%rcx 3874 3875 add 0($nonce),%rax # accumulate nonce 3876 adc 8($nonce),%rcx 3877 mov %rax,0($mac) # write result 3878 mov %rcx,8($mac) 3879 3880 ret 3881.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 3882___ 3883} } } 3884} 3885 3886if (!$kernel) 3887{ # chacha20-poly1305 helpers 3888my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 3889 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 3890$code.=<<___; 3891.globl xor128_encrypt_n_pad 3892.type xor128_encrypt_n_pad,\@abi-omnipotent 3893.align 16 3894xor128_encrypt_n_pad: 3895 sub $otp,$inp 3896 sub $otp,$out 3897 mov $len,%r10 # put len aside 3898 shr \$4,$len # len / 16 3899 jz .Ltail_enc 3900 nop 3901.Loop_enc_xmm: 3902 movdqu ($inp,$otp),%xmm0 3903 pxor ($otp),%xmm0 3904 movdqu %xmm0,($out,$otp) 3905 movdqa %xmm0,($otp) 3906 lea 16($otp),$otp 3907 dec $len 3908 jnz .Loop_enc_xmm 3909 3910 and \$15,%r10 # len % 16 3911 jz .Ldone_enc 3912 3913.Ltail_enc: 3914 mov \$16,$len 3915 sub %r10,$len 3916 xor %eax,%eax 3917.Loop_enc_byte: 3918 mov ($inp,$otp),%al 3919 xor ($otp),%al 3920 mov %al,($out,$otp) 3921 mov %al,($otp) 3922 lea 1($otp),$otp 3923 dec %r10 3924 jnz .Loop_enc_byte 3925 3926 xor %eax,%eax 3927.Loop_enc_pad: 3928 mov %al,($otp) 3929 lea 1($otp),$otp 3930 dec $len 3931 jnz .Loop_enc_pad 3932 3933.Ldone_enc: 3934 mov $otp,%rax 3935 ret 3936.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 3937 3938.globl xor128_decrypt_n_pad 3939.type xor128_decrypt_n_pad,\@abi-omnipotent 3940.align 16 3941xor128_decrypt_n_pad: 3942 sub $otp,$inp 3943 sub $otp,$out 3944 mov $len,%r10 # put len aside 3945 shr \$4,$len # len / 16 3946 jz .Ltail_dec 3947 nop 3948.Loop_dec_xmm: 3949 movdqu ($inp,$otp),%xmm0 3950 movdqa ($otp),%xmm1 3951 pxor %xmm0,%xmm1 3952 movdqu %xmm1,($out,$otp) 3953 movdqa %xmm0,($otp) 3954 lea 16($otp),$otp 3955 dec $len 3956 jnz .Loop_dec_xmm 3957 3958 pxor %xmm1,%xmm1 3959 and \$15,%r10 # len % 16 3960 jz .Ldone_dec 3961 3962.Ltail_dec: 3963 mov \$16,$len 3964 sub %r10,$len 3965 xor %eax,%eax 3966 xor %r11,%r11 3967.Loop_dec_byte: 3968 mov ($inp,$otp),%r11b 3969 mov ($otp),%al 3970 xor %r11b,%al 3971 mov %al,($out,$otp) 3972 mov %r11b,($otp) 3973 lea 1($otp),$otp 3974 dec %r10 3975 jnz .Loop_dec_byte 3976 3977 xor %eax,%eax 3978.Loop_dec_pad: 3979 mov %al,($otp) 3980 lea 1($otp),$otp 3981 dec $len 3982 jnz .Loop_dec_pad 3983 3984.Ldone_dec: 3985 mov $otp,%rax 3986 ret 3987.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 3988___ 3989} 3990 3991# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3992# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3993if ($win64) { 3994$rec="%rcx"; 3995$frame="%rdx"; 3996$context="%r8"; 3997$disp="%r9"; 3998 3999$code.=<<___; 4000.extern __imp_RtlVirtualUnwind 4001.type se_handler,\@abi-omnipotent 4002.align 16 4003se_handler: 4004 push %rsi 4005 push %rdi 4006 push %rbx 4007 push %rbp 4008 push %r12 4009 
push %r13 4010 push %r14 4011 push %r15 4012 pushfq 4013 sub \$64,%rsp 4014 4015 mov 120($context),%rax # pull context->Rax 4016 mov 248($context),%rbx # pull context->Rip 4017 4018 mov 8($disp),%rsi # disp->ImageBase 4019 mov 56($disp),%r11 # disp->HandlerData 4020 4021 mov 0(%r11),%r10d # HandlerData[0] 4022 lea (%rsi,%r10),%r10 # prologue label 4023 cmp %r10,%rbx # context->Rip<.Lprologue 4024 jb .Lcommon_seh_tail 4025 4026 mov 152($context),%rax # pull context->Rsp 4027 4028 mov 4(%r11),%r10d # HandlerData[1] 4029 lea (%rsi,%r10),%r10 # epilogue label 4030 cmp %r10,%rbx # context->Rip>=.Lepilogue 4031 jae .Lcommon_seh_tail 4032 4033 lea 48(%rax),%rax 4034 4035 mov -8(%rax),%rbx 4036 mov -16(%rax),%rbp 4037 mov -24(%rax),%r12 4038 mov -32(%rax),%r13 4039 mov -40(%rax),%r14 4040 mov -48(%rax),%r15 4041 mov %rbx,144($context) # restore context->Rbx 4042 mov %rbp,160($context) # restore context->Rbp 4043 mov %r12,216($context) # restore context->R12 4044 mov %r13,224($context) # restore context->R13 4045 mov %r14,232($context) # restore context->R14 4046 mov %r15,240($context) # restore context->R14 4047 4048 jmp .Lcommon_seh_tail 4049.size se_handler,.-se_handler 4050 4051.type avx_handler,\@abi-omnipotent 4052.align 16 4053avx_handler: 4054 push %rsi 4055 push %rdi 4056 push %rbx 4057 push %rbp 4058 push %r12 4059 push %r13 4060 push %r14 4061 push %r15 4062 pushfq 4063 sub \$64,%rsp 4064 4065 mov 120($context),%rax # pull context->Rax 4066 mov 248($context),%rbx # pull context->Rip 4067 4068 mov 8($disp),%rsi # disp->ImageBase 4069 mov 56($disp),%r11 # disp->HandlerData 4070 4071 mov 0(%r11),%r10d # HandlerData[0] 4072 lea (%rsi,%r10),%r10 # prologue label 4073 cmp %r10,%rbx # context->Rip<prologue label 4074 jb .Lcommon_seh_tail 4075 4076 mov 152($context),%rax # pull context->Rsp 4077 4078 mov 4(%r11),%r10d # HandlerData[1] 4079 lea (%rsi,%r10),%r10 # epilogue label 4080 cmp %r10,%rbx # context->Rip>=epilogue label 4081 jae .Lcommon_seh_tail 4082 4083 mov 208($context),%rax # pull context->R11 4084 4085 lea 0x50(%rax),%rsi 4086 lea 0xf8(%rax),%rax 4087 lea 512($context),%rdi # &context.Xmm6 4088 mov \$20,%ecx 4089 .long 0xa548f3fc # cld; rep movsq 4090 4091.Lcommon_seh_tail: 4092 mov 8(%rax),%rdi 4093 mov 16(%rax),%rsi 4094 mov %rax,152($context) # restore context->Rsp 4095 mov %rsi,168($context) # restore context->Rsi 4096 mov %rdi,176($context) # restore context->Rdi 4097 4098 mov 40($disp),%rdi # disp->ContextRecord 4099 mov $context,%rsi # context 4100 mov \$154,%ecx # sizeof(CONTEXT) 4101 .long 0xa548f3fc # cld; rep movsq 4102 4103 mov $disp,%rsi 4104 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4105 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4106 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4107 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4108 mov 40(%rsi),%r10 # disp->ContextRecord 4109 lea 56(%rsi),%r11 # &disp->HandlerData 4110 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4111 mov %r10,32(%rsp) # arg5 4112 mov %r11,40(%rsp) # arg6 4113 mov %r12,48(%rsp) # arg7 4114 mov %rcx,56(%rsp) # arg8, (NULL) 4115 call *__imp_RtlVirtualUnwind(%rip) 4116 4117 mov \$1,%eax # ExceptionContinueSearch 4118 add \$64,%rsp 4119 popfq 4120 pop %r15 4121 pop %r14 4122 pop %r13 4123 pop %r12 4124 pop %rbp 4125 pop %rbx 4126 pop %rdi 4127 pop %rsi 4128 ret 4129.size avx_handler,.-avx_handler 4130 4131.section .pdata 4132.align 4 4133 .rva .LSEH_begin_poly1305_init_x86_64 4134 .rva .LSEH_end_poly1305_init_x86_64 4135 .rva .LSEH_info_poly1305_init_x86_64 4136 4137 .rva .LSEH_begin_poly1305_blocks_x86_64 4138 .rva 
.LSEH_end_poly1305_blocks_x86_64 4139 .rva .LSEH_info_poly1305_blocks_x86_64 4140 4141 .rva .LSEH_begin_poly1305_emit_x86_64 4142 .rva .LSEH_end_poly1305_emit_x86_64 4143 .rva .LSEH_info_poly1305_emit_x86_64 4144___ 4145$code.=<<___ if ($avx); 4146 .rva .LSEH_begin_poly1305_blocks_avx 4147 .rva .Lbase2_64_avx 4148 .rva .LSEH_info_poly1305_blocks_avx_1 4149 4150 .rva .Lbase2_64_avx 4151 .rva .Leven_avx 4152 .rva .LSEH_info_poly1305_blocks_avx_2 4153 4154 .rva .Leven_avx 4155 .rva .LSEH_end_poly1305_blocks_avx 4156 .rva .LSEH_info_poly1305_blocks_avx_3 4157 4158 .rva .LSEH_begin_poly1305_emit_avx 4159 .rva .LSEH_end_poly1305_emit_avx 4160 .rva .LSEH_info_poly1305_emit_avx 4161___ 4162$code.=<<___ if ($avx>1); 4163 .rva .LSEH_begin_poly1305_blocks_avx2 4164 .rva .Lbase2_64_avx2 4165 .rva .LSEH_info_poly1305_blocks_avx2_1 4166 4167 .rva .Lbase2_64_avx2 4168 .rva .Leven_avx2 4169 .rva .LSEH_info_poly1305_blocks_avx2_2 4170 4171 .rva .Leven_avx2 4172 .rva .LSEH_end_poly1305_blocks_avx2 4173 .rva .LSEH_info_poly1305_blocks_avx2_3 4174___ 4175$code.=<<___ if ($avx>2); 4176 .rva .LSEH_begin_poly1305_blocks_avx512 4177 .rva .LSEH_end_poly1305_blocks_avx512 4178 .rva .LSEH_info_poly1305_blocks_avx512 4179___ 4180$code.=<<___; 4181.section .xdata 4182.align 8 4183.LSEH_info_poly1305_init_x86_64: 4184 .byte 9,0,0,0 4185 .rva se_handler 4186 .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 4187 4188.LSEH_info_poly1305_blocks_x86_64: 4189 .byte 9,0,0,0 4190 .rva se_handler 4191 .rva .Lblocks_body,.Lblocks_epilogue 4192 4193.LSEH_info_poly1305_emit_x86_64: 4194 .byte 9,0,0,0 4195 .rva se_handler 4196 .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 4197___ 4198$code.=<<___ if ($avx); 4199.LSEH_info_poly1305_blocks_avx_1: 4200 .byte 9,0,0,0 4201 .rva se_handler 4202 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] 4203 4204.LSEH_info_poly1305_blocks_avx_2: 4205 .byte 9,0,0,0 4206 .rva se_handler 4207 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] 4208 4209.LSEH_info_poly1305_blocks_avx_3: 4210 .byte 9,0,0,0 4211 .rva avx_handler 4212 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] 4213 4214.LSEH_info_poly1305_emit_avx: 4215 .byte 9,0,0,0 4216 .rva se_handler 4217 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 4218___ 4219$code.=<<___ if ($avx>1); 4220.LSEH_info_poly1305_blocks_avx2_1: 4221 .byte 9,0,0,0 4222 .rva se_handler 4223 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 4224 4225.LSEH_info_poly1305_blocks_avx2_2: 4226 .byte 9,0,0,0 4227 .rva se_handler 4228 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 4229 4230.LSEH_info_poly1305_blocks_avx2_3: 4231 .byte 9,0,0,0 4232 .rva avx_handler 4233 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 4234___ 4235$code.=<<___ if ($avx>2); 4236.LSEH_info_poly1305_blocks_avx512: 4237 .byte 9,0,0,0 4238 .rva avx_handler 4239 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 4240___ 4241} 4242 4243open SELF,$0; 4244while(<SELF>) { 4245 next if (/^#!/); 4246 last if (!s/^#/\/\// and !/^$/); 4247 print; 4248} 4249close SELF; 4250 4251foreach (split('\n',$code)) { 4252 s/\`([^\`]*)\`/eval($1)/ge; 4253 s/%r([a-z]+)#d/%e$1/g; 4254 s/%r([0-9]+)#d/%r$1d/g; 4255 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 4256 4257 if ($kernel) { 4258 s/(^\.type.*),[0-9]+$/\1/; 4259 s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; 4260 next if /^\.cfi.*/; 4261 } 4262 4263 print $_,"\n"; 4264} 4265close STDOUT; 4266
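# Note on the output pass above, with illustrative inputs (the exact
# strings depend on the register variables interpolated into $code):
#
#	`16*3-64`	->	-16	# backticked arithmetic is eval()ed
#	%rax#d		->	%eax	# "#d" selects the 32-bit form
#	%r10#d		->	%r10d
#	%x#%ymm7	->	%xmm7	# "%x#"/"%y#"/"%z#" pick the SIMD width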