11f13597dSJung-uk Kim#! /usr/bin/env perl 283eaf7aeSJung-uk Kim# Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. 3e71b7053SJung-uk Kim# 4*b077aed3SPierre Pronchery# Licensed under the Apache License 2.0 (the "License"). You may not use 5e71b7053SJung-uk Kim# this file except in compliance with the License. You can obtain a copy 6e71b7053SJung-uk Kim# in the file LICENSE in the source distribution or at 7e71b7053SJung-uk Kim# https://www.openssl.org/source/license.html 8e71b7053SJung-uk Kim 91f13597dSJung-uk Kim 101f13597dSJung-uk Kim# ==================================================================== 111f13597dSJung-uk Kim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 121f13597dSJung-uk Kim# project. The module is, however, dual licensed under OpenSSL and 131f13597dSJung-uk Kim# CRYPTOGAMS licenses depending on where you obtain it. For further 141f13597dSJung-uk Kim# details see http://www.openssl.org/~appro/cryptogams/. 151f13597dSJung-uk Kim# ==================================================================== 161f13597dSJung-uk Kim 171f13597dSJung-uk Kim# August 2011. 181f13597dSJung-uk Kim# 191f13597dSJung-uk Kim# Companion to x86_64-mont.pl that optimizes cache-timing attack 201f13597dSJung-uk Kim# countermeasures. The subroutines are produced by replacing bp[i] 211f13597dSJung-uk Kim# references in their x86_64-mont.pl counterparts with cache-neutral 221f13597dSJung-uk Kim# references to powers table computed in BN_mod_exp_mont_consttime. 231f13597dSJung-uk Kim# In addition subroutine that scatters elements of the powers table 241f13597dSJung-uk Kim# is implemented, so that scatter-/gathering can be tuned without 251f13597dSJung-uk Kim# bn_exp.c modifications. 261f13597dSJung-uk Kim 277bded2dbSJung-uk Kim# August 2013. 287bded2dbSJung-uk Kim# 297bded2dbSJung-uk Kim# Add MULX/AD*X code paths and additional interfaces to optimize for 307bded2dbSJung-uk Kim# branch prediction unit. 
For input lengths that are multiples of 8 317bded2dbSJung-uk Kim# the np argument is not just modulus value, but one interleaved 327bded2dbSJung-uk Kim# with 0. This is to optimize post-condition... 337bded2dbSJung-uk Kim 34*b077aed3SPierre Pronchery# $output is the last argument if it looks like a file (it has an extension) 35*b077aed3SPierre Pronchery# $flavour is the first argument if it doesn't look like a file 36*b077aed3SPierre Pronchery$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 37*b077aed3SPierre Pronchery$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 381f13597dSJung-uk Kim 391f13597dSJung-uk Kim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 401f13597dSJung-uk Kim 411f13597dSJung-uk Kim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 421f13597dSJung-uk Kim( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 431f13597dSJung-uk Kim( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 441f13597dSJung-uk Kimdie "can't locate x86_64-xlate.pl"; 451f13597dSJung-uk Kim 46*b077aed3SPierre Proncheryopen OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" 47*b077aed3SPierre Pronchery or die "can't call $xlate: $!"; 4809286989SJung-uk Kim*STDOUT=*OUT; 491f13597dSJung-uk Kim 507bded2dbSJung-uk Kimif (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 517bded2dbSJung-uk Kim =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 527bded2dbSJung-uk Kim $addx = ($1>=2.23); 537bded2dbSJung-uk Kim} 547bded2dbSJung-uk Kim 557bded2dbSJung-uk Kimif (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 567bded2dbSJung-uk Kim `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 577bded2dbSJung-uk Kim $addx = ($1>=2.10); 587bded2dbSJung-uk Kim} 597bded2dbSJung-uk Kim 607bded2dbSJung-uk Kimif (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 617bded2dbSJung-uk Kim `ml64 2>&1` =~ /Version ([0-9]+)\./) { 627bded2dbSJung-uk Kim $addx = ($1>=12); 637bded2dbSJung-uk Kim} 647bded2dbSJung-uk 
Kim 6563c1bb51SJung-uk Kimif (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { 6680815a77SJung-uk Kim my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 6780815a77SJung-uk Kim $addx = ($ver>=3.03); 6880815a77SJung-uk Kim} 6980815a77SJung-uk Kim 701f13597dSJung-uk Kim# int bn_mul_mont_gather5( 711f13597dSJung-uk Kim$rp="%rdi"; # BN_ULONG *rp, 721f13597dSJung-uk Kim$ap="%rsi"; # const BN_ULONG *ap, 731f13597dSJung-uk Kim$bp="%rdx"; # const BN_ULONG *bp, 741f13597dSJung-uk Kim$np="%rcx"; # const BN_ULONG *np, 751f13597dSJung-uk Kim$n0="%r8"; # const BN_ULONG *n0, 761f13597dSJung-uk Kim$num="%r9"; # int num, 771f13597dSJung-uk Kim # int idx); # 0 to 2^5-1, "index" in $bp holding 781f13597dSJung-uk Kim # pre-computed powers of a', interlaced 791f13597dSJung-uk Kim # in such manner that b[0] is $bp[idx], 801f13597dSJung-uk Kim # b[1] is [2^5+idx], etc. 811f13597dSJung-uk Kim$lo0="%r10"; 821f13597dSJung-uk Kim$hi0="%r11"; 831f13597dSJung-uk Kim$hi1="%r13"; 841f13597dSJung-uk Kim$i="%r14"; 851f13597dSJung-uk Kim$j="%r15"; 861f13597dSJung-uk Kim$m0="%rbx"; 871f13597dSJung-uk Kim$m1="%rbp"; 881f13597dSJung-uk Kim 891f13597dSJung-uk Kim$code=<<___; 901f13597dSJung-uk Kim.text 911f13597dSJung-uk Kim 927bded2dbSJung-uk Kim.extern OPENSSL_ia32cap_P 937bded2dbSJung-uk Kim 941f13597dSJung-uk Kim.globl bn_mul_mont_gather5 951f13597dSJung-uk Kim.type bn_mul_mont_gather5,\@function,6 961f13597dSJung-uk Kim.align 64 971f13597dSJung-uk Kimbn_mul_mont_gather5: 98e71b7053SJung-uk Kim.cfi_startproc 99aeb5019cSJung-uk Kim mov ${num}d,${num}d 100aeb5019cSJung-uk Kim mov %rsp,%rax 101e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 1027bded2dbSJung-uk Kim test \$7,${num}d 1031f13597dSJung-uk Kim jnz .Lmul_enter 1047bded2dbSJung-uk Kim___ 1057bded2dbSJung-uk Kim$code.=<<___ if ($addx); 1067bded2dbSJung-uk Kim mov OPENSSL_ia32cap_P+8(%rip),%r11d 1077bded2dbSJung-uk Kim___ 1087bded2dbSJung-uk Kim$code.=<<___; 1091f13597dSJung-uk Kim jmp .Lmul4x_enter 
1101f13597dSJung-uk Kim 1111f13597dSJung-uk Kim.align 16 1121f13597dSJung-uk Kim.Lmul_enter: 1134c6a0400SJung-uk Kim movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument 1141f13597dSJung-uk Kim push %rbx 115e71b7053SJung-uk Kim.cfi_push %rbx 1161f13597dSJung-uk Kim push %rbp 117e71b7053SJung-uk Kim.cfi_push %rbp 1181f13597dSJung-uk Kim push %r12 119e71b7053SJung-uk Kim.cfi_push %r12 1201f13597dSJung-uk Kim push %r13 121e71b7053SJung-uk Kim.cfi_push %r13 1221f13597dSJung-uk Kim push %r14 123e71b7053SJung-uk Kim.cfi_push %r14 1241f13597dSJung-uk Kim push %r15 125e71b7053SJung-uk Kim.cfi_push %r15 1264c6a0400SJung-uk Kim 127aeb5019cSJung-uk Kim neg $num 128aeb5019cSJung-uk Kim mov %rsp,%r11 129aeb5019cSJung-uk Kim lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) 130aeb5019cSJung-uk Kim neg $num # restore $num 131aeb5019cSJung-uk Kim and \$-1024,%r10 # minimize TLB usage 1321f13597dSJung-uk Kim 133e71b7053SJung-uk Kim # An OS-agnostic version of __chkstk. 134e71b7053SJung-uk Kim # 135e71b7053SJung-uk Kim # Some OSes (Windows) insist on stack being "wired" to 136b8721c16SJung-uk Kim # physical memory in strictly sequential manner, i.e. if stack 137b8721c16SJung-uk Kim # allocation spans two pages, then reference to farmost one can 138b8721c16SJung-uk Kim # be punishable by SEGV. But page walking can do good even on 139b8721c16SJung-uk Kim # other OSes, because it guarantees that villain thread hits 140b8721c16SJung-uk Kim # the guard page before it can make damage to innocent one... 
141aeb5019cSJung-uk Kim sub %r10,%r11 142aeb5019cSJung-uk Kim and \$-4096,%r11 143aeb5019cSJung-uk Kim lea (%r10,%r11),%rsp 144aeb5019cSJung-uk Kim mov (%rsp),%r11 145aeb5019cSJung-uk Kim cmp %r10,%rsp 146aeb5019cSJung-uk Kim ja .Lmul_page_walk 147aeb5019cSJung-uk Kim jmp .Lmul_page_walk_done 148aeb5019cSJung-uk Kim 149b8721c16SJung-uk Kim.Lmul_page_walk: 150aeb5019cSJung-uk Kim lea -4096(%rsp),%rsp 151aeb5019cSJung-uk Kim mov (%rsp),%r11 152aeb5019cSJung-uk Kim cmp %r10,%rsp 153aeb5019cSJung-uk Kim ja .Lmul_page_walk 154aeb5019cSJung-uk Kim.Lmul_page_walk_done: 155aeb5019cSJung-uk Kim 156aeb5019cSJung-uk Kim lea .Linc(%rip),%r10 157aeb5019cSJung-uk Kim mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 158e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 159aeb5019cSJung-uk Kim.Lmul_body: 160b8721c16SJung-uk Kim 1614c6a0400SJung-uk Kim lea 128($bp),%r12 # reassign $bp (+size optimization) 1621f13597dSJung-uk Kim___ 1631f13597dSJung-uk Kim $bp="%r12"; 1641f13597dSJung-uk Kim $STRIDE=2**5*8; # 5 is "window size" 1651f13597dSJung-uk Kim $N=$STRIDE/4; # should match cache line size 1661f13597dSJung-uk Kim$code.=<<___; 1674c6a0400SJung-uk Kim movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 1684c6a0400SJung-uk Kim movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 1694c6a0400SJung-uk Kim lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) 1704c6a0400SJung-uk Kim and \$-16,%r10 1711f13597dSJung-uk Kim 1724c6a0400SJung-uk Kim pshufd \$0,%xmm5,%xmm5 # broadcast index 1734c6a0400SJung-uk Kim movdqa %xmm1,%xmm4 1744c6a0400SJung-uk Kim movdqa %xmm1,%xmm2 1754c6a0400SJung-uk Kim___ 1764c6a0400SJung-uk Kim######################################################################## 1774c6a0400SJung-uk Kim# calculate mask by comparing 0..31 to index and save result to stack 1784c6a0400SJung-uk Kim# 1794c6a0400SJung-uk Kim$code.=<<___; 1804c6a0400SJung-uk Kim paddd %xmm0,%xmm1 1814c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm0 # 
compare to 1,0 1824c6a0400SJung-uk Kim .byte 0x67 1834c6a0400SJung-uk Kim movdqa %xmm4,%xmm3 1844c6a0400SJung-uk Kim___ 1854c6a0400SJung-uk Kimfor($k=0;$k<$STRIDE/16-4;$k+=4) { 1864c6a0400SJung-uk Kim$code.=<<___; 1874c6a0400SJung-uk Kim paddd %xmm1,%xmm2 1884c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm1 # compare to 3,2 1894c6a0400SJung-uk Kim movdqa %xmm0,`16*($k+0)+112`(%r10) 1904c6a0400SJung-uk Kim movdqa %xmm4,%xmm0 1914c6a0400SJung-uk Kim 1924c6a0400SJung-uk Kim paddd %xmm2,%xmm3 1934c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm2 # compare to 5,4 1944c6a0400SJung-uk Kim movdqa %xmm1,`16*($k+1)+112`(%r10) 1954c6a0400SJung-uk Kim movdqa %xmm4,%xmm1 1964c6a0400SJung-uk Kim 1974c6a0400SJung-uk Kim paddd %xmm3,%xmm0 1984c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm3 # compare to 7,6 1994c6a0400SJung-uk Kim movdqa %xmm2,`16*($k+2)+112`(%r10) 2004c6a0400SJung-uk Kim movdqa %xmm4,%xmm2 2014c6a0400SJung-uk Kim 2024c6a0400SJung-uk Kim paddd %xmm0,%xmm1 2034c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm0 2044c6a0400SJung-uk Kim movdqa %xmm3,`16*($k+3)+112`(%r10) 2054c6a0400SJung-uk Kim movdqa %xmm4,%xmm3 2064c6a0400SJung-uk Kim___ 2074c6a0400SJung-uk Kim} 2084c6a0400SJung-uk Kim$code.=<<___; # last iteration can be optimized 2094c6a0400SJung-uk Kim paddd %xmm1,%xmm2 2104c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm1 2114c6a0400SJung-uk Kim movdqa %xmm0,`16*($k+0)+112`(%r10) 2124c6a0400SJung-uk Kim 2134c6a0400SJung-uk Kim paddd %xmm2,%xmm3 2144c6a0400SJung-uk Kim .byte 0x67 2154c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm2 2164c6a0400SJung-uk Kim movdqa %xmm1,`16*($k+1)+112`(%r10) 2174c6a0400SJung-uk Kim 2184c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm3 2194c6a0400SJung-uk Kim movdqa %xmm2,`16*($k+2)+112`(%r10) 2204c6a0400SJung-uk Kim pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register 2214c6a0400SJung-uk Kim 2224c6a0400SJung-uk Kim pand `16*($k+1)-128`($bp),%xmm1 2234c6a0400SJung-uk Kim pand `16*($k+2)-128`($bp),%xmm2 2244c6a0400SJung-uk Kim movdqa %xmm3,`16*($k+3)+112`(%r10) 2254c6a0400SJung-uk Kim pand 
`16*($k+3)-128`($bp),%xmm3 2261f13597dSJung-uk Kim por %xmm2,%xmm0 2274c6a0400SJung-uk Kim por %xmm3,%xmm1 2284c6a0400SJung-uk Kim___ 2294c6a0400SJung-uk Kimfor($k=0;$k<$STRIDE/16-4;$k+=4) { 2304c6a0400SJung-uk Kim$code.=<<___; 2314c6a0400SJung-uk Kim movdqa `16*($k+0)-128`($bp),%xmm4 2324c6a0400SJung-uk Kim movdqa `16*($k+1)-128`($bp),%xmm5 2334c6a0400SJung-uk Kim movdqa `16*($k+2)-128`($bp),%xmm2 2344c6a0400SJung-uk Kim pand `16*($k+0)+112`(%r10),%xmm4 2354c6a0400SJung-uk Kim movdqa `16*($k+3)-128`($bp),%xmm3 2364c6a0400SJung-uk Kim pand `16*($k+1)+112`(%r10),%xmm5 2374c6a0400SJung-uk Kim por %xmm4,%xmm0 2384c6a0400SJung-uk Kim pand `16*($k+2)+112`(%r10),%xmm2 2394c6a0400SJung-uk Kim por %xmm5,%xmm1 2404c6a0400SJung-uk Kim pand `16*($k+3)+112`(%r10),%xmm3 2414c6a0400SJung-uk Kim por %xmm2,%xmm0 2424c6a0400SJung-uk Kim por %xmm3,%xmm1 2434c6a0400SJung-uk Kim___ 2444c6a0400SJung-uk Kim} 2454c6a0400SJung-uk Kim$code.=<<___; 2464c6a0400SJung-uk Kim por %xmm1,%xmm0 2474c6a0400SJung-uk Kim pshufd \$0x4e,%xmm0,%xmm1 2484c6a0400SJung-uk Kim por %xmm1,%xmm0 2491f13597dSJung-uk Kim lea $STRIDE($bp),$bp 2501f13597dSJung-uk Kim movq %xmm0,$m0 # m0=bp[0] 2511f13597dSJung-uk Kim 2521f13597dSJung-uk Kim mov ($n0),$n0 # pull n0[0] value 2531f13597dSJung-uk Kim mov ($ap),%rax 2541f13597dSJung-uk Kim 2551f13597dSJung-uk Kim xor $i,$i # i=0 2561f13597dSJung-uk Kim xor $j,$j # j=0 2571f13597dSJung-uk Kim 2581f13597dSJung-uk Kim mov $n0,$m1 2591f13597dSJung-uk Kim mulq $m0 # ap[0]*bp[0] 2601f13597dSJung-uk Kim mov %rax,$lo0 2611f13597dSJung-uk Kim mov ($np),%rax 2621f13597dSJung-uk Kim 2631f13597dSJung-uk Kim imulq $lo0,$m1 # "tp[0]"*n0 2641f13597dSJung-uk Kim mov %rdx,$hi0 2651f13597dSJung-uk Kim 2661f13597dSJung-uk Kim mulq $m1 # np[0]*m1 2671f13597dSJung-uk Kim add %rax,$lo0 # discarded 2681f13597dSJung-uk Kim mov 8($ap),%rax 2691f13597dSJung-uk Kim adc \$0,%rdx 2701f13597dSJung-uk Kim mov %rdx,$hi1 2711f13597dSJung-uk Kim 2721f13597dSJung-uk Kim lea 1($j),$j # j++ 
2731f13597dSJung-uk Kim jmp .L1st_enter 2741f13597dSJung-uk Kim 2751f13597dSJung-uk Kim.align 16 2761f13597dSJung-uk Kim.L1st: 2771f13597dSJung-uk Kim add %rax,$hi1 2781f13597dSJung-uk Kim mov ($ap,$j,8),%rax 2791f13597dSJung-uk Kim adc \$0,%rdx 2801f13597dSJung-uk Kim add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 2811f13597dSJung-uk Kim mov $lo0,$hi0 2821f13597dSJung-uk Kim adc \$0,%rdx 2831f13597dSJung-uk Kim mov $hi1,-16(%rsp,$j,8) # tp[j-1] 2841f13597dSJung-uk Kim mov %rdx,$hi1 2851f13597dSJung-uk Kim 2861f13597dSJung-uk Kim.L1st_enter: 2871f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[0] 2881f13597dSJung-uk Kim add %rax,$hi0 2891f13597dSJung-uk Kim mov ($np,$j,8),%rax 2901f13597dSJung-uk Kim adc \$0,%rdx 2911f13597dSJung-uk Kim lea 1($j),$j # j++ 2921f13597dSJung-uk Kim mov %rdx,$lo0 2931f13597dSJung-uk Kim 2941f13597dSJung-uk Kim mulq $m1 # np[j]*m1 2951f13597dSJung-uk Kim cmp $num,$j 2964c6a0400SJung-uk Kim jne .L1st # note that upon exit $j==$num, so 2974c6a0400SJung-uk Kim # they can be used interchangeably 2981f13597dSJung-uk Kim 2991f13597dSJung-uk Kim add %rax,$hi1 3001f13597dSJung-uk Kim adc \$0,%rdx 3011f13597dSJung-uk Kim add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 3021f13597dSJung-uk Kim adc \$0,%rdx 3034c6a0400SJung-uk Kim mov $hi1,-16(%rsp,$num,8) # tp[num-1] 3041f13597dSJung-uk Kim mov %rdx,$hi1 3051f13597dSJung-uk Kim mov $lo0,$hi0 3061f13597dSJung-uk Kim 3071f13597dSJung-uk Kim xor %rdx,%rdx 3081f13597dSJung-uk Kim add $hi0,$hi1 3091f13597dSJung-uk Kim adc \$0,%rdx 3101f13597dSJung-uk Kim mov $hi1,-8(%rsp,$num,8) 3111f13597dSJung-uk Kim mov %rdx,(%rsp,$num,8) # store upmost overflow bit 3121f13597dSJung-uk Kim 3131f13597dSJung-uk Kim lea 1($i),$i # i++ 3141f13597dSJung-uk Kim jmp .Louter 3151f13597dSJung-uk Kim.align 16 3161f13597dSJung-uk Kim.Louter: 3174c6a0400SJung-uk Kim lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 3184c6a0400SJung-uk Kim and \$-16,%rdx 3194c6a0400SJung-uk Kim pxor %xmm4,%xmm4 3204c6a0400SJung-uk Kim pxor 
%xmm5,%xmm5 3214c6a0400SJung-uk Kim___ 3224c6a0400SJung-uk Kimfor($k=0;$k<$STRIDE/16;$k+=4) { 3234c6a0400SJung-uk Kim$code.=<<___; 3244c6a0400SJung-uk Kim movdqa `16*($k+0)-128`($bp),%xmm0 3254c6a0400SJung-uk Kim movdqa `16*($k+1)-128`($bp),%xmm1 3264c6a0400SJung-uk Kim movdqa `16*($k+2)-128`($bp),%xmm2 3274c6a0400SJung-uk Kim movdqa `16*($k+3)-128`($bp),%xmm3 3284c6a0400SJung-uk Kim pand `16*($k+0)-128`(%rdx),%xmm0 3294c6a0400SJung-uk Kim pand `16*($k+1)-128`(%rdx),%xmm1 3304c6a0400SJung-uk Kim por %xmm0,%xmm4 3314c6a0400SJung-uk Kim pand `16*($k+2)-128`(%rdx),%xmm2 3324c6a0400SJung-uk Kim por %xmm1,%xmm5 3334c6a0400SJung-uk Kim pand `16*($k+3)-128`(%rdx),%xmm3 3344c6a0400SJung-uk Kim por %xmm2,%xmm4 3354c6a0400SJung-uk Kim por %xmm3,%xmm5 3364c6a0400SJung-uk Kim___ 3374c6a0400SJung-uk Kim} 3384c6a0400SJung-uk Kim$code.=<<___; 3394c6a0400SJung-uk Kim por %xmm5,%xmm4 3404c6a0400SJung-uk Kim pshufd \$0x4e,%xmm4,%xmm0 3414c6a0400SJung-uk Kim por %xmm4,%xmm0 3424c6a0400SJung-uk Kim lea $STRIDE($bp),$bp 3434c6a0400SJung-uk Kim 3444c6a0400SJung-uk Kim mov ($ap),%rax # ap[0] 3454c6a0400SJung-uk Kim movq %xmm0,$m0 # m0=bp[i] 3464c6a0400SJung-uk Kim 3471f13597dSJung-uk Kim xor $j,$j # j=0 3481f13597dSJung-uk Kim mov $n0,$m1 3491f13597dSJung-uk Kim mov (%rsp),$lo0 3501f13597dSJung-uk Kim 3511f13597dSJung-uk Kim mulq $m0 # ap[0]*bp[i] 3521f13597dSJung-uk Kim add %rax,$lo0 # ap[0]*bp[i]+tp[0] 3531f13597dSJung-uk Kim mov ($np),%rax 3541f13597dSJung-uk Kim adc \$0,%rdx 3551f13597dSJung-uk Kim 3561f13597dSJung-uk Kim imulq $lo0,$m1 # tp[0]*n0 3571f13597dSJung-uk Kim mov %rdx,$hi0 3581f13597dSJung-uk Kim 3591f13597dSJung-uk Kim mulq $m1 # np[0]*m1 3601f13597dSJung-uk Kim add %rax,$lo0 # discarded 3611f13597dSJung-uk Kim mov 8($ap),%rax 3621f13597dSJung-uk Kim adc \$0,%rdx 3631f13597dSJung-uk Kim mov 8(%rsp),$lo0 # tp[1] 3641f13597dSJung-uk Kim mov %rdx,$hi1 3651f13597dSJung-uk Kim 3661f13597dSJung-uk Kim lea 1($j),$j # j++ 3671f13597dSJung-uk Kim jmp .Linner_enter 
3681f13597dSJung-uk Kim 3691f13597dSJung-uk Kim.align 16 3701f13597dSJung-uk Kim.Linner: 3711f13597dSJung-uk Kim add %rax,$hi1 3721f13597dSJung-uk Kim mov ($ap,$j,8),%rax 3731f13597dSJung-uk Kim adc \$0,%rdx 3741f13597dSJung-uk Kim add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 3751f13597dSJung-uk Kim mov (%rsp,$j,8),$lo0 3761f13597dSJung-uk Kim adc \$0,%rdx 3771f13597dSJung-uk Kim mov $hi1,-16(%rsp,$j,8) # tp[j-1] 3781f13597dSJung-uk Kim mov %rdx,$hi1 3791f13597dSJung-uk Kim 3801f13597dSJung-uk Kim.Linner_enter: 3811f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 3821f13597dSJung-uk Kim add %rax,$hi0 3831f13597dSJung-uk Kim mov ($np,$j,8),%rax 3841f13597dSJung-uk Kim adc \$0,%rdx 3851f13597dSJung-uk Kim add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 3861f13597dSJung-uk Kim mov %rdx,$hi0 3871f13597dSJung-uk Kim adc \$0,$hi0 3881f13597dSJung-uk Kim lea 1($j),$j # j++ 3891f13597dSJung-uk Kim 3901f13597dSJung-uk Kim mulq $m1 # np[j]*m1 3911f13597dSJung-uk Kim cmp $num,$j 3924c6a0400SJung-uk Kim jne .Linner # note that upon exit $j==$num, so 3934c6a0400SJung-uk Kim # they can be used interchangeably 3941f13597dSJung-uk Kim add %rax,$hi1 3951f13597dSJung-uk Kim adc \$0,%rdx 3961f13597dSJung-uk Kim add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 3974c6a0400SJung-uk Kim mov (%rsp,$num,8),$lo0 3981f13597dSJung-uk Kim adc \$0,%rdx 3994c6a0400SJung-uk Kim mov $hi1,-16(%rsp,$num,8) # tp[num-1] 4001f13597dSJung-uk Kim mov %rdx,$hi1 4011f13597dSJung-uk Kim 4021f13597dSJung-uk Kim xor %rdx,%rdx 4031f13597dSJung-uk Kim add $hi0,$hi1 4041f13597dSJung-uk Kim adc \$0,%rdx 4051f13597dSJung-uk Kim add $lo0,$hi1 # pull upmost overflow bit 4061f13597dSJung-uk Kim adc \$0,%rdx 4071f13597dSJung-uk Kim mov $hi1,-8(%rsp,$num,8) 4081f13597dSJung-uk Kim mov %rdx,(%rsp,$num,8) # store upmost overflow bit 4091f13597dSJung-uk Kim 4101f13597dSJung-uk Kim lea 1($i),$i # i++ 4111f13597dSJung-uk Kim cmp $num,$i 4127bded2dbSJung-uk Kim jb .Louter 4131f13597dSJung-uk Kim 4141f13597dSJung-uk Kim xor $i,$i # i=0 and clear CF! 
4151f13597dSJung-uk Kim mov (%rsp),%rax # tp[0] 4161f13597dSJung-uk Kim lea (%rsp),$ap # borrow ap for tp 4171f13597dSJung-uk Kim mov $num,$j # j=num 4181f13597dSJung-uk Kim jmp .Lsub 4191f13597dSJung-uk Kim.align 16 4201f13597dSJung-uk Kim.Lsub: sbb ($np,$i,8),%rax 4211f13597dSJung-uk Kim mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 4221f13597dSJung-uk Kim mov 8($ap,$i,8),%rax # tp[i+1] 4231f13597dSJung-uk Kim lea 1($i),$i # i++ 424e71b7053SJung-uk Kim dec $j # doesn't affect CF! 4251f13597dSJung-uk Kim jnz .Lsub 4261f13597dSJung-uk Kim 4271f13597dSJung-uk Kim sbb \$0,%rax # handle upmost overflow bit 428dea77ea6SJung-uk Kim mov \$-1,%rbx 429dea77ea6SJung-uk Kim xor %rax,%rbx 4301f13597dSJung-uk Kim xor $i,$i 4311f13597dSJung-uk Kim mov $num,$j # j=num 432dea77ea6SJung-uk Kim 433dea77ea6SJung-uk Kim.Lcopy: # conditional copy 434dea77ea6SJung-uk Kim mov ($rp,$i,8),%rcx 435dea77ea6SJung-uk Kim mov (%rsp,$i,8),%rdx 436dea77ea6SJung-uk Kim and %rbx,%rcx 437dea77ea6SJung-uk Kim and %rax,%rdx 4381f13597dSJung-uk Kim mov $i,(%rsp,$i,8) # zap temporary vector 439dea77ea6SJung-uk Kim or %rcx,%rdx 440dea77ea6SJung-uk Kim mov %rdx,($rp,$i,8) # rp[i]=tp[i] 4411f13597dSJung-uk Kim lea 1($i),$i 4421f13597dSJung-uk Kim sub \$1,$j 4431f13597dSJung-uk Kim jnz .Lcopy 4441f13597dSJung-uk Kim 4451f13597dSJung-uk Kim mov 8(%rsp,$num,8),%rsi # restore %rsp 446e71b7053SJung-uk Kim.cfi_def_cfa %rsi,8 4471f13597dSJung-uk Kim mov \$1,%rax 4484c6a0400SJung-uk Kim 4497bded2dbSJung-uk Kim mov -48(%rsi),%r15 450e71b7053SJung-uk Kim.cfi_restore %r15 4517bded2dbSJung-uk Kim mov -40(%rsi),%r14 452e71b7053SJung-uk Kim.cfi_restore %r14 4537bded2dbSJung-uk Kim mov -32(%rsi),%r13 454e71b7053SJung-uk Kim.cfi_restore %r13 4557bded2dbSJung-uk Kim mov -24(%rsi),%r12 456e71b7053SJung-uk Kim.cfi_restore %r12 4577bded2dbSJung-uk Kim mov -16(%rsi),%rbp 458e71b7053SJung-uk Kim.cfi_restore %rbp 4597bded2dbSJung-uk Kim mov -8(%rsi),%rbx 460e71b7053SJung-uk Kim.cfi_restore %rbx 4617bded2dbSJung-uk Kim lea (%rsi),%rsp 
462e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 4631f13597dSJung-uk Kim.Lmul_epilogue: 4641f13597dSJung-uk Kim ret 465e71b7053SJung-uk Kim.cfi_endproc 4661f13597dSJung-uk Kim.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 4671f13597dSJung-uk Kim___ 4681f13597dSJung-uk Kim{{{ 4691f13597dSJung-uk Kimmy @A=("%r10","%r11"); 4701f13597dSJung-uk Kimmy @N=("%r13","%rdi"); 4711f13597dSJung-uk Kim$code.=<<___; 4721f13597dSJung-uk Kim.type bn_mul4x_mont_gather5,\@function,6 4737bded2dbSJung-uk Kim.align 32 4741f13597dSJung-uk Kimbn_mul4x_mont_gather5: 475e71b7053SJung-uk Kim.cfi_startproc 476aeb5019cSJung-uk Kim .byte 0x67 477aeb5019cSJung-uk Kim mov %rsp,%rax 478e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 4791f13597dSJung-uk Kim.Lmul4x_enter: 4807bded2dbSJung-uk Kim___ 4817bded2dbSJung-uk Kim$code.=<<___ if ($addx); 4824c6a0400SJung-uk Kim and \$0x80108,%r11d 4834c6a0400SJung-uk Kim cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 4847bded2dbSJung-uk Kim je .Lmulx4x_enter 4857bded2dbSJung-uk Kim___ 4867bded2dbSJung-uk Kim$code.=<<___; 4871f13597dSJung-uk Kim push %rbx 488e71b7053SJung-uk Kim.cfi_push %rbx 4891f13597dSJung-uk Kim push %rbp 490e71b7053SJung-uk Kim.cfi_push %rbp 4911f13597dSJung-uk Kim push %r12 492e71b7053SJung-uk Kim.cfi_push %r12 4931f13597dSJung-uk Kim push %r13 494e71b7053SJung-uk Kim.cfi_push %r13 4951f13597dSJung-uk Kim push %r14 496e71b7053SJung-uk Kim.cfi_push %r14 4971f13597dSJung-uk Kim push %r15 498e71b7053SJung-uk Kim.cfi_push %r15 499aeb5019cSJung-uk Kim.Lmul4x_prologue: 5004c6a0400SJung-uk Kim 5017bded2dbSJung-uk Kim .byte 0x67 5024c6a0400SJung-uk Kim shl \$3,${num}d # convert $num to bytes 5034c6a0400SJung-uk Kim lea ($num,$num,2),%r10 # 3*$num in bytes 5047bded2dbSJung-uk Kim neg $num # -$num 5051f13597dSJung-uk Kim 5067bded2dbSJung-uk Kim ############################################################## 5074c6a0400SJung-uk Kim # Ensure that stack frame doesn't alias with $rptr+3*$num 5084c6a0400SJung-uk Kim # modulo 4096, which covers 
ret[num], am[num] and n[num] 5094c6a0400SJung-uk Kim # (see bn_exp.c). This is done to allow memory disambiguation 5104c6a0400SJung-uk Kim # logic do its magic. [Extra [num] is allocated in order 5114c6a0400SJung-uk Kim # to align with bn_power5's frame, which is cleansed after 5124c6a0400SJung-uk Kim # completing exponentiation. Extra 256 bytes is for power mask 5134c6a0400SJung-uk Kim # calculated from 7th argument, the index.] 5147bded2dbSJung-uk Kim # 5154c6a0400SJung-uk Kim lea -320(%rsp,$num,2),%r11 516aeb5019cSJung-uk Kim mov %rsp,%rbp 5174c6a0400SJung-uk Kim sub $rp,%r11 5187bded2dbSJung-uk Kim and \$4095,%r11 5197bded2dbSJung-uk Kim cmp %r11,%r10 5207bded2dbSJung-uk Kim jb .Lmul4xsp_alt 521aeb5019cSJung-uk Kim sub %r11,%rbp # align with $rp 522aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 5237bded2dbSJung-uk Kim jmp .Lmul4xsp_done 5247bded2dbSJung-uk Kim 5257bded2dbSJung-uk Kim.align 32 5267bded2dbSJung-uk Kim.Lmul4xsp_alt: 5274c6a0400SJung-uk Kim lea 4096-320(,$num,2),%r10 528aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 5297bded2dbSJung-uk Kim sub %r10,%r11 5307bded2dbSJung-uk Kim mov \$0,%r10 5317bded2dbSJung-uk Kim cmovc %r10,%r11 532aeb5019cSJung-uk Kim sub %r11,%rbp 5337bded2dbSJung-uk Kim.Lmul4xsp_done: 534aeb5019cSJung-uk Kim and \$-64,%rbp 535aeb5019cSJung-uk Kim mov %rsp,%r11 536aeb5019cSJung-uk Kim sub %rbp,%r11 537b8721c16SJung-uk Kim and \$-4096,%r11 538aeb5019cSJung-uk Kim lea (%rbp,%r11),%rsp 539aeb5019cSJung-uk Kim mov (%rsp),%r10 540aeb5019cSJung-uk Kim cmp %rbp,%rsp 541aeb5019cSJung-uk Kim ja .Lmul4x_page_walk 542aeb5019cSJung-uk Kim jmp .Lmul4x_page_walk_done 543aeb5019cSJung-uk Kim 544b8721c16SJung-uk Kim.Lmul4x_page_walk: 545aeb5019cSJung-uk Kim lea -4096(%rsp),%rsp 546aeb5019cSJung-uk Kim mov (%rsp),%r10 547aeb5019cSJung-uk Kim cmp %rbp,%rsp 548aeb5019cSJung-uk Kim ja .Lmul4x_page_walk 549aeb5019cSJung-uk Kim.Lmul4x_page_walk_done: 550b8721c16SJung-uk Kim 
5517bded2dbSJung-uk Kim neg $num 5527bded2dbSJung-uk Kim 5537bded2dbSJung-uk Kim mov %rax,40(%rsp) 554e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+40,deref,+8 5551f13597dSJung-uk Kim.Lmul4x_body: 5567bded2dbSJung-uk Kim 5577bded2dbSJung-uk Kim call mul4x_internal 5587bded2dbSJung-uk Kim 5597bded2dbSJung-uk Kim mov 40(%rsp),%rsi # restore %rsp 560e71b7053SJung-uk Kim.cfi_def_cfa %rsi,8 5617bded2dbSJung-uk Kim mov \$1,%rax 5624c6a0400SJung-uk Kim 5637bded2dbSJung-uk Kim mov -48(%rsi),%r15 564e71b7053SJung-uk Kim.cfi_restore %r15 5657bded2dbSJung-uk Kim mov -40(%rsi),%r14 566e71b7053SJung-uk Kim.cfi_restore %r14 5677bded2dbSJung-uk Kim mov -32(%rsi),%r13 568e71b7053SJung-uk Kim.cfi_restore %r13 5697bded2dbSJung-uk Kim mov -24(%rsi),%r12 570e71b7053SJung-uk Kim.cfi_restore %r12 5717bded2dbSJung-uk Kim mov -16(%rsi),%rbp 572e71b7053SJung-uk Kim.cfi_restore %rbp 5737bded2dbSJung-uk Kim mov -8(%rsi),%rbx 574e71b7053SJung-uk Kim.cfi_restore %rbx 5757bded2dbSJung-uk Kim lea (%rsi),%rsp 576e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 5777bded2dbSJung-uk Kim.Lmul4x_epilogue: 5787bded2dbSJung-uk Kim ret 579e71b7053SJung-uk Kim.cfi_endproc 5807bded2dbSJung-uk Kim.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 5817bded2dbSJung-uk Kim 5827bded2dbSJung-uk Kim.type mul4x_internal,\@abi-omnipotent 5837bded2dbSJung-uk Kim.align 32 5847bded2dbSJung-uk Kimmul4x_internal: 58517f01e99SJung-uk Kim.cfi_startproc 5864c6a0400SJung-uk Kim shl \$5,$num # $num was in bytes 5874c6a0400SJung-uk Kim movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 5884c6a0400SJung-uk Kim lea .Linc(%rip),%rax 5894c6a0400SJung-uk Kim lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 5907bded2dbSJung-uk Kim shr \$5,$num # restore $num 5911f13597dSJung-uk Kim___ 5921f13597dSJung-uk Kim $bp="%r12"; 5931f13597dSJung-uk Kim $STRIDE=2**5*8; # 5 is "window size" 5941f13597dSJung-uk Kim $N=$STRIDE/4; # should match cache line size 5957bded2dbSJung-uk Kim $tp=$i; 5961f13597dSJung-uk 
Kim$code.=<<___; 5974c6a0400SJung-uk Kim movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 5984c6a0400SJung-uk Kim movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 5994c6a0400SJung-uk Kim lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 6004c6a0400SJung-uk Kim lea 128(%rdx),$bp # size optimization 6011f13597dSJung-uk Kim 6024c6a0400SJung-uk Kim pshufd \$0,%xmm5,%xmm5 # broadcast index 6034c6a0400SJung-uk Kim movdqa %xmm1,%xmm4 6044c6a0400SJung-uk Kim .byte 0x67,0x67 6054c6a0400SJung-uk Kim movdqa %xmm1,%xmm2 6064c6a0400SJung-uk Kim___ 6074c6a0400SJung-uk Kim######################################################################## 6084c6a0400SJung-uk Kim# calculate mask by comparing 0..31 to index and save result to stack 6094c6a0400SJung-uk Kim# 6104c6a0400SJung-uk Kim$code.=<<___; 6114c6a0400SJung-uk Kim paddd %xmm0,%xmm1 6124c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm0 # compare to 1,0 6137bded2dbSJung-uk Kim .byte 0x67 6144c6a0400SJung-uk Kim movdqa %xmm4,%xmm3 6154c6a0400SJung-uk Kim___ 6164c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) { 6174c6a0400SJung-uk Kim$code.=<<___; 6184c6a0400SJung-uk Kim paddd %xmm1,%xmm2 6194c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm1 # compare to 3,2 6204c6a0400SJung-uk Kim movdqa %xmm0,`16*($i+0)+112`(%r10) 6214c6a0400SJung-uk Kim movdqa %xmm4,%xmm0 6224c6a0400SJung-uk Kim 6234c6a0400SJung-uk Kim paddd %xmm2,%xmm3 6244c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm2 # compare to 5,4 6254c6a0400SJung-uk Kim movdqa %xmm1,`16*($i+1)+112`(%r10) 6264c6a0400SJung-uk Kim movdqa %xmm4,%xmm1 6274c6a0400SJung-uk Kim 6284c6a0400SJung-uk Kim paddd %xmm3,%xmm0 6294c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm3 # compare to 7,6 6304c6a0400SJung-uk Kim movdqa %xmm2,`16*($i+2)+112`(%r10) 6314c6a0400SJung-uk Kim movdqa %xmm4,%xmm2 6324c6a0400SJung-uk Kim 6334c6a0400SJung-uk Kim paddd %xmm0,%xmm1 6344c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm0 6354c6a0400SJung-uk Kim movdqa %xmm3,`16*($i+3)+112`(%r10) 6364c6a0400SJung-uk Kim 
movdqa %xmm4,%xmm3 6374c6a0400SJung-uk Kim___ 6384c6a0400SJung-uk Kim} 6394c6a0400SJung-uk Kim$code.=<<___; # last iteration can be optimized 6404c6a0400SJung-uk Kim paddd %xmm1,%xmm2 6414c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm1 6424c6a0400SJung-uk Kim movdqa %xmm0,`16*($i+0)+112`(%r10) 6434c6a0400SJung-uk Kim 6444c6a0400SJung-uk Kim paddd %xmm2,%xmm3 6457bded2dbSJung-uk Kim .byte 0x67 6464c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm2 6474c6a0400SJung-uk Kim movdqa %xmm1,`16*($i+1)+112`(%r10) 6484c6a0400SJung-uk Kim 6494c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm3 6504c6a0400SJung-uk Kim movdqa %xmm2,`16*($i+2)+112`(%r10) 6514c6a0400SJung-uk Kim pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 6524c6a0400SJung-uk Kim 6534c6a0400SJung-uk Kim pand `16*($i+1)-128`($bp),%xmm1 6544c6a0400SJung-uk Kim pand `16*($i+2)-128`($bp),%xmm2 6554c6a0400SJung-uk Kim movdqa %xmm3,`16*($i+3)+112`(%r10) 6564c6a0400SJung-uk Kim pand `16*($i+3)-128`($bp),%xmm3 6571f13597dSJung-uk Kim por %xmm2,%xmm0 6584c6a0400SJung-uk Kim por %xmm3,%xmm1 6594c6a0400SJung-uk Kim___ 6604c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) { 6614c6a0400SJung-uk Kim$code.=<<___; 6624c6a0400SJung-uk Kim movdqa `16*($i+0)-128`($bp),%xmm4 6634c6a0400SJung-uk Kim movdqa `16*($i+1)-128`($bp),%xmm5 6644c6a0400SJung-uk Kim movdqa `16*($i+2)-128`($bp),%xmm2 6654c6a0400SJung-uk Kim pand `16*($i+0)+112`(%r10),%xmm4 6664c6a0400SJung-uk Kim movdqa `16*($i+3)-128`($bp),%xmm3 6674c6a0400SJung-uk Kim pand `16*($i+1)+112`(%r10),%xmm5 6684c6a0400SJung-uk Kim por %xmm4,%xmm0 6694c6a0400SJung-uk Kim pand `16*($i+2)+112`(%r10),%xmm2 6704c6a0400SJung-uk Kim por %xmm5,%xmm1 6714c6a0400SJung-uk Kim pand `16*($i+3)+112`(%r10),%xmm3 6724c6a0400SJung-uk Kim por %xmm2,%xmm0 6734c6a0400SJung-uk Kim por %xmm3,%xmm1 6744c6a0400SJung-uk Kim___ 6754c6a0400SJung-uk Kim} 6764c6a0400SJung-uk Kim$code.=<<___; 6774c6a0400SJung-uk Kim por %xmm1,%xmm0 6784c6a0400SJung-uk Kim pshufd \$0x4e,%xmm0,%xmm1 6794c6a0400SJung-uk Kim por %xmm1,%xmm0 
6804c6a0400SJung-uk Kim lea $STRIDE($bp),$bp 6811f13597dSJung-uk Kim movq %xmm0,$m0 # m0=bp[0] 6824c6a0400SJung-uk Kim 6837bded2dbSJung-uk Kim mov %r13,16+8(%rsp) # save end of b[num] 6847bded2dbSJung-uk Kim mov $rp, 56+8(%rsp) # save $rp 6857bded2dbSJung-uk Kim 6861f13597dSJung-uk Kim mov ($n0),$n0 # pull n0[0] value 6871f13597dSJung-uk Kim mov ($ap),%rax 6887bded2dbSJung-uk Kim lea ($ap,$num),$ap # end of a[num] 6897bded2dbSJung-uk Kim neg $num 6901f13597dSJung-uk Kim 6911f13597dSJung-uk Kim mov $n0,$m1 6921f13597dSJung-uk Kim mulq $m0 # ap[0]*bp[0] 6931f13597dSJung-uk Kim mov %rax,$A[0] 6941f13597dSJung-uk Kim mov ($np),%rax 6951f13597dSJung-uk Kim 6961f13597dSJung-uk Kim imulq $A[0],$m1 # "tp[0]"*n0 6974c6a0400SJung-uk Kim lea 64+8(%rsp),$tp 6981f13597dSJung-uk Kim mov %rdx,$A[1] 6991f13597dSJung-uk Kim 7001f13597dSJung-uk Kim mulq $m1 # np[0]*m1 7011f13597dSJung-uk Kim add %rax,$A[0] # discarded 7027bded2dbSJung-uk Kim mov 8($ap,$num),%rax 7031f13597dSJung-uk Kim adc \$0,%rdx 7041f13597dSJung-uk Kim mov %rdx,$N[1] 7051f13597dSJung-uk Kim 7061f13597dSJung-uk Kim mulq $m0 7071f13597dSJung-uk Kim add %rax,$A[1] 7084c6a0400SJung-uk Kim mov 8*1($np),%rax 7091f13597dSJung-uk Kim adc \$0,%rdx 7101f13597dSJung-uk Kim mov %rdx,$A[0] 7111f13597dSJung-uk Kim 7121f13597dSJung-uk Kim mulq $m1 7131f13597dSJung-uk Kim add %rax,$N[1] 7147bded2dbSJung-uk Kim mov 16($ap,$num),%rax 7151f13597dSJung-uk Kim adc \$0,%rdx 7161f13597dSJung-uk Kim add $A[1],$N[1] 7177bded2dbSJung-uk Kim lea 4*8($num),$j # j=4 7184c6a0400SJung-uk Kim lea 8*4($np),$np 7191f13597dSJung-uk Kim adc \$0,%rdx 7207bded2dbSJung-uk Kim mov $N[1],($tp) 7211f13597dSJung-uk Kim mov %rdx,$N[0] 7221f13597dSJung-uk Kim jmp .L1st4x 7237bded2dbSJung-uk Kim 7247bded2dbSJung-uk Kim.align 32 7251f13597dSJung-uk Kim.L1st4x: 7261f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[0] 7271f13597dSJung-uk Kim add %rax,$A[0] 7284c6a0400SJung-uk Kim mov -8*2($np),%rax 7297bded2dbSJung-uk Kim lea 32($tp),$tp 7301f13597dSJung-uk Kim adc 
\$0,%rdx 7311f13597dSJung-uk Kim mov %rdx,$A[1] 7321f13597dSJung-uk Kim 7331f13597dSJung-uk Kim mulq $m1 # np[j]*m1 7341f13597dSJung-uk Kim add %rax,$N[0] 7357bded2dbSJung-uk Kim mov -8($ap,$j),%rax 7361f13597dSJung-uk Kim adc \$0,%rdx 7371f13597dSJung-uk Kim add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 7381f13597dSJung-uk Kim adc \$0,%rdx 7397bded2dbSJung-uk Kim mov $N[0],-24($tp) # tp[j-1] 7401f13597dSJung-uk Kim mov %rdx,$N[1] 7411f13597dSJung-uk Kim 7421f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[0] 7431f13597dSJung-uk Kim add %rax,$A[1] 7444c6a0400SJung-uk Kim mov -8*1($np),%rax 7451f13597dSJung-uk Kim adc \$0,%rdx 7461f13597dSJung-uk Kim mov %rdx,$A[0] 7471f13597dSJung-uk Kim 7481f13597dSJung-uk Kim mulq $m1 # np[j]*m1 7491f13597dSJung-uk Kim add %rax,$N[1] 7507bded2dbSJung-uk Kim mov ($ap,$j),%rax 7511f13597dSJung-uk Kim adc \$0,%rdx 7521f13597dSJung-uk Kim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 7531f13597dSJung-uk Kim adc \$0,%rdx 7547bded2dbSJung-uk Kim mov $N[1],-16($tp) # tp[j-1] 7551f13597dSJung-uk Kim mov %rdx,$N[0] 7561f13597dSJung-uk Kim 7571f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[0] 7581f13597dSJung-uk Kim add %rax,$A[0] 7594c6a0400SJung-uk Kim mov 8*0($np),%rax 7601f13597dSJung-uk Kim adc \$0,%rdx 7611f13597dSJung-uk Kim mov %rdx,$A[1] 7621f13597dSJung-uk Kim 7631f13597dSJung-uk Kim mulq $m1 # np[j]*m1 7641f13597dSJung-uk Kim add %rax,$N[0] 7657bded2dbSJung-uk Kim mov 8($ap,$j),%rax 7661f13597dSJung-uk Kim adc \$0,%rdx 7671f13597dSJung-uk Kim add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 7681f13597dSJung-uk Kim adc \$0,%rdx 7697bded2dbSJung-uk Kim mov $N[0],-8($tp) # tp[j-1] 7701f13597dSJung-uk Kim mov %rdx,$N[1] 7711f13597dSJung-uk Kim 7721f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[0] 7731f13597dSJung-uk Kim add %rax,$A[1] 7744c6a0400SJung-uk Kim mov 8*1($np),%rax 7751f13597dSJung-uk Kim adc \$0,%rdx 7761f13597dSJung-uk Kim mov %rdx,$A[0] 7771f13597dSJung-uk Kim 7781f13597dSJung-uk Kim mulq $m1 # np[j]*m1 7791f13597dSJung-uk Kim add %rax,$N[1] 7807bded2dbSJung-uk Kim 
mov 16($ap,$j),%rax 7811f13597dSJung-uk Kim adc \$0,%rdx 7821f13597dSJung-uk Kim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 7834c6a0400SJung-uk Kim lea 8*4($np),$np 7841f13597dSJung-uk Kim adc \$0,%rdx 7857bded2dbSJung-uk Kim mov $N[1],($tp) # tp[j-1] 7861f13597dSJung-uk Kim mov %rdx,$N[0] 7877bded2dbSJung-uk Kim 7887bded2dbSJung-uk Kim add \$32,$j # j+=4 7897bded2dbSJung-uk Kim jnz .L1st4x 7901f13597dSJung-uk Kim 7911f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[0] 7921f13597dSJung-uk Kim add %rax,$A[0] 7934c6a0400SJung-uk Kim mov -8*2($np),%rax 7947bded2dbSJung-uk Kim lea 32($tp),$tp 7951f13597dSJung-uk Kim adc \$0,%rdx 7961f13597dSJung-uk Kim mov %rdx,$A[1] 7971f13597dSJung-uk Kim 7981f13597dSJung-uk Kim mulq $m1 # np[j]*m1 7991f13597dSJung-uk Kim add %rax,$N[0] 8007bded2dbSJung-uk Kim mov -8($ap),%rax 8011f13597dSJung-uk Kim adc \$0,%rdx 8021f13597dSJung-uk Kim add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 8031f13597dSJung-uk Kim adc \$0,%rdx 8047bded2dbSJung-uk Kim mov $N[0],-24($tp) # tp[j-1] 8051f13597dSJung-uk Kim mov %rdx,$N[1] 8061f13597dSJung-uk Kim 8071f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[0] 8081f13597dSJung-uk Kim add %rax,$A[1] 8094c6a0400SJung-uk Kim mov -8*1($np),%rax 8101f13597dSJung-uk Kim adc \$0,%rdx 8111f13597dSJung-uk Kim mov %rdx,$A[0] 8121f13597dSJung-uk Kim 8131f13597dSJung-uk Kim mulq $m1 # np[j]*m1 8141f13597dSJung-uk Kim add %rax,$N[1] 8157bded2dbSJung-uk Kim mov ($ap,$num),%rax # ap[0] 8161f13597dSJung-uk Kim adc \$0,%rdx 8171f13597dSJung-uk Kim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 8181f13597dSJung-uk Kim adc \$0,%rdx 8197bded2dbSJung-uk Kim mov $N[1],-16($tp) # tp[j-1] 8201f13597dSJung-uk Kim mov %rdx,$N[0] 8211f13597dSJung-uk Kim 8224c6a0400SJung-uk Kim lea ($np,$num),$np # rewind $np 8231f13597dSJung-uk Kim 8241f13597dSJung-uk Kim xor $N[1],$N[1] 8251f13597dSJung-uk Kim add $A[0],$N[0] 8261f13597dSJung-uk Kim adc \$0,$N[1] 8277bded2dbSJung-uk Kim mov $N[0],-8($tp) 8281f13597dSJung-uk Kim 8297bded2dbSJung-uk Kim jmp .Louter4x 8307bded2dbSJung-uk 
Kim 8317bded2dbSJung-uk Kim.align 32 8321f13597dSJung-uk Kim.Louter4x: 8334c6a0400SJung-uk Kim lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 8344c6a0400SJung-uk Kim pxor %xmm4,%xmm4 8354c6a0400SJung-uk Kim pxor %xmm5,%xmm5 8364c6a0400SJung-uk Kim___ 8374c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16;$i+=4) { 8384c6a0400SJung-uk Kim$code.=<<___; 8394c6a0400SJung-uk Kim movdqa `16*($i+0)-128`($bp),%xmm0 8404c6a0400SJung-uk Kim movdqa `16*($i+1)-128`($bp),%xmm1 8414c6a0400SJung-uk Kim movdqa `16*($i+2)-128`($bp),%xmm2 8424c6a0400SJung-uk Kim movdqa `16*($i+3)-128`($bp),%xmm3 8434c6a0400SJung-uk Kim pand `16*($i+0)-128`(%rdx),%xmm0 8444c6a0400SJung-uk Kim pand `16*($i+1)-128`(%rdx),%xmm1 8454c6a0400SJung-uk Kim por %xmm0,%xmm4 8464c6a0400SJung-uk Kim pand `16*($i+2)-128`(%rdx),%xmm2 8474c6a0400SJung-uk Kim por %xmm1,%xmm5 8484c6a0400SJung-uk Kim pand `16*($i+3)-128`(%rdx),%xmm3 8494c6a0400SJung-uk Kim por %xmm2,%xmm4 8504c6a0400SJung-uk Kim por %xmm3,%xmm5 8514c6a0400SJung-uk Kim___ 8524c6a0400SJung-uk Kim} 8534c6a0400SJung-uk Kim$code.=<<___; 8544c6a0400SJung-uk Kim por %xmm5,%xmm4 8554c6a0400SJung-uk Kim pshufd \$0x4e,%xmm4,%xmm0 8564c6a0400SJung-uk Kim por %xmm4,%xmm0 8574c6a0400SJung-uk Kim lea $STRIDE($bp),$bp 8584c6a0400SJung-uk Kim movq %xmm0,$m0 # m0=bp[i] 8594c6a0400SJung-uk Kim 8607bded2dbSJung-uk Kim mov ($tp,$num),$A[0] 8611f13597dSJung-uk Kim mov $n0,$m1 8621f13597dSJung-uk Kim mulq $m0 # ap[0]*bp[i] 8631f13597dSJung-uk Kim add %rax,$A[0] # ap[0]*bp[i]+tp[0] 8641f13597dSJung-uk Kim mov ($np),%rax 8651f13597dSJung-uk Kim adc \$0,%rdx 8661f13597dSJung-uk Kim 8677bded2dbSJung-uk Kim imulq $A[0],$m1 # tp[0]*n0 8687bded2dbSJung-uk Kim mov %rdx,$A[1] 8697bded2dbSJung-uk Kim mov $N[1],($tp) # store upmost overflow bit 8707bded2dbSJung-uk Kim 8717bded2dbSJung-uk Kim lea ($tp,$num),$tp # rewind $tp 8721f13597dSJung-uk Kim 8731f13597dSJung-uk Kim mulq $m1 # np[0]*m1 8741f13597dSJung-uk Kim add %rax,$A[0] # "$N[0]", discarded 8757bded2dbSJung-uk Kim 
mov 8($ap,$num),%rax 8761f13597dSJung-uk Kim adc \$0,%rdx 8771f13597dSJung-uk Kim mov %rdx,$N[1] 8781f13597dSJung-uk Kim 8791f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 8801f13597dSJung-uk Kim add %rax,$A[1] 8814c6a0400SJung-uk Kim mov 8*1($np),%rax 8821f13597dSJung-uk Kim adc \$0,%rdx 8837bded2dbSJung-uk Kim add 8($tp),$A[1] # +tp[1] 8841f13597dSJung-uk Kim adc \$0,%rdx 8851f13597dSJung-uk Kim mov %rdx,$A[0] 8861f13597dSJung-uk Kim 8871f13597dSJung-uk Kim mulq $m1 # np[j]*m1 8881f13597dSJung-uk Kim add %rax,$N[1] 8897bded2dbSJung-uk Kim mov 16($ap,$num),%rax 8901f13597dSJung-uk Kim adc \$0,%rdx 8911f13597dSJung-uk Kim add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 8927bded2dbSJung-uk Kim lea 4*8($num),$j # j=4 8934c6a0400SJung-uk Kim lea 8*4($np),$np 8941f13597dSJung-uk Kim adc \$0,%rdx 8951f13597dSJung-uk Kim mov %rdx,$N[0] 8961f13597dSJung-uk Kim jmp .Linner4x 8977bded2dbSJung-uk Kim 8987bded2dbSJung-uk Kim.align 32 8991f13597dSJung-uk Kim.Linner4x: 9001f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 9011f13597dSJung-uk Kim add %rax,$A[0] 9024c6a0400SJung-uk Kim mov -8*2($np),%rax 9031f13597dSJung-uk Kim adc \$0,%rdx 9047bded2dbSJung-uk Kim add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 9057bded2dbSJung-uk Kim lea 32($tp),$tp 9061f13597dSJung-uk Kim adc \$0,%rdx 9071f13597dSJung-uk Kim mov %rdx,$A[1] 9081f13597dSJung-uk Kim 9091f13597dSJung-uk Kim mulq $m1 # np[j]*m1 9101f13597dSJung-uk Kim add %rax,$N[0] 9117bded2dbSJung-uk Kim mov -8($ap,$j),%rax 9121f13597dSJung-uk Kim adc \$0,%rdx 9131f13597dSJung-uk Kim add $A[0],$N[0] 9141f13597dSJung-uk Kim adc \$0,%rdx 9157bded2dbSJung-uk Kim mov $N[1],-32($tp) # tp[j-1] 9161f13597dSJung-uk Kim mov %rdx,$N[1] 9171f13597dSJung-uk Kim 9181f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 9191f13597dSJung-uk Kim add %rax,$A[1] 9204c6a0400SJung-uk Kim mov -8*1($np),%rax 9211f13597dSJung-uk Kim adc \$0,%rdx 9227bded2dbSJung-uk Kim add -8($tp),$A[1] 9231f13597dSJung-uk Kim adc \$0,%rdx 9241f13597dSJung-uk Kim mov %rdx,$A[0] 9251f13597dSJung-uk Kim 
9261f13597dSJung-uk Kim mulq $m1 # np[j]*m1 9271f13597dSJung-uk Kim add %rax,$N[1] 9287bded2dbSJung-uk Kim mov ($ap,$j),%rax 9291f13597dSJung-uk Kim adc \$0,%rdx 9301f13597dSJung-uk Kim add $A[1],$N[1] 9311f13597dSJung-uk Kim adc \$0,%rdx 9327bded2dbSJung-uk Kim mov $N[0],-24($tp) # tp[j-1] 9331f13597dSJung-uk Kim mov %rdx,$N[0] 9341f13597dSJung-uk Kim 9351f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 9361f13597dSJung-uk Kim add %rax,$A[0] 9374c6a0400SJung-uk Kim mov 8*0($np),%rax 9381f13597dSJung-uk Kim adc \$0,%rdx 9397bded2dbSJung-uk Kim add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 9401f13597dSJung-uk Kim adc \$0,%rdx 9411f13597dSJung-uk Kim mov %rdx,$A[1] 9421f13597dSJung-uk Kim 9431f13597dSJung-uk Kim mulq $m1 # np[j]*m1 9441f13597dSJung-uk Kim add %rax,$N[0] 9457bded2dbSJung-uk Kim mov 8($ap,$j),%rax 9461f13597dSJung-uk Kim adc \$0,%rdx 9471f13597dSJung-uk Kim add $A[0],$N[0] 9481f13597dSJung-uk Kim adc \$0,%rdx 9497bded2dbSJung-uk Kim mov $N[1],-16($tp) # tp[j-1] 9501f13597dSJung-uk Kim mov %rdx,$N[1] 9511f13597dSJung-uk Kim 9521f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 9531f13597dSJung-uk Kim add %rax,$A[1] 9544c6a0400SJung-uk Kim mov 8*1($np),%rax 9551f13597dSJung-uk Kim adc \$0,%rdx 9567bded2dbSJung-uk Kim add 8($tp),$A[1] 9571f13597dSJung-uk Kim adc \$0,%rdx 9581f13597dSJung-uk Kim mov %rdx,$A[0] 9591f13597dSJung-uk Kim 9601f13597dSJung-uk Kim mulq $m1 # np[j]*m1 9611f13597dSJung-uk Kim add %rax,$N[1] 9627bded2dbSJung-uk Kim mov 16($ap,$j),%rax 9631f13597dSJung-uk Kim adc \$0,%rdx 9641f13597dSJung-uk Kim add $A[1],$N[1] 9654c6a0400SJung-uk Kim lea 8*4($np),$np 9661f13597dSJung-uk Kim adc \$0,%rdx 9677bded2dbSJung-uk Kim mov $N[0],-8($tp) # tp[j-1] 9681f13597dSJung-uk Kim mov %rdx,$N[0] 9697bded2dbSJung-uk Kim 9707bded2dbSJung-uk Kim add \$32,$j # j+=4 9717bded2dbSJung-uk Kim jnz .Linner4x 9721f13597dSJung-uk Kim 9731f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 9741f13597dSJung-uk Kim add %rax,$A[0] 9754c6a0400SJung-uk Kim mov -8*2($np),%rax 9761f13597dSJung-uk Kim 
adc \$0,%rdx 9777bded2dbSJung-uk Kim add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 9787bded2dbSJung-uk Kim lea 32($tp),$tp 9791f13597dSJung-uk Kim adc \$0,%rdx 9801f13597dSJung-uk Kim mov %rdx,$A[1] 9811f13597dSJung-uk Kim 9821f13597dSJung-uk Kim mulq $m1 # np[j]*m1 9831f13597dSJung-uk Kim add %rax,$N[0] 9847bded2dbSJung-uk Kim mov -8($ap),%rax 9851f13597dSJung-uk Kim adc \$0,%rdx 9861f13597dSJung-uk Kim add $A[0],$N[0] 9871f13597dSJung-uk Kim adc \$0,%rdx 9887bded2dbSJung-uk Kim mov $N[1],-32($tp) # tp[j-1] 9891f13597dSJung-uk Kim mov %rdx,$N[1] 9901f13597dSJung-uk Kim 9911f13597dSJung-uk Kim mulq $m0 # ap[j]*bp[i] 9921f13597dSJung-uk Kim add %rax,$A[1] 9937bded2dbSJung-uk Kim mov $m1,%rax 9944c6a0400SJung-uk Kim mov -8*1($np),$m1 9951f13597dSJung-uk Kim adc \$0,%rdx 9967bded2dbSJung-uk Kim add -8($tp),$A[1] 9971f13597dSJung-uk Kim adc \$0,%rdx 9981f13597dSJung-uk Kim mov %rdx,$A[0] 9991f13597dSJung-uk Kim 10001f13597dSJung-uk Kim mulq $m1 # np[j]*m1 10011f13597dSJung-uk Kim add %rax,$N[1] 10027bded2dbSJung-uk Kim mov ($ap,$num),%rax # ap[0] 10031f13597dSJung-uk Kim adc \$0,%rdx 10041f13597dSJung-uk Kim add $A[1],$N[1] 10051f13597dSJung-uk Kim adc \$0,%rdx 10067bded2dbSJung-uk Kim mov $N[0],-24($tp) # tp[j-1] 10071f13597dSJung-uk Kim mov %rdx,$N[0] 10081f13597dSJung-uk Kim 10097bded2dbSJung-uk Kim mov $N[1],-16($tp) # tp[j-1] 10104c6a0400SJung-uk Kim lea ($np,$num),$np # rewind $np 10111f13597dSJung-uk Kim 10121f13597dSJung-uk Kim xor $N[1],$N[1] 10131f13597dSJung-uk Kim add $A[0],$N[0] 10141f13597dSJung-uk Kim adc \$0,$N[1] 10157bded2dbSJung-uk Kim add ($tp),$N[0] # pull upmost overflow bit 10167bded2dbSJung-uk Kim adc \$0,$N[1] # upmost overflow bit 10177bded2dbSJung-uk Kim mov $N[0],-8($tp) 10181f13597dSJung-uk Kim 10197bded2dbSJung-uk Kim cmp 16+8(%rsp),$bp 10207bded2dbSJung-uk Kim jb .Louter4x 10211f13597dSJung-uk Kim___ 10227bded2dbSJung-uk Kimif (1) { 10231f13597dSJung-uk Kim$code.=<<___; 10244c6a0400SJung-uk Kim xor %rax,%rax 10257bded2dbSJung-uk Kim sub $N[0],$m1 
# compare top-most words 10267bded2dbSJung-uk Kim adc $j,$j # $j is zero 10277bded2dbSJung-uk Kim or $j,$N[1] 10284c6a0400SJung-uk Kim sub $N[1],%rax # %rax=-$N[1] 10297bded2dbSJung-uk Kim lea ($tp,$num),%rbx # tptr in .sqr4x_sub 10304c6a0400SJung-uk Kim mov ($np),%r12 10314c6a0400SJung-uk Kim lea ($np),%rbp # nptr in .sqr4x_sub 10327bded2dbSJung-uk Kim mov %r9,%rcx 10334c6a0400SJung-uk Kim sar \$3+2,%rcx 10347bded2dbSJung-uk Kim mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 10354c6a0400SJung-uk Kim dec %r12 # so that after 'not' we get -n[0] 10364c6a0400SJung-uk Kim xor %r10,%r10 10374c6a0400SJung-uk Kim mov 8*1(%rbp),%r13 10384c6a0400SJung-uk Kim mov 8*2(%rbp),%r14 10394c6a0400SJung-uk Kim mov 8*3(%rbp),%r15 10404c6a0400SJung-uk Kim jmp .Lsqr4x_sub_entry 10417bded2dbSJung-uk Kim___ 10427bded2dbSJung-uk Kim} else { 10437bded2dbSJung-uk Kimmy @ri=("%rax",$bp,$m0,$m1); 10447bded2dbSJung-uk Kimmy $rp="%rdx"; 10457bded2dbSJung-uk Kim$code.=<<___ 10467bded2dbSJung-uk Kim xor \$1,$N[1] 10477bded2dbSJung-uk Kim lea ($tp,$num),$tp # rewind $tp 10487bded2dbSJung-uk Kim sar \$5,$num # cf=0 10497bded2dbSJung-uk Kim lea ($np,$N[1],8),$np 10507bded2dbSJung-uk Kim mov 56+8(%rsp),$rp # restore $rp 10511f13597dSJung-uk Kim jmp .Lsub4x 10527bded2dbSJung-uk Kim 10537bded2dbSJung-uk Kim.align 32 10541f13597dSJung-uk Kim.Lsub4x: 10557bded2dbSJung-uk Kim .byte 0x66 10567bded2dbSJung-uk Kim mov 8*0($tp),@ri[0] 10577bded2dbSJung-uk Kim mov 8*1($tp),@ri[1] 10587bded2dbSJung-uk Kim .byte 0x66 10597bded2dbSJung-uk Kim sbb 16*0($np),@ri[0] 10607bded2dbSJung-uk Kim mov 8*2($tp),@ri[2] 10617bded2dbSJung-uk Kim sbb 16*1($np),@ri[1] 10627bded2dbSJung-uk Kim mov 3*8($tp),@ri[3] 10637bded2dbSJung-uk Kim lea 4*8($tp),$tp 10647bded2dbSJung-uk Kim sbb 16*2($np),@ri[2] 10657bded2dbSJung-uk Kim mov @ri[0],8*0($rp) 10667bded2dbSJung-uk Kim sbb 16*3($np),@ri[3] 10677bded2dbSJung-uk Kim lea 16*4($np),$np 10687bded2dbSJung-uk Kim mov @ri[1],8*1($rp) 10697bded2dbSJung-uk Kim mov @ri[2],8*2($rp) 
10707bded2dbSJung-uk Kim mov @ri[3],8*3($rp) 10717bded2dbSJung-uk Kim lea 8*4($rp),$rp 10727bded2dbSJung-uk Kim 10737bded2dbSJung-uk Kim inc $num 10741f13597dSJung-uk Kim jnz .Lsub4x 10751f13597dSJung-uk Kim 10767bded2dbSJung-uk Kim ret 10771f13597dSJung-uk Kim___ 10781f13597dSJung-uk Kim} 10791f13597dSJung-uk Kim$code.=<<___; 108017f01e99SJung-uk Kim.cfi_endproc 10817bded2dbSJung-uk Kim.size mul4x_internal,.-mul4x_internal 10827bded2dbSJung-uk Kim___ 10837bded2dbSJung-uk Kim}}} 10847bded2dbSJung-uk Kim{{{ 10857bded2dbSJung-uk Kim###################################################################### 10867bded2dbSJung-uk Kim# void bn_power5( 10877bded2dbSJung-uk Kimmy $rptr="%rdi"; # BN_ULONG *rptr, 10887bded2dbSJung-uk Kimmy $aptr="%rsi"; # const BN_ULONG *aptr, 10897bded2dbSJung-uk Kimmy $bptr="%rdx"; # const void *table, 10907bded2dbSJung-uk Kimmy $nptr="%rcx"; # const BN_ULONG *nptr, 10917bded2dbSJung-uk Kimmy $n0 ="%r8"; # const BN_ULONG *n0); 10927bded2dbSJung-uk Kimmy $num ="%r9"; # int num, has to be divisible by 8 10937bded2dbSJung-uk Kim # int pwr 10947bded2dbSJung-uk Kim 10957bded2dbSJung-uk Kimmy ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 10967bded2dbSJung-uk Kimmy @A0=("%r10","%r11"); 10977bded2dbSJung-uk Kimmy @A1=("%r12","%r13"); 10987bded2dbSJung-uk Kimmy ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 10997bded2dbSJung-uk Kim 11007bded2dbSJung-uk Kim$code.=<<___; 11017bded2dbSJung-uk Kim.globl bn_power5 11027bded2dbSJung-uk Kim.type bn_power5,\@function,6 11037bded2dbSJung-uk Kim.align 32 11047bded2dbSJung-uk Kimbn_power5: 1105e71b7053SJung-uk Kim.cfi_startproc 1106aeb5019cSJung-uk Kim mov %rsp,%rax 1107e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 11087bded2dbSJung-uk Kim___ 11097bded2dbSJung-uk Kim$code.=<<___ if ($addx); 11107bded2dbSJung-uk Kim mov OPENSSL_ia32cap_P+8(%rip),%r11d 11114c6a0400SJung-uk Kim and \$0x80108,%r11d 11124c6a0400SJung-uk Kim cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 11137bded2dbSJung-uk Kim je .Lpowerx5_enter 11147bded2dbSJung-uk 
Kim___ 11157bded2dbSJung-uk Kim$code.=<<___; 11167bded2dbSJung-uk Kim push %rbx 1117e71b7053SJung-uk Kim.cfi_push %rbx 11187bded2dbSJung-uk Kim push %rbp 1119e71b7053SJung-uk Kim.cfi_push %rbp 11207bded2dbSJung-uk Kim push %r12 1121e71b7053SJung-uk Kim.cfi_push %r12 11227bded2dbSJung-uk Kim push %r13 1123e71b7053SJung-uk Kim.cfi_push %r13 11247bded2dbSJung-uk Kim push %r14 1125e71b7053SJung-uk Kim.cfi_push %r14 11267bded2dbSJung-uk Kim push %r15 1127e71b7053SJung-uk Kim.cfi_push %r15 1128aeb5019cSJung-uk Kim.Lpower5_prologue: 11294c6a0400SJung-uk Kim 11307bded2dbSJung-uk Kim shl \$3,${num}d # convert $num to bytes 11314c6a0400SJung-uk Kim lea ($num,$num,2),%r10d # 3*$num 11327bded2dbSJung-uk Kim neg $num 11337bded2dbSJung-uk Kim mov ($n0),$n0 # *n0 11347bded2dbSJung-uk Kim 11357bded2dbSJung-uk Kim ############################################################## 11364c6a0400SJung-uk Kim # Ensure that stack frame doesn't alias with $rptr+3*$num 11374c6a0400SJung-uk Kim # modulo 4096, which covers ret[num], am[num] and n[num] 11384c6a0400SJung-uk Kim # (see bn_exp.c). This is done to allow memory disambiguation 11394c6a0400SJung-uk Kim # logic do its magic. [Extra 256 bytes is for power mask 11404c6a0400SJung-uk Kim # calculated from 7th argument, the index.] 
11417bded2dbSJung-uk Kim # 11424c6a0400SJung-uk Kim lea -320(%rsp,$num,2),%r11 1143aeb5019cSJung-uk Kim mov %rsp,%rbp 11444c6a0400SJung-uk Kim sub $rptr,%r11 11457bded2dbSJung-uk Kim and \$4095,%r11 11467bded2dbSJung-uk Kim cmp %r11,%r10 11477bded2dbSJung-uk Kim jb .Lpwr_sp_alt 1148aeb5019cSJung-uk Kim sub %r11,%rbp # align with $aptr 1149aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 11507bded2dbSJung-uk Kim jmp .Lpwr_sp_done 11517bded2dbSJung-uk Kim 11527bded2dbSJung-uk Kim.align 32 11537bded2dbSJung-uk Kim.Lpwr_sp_alt: 11544c6a0400SJung-uk Kim lea 4096-320(,$num,2),%r10 1155aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 11567bded2dbSJung-uk Kim sub %r10,%r11 11577bded2dbSJung-uk Kim mov \$0,%r10 11587bded2dbSJung-uk Kim cmovc %r10,%r11 1159aeb5019cSJung-uk Kim sub %r11,%rbp 11607bded2dbSJung-uk Kim.Lpwr_sp_done: 1161aeb5019cSJung-uk Kim and \$-64,%rbp 1162aeb5019cSJung-uk Kim mov %rsp,%r11 1163aeb5019cSJung-uk Kim sub %rbp,%r11 1164b8721c16SJung-uk Kim and \$-4096,%r11 1165aeb5019cSJung-uk Kim lea (%rbp,%r11),%rsp 1166aeb5019cSJung-uk Kim mov (%rsp),%r10 1167aeb5019cSJung-uk Kim cmp %rbp,%rsp 1168aeb5019cSJung-uk Kim ja .Lpwr_page_walk 1169aeb5019cSJung-uk Kim jmp .Lpwr_page_walk_done 1170aeb5019cSJung-uk Kim 1171b8721c16SJung-uk Kim.Lpwr_page_walk: 1172aeb5019cSJung-uk Kim lea -4096(%rsp),%rsp 1173aeb5019cSJung-uk Kim mov (%rsp),%r10 1174aeb5019cSJung-uk Kim cmp %rbp,%rsp 1175aeb5019cSJung-uk Kim ja .Lpwr_page_walk 1176aeb5019cSJung-uk Kim.Lpwr_page_walk_done: 1177b8721c16SJung-uk Kim 11787bded2dbSJung-uk Kim mov $num,%r10 11797bded2dbSJung-uk Kim neg $num 11807bded2dbSJung-uk Kim 11817bded2dbSJung-uk Kim ############################################################## 11827bded2dbSJung-uk Kim # Stack layout 11837bded2dbSJung-uk Kim # 11847bded2dbSJung-uk Kim # +0 saved $num, used in reduction section 11857bded2dbSJung-uk Kim # +8 &t[2*$num], used in reduction section 11867bded2dbSJung-uk Kim 
# +32 saved *n0 11877bded2dbSJung-uk Kim # +40 saved %rsp 11887bded2dbSJung-uk Kim # +48 t[2*$num] 11897bded2dbSJung-uk Kim # 11907bded2dbSJung-uk Kim mov $n0, 32(%rsp) 11917bded2dbSJung-uk Kim mov %rax, 40(%rsp) # save original %rsp 1192e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+40,deref,+8 11937bded2dbSJung-uk Kim.Lpower5_body: 11944c6a0400SJung-uk Kim movq $rptr,%xmm1 # save $rptr, used in sqr8x 11957bded2dbSJung-uk Kim movq $nptr,%xmm2 # save $nptr 11964c6a0400SJung-uk Kim movq %r10, %xmm3 # -$num, used in sqr8x 11977bded2dbSJung-uk Kim movq $bptr,%xmm4 11987bded2dbSJung-uk Kim 11997bded2dbSJung-uk Kim call __bn_sqr8x_internal 12004c6a0400SJung-uk Kim call __bn_post4x_internal 12017bded2dbSJung-uk Kim call __bn_sqr8x_internal 12024c6a0400SJung-uk Kim call __bn_post4x_internal 12037bded2dbSJung-uk Kim call __bn_sqr8x_internal 12044c6a0400SJung-uk Kim call __bn_post4x_internal 12057bded2dbSJung-uk Kim call __bn_sqr8x_internal 12064c6a0400SJung-uk Kim call __bn_post4x_internal 12077bded2dbSJung-uk Kim call __bn_sqr8x_internal 12084c6a0400SJung-uk Kim call __bn_post4x_internal 12097bded2dbSJung-uk Kim 12107bded2dbSJung-uk Kim movq %xmm2,$nptr 12117bded2dbSJung-uk Kim movq %xmm4,$bptr 12127bded2dbSJung-uk Kim mov $aptr,$rptr 12137bded2dbSJung-uk Kim mov 40(%rsp),%rax 12147bded2dbSJung-uk Kim lea 32(%rsp),$n0 12157bded2dbSJung-uk Kim 12167bded2dbSJung-uk Kim call mul4x_internal 12177bded2dbSJung-uk Kim 12187bded2dbSJung-uk Kim mov 40(%rsp),%rsi # restore %rsp 1219e71b7053SJung-uk Kim.cfi_def_cfa %rsi,8 12207bded2dbSJung-uk Kim mov \$1,%rax 12217bded2dbSJung-uk Kim mov -48(%rsi),%r15 1222e71b7053SJung-uk Kim.cfi_restore %r15 12237bded2dbSJung-uk Kim mov -40(%rsi),%r14 1224e71b7053SJung-uk Kim.cfi_restore %r14 12257bded2dbSJung-uk Kim mov -32(%rsi),%r13 1226e71b7053SJung-uk Kim.cfi_restore %r13 12277bded2dbSJung-uk Kim mov -24(%rsi),%r12 1228e71b7053SJung-uk Kim.cfi_restore %r12 12297bded2dbSJung-uk Kim mov -16(%rsi),%rbp 1230e71b7053SJung-uk Kim.cfi_restore %rbp 
12317bded2dbSJung-uk Kim mov -8(%rsi),%rbx 1232e71b7053SJung-uk Kim.cfi_restore %rbx 12337bded2dbSJung-uk Kim lea (%rsi),%rsp 1234e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 12357bded2dbSJung-uk Kim.Lpower5_epilogue: 12367bded2dbSJung-uk Kim ret 1237e71b7053SJung-uk Kim.cfi_endproc 12387bded2dbSJung-uk Kim.size bn_power5,.-bn_power5 12397bded2dbSJung-uk Kim 12407bded2dbSJung-uk Kim.globl bn_sqr8x_internal 12417bded2dbSJung-uk Kim.hidden bn_sqr8x_internal 12427bded2dbSJung-uk Kim.type bn_sqr8x_internal,\@abi-omnipotent 12437bded2dbSJung-uk Kim.align 32 12447bded2dbSJung-uk Kimbn_sqr8x_internal: 12457bded2dbSJung-uk Kim__bn_sqr8x_internal: 124617f01e99SJung-uk Kim.cfi_startproc 12477bded2dbSJung-uk Kim ############################################################## 12487bded2dbSJung-uk Kim # Squaring part: 12497bded2dbSJung-uk Kim # 12507bded2dbSJung-uk Kim # a) multiply-n-add everything but a[i]*a[i]; 12517bded2dbSJung-uk Kim # b) shift result of a) by 1 to the left and accumulate 12527bded2dbSJung-uk Kim # a[i]*a[i] products; 12537bded2dbSJung-uk Kim # 12547bded2dbSJung-uk Kim ############################################################## 12557bded2dbSJung-uk Kim # a[1]a[0] 12567bded2dbSJung-uk Kim # a[2]a[0] 12577bded2dbSJung-uk Kim # a[3]a[0] 12587bded2dbSJung-uk Kim # a[2]a[1] 12597bded2dbSJung-uk Kim # a[4]a[0] 12607bded2dbSJung-uk Kim # a[3]a[1] 12617bded2dbSJung-uk Kim # a[5]a[0] 12627bded2dbSJung-uk Kim # a[4]a[1] 12637bded2dbSJung-uk Kim # a[3]a[2] 12647bded2dbSJung-uk Kim # a[6]a[0] 12657bded2dbSJung-uk Kim # a[5]a[1] 12667bded2dbSJung-uk Kim # a[4]a[2] 12677bded2dbSJung-uk Kim # a[7]a[0] 12687bded2dbSJung-uk Kim # a[6]a[1] 12697bded2dbSJung-uk Kim # a[5]a[2] 12707bded2dbSJung-uk Kim # a[4]a[3] 12717bded2dbSJung-uk Kim # a[7]a[1] 12727bded2dbSJung-uk Kim # a[6]a[2] 12737bded2dbSJung-uk Kim # a[5]a[3] 12747bded2dbSJung-uk Kim # a[7]a[2] 12757bded2dbSJung-uk Kim # a[6]a[3] 12767bded2dbSJung-uk Kim # a[5]a[4] 12777bded2dbSJung-uk Kim # a[7]a[3] 
12787bded2dbSJung-uk Kim # a[6]a[4] 12797bded2dbSJung-uk Kim # a[7]a[4] 12807bded2dbSJung-uk Kim # a[6]a[5] 12817bded2dbSJung-uk Kim # a[7]a[5] 12827bded2dbSJung-uk Kim # a[7]a[6] 12837bded2dbSJung-uk Kim # a[1]a[0] 12847bded2dbSJung-uk Kim # a[2]a[0] 12857bded2dbSJung-uk Kim # a[3]a[0] 12867bded2dbSJung-uk Kim # a[4]a[0] 12877bded2dbSJung-uk Kim # a[5]a[0] 12887bded2dbSJung-uk Kim # a[6]a[0] 12897bded2dbSJung-uk Kim # a[7]a[0] 12907bded2dbSJung-uk Kim # a[2]a[1] 12917bded2dbSJung-uk Kim # a[3]a[1] 12927bded2dbSJung-uk Kim # a[4]a[1] 12937bded2dbSJung-uk Kim # a[5]a[1] 12947bded2dbSJung-uk Kim # a[6]a[1] 12957bded2dbSJung-uk Kim # a[7]a[1] 12967bded2dbSJung-uk Kim # a[3]a[2] 12977bded2dbSJung-uk Kim # a[4]a[2] 12987bded2dbSJung-uk Kim # a[5]a[2] 12997bded2dbSJung-uk Kim # a[6]a[2] 13007bded2dbSJung-uk Kim # a[7]a[2] 13017bded2dbSJung-uk Kim # a[4]a[3] 13027bded2dbSJung-uk Kim # a[5]a[3] 13037bded2dbSJung-uk Kim # a[6]a[3] 13047bded2dbSJung-uk Kim # a[7]a[3] 13057bded2dbSJung-uk Kim # a[5]a[4] 13067bded2dbSJung-uk Kim # a[6]a[4] 13077bded2dbSJung-uk Kim # a[7]a[4] 13087bded2dbSJung-uk Kim # a[6]a[5] 13097bded2dbSJung-uk Kim # a[7]a[5] 13107bded2dbSJung-uk Kim # a[7]a[6] 13117bded2dbSJung-uk Kim # a[0]a[0] 13127bded2dbSJung-uk Kim # a[1]a[1] 13137bded2dbSJung-uk Kim # a[2]a[2] 13147bded2dbSJung-uk Kim # a[3]a[3] 13157bded2dbSJung-uk Kim # a[4]a[4] 13167bded2dbSJung-uk Kim # a[5]a[5] 13177bded2dbSJung-uk Kim # a[6]a[6] 13187bded2dbSJung-uk Kim # a[7]a[7] 13197bded2dbSJung-uk Kim 13207bded2dbSJung-uk Kim lea 32(%r10),$i # $i=-($num-32) 13217bded2dbSJung-uk Kim lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 13227bded2dbSJung-uk Kim 13237bded2dbSJung-uk Kim mov $num,$j # $j=$num 13247bded2dbSJung-uk Kim 13257bded2dbSJung-uk Kim # comments apply to $num==8 case 13267bded2dbSJung-uk Kim mov -32($aptr,$i),$a0 # a[0] 13277bded2dbSJung-uk Kim lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 13287bded2dbSJung-uk Kim mov -24($aptr,$i),%rax # a[1] 
13297bded2dbSJung-uk Kim lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 13307bded2dbSJung-uk Kim mov -16($aptr,$i),$ai # a[2] 13317bded2dbSJung-uk Kim mov %rax,$a1 13327bded2dbSJung-uk Kim 13337bded2dbSJung-uk Kim mul $a0 # a[1]*a[0] 13347bded2dbSJung-uk Kim mov %rax,$A0[0] # a[1]*a[0] 13357bded2dbSJung-uk Kim mov $ai,%rax # a[2] 13367bded2dbSJung-uk Kim mov %rdx,$A0[1] 13377bded2dbSJung-uk Kim mov $A0[0],-24($tptr,$i) # t[1] 13387bded2dbSJung-uk Kim 13397bded2dbSJung-uk Kim mul $a0 # a[2]*a[0] 13407bded2dbSJung-uk Kim add %rax,$A0[1] 13417bded2dbSJung-uk Kim mov $ai,%rax 13427bded2dbSJung-uk Kim adc \$0,%rdx 13437bded2dbSJung-uk Kim mov $A0[1],-16($tptr,$i) # t[2] 13447bded2dbSJung-uk Kim mov %rdx,$A0[0] 13457bded2dbSJung-uk Kim 13467bded2dbSJung-uk Kim 13477bded2dbSJung-uk Kim mov -8($aptr,$i),$ai # a[3] 13487bded2dbSJung-uk Kim mul $a1 # a[2]*a[1] 13497bded2dbSJung-uk Kim mov %rax,$A1[0] # a[2]*a[1]+t[3] 13507bded2dbSJung-uk Kim mov $ai,%rax 13517bded2dbSJung-uk Kim mov %rdx,$A1[1] 13527bded2dbSJung-uk Kim 13537bded2dbSJung-uk Kim lea ($i),$j 13547bded2dbSJung-uk Kim mul $a0 # a[3]*a[0] 13557bded2dbSJung-uk Kim add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 13567bded2dbSJung-uk Kim mov $ai,%rax 13577bded2dbSJung-uk Kim mov %rdx,$A0[1] 13587bded2dbSJung-uk Kim adc \$0,$A0[1] 13597bded2dbSJung-uk Kim add $A1[0],$A0[0] 13607bded2dbSJung-uk Kim adc \$0,$A0[1] 13617bded2dbSJung-uk Kim mov $A0[0],-8($tptr,$j) # t[3] 13627bded2dbSJung-uk Kim jmp .Lsqr4x_1st 13637bded2dbSJung-uk Kim 13647bded2dbSJung-uk Kim.align 32 13657bded2dbSJung-uk Kim.Lsqr4x_1st: 13667bded2dbSJung-uk Kim mov ($aptr,$j),$ai # a[4] 13677bded2dbSJung-uk Kim mul $a1 # a[3]*a[1] 13687bded2dbSJung-uk Kim add %rax,$A1[1] # a[3]*a[1]+t[4] 13697bded2dbSJung-uk Kim mov $ai,%rax 13707bded2dbSJung-uk Kim mov %rdx,$A1[0] 13717bded2dbSJung-uk Kim adc \$0,$A1[0] 13727bded2dbSJung-uk Kim 13737bded2dbSJung-uk Kim mul $a0 # a[4]*a[0] 13747bded2dbSJung-uk Kim add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 
13757bded2dbSJung-uk Kim mov $ai,%rax # a[3] 13767bded2dbSJung-uk Kim mov 8($aptr,$j),$ai # a[5] 13777bded2dbSJung-uk Kim mov %rdx,$A0[0] 13787bded2dbSJung-uk Kim adc \$0,$A0[0] 13797bded2dbSJung-uk Kim add $A1[1],$A0[1] 13807bded2dbSJung-uk Kim adc \$0,$A0[0] 13817bded2dbSJung-uk Kim 13827bded2dbSJung-uk Kim 13837bded2dbSJung-uk Kim mul $a1 # a[4]*a[3] 13847bded2dbSJung-uk Kim add %rax,$A1[0] # a[4]*a[3]+t[5] 13857bded2dbSJung-uk Kim mov $ai,%rax 13867bded2dbSJung-uk Kim mov $A0[1],($tptr,$j) # t[4] 13877bded2dbSJung-uk Kim mov %rdx,$A1[1] 13887bded2dbSJung-uk Kim adc \$0,$A1[1] 13897bded2dbSJung-uk Kim 13907bded2dbSJung-uk Kim mul $a0 # a[5]*a[2] 13917bded2dbSJung-uk Kim add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 13927bded2dbSJung-uk Kim mov $ai,%rax 13937bded2dbSJung-uk Kim mov 16($aptr,$j),$ai # a[6] 13947bded2dbSJung-uk Kim mov %rdx,$A0[1] 13957bded2dbSJung-uk Kim adc \$0,$A0[1] 13967bded2dbSJung-uk Kim add $A1[0],$A0[0] 13977bded2dbSJung-uk Kim adc \$0,$A0[1] 13987bded2dbSJung-uk Kim 13997bded2dbSJung-uk Kim mul $a1 # a[5]*a[3] 14007bded2dbSJung-uk Kim add %rax,$A1[1] # a[5]*a[3]+t[6] 14017bded2dbSJung-uk Kim mov $ai,%rax 14027bded2dbSJung-uk Kim mov $A0[0],8($tptr,$j) # t[5] 14037bded2dbSJung-uk Kim mov %rdx,$A1[0] 14047bded2dbSJung-uk Kim adc \$0,$A1[0] 14057bded2dbSJung-uk Kim 14067bded2dbSJung-uk Kim mul $a0 # a[6]*a[2] 14077bded2dbSJung-uk Kim add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 14087bded2dbSJung-uk Kim mov $ai,%rax # a[3] 14097bded2dbSJung-uk Kim mov 24($aptr,$j),$ai # a[7] 14107bded2dbSJung-uk Kim mov %rdx,$A0[0] 14117bded2dbSJung-uk Kim adc \$0,$A0[0] 14127bded2dbSJung-uk Kim add $A1[1],$A0[1] 14137bded2dbSJung-uk Kim adc \$0,$A0[0] 14147bded2dbSJung-uk Kim 14157bded2dbSJung-uk Kim 14167bded2dbSJung-uk Kim mul $a1 # a[6]*a[5] 14177bded2dbSJung-uk Kim add %rax,$A1[0] # a[6]*a[5]+t[7] 14187bded2dbSJung-uk Kim mov $ai,%rax 14197bded2dbSJung-uk Kim mov $A0[1],16($tptr,$j) # t[6] 14207bded2dbSJung-uk Kim mov %rdx,$A1[1] 14217bded2dbSJung-uk Kim adc 
\$0,$A1[1] 14227bded2dbSJung-uk Kim lea 32($j),$j 14237bded2dbSJung-uk Kim 14247bded2dbSJung-uk Kim mul $a0 # a[7]*a[4] 14257bded2dbSJung-uk Kim add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 14267bded2dbSJung-uk Kim mov $ai,%rax 14277bded2dbSJung-uk Kim mov %rdx,$A0[1] 14287bded2dbSJung-uk Kim adc \$0,$A0[1] 14297bded2dbSJung-uk Kim add $A1[0],$A0[0] 14307bded2dbSJung-uk Kim adc \$0,$A0[1] 14317bded2dbSJung-uk Kim mov $A0[0],-8($tptr,$j) # t[7] 14327bded2dbSJung-uk Kim 14337bded2dbSJung-uk Kim cmp \$0,$j 14347bded2dbSJung-uk Kim jne .Lsqr4x_1st 14357bded2dbSJung-uk Kim 14367bded2dbSJung-uk Kim mul $a1 # a[7]*a[5] 14377bded2dbSJung-uk Kim add %rax,$A1[1] 14387bded2dbSJung-uk Kim lea 16($i),$i 14397bded2dbSJung-uk Kim adc \$0,%rdx 14407bded2dbSJung-uk Kim add $A0[1],$A1[1] 14417bded2dbSJung-uk Kim adc \$0,%rdx 14427bded2dbSJung-uk Kim 14437bded2dbSJung-uk Kim mov $A1[1],($tptr) # t[8] 14447bded2dbSJung-uk Kim mov %rdx,$A1[0] 14457bded2dbSJung-uk Kim mov %rdx,8($tptr) # t[9] 14467bded2dbSJung-uk Kim jmp .Lsqr4x_outer 14477bded2dbSJung-uk Kim 14487bded2dbSJung-uk Kim.align 32 14497bded2dbSJung-uk Kim.Lsqr4x_outer: # comments apply to $num==6 case 14507bded2dbSJung-uk Kim mov -32($aptr,$i),$a0 # a[0] 14517bded2dbSJung-uk Kim lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 14527bded2dbSJung-uk Kim mov -24($aptr,$i),%rax # a[1] 14537bded2dbSJung-uk Kim lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 14547bded2dbSJung-uk Kim mov -16($aptr,$i),$ai # a[2] 14557bded2dbSJung-uk Kim mov %rax,$a1 14567bded2dbSJung-uk Kim 14577bded2dbSJung-uk Kim mul $a0 # a[1]*a[0] 14587bded2dbSJung-uk Kim mov -24($tptr,$i),$A0[0] # t[1] 14597bded2dbSJung-uk Kim add %rax,$A0[0] # a[1]*a[0]+t[1] 14607bded2dbSJung-uk Kim mov $ai,%rax # a[2] 14617bded2dbSJung-uk Kim adc \$0,%rdx 14627bded2dbSJung-uk Kim mov $A0[0],-24($tptr,$i) # t[1] 14637bded2dbSJung-uk Kim mov %rdx,$A0[1] 14647bded2dbSJung-uk Kim 14657bded2dbSJung-uk Kim mul $a0 # a[2]*a[0] 14667bded2dbSJung-uk Kim add 
%rax,$A0[1] 14677bded2dbSJung-uk Kim mov $ai,%rax 14687bded2dbSJung-uk Kim adc \$0,%rdx 14697bded2dbSJung-uk Kim add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 14707bded2dbSJung-uk Kim mov %rdx,$A0[0] 14717bded2dbSJung-uk Kim adc \$0,$A0[0] 14727bded2dbSJung-uk Kim mov $A0[1],-16($tptr,$i) # t[2] 14737bded2dbSJung-uk Kim 14747bded2dbSJung-uk Kim xor $A1[0],$A1[0] 14757bded2dbSJung-uk Kim 14767bded2dbSJung-uk Kim mov -8($aptr,$i),$ai # a[3] 14777bded2dbSJung-uk Kim mul $a1 # a[2]*a[1] 14787bded2dbSJung-uk Kim add %rax,$A1[0] # a[2]*a[1]+t[3] 14797bded2dbSJung-uk Kim mov $ai,%rax 14807bded2dbSJung-uk Kim adc \$0,%rdx 14817bded2dbSJung-uk Kim add -8($tptr,$i),$A1[0] 14827bded2dbSJung-uk Kim mov %rdx,$A1[1] 14837bded2dbSJung-uk Kim adc \$0,$A1[1] 14847bded2dbSJung-uk Kim 14857bded2dbSJung-uk Kim mul $a0 # a[3]*a[0] 14867bded2dbSJung-uk Kim add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 14877bded2dbSJung-uk Kim mov $ai,%rax 14887bded2dbSJung-uk Kim adc \$0,%rdx 14897bded2dbSJung-uk Kim add $A1[0],$A0[0] 14907bded2dbSJung-uk Kim mov %rdx,$A0[1] 14917bded2dbSJung-uk Kim adc \$0,$A0[1] 14927bded2dbSJung-uk Kim mov $A0[0],-8($tptr,$i) # t[3] 14937bded2dbSJung-uk Kim 14947bded2dbSJung-uk Kim lea ($i),$j 14957bded2dbSJung-uk Kim jmp .Lsqr4x_inner 14967bded2dbSJung-uk Kim 14977bded2dbSJung-uk Kim.align 32 14987bded2dbSJung-uk Kim.Lsqr4x_inner: 14997bded2dbSJung-uk Kim mov ($aptr,$j),$ai # a[4] 15007bded2dbSJung-uk Kim mul $a1 # a[3]*a[1] 15017bded2dbSJung-uk Kim add %rax,$A1[1] # a[3]*a[1]+t[4] 15027bded2dbSJung-uk Kim mov $ai,%rax 15037bded2dbSJung-uk Kim mov %rdx,$A1[0] 15047bded2dbSJung-uk Kim adc \$0,$A1[0] 15057bded2dbSJung-uk Kim add ($tptr,$j),$A1[1] 15067bded2dbSJung-uk Kim adc \$0,$A1[0] 15077bded2dbSJung-uk Kim 15087bded2dbSJung-uk Kim .byte 0x67 15097bded2dbSJung-uk Kim mul $a0 # a[4]*a[0] 15107bded2dbSJung-uk Kim add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 15117bded2dbSJung-uk Kim mov $ai,%rax # a[3] 15127bded2dbSJung-uk Kim mov 8($aptr,$j),$ai # a[5] 15137bded2dbSJung-uk 
Kim mov %rdx,$A0[0] 15147bded2dbSJung-uk Kim adc \$0,$A0[0] 15157bded2dbSJung-uk Kim add $A1[1],$A0[1] 15167bded2dbSJung-uk Kim adc \$0,$A0[0] 15177bded2dbSJung-uk Kim 15187bded2dbSJung-uk Kim mul $a1 # a[4]*a[3] 15197bded2dbSJung-uk Kim add %rax,$A1[0] # a[4]*a[3]+t[5] 15207bded2dbSJung-uk Kim mov $A0[1],($tptr,$j) # t[4] 15217bded2dbSJung-uk Kim mov $ai,%rax 15227bded2dbSJung-uk Kim mov %rdx,$A1[1] 15237bded2dbSJung-uk Kim adc \$0,$A1[1] 15247bded2dbSJung-uk Kim add 8($tptr,$j),$A1[0] 15257bded2dbSJung-uk Kim lea 16($j),$j # j++ 15267bded2dbSJung-uk Kim adc \$0,$A1[1] 15277bded2dbSJung-uk Kim 15287bded2dbSJung-uk Kim mul $a0 # a[5]*a[2] 15297bded2dbSJung-uk Kim add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 15307bded2dbSJung-uk Kim mov $ai,%rax 15317bded2dbSJung-uk Kim adc \$0,%rdx 15327bded2dbSJung-uk Kim add $A1[0],$A0[0] 15337bded2dbSJung-uk Kim mov %rdx,$A0[1] 15347bded2dbSJung-uk Kim adc \$0,$A0[1] 15357bded2dbSJung-uk Kim mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 15367bded2dbSJung-uk Kim 15377bded2dbSJung-uk Kim cmp \$0,$j 15387bded2dbSJung-uk Kim jne .Lsqr4x_inner 15397bded2dbSJung-uk Kim 15407bded2dbSJung-uk Kim .byte 0x67 15417bded2dbSJung-uk Kim mul $a1 # a[5]*a[3] 15427bded2dbSJung-uk Kim add %rax,$A1[1] 15437bded2dbSJung-uk Kim adc \$0,%rdx 15447bded2dbSJung-uk Kim add $A0[1],$A1[1] 15457bded2dbSJung-uk Kim adc \$0,%rdx 15467bded2dbSJung-uk Kim 15477bded2dbSJung-uk Kim mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 15487bded2dbSJung-uk Kim mov %rdx,$A1[0] 15497bded2dbSJung-uk Kim mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 15507bded2dbSJung-uk Kim 15517bded2dbSJung-uk Kim add \$16,$i 15527bded2dbSJung-uk Kim jnz .Lsqr4x_outer 15537bded2dbSJung-uk Kim 15547bded2dbSJung-uk Kim # comments apply to $num==4 case 15557bded2dbSJung-uk Kim mov -32($aptr),$a0 # a[0] 15567bded2dbSJung-uk Kim lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 15577bded2dbSJung-uk Kim mov -24($aptr),%rax # a[1] 15587bded2dbSJung-uk Kim lea 
-32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 15597bded2dbSJung-uk Kim mov -16($aptr),$ai # a[2] 15607bded2dbSJung-uk Kim mov %rax,$a1 15617bded2dbSJung-uk Kim 15627bded2dbSJung-uk Kim mul $a0 # a[1]*a[0] 15637bded2dbSJung-uk Kim add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 15647bded2dbSJung-uk Kim mov $ai,%rax # a[2] 15657bded2dbSJung-uk Kim mov %rdx,$A0[1] 15667bded2dbSJung-uk Kim adc \$0,$A0[1] 15677bded2dbSJung-uk Kim 15687bded2dbSJung-uk Kim mul $a0 # a[2]*a[0] 15697bded2dbSJung-uk Kim add %rax,$A0[1] 15707bded2dbSJung-uk Kim mov $ai,%rax 15717bded2dbSJung-uk Kim mov $A0[0],-24($tptr) # t[1] 15727bded2dbSJung-uk Kim mov %rdx,$A0[0] 15737bded2dbSJung-uk Kim adc \$0,$A0[0] 15747bded2dbSJung-uk Kim add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 15757bded2dbSJung-uk Kim mov -8($aptr),$ai # a[3] 15767bded2dbSJung-uk Kim adc \$0,$A0[0] 15777bded2dbSJung-uk Kim 15787bded2dbSJung-uk Kim mul $a1 # a[2]*a[1] 15797bded2dbSJung-uk Kim add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 15807bded2dbSJung-uk Kim mov $ai,%rax 15817bded2dbSJung-uk Kim mov $A0[1],-16($tptr) # t[2] 15827bded2dbSJung-uk Kim mov %rdx,$A1[1] 15837bded2dbSJung-uk Kim adc \$0,$A1[1] 15847bded2dbSJung-uk Kim 15857bded2dbSJung-uk Kim mul $a0 # a[3]*a[0] 15867bded2dbSJung-uk Kim add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 15877bded2dbSJung-uk Kim mov $ai,%rax 15887bded2dbSJung-uk Kim mov %rdx,$A0[1] 15897bded2dbSJung-uk Kim adc \$0,$A0[1] 15907bded2dbSJung-uk Kim add $A1[0],$A0[0] 15917bded2dbSJung-uk Kim adc \$0,$A0[1] 15927bded2dbSJung-uk Kim mov $A0[0],-8($tptr) # t[3] 15937bded2dbSJung-uk Kim 15947bded2dbSJung-uk Kim mul $a1 # a[3]*a[1] 15957bded2dbSJung-uk Kim add %rax,$A1[1] 15967bded2dbSJung-uk Kim mov -16($aptr),%rax # a[2] 15977bded2dbSJung-uk Kim adc \$0,%rdx 15987bded2dbSJung-uk Kim add $A0[1],$A1[1] 15997bded2dbSJung-uk Kim adc \$0,%rdx 16007bded2dbSJung-uk Kim 16017bded2dbSJung-uk Kim mov $A1[1],($tptr) # t[4] 16027bded2dbSJung-uk Kim mov %rdx,$A1[0] 16037bded2dbSJung-uk 
Kim mov %rdx,8($tptr) # t[5] 16047bded2dbSJung-uk Kim 16057bded2dbSJung-uk Kim mul $ai # a[2]*a[3] 16067bded2dbSJung-uk Kim___ 16077bded2dbSJung-uk Kim{ 16087bded2dbSJung-uk Kimmy ($shift,$carry)=($a0,$a1); 16097bded2dbSJung-uk Kimmy @S=(@A1,$ai,$n0); 16107bded2dbSJung-uk Kim$code.=<<___; 16117bded2dbSJung-uk Kim add \$16,$i 16127bded2dbSJung-uk Kim xor $shift,$shift 16137bded2dbSJung-uk Kim sub $num,$i # $i=16-$num 16147bded2dbSJung-uk Kim xor $carry,$carry 16157bded2dbSJung-uk Kim 16167bded2dbSJung-uk Kim add $A1[0],%rax # t[5] 16177bded2dbSJung-uk Kim adc \$0,%rdx 16187bded2dbSJung-uk Kim mov %rax,8($tptr) # t[5] 16197bded2dbSJung-uk Kim mov %rdx,16($tptr) # t[6] 16207bded2dbSJung-uk Kim mov $carry,24($tptr) # t[7] 16217bded2dbSJung-uk Kim 16227bded2dbSJung-uk Kim mov -16($aptr,$i),%rax # a[0] 16237bded2dbSJung-uk Kim lea 48+8(%rsp),$tptr 16247bded2dbSJung-uk Kim xor $A0[0],$A0[0] # t[0] 16257bded2dbSJung-uk Kim mov 8($tptr),$A0[1] # t[1] 16267bded2dbSJung-uk Kim 16277bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 16287bded2dbSJung-uk Kim shr \$63,$A0[0] 16297bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 16307bded2dbSJung-uk Kim shr \$63,$A0[1] 16317bded2dbSJung-uk Kim or $A0[0],$S[1] # | t[2*i]>>63 16327bded2dbSJung-uk Kim mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 16337bded2dbSJung-uk Kim mov $A0[1],$shift # shift=t[2*i+1]>>63 16347bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 16357bded2dbSJung-uk Kim neg $carry # mov $carry,cf 16367bded2dbSJung-uk Kim mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 16377bded2dbSJung-uk Kim adc %rax,$S[0] 16387bded2dbSJung-uk Kim mov -8($aptr,$i),%rax # a[i+1] # prefetch 16397bded2dbSJung-uk Kim mov $S[0],($tptr) 16407bded2dbSJung-uk Kim adc %rdx,$S[1] 16417bded2dbSJung-uk Kim 16427bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 16437bded2dbSJung-uk Kim mov $S[1],8($tptr) 16447bded2dbSJung-uk Kim sbb $carry,$carry # mov cf,$carry 16457bded2dbSJung-uk Kim shr \$63,$A0[0] 
16467bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 16477bded2dbSJung-uk Kim shr \$63,$A0[1] 16487bded2dbSJung-uk Kim or $A0[0],$S[3] # | t[2*i]>>63 16497bded2dbSJung-uk Kim mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 16507bded2dbSJung-uk Kim mov $A0[1],$shift # shift=t[2*i+1]>>63 16517bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 16527bded2dbSJung-uk Kim neg $carry # mov $carry,cf 16537bded2dbSJung-uk Kim mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 16547bded2dbSJung-uk Kim adc %rax,$S[2] 16557bded2dbSJung-uk Kim mov 0($aptr,$i),%rax # a[i+1] # prefetch 16567bded2dbSJung-uk Kim mov $S[2],16($tptr) 16577bded2dbSJung-uk Kim adc %rdx,$S[3] 16587bded2dbSJung-uk Kim lea 16($i),$i 16597bded2dbSJung-uk Kim mov $S[3],24($tptr) 16607bded2dbSJung-uk Kim sbb $carry,$carry # mov cf,$carry 16617bded2dbSJung-uk Kim lea 64($tptr),$tptr 16627bded2dbSJung-uk Kim jmp .Lsqr4x_shift_n_add 16637bded2dbSJung-uk Kim 16647bded2dbSJung-uk Kim.align 32 16657bded2dbSJung-uk Kim.Lsqr4x_shift_n_add: 16667bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 16677bded2dbSJung-uk Kim shr \$63,$A0[0] 16687bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 16697bded2dbSJung-uk Kim shr \$63,$A0[1] 16707bded2dbSJung-uk Kim or $A0[0],$S[1] # | t[2*i]>>63 16717bded2dbSJung-uk Kim mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 16727bded2dbSJung-uk Kim mov $A0[1],$shift # shift=t[2*i+1]>>63 16737bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 16747bded2dbSJung-uk Kim neg $carry # mov $carry,cf 16757bded2dbSJung-uk Kim mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 16767bded2dbSJung-uk Kim adc %rax,$S[0] 16777bded2dbSJung-uk Kim mov -8($aptr,$i),%rax # a[i+1] # prefetch 16787bded2dbSJung-uk Kim mov $S[0],-32($tptr) 16797bded2dbSJung-uk Kim adc %rdx,$S[1] 16807bded2dbSJung-uk Kim 16817bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 16827bded2dbSJung-uk Kim mov $S[1],-24($tptr) 16837bded2dbSJung-uk Kim sbb $carry,$carry # mov cf,$carry 16847bded2dbSJung-uk Kim shr 
\$63,$A0[0] 16857bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 16867bded2dbSJung-uk Kim shr \$63,$A0[1] 16877bded2dbSJung-uk Kim or $A0[0],$S[3] # | t[2*i]>>63 16887bded2dbSJung-uk Kim mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 16897bded2dbSJung-uk Kim mov $A0[1],$shift # shift=t[2*i+1]>>63 16907bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 16917bded2dbSJung-uk Kim neg $carry # mov $carry,cf 16927bded2dbSJung-uk Kim mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 16937bded2dbSJung-uk Kim adc %rax,$S[2] 16947bded2dbSJung-uk Kim mov 0($aptr,$i),%rax # a[i+1] # prefetch 16957bded2dbSJung-uk Kim mov $S[2],-16($tptr) 16967bded2dbSJung-uk Kim adc %rdx,$S[3] 16977bded2dbSJung-uk Kim 16987bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 16997bded2dbSJung-uk Kim mov $S[3],-8($tptr) 17007bded2dbSJung-uk Kim sbb $carry,$carry # mov cf,$carry 17017bded2dbSJung-uk Kim shr \$63,$A0[0] 17027bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 17037bded2dbSJung-uk Kim shr \$63,$A0[1] 17047bded2dbSJung-uk Kim or $A0[0],$S[1] # | t[2*i]>>63 17057bded2dbSJung-uk Kim mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 17067bded2dbSJung-uk Kim mov $A0[1],$shift # shift=t[2*i+1]>>63 17077bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 17087bded2dbSJung-uk Kim neg $carry # mov $carry,cf 17097bded2dbSJung-uk Kim mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 17107bded2dbSJung-uk Kim adc %rax,$S[0] 17117bded2dbSJung-uk Kim mov 8($aptr,$i),%rax # a[i+1] # prefetch 17127bded2dbSJung-uk Kim mov $S[0],0($tptr) 17137bded2dbSJung-uk Kim adc %rdx,$S[1] 17147bded2dbSJung-uk Kim 17157bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 17167bded2dbSJung-uk Kim mov $S[1],8($tptr) 17177bded2dbSJung-uk Kim sbb $carry,$carry # mov cf,$carry 17187bded2dbSJung-uk Kim shr \$63,$A0[0] 17197bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 17207bded2dbSJung-uk Kim shr \$63,$A0[1] 17217bded2dbSJung-uk Kim or $A0[0],$S[3] # | t[2*i]>>63 17227bded2dbSJung-uk Kim mov 
32($tptr),$A0[0] # t[2*i+2] # prefetch 17237bded2dbSJung-uk Kim mov $A0[1],$shift # shift=t[2*i+1]>>63 17247bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 17257bded2dbSJung-uk Kim neg $carry # mov $carry,cf 17267bded2dbSJung-uk Kim mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 17277bded2dbSJung-uk Kim adc %rax,$S[2] 17287bded2dbSJung-uk Kim mov 16($aptr,$i),%rax # a[i+1] # prefetch 17297bded2dbSJung-uk Kim mov $S[2],16($tptr) 17307bded2dbSJung-uk Kim adc %rdx,$S[3] 17317bded2dbSJung-uk Kim mov $S[3],24($tptr) 17327bded2dbSJung-uk Kim sbb $carry,$carry # mov cf,$carry 17337bded2dbSJung-uk Kim lea 64($tptr),$tptr 17347bded2dbSJung-uk Kim add \$32,$i 17357bded2dbSJung-uk Kim jnz .Lsqr4x_shift_n_add 17367bded2dbSJung-uk Kim 17377bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 17387bded2dbSJung-uk Kim .byte 0x67 17397bded2dbSJung-uk Kim shr \$63,$A0[0] 17407bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 17417bded2dbSJung-uk Kim shr \$63,$A0[1] 17427bded2dbSJung-uk Kim or $A0[0],$S[1] # | t[2*i]>>63 17437bded2dbSJung-uk Kim mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 17447bded2dbSJung-uk Kim mov $A0[1],$shift # shift=t[2*i+1]>>63 17457bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 17467bded2dbSJung-uk Kim neg $carry # mov $carry,cf 17477bded2dbSJung-uk Kim mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 17487bded2dbSJung-uk Kim adc %rax,$S[0] 17497bded2dbSJung-uk Kim mov -8($aptr),%rax # a[i+1] # prefetch 17507bded2dbSJung-uk Kim mov $S[0],-32($tptr) 17517bded2dbSJung-uk Kim adc %rdx,$S[1] 17527bded2dbSJung-uk Kim 17537bded2dbSJung-uk Kim lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 17547bded2dbSJung-uk Kim mov $S[1],-24($tptr) 17557bded2dbSJung-uk Kim sbb $carry,$carry # mov cf,$carry 17567bded2dbSJung-uk Kim shr \$63,$A0[0] 17577bded2dbSJung-uk Kim lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 17587bded2dbSJung-uk Kim shr \$63,$A0[1] 17597bded2dbSJung-uk Kim or $A0[0],$S[3] # | t[2*i]>>63 17607bded2dbSJung-uk Kim mul %rax # a[i]*a[i] 17617bded2dbSJung-uk 
Kim neg $carry # mov $carry,cf 17627bded2dbSJung-uk Kim adc %rax,$S[2] 17637bded2dbSJung-uk Kim adc %rdx,$S[3] 17647bded2dbSJung-uk Kim mov $S[2],-16($tptr) 17657bded2dbSJung-uk Kim mov $S[3],-8($tptr) 17667bded2dbSJung-uk Kim___ 17677bded2dbSJung-uk Kim} 17687bded2dbSJung-uk Kim###################################################################### 17697bded2dbSJung-uk Kim# Montgomery reduction part, "word-by-word" algorithm. 17707bded2dbSJung-uk Kim# 17717bded2dbSJung-uk Kim# This new path is inspired by multiple submissions from Intel, by 17727bded2dbSJung-uk Kim# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 17737bded2dbSJung-uk Kim# Vinodh Gopal... 17747bded2dbSJung-uk Kim{ 17757bded2dbSJung-uk Kimmy ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 17767bded2dbSJung-uk Kim 17777bded2dbSJung-uk Kim$code.=<<___; 17787bded2dbSJung-uk Kim movq %xmm2,$nptr 17794c6a0400SJung-uk Kim__bn_sqr8x_reduction: 17807bded2dbSJung-uk Kim xor %rax,%rax 17814c6a0400SJung-uk Kim lea ($nptr,$num),%rcx # end of n[] 17827bded2dbSJung-uk Kim lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 17837bded2dbSJung-uk Kim mov %rcx,0+8(%rsp) 17847bded2dbSJung-uk Kim lea 48+8(%rsp,$num),$tptr # end of initial t[] window 17857bded2dbSJung-uk Kim mov %rdx,8+8(%rsp) 17867bded2dbSJung-uk Kim neg $num 17877bded2dbSJung-uk Kim jmp .L8x_reduction_loop 17887bded2dbSJung-uk Kim 17897bded2dbSJung-uk Kim.align 32 17907bded2dbSJung-uk Kim.L8x_reduction_loop: 17917bded2dbSJung-uk Kim lea ($tptr,$num),$tptr # start of current t[] window 17927bded2dbSJung-uk Kim .byte 0x66 17937bded2dbSJung-uk Kim mov 8*0($tptr),$m0 17947bded2dbSJung-uk Kim mov 8*1($tptr),%r9 17957bded2dbSJung-uk Kim mov 8*2($tptr),%r10 17967bded2dbSJung-uk Kim mov 8*3($tptr),%r11 17977bded2dbSJung-uk Kim mov 8*4($tptr),%r12 17987bded2dbSJung-uk Kim mov 8*5($tptr),%r13 17997bded2dbSJung-uk Kim mov 8*6($tptr),%r14 18007bded2dbSJung-uk Kim mov 8*7($tptr),%r15 18017bded2dbSJung-uk Kim mov %rax,(%rdx) # store top-most carry bit 
18027bded2dbSJung-uk Kim lea 8*8($tptr),$tptr 18037bded2dbSJung-uk Kim 18047bded2dbSJung-uk Kim .byte 0x67 18057bded2dbSJung-uk Kim mov $m0,%r8 18067bded2dbSJung-uk Kim imulq 32+8(%rsp),$m0 # n0*a[0] 18074c6a0400SJung-uk Kim mov 8*0($nptr),%rax # n[0] 18087bded2dbSJung-uk Kim mov \$8,%ecx 18097bded2dbSJung-uk Kim jmp .L8x_reduce 18107bded2dbSJung-uk Kim 18117bded2dbSJung-uk Kim.align 32 18127bded2dbSJung-uk Kim.L8x_reduce: 18137bded2dbSJung-uk Kim mulq $m0 18144c6a0400SJung-uk Kim mov 8*1($nptr),%rax # n[1] 18157bded2dbSJung-uk Kim neg %r8 18167bded2dbSJung-uk Kim mov %rdx,%r8 18177bded2dbSJung-uk Kim adc \$0,%r8 18187bded2dbSJung-uk Kim 18197bded2dbSJung-uk Kim mulq $m0 18207bded2dbSJung-uk Kim add %rax,%r9 18214c6a0400SJung-uk Kim mov 8*2($nptr),%rax 18227bded2dbSJung-uk Kim adc \$0,%rdx 18237bded2dbSJung-uk Kim add %r9,%r8 18247bded2dbSJung-uk Kim mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 18257bded2dbSJung-uk Kim mov %rdx,%r9 18267bded2dbSJung-uk Kim adc \$0,%r9 18277bded2dbSJung-uk Kim 18287bded2dbSJung-uk Kim mulq $m0 18297bded2dbSJung-uk Kim add %rax,%r10 18304c6a0400SJung-uk Kim mov 8*3($nptr),%rax 18317bded2dbSJung-uk Kim adc \$0,%rdx 18327bded2dbSJung-uk Kim add %r10,%r9 18337bded2dbSJung-uk Kim mov 32+8(%rsp),$carry # pull n0, borrow $carry 18347bded2dbSJung-uk Kim mov %rdx,%r10 18357bded2dbSJung-uk Kim adc \$0,%r10 18367bded2dbSJung-uk Kim 18377bded2dbSJung-uk Kim mulq $m0 18387bded2dbSJung-uk Kim add %rax,%r11 18394c6a0400SJung-uk Kim mov 8*4($nptr),%rax 18407bded2dbSJung-uk Kim adc \$0,%rdx 18417bded2dbSJung-uk Kim imulq %r8,$carry # modulo-scheduled 18427bded2dbSJung-uk Kim add %r11,%r10 18437bded2dbSJung-uk Kim mov %rdx,%r11 18447bded2dbSJung-uk Kim adc \$0,%r11 18457bded2dbSJung-uk Kim 18467bded2dbSJung-uk Kim mulq $m0 18477bded2dbSJung-uk Kim add %rax,%r12 18484c6a0400SJung-uk Kim mov 8*5($nptr),%rax 18497bded2dbSJung-uk Kim adc \$0,%rdx 18507bded2dbSJung-uk Kim add %r12,%r11 18517bded2dbSJung-uk Kim mov %rdx,%r12 18527bded2dbSJung-uk Kim adc 
\$0,%r12 18537bded2dbSJung-uk Kim 18547bded2dbSJung-uk Kim mulq $m0 18557bded2dbSJung-uk Kim add %rax,%r13 18564c6a0400SJung-uk Kim mov 8*6($nptr),%rax 18577bded2dbSJung-uk Kim adc \$0,%rdx 18587bded2dbSJung-uk Kim add %r13,%r12 18597bded2dbSJung-uk Kim mov %rdx,%r13 18607bded2dbSJung-uk Kim adc \$0,%r13 18617bded2dbSJung-uk Kim 18627bded2dbSJung-uk Kim mulq $m0 18637bded2dbSJung-uk Kim add %rax,%r14 18644c6a0400SJung-uk Kim mov 8*7($nptr),%rax 18657bded2dbSJung-uk Kim adc \$0,%rdx 18667bded2dbSJung-uk Kim add %r14,%r13 18677bded2dbSJung-uk Kim mov %rdx,%r14 18687bded2dbSJung-uk Kim adc \$0,%r14 18697bded2dbSJung-uk Kim 18707bded2dbSJung-uk Kim mulq $m0 18717bded2dbSJung-uk Kim mov $carry,$m0 # n0*a[i] 18727bded2dbSJung-uk Kim add %rax,%r15 18734c6a0400SJung-uk Kim mov 8*0($nptr),%rax # n[0] 18747bded2dbSJung-uk Kim adc \$0,%rdx 18757bded2dbSJung-uk Kim add %r15,%r14 18767bded2dbSJung-uk Kim mov %rdx,%r15 18777bded2dbSJung-uk Kim adc \$0,%r15 18787bded2dbSJung-uk Kim 18797bded2dbSJung-uk Kim dec %ecx 18807bded2dbSJung-uk Kim jnz .L8x_reduce 18817bded2dbSJung-uk Kim 18824c6a0400SJung-uk Kim lea 8*8($nptr),$nptr 18837bded2dbSJung-uk Kim xor %rax,%rax 18847bded2dbSJung-uk Kim mov 8+8(%rsp),%rdx # pull end of t[] 18857bded2dbSJung-uk Kim cmp 0+8(%rsp),$nptr # end of n[]? 
18867bded2dbSJung-uk Kim jae .L8x_no_tail 18877bded2dbSJung-uk Kim 18887bded2dbSJung-uk Kim .byte 0x66 18897bded2dbSJung-uk Kim add 8*0($tptr),%r8 18907bded2dbSJung-uk Kim adc 8*1($tptr),%r9 18917bded2dbSJung-uk Kim adc 8*2($tptr),%r10 18927bded2dbSJung-uk Kim adc 8*3($tptr),%r11 18937bded2dbSJung-uk Kim adc 8*4($tptr),%r12 18947bded2dbSJung-uk Kim adc 8*5($tptr),%r13 18957bded2dbSJung-uk Kim adc 8*6($tptr),%r14 18967bded2dbSJung-uk Kim adc 8*7($tptr),%r15 18977bded2dbSJung-uk Kim sbb $carry,$carry # top carry 18987bded2dbSJung-uk Kim 18997bded2dbSJung-uk Kim mov 48+56+8(%rsp),$m0 # pull n0*a[0] 19007bded2dbSJung-uk Kim mov \$8,%ecx 19014c6a0400SJung-uk Kim mov 8*0($nptr),%rax 19027bded2dbSJung-uk Kim jmp .L8x_tail 19037bded2dbSJung-uk Kim 19047bded2dbSJung-uk Kim.align 32 19057bded2dbSJung-uk Kim.L8x_tail: 19067bded2dbSJung-uk Kim mulq $m0 19077bded2dbSJung-uk Kim add %rax,%r8 19084c6a0400SJung-uk Kim mov 8*1($nptr),%rax 19097bded2dbSJung-uk Kim mov %r8,($tptr) # save result 19107bded2dbSJung-uk Kim mov %rdx,%r8 19117bded2dbSJung-uk Kim adc \$0,%r8 19127bded2dbSJung-uk Kim 19137bded2dbSJung-uk Kim mulq $m0 19147bded2dbSJung-uk Kim add %rax,%r9 19154c6a0400SJung-uk Kim mov 8*2($nptr),%rax 19167bded2dbSJung-uk Kim adc \$0,%rdx 19177bded2dbSJung-uk Kim add %r9,%r8 19187bded2dbSJung-uk Kim lea 8($tptr),$tptr # $tptr++ 19197bded2dbSJung-uk Kim mov %rdx,%r9 19207bded2dbSJung-uk Kim adc \$0,%r9 19217bded2dbSJung-uk Kim 19227bded2dbSJung-uk Kim mulq $m0 19237bded2dbSJung-uk Kim add %rax,%r10 19244c6a0400SJung-uk Kim mov 8*3($nptr),%rax 19257bded2dbSJung-uk Kim adc \$0,%rdx 19267bded2dbSJung-uk Kim add %r10,%r9 19277bded2dbSJung-uk Kim mov %rdx,%r10 19287bded2dbSJung-uk Kim adc \$0,%r10 19297bded2dbSJung-uk Kim 19307bded2dbSJung-uk Kim mulq $m0 19317bded2dbSJung-uk Kim add %rax,%r11 19324c6a0400SJung-uk Kim mov 8*4($nptr),%rax 19337bded2dbSJung-uk Kim adc \$0,%rdx 19347bded2dbSJung-uk Kim add %r11,%r10 19357bded2dbSJung-uk Kim mov %rdx,%r11 19367bded2dbSJung-uk Kim adc 
\$0,%r11 19377bded2dbSJung-uk Kim 19387bded2dbSJung-uk Kim mulq $m0 19397bded2dbSJung-uk Kim add %rax,%r12 19404c6a0400SJung-uk Kim mov 8*5($nptr),%rax 19417bded2dbSJung-uk Kim adc \$0,%rdx 19427bded2dbSJung-uk Kim add %r12,%r11 19437bded2dbSJung-uk Kim mov %rdx,%r12 19447bded2dbSJung-uk Kim adc \$0,%r12 19457bded2dbSJung-uk Kim 19467bded2dbSJung-uk Kim mulq $m0 19477bded2dbSJung-uk Kim add %rax,%r13 19484c6a0400SJung-uk Kim mov 8*6($nptr),%rax 19497bded2dbSJung-uk Kim adc \$0,%rdx 19507bded2dbSJung-uk Kim add %r13,%r12 19517bded2dbSJung-uk Kim mov %rdx,%r13 19527bded2dbSJung-uk Kim adc \$0,%r13 19537bded2dbSJung-uk Kim 19547bded2dbSJung-uk Kim mulq $m0 19557bded2dbSJung-uk Kim add %rax,%r14 19564c6a0400SJung-uk Kim mov 8*7($nptr),%rax 19577bded2dbSJung-uk Kim adc \$0,%rdx 19587bded2dbSJung-uk Kim add %r14,%r13 19597bded2dbSJung-uk Kim mov %rdx,%r14 19607bded2dbSJung-uk Kim adc \$0,%r14 19617bded2dbSJung-uk Kim 19627bded2dbSJung-uk Kim mulq $m0 19637bded2dbSJung-uk Kim mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 19647bded2dbSJung-uk Kim add %rax,%r15 19657bded2dbSJung-uk Kim adc \$0,%rdx 19667bded2dbSJung-uk Kim add %r15,%r14 19674c6a0400SJung-uk Kim mov 8*0($nptr),%rax # pull n[0] 19687bded2dbSJung-uk Kim mov %rdx,%r15 19697bded2dbSJung-uk Kim adc \$0,%r15 19707bded2dbSJung-uk Kim 19717bded2dbSJung-uk Kim dec %ecx 19727bded2dbSJung-uk Kim jnz .L8x_tail 19737bded2dbSJung-uk Kim 19744c6a0400SJung-uk Kim lea 8*8($nptr),$nptr 19757bded2dbSJung-uk Kim mov 8+8(%rsp),%rdx # pull end of t[] 19767bded2dbSJung-uk Kim cmp 0+8(%rsp),$nptr # end of n[]? 
19777bded2dbSJung-uk Kim jae .L8x_tail_done # break out of loop 19787bded2dbSJung-uk Kim 19797bded2dbSJung-uk Kim mov 48+56+8(%rsp),$m0 # pull n0*a[0] 19807bded2dbSJung-uk Kim neg $carry 19817bded2dbSJung-uk Kim mov 8*0($nptr),%rax # pull n[0] 19827bded2dbSJung-uk Kim adc 8*0($tptr),%r8 19837bded2dbSJung-uk Kim adc 8*1($tptr),%r9 19847bded2dbSJung-uk Kim adc 8*2($tptr),%r10 19857bded2dbSJung-uk Kim adc 8*3($tptr),%r11 19867bded2dbSJung-uk Kim adc 8*4($tptr),%r12 19877bded2dbSJung-uk Kim adc 8*5($tptr),%r13 19887bded2dbSJung-uk Kim adc 8*6($tptr),%r14 19897bded2dbSJung-uk Kim adc 8*7($tptr),%r15 19907bded2dbSJung-uk Kim sbb $carry,$carry # top carry 19917bded2dbSJung-uk Kim 19927bded2dbSJung-uk Kim mov \$8,%ecx 19937bded2dbSJung-uk Kim jmp .L8x_tail 19947bded2dbSJung-uk Kim 19957bded2dbSJung-uk Kim.align 32 19967bded2dbSJung-uk Kim.L8x_tail_done: 19976cf8931aSJung-uk Kim xor %rax,%rax 19987bded2dbSJung-uk Kim add (%rdx),%r8 # can this overflow? 199980815a77SJung-uk Kim adc \$0,%r9 200080815a77SJung-uk Kim adc \$0,%r10 200180815a77SJung-uk Kim adc \$0,%r11 200280815a77SJung-uk Kim adc \$0,%r12 200380815a77SJung-uk Kim adc \$0,%r13 200480815a77SJung-uk Kim adc \$0,%r14 20056cf8931aSJung-uk Kim adc \$0,%r15 20066cf8931aSJung-uk Kim adc \$0,%rax 20077bded2dbSJung-uk Kim 20087bded2dbSJung-uk Kim neg $carry 20097bded2dbSJung-uk Kim.L8x_no_tail: 20107bded2dbSJung-uk Kim adc 8*0($tptr),%r8 20117bded2dbSJung-uk Kim adc 8*1($tptr),%r9 20127bded2dbSJung-uk Kim adc 8*2($tptr),%r10 20137bded2dbSJung-uk Kim adc 8*3($tptr),%r11 20147bded2dbSJung-uk Kim adc 8*4($tptr),%r12 20157bded2dbSJung-uk Kim adc 8*5($tptr),%r13 20167bded2dbSJung-uk Kim adc 8*6($tptr),%r14 20177bded2dbSJung-uk Kim adc 8*7($tptr),%r15 20187bded2dbSJung-uk Kim adc \$0,%rax # top-most carry 20194c6a0400SJung-uk Kim mov -8($nptr),%rcx # np[num-1] 20207bded2dbSJung-uk Kim xor $carry,$carry 20217bded2dbSJung-uk Kim 20227bded2dbSJung-uk Kim movq %xmm2,$nptr # restore $nptr 20237bded2dbSJung-uk Kim 
20247bded2dbSJung-uk Kim mov %r8,8*0($tptr) # store top 512 bits 20257bded2dbSJung-uk Kim mov %r9,8*1($tptr) 20267bded2dbSJung-uk Kim movq %xmm3,$num # $num is %r9, can't be moved upwards 20277bded2dbSJung-uk Kim mov %r10,8*2($tptr) 20287bded2dbSJung-uk Kim mov %r11,8*3($tptr) 20297bded2dbSJung-uk Kim mov %r12,8*4($tptr) 20307bded2dbSJung-uk Kim mov %r13,8*5($tptr) 20317bded2dbSJung-uk Kim mov %r14,8*6($tptr) 20327bded2dbSJung-uk Kim mov %r15,8*7($tptr) 20337bded2dbSJung-uk Kim lea 8*8($tptr),$tptr 20347bded2dbSJung-uk Kim 20357bded2dbSJung-uk Kim cmp %rdx,$tptr # end of t[]? 20367bded2dbSJung-uk Kim jb .L8x_reduction_loop 20374c6a0400SJung-uk Kim ret 203817f01e99SJung-uk Kim.cfi_endproc 20394c6a0400SJung-uk Kim.size bn_sqr8x_internal,.-bn_sqr8x_internal 20407bded2dbSJung-uk Kim___ 20417bded2dbSJung-uk Kim} 20427bded2dbSJung-uk Kim############################################################## 20437bded2dbSJung-uk Kim# Post-condition, 4x unrolled 20447bded2dbSJung-uk Kim# 20457bded2dbSJung-uk Kim{ 20467bded2dbSJung-uk Kimmy ($tptr,$nptr)=("%rbx","%rbp"); 20477bded2dbSJung-uk Kim$code.=<<___; 20484c6a0400SJung-uk Kim.type __bn_post4x_internal,\@abi-omnipotent 20497bded2dbSJung-uk Kim.align 32 20504c6a0400SJung-uk Kim__bn_post4x_internal: 205117f01e99SJung-uk Kim.cfi_startproc 20524c6a0400SJung-uk Kim mov 8*0($nptr),%r12 20534c6a0400SJung-uk Kim lea (%rdi,$num),$tptr # %rdi was $tptr above 20544c6a0400SJung-uk Kim mov $num,%rcx 20554c6a0400SJung-uk Kim movq %xmm1,$rptr # restore $rptr 20564c6a0400SJung-uk Kim neg %rax 20574c6a0400SJung-uk Kim movq %xmm1,$aptr # prepare for back-to-back call 20584c6a0400SJung-uk Kim sar \$3+2,%rcx 20594c6a0400SJung-uk Kim dec %r12 # so that after 'not' we get -n[0] 20604c6a0400SJung-uk Kim xor %r10,%r10 20614c6a0400SJung-uk Kim mov 8*1($nptr),%r13 20624c6a0400SJung-uk Kim mov 8*2($nptr),%r14 20634c6a0400SJung-uk Kim mov 8*3($nptr),%r15 20644c6a0400SJung-uk Kim jmp .Lsqr4x_sub_entry 20654c6a0400SJung-uk Kim 20664c6a0400SJung-uk 
Kim.align 16 20677bded2dbSJung-uk Kim.Lsqr4x_sub: 20684c6a0400SJung-uk Kim mov 8*0($nptr),%r12 20694c6a0400SJung-uk Kim mov 8*1($nptr),%r13 20704c6a0400SJung-uk Kim mov 8*2($nptr),%r14 20714c6a0400SJung-uk Kim mov 8*3($nptr),%r15 20724c6a0400SJung-uk Kim.Lsqr4x_sub_entry: 20734c6a0400SJung-uk Kim lea 8*4($nptr),$nptr 20744c6a0400SJung-uk Kim not %r12 20754c6a0400SJung-uk Kim not %r13 20764c6a0400SJung-uk Kim not %r14 20774c6a0400SJung-uk Kim not %r15 20784c6a0400SJung-uk Kim and %rax,%r12 20794c6a0400SJung-uk Kim and %rax,%r13 20804c6a0400SJung-uk Kim and %rax,%r14 20814c6a0400SJung-uk Kim and %rax,%r15 20824c6a0400SJung-uk Kim 20834c6a0400SJung-uk Kim neg %r10 # mov %r10,%cf 20844c6a0400SJung-uk Kim adc 8*0($tptr),%r12 20854c6a0400SJung-uk Kim adc 8*1($tptr),%r13 20864c6a0400SJung-uk Kim adc 8*2($tptr),%r14 20874c6a0400SJung-uk Kim adc 8*3($tptr),%r15 20887bded2dbSJung-uk Kim mov %r12,8*0($rptr) 20894c6a0400SJung-uk Kim lea 8*4($tptr),$tptr 20907bded2dbSJung-uk Kim mov %r13,8*1($rptr) 20914c6a0400SJung-uk Kim sbb %r10,%r10 # mov %cf,%r10 20927bded2dbSJung-uk Kim mov %r14,8*2($rptr) 20937bded2dbSJung-uk Kim mov %r15,8*3($rptr) 20947bded2dbSJung-uk Kim lea 8*4($rptr),$rptr 20957bded2dbSJung-uk Kim 20967bded2dbSJung-uk Kim inc %rcx # pass %cf 20977bded2dbSJung-uk Kim jnz .Lsqr4x_sub 20984c6a0400SJung-uk Kim 20997bded2dbSJung-uk Kim mov $num,%r10 # prepare for back-to-back call 21007bded2dbSJung-uk Kim neg $num # restore $num 21017bded2dbSJung-uk Kim ret 210217f01e99SJung-uk Kim.cfi_endproc 21034c6a0400SJung-uk Kim.size __bn_post4x_internal,.-__bn_post4x_internal 21047bded2dbSJung-uk Kim___ 21054c6a0400SJung-uk Kim} 21067bded2dbSJung-uk Kim}}} 21077bded2dbSJung-uk Kim 21087bded2dbSJung-uk Kimif ($addx) {{{ 21097bded2dbSJung-uk Kimmy $bp="%rdx"; # restore original value 21107bded2dbSJung-uk Kim 21117bded2dbSJung-uk Kim$code.=<<___; 21127bded2dbSJung-uk Kim.type bn_mulx4x_mont_gather5,\@function,6 21137bded2dbSJung-uk Kim.align 32 21147bded2dbSJung-uk 
Kimbn_mulx4x_mont_gather5: 2115e71b7053SJung-uk Kim.cfi_startproc 21167bded2dbSJung-uk Kim mov %rsp,%rax 2117e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 2118aeb5019cSJung-uk Kim.Lmulx4x_enter: 21197bded2dbSJung-uk Kim push %rbx 2120e71b7053SJung-uk Kim.cfi_push %rbx 21217bded2dbSJung-uk Kim push %rbp 2122e71b7053SJung-uk Kim.cfi_push %rbp 21237bded2dbSJung-uk Kim push %r12 2124e71b7053SJung-uk Kim.cfi_push %r12 21257bded2dbSJung-uk Kim push %r13 2126e71b7053SJung-uk Kim.cfi_push %r13 21277bded2dbSJung-uk Kim push %r14 2128e71b7053SJung-uk Kim.cfi_push %r14 21297bded2dbSJung-uk Kim push %r15 2130e71b7053SJung-uk Kim.cfi_push %r15 2131aeb5019cSJung-uk Kim.Lmulx4x_prologue: 21324c6a0400SJung-uk Kim 21337bded2dbSJung-uk Kim shl \$3,${num}d # convert $num to bytes 21344c6a0400SJung-uk Kim lea ($num,$num,2),%r10 # 3*$num in bytes 21357bded2dbSJung-uk Kim neg $num # -$num 21367bded2dbSJung-uk Kim mov ($n0),$n0 # *n0 21377bded2dbSJung-uk Kim 21387bded2dbSJung-uk Kim ############################################################## 21394c6a0400SJung-uk Kim # Ensure that stack frame doesn't alias with $rptr+3*$num 21404c6a0400SJung-uk Kim # modulo 4096, which covers ret[num], am[num] and n[num] 21414c6a0400SJung-uk Kim # (see bn_exp.c). This is done to allow memory disambiguation 21424c6a0400SJung-uk Kim # logic do its magic. [Extra [num] is allocated in order 21434c6a0400SJung-uk Kim # to align with bn_power5's frame, which is cleansed after 21444c6a0400SJung-uk Kim # completing exponentiation. Extra 256 bytes is for power mask 21454c6a0400SJung-uk Kim # calculated from 7th argument, the index.] 
21467bded2dbSJung-uk Kim # 21474c6a0400SJung-uk Kim lea -320(%rsp,$num,2),%r11 2148aeb5019cSJung-uk Kim mov %rsp,%rbp 21494c6a0400SJung-uk Kim sub $rp,%r11 21507bded2dbSJung-uk Kim and \$4095,%r11 21517bded2dbSJung-uk Kim cmp %r11,%r10 21527bded2dbSJung-uk Kim jb .Lmulx4xsp_alt 2153aeb5019cSJung-uk Kim sub %r11,%rbp # align with $aptr 2154aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 21557bded2dbSJung-uk Kim jmp .Lmulx4xsp_done 21567bded2dbSJung-uk Kim 21577bded2dbSJung-uk Kim.Lmulx4xsp_alt: 21584c6a0400SJung-uk Kim lea 4096-320(,$num,2),%r10 2159aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 21607bded2dbSJung-uk Kim sub %r10,%r11 21617bded2dbSJung-uk Kim mov \$0,%r10 21627bded2dbSJung-uk Kim cmovc %r10,%r11 2163aeb5019cSJung-uk Kim sub %r11,%rbp 21647bded2dbSJung-uk Kim.Lmulx4xsp_done: 2165aeb5019cSJung-uk Kim and \$-64,%rbp # ensure alignment 2166aeb5019cSJung-uk Kim mov %rsp,%r11 2167aeb5019cSJung-uk Kim sub %rbp,%r11 2168b8721c16SJung-uk Kim and \$-4096,%r11 2169aeb5019cSJung-uk Kim lea (%rbp,%r11),%rsp 2170aeb5019cSJung-uk Kim mov (%rsp),%r10 2171aeb5019cSJung-uk Kim cmp %rbp,%rsp 2172aeb5019cSJung-uk Kim ja .Lmulx4x_page_walk 2173aeb5019cSJung-uk Kim jmp .Lmulx4x_page_walk_done 2174aeb5019cSJung-uk Kim 2175b8721c16SJung-uk Kim.Lmulx4x_page_walk: 2176aeb5019cSJung-uk Kim lea -4096(%rsp),%rsp 2177aeb5019cSJung-uk Kim mov (%rsp),%r10 2178aeb5019cSJung-uk Kim cmp %rbp,%rsp 2179aeb5019cSJung-uk Kim ja .Lmulx4x_page_walk 2180aeb5019cSJung-uk Kim.Lmulx4x_page_walk_done: 2181b8721c16SJung-uk Kim 21827bded2dbSJung-uk Kim ############################################################## 21837bded2dbSJung-uk Kim # Stack layout 21847bded2dbSJung-uk Kim # +0 -num 21857bded2dbSJung-uk Kim # +8 off-loaded &b[i] 21867bded2dbSJung-uk Kim # +16 end of b[num] 21877bded2dbSJung-uk Kim # +24 inner counter 21887bded2dbSJung-uk Kim # +32 saved n0 21897bded2dbSJung-uk Kim # +40 saved %rsp 21907bded2dbSJung-uk 
Kim # +48 21917bded2dbSJung-uk Kim # +56 saved rp 21927bded2dbSJung-uk Kim # +64 tmp[num+1] 21937bded2dbSJung-uk Kim # 21947bded2dbSJung-uk Kim mov $n0, 32(%rsp) # save *n0 21957bded2dbSJung-uk Kim mov %rax,40(%rsp) # save original %rsp 2196e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+40,deref,+8 21977bded2dbSJung-uk Kim.Lmulx4x_body: 21987bded2dbSJung-uk Kim call mulx4x_internal 21997bded2dbSJung-uk Kim 22007bded2dbSJung-uk Kim mov 40(%rsp),%rsi # restore %rsp 2201e71b7053SJung-uk Kim.cfi_def_cfa %rsi,8 22021f13597dSJung-uk Kim mov \$1,%rax 22034c6a0400SJung-uk Kim 22047bded2dbSJung-uk Kim mov -48(%rsi),%r15 2205e71b7053SJung-uk Kim.cfi_restore %r15 22067bded2dbSJung-uk Kim mov -40(%rsi),%r14 2207e71b7053SJung-uk Kim.cfi_restore %r14 22087bded2dbSJung-uk Kim mov -32(%rsi),%r13 2209e71b7053SJung-uk Kim.cfi_restore %r13 22107bded2dbSJung-uk Kim mov -24(%rsi),%r12 2211e71b7053SJung-uk Kim.cfi_restore %r12 22127bded2dbSJung-uk Kim mov -16(%rsi),%rbp 2213e71b7053SJung-uk Kim.cfi_restore %rbp 22147bded2dbSJung-uk Kim mov -8(%rsi),%rbx 2215e71b7053SJung-uk Kim.cfi_restore %rbx 22167bded2dbSJung-uk Kim lea (%rsi),%rsp 2217e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 22187bded2dbSJung-uk Kim.Lmulx4x_epilogue: 22191f13597dSJung-uk Kim ret 2220e71b7053SJung-uk Kim.cfi_endproc 22217bded2dbSJung-uk Kim.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 22227bded2dbSJung-uk Kim 22237bded2dbSJung-uk Kim.type mulx4x_internal,\@abi-omnipotent 22247bded2dbSJung-uk Kim.align 32 22257bded2dbSJung-uk Kimmulx4x_internal: 222617f01e99SJung-uk Kim.cfi_startproc 22274c6a0400SJung-uk Kim mov $num,8(%rsp) # save -$num (it was in bytes) 22284c6a0400SJung-uk Kim mov $num,%r10 22297bded2dbSJung-uk Kim neg $num # restore $num 22307bded2dbSJung-uk Kim shl \$5,$num 22314c6a0400SJung-uk Kim neg %r10 # restore $num 22324c6a0400SJung-uk Kim lea 128($bp,$num),%r13 # end of powers table (+size optimization) 22337bded2dbSJung-uk Kim shr \$5+5,$num 22344c6a0400SJung-uk Kim movd 
`($win64?56:8)`(%rax),%xmm5 # load 7th argument 22357bded2dbSJung-uk Kim sub \$1,$num 22364c6a0400SJung-uk Kim lea .Linc(%rip),%rax 22377bded2dbSJung-uk Kim mov %r13,16+8(%rsp) # end of b[num] 22387bded2dbSJung-uk Kim mov $num,24+8(%rsp) # inner counter 22397bded2dbSJung-uk Kim mov $rp, 56+8(%rsp) # save $rp 22407bded2dbSJung-uk Kim___ 22417bded2dbSJung-uk Kimmy ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 22427bded2dbSJung-uk Kim ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 22437bded2dbSJung-uk Kimmy $rptr=$bptr; 22447bded2dbSJung-uk Kimmy $STRIDE=2**5*8; # 5 is "window size" 22457bded2dbSJung-uk Kimmy $N=$STRIDE/4; # should match cache line size 22467bded2dbSJung-uk Kim$code.=<<___; 22474c6a0400SJung-uk Kim movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 22484c6a0400SJung-uk Kim movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2249e71b7053SJung-uk Kim lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) 22504c6a0400SJung-uk Kim lea 128($bp),$bptr # size optimization 22517bded2dbSJung-uk Kim 22524c6a0400SJung-uk Kim pshufd \$0,%xmm5,%xmm5 # broadcast index 22534c6a0400SJung-uk Kim movdqa %xmm1,%xmm4 22544c6a0400SJung-uk Kim .byte 0x67 22554c6a0400SJung-uk Kim movdqa %xmm1,%xmm2 22564c6a0400SJung-uk Kim___ 22574c6a0400SJung-uk Kim######################################################################## 22584c6a0400SJung-uk Kim# calculate mask by comparing 0..31 to index and save result to stack 22597bded2dbSJung-uk Kim# 22604c6a0400SJung-uk Kim$code.=<<___; 22614c6a0400SJung-uk Kim .byte 0x67 22624c6a0400SJung-uk Kim paddd %xmm0,%xmm1 22634c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm0 # compare to 1,0 22644c6a0400SJung-uk Kim movdqa %xmm4,%xmm3 22654c6a0400SJung-uk Kim___ 22664c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) { 22674c6a0400SJung-uk Kim$code.=<<___; 22684c6a0400SJung-uk Kim paddd %xmm1,%xmm2 22694c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm1 # compare to 3,2 22704c6a0400SJung-uk Kim movdqa 
%xmm0,`16*($i+0)+112`(%r10) 22714c6a0400SJung-uk Kim movdqa %xmm4,%xmm0 22724c6a0400SJung-uk Kim 22734c6a0400SJung-uk Kim paddd %xmm2,%xmm3 22744c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm2 # compare to 5,4 22754c6a0400SJung-uk Kim movdqa %xmm1,`16*($i+1)+112`(%r10) 22764c6a0400SJung-uk Kim movdqa %xmm4,%xmm1 22774c6a0400SJung-uk Kim 22784c6a0400SJung-uk Kim paddd %xmm3,%xmm0 22794c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm3 # compare to 7,6 22804c6a0400SJung-uk Kim movdqa %xmm2,`16*($i+2)+112`(%r10) 22814c6a0400SJung-uk Kim movdqa %xmm4,%xmm2 22824c6a0400SJung-uk Kim 22834c6a0400SJung-uk Kim paddd %xmm0,%xmm1 22844c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm0 22854c6a0400SJung-uk Kim movdqa %xmm3,`16*($i+3)+112`(%r10) 22864c6a0400SJung-uk Kim movdqa %xmm4,%xmm3 22874c6a0400SJung-uk Kim___ 22884c6a0400SJung-uk Kim} 22894c6a0400SJung-uk Kim$code.=<<___; # last iteration can be optimized 22904c6a0400SJung-uk Kim .byte 0x67 22914c6a0400SJung-uk Kim paddd %xmm1,%xmm2 22924c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm1 22934c6a0400SJung-uk Kim movdqa %xmm0,`16*($i+0)+112`(%r10) 22944c6a0400SJung-uk Kim 22954c6a0400SJung-uk Kim paddd %xmm2,%xmm3 22964c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm2 22974c6a0400SJung-uk Kim movdqa %xmm1,`16*($i+1)+112`(%r10) 22984c6a0400SJung-uk Kim 22994c6a0400SJung-uk Kim pcmpeqd %xmm5,%xmm3 23004c6a0400SJung-uk Kim movdqa %xmm2,`16*($i+2)+112`(%r10) 23014c6a0400SJung-uk Kim 23024c6a0400SJung-uk Kim pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register 23034c6a0400SJung-uk Kim pand `16*($i+1)-128`($bptr),%xmm1 23044c6a0400SJung-uk Kim pand `16*($i+2)-128`($bptr),%xmm2 23054c6a0400SJung-uk Kim movdqa %xmm3,`16*($i+3)+112`(%r10) 23064c6a0400SJung-uk Kim pand `16*($i+3)-128`($bptr),%xmm3 23074c6a0400SJung-uk Kim por %xmm2,%xmm0 23084c6a0400SJung-uk Kim por %xmm3,%xmm1 23094c6a0400SJung-uk Kim___ 23104c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16-4;$i+=4) { 23114c6a0400SJung-uk Kim$code.=<<___; 23124c6a0400SJung-uk Kim movdqa `16*($i+0)-128`($bptr),%xmm4 
23134c6a0400SJung-uk Kim movdqa `16*($i+1)-128`($bptr),%xmm5 23144c6a0400SJung-uk Kim movdqa `16*($i+2)-128`($bptr),%xmm2 23154c6a0400SJung-uk Kim pand `16*($i+0)+112`(%r10),%xmm4 23164c6a0400SJung-uk Kim movdqa `16*($i+3)-128`($bptr),%xmm3 23174c6a0400SJung-uk Kim pand `16*($i+1)+112`(%r10),%xmm5 23184c6a0400SJung-uk Kim por %xmm4,%xmm0 23194c6a0400SJung-uk Kim pand `16*($i+2)+112`(%r10),%xmm2 23204c6a0400SJung-uk Kim por %xmm5,%xmm1 23214c6a0400SJung-uk Kim pand `16*($i+3)+112`(%r10),%xmm3 23224c6a0400SJung-uk Kim por %xmm2,%xmm0 23234c6a0400SJung-uk Kim por %xmm3,%xmm1 23244c6a0400SJung-uk Kim___ 23254c6a0400SJung-uk Kim} 23264c6a0400SJung-uk Kim$code.=<<___; 23274c6a0400SJung-uk Kim pxor %xmm1,%xmm0 23284c6a0400SJung-uk Kim pshufd \$0x4e,%xmm0,%xmm1 23294c6a0400SJung-uk Kim por %xmm1,%xmm0 23304c6a0400SJung-uk Kim lea $STRIDE($bptr),$bptr 23314c6a0400SJung-uk Kim movq %xmm0,%rdx # bp[0] 23324c6a0400SJung-uk Kim lea 64+8*4+8(%rsp),$tptr 23337bded2dbSJung-uk Kim 23347bded2dbSJung-uk Kim mov %rdx,$bi 23357bded2dbSJung-uk Kim mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 23367bded2dbSJung-uk Kim mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 23377bded2dbSJung-uk Kim add %rax,%r11 23387bded2dbSJung-uk Kim mulx 2*8($aptr),%rax,%r13 # ... 
23397bded2dbSJung-uk Kim adc %rax,%r12 23407bded2dbSJung-uk Kim adc \$0,%r13 23417bded2dbSJung-uk Kim mulx 3*8($aptr),%rax,%r14 23427bded2dbSJung-uk Kim 23437bded2dbSJung-uk Kim mov $mi,%r15 23447bded2dbSJung-uk Kim imulq 32+8(%rsp),$mi # "t[0]"*n0 23457bded2dbSJung-uk Kim xor $zero,$zero # cf=0, of=0 23467bded2dbSJung-uk Kim mov $mi,%rdx 23477bded2dbSJung-uk Kim 23487bded2dbSJung-uk Kim mov $bptr,8+8(%rsp) # off-load &b[i] 23497bded2dbSJung-uk Kim 23504c6a0400SJung-uk Kim lea 4*8($aptr),$aptr 23517bded2dbSJung-uk Kim adcx %rax,%r13 23527bded2dbSJung-uk Kim adcx $zero,%r14 # cf=0 23537bded2dbSJung-uk Kim 23544c6a0400SJung-uk Kim mulx 0*8($nptr),%rax,%r10 23557bded2dbSJung-uk Kim adcx %rax,%r15 # discarded 23567bded2dbSJung-uk Kim adox %r11,%r10 23574c6a0400SJung-uk Kim mulx 1*8($nptr),%rax,%r11 23587bded2dbSJung-uk Kim adcx %rax,%r10 23597bded2dbSJung-uk Kim adox %r12,%r11 23604c6a0400SJung-uk Kim mulx 2*8($nptr),%rax,%r12 23617bded2dbSJung-uk Kim mov 24+8(%rsp),$bptr # counter value 23627bded2dbSJung-uk Kim mov %r10,-8*4($tptr) 23637bded2dbSJung-uk Kim adcx %rax,%r11 23647bded2dbSJung-uk Kim adox %r13,%r12 23654c6a0400SJung-uk Kim mulx 3*8($nptr),%rax,%r15 23667bded2dbSJung-uk Kim mov $bi,%rdx 23677bded2dbSJung-uk Kim mov %r11,-8*3($tptr) 23687bded2dbSJung-uk Kim adcx %rax,%r12 23697bded2dbSJung-uk Kim adox $zero,%r15 # of=0 23704c6a0400SJung-uk Kim lea 4*8($nptr),$nptr 23717bded2dbSJung-uk Kim mov %r12,-8*2($tptr) 23724c6a0400SJung-uk Kim jmp .Lmulx4x_1st 23737bded2dbSJung-uk Kim 23747bded2dbSJung-uk Kim.align 32 23757bded2dbSJung-uk Kim.Lmulx4x_1st: 23767bded2dbSJung-uk Kim adcx $zero,%r15 # cf=0, modulo-scheduled 23777bded2dbSJung-uk Kim mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 23787bded2dbSJung-uk Kim adcx %r14,%r10 23797bded2dbSJung-uk Kim mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 23807bded2dbSJung-uk Kim adcx %rax,%r11 23817bded2dbSJung-uk Kim mulx 2*8($aptr),%r12,%rax # ... 
23827bded2dbSJung-uk Kim adcx %r14,%r12 23837bded2dbSJung-uk Kim mulx 3*8($aptr),%r13,%r14 23847bded2dbSJung-uk Kim .byte 0x67,0x67 23857bded2dbSJung-uk Kim mov $mi,%rdx 23867bded2dbSJung-uk Kim adcx %rax,%r13 23877bded2dbSJung-uk Kim adcx $zero,%r14 # cf=0 23887bded2dbSJung-uk Kim lea 4*8($aptr),$aptr 23897bded2dbSJung-uk Kim lea 4*8($tptr),$tptr 23907bded2dbSJung-uk Kim 23917bded2dbSJung-uk Kim adox %r15,%r10 23924c6a0400SJung-uk Kim mulx 0*8($nptr),%rax,%r15 23937bded2dbSJung-uk Kim adcx %rax,%r10 23947bded2dbSJung-uk Kim adox %r15,%r11 23954c6a0400SJung-uk Kim mulx 1*8($nptr),%rax,%r15 23967bded2dbSJung-uk Kim adcx %rax,%r11 23977bded2dbSJung-uk Kim adox %r15,%r12 23984c6a0400SJung-uk Kim mulx 2*8($nptr),%rax,%r15 23997bded2dbSJung-uk Kim mov %r10,-5*8($tptr) 24007bded2dbSJung-uk Kim adcx %rax,%r12 24017bded2dbSJung-uk Kim mov %r11,-4*8($tptr) 24027bded2dbSJung-uk Kim adox %r15,%r13 24034c6a0400SJung-uk Kim mulx 3*8($nptr),%rax,%r15 24047bded2dbSJung-uk Kim mov $bi,%rdx 24057bded2dbSJung-uk Kim mov %r12,-3*8($tptr) 24067bded2dbSJung-uk Kim adcx %rax,%r13 24077bded2dbSJung-uk Kim adox $zero,%r15 24084c6a0400SJung-uk Kim lea 4*8($nptr),$nptr 24097bded2dbSJung-uk Kim mov %r13,-2*8($tptr) 24107bded2dbSJung-uk Kim 24117bded2dbSJung-uk Kim dec $bptr # of=0, pass cf 24127bded2dbSJung-uk Kim jnz .Lmulx4x_1st 24137bded2dbSJung-uk Kim 24147bded2dbSJung-uk Kim mov 8(%rsp),$num # load -num 24157bded2dbSJung-uk Kim adc $zero,%r15 # modulo-scheduled 24167bded2dbSJung-uk Kim lea ($aptr,$num),$aptr # rewind $aptr 24177bded2dbSJung-uk Kim add %r15,%r14 24187bded2dbSJung-uk Kim mov 8+8(%rsp),$bptr # re-load &b[i] 24197bded2dbSJung-uk Kim adc $zero,$zero # top-most carry 24207bded2dbSJung-uk Kim mov %r14,-1*8($tptr) 24217bded2dbSJung-uk Kim jmp .Lmulx4x_outer 24227bded2dbSJung-uk Kim 24237bded2dbSJung-uk Kim.align 32 24247bded2dbSJung-uk Kim.Lmulx4x_outer: 24254c6a0400SJung-uk Kim lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) 24264c6a0400SJung-uk Kim pxor 
%xmm4,%xmm4 24274c6a0400SJung-uk Kim .byte 0x67,0x67 24284c6a0400SJung-uk Kim pxor %xmm5,%xmm5 24294c6a0400SJung-uk Kim___ 24304c6a0400SJung-uk Kimfor($i=0;$i<$STRIDE/16;$i+=4) { 24314c6a0400SJung-uk Kim$code.=<<___; 24324c6a0400SJung-uk Kim movdqa `16*($i+0)-128`($bptr),%xmm0 24334c6a0400SJung-uk Kim movdqa `16*($i+1)-128`($bptr),%xmm1 24344c6a0400SJung-uk Kim movdqa `16*($i+2)-128`($bptr),%xmm2 24354c6a0400SJung-uk Kim pand `16*($i+0)+256`(%r10),%xmm0 24364c6a0400SJung-uk Kim movdqa `16*($i+3)-128`($bptr),%xmm3 24374c6a0400SJung-uk Kim pand `16*($i+1)+256`(%r10),%xmm1 24384c6a0400SJung-uk Kim por %xmm0,%xmm4 24394c6a0400SJung-uk Kim pand `16*($i+2)+256`(%r10),%xmm2 24404c6a0400SJung-uk Kim por %xmm1,%xmm5 24414c6a0400SJung-uk Kim pand `16*($i+3)+256`(%r10),%xmm3 24424c6a0400SJung-uk Kim por %xmm2,%xmm4 24434c6a0400SJung-uk Kim por %xmm3,%xmm5 24444c6a0400SJung-uk Kim___ 24454c6a0400SJung-uk Kim} 24464c6a0400SJung-uk Kim$code.=<<___; 24474c6a0400SJung-uk Kim por %xmm5,%xmm4 24484c6a0400SJung-uk Kim pshufd \$0x4e,%xmm4,%xmm0 24494c6a0400SJung-uk Kim por %xmm4,%xmm0 24504c6a0400SJung-uk Kim lea $STRIDE($bptr),$bptr 24514c6a0400SJung-uk Kim movq %xmm0,%rdx # m0=bp[i] 24524c6a0400SJung-uk Kim 24537bded2dbSJung-uk Kim mov $zero,($tptr) # save top-most carry 24547bded2dbSJung-uk Kim lea 4*8($tptr,$num),$tptr # rewind $tptr 24557bded2dbSJung-uk Kim mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 24567bded2dbSJung-uk Kim xor $zero,$zero # cf=0, of=0 24577bded2dbSJung-uk Kim mov %rdx,$bi 24587bded2dbSJung-uk Kim mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 24597bded2dbSJung-uk Kim adox -4*8($tptr),$mi # +t[0] 24607bded2dbSJung-uk Kim adcx %r14,%r11 24617bded2dbSJung-uk Kim mulx 2*8($aptr),%r15,%r13 # ... 
24627bded2dbSJung-uk Kim adox -3*8($tptr),%r11 24637bded2dbSJung-uk Kim adcx %r15,%r12 24647bded2dbSJung-uk Kim mulx 3*8($aptr),%rdx,%r14 24657bded2dbSJung-uk Kim adox -2*8($tptr),%r12 24667bded2dbSJung-uk Kim adcx %rdx,%r13 24674c6a0400SJung-uk Kim lea ($nptr,$num),$nptr # rewind $nptr 24687bded2dbSJung-uk Kim lea 4*8($aptr),$aptr 24697bded2dbSJung-uk Kim adox -1*8($tptr),%r13 24707bded2dbSJung-uk Kim adcx $zero,%r14 24717bded2dbSJung-uk Kim adox $zero,%r14 24727bded2dbSJung-uk Kim 24737bded2dbSJung-uk Kim mov $mi,%r15 24747bded2dbSJung-uk Kim imulq 32+8(%rsp),$mi # "t[0]"*n0 24757bded2dbSJung-uk Kim 24767bded2dbSJung-uk Kim mov $mi,%rdx 24777bded2dbSJung-uk Kim xor $zero,$zero # cf=0, of=0 24787bded2dbSJung-uk Kim mov $bptr,8+8(%rsp) # off-load &b[i] 24797bded2dbSJung-uk Kim 24804c6a0400SJung-uk Kim mulx 0*8($nptr),%rax,%r10 24817bded2dbSJung-uk Kim adcx %rax,%r15 # discarded 24827bded2dbSJung-uk Kim adox %r11,%r10 24834c6a0400SJung-uk Kim mulx 1*8($nptr),%rax,%r11 24847bded2dbSJung-uk Kim adcx %rax,%r10 24857bded2dbSJung-uk Kim adox %r12,%r11 24864c6a0400SJung-uk Kim mulx 2*8($nptr),%rax,%r12 24877bded2dbSJung-uk Kim adcx %rax,%r11 24887bded2dbSJung-uk Kim adox %r13,%r12 24894c6a0400SJung-uk Kim mulx 3*8($nptr),%rax,%r15 24907bded2dbSJung-uk Kim mov $bi,%rdx 24917bded2dbSJung-uk Kim mov 24+8(%rsp),$bptr # counter value 24927bded2dbSJung-uk Kim mov %r10,-8*4($tptr) 24937bded2dbSJung-uk Kim adcx %rax,%r12 24947bded2dbSJung-uk Kim mov %r11,-8*3($tptr) 24957bded2dbSJung-uk Kim adox $zero,%r15 # of=0 24967bded2dbSJung-uk Kim mov %r12,-8*2($tptr) 24974c6a0400SJung-uk Kim lea 4*8($nptr),$nptr 24987bded2dbSJung-uk Kim jmp .Lmulx4x_inner 24997bded2dbSJung-uk Kim 25007bded2dbSJung-uk Kim.align 32 25017bded2dbSJung-uk Kim.Lmulx4x_inner: 25027bded2dbSJung-uk Kim mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 25037bded2dbSJung-uk Kim adcx $zero,%r15 # cf=0, modulo-scheduled 25047bded2dbSJung-uk Kim adox %r14,%r10 25057bded2dbSJung-uk Kim mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 
25067bded2dbSJung-uk Kim adcx 0*8($tptr),%r10 25077bded2dbSJung-uk Kim adox %rax,%r11 25087bded2dbSJung-uk Kim mulx 2*8($aptr),%r12,%rax # ... 25097bded2dbSJung-uk Kim adcx 1*8($tptr),%r11 25107bded2dbSJung-uk Kim adox %r14,%r12 25117bded2dbSJung-uk Kim mulx 3*8($aptr),%r13,%r14 25127bded2dbSJung-uk Kim mov $mi,%rdx 25137bded2dbSJung-uk Kim adcx 2*8($tptr),%r12 25147bded2dbSJung-uk Kim adox %rax,%r13 25157bded2dbSJung-uk Kim adcx 3*8($tptr),%r13 25167bded2dbSJung-uk Kim adox $zero,%r14 # of=0 25177bded2dbSJung-uk Kim lea 4*8($aptr),$aptr 25187bded2dbSJung-uk Kim lea 4*8($tptr),$tptr 25197bded2dbSJung-uk Kim adcx $zero,%r14 # cf=0 25207bded2dbSJung-uk Kim 25217bded2dbSJung-uk Kim adox %r15,%r10 25224c6a0400SJung-uk Kim mulx 0*8($nptr),%rax,%r15 25237bded2dbSJung-uk Kim adcx %rax,%r10 25247bded2dbSJung-uk Kim adox %r15,%r11 25254c6a0400SJung-uk Kim mulx 1*8($nptr),%rax,%r15 25267bded2dbSJung-uk Kim adcx %rax,%r11 25277bded2dbSJung-uk Kim adox %r15,%r12 25284c6a0400SJung-uk Kim mulx 2*8($nptr),%rax,%r15 25297bded2dbSJung-uk Kim mov %r10,-5*8($tptr) 25307bded2dbSJung-uk Kim adcx %rax,%r12 25317bded2dbSJung-uk Kim adox %r15,%r13 25327bded2dbSJung-uk Kim mov %r11,-4*8($tptr) 25334c6a0400SJung-uk Kim mulx 3*8($nptr),%rax,%r15 25347bded2dbSJung-uk Kim mov $bi,%rdx 25354c6a0400SJung-uk Kim lea 4*8($nptr),$nptr 25367bded2dbSJung-uk Kim mov %r12,-3*8($tptr) 25377bded2dbSJung-uk Kim adcx %rax,%r13 25387bded2dbSJung-uk Kim adox $zero,%r15 25397bded2dbSJung-uk Kim mov %r13,-2*8($tptr) 25407bded2dbSJung-uk Kim 25417bded2dbSJung-uk Kim dec $bptr # of=0, pass cf 25427bded2dbSJung-uk Kim jnz .Lmulx4x_inner 25437bded2dbSJung-uk Kim 25447bded2dbSJung-uk Kim mov 0+8(%rsp),$num # load -num 25457bded2dbSJung-uk Kim adc $zero,%r15 # modulo-scheduled 25467bded2dbSJung-uk Kim sub 0*8($tptr),$bptr # pull top-most carry to %cf 25477bded2dbSJung-uk Kim mov 8+8(%rsp),$bptr # re-load &b[i] 25487bded2dbSJung-uk Kim mov 16+8(%rsp),%r10 25497bded2dbSJung-uk Kim adc %r15,%r14 25507bded2dbSJung-uk 
Kim lea ($aptr,$num),$aptr # rewind $aptr 25517bded2dbSJung-uk Kim adc $zero,$zero # top-most carry 25527bded2dbSJung-uk Kim mov %r14,-1*8($tptr) 25537bded2dbSJung-uk Kim 25547bded2dbSJung-uk Kim cmp %r10,$bptr 25557bded2dbSJung-uk Kim jb .Lmulx4x_outer 25567bded2dbSJung-uk Kim 25574c6a0400SJung-uk Kim mov -8($nptr),%r10 25584c6a0400SJung-uk Kim mov $zero,%r8 25594c6a0400SJung-uk Kim mov ($nptr,$num),%r12 25604c6a0400SJung-uk Kim lea ($nptr,$num),%rbp # rewind $nptr 25614c6a0400SJung-uk Kim mov $num,%rcx 25624c6a0400SJung-uk Kim lea ($tptr,$num),%rdi # rewind $tptr 25634c6a0400SJung-uk Kim xor %eax,%eax 25647bded2dbSJung-uk Kim xor %r15,%r15 25657bded2dbSJung-uk Kim sub %r14,%r10 # compare top-most words 25667bded2dbSJung-uk Kim adc %r15,%r15 25674c6a0400SJung-uk Kim or %r15,%r8 25684c6a0400SJung-uk Kim sar \$3+2,%rcx 25694c6a0400SJung-uk Kim sub %r8,%rax # %rax=-%r8 25707bded2dbSJung-uk Kim mov 56+8(%rsp),%rdx # restore rp 25714c6a0400SJung-uk Kim dec %r12 # so that after 'not' we get -n[0] 25724c6a0400SJung-uk Kim mov 8*1(%rbp),%r13 25734c6a0400SJung-uk Kim xor %r8,%r8 25744c6a0400SJung-uk Kim mov 8*2(%rbp),%r14 25754c6a0400SJung-uk Kim mov 8*3(%rbp),%r15 25764c6a0400SJung-uk Kim jmp .Lsqrx4x_sub_entry # common post-condition 257717f01e99SJung-uk Kim.cfi_endproc 25787bded2dbSJung-uk Kim.size mulx4x_internal,.-mulx4x_internal 25797bded2dbSJung-uk Kim___ 25807bded2dbSJung-uk Kim}{ 25817bded2dbSJung-uk Kim###################################################################### 25827bded2dbSJung-uk Kim# void bn_power5( 25837bded2dbSJung-uk Kimmy $rptr="%rdi"; # BN_ULONG *rptr, 25847bded2dbSJung-uk Kimmy $aptr="%rsi"; # const BN_ULONG *aptr, 25857bded2dbSJung-uk Kimmy $bptr="%rdx"; # const void *table, 25867bded2dbSJung-uk Kimmy $nptr="%rcx"; # const BN_ULONG *nptr, 25877bded2dbSJung-uk Kimmy $n0 ="%r8"; # const BN_ULONG *n0); 25887bded2dbSJung-uk Kimmy $num ="%r9"; # int num, has to be divisible by 8 25897bded2dbSJung-uk Kim # int pwr); 25907bded2dbSJung-uk Kim 
25917bded2dbSJung-uk Kimmy ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 25927bded2dbSJung-uk Kimmy @A0=("%r10","%r11"); 25937bded2dbSJung-uk Kimmy @A1=("%r12","%r13"); 25947bded2dbSJung-uk Kimmy ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 25957bded2dbSJung-uk Kim 25967bded2dbSJung-uk Kim$code.=<<___; 25977bded2dbSJung-uk Kim.type bn_powerx5,\@function,6 25987bded2dbSJung-uk Kim.align 32 25997bded2dbSJung-uk Kimbn_powerx5: 2600e71b7053SJung-uk Kim.cfi_startproc 26017bded2dbSJung-uk Kim mov %rsp,%rax 2602e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 2603aeb5019cSJung-uk Kim.Lpowerx5_enter: 26047bded2dbSJung-uk Kim push %rbx 2605e71b7053SJung-uk Kim.cfi_push %rbx 26067bded2dbSJung-uk Kim push %rbp 2607e71b7053SJung-uk Kim.cfi_push %rbp 26087bded2dbSJung-uk Kim push %r12 2609e71b7053SJung-uk Kim.cfi_push %r12 26107bded2dbSJung-uk Kim push %r13 2611e71b7053SJung-uk Kim.cfi_push %r13 26127bded2dbSJung-uk Kim push %r14 2613e71b7053SJung-uk Kim.cfi_push %r14 26147bded2dbSJung-uk Kim push %r15 2615e71b7053SJung-uk Kim.cfi_push %r15 2616aeb5019cSJung-uk Kim.Lpowerx5_prologue: 26174c6a0400SJung-uk Kim 26187bded2dbSJung-uk Kim shl \$3,${num}d # convert $num to bytes 26194c6a0400SJung-uk Kim lea ($num,$num,2),%r10 # 3*$num in bytes 26207bded2dbSJung-uk Kim neg $num 26217bded2dbSJung-uk Kim mov ($n0),$n0 # *n0 26227bded2dbSJung-uk Kim 26237bded2dbSJung-uk Kim ############################################################## 26244c6a0400SJung-uk Kim # Ensure that stack frame doesn't alias with $rptr+3*$num 26254c6a0400SJung-uk Kim # modulo 4096, which covers ret[num], am[num] and n[num] 26264c6a0400SJung-uk Kim # (see bn_exp.c). This is done to allow memory disambiguation 26274c6a0400SJung-uk Kim # logic do its magic. [Extra 256 bytes is for power mask 26284c6a0400SJung-uk Kim # calculated from 7th argument, the index.] 
26297bded2dbSJung-uk Kim # 26304c6a0400SJung-uk Kim lea -320(%rsp,$num,2),%r11 2631aeb5019cSJung-uk Kim mov %rsp,%rbp 26324c6a0400SJung-uk Kim sub $rptr,%r11 26337bded2dbSJung-uk Kim and \$4095,%r11 26347bded2dbSJung-uk Kim cmp %r11,%r10 26357bded2dbSJung-uk Kim jb .Lpwrx_sp_alt 2636aeb5019cSJung-uk Kim sub %r11,%rbp # align with $aptr 2637aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 26387bded2dbSJung-uk Kim jmp .Lpwrx_sp_done 26397bded2dbSJung-uk Kim 26407bded2dbSJung-uk Kim.align 32 26417bded2dbSJung-uk Kim.Lpwrx_sp_alt: 26424c6a0400SJung-uk Kim lea 4096-320(,$num,2),%r10 2643aeb5019cSJung-uk Kim lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) 26447bded2dbSJung-uk Kim sub %r10,%r11 26457bded2dbSJung-uk Kim mov \$0,%r10 26467bded2dbSJung-uk Kim cmovc %r10,%r11 2647aeb5019cSJung-uk Kim sub %r11,%rbp 26487bded2dbSJung-uk Kim.Lpwrx_sp_done: 2649aeb5019cSJung-uk Kim and \$-64,%rbp 2650aeb5019cSJung-uk Kim mov %rsp,%r11 2651aeb5019cSJung-uk Kim sub %rbp,%r11 2652b8721c16SJung-uk Kim and \$-4096,%r11 2653aeb5019cSJung-uk Kim lea (%rbp,%r11),%rsp 2654aeb5019cSJung-uk Kim mov (%rsp),%r10 2655aeb5019cSJung-uk Kim cmp %rbp,%rsp 2656aeb5019cSJung-uk Kim ja .Lpwrx_page_walk 2657aeb5019cSJung-uk Kim jmp .Lpwrx_page_walk_done 2658aeb5019cSJung-uk Kim 2659b8721c16SJung-uk Kim.Lpwrx_page_walk: 2660aeb5019cSJung-uk Kim lea -4096(%rsp),%rsp 2661aeb5019cSJung-uk Kim mov (%rsp),%r10 2662aeb5019cSJung-uk Kim cmp %rbp,%rsp 2663aeb5019cSJung-uk Kim ja .Lpwrx_page_walk 2664aeb5019cSJung-uk Kim.Lpwrx_page_walk_done: 2665b8721c16SJung-uk Kim 26667bded2dbSJung-uk Kim mov $num,%r10 26677bded2dbSJung-uk Kim neg $num 26687bded2dbSJung-uk Kim 26697bded2dbSJung-uk Kim ############################################################## 26707bded2dbSJung-uk Kim # Stack layout 26717bded2dbSJung-uk Kim # 26727bded2dbSJung-uk Kim # +0 saved $num, used in reduction section 26737bded2dbSJung-uk Kim # +8 &t[2*$num], used in reduction section 26747bded2dbSJung-uk 
Kim # +16 intermediate carry bit 26757bded2dbSJung-uk Kim # +24 top-most carry bit, used in reduction section 26767bded2dbSJung-uk Kim # +32 saved *n0 26777bded2dbSJung-uk Kim # +40 saved %rsp 26787bded2dbSJung-uk Kim # +48 t[2*$num] 26797bded2dbSJung-uk Kim # 26807bded2dbSJung-uk Kim pxor %xmm0,%xmm0 26817bded2dbSJung-uk Kim movq $rptr,%xmm1 # save $rptr 26827bded2dbSJung-uk Kim movq $nptr,%xmm2 # save $nptr 26837bded2dbSJung-uk Kim movq %r10, %xmm3 # -$num 26847bded2dbSJung-uk Kim movq $bptr,%xmm4 26857bded2dbSJung-uk Kim mov $n0, 32(%rsp) 26867bded2dbSJung-uk Kim mov %rax, 40(%rsp) # save original %rsp 2687e71b7053SJung-uk Kim.cfi_cfa_expression %rsp+40,deref,+8 26887bded2dbSJung-uk Kim.Lpowerx5_body: 26897bded2dbSJung-uk Kim 26907bded2dbSJung-uk Kim call __bn_sqrx8x_internal 26914c6a0400SJung-uk Kim call __bn_postx4x_internal 26927bded2dbSJung-uk Kim call __bn_sqrx8x_internal 26934c6a0400SJung-uk Kim call __bn_postx4x_internal 26947bded2dbSJung-uk Kim call __bn_sqrx8x_internal 26954c6a0400SJung-uk Kim call __bn_postx4x_internal 26967bded2dbSJung-uk Kim call __bn_sqrx8x_internal 26974c6a0400SJung-uk Kim call __bn_postx4x_internal 26987bded2dbSJung-uk Kim call __bn_sqrx8x_internal 26994c6a0400SJung-uk Kim call __bn_postx4x_internal 27007bded2dbSJung-uk Kim 27017bded2dbSJung-uk Kim mov %r10,$num # -num 27027bded2dbSJung-uk Kim mov $aptr,$rptr 27037bded2dbSJung-uk Kim movq %xmm2,$nptr 27047bded2dbSJung-uk Kim movq %xmm4,$bptr 27057bded2dbSJung-uk Kim mov 40(%rsp),%rax 27067bded2dbSJung-uk Kim 27077bded2dbSJung-uk Kim call mulx4x_internal 27087bded2dbSJung-uk Kim 27097bded2dbSJung-uk Kim mov 40(%rsp),%rsi # restore %rsp 2710e71b7053SJung-uk Kim.cfi_def_cfa %rsi,8 27117bded2dbSJung-uk Kim mov \$1,%rax 27124c6a0400SJung-uk Kim 27137bded2dbSJung-uk Kim mov -48(%rsi),%r15 2714e71b7053SJung-uk Kim.cfi_restore %r15 27157bded2dbSJung-uk Kim mov -40(%rsi),%r14 2716e71b7053SJung-uk Kim.cfi_restore %r14 27177bded2dbSJung-uk Kim mov -32(%rsi),%r13 2718e71b7053SJung-uk 
Kim.cfi_restore %r13 27197bded2dbSJung-uk Kim mov -24(%rsi),%r12 2720e71b7053SJung-uk Kim.cfi_restore %r12 27217bded2dbSJung-uk Kim mov -16(%rsi),%rbp 2722e71b7053SJung-uk Kim.cfi_restore %rbp 27237bded2dbSJung-uk Kim mov -8(%rsi),%rbx 2724e71b7053SJung-uk Kim.cfi_restore %rbx 27257bded2dbSJung-uk Kim lea (%rsi),%rsp 2726e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 27277bded2dbSJung-uk Kim.Lpowerx5_epilogue: 27287bded2dbSJung-uk Kim ret 2729e71b7053SJung-uk Kim.cfi_endproc 27307bded2dbSJung-uk Kim.size bn_powerx5,.-bn_powerx5 27317bded2dbSJung-uk Kim 27327bded2dbSJung-uk Kim.globl bn_sqrx8x_internal 27337bded2dbSJung-uk Kim.hidden bn_sqrx8x_internal 27347bded2dbSJung-uk Kim.type bn_sqrx8x_internal,\@abi-omnipotent 27357bded2dbSJung-uk Kim.align 32 27367bded2dbSJung-uk Kimbn_sqrx8x_internal: 27377bded2dbSJung-uk Kim__bn_sqrx8x_internal: 27386935a639SJung-uk Kim.cfi_startproc 27397bded2dbSJung-uk Kim ################################################################## 27407bded2dbSJung-uk Kim # Squaring part: 27417bded2dbSJung-uk Kim # 27427bded2dbSJung-uk Kim # a) multiply-n-add everything but a[i]*a[i]; 27437bded2dbSJung-uk Kim # b) shift result of a) by 1 to the left and accumulate 27447bded2dbSJung-uk Kim # a[i]*a[i] products; 27457bded2dbSJung-uk Kim # 27467bded2dbSJung-uk Kim ################################################################## 27477bded2dbSJung-uk Kim # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 27487bded2dbSJung-uk Kim # a[1]a[0] 27497bded2dbSJung-uk Kim # a[2]a[0] 27507bded2dbSJung-uk Kim # a[3]a[0] 27517bded2dbSJung-uk Kim # a[2]a[1] 27527bded2dbSJung-uk Kim # a[3]a[1] 27537bded2dbSJung-uk Kim # a[3]a[2] 27547bded2dbSJung-uk Kim # 27557bded2dbSJung-uk Kim # a[4]a[0] 27567bded2dbSJung-uk Kim # a[5]a[0] 27577bded2dbSJung-uk Kim # a[6]a[0] 27587bded2dbSJung-uk Kim # a[7]a[0] 27597bded2dbSJung-uk Kim # a[4]a[1] 27607bded2dbSJung-uk Kim # a[5]a[1] 27617bded2dbSJung-uk Kim # a[6]a[1] 27627bded2dbSJung-uk Kim # a[7]a[1] 
27637bded2dbSJung-uk Kim # a[4]a[2] 27647bded2dbSJung-uk Kim # a[5]a[2] 27657bded2dbSJung-uk Kim # a[6]a[2] 27667bded2dbSJung-uk Kim # a[7]a[2] 27677bded2dbSJung-uk Kim # a[4]a[3] 27687bded2dbSJung-uk Kim # a[5]a[3] 27697bded2dbSJung-uk Kim # a[6]a[3] 27707bded2dbSJung-uk Kim # a[7]a[3] 27717bded2dbSJung-uk Kim # 27727bded2dbSJung-uk Kim # a[5]a[4] 27737bded2dbSJung-uk Kim # a[6]a[4] 27747bded2dbSJung-uk Kim # a[7]a[4] 27757bded2dbSJung-uk Kim # a[6]a[5] 27767bded2dbSJung-uk Kim # a[7]a[5] 27777bded2dbSJung-uk Kim # a[7]a[6] 27787bded2dbSJung-uk Kim # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 27797bded2dbSJung-uk Kim___ 27807bded2dbSJung-uk Kim{ 27817bded2dbSJung-uk Kimmy ($zero,$carry)=("%rbp","%rcx"); 27827bded2dbSJung-uk Kimmy $aaptr=$zero; 27837bded2dbSJung-uk Kim$code.=<<___; 27847bded2dbSJung-uk Kim lea 48+8(%rsp),$tptr 27857bded2dbSJung-uk Kim lea ($aptr,$num),$aaptr 27867bded2dbSJung-uk Kim mov $num,0+8(%rsp) # save $num 27877bded2dbSJung-uk Kim mov $aaptr,8+8(%rsp) # save end of $aptr 27887bded2dbSJung-uk Kim jmp .Lsqr8x_zero_start 27897bded2dbSJung-uk Kim 27907bded2dbSJung-uk Kim.align 32 27917bded2dbSJung-uk Kim.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 27927bded2dbSJung-uk Kim.Lsqrx8x_zero: 27937bded2dbSJung-uk Kim .byte 0x3e 27947bded2dbSJung-uk Kim movdqa %xmm0,0*8($tptr) 27957bded2dbSJung-uk Kim movdqa %xmm0,2*8($tptr) 27967bded2dbSJung-uk Kim movdqa %xmm0,4*8($tptr) 27977bded2dbSJung-uk Kim movdqa %xmm0,6*8($tptr) 27987bded2dbSJung-uk Kim.Lsqr8x_zero_start: # aligned at 32 27997bded2dbSJung-uk Kim movdqa %xmm0,8*8($tptr) 28007bded2dbSJung-uk Kim movdqa %xmm0,10*8($tptr) 28017bded2dbSJung-uk Kim movdqa %xmm0,12*8($tptr) 28027bded2dbSJung-uk Kim movdqa %xmm0,14*8($tptr) 28037bded2dbSJung-uk Kim lea 16*8($tptr),$tptr 28047bded2dbSJung-uk Kim sub \$64,$num 28057bded2dbSJung-uk Kim jnz .Lsqrx8x_zero 28067bded2dbSJung-uk Kim 28077bded2dbSJung-uk Kim mov 0*8($aptr),%rdx # a[0], modulo-scheduled 
28087bded2dbSJung-uk Kim #xor %r9,%r9 # t[1], ex-$num, zero already 28097bded2dbSJung-uk Kim xor %r10,%r10 28107bded2dbSJung-uk Kim xor %r11,%r11 28117bded2dbSJung-uk Kim xor %r12,%r12 28127bded2dbSJung-uk Kim xor %r13,%r13 28137bded2dbSJung-uk Kim xor %r14,%r14 28147bded2dbSJung-uk Kim xor %r15,%r15 28157bded2dbSJung-uk Kim lea 48+8(%rsp),$tptr 28167bded2dbSJung-uk Kim xor $zero,$zero # cf=0, cf=0 28177bded2dbSJung-uk Kim jmp .Lsqrx8x_outer_loop 28187bded2dbSJung-uk Kim 28197bded2dbSJung-uk Kim.align 32 28207bded2dbSJung-uk Kim.Lsqrx8x_outer_loop: 28217bded2dbSJung-uk Kim mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 28227bded2dbSJung-uk Kim adcx %r9,%r8 # a[1]*a[0]+=t[1] 28237bded2dbSJung-uk Kim adox %rax,%r10 28247bded2dbSJung-uk Kim mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 28257bded2dbSJung-uk Kim adcx %r10,%r9 28267bded2dbSJung-uk Kim adox %rax,%r11 28277bded2dbSJung-uk Kim .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 28287bded2dbSJung-uk Kim adcx %r11,%r10 28297bded2dbSJung-uk Kim adox %rax,%r12 28307bded2dbSJung-uk Kim .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 28317bded2dbSJung-uk Kim adcx %r12,%r11 28327bded2dbSJung-uk Kim adox %rax,%r13 28337bded2dbSJung-uk Kim mulx 5*8($aptr),%r12,%rax 28347bded2dbSJung-uk Kim adcx %r13,%r12 28357bded2dbSJung-uk Kim adox %rax,%r14 28367bded2dbSJung-uk Kim mulx 6*8($aptr),%r13,%rax 28377bded2dbSJung-uk Kim adcx %r14,%r13 28387bded2dbSJung-uk Kim adox %r15,%rax 28397bded2dbSJung-uk Kim mulx 7*8($aptr),%r14,%r15 28407bded2dbSJung-uk Kim mov 1*8($aptr),%rdx # a[1] 28417bded2dbSJung-uk Kim adcx %rax,%r14 28427bded2dbSJung-uk Kim adox $zero,%r15 28437bded2dbSJung-uk Kim adc 8*8($tptr),%r15 28447bded2dbSJung-uk Kim mov %r8,1*8($tptr) # t[1] 28457bded2dbSJung-uk Kim mov %r9,2*8($tptr) # t[2] 28467bded2dbSJung-uk Kim sbb $carry,$carry # mov %cf,$carry 28477bded2dbSJung-uk Kim xor $zero,$zero # cf=0, of=0 28487bded2dbSJung-uk Kim 28497bded2dbSJung-uk Kim 
28507bded2dbSJung-uk Kim mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 28517bded2dbSJung-uk Kim mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 28527bded2dbSJung-uk Kim adcx %r10,%r8 28537bded2dbSJung-uk Kim adox %rbx,%r9 28547bded2dbSJung-uk Kim mulx 4*8($aptr),%r10,%rbx # ... 28557bded2dbSJung-uk Kim adcx %r11,%r9 28567bded2dbSJung-uk Kim adox %rax,%r10 28577bded2dbSJung-uk Kim .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 28587bded2dbSJung-uk Kim adcx %r12,%r10 28597bded2dbSJung-uk Kim adox %rbx,%r11 28607bded2dbSJung-uk Kim .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 28617bded2dbSJung-uk Kim adcx %r13,%r11 28627bded2dbSJung-uk Kim adox %r14,%r12 28637bded2dbSJung-uk Kim .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 28647bded2dbSJung-uk Kim mov 2*8($aptr),%rdx # a[2] 28657bded2dbSJung-uk Kim adcx %rax,%r12 28667bded2dbSJung-uk Kim adox %rbx,%r13 28677bded2dbSJung-uk Kim adcx %r15,%r13 28687bded2dbSJung-uk Kim adox $zero,%r14 # of=0 28697bded2dbSJung-uk Kim adcx $zero,%r14 # cf=0 28707bded2dbSJung-uk Kim 28717bded2dbSJung-uk Kim mov %r8,3*8($tptr) # t[3] 28727bded2dbSJung-uk Kim mov %r9,4*8($tptr) # t[4] 28737bded2dbSJung-uk Kim 28747bded2dbSJung-uk Kim mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 28757bded2dbSJung-uk Kim mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 28767bded2dbSJung-uk Kim adcx %r10,%r8 28777bded2dbSJung-uk Kim adox %rbx,%r9 28787bded2dbSJung-uk Kim mulx 5*8($aptr),%r10,%rbx # ... 
28797bded2dbSJung-uk Kim adcx %r11,%r9 28807bded2dbSJung-uk Kim adox %rax,%r10 28817bded2dbSJung-uk Kim .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 28827bded2dbSJung-uk Kim adcx %r12,%r10 28837bded2dbSJung-uk Kim adox %r13,%r11 28847bded2dbSJung-uk Kim .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 28857bded2dbSJung-uk Kim .byte 0x3e 28867bded2dbSJung-uk Kim mov 3*8($aptr),%rdx # a[3] 28877bded2dbSJung-uk Kim adcx %rbx,%r11 28887bded2dbSJung-uk Kim adox %rax,%r12 28897bded2dbSJung-uk Kim adcx %r14,%r12 28907bded2dbSJung-uk Kim mov %r8,5*8($tptr) # t[5] 28917bded2dbSJung-uk Kim mov %r9,6*8($tptr) # t[6] 28927bded2dbSJung-uk Kim mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 28937bded2dbSJung-uk Kim adox $zero,%r13 # of=0 28947bded2dbSJung-uk Kim adcx $zero,%r13 # cf=0 28957bded2dbSJung-uk Kim 28967bded2dbSJung-uk Kim mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 28977bded2dbSJung-uk Kim adcx %r10,%r8 28987bded2dbSJung-uk Kim adox %rax,%r9 28997bded2dbSJung-uk Kim mulx 6*8($aptr),%r10,%rax # ... 
29007bded2dbSJung-uk Kim adcx %r11,%r9 29017bded2dbSJung-uk Kim adox %r12,%r10 29027bded2dbSJung-uk Kim mulx 7*8($aptr),%r11,%r12 29037bded2dbSJung-uk Kim mov 4*8($aptr),%rdx # a[4] 29047bded2dbSJung-uk Kim mov 5*8($aptr),%r14 # a[5] 29057bded2dbSJung-uk Kim adcx %rbx,%r10 29067bded2dbSJung-uk Kim adox %rax,%r11 29077bded2dbSJung-uk Kim mov 6*8($aptr),%r15 # a[6] 29087bded2dbSJung-uk Kim adcx %r13,%r11 29097bded2dbSJung-uk Kim adox $zero,%r12 # of=0 29107bded2dbSJung-uk Kim adcx $zero,%r12 # cf=0 29117bded2dbSJung-uk Kim 29127bded2dbSJung-uk Kim mov %r8,7*8($tptr) # t[7] 29137bded2dbSJung-uk Kim mov %r9,8*8($tptr) # t[8] 29147bded2dbSJung-uk Kim 29157bded2dbSJung-uk Kim mulx %r14,%r9,%rax # a[5]*a[4] 29167bded2dbSJung-uk Kim mov 7*8($aptr),%r8 # a[7] 29177bded2dbSJung-uk Kim adcx %r10,%r9 29187bded2dbSJung-uk Kim mulx %r15,%r10,%rbx # a[6]*a[4] 29197bded2dbSJung-uk Kim adox %rax,%r10 29207bded2dbSJung-uk Kim adcx %r11,%r10 29217bded2dbSJung-uk Kim mulx %r8,%r11,%rax # a[7]*a[4] 29227bded2dbSJung-uk Kim mov %r14,%rdx # a[5] 29237bded2dbSJung-uk Kim adox %rbx,%r11 29247bded2dbSJung-uk Kim adcx %r12,%r11 29257bded2dbSJung-uk Kim #adox $zero,%rax # of=0 29267bded2dbSJung-uk Kim adcx $zero,%rax # cf=0 29277bded2dbSJung-uk Kim 29287bded2dbSJung-uk Kim mulx %r15,%r14,%rbx # a[6]*a[5] 29297bded2dbSJung-uk Kim mulx %r8,%r12,%r13 # a[7]*a[5] 29307bded2dbSJung-uk Kim mov %r15,%rdx # a[6] 29317bded2dbSJung-uk Kim lea 8*8($aptr),$aptr 29327bded2dbSJung-uk Kim adcx %r14,%r11 29337bded2dbSJung-uk Kim adox %rbx,%r12 29347bded2dbSJung-uk Kim adcx %rax,%r12 29357bded2dbSJung-uk Kim adox $zero,%r13 29367bded2dbSJung-uk Kim 29377bded2dbSJung-uk Kim .byte 0x67,0x67 29387bded2dbSJung-uk Kim mulx %r8,%r8,%r14 # a[7]*a[6] 29397bded2dbSJung-uk Kim adcx %r8,%r13 29407bded2dbSJung-uk Kim adcx $zero,%r14 29417bded2dbSJung-uk Kim 29427bded2dbSJung-uk Kim cmp 8+8(%rsp),$aptr 29437bded2dbSJung-uk Kim je .Lsqrx8x_outer_break 29447bded2dbSJung-uk Kim 29457bded2dbSJung-uk Kim neg $carry # mov 
$carry,%cf 29467bded2dbSJung-uk Kim mov \$-8,%rcx 29477bded2dbSJung-uk Kim mov $zero,%r15 29487bded2dbSJung-uk Kim mov 8*8($tptr),%r8 29497bded2dbSJung-uk Kim adcx 9*8($tptr),%r9 # +=t[9] 29507bded2dbSJung-uk Kim adcx 10*8($tptr),%r10 # ... 29517bded2dbSJung-uk Kim adcx 11*8($tptr),%r11 29527bded2dbSJung-uk Kim adc 12*8($tptr),%r12 29537bded2dbSJung-uk Kim adc 13*8($tptr),%r13 29547bded2dbSJung-uk Kim adc 14*8($tptr),%r14 29557bded2dbSJung-uk Kim adc 15*8($tptr),%r15 29567bded2dbSJung-uk Kim lea ($aptr),$aaptr 29577bded2dbSJung-uk Kim lea 2*64($tptr),$tptr 29587bded2dbSJung-uk Kim sbb %rax,%rax # mov %cf,$carry 29597bded2dbSJung-uk Kim 29607bded2dbSJung-uk Kim mov -64($aptr),%rdx # a[0] 29617bded2dbSJung-uk Kim mov %rax,16+8(%rsp) # offload $carry 29627bded2dbSJung-uk Kim mov $tptr,24+8(%rsp) 29637bded2dbSJung-uk Kim 29647bded2dbSJung-uk Kim #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 29657bded2dbSJung-uk Kim xor %eax,%eax # cf=0, of=0 29667bded2dbSJung-uk Kim jmp .Lsqrx8x_loop 29677bded2dbSJung-uk Kim 29687bded2dbSJung-uk Kim.align 32 29697bded2dbSJung-uk Kim.Lsqrx8x_loop: 29707bded2dbSJung-uk Kim mov %r8,%rbx 29717bded2dbSJung-uk Kim mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 29727bded2dbSJung-uk Kim adcx %rax,%rbx # +=t[8] 29737bded2dbSJung-uk Kim adox %r9,%r8 29747bded2dbSJung-uk Kim 29757bded2dbSJung-uk Kim mulx 1*8($aaptr),%rax,%r9 # ... 
29767bded2dbSJung-uk Kim adcx %rax,%r8 29777bded2dbSJung-uk Kim adox %r10,%r9 29787bded2dbSJung-uk Kim 29797bded2dbSJung-uk Kim mulx 2*8($aaptr),%rax,%r10 29807bded2dbSJung-uk Kim adcx %rax,%r9 29817bded2dbSJung-uk Kim adox %r11,%r10 29827bded2dbSJung-uk Kim 29837bded2dbSJung-uk Kim mulx 3*8($aaptr),%rax,%r11 29847bded2dbSJung-uk Kim adcx %rax,%r10 29857bded2dbSJung-uk Kim adox %r12,%r11 29867bded2dbSJung-uk Kim 29877bded2dbSJung-uk Kim .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 29887bded2dbSJung-uk Kim adcx %rax,%r11 29897bded2dbSJung-uk Kim adox %r13,%r12 29907bded2dbSJung-uk Kim 29917bded2dbSJung-uk Kim mulx 5*8($aaptr),%rax,%r13 29927bded2dbSJung-uk Kim adcx %rax,%r12 29937bded2dbSJung-uk Kim adox %r14,%r13 29947bded2dbSJung-uk Kim 29957bded2dbSJung-uk Kim mulx 6*8($aaptr),%rax,%r14 29967bded2dbSJung-uk Kim mov %rbx,($tptr,%rcx,8) # store t[8+i] 29977bded2dbSJung-uk Kim mov \$0,%ebx 29987bded2dbSJung-uk Kim adcx %rax,%r13 29997bded2dbSJung-uk Kim adox %r15,%r14 30007bded2dbSJung-uk Kim 30017bded2dbSJung-uk Kim .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 30027bded2dbSJung-uk Kim mov 8($aptr,%rcx,8),%rdx # a[i] 30037bded2dbSJung-uk Kim adcx %rax,%r14 30047bded2dbSJung-uk Kim adox %rbx,%r15 # %rbx is 0, of=0 30057bded2dbSJung-uk Kim adcx %rbx,%r15 # cf=0 30067bded2dbSJung-uk Kim 30077bded2dbSJung-uk Kim .byte 0x67 30087bded2dbSJung-uk Kim inc %rcx # of=0 30097bded2dbSJung-uk Kim jnz .Lsqrx8x_loop 30107bded2dbSJung-uk Kim 30117bded2dbSJung-uk Kim lea 8*8($aaptr),$aaptr 30127bded2dbSJung-uk Kim mov \$-8,%rcx 30137bded2dbSJung-uk Kim cmp 8+8(%rsp),$aaptr # done? 
30147bded2dbSJung-uk Kim je .Lsqrx8x_break 30157bded2dbSJung-uk Kim 30167bded2dbSJung-uk Kim sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 30177bded2dbSJung-uk Kim .byte 0x66 30187bded2dbSJung-uk Kim mov -64($aptr),%rdx 30197bded2dbSJung-uk Kim adcx 0*8($tptr),%r8 30207bded2dbSJung-uk Kim adcx 1*8($tptr),%r9 30217bded2dbSJung-uk Kim adc 2*8($tptr),%r10 30227bded2dbSJung-uk Kim adc 3*8($tptr),%r11 30237bded2dbSJung-uk Kim adc 4*8($tptr),%r12 30247bded2dbSJung-uk Kim adc 5*8($tptr),%r13 30257bded2dbSJung-uk Kim adc 6*8($tptr),%r14 30267bded2dbSJung-uk Kim adc 7*8($tptr),%r15 30277bded2dbSJung-uk Kim lea 8*8($tptr),$tptr 30287bded2dbSJung-uk Kim .byte 0x67 30297bded2dbSJung-uk Kim sbb %rax,%rax # mov %cf,%rax 30307bded2dbSJung-uk Kim xor %ebx,%ebx # cf=0, of=0 30317bded2dbSJung-uk Kim mov %rax,16+8(%rsp) # offload carry 30327bded2dbSJung-uk Kim jmp .Lsqrx8x_loop 30337bded2dbSJung-uk Kim 30347bded2dbSJung-uk Kim.align 32 30357bded2dbSJung-uk Kim.Lsqrx8x_break: 303647902a71SJung-uk Kim xor $zero,$zero 303747902a71SJung-uk Kim sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 303847902a71SJung-uk Kim adcx $zero,%r8 30397bded2dbSJung-uk Kim mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 304047902a71SJung-uk Kim adcx $zero,%r9 30417bded2dbSJung-uk Kim mov 0*8($aptr),%rdx # a[8], modulo-scheduled 304247902a71SJung-uk Kim adc \$0,%r10 30437bded2dbSJung-uk Kim mov %r8,0*8($tptr) 304447902a71SJung-uk Kim adc \$0,%r11 304547902a71SJung-uk Kim adc \$0,%r12 304647902a71SJung-uk Kim adc \$0,%r13 304747902a71SJung-uk Kim adc \$0,%r14 304847902a71SJung-uk Kim adc \$0,%r15 30497bded2dbSJung-uk Kim cmp $carry,$tptr # cf=0, of=0 30507bded2dbSJung-uk Kim je .Lsqrx8x_outer_loop 30517bded2dbSJung-uk Kim 30527bded2dbSJung-uk Kim mov %r9,1*8($tptr) 30537bded2dbSJung-uk Kim mov 1*8($carry),%r9 30547bded2dbSJung-uk Kim mov %r10,2*8($tptr) 30557bded2dbSJung-uk Kim mov 2*8($carry),%r10 30567bded2dbSJung-uk Kim mov %r11,3*8($tptr) 30577bded2dbSJung-uk Kim mov 3*8($carry),%r11 30587bded2dbSJung-uk Kim mov 
%r12,4*8($tptr) 30597bded2dbSJung-uk Kim mov 4*8($carry),%r12 30607bded2dbSJung-uk Kim mov %r13,5*8($tptr) 30617bded2dbSJung-uk Kim mov 5*8($carry),%r13 30627bded2dbSJung-uk Kim mov %r14,6*8($tptr) 30637bded2dbSJung-uk Kim mov 6*8($carry),%r14 30647bded2dbSJung-uk Kim mov %r15,7*8($tptr) 30657bded2dbSJung-uk Kim mov 7*8($carry),%r15 30667bded2dbSJung-uk Kim mov $carry,$tptr 30677bded2dbSJung-uk Kim jmp .Lsqrx8x_outer_loop 30687bded2dbSJung-uk Kim 30697bded2dbSJung-uk Kim.align 32 30707bded2dbSJung-uk Kim.Lsqrx8x_outer_break: 30717bded2dbSJung-uk Kim mov %r9,9*8($tptr) # t[9] 30727bded2dbSJung-uk Kim movq %xmm3,%rcx # -$num 30737bded2dbSJung-uk Kim mov %r10,10*8($tptr) # ... 30747bded2dbSJung-uk Kim mov %r11,11*8($tptr) 30757bded2dbSJung-uk Kim mov %r12,12*8($tptr) 30767bded2dbSJung-uk Kim mov %r13,13*8($tptr) 30777bded2dbSJung-uk Kim mov %r14,14*8($tptr) 30787bded2dbSJung-uk Kim___ 30797bded2dbSJung-uk Kim}{ 30807bded2dbSJung-uk Kimmy $i="%rcx"; 30817bded2dbSJung-uk Kim$code.=<<___; 30827bded2dbSJung-uk Kim lea 48+8(%rsp),$tptr 30837bded2dbSJung-uk Kim mov ($aptr,$i),%rdx # a[0] 30847bded2dbSJung-uk Kim 30857bded2dbSJung-uk Kim mov 8($tptr),$A0[1] # t[1] 30867bded2dbSJung-uk Kim xor $A0[0],$A0[0] # t[0], of=0, cf=0 30877bded2dbSJung-uk Kim mov 0+8(%rsp),$num # restore $num 30887bded2dbSJung-uk Kim adox $A0[1],$A0[1] 30897bded2dbSJung-uk Kim mov 16($tptr),$A1[0] # t[2] # prefetch 30907bded2dbSJung-uk Kim mov 24($tptr),$A1[1] # t[3] # prefetch 30917bded2dbSJung-uk Kim #jmp .Lsqrx4x_shift_n_add # happens to be aligned 30927bded2dbSJung-uk Kim 30937bded2dbSJung-uk Kim.align 32 30947bded2dbSJung-uk Kim.Lsqrx4x_shift_n_add: 30957bded2dbSJung-uk Kim mulx %rdx,%rax,%rbx 30967bded2dbSJung-uk Kim adox $A1[0],$A1[0] 30977bded2dbSJung-uk Kim adcx $A0[0],%rax 30987bded2dbSJung-uk Kim .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 30997bded2dbSJung-uk Kim .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # 
prefetch 31007bded2dbSJung-uk Kim adox $A1[1],$A1[1] 31017bded2dbSJung-uk Kim adcx $A0[1],%rbx 31027bded2dbSJung-uk Kim mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 31037bded2dbSJung-uk Kim mov %rax,0($tptr) 31047bded2dbSJung-uk Kim mov %rbx,8($tptr) 31057bded2dbSJung-uk Kim 31067bded2dbSJung-uk Kim mulx %rdx,%rax,%rbx 31077bded2dbSJung-uk Kim adox $A0[0],$A0[0] 31087bded2dbSJung-uk Kim adcx $A1[0],%rax 31097bded2dbSJung-uk Kim mov 16($aptr,$i),%rdx # a[i+2] # prefetch 31107bded2dbSJung-uk Kim mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 31117bded2dbSJung-uk Kim adox $A0[1],$A0[1] 31127bded2dbSJung-uk Kim adcx $A1[1],%rbx 31137bded2dbSJung-uk Kim mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 31147bded2dbSJung-uk Kim mov %rax,16($tptr) 31157bded2dbSJung-uk Kim mov %rbx,24($tptr) 31167bded2dbSJung-uk Kim 31177bded2dbSJung-uk Kim mulx %rdx,%rax,%rbx 31187bded2dbSJung-uk Kim adox $A1[0],$A1[0] 31197bded2dbSJung-uk Kim adcx $A0[0],%rax 31207bded2dbSJung-uk Kim mov 24($aptr,$i),%rdx # a[i+3] # prefetch 31217bded2dbSJung-uk Kim lea 32($i),$i 31227bded2dbSJung-uk Kim mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 31237bded2dbSJung-uk Kim adox $A1[1],$A1[1] 31247bded2dbSJung-uk Kim adcx $A0[1],%rbx 31257bded2dbSJung-uk Kim mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 31267bded2dbSJung-uk Kim mov %rax,32($tptr) 31277bded2dbSJung-uk Kim mov %rbx,40($tptr) 31287bded2dbSJung-uk Kim 31297bded2dbSJung-uk Kim mulx %rdx,%rax,%rbx 31307bded2dbSJung-uk Kim adox $A0[0],$A0[0] 31317bded2dbSJung-uk Kim adcx $A1[0],%rax 31327bded2dbSJung-uk Kim jrcxz .Lsqrx4x_shift_n_add_break 31337bded2dbSJung-uk Kim .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 31347bded2dbSJung-uk Kim adox $A0[1],$A0[1] 31357bded2dbSJung-uk Kim adcx $A1[1],%rbx 31367bded2dbSJung-uk Kim mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 31377bded2dbSJung-uk Kim mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 31387bded2dbSJung-uk Kim mov %rax,48($tptr) 31397bded2dbSJung-uk Kim mov 
%rbx,56($tptr) 31407bded2dbSJung-uk Kim lea 64($tptr),$tptr 31417bded2dbSJung-uk Kim nop 31427bded2dbSJung-uk Kim jmp .Lsqrx4x_shift_n_add 31437bded2dbSJung-uk Kim 31447bded2dbSJung-uk Kim.align 32 31457bded2dbSJung-uk Kim.Lsqrx4x_shift_n_add_break: 31467bded2dbSJung-uk Kim adcx $A1[1],%rbx 31477bded2dbSJung-uk Kim mov %rax,48($tptr) 31487bded2dbSJung-uk Kim mov %rbx,56($tptr) 31497bded2dbSJung-uk Kim lea 64($tptr),$tptr # end of t[] buffer 31507bded2dbSJung-uk Kim___ 31517bded2dbSJung-uk Kim} 31527bded2dbSJung-uk Kim###################################################################### 31537bded2dbSJung-uk Kim# Montgomery reduction part, "word-by-word" algorithm. 31547bded2dbSJung-uk Kim# 31557bded2dbSJung-uk Kim# This new path is inspired by multiple submissions from Intel, by 31567bded2dbSJung-uk Kim# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 31577bded2dbSJung-uk Kim# Vinodh Gopal... 31587bded2dbSJung-uk Kim{ 31597bded2dbSJung-uk Kimmy ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 31607bded2dbSJung-uk Kim 31617bded2dbSJung-uk Kim$code.=<<___; 31627bded2dbSJung-uk Kim movq %xmm2,$nptr 31634c6a0400SJung-uk Kim__bn_sqrx8x_reduction: 31647bded2dbSJung-uk Kim xor %eax,%eax # initial top-most carry bit 31657bded2dbSJung-uk Kim mov 32+8(%rsp),%rbx # n0 31667bded2dbSJung-uk Kim mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 31674c6a0400SJung-uk Kim lea -8*8($nptr,$num),%rcx # end of n[] 31687bded2dbSJung-uk Kim #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 31697bded2dbSJung-uk Kim mov %rcx, 0+8(%rsp) # save end of n[] 31707bded2dbSJung-uk Kim mov $tptr,8+8(%rsp) # save end of t[] 31717bded2dbSJung-uk Kim 31727bded2dbSJung-uk Kim lea 48+8(%rsp),$tptr # initial t[] window 31737bded2dbSJung-uk Kim jmp .Lsqrx8x_reduction_loop 31747bded2dbSJung-uk Kim 31757bded2dbSJung-uk Kim.align 32 31767bded2dbSJung-uk Kim.Lsqrx8x_reduction_loop: 31777bded2dbSJung-uk Kim mov 8*1($tptr),%r9 31787bded2dbSJung-uk Kim mov 8*2($tptr),%r10 31797bded2dbSJung-uk Kim mov 8*3($tptr),%r11 
31807bded2dbSJung-uk Kim mov 8*4($tptr),%r12 31817bded2dbSJung-uk Kim mov %rdx,%r8 31827bded2dbSJung-uk Kim imulq %rbx,%rdx # n0*a[i] 31837bded2dbSJung-uk Kim mov 8*5($tptr),%r13 31847bded2dbSJung-uk Kim mov 8*6($tptr),%r14 31857bded2dbSJung-uk Kim mov 8*7($tptr),%r15 31867bded2dbSJung-uk Kim mov %rax,24+8(%rsp) # store top-most carry bit 31877bded2dbSJung-uk Kim 31887bded2dbSJung-uk Kim lea 8*8($tptr),$tptr 31897bded2dbSJung-uk Kim xor $carry,$carry # cf=0,of=0 31907bded2dbSJung-uk Kim mov \$-8,%rcx 31917bded2dbSJung-uk Kim jmp .Lsqrx8x_reduce 31927bded2dbSJung-uk Kim 31937bded2dbSJung-uk Kim.align 32 31947bded2dbSJung-uk Kim.Lsqrx8x_reduce: 31957bded2dbSJung-uk Kim mov %r8, %rbx 31964c6a0400SJung-uk Kim mulx 8*0($nptr),%rax,%r8 # n[0] 31977bded2dbSJung-uk Kim adcx %rbx,%rax # discarded 31987bded2dbSJung-uk Kim adox %r9,%r8 31997bded2dbSJung-uk Kim 32004c6a0400SJung-uk Kim mulx 8*1($nptr),%rbx,%r9 # n[1] 32017bded2dbSJung-uk Kim adcx %rbx,%r8 32027bded2dbSJung-uk Kim adox %r10,%r9 32037bded2dbSJung-uk Kim 32044c6a0400SJung-uk Kim mulx 8*2($nptr),%rbx,%r10 32057bded2dbSJung-uk Kim adcx %rbx,%r9 32067bded2dbSJung-uk Kim adox %r11,%r10 32077bded2dbSJung-uk Kim 32084c6a0400SJung-uk Kim mulx 8*3($nptr),%rbx,%r11 32097bded2dbSJung-uk Kim adcx %rbx,%r10 32107bded2dbSJung-uk Kim adox %r12,%r11 32117bded2dbSJung-uk Kim 32124c6a0400SJung-uk Kim .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 32137bded2dbSJung-uk Kim mov %rdx,%rax 32147bded2dbSJung-uk Kim mov %r8,%rdx 32157bded2dbSJung-uk Kim adcx %rbx,%r11 32167bded2dbSJung-uk Kim adox %r13,%r12 32177bded2dbSJung-uk Kim 32187bded2dbSJung-uk Kim mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 32197bded2dbSJung-uk Kim mov %rax,%rdx 32207bded2dbSJung-uk Kim mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 32217bded2dbSJung-uk Kim 32224c6a0400SJung-uk Kim mulx 8*5($nptr),%rax,%r13 32237bded2dbSJung-uk Kim adcx %rax,%r12 32247bded2dbSJung-uk Kim adox %r14,%r13 32257bded2dbSJung-uk Kim 
32264c6a0400SJung-uk Kim mulx 8*6($nptr),%rax,%r14 32277bded2dbSJung-uk Kim adcx %rax,%r13 32287bded2dbSJung-uk Kim adox %r15,%r14 32297bded2dbSJung-uk Kim 32304c6a0400SJung-uk Kim mulx 8*7($nptr),%rax,%r15 32317bded2dbSJung-uk Kim mov %rbx,%rdx 32327bded2dbSJung-uk Kim adcx %rax,%r14 32337bded2dbSJung-uk Kim adox $carry,%r15 # $carry is 0 32347bded2dbSJung-uk Kim adcx $carry,%r15 # cf=0 32357bded2dbSJung-uk Kim 32367bded2dbSJung-uk Kim .byte 0x67,0x67,0x67 32377bded2dbSJung-uk Kim inc %rcx # of=0 32387bded2dbSJung-uk Kim jnz .Lsqrx8x_reduce 32397bded2dbSJung-uk Kim 32407bded2dbSJung-uk Kim mov $carry,%rax # xor %rax,%rax 32417bded2dbSJung-uk Kim cmp 0+8(%rsp),$nptr # end of n[]? 32427bded2dbSJung-uk Kim jae .Lsqrx8x_no_tail 32437bded2dbSJung-uk Kim 32447bded2dbSJung-uk Kim mov 48+8(%rsp),%rdx # pull n0*a[0] 32457bded2dbSJung-uk Kim add 8*0($tptr),%r8 32464c6a0400SJung-uk Kim lea 8*8($nptr),$nptr 32477bded2dbSJung-uk Kim mov \$-8,%rcx 32487bded2dbSJung-uk Kim adcx 8*1($tptr),%r9 32497bded2dbSJung-uk Kim adcx 8*2($tptr),%r10 32507bded2dbSJung-uk Kim adc 8*3($tptr),%r11 32517bded2dbSJung-uk Kim adc 8*4($tptr),%r12 32527bded2dbSJung-uk Kim adc 8*5($tptr),%r13 32537bded2dbSJung-uk Kim adc 8*6($tptr),%r14 32547bded2dbSJung-uk Kim adc 8*7($tptr),%r15 32557bded2dbSJung-uk Kim lea 8*8($tptr),$tptr 32567bded2dbSJung-uk Kim sbb %rax,%rax # top carry 32577bded2dbSJung-uk Kim 32587bded2dbSJung-uk Kim xor $carry,$carry # of=0, cf=0 32597bded2dbSJung-uk Kim mov %rax,16+8(%rsp) 32607bded2dbSJung-uk Kim jmp .Lsqrx8x_tail 32617bded2dbSJung-uk Kim 32627bded2dbSJung-uk Kim.align 32 32637bded2dbSJung-uk Kim.Lsqrx8x_tail: 32647bded2dbSJung-uk Kim mov %r8,%rbx 32654c6a0400SJung-uk Kim mulx 8*0($nptr),%rax,%r8 32667bded2dbSJung-uk Kim adcx %rax,%rbx 32677bded2dbSJung-uk Kim adox %r9,%r8 32687bded2dbSJung-uk Kim 32694c6a0400SJung-uk Kim mulx 8*1($nptr),%rax,%r9 32707bded2dbSJung-uk Kim adcx %rax,%r8 32717bded2dbSJung-uk Kim adox %r10,%r9 32727bded2dbSJung-uk Kim 32734c6a0400SJung-uk Kim 
mulx 8*2($nptr),%rax,%r10 32747bded2dbSJung-uk Kim adcx %rax,%r9 32757bded2dbSJung-uk Kim adox %r11,%r10 32767bded2dbSJung-uk Kim 32774c6a0400SJung-uk Kim mulx 8*3($nptr),%rax,%r11 32787bded2dbSJung-uk Kim adcx %rax,%r10 32797bded2dbSJung-uk Kim adox %r12,%r11 32807bded2dbSJung-uk Kim 32814c6a0400SJung-uk Kim .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 32827bded2dbSJung-uk Kim adcx %rax,%r11 32837bded2dbSJung-uk Kim adox %r13,%r12 32847bded2dbSJung-uk Kim 32854c6a0400SJung-uk Kim mulx 8*5($nptr),%rax,%r13 32867bded2dbSJung-uk Kim adcx %rax,%r12 32877bded2dbSJung-uk Kim adox %r14,%r13 32887bded2dbSJung-uk Kim 32894c6a0400SJung-uk Kim mulx 8*6($nptr),%rax,%r14 32907bded2dbSJung-uk Kim adcx %rax,%r13 32917bded2dbSJung-uk Kim adox %r15,%r14 32927bded2dbSJung-uk Kim 32934c6a0400SJung-uk Kim mulx 8*7($nptr),%rax,%r15 32947bded2dbSJung-uk Kim mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 32957bded2dbSJung-uk Kim adcx %rax,%r14 32967bded2dbSJung-uk Kim adox $carry,%r15 32977bded2dbSJung-uk Kim mov %rbx,($tptr,%rcx,8) # save result 32987bded2dbSJung-uk Kim mov %r8,%rbx 32997bded2dbSJung-uk Kim adcx $carry,%r15 # cf=0 33007bded2dbSJung-uk Kim 33017bded2dbSJung-uk Kim inc %rcx # of=0 33027bded2dbSJung-uk Kim jnz .Lsqrx8x_tail 33037bded2dbSJung-uk Kim 33047bded2dbSJung-uk Kim cmp 0+8(%rsp),$nptr # end of n[]? 
33057bded2dbSJung-uk Kim jae .Lsqrx8x_tail_done # break out of loop 33067bded2dbSJung-uk Kim 33077bded2dbSJung-uk Kim sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 33087bded2dbSJung-uk Kim mov 48+8(%rsp),%rdx # pull n0*a[0] 33094c6a0400SJung-uk Kim lea 8*8($nptr),$nptr 33107bded2dbSJung-uk Kim adc 8*0($tptr),%r8 33117bded2dbSJung-uk Kim adc 8*1($tptr),%r9 33127bded2dbSJung-uk Kim adc 8*2($tptr),%r10 33137bded2dbSJung-uk Kim adc 8*3($tptr),%r11 33147bded2dbSJung-uk Kim adc 8*4($tptr),%r12 33157bded2dbSJung-uk Kim adc 8*5($tptr),%r13 33167bded2dbSJung-uk Kim adc 8*6($tptr),%r14 33177bded2dbSJung-uk Kim adc 8*7($tptr),%r15 33187bded2dbSJung-uk Kim lea 8*8($tptr),$tptr 33197bded2dbSJung-uk Kim sbb %rax,%rax 33207bded2dbSJung-uk Kim sub \$8,%rcx # mov \$-8,%rcx 33217bded2dbSJung-uk Kim 33227bded2dbSJung-uk Kim xor $carry,$carry # of=0, cf=0 33237bded2dbSJung-uk Kim mov %rax,16+8(%rsp) 33247bded2dbSJung-uk Kim jmp .Lsqrx8x_tail 33257bded2dbSJung-uk Kim 33267bded2dbSJung-uk Kim.align 32 33277bded2dbSJung-uk Kim.Lsqrx8x_tail_done: 33286cf8931aSJung-uk Kim xor %rax,%rax 33297bded2dbSJung-uk Kim add 24+8(%rsp),%r8 # can this overflow? 
333080815a77SJung-uk Kim adc \$0,%r9 333180815a77SJung-uk Kim adc \$0,%r10 333280815a77SJung-uk Kim adc \$0,%r11 333380815a77SJung-uk Kim adc \$0,%r12 333480815a77SJung-uk Kim adc \$0,%r13 333580815a77SJung-uk Kim adc \$0,%r14 33366cf8931aSJung-uk Kim adc \$0,%r15 33376cf8931aSJung-uk Kim adc \$0,%rax 33387bded2dbSJung-uk Kim 33397bded2dbSJung-uk Kim sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 33407bded2dbSJung-uk Kim.Lsqrx8x_no_tail: # %cf is 0 if jumped here 33417bded2dbSJung-uk Kim adc 8*0($tptr),%r8 33427bded2dbSJung-uk Kim movq %xmm3,%rcx 33437bded2dbSJung-uk Kim adc 8*1($tptr),%r9 33444c6a0400SJung-uk Kim mov 8*7($nptr),$carry 33457bded2dbSJung-uk Kim movq %xmm2,$nptr # restore $nptr 33467bded2dbSJung-uk Kim adc 8*2($tptr),%r10 33477bded2dbSJung-uk Kim adc 8*3($tptr),%r11 33487bded2dbSJung-uk Kim adc 8*4($tptr),%r12 33497bded2dbSJung-uk Kim adc 8*5($tptr),%r13 33507bded2dbSJung-uk Kim adc 8*6($tptr),%r14 33517bded2dbSJung-uk Kim adc 8*7($tptr),%r15 33526cf8931aSJung-uk Kim adc \$0,%rax # top-most carry 33537bded2dbSJung-uk Kim 33547bded2dbSJung-uk Kim mov 32+8(%rsp),%rbx # n0 33557bded2dbSJung-uk Kim mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 33567bded2dbSJung-uk Kim 33577bded2dbSJung-uk Kim mov %r8,8*0($tptr) # store top 512 bits 33587bded2dbSJung-uk Kim lea 8*8($tptr),%r8 # borrow %r8 33597bded2dbSJung-uk Kim mov %r9,8*1($tptr) 33607bded2dbSJung-uk Kim mov %r10,8*2($tptr) 33617bded2dbSJung-uk Kim mov %r11,8*3($tptr) 33627bded2dbSJung-uk Kim mov %r12,8*4($tptr) 33637bded2dbSJung-uk Kim mov %r13,8*5($tptr) 33647bded2dbSJung-uk Kim mov %r14,8*6($tptr) 33657bded2dbSJung-uk Kim mov %r15,8*7($tptr) 33667bded2dbSJung-uk Kim 33677bded2dbSJung-uk Kim lea 8*8($tptr,%rcx),$tptr # start of current t[] window 33687bded2dbSJung-uk Kim cmp 8+8(%rsp),%r8 # end of t[]? 
# end of the Montgomery reduction loop
	jb	.Lsqrx8x_reduction_loop
	ret
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
# Post-condition, 4x unrolled
#
# __bn_postx4x_internal walks t[] and the modulus four limbs at a time
# and stores t[] plus a masked, negated modulus at the result pointer:
#
# - %rax is negated on entry and used as the mask for every limb, so an
#   incoming 1 becomes all-ones and an incoming 0 stays 0.
#   NOTE(review): %rax is presumably the 0/1 top-most carry left by the
#   reduction; confirm at the call sites.
# - each modulus limb is complemented and ANDed with the mask (andn);
#   n[0] is decremented first so that its complement is -n[0], which
#   supplies the "+1" of the two's complement negation.
# - the adc chain therefore yields t-n when the mask is all-ones and t
#   unchanged when it is zero, without branching on the mask.
# - the carry between iterations is parked in %r8 (sbb) and put back
#   into CF with neg.
# - %rcx arrives holding -$num (per the comments below) and, shifted
#   right by 3+2, becomes the loop counter that counts up to zero.
#   NOTE(review): 3+2 suggests -$num is in bytes (8-byte limbs, four
#   per iteration); confirm.
# - both $rptr and $aptr are reloaded from %xmm1; %r9 is negated on the
#   way out to hand $num back.
# $tptr and $aptr come from the enclosing scope.
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
$code.=<<___;
.align	32
__bn_postx4x_internal:
.cfi_startproc
	mov	8*0($nptr),%r12
	mov	%rcx,%r10		# -$num
	mov	%rcx,%r9		# -$num
	neg	%rax
	sar	\$3+2,%rcx
	#lea	48+8(%rsp,%r9),$tptr
	movq	%xmm1,$rptr		# restore $rptr
	movq	%xmm1,$aptr		# prepare for back-to-back call
	dec	%r12			# so that after 'not' we get -n[0]
	mov	8*1($nptr),%r13
	xor	%r8,%r8
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	mov	8*0($nptr),%r12
	mov	8*1($nptr),%r13
	mov	8*2($nptr),%r14
	mov	8*3($nptr),%r15
.Lsqrx4x_sub_entry:
	andn	%rax,%r12,%r12
	lea	8*4($nptr),$nptr
	andn	%rax,%r13,%r13
	andn	%rax,%r14,%r14
	andn	%rax,%r15,%r15

	neg	%r8			# mov %r8,%cf
	adc	8*0($tptr),%r12
	adc	8*1($tptr),%r13
	adc	8*2($tptr),%r14
	adc	8*3($tptr),%r15
	mov	%r12,8*0($rptr)
	lea	8*4($tptr),$tptr
	mov	%r13,8*1($rptr)
	sbb	%r8,%r8			# mov %cf,%r8
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr

	inc	%rcx
	jnz	.Lsqrx4x_sub

	neg	%r9			# restore $num

	ret
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
___
}
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
				("%rdi","%esi","%rdx","%ecx");  # Unix order
my $out=$inp;		# bn_gather5 writes through its first argument
my $STRIDE=2**5*8;	# bytes per table row: 32 slots of 8 bytes
my $N=$STRIDE/4;	# not referenced anywhere in this block

######################################################################
# Powers-table helpers.  All three are leaf routines that take their
# arguments in the registers chosen above.
#
# bn_get_bits5($inp,$num)
#	Returns in %eax the 5-bit field that starts $num bits into the
#	byte array at $inp.  The field is cut out of the 16-bit word
#	that holds bit $num.  When the offset inside that word is above
#	11 the field would run past bit 15, so the word is read one
#	byte further on and the shift count is lowered by 8.
#
# bn_scatter5($inp,$num,$tbl,$idx)
#	Copies $num 64-bit words from $inp into $tbl, starting at slot
#	$idx and advancing 32 slots (32*8 bytes) per word.  Returns
#	immediately when $num is 0.
#
# bn_gather5($out,$num,$tbl,$idx)
#	Reads back what bn_scatter5 stored at slot $idx.  For each of
#	the $num rows every one of the 32 slots is loaded from a fixed
#	offset, ANDed with a per-slot mask that is all-ones only for
#	slot $idx, and ORed into the result, so the addresses touched
#	do not depend on $idx.  0x108 bytes of stack, aligned down to
#	16, hold the 16 mask vectors; the caller's %rsp is kept in %r10
#	and restored before returning.
#
# The immediate in the "sub" comment below is escaped because the
# here-document interpolates and a bare dollar-zero is Perl's
# program-name variable.
#
$code.=<<___;
.globl	bn_get_bits5
.type	bn_get_bits5,\@abi-omnipotent
.align	16
bn_get_bits5:
.cfi_startproc
	lea	0($inp),%r10
	lea	1($inp),%r11
	mov	$num,%ecx
	shr	\$4,$num
	and	\$15,%ecx
	lea	-8(%ecx),%eax
	cmp	\$11,%ecx
	cmova	%r11,%r10
	cmova	%eax,%ecx
	movzw	(%r10,$num,2),%eax
	shrl	%cl,%eax
	and	\$31,%eax
	ret
.cfi_endproc
.size	bn_get_bits5,.-bn_get_bits5

.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
.cfi_startproc
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:	# Win64 thing, but harmless in other cases
.cfi_startproc
	# I can't trust assembler to use specific encoding:-(
	.byte	0x4c,0x8d,0x14,0x24		#lea    (%rsp),%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	\$0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
# Each 16-byte vector carries two slot numbers, each duplicated over two
# dwords, so one mask vector covers two 64-bit table slots.  %xmm4 keeps
# the constant 2,2,2,2 used to step to the next pair.  The store of
# %xmm3 is deferred by one round, hence the "if ($i)" guard here and the
# extra store after the loop.
#
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___	if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
# $i is deliberately not a "my" variable: the store below relies on the
# value it is left with when the loop above ends.
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# One table row per .Lgather pass: load all 16 vectors of the row, mask
# each with the matching vector saved on the stack and OR the survivors
# into %xmm4/%xmm5.
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
# Fold the two accumulators and then the two 64-bit halves together,
# store the selected word and step to the next row.
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
	ret
.LSEH_end_bn_gather5:
.cfi_endproc
.size	bn_gather5,.-bn_gather5
___
}
# .Linc holds the two seed vectors bn_gather5 loads for its mask
# generator: slot numbers 0,0,1,1 and the step 2,2,2,2.
$code.=<<___;
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# mul_handler is the Win64 language-specific handler named by the
# .xdata records further down.  It rebuilds the non-volatile registers
# of the interrupted frame inside *context and then lets
# RtlVirtualUnwind carry on:
#
# 1. HandlerData[0..2] are image-relative offsets of three labels in
#    the guarded function (end of prologue, start of body, epilogue);
#    each is rebased on disp->ImageBase and compared with context->Rip.
# 2. Rip before HandlerData[0]: go straight to .Lcommon_seh_tail with
#    %rax still holding context->Rax.
#    Rip before HandlerData[1]: restore the pushed registers relative
#    to context->Rax (.Lcommon_pop_regs).
#    Rip at or past HandlerData[2]: go to the tail with context->Rsp.
# 3. Otherwise the fault is in the body and the saved stack pointer is
#    fetched from the frame: at 8(%rax,num,8) when Rip is not above
#    .Lmul_epilogue, at 40(%rax) for the functions placed after it.
#    num is read from offset 192 of CONTEXT.
#    NOTE(review): offset 192 is presumably context->R9; confirm
#    against the CONTEXT layout.
# 4. %rbx, %rbp and %r12-%r15 are read from just below the recovered
#    stack pointer and written into *context; Rsp, Rsi and Rdi are
#    updated in the tail.
# 5. The patched context is copied to disp->ContextRecord as 154
#    quadwords, RtlVirtualUnwind is called and the handler returns
#    ExceptionContinueSearch.
#
# The .pdata section that follows lists one begin/end/info triplet per
# guarded function.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# beginning of body label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	.Lmul_epilogue(%rip),%r10
	cmp	%r10,%rbx
	ja	.Lbody_40

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs

.Lbody_40:
	mov	40(%rax),%rax		# pull saved stack pointer
.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_power5
	.rva	.LSEH_end_bn_power5
	.rva	.LSEH_info_bn_power5
___
# The MULX/ADX entry points are listed only when $addx is set.
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
	.rva	.LSEH_end_bn_mulx4x_mont_gather5
	.rva	.LSEH_info_bn_mulx4x_mont_gather5

	.rva	.LSEH_begin_bn_powerx5
	.rva	.LSEH_end_bn_powerx5
	.rva	.LSEH_info_bn_powerx5
___
# .xdata: one record per guarded function.  Each names mul_handler and
# passes the three labels the handler reads as HandlerData[0..2].
# NOTE(review): the leading ".byte 9,0,0,0" is presumably an unwind-info
# header with version 1 and the exception-handler flag; confirm against
# the Win64 unwind-data format.
$code.=<<___;
	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
# Unwind information for bn_gather5.  Unlike the .LSEH_info_* records
# emitted just before it, which consist of `.byte 9,0,0,0` followed by an
# .rva to mul_handler and a HandlerData[] list, this record is written out
# as raw bytes; the trailing comments on the .byte lines name the prologue
# instructions those bytes describe.
$code.=<<___;
.align 8
.LSEH_info_bn_gather5:
	.byte 0x01,0x0b,0x03,0x0a
	.byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
	.byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
.align 8
___
# End of the enclosing block; its opening brace lies earlier in the file,
# outside this section.
}

# Expand every `...` span in the accumulated text: whatever sits between a
# pair of backticks is handed to eval() as Perl code and the span is
# replaced by the result (/e), for all occurrences (/g).  The pattern has
# no ^ or $ anchors, so /m does not change what it matches.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Emit the finished text.  Near the top of the file STDOUT is aliased to
# the pipe that feeds the x86_64-xlate.pl translator, so this is what the
# translator receives.
print $code;
# Check the close: buffered write errors only surface here, and for a
# piped handle close also returns false when the child exits non-zero.
close STDOUT or die "error closing STDOUT: $!";