# xref: /freebsd/sys/crypto/skein/amd64/skein_block_asm.S (revision 031beb4e239bfce798af17f5fe8dba8bcaf13d99)
#
#----------------------------------------------------------------
# 64-bit x86 assembler code (gnu as) for Skein block functions
#
# Author: Doug Whiting, Hifn/Exar
#
# This code is released to the public domain.
#----------------------------------------------------------------
#
#----------------------------------------------------------------
# Build-time configuration.
#
# Symbols produced here and consumed by the rest of the file:
#   _USE_ASM_          bit mask (256/512/1024) of block sizes to assemble
#   SKEIN_UNROLL_nnn   per-size unroll count, 0 means fully unrolled
#   SKEIN_ASM_UNROLL   bit mask of the block sizes that are fully unrolled
#   ROUNDS_nnn         number of rounds per block size
#   _SKEIN_CODE_SIZE   1 when SKEIN_CODE_SIZE or SKEIN_PERF is defined
#   _SKEIN_DEBUG       1 when SKEIN_DEBUG is defined
#----------------------------------------------------------------
    .text
    .altmacro                               #enables the %expr argument evaluation used below
#ifndef __clang__
    .psize 0,128                            #list file has no page boundaries
#endif
#
_MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
_MAX_FRAME_ =  240
#
#################
#ifndef SKEIN_USE_ASM
_USE_ASM_         = _MASK_ALL_
#else
_USE_ASM_         = SKEIN_USE_ASM
#endif
#################
#configure loop unrolling
#   _SKEIN_LOOP is read as three decimal digits: hundreds -> 256, tens -> 512, units -> 1024
#ifndef SKEIN_LOOP
_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
#else
_SKEIN_LOOP       = SKEIN_LOOP
  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
#.print  "+++ SKEIN_LOOP = \_NN_"
  .endr
#endif
# the unroll counts (0 --> fully unrolled)
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
#
#accumulate the bit mask of fully unrolled block sizes
SKEIN_ASM_UNROLL  = 0
  .irp _NN_,256,512,1024
    .if (SKEIN_UNROLL_\_NN_) == 0
SKEIN_ASM_UNROLL  = (SKEIN_ASM_UNROLL) + \_NN_
    .endif
  .endr
#################
#
#round counts: SKEIN_ROUNDS, when defined, is read one decimal digit per block size
.ifndef SKEIN_ROUNDS
ROUNDS_256  =   72
ROUNDS_512  =   72
ROUNDS_1024 =   80
.else
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
# only display rounds if default size is changed on command line
.irp _NN_,256,512,1024
  .if _USE_ASM_ & \_NN_
    .irp _RR_,%(ROUNDS_\_NN_)
      .if \_NN_ < 1024
.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
      .else
.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .endif
    .endr
  .endif
.endr
.endif
#################
#
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE = (1)
.else
.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
_SKEIN_CODE_SIZE = (1)
.else
_SKEIN_CODE_SIZE = (0)
.endif
.endif
#
#################
#
.ifndef SKEIN_DEBUG
_SKEIN_DEBUG      = 0
.else
_SKEIN_DEBUG      = 1
.endif
#################
#
# byte offsets of the fields in the hash context structure
#
HASH_BITS   =   0                   #bits of hash output
BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
TWEAK       =   8 + BCNT            #tweak values[0..1]
X_VARS      =  16 + TWEAK           #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
FIRST_MASK  =   ~ (1 <<  6)         #all ones except bit 6
FIRST_MASK64=   ~ (1 << 62)         #all ones except bit 62; ANDed with T[1] after each block
#
# rotation constants for Skein
#
# Naming: RC_blockBits_roundNum_mixNum
#   blockBits = 256, 512 or 1024
#   roundNum  = round number mod 8 (the table repeats every 8 rounds)
#   mixNum    = index of the mix operation within the round
# The RotL64 macro looks these symbols up by name.
#
RC_256_0_0  = 14
RC_256_0_1  = 16

RC_256_1_0  = 52
RC_256_1_1  = 57

RC_256_2_0  = 23
RC_256_2_1  = 40

RC_256_3_0  =  5
RC_256_3_1  = 37

RC_256_4_0  = 25
RC_256_4_1  = 33

RC_256_5_0  = 46
RC_256_5_1  = 12

RC_256_6_0  = 58
RC_256_6_1  = 22

RC_256_7_0  = 32
RC_256_7_1  = 32

RC_512_0_0  = 46
RC_512_0_1  = 36
RC_512_0_2  = 19
RC_512_0_3  = 37

RC_512_1_0  = 33
RC_512_1_1  = 27
RC_512_1_2  = 14
RC_512_1_3  = 42

RC_512_2_0  = 17
RC_512_2_1  = 49
RC_512_2_2  = 36
RC_512_2_3  = 39

RC_512_3_0  = 44
RC_512_3_1  =  9
RC_512_3_2  = 54
RC_512_3_3  = 56

RC_512_4_0  = 39
RC_512_4_1  = 30
RC_512_4_2  = 34
RC_512_4_3  = 24

RC_512_5_0  = 13
RC_512_5_1  = 50
RC_512_5_2  = 10
RC_512_5_3  = 17

RC_512_6_0  = 25
RC_512_6_1  = 29
RC_512_6_2  = 39
RC_512_6_3  = 43

RC_512_7_0  =  8
RC_512_7_1  = 35
RC_512_7_2  = 56
RC_512_7_3  = 22

RC_1024_0_0 = 24
RC_1024_0_1 = 13
RC_1024_0_2 =  8
RC_1024_0_3 = 47
RC_1024_0_4 =  8
RC_1024_0_5 = 17
RC_1024_0_6 = 22
RC_1024_0_7 = 37

RC_1024_1_0 = 38
RC_1024_1_1 = 19
RC_1024_1_2 = 10
RC_1024_1_3 = 55
RC_1024_1_4 = 49
RC_1024_1_5 = 18
RC_1024_1_6 = 23
RC_1024_1_7 = 52

RC_1024_2_0 = 33
RC_1024_2_1 =  4
RC_1024_2_2 = 51
RC_1024_2_3 = 13
RC_1024_2_4 = 34
RC_1024_2_5 = 41
RC_1024_2_6 = 59
RC_1024_2_7 = 17

RC_1024_3_0 =  5
RC_1024_3_1 = 20
RC_1024_3_2 = 48
RC_1024_3_3 = 41
RC_1024_3_4 = 47
RC_1024_3_5 = 28
RC_1024_3_6 = 16
RC_1024_3_7 = 25

RC_1024_4_0 = 41
RC_1024_4_1 =  9
RC_1024_4_2 = 37
RC_1024_4_3 = 31
RC_1024_4_4 = 12
RC_1024_4_5 = 47
RC_1024_4_6 = 44
RC_1024_4_7 = 30

RC_1024_5_0 = 16
RC_1024_5_1 = 34
RC_1024_5_2 = 56
RC_1024_5_3 = 51
RC_1024_5_4 =  4
RC_1024_5_5 = 53
RC_1024_5_6 = 42
RC_1024_5_7 = 41

RC_1024_6_0 = 31
RC_1024_6_1 = 44
RC_1024_6_2 = 47
RC_1024_6_3 = 46
RC_1024_6_4 = 19
RC_1024_6_5 = 42
RC_1024_6_6 = 44
RC_1024_6_7 = 25

RC_1024_7_0 =  9
RC_1024_7_1 = 48
RC_1024_7_2 = 35
RC_1024_7_3 = 52
RC_1024_7_4 = 23
RC_1024_7_5 = 31
RC_1024_7_6 = 37
RC_1024_7_7 = 20
#
# MACRO: RotL64 -- rotate a 64-bit register left by a Skein rotation constant
#
#  Input:  reg       = register name, without the percent sign
#          BLK_SIZE  = 256/512/1024
#          ROUND_NUM = round number mod 8
#          MIX_NUM   = mix index within the round
# Output: <reg> <<< RC_BlkSize_roundNum_mixNum
#
# No instruction is emitted when the selected constant is zero.
# rolq modifies the flags.
#
.macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
  .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM  #is there anything to do?
    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
  .endif
.endm
#
#----------------------------------------------------------------
#
# MACROS: define local vars and configure stack
#
#----------------------------------------------------------------
# MACRO: StackVar -- declare allocated space on the stack
#
#   localName = symbol to define; it receives the current value of _STK_OFFS_
#   localSize = size in bytes; _STK_OFFS_ is advanced by this amount
#
# Emits no instructions. _STK_OFFS_ must already be defined by the user of this macro.
#
.macro StackVar localName,localSize
\localName  =   _STK_OFFS_
_STK_OFFS_  =   _STK_OFFS_+(\localSize)
.endm #StackVar
#
#----------------------------------------------------------------
#
# MACRO: Setup_Stack -- configure stack frame, allocate local vars
#
#   BLK_BITS = block size in bits (256/512/1024); sets WCNT = BLK_BITS/64
#   KS_CNT   = number of 16-byte key rotation slots (ksRot), only allocated
#              when this block size is NOT fully unrolled
#   debugCnt = optional count of 8-byte debug slots (only when _SKEIN_DEBUG)
#
# Emitted code:
#   * pushes rbp,rbx,r12,r13,r14,r15
#   * subtracts LOCAL_SIZE from rsp
#   * points rbp at rsp+FRAME_OFFS
#   * stores rdi,rsi,rdx,rcx into ctxPtr,blkPtr,blkCnt,bitAdd
#
# Symbols defined: WCNT, _PushCnt_, _STK_OFFS_, the StackVar names below,
#   LOCAL_SIZE, FRAME_OFFS, F_O and the __STK_*_nnn listing aids.
#   All StackVar names are offsets from rsp; add F_O to address them via rbp.
#
# Must be paired with Reset_Stack, which relies on LOCAL_SIZE and _PushCnt_.
#
.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
    WCNT    =    (\BLK_BITS)/64
#
_PushCnt_   =   0                   #save nonvolatile regs on stack
  .irp _reg_,rbp,rbx,r12,r13,r14,r15
       pushq    %\_reg_
_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
  .endr
#
_STK_OFFS_  =   0                   #starting offset from rsp
    #---- local  variables         #<-- rsp
    StackVar    X_stk  ,8*(WCNT)    #local context vars
    StackVar    ksTwk  ,8*3         #key schedule: tweak words
    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
  .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0
    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
  .endif
    StackVar    Wcopy  ,8*(WCNT)    #copy of input block
  .if _SKEIN_DEBUG
  .if \debugCnt + 0                 #temp location for debug X[] info
    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
  .endif
  .endif
    #NOTE(review): both terms below are multiples of 8, so this test looks always true
    #and the pad slot is always allocated; tmpStk_nnn is only defined inside this
    #branch -- confirm its users before changing the condition
  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
tmpStk_\BLK_BITS = align16          #use this
  .endif
    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
    StackVar    ctxPtr ,8           #context ptr
    StackVar    blkPtr ,8           #pointer to block data
    StackVar    blkCnt ,8           #number of full blocks to process
    StackVar    bitAdd ,8           #bit count to add to tweak
LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
    #----
    StackVar    savRegs,8*_PushCnt_ #saved registers
    StackVar    retAddr,8           #return address
    #---- stack frame of the caller (aligned mod 16)
#
# set up the stack frame pointer (rbp)
#
FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
FRAME_OFFS  =      _STK_OFFS_
  .endif
F_O         =   -FRAME_OFFS
#
  #put some useful defines in the .lst file (for grep)
__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
#
# Notes on stack frame setup:
#   * the most frequently used variable is X_stk[], based at [rsp+0]
#   * the next most used is the key schedule arrays, ksKey and ksTwk
#       so rbp is "centered" there, allowing short offsets to the key
#       schedule even in 1024-bit Skein case
#   * the Wcopy variables are infrequently accessed, but they have long
#       offsets from both rsp and rbp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed
#       with short offsets, except in the 1024-bit case
#
    subq    $LOCAL_SIZE,%rsp        #make room for the locals
    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
    movq    %rdi, ctxPtr+F_O(%rbp)  #save the four call parameters on the stack
    movq    %rsi, blkPtr+F_O(%rbp)
    movq    %rdx, blkCnt+F_O(%rbp)
    movq    %rcx, bitAdd+F_O(%rbp)
#
.endm #Setup_Stack
#
#----------------------------------------------------------------
#
# MACRO: Reset_Stack -- undo Setup_Stack
#
# Releases LOCAL_SIZE bytes of locals, then pops r15,r14,r13,r12,rbx,rbp
# (the reverse of the push order in Setup_Stack). _PushCnt_ is counted back
# down and an assembly-time error is raised if it does not reach zero.
# The locals are not cleared before being released.
#
.macro Reset_Stack
    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe?)
  .irp _reg_,r15,r14,r13,r12,rbx,rbp
    popq    %\_reg_                 #restore the saved nonvolatile regs
_PushCnt_ = _PushCnt_ - 1
  .endr
  .if _PushCnt_
    .error  "Mismatched push/pops?"
  .endif
.endm # Reset_Stack
#
#----------------------------------------------------------------
# macros to help debug internals
#
# With _SKEIN_DEBUG set, Skein_Debug_Block and Skein_Debug_Round emit calls to
# debug helpers; otherwise both are defined as empty macros so call sites
# need no conditionals of their own.
#
.if _SKEIN_DEBUG
    .extern  Skein_Show_Block     #calls to C routines
    .extern  Skein_Show_Round
#
#round "numbers" at or above SKEIN_RND_SPECIAL mark special events, not real rounds
SKEIN_RND_SPECIAL       =   1000
SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
#
# MACRO: Skein_Debug_Block -- dump the block state through Skein_Show_Block
#   BLK_BITS = block size in bits, passed as the first call argument
# Saves and restores rax,rcx,rdx,rsi,rdi,r8..r11 around the call.
# Reads ctxPtr, blkPtr, Wcopy, ksKey and ksTwk through rbp, so it must be
# used inside a frame built by Setup_Stack.
#
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#                     const u08b_t *blkPtr, const u64b_t *wPtr,
#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
#
_NN_ = 0
  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
    pushq   %\_reg_                 #save all volatile regs on stack before the call
_NN_ = _NN_ + 1
  .endr
    # get and push call parameters
    movq    $\BLK_BITS      ,%rdi   #bits
    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
    leaq    X_VARS    (%rsi),%rdx   #X (pointer)
    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
    leaq    ksKey +F_O(%rbp),%r9    #key pointer
    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
    pushq   %rax                    #   (pass on the stack)
    call    Skein_Show_Block        #call external debug handler
    addq    $8*1,%rsp               #discard parameters on stack
  .if (_NN_ % 2 ) == 0              #check stack alignment
    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
  .endif
  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
    popq    %\_reg_                 #restore regs
_NN_ = _NN_ - 1
  .endr
  .if _NN_
    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
  .endif
.endm # Skein_Debug_Block
#
# MACRO: Skein_Debug_Round -- the macro to "call" to debug a round
#   BLK_BITS = block size in bits; selects the local Skein_Debug_Round_nnn routine
#   R        = round number, or one of the SKEIN_RND_* special values
#   RDI_OFFS = optional extra offset added to the computed round number
#   afterOp  = optional statement emitted after the call
# rdx carries the round number into the call and is restored afterwards.
# When the code is looping (block size not fully unrolled) and R is not a special
# value, the round number is computed from the loop index (rdi, or the rIdx stack
# slot for 1024) as 4*index + 1 + ((R-1) & 3) + RDI_OFFS.
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
    # call the appropriate (local) debug "function"
    pushq   %rdx                    #save rdx, so we can use it for round "number"
  .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movq    $\R,%rdx
  .else                             #compute round number using rdi
_rOffs_ = \RDI_OFFS + 0
   .if \BLK_BITS == 1024
    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
   .else
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx
   .endif
  .endif
    call    Skein_Debug_Round_\BLK_BITS
    popq    %rdx                    #restore original rdx value
#
    \afterOp
.endm  #  Skein_Debug_Round
.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
.macro Skein_Debug_Block BLK_BITS
.endm
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
.endm
#
.endif # _SKEIN_DEBUG
#
#----------------------------------------------------------------
#
# MACRO: addReg -- dstReg += source register (+ optional immediate)
#
#   dstReg             = destination register name, without the percent sign
#   srcReg_A,srcReg_B  = the two halves are pasted together to form the source
#                        register name (e.g. "r" and a computed number), so a
#                        plain name can be given in srcReg_A alone
#   useAddOp           = nonzero forces addq; zero/omitted selects leaq
#                        unless ASM_NO_LEA is defined
#   immOffs            = optional constant; when nonzero a three-input leaq is
#                        used and useAddOp is ignored
#
# leaq leaves the flags untouched, addq does not; callers must not depend on
# the flags after this macro.
#
.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
  .if \immOffs + 0
       leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  .elseif ((\useAddOp + 0) == 0)
    .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
       leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
    .else
       addq    %\srcReg_A\srcReg_B,%\dstReg
    .endif
  .else
       addq    %\srcReg_A\srcReg_B,%\dstReg
  .endif
.endm

# MACRO: xorReg -- dstReg ^= source register
#   Arguments are in destination-first (Intel-style) order, to match addReg.
#   srcReg_A and srcReg_B are pasted together to form the source register name.
.macro  xorReg dstReg,srcReg_A,srcReg_B
        xorq   %\srcReg_A\srcReg_B,%\dstReg
.endm
#
#----------------------------------------------------------------
#
# MACRO: C_label -- define and export a C-callable entry point
#   lName = symbol name as seen from C
# The label is emitted twice, with and without a leading underscore, and
# both spellings are made global.
#
.macro C_label lName
 \lName:        #use both "genders" to work across linkage conventions
_\lName:
    .global  \lName
    .global _\lName
.endm
#
#=================================== Skein_256 =============================================
#
.if _USE_ASM_ & 256                  #(the matching .endif lies beyond this function)
#
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
#
# In:    rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd
#        (saved to the stack frame by Setup_Stack)
# Out:   ctx X[0..3] and T[0..1] updated in place; no value is returned in rax
# Saves: rbp,rbx,r12..r15 (pushed by Setup_Stack, popped by Reset_Stack)
# Note:  blkCnt is decremented before it is tested, so it must be nonzero on entry.
#
#################
#
# code
#
C_label Skein_256_Process_Block
    Setup_Stack 256,((ROUNDS_256/8)+1)
    movq    TWEAK+8(%rdi),%r14       #r14 = T[1]; stays in r14 across blocks
    jmp     Skein_256_block_loop
    .p2align 4
    # main hash loop for Skein_256
Skein_256_block_loop:
    #
    # general register usage:
    #   RAX..RDX        = X0..X3
    #   R08..R12        = ks[0..4]
    #   R13..R15        = ts[0..2]
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK+0(%rdi)     ,%r13
    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
    movq    %r14              ,%r15
    xorq    %r13              ,%r15  #now %r13.%r15 is set as the tweak

    movq    $KW_PARITY        ,%r12
    movq       X_VARS+ 0(%rdi),%r8
    movq       X_VARS+ 8(%rdi),%r9
    movq       X_VARS+16(%rdi),%r10
    movq       X_VARS+24(%rdi),%r11
    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
    xorq    %r8               ,%r12  #start accumulating overall parity

    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
    xorq    %r9               ,%r12
    movq     0(%rsi)          ,%rax  #get X[0..3]
    xorq    %r10              ,%r12
    movq     8(%rsi)          ,%rbx
    xorq    %r11              ,%r12
    movq    16(%rsi)          ,%rcx
    movq    24(%rsi)          ,%rdx

    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    addq    %r8 ,%rax                #initial key injection
    addq    %r9 ,%rbx
    addq    %r10,%rcx
    addq    %r11,%rdx
    addq    %r13,%rbx
    addq    %r14,%rcx

.if _SKEIN_DEBUG
    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+ 0+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)

    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
    movq    %rbx,X_stk + 8(%rsp)
    movq    %rcx,X_stk +16(%rsp)
    movq    %rdx,X_stk +24(%rsp)

    Skein_Debug_Block 256            #debug dump
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
.endif
#
.if (((SKEIN_ASM_UNROLL) & 256) == 0)
    #The looping code reads key words from slot rdi+1 upward and appends each
    #consumed word at slot rdi+6 (tweak: reads from rdi+1, appends at rdi+4).
    #Word 0 is therefore stored one full rotation ahead: key slot 5, tweak slot 3.
    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+24+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)
.endif
    addq    $WCNT*8,%rsi             #skip the block
    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 256
_UNROLL_CNT =   ROUNDS_256/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_256
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256"
  .endif
    xorq    %rdi,%rdi                #rdi = iteration count
Skein_256_round_loop:
.endif
_Rbase_ = 0
    #each .rept pass below is four rounds followed by one key injection
.rept _UNROLL_CNT*2
    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
    # round 4*_RBase_ + 0
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
    addReg  rcx, rdx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
    xorReg  rdx, rcx
  .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+3) % 5)
    .irp _r1_,%(13+(_Rbase_+2) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
    .endr
    .endr
  .endif
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+1)

    # round 4*_Rbase_ + 1
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
    xorReg  rdx, rax
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
                .endif
    addReg  rcx, rbx
    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
    xorReg  rbx, rcx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+2)
 .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+2) % 5)
    .irp _r1_,%(13+(_Rbase_+1) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
    .endr
    .endr
 .endif
    # round 4*_Rbase_ + 2
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
    addReg  rcx, rdx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
    xorReg  rdx, rcx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
                    leaq 1(%r11,%rdi),%r11               #precompute key + tweak
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+3)
    # round 4*_Rbase_ + 3
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
    addReg  rcx, rbx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
                .endif
    xorReg  rdx, rax
    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
    xorReg  rbx, rcx
    Skein_Debug_Round 256,%(4*_Rbase_+4)
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    addReg r9 ,r13           #precompute key+tweak
                .endif
      #inject key schedule words
_Rbase_ = _Rbase_+1
  .if (SKEIN_ASM_UNROLL) & 256
    addReg    rax,r,%(8+((_Rbase_+0) % 5))
    addReg    rbx,rsi
    addReg    rcx,rdi
    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
  .else
    incq      %rdi
    addReg    rax,r8
    addReg    rcx,r10
    addReg    rbx,r9
    addReg    rdx,r11
  .endif
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
.endr #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 256) == 0
    cmpq    $2*(ROUNDS_256/8),%rdi   #rdi counts key injections: two per 8 rounds
    jb      Skein_256_round_loop
.endif # (SKEIN_ASM_UNROLL & 256) == 0
    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context

    #----------------------------
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
    movq    $FIRST_MASK64 ,%r14
    xorq    Wcopy + 0+F_O (%rbp),%rax
    xorq    Wcopy + 8+F_O (%rbp),%rbx
    xorq    Wcopy +16+F_O (%rbp),%rcx
    xorq    Wcopy +24+F_O (%rbp),%rdx
    andq    TWEAK + 8     (%rdi),%r14        #r14 = T[1] with bit 62 cleared
    movq    %rax,X_VARS+ 0(%rdi)             #store final result
    movq    %rbx,X_VARS+ 8(%rdi)
    movq    %rcx,X_VARS+16(%rdi)
    movq    %rdx,X_VARS+24(%rdi)

    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_256_block_loop
    movq    %r14,TWEAK + 8(%rdi)             #write the masked T[1] back after the last block
    Reset_Stack
    ret
Skein_256_Process_Block_End:
68224ed6f55SEd Maste
68324ed6f55SEd Maste  .if _SKEIN_DEBUG
#
# Skein_Debug_Round_256: debug-only helper (assembled only when _SKEIN_DEBUG is nonzero).
#   Copies the four working values (rax, rbx, rcx and the saved copy of rdx) into the
#   X_stk slots, loads rsi with the saved context pointer and rdi with 256, then jumps
#   to Skein_Debug_Round_Common with rdx still holding the round number.
#
68424ed6f55SEd MasteSkein_Debug_Round_256:               #here with rdx == round "number" from macro
68524ed6f55SEd Maste    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
68624ed6f55SEd Maste    pushq   %rdi
68724ed6f55SEd Maste    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi; 24 = two pushes above + (presumably) the return address
68824ed6f55SEd Maste    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
68924ed6f55SEd Maste    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(addressed via rbp + F_O since rsp has changed!)
69024ed6f55SEd Maste    movq    %rcx,X_stk+16+F_O(%rbp)
69124ed6f55SEd Maste    movq    %rdi,X_stk+24+F_O(%rbp)  #rdi holds the original rdx value here
69224ed6f55SEd Maste
69324ed6f55SEd Maste    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
69424ed6f55SEd Maste    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
69524ed6f55SEd Maste    jmp     Skein_Debug_Round_Common #shared tail; the saved rsi/rdi are still on the stack at this point
69624ed6f55SEd Maste  .endif
69724ed6f55SEd Maste#
69824ed6f55SEd Maste.if _SKEIN_CODE_SIZE
#
# Optional introspection entry points (assembled only when _SKEIN_CODE_SIZE is nonzero).
# Neither routine reads any argument register; each returns its result in rax.
#
# Skein_256_Process_Block_CodeSize:
#   returns the distance in bytes between the Skein_256_Process_Block label and the
#   Skein_256_Process_Block_End label (an assemble-time constant).
69924ed6f55SEd MasteC_label  Skein_256_Process_Block_CodeSize
70024ed6f55SEd Maste    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
70124ed6f55SEd Maste    ret
70224ed6f55SEd Maste#
# Skein_256_Unroll_Cnt:
#   returns _UNROLL_CNT as it was assembled, or 0 when _UNROLL_CNT equals
#   ROUNDS_256/8 (0 is the value this file uses to mean fully unrolled).
70324ed6f55SEd MasteC_label Skein_256_Unroll_Cnt
70424ed6f55SEd Maste  .if _UNROLL_CNT <> ROUNDS_256/8
70524ed6f55SEd Maste    movq    $_UNROLL_CNT,%rax
70624ed6f55SEd Maste  .else
70724ed6f55SEd Maste    xorq    %rax,%rax                #report 0 for the fully unrolled build
70824ed6f55SEd Maste  .endif
70924ed6f55SEd Maste    ret
71024ed6f55SEd Maste.endif
71124ed6f55SEd Maste#
71224ed6f55SEd Maste.endif #_USE_ASM_ & 256
71324ed6f55SEd Maste#
71424ed6f55SEd Maste#=================================== Skein_512 =============================================
71524ed6f55SEd Maste#
71624ed6f55SEd Maste.if _USE_ASM_ & 512
71724ed6f55SEd Maste#
71824ed6f55SEd Maste# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
71924ed6f55SEd Maste#
72024ed6f55SEd Maste# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
72124ed6f55SEd Maste#
72224ed6f55SEd Maste#################
72324ed6f55SEd Maste# MACRO: one round for 512-bit blocks
#   rn0..rn7 : numbers N of the rN registers, consumed as four (sum, rotate) pairs:
#              (rn0,rn1) (rn2,rn3) (rn4,rn5) (rn6,rn7)
#   _Rn_     : round number; (_Rn_ mod 8) and the pair index 0..3 are handed to RotL64
#   op1..op4 : optional instruction emitted after each pair; callers use these slots to
#              interleave key schedule loads and stores with the round arithmetic
#   Each pair is run through addReg, RotL64 and xorReg. Those helper macros are defined
#   outside this section; presumably they perform the add, rotate-left and xor of one mix.
72424ed6f55SEd Maste#
72524ed6f55SEd Maste.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
72624ed6f55SEd Maste#
72724ed6f55SEd Maste    addReg      r\rn0, r\rn1
72824ed6f55SEd Maste    RotL64      r\rn1, 512,%((\_Rn_) % 8),0
72924ed6f55SEd Maste    xorReg      r\rn1, r\rn0
73024ed6f55SEd Maste            \op1
73124ed6f55SEd Maste    addReg      r\rn2, r\rn3
73224ed6f55SEd Maste    RotL64      r\rn3, 512,%((\_Rn_) % 8),1
73324ed6f55SEd Maste    xorReg      r\rn3, r\rn2
73424ed6f55SEd Maste            \op2
73524ed6f55SEd Maste    addReg      r\rn4, r\rn5
73624ed6f55SEd Maste    RotL64      r\rn5, 512,%((\_Rn_) % 8),2
73724ed6f55SEd Maste    xorReg      r\rn5, r\rn4
73824ed6f55SEd Maste            \op3
73924ed6f55SEd Maste    addReg      r\rn6, r\rn7
74024ed6f55SEd Maste    RotL64      r\rn7, 512,%((\_Rn_) % 8),3
74124ed6f55SEd Maste    xorReg      r\rn7, r\rn6
74224ed6f55SEd Maste            \op4
74324ed6f55SEd Maste    Skein_Debug_Round 512,%(\_Rn_+1),-4
74424ed6f55SEd Maste#
74524ed6f55SEd Maste.endm #R_512_OneRound
74624ed6f55SEd Maste#
74724ed6f55SEd Maste#################
74824ed6f55SEd Maste# MACRO: four rounds for 512-bit blocks, followed by one key injection
#   _RR_ : base round number of the group (the instantiation passes 4*_Rbase_).
#   Two variants are assembled depending on bit 512 of SKEIN_ASM_UNROLL:
#     set   - fully unrolled: key/tweak words are addressed with assemble-time
#             indices derived from the injection counter _II_
#     clear - looping: rdi is the run-time injection counter and indexes the
#             key/tweak schedule, which is extended on the stack as the loop advances
#   In both variants rax,rbx,rcx,rdx,rsi are used as temporaries for schedule words
#   that are then added into r11..r15, while r8..r10 take their words from memory.
74924ed6f55SEd Maste#
75024ed6f55SEd Maste.macro R_512_FourRounds _RR_    #RR = base round number (0 mod 4)
751cffe0e0fSAdrian Chadd  .if ((SKEIN_ASM_UNROLL) & 512)
75224ed6f55SEd Maste    # here for fully unrolled case.
75324ed6f55SEd Maste    _II_ = ((\_RR_)/4) + 1       #key injection counter
75424ed6f55SEd Maste    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
75524ed6f55SEd Maste    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
75624ed6f55SEd Maste    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
75724ed6f55SEd Maste    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
75824ed6f55SEd Maste    # inject the key schedule
75924ed6f55SEd Maste    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
76024ed6f55SEd Maste    addReg   r11, rax
76124ed6f55SEd Maste    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
76224ed6f55SEd Maste    addReg   r12, rbx
76324ed6f55SEd Maste    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
76424ed6f55SEd Maste    addReg   r13, rcx
76524ed6f55SEd Maste    addReg   r14, rdx
76624ed6f55SEd Maste    addReg   r15, rsi,,,(_II_)
76724ed6f55SEd Maste  .else
76824ed6f55SEd Maste    # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
    # The op2/op3 slots of the first two rounds copy ksTwk[rdi-1] to ksTwk[rdi+2] and
    # ksKey[rdi-1] to ksKey[rdi+8], so the schedule can be indexed linearly by rdi.
76924ed6f55SEd Maste    incq    %rdi                 #bump key injection counter
77024ed6f55SEd Maste    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
77124ed6f55SEd Maste    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
77224ed6f55SEd Maste    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
77324ed6f55SEd Maste    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
77424ed6f55SEd Maste    # inject the key schedule
77524ed6f55SEd Maste    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
77624ed6f55SEd Maste    addReg   r11, rax
77724ed6f55SEd Maste    addReg   r12, rbx
77824ed6f55SEd Maste    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
77924ed6f55SEd Maste    addReg   r13, rcx
78024ed6f55SEd Maste    addReg   r14, rdx
78124ed6f55SEd Maste    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
78224ed6f55SEd Maste    addReg   r15, rsi
78324ed6f55SEd Maste    addReg   r15, rdi              #inject the round number
78424ed6f55SEd Maste  .endif
78524ed6f55SEd Maste
78624ed6f55SEd Maste    #show the result of the key injection
78724ed6f55SEd Maste    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
78824ed6f55SEd Maste.endm #R_512_FourRounds
78924ed6f55SEd Maste#
79024ed6f55SEd Maste#################
79124ed6f55SEd Maste# instantiated code
79224ed6f55SEd Maste#
#
# Skein_512_Process_Block: process blkCnt consecutive 512-bit input blocks.
#   Per block: T[0] is advanced by bitAdd, the key and tweak schedules are written to
#   the stack frame, the message words are added into the state (and copied to Wcopy),
#   the rounds run with a key injection every four rounds, and finally the state is
#   xored with the saved message words and stored back into the context.
#   T[1] is carried in rbx from block to block and is written back to the context
#   (ANDed with FIRST_MASK64) only after the last block, except in _SKEIN_DEBUG builds.
#   Frame fields (ksKey, ksTwk, Wcopy, blkPtr, blkCnt, bitAdd, ctxPtr) are laid out by
#   Setup_Stack, which is defined outside this section.
#
79324ed6f55SEd MasteC_label Skein_512_Process_Block
79424ed6f55SEd Maste    Setup_Stack 512,ROUNDS_512/8
79524ed6f55SEd Maste    movq    TWEAK+ 8(%rdi),%rbx       #rbx = T[1] from the context, kept live across blocks
79624ed6f55SEd Maste    jmp     Skein_512_block_loop      #jump over the alignment padding
79724ed6f55SEd Maste    .p2align 4
79824ed6f55SEd Maste    # main hash loop for Skein_512
79924ed6f55SEd MasteSkein_512_block_loop:
80024ed6f55SEd Maste    # general register usage:
80124ed6f55SEd Maste    #   RAX..RDX       = temps for key schedule pre-loads
80224ed6f55SEd Maste    #   R8 ..R15       = X0..X7
80324ed6f55SEd Maste    #   RSP, RBP       = stack/frame pointers
80424ed6f55SEd Maste    #   RDI            = round counter or context pointer
80524ed6f55SEd Maste    #   RSI            = temp
80624ed6f55SEd Maste    #
80724ed6f55SEd Maste    movq    TWEAK +  0(%rdi),%rax
80824ed6f55SEd Maste    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
80924ed6f55SEd Maste    movq    %rbx,%rcx
81024ed6f55SEd Maste    xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
81124ed6f55SEd Maste    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
81224ed6f55SEd Maste    movq    %rax,ksTwk+ 0+F_O(%rbp)
81324ed6f55SEd Maste    movq    $KW_PARITY,%rdx           #rdx accumulates the key schedule parity word
81424ed6f55SEd Maste    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
81524ed6f55SEd Maste    movq    %rbx,ksTwk+ 8+F_O(%rbp)
81624ed6f55SEd Maste    movq    %rcx,ksTwk+16+F_O(%rbp)
81724ed6f55SEd Maste    .irp _Rn_,8,9,10,11,12,13,14,15
81824ed6f55SEd Maste      movq  X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
81924ed6f55SEd Maste      xorq  %r\_Rn_,%rdx              #compute overall parity
82024ed6f55SEd Maste      movq  %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
82124ed6f55SEd Maste    .endr                             #load state into %r8 ..%r15, compute parity
82224ed6f55SEd Maste      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
82324ed6f55SEd Maste
82424ed6f55SEd Maste    addReg   r13,rax                  #precompute key injection for tweak
82524ed6f55SEd Maste    addReg   r14, rbx
82624ed6f55SEd Maste.if _SKEIN_DEBUG
82724ed6f55SEd Maste    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
82824ed6f55SEd Maste.endif
82924ed6f55SEd Maste    movq     0(%rsi),%rax             #load input block
83024ed6f55SEd Maste    movq     8(%rsi),%rbx
83124ed6f55SEd Maste    movq    16(%rsi),%rcx
83224ed6f55SEd Maste    movq    24(%rsi),%rdx
83324ed6f55SEd Maste    addReg   r8 , rax                 #do initial key injection
83424ed6f55SEd Maste    addReg   r9 , rbx
83524ed6f55SEd Maste    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
83624ed6f55SEd Maste    movq    %rbx,Wcopy+ 8+F_O(%rbp)
83724ed6f55SEd Maste    addReg   r10, rcx
83824ed6f55SEd Maste    addReg   r11, rdx
83924ed6f55SEd Maste    movq    %rcx,Wcopy+16+F_O(%rbp)
84024ed6f55SEd Maste    movq    %rdx,Wcopy+24+F_O(%rbp)
84124ed6f55SEd Maste
84224ed6f55SEd Maste    movq    32(%rsi),%rax             #second half of the input block, same pattern
84324ed6f55SEd Maste    movq    40(%rsi),%rbx
84424ed6f55SEd Maste    movq    48(%rsi),%rcx
84524ed6f55SEd Maste    movq    56(%rsi),%rdx
84624ed6f55SEd Maste    addReg   r12, rax
84724ed6f55SEd Maste    addReg   r13, rbx
84824ed6f55SEd Maste    addReg   r14, rcx
84924ed6f55SEd Maste    addReg   r15, rdx
85024ed6f55SEd Maste    movq    %rax,Wcopy+32+F_O(%rbp)
85124ed6f55SEd Maste    movq    %rbx,Wcopy+40+F_O(%rbp)
85224ed6f55SEd Maste    movq    %rcx,Wcopy+48+F_O(%rbp)
85324ed6f55SEd Maste    movq    %rdx,Wcopy+56+F_O(%rbp)
85424ed6f55SEd Maste
85524ed6f55SEd Maste.if _SKEIN_DEBUG
85624ed6f55SEd Maste    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
85724ed6f55SEd Maste      movq  %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
85824ed6f55SEd Maste    .endr
85924ed6f55SEd Maste
86024ed6f55SEd Maste    Skein_Debug_Block 512             #debug dump
86124ed6f55SEd Maste    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
86224ed6f55SEd Maste.endif
86324ed6f55SEd Maste    addq    $8*WCNT,%rsi              #skip the block
86424ed6f55SEd Maste    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
86524ed6f55SEd Maste    #
86624ed6f55SEd Maste    #################
86724ed6f55SEd Maste    # now the key schedule is computed. Start the rounds
86824ed6f55SEd Maste    #
869*58958a74SAdrian Chadd.if (SKEIN_ASM_UNROLL) & 512
87024ed6f55SEd Maste_UNROLL_CNT =   ROUNDS_512/8
87124ed6f55SEd Maste.else
87224ed6f55SEd Maste_UNROLL_CNT =   SKEIN_UNROLL_512
87324ed6f55SEd Maste  .if ((ROUNDS_512/8) % _UNROLL_CNT)
87424ed6f55SEd Maste    .error "Invalid SKEIN_UNROLL_512"
87524ed6f55SEd Maste  .endif
87624ed6f55SEd Maste    xorq    %rdi,%rdi                 #rdi = round counter
87724ed6f55SEd MasteSkein_512_round_loop:
87824ed6f55SEd Maste.endif
87924ed6f55SEd Maste#
88024ed6f55SEd Maste_Rbase_ = 0
88124ed6f55SEd Maste.rept _UNROLL_CNT*2
88224ed6f55SEd Maste      R_512_FourRounds %(4*_Rbase_+00)
88324ed6f55SEd Maste_Rbase_ = _Rbase_+1
88424ed6f55SEd Maste.endr #rept _UNROLL_CNT*2
88524ed6f55SEd Maste#
886*58958a74SAdrian Chadd.if ((SKEIN_ASM_UNROLL) & 512) == 0
88724ed6f55SEd Maste    cmpq    $2*(ROUNDS_512/8),%rdi           #rdi counts key injections: one per four rounds
88824ed6f55SEd Maste    jb      Skein_512_round_loop
88924ed6f55SEd Maste    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
89024ed6f55SEd Maste.endif
89124ed6f55SEd Maste    # end of rounds
89224ed6f55SEd Maste    #################
89324ed6f55SEd Maste    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
    # rbx is reloaded here as (T[1] from the context) AND FIRST_MASK64; it serves as
    # T[1] for the next block and is written back to the context after the last one.
89424ed6f55SEd Maste    .irp _Rn_,8,9,10,11,12,13,14,15
89524ed6f55SEd Maste  .if (\_Rn_ == 8)
89624ed6f55SEd Maste    movq    $FIRST_MASK64,%rbx
89724ed6f55SEd Maste  .endif
89824ed6f55SEd Maste      xorq  Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
89924ed6f55SEd Maste      movq  %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)     #and store result
90024ed6f55SEd Maste  .if (\_Rn_ == 14)
90124ed6f55SEd Maste    andq    TWEAK+ 8(%rdi),%rbx
90224ed6f55SEd Maste  .endif
90324ed6f55SEd Maste    .endr
90424ed6f55SEd Maste    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
90524ed6f55SEd Maste
90624ed6f55SEd Maste    # go back for more blocks, if needed
90724ed6f55SEd Maste    decq    blkCnt+F_O(%rbp)
90824ed6f55SEd Maste    jnz     Skein_512_block_loop
90924ed6f55SEd Maste    movq    %rbx,TWEAK + 8(%rdi)      #store the masked T[1] back into the context
91024ed6f55SEd Maste
91124ed6f55SEd Maste    Reset_Stack
91224ed6f55SEd Maste    ret
91324ed6f55SEd MasteSkein_512_Process_Block_End:
91424ed6f55SEd Maste#
91524ed6f55SEd Maste  .if _SKEIN_DEBUG
#
# Skein_Debug_Round_512: debug-only helper (assembled only when _SKEIN_DEBUG is nonzero).
#   Copies r8..r15 into the X_stk slots, loads rsi with the saved context pointer and
#   rdi with 512, then jumps to Skein_Debug_Round_Common. rdx is passed through untouched.
#
91624ed6f55SEd Maste# call here with rdx  = "round number"
91724ed6f55SEd MasteSkein_Debug_Round_512:
91824ed6f55SEd Maste    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
91924ed6f55SEd Maste    pushq   %rdi
92024ed6f55SEd Maste  .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
92124ed6f55SEd Maste    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
92224ed6f55SEd Maste  .endr
92324ed6f55SEd Maste    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
92424ed6f55SEd Maste    movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
92524ed6f55SEd Maste    jmp     Skein_Debug_Round_Common #shared tail; the saved rsi/rdi are still on the stack at this point
92624ed6f55SEd Maste  .endif
92724ed6f55SEd Maste#
92824ed6f55SEd Maste.if _SKEIN_CODE_SIZE
#
# Optional introspection entry points (assembled only when _SKEIN_CODE_SIZE is nonzero).
# Neither routine reads any argument register; each returns its result in rax.
#
# Skein_512_Process_Block_CodeSize:
#   returns the distance in bytes between the Skein_512_Process_Block label and the
#   Skein_512_Process_Block_End label (an assemble-time constant).
92924ed6f55SEd MasteC_label Skein_512_Process_Block_CodeSize
93024ed6f55SEd Maste    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
93124ed6f55SEd Maste    ret
93224ed6f55SEd Maste#
# Skein_512_Unroll_Cnt:
#   returns _UNROLL_CNT as it was assembled, or 0 when _UNROLL_CNT equals
#   ROUNDS_512/8 (0 is the value this file uses to mean fully unrolled).
93324ed6f55SEd MasteC_label Skein_512_Unroll_Cnt
93424ed6f55SEd Maste  .if _UNROLL_CNT <> (ROUNDS_512/8)
93524ed6f55SEd Maste    movq    $_UNROLL_CNT,%rax
93624ed6f55SEd Maste  .else
93724ed6f55SEd Maste    xorq    %rax,%rax                #report 0 for the fully unrolled build
93824ed6f55SEd Maste  .endif
93924ed6f55SEd Maste    ret
94024ed6f55SEd Maste.endif
94124ed6f55SEd Maste#
94224ed6f55SEd Maste.endif # _USE_ASM_ & 512
94324ed6f55SEd Maste#
94424ed6f55SEd Maste#=================================== Skein1024 =============================================
94524ed6f55SEd Maste.if _USE_ASM_ & 1024
94624ed6f55SEd Maste#
94724ed6f55SEd Maste# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
94824ed6f55SEd Maste#
94924ed6f55SEd Maste#################
95024ed6f55SEd Maste# use details of permutation to make register assignments
#   o1K_<reg> is the index into X[] of the state word that <reg> carries in the
#   1024-bit code. There are sixteen state words but only fifteen registers listed:
#   no symbol is defined for index 6, and X6 takes turns with X4 in rcx, the idle
#   one of the two being parked in its X_stk slot.
95124ed6f55SEd Maste#
95224ed6f55SEd Masteo1K_rdi =  0        #offsets in X[] associated with each register
95324ed6f55SEd Masteo1K_rsi =  1
95424ed6f55SEd Masteo1K_rbp =  2
95524ed6f55SEd Masteo1K_rax =  3
95624ed6f55SEd Masteo1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
95724ed6f55SEd Masteo1K_rbx =  5
95824ed6f55SEd Masteo1K_rdx =  7
95924ed6f55SEd Masteo1K_r8  =  8
96024ed6f55SEd Masteo1K_r9  =  9
96124ed6f55SEd Masteo1K_r10 = 10
96224ed6f55SEd Masteo1K_r11 = 11
96324ed6f55SEd Masteo1K_r12 = 12
96424ed6f55SEd Masteo1K_r13 = 13
96524ed6f55SEd Masteo1K_r14 = 14
96624ed6f55SEd Masteo1K_r15 = 15
96724ed6f55SEd Maste#
96824ed6f55SEd MasterIdx_offs = tmpStk_1024                 #stack slot holding the key injection index in the looping build
96924ed6f55SEd Maste#
96924ed6f55SEd Maste#
#################
# MACRO: one mix (add, rotate, xor) of a register pair for 1024-bit blocks
#   w0,w1     : X[] indices of the two words being mixed
#   reg0,reg1 : registers currently holding those words
#   _RN0_     : round number; (_RN0_ mod 8) is handed to RotL64
#   _Rn1_     : handed to RotL64 as its last argument (callers pass 0..7)
#   op1       : optional instruction emitted at the very end
#   When (_RN0_ & 3) == 3 the key injection for both words is emitted inline:
#     unrolled build - schedule words addressed with assemble-time indices from _II_
#     looping build  - schedule words addressed with the run-time index in rdi; for
#                      w0 == 0 the macro instead stores rdi (X0) to X_stk and loads the
#                      index into rdi, leaving the X0 injection to r1024_FourRounds
#   Words 13 and 14 additionally receive a tweak word and word 15 the injection counter.
97024ed6f55SEd Maste.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
97124ed6f55SEd Maste    addReg      \reg0 , \reg1                      #perform the MIX
97224ed6f55SEd Maste    RotL64      \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
97324ed6f55SEd Maste    xorReg      \reg1 , \reg0
97436972ee3SEd Maste.if ((\_RN0_) & 3) == 3        #time to do key injection?
97524ed6f55SEd Maste .if _SKEIN_DEBUG
97624ed6f55SEd Maste    movq       %\reg0 , xDebug_1024+8*\w0(%rsp)    #save intermediate values for Debug_Round
97724ed6f55SEd Maste    movq       %\reg1 , xDebug_1024+8*\w1(%rsp)    # (before inline key injection)
97824ed6f55SEd Maste .endif
97924ed6f55SEd Maste_II_ = ((\_RN0_)/4)+1           #injection count
980cffe0e0fSAdrian Chadd .if (SKEIN_ASM_UNROLL) & 1024   #here to do fully unrolled key injection
98124ed6f55SEd Maste    addq        ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
98224ed6f55SEd Maste    addq        ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
98324ed6f55SEd Maste  .if     \w1 == 13                                #tweak injection
98424ed6f55SEd Maste    addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
98524ed6f55SEd Maste  .elseif \w0 == 14
98624ed6f55SEd Maste    addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
98724ed6f55SEd Maste  .elseif \w1 == 15
98824ed6f55SEd Maste    addq        $_II_, %\reg1                      #(injection counter)
98924ed6f55SEd Maste  .endif
99024ed6f55SEd Maste .else                          #here to do looping  key injection
99124ed6f55SEd Maste  .if  (\w0 == 0)
99224ed6f55SEd Maste    movq        %rdi, X_stk+8*\w0(%rsp)            #if so, store N0 so we can use reg as index
99324ed6f55SEd Maste    movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
99424ed6f55SEd Maste  .else
99524ed6f55SEd Maste    addq         ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection
99624ed6f55SEd Maste  .endif
99724ed6f55SEd Maste  .if     \w1 == 13                                #tweak injection
99824ed6f55SEd Maste    addq         ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
99924ed6f55SEd Maste  .elseif \w0 == 14
100024ed6f55SEd Maste    addq         ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
100124ed6f55SEd Maste  .elseif \w1 == 15
100224ed6f55SEd Maste    addReg      \reg1,rdi,,,1                      #(injection counter)
100324ed6f55SEd Maste  .endif
100424ed6f55SEd Maste    addq         ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection
100524ed6f55SEd Maste .endif
100624ed6f55SEd Maste.endif
100724ed6f55SEd Maste    # insert the op provided, if any
100824ed6f55SEd Maste    \op1
100924ed6f55SEd Maste.endm
101024ed6f55SEd Maste#################
101124ed6f55SEd Maste# MACRO: four rounds for 1024-bit blocks
#   _RR_ : base round number of the group (the instantiation passes 4*_Rbase_).
#   Each round is eight r1024_Mix invocations. X4 and X6 take turns in rcx: the
#   op1 slot of the third mix stores the word just mixed to its X_stk slot and the
#   op1 slot of the fourth mix loads the other one, before the mix that needs it.
#   The fourth round has (round & 3) == 3, so r1024_Mix emits the key injection
#   inline there. In the looping build this macro then extends the key and tweak
#   schedules on the stack, bumps the injection index and completes the X0 injection.
101224ed6f55SEd Maste#
101324ed6f55SEd Maste.macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
101424ed6f55SEd Maste    # should be here with X4 set properly, X6 stored on stack
101524ed6f55SEd Maste_Rn_ = (\_RR_) + 0
101624ed6f55SEd Maste        r1024_Mix  0, 1,rdi,rsi,_Rn_,0
101724ed6f55SEd Maste        r1024_Mix  2, 3,rbp,rax,_Rn_,1
101824ed6f55SEd Maste        r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
101924ed6f55SEd Maste        r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
102024ed6f55SEd Maste        r1024_Mix 10,11,r10,r11,_Rn_,5
102124ed6f55SEd Maste        r1024_Mix 12,13,r12,r13,_Rn_,6
102224ed6f55SEd Maste        r1024_Mix  6, 7,rcx,rdx,_Rn_,3
102324ed6f55SEd Maste        r1024_Mix 14,15,r14,r15,_Rn_,7
102424ed6f55SEd Maste    .if _SKEIN_DEBUG
102524ed6f55SEd Maste      Skein_Debug_Round 1024,%(_Rn_+1)
102624ed6f55SEd Maste    .endif
102724ed6f55SEd Maste_Rn_ = (\_RR_) + 1
102824ed6f55SEd Maste        r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
102924ed6f55SEd Maste        r1024_Mix  2,13,rbp,r13,_Rn_,1
103024ed6f55SEd Maste        r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
103124ed6f55SEd Maste        r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
103224ed6f55SEd Maste        r1024_Mix 12, 3,r12,rax,_Rn_,5
103324ed6f55SEd Maste        r1024_Mix 14, 5,r14,rbx,_Rn_,6
103424ed6f55SEd Maste        r1024_Mix  4,15,rcx,r15,_Rn_,3
103524ed6f55SEd Maste        r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
103624ed6f55SEd Maste    .if _SKEIN_DEBUG
103724ed6f55SEd Maste      Skein_Debug_Round 1024,%(_Rn_+1)
103824ed6f55SEd Maste    .endif
103924ed6f55SEd Maste_Rn_ = (\_RR_) + 2
104024ed6f55SEd Maste        r1024_Mix  0, 7,rdi,rdx,_Rn_,0
104124ed6f55SEd Maste        r1024_Mix  2, 5,rbp,rbx,_Rn_,1
104224ed6f55SEd Maste        r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
104324ed6f55SEd Maste        r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
104424ed6f55SEd Maste        r1024_Mix 14,13,r14,r13,_Rn_,5
104524ed6f55SEd Maste        r1024_Mix  8,11,r8 ,r11,_Rn_,6
104624ed6f55SEd Maste        r1024_Mix  6, 1,rcx,rsi,_Rn_,3
104724ed6f55SEd Maste        r1024_Mix 10, 9,r10,r9 ,_Rn_,7
104824ed6f55SEd Maste    .if _SKEIN_DEBUG
104924ed6f55SEd Maste      Skein_Debug_Round 1024,%(_Rn_+1)
105024ed6f55SEd Maste    .endif
105124ed6f55SEd Maste_Rn_ = (\_RR_) + 3
105224ed6f55SEd Maste        r1024_Mix  0,15,rdi,r15,_Rn_,0
105324ed6f55SEd Maste        r1024_Mix  2,11,rbp,r11,_Rn_,1
105424ed6f55SEd Maste        r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
105524ed6f55SEd Maste        r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
105624ed6f55SEd Maste        r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
105724ed6f55SEd Maste        r1024_Mix 10, 3,r10,rax,_Rn_,6
105824ed6f55SEd Maste        r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
105924ed6f55SEd Maste        r1024_Mix 12, 7,r12,rdx,_Rn_,7
106024ed6f55SEd Maste    .if _SKEIN_DEBUG
106124ed6f55SEd Maste      Skein_Debug_Round 1024,%(_Rn_+1)
106224ed6f55SEd Maste    .endif
106324ed6f55SEd Maste
1064*58958a74SAdrian Chadd  .if ((SKEIN_ASM_UNROLL) & 1024) == 0           #here with rdi == rIdx, X0 on stack
106524ed6f55SEd Maste    #"rotate" the key schedule on the stack
106624ed6f55SEd Mastei8 = o1K_r8
106724ed6f55SEd Mastei0 = o1K_rdi
106824ed6f55SEd Maste    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
106924ed6f55SEd Maste    movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
107024ed6f55SEd Maste    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
107124ed6f55SEd Maste    movq          ksTwk+8* 0(%rsp,%rdi,8),%r8   #get tweak word
107224ed6f55SEd Maste    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
107324ed6f55SEd Maste    movq          X_stk+8*i8(%rsp)       ,%r8   #get the reg back
107424ed6f55SEd Maste    incq    %rdi                                #bump the index
107524ed6f55SEd Maste    movq    %rdi, rIdx_offs (%rsp)              #save rdi again
107624ed6f55SEd Maste    movq          ksKey+8*i0(%rsp,%rdi,8),%rdi  #get the key schedule word for X0 back
107724ed6f55SEd Maste    addq          X_stk+8*i0(%rsp)       ,%rdi  #perform the X0 key injection
107824ed6f55SEd Maste  .endif
107924ed6f55SEd Maste    #show the result of the key injection
108024ed6f55SEd Maste    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
108124ed6f55SEd Maste.endm #r1024_FourRounds
108224ed6f55SEd Maste#
108324ed6f55SEd Maste################
108424ed6f55SEd Maste# code
108524ed6f55SEd Maste#
#
# Skein1024_Process_Block: process blkCnt consecutive 1024-bit input blocks.
#   Per block: T[0] is advanced by bitAdd, the key and tweak schedules are written to
#   the stack frame, the message words are added into the state (and copied to Wcopy),
#   the rounds run with a key injection every four rounds, and finally the state is
#   xored with the saved message words and stored back into the context.
#   All sixteen state words plus rsp occupy registers during the rounds, so rbp stops
#   being a frame pointer there and frame fields are addressed from rsp instead.
#   T[1] is carried in r9 between blocks and written back to the context (ANDed with
#   FIRST_MASK64) only after the last block, except in _SKEIN_DEBUG builds.
#
108624ed6f55SEd MasteC_label Skein1024_Process_Block
108724ed6f55SEd Maste#
108824ed6f55SEd Maste    Setup_Stack 1024,ROUNDS_1024/8,WCNT
108924ed6f55SEd Maste    movq    TWEAK+ 8(%rdi),%r9             #r9 = T[1] from the context
109024ed6f55SEd Maste    jmp     Skein1024_block_loop           #jump over the alignment padding
109124ed6f55SEd Maste    # main hash loop for Skein1024
109224ed6f55SEd Maste    .p2align 4
109324ed6f55SEd MasteSkein1024_block_loop:
109424ed6f55SEd Maste    # general register usage during the rounds (see the o1K_ symbols):
109524ed6f55SEd Maste    #   RSP              = stack pointer
109624ed6f55SEd Maste    #   RDI,RSI,RBP,RAX  = X0..X3;  RCX = X4 or X6 (alternating);  RBX = X5;  RDX = X7
109724ed6f55SEd Maste    #   R8 ..R15         = X8..X15    (state words)
109824ed6f55SEd Maste    #   RBP              = frame pointer during setup, until X2 is loaded into it
109924ed6f55SEd Maste    #
1100*58958a74SAdrian Chadd  .if ((SKEIN_ASM_UNROLL) & 1024) == 0
110124ed6f55SEd Maste    xorq    %rax,%rax                      #init loop index on the stack
110224ed6f55SEd Maste    movq    %rax,rIdx_offs(%rsp)
110324ed6f55SEd Maste  .endif
110424ed6f55SEd Maste    movq         TWEAK+     0(%rdi),%r8
110524ed6f55SEd Maste    addq         bitAdd+  F_O(%rbp),%r8    #computed updated tweak value T0
110624ed6f55SEd Maste    movq    %r9 ,%r10
110724ed6f55SEd Maste    xorq    %r8 ,%r10                      #%r8 /%r9 /%r10 = tweak schedule
110824ed6f55SEd Maste    movq    %r8 ,TWEAK+     0(%rdi)        #save updated tweak value ctx->h.T[0]
110924ed6f55SEd Maste    movq    %r8 ,ksTwk+ 0+F_O(%rbp)
111024ed6f55SEd Maste    movq    %r9 ,ksTwk+ 8+F_O(%rbp)        #keep values in %r8 ,%r9  for initial tweak injection below
111124ed6f55SEd Maste    movq    %r10,ksTwk+16+F_O(%rbp)
111224ed6f55SEd Maste  .if _SKEIN_DEBUG
111324ed6f55SEd Maste    movq    %r9 ,TWEAK+     8(%rdi)        #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
111424ed6f55SEd Maste  .endif
111524ed6f55SEd Maste    movq         blkPtr +F_O(%rbp),%rsi    # rsi --> input block
111624ed6f55SEd Maste    movq        $KW_PARITY        ,%rax    #overall key schedule parity
111724ed6f55SEd Maste
111824ed6f55SEd Maste    # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
    # Those four registers (and rcx) are still needed as pointers and temps here, so
    # words 0..4 and 6 are staged in X_stk and only loaded into their registers later.
111924ed6f55SEd Maste    .irp _rN_,0,1,2,3,4,6                  #process the "initial" words, using r14/r15 as temps
112024ed6f55SEd Maste      movq       X_VARS+8*\_rN_(%rdi),%r14 #get state word
112124ed6f55SEd Maste      movq              8*\_rN_(%rsi),%r15 #get msg   word
112224ed6f55SEd Maste      xorq  %r14,%rax                      #update key schedule overall parity
112324ed6f55SEd Maste      movq  %r14,ksKey +8*\_rN_+F_O(%rbp)  #save key schedule word on stack
112424ed6f55SEd Maste      movq  %r15,Wcopy +8*\_rN_+F_O(%rbp)  #save local msg Wcopy
112524ed6f55SEd Maste      addq  %r15,%r14                      #do the initial key injection
112624ed6f55SEd Maste      movq  %r14,X_stk +8*\_rN_    (%rsp)  #save initial state var on stack
112724ed6f55SEd Maste    .endr
112824ed6f55SEd Maste    # now process the rest, using the "real" registers
112924ed6f55SEd Maste    #     (MUST do it in reverse order to inject tweaks r8/r9 first)
113024ed6f55SEd Maste    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
113124ed6f55SEd Maste_oo_ = o1K_\_rr_                           #offset associated with the register
113224ed6f55SEd Maste      movq  X_VARS+8*_oo_(%rdi),%\_rr_     #get key schedule word from context
113324ed6f55SEd Maste      movq         8*_oo_(%rsi),%rcx       #get next input msg word
113424ed6f55SEd Maste      movq  %\_rr_, ksKey +8*_oo_(%rsp)    #save key schedule on stack
113524ed6f55SEd Maste      xorq  %\_rr_, %rax                   #accumulate key schedule parity
113624ed6f55SEd Maste      movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)    #save copy of msg word for feedforward
113724ed6f55SEd Maste      addq  %rcx,%\_rr_                    #do the initial  key  injection
113824ed6f55SEd Maste      .if    _oo_ == 13                    #do the initial tweak injection
113924ed6f55SEd Maste        addReg \_rr_,r8                    #          (only in words 13/14)
114024ed6f55SEd Maste      .elseif _oo_ == 14
114124ed6f55SEd Maste        addReg \_rr_,r9
114224ed6f55SEd Maste      .endif
114324ed6f55SEd Maste    .endr
114424ed6f55SEd Maste    movq    %rax,ksKey+8*WCNT+F_O(%rbp)    #save key schedule parity
114524ed6f55SEd Maste.if _SKEIN_DEBUG
114624ed6f55SEd Maste    Skein_Debug_Block 1024                 #initial debug dump
114724ed6f55SEd Maste.endif
114824ed6f55SEd Maste    addq     $8*WCNT,%rsi                  #bump the msg ptr
114924ed6f55SEd Maste    movq     %rsi,blkPtr+F_O(%rbp)         #save bumped msg ptr
115024ed6f55SEd Maste    # re-load words 0..4 from stack, enter the main loop
    # (from here until the end of the block rdi and rbp hold state words, not pointers)
115124ed6f55SEd Maste    .irp _rr_,rdi,rsi,rbp,rax,rcx          #(no need to re-load x6, already on stack)
115224ed6f55SEd Maste      movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
115324ed6f55SEd Maste    .endr
115424ed6f55SEd Maste.if _SKEIN_DEBUG
115524ed6f55SEd Maste    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        #show state after initial key injection
115624ed6f55SEd Maste.endif
115724ed6f55SEd Maste    #
115824ed6f55SEd Maste    #################
115924ed6f55SEd Maste    # now the key schedule is computed. Start the rounds
116024ed6f55SEd Maste    #
1161*58958a74SAdrian Chadd.if (SKEIN_ASM_UNROLL) & 1024
116224ed6f55SEd Maste_UNROLL_CNT =   ROUNDS_1024/8
116324ed6f55SEd Maste.else
116424ed6f55SEd Maste_UNROLL_CNT =   SKEIN_UNROLL_1024
116524ed6f55SEd Maste  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
116624ed6f55SEd Maste    .error "Invalid SKEIN_UNROLL_1024"
116724ed6f55SEd Maste  .endif
116824ed6f55SEd MasteSkein1024_round_loop:
116924ed6f55SEd Maste.endif
117024ed6f55SEd Maste#
117124ed6f55SEd Maste_Rbase_ = 0
117224ed6f55SEd Maste.rept _UNROLL_CNT*2                        #implement the rounds, 4 at a time
117324ed6f55SEd Maste      r1024_FourRounds %(4*_Rbase_+00)
117424ed6f55SEd Maste_Rbase_ = _Rbase_+1
117524ed6f55SEd Maste.endr #rept _UNROLL_CNT*2
117624ed6f55SEd Maste#
1177*58958a74SAdrian Chadd.if ((SKEIN_ASM_UNROLL) & 1024) == 0
117824ed6f55SEd Maste    cmpq    $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done (tmpStk_1024 is the rIdx_offs slot)
117924ed6f55SEd Maste    jb      Skein1024_round_loop
118024ed6f55SEd Maste.endif
118124ed6f55SEd Maste    # end of rounds
118224ed6f55SEd Maste    #################
118324ed6f55SEd Maste    #
118424ed6f55SEd Maste    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
118524ed6f55SEd Maste    movq    %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
118624ed6f55SEd Maste    movq       ctxPtr(%rsp),%rdx       #rdx --> context for the stores below
118724ed6f55SEd Maste
118824ed6f55SEd Maste    .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15   #do all but x6,x7
118924ed6f55SEd Maste_oo_ = o1K_\_rr_
119024ed6f55SEd Maste      xorq  Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
119124ed6f55SEd Maste      movq  %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
119224ed6f55SEd Maste      .if (_oo_ ==  9)
119324ed6f55SEd Maste        movq   $FIRST_MASK64 ,%r9      #r9 is free once X9 is stored; reuse it for the next T[1]
119424ed6f55SEd Maste      .endif
119524ed6f55SEd Maste      .if (_oo_ == 14)
119624ed6f55SEd Maste        andq   TWEAK+ 8(%rdx),%r9      #r9 = T[1] from the context AND FIRST_MASK64
119724ed6f55SEd Maste      .endif
119824ed6f55SEd Maste    .endr
119924ed6f55SEd Maste    #
120024ed6f55SEd Maste    movq         X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
120124ed6f55SEd Maste    movq         X_stk +8*7(%rsp),%rbx
120224ed6f55SEd Maste    xorq         Wcopy +8*6(%rsp),%rax
120324ed6f55SEd Maste    xorq         Wcopy +8*7(%rsp),%rbx
120424ed6f55SEd Maste    movq    %rax,X_VARS+8*6(%rdx)
120524ed6f55SEd Maste    decq             blkCnt(%rsp)      #set zero flag iff done
120624ed6f55SEd Maste    movq    %rbx,X_VARS+8*7(%rdx)
120724ed6f55SEd Maste
120824ed6f55SEd Maste    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
120924ed6f55SEd Maste    # go back for more blocks, if needed
121024ed6f55SEd Maste    movq             ctxPtr(%rsp),%rdi #don't muck with the flags here!
121124ed6f55SEd Maste    lea          FRAME_OFFS(%rsp),%rbp #re-establish rbp as frame pointer (mov and lea leave the flags alone)
121224ed6f55SEd Maste    jnz     Skein1024_block_loop
121324ed6f55SEd Maste    movq    %r9 ,TWEAK+   8(%rdx)      #store the masked T[1] back into the context
121424ed6f55SEd Maste    Reset_Stack
121524ed6f55SEd Maste    ret
121624ed6f55SEd Maste#
121724ed6f55SEd MasteSkein1024_Process_Block_End:
121824ed6f55SEd Maste#
121924ed6f55SEd Maste.if _SKEIN_DEBUG
#
# Skein_Debug_Round_1024 -- debug-only helper for the 1024-bit block code.
#
# Purpose: copy the X[] words that are currently held in registers into
#          their X_stk[] slots on the stack, then tail-jump to
#          Skein_Debug_Round_Common with the 1024-bit specific arguments.
#
# In:      rdx     = round number (values >= SKEIN_RND_SPECIAL are treated
#                    as special rounds by the unsigned compares below)
#          0(rsp)  = return address
#          8(rsp)  = original rdx value, pushed by the calling macro
# Out:     does not return directly; the final ret is executed by
#          Skein_Debug_Round_Common, which also pops the rsi/rdi pushed here.
#
# _SP_OFFS_ is an assembly-time symbol holding the number of bytes pushed
# on top of the normal frame, so frame-relative operands stay correct.
#
122024ed6f55SEd MasteSkein_Debug_Round_1024:
122124ed6f55SEd Maste    # call here with rdx  = "round number",
122224ed6f55SEd Maste_SP_OFFS_ = 8*2                     #stack "offset" here: rdx, return addr
122324ed6f55SEd Maste    #
122424ed6f55SEd Maste  #save rest of X[] state on stack so debug routines can access it
    # Each listed register is stored into slot o1K_<reg> of X_stk[].
    # rdi, rcx and rdx are not in the list; they are handled separately below.
122524ed6f55SEd Maste  .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
122624ed6f55SEd Maste    movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
122724ed6f55SEd Maste  .endr
122824ed6f55SEd Maste    # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it's already on stack
    # Net effect: rdi is stored when rdx >= SKEIN_RND_SPECIAL (unsigned)
    # or when the low two bits of rdx are not both zero.
122924ed6f55SEd Maste    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always save
123024ed6f55SEd Maste    jae     save_x0
123124ed6f55SEd Maste    testq   $3,%rdx                 #otherwise only if rdx != 0 mod 4
123224ed6f55SEd Maste    jz      save_x0_not
123324ed6f55SEd Mastesave_x0:
123424ed6f55SEd Maste    movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
123524ed6f55SEd Mastesave_x0_not:
123624ed6f55SEd Maste    #figure out the x4/x6 swapping state and save the correct one!
    # Net effect: rcx goes to X_stk slot 4 for special rounds and for even
    # round numbers; for odd normal round numbers it goes to slot 6.
123724ed6f55SEd Maste    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
123824ed6f55SEd Maste    jae     save_x4
123924ed6f55SEd Maste    testq   $1,%rdx                  #and even ones have r4 as well
124024ed6f55SEd Maste    jz      save_x4
124124ed6f55SEd Maste    movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp)
124224ed6f55SEd Maste    jmp     debug_1024_go
124324ed6f55SEd Mastesave_x4:
124424ed6f55SEd Maste    movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
124524ed6f55SEd Mastedebug_1024_go:
124624ed6f55SEd Maste    #now all is saved in Xstk[] except for rdx
    # rsi and rdi are about to be overwritten with call arguments, so they
    # are pushed here; Skein_Debug_Round_Common pops them before its ret.
124724ed6f55SEd Maste    push    %rsi                    #save two regs for BLK_BITS-specific parms
124824ed6f55SEd Maste    push    %rdi
124924ed6f55SEd Maste_SP_OFFS_ = _SP_OFFS_ + 16          #adjust stack offset accordingly (now 32)
125024ed6f55SEd Maste
    # Stack from rsp upward is now: rdi, rsi, return address, original rdx;
    # hence the original rdx lives at _SP_OFFS_-8 = 24 bytes above rsp.
125124ed6f55SEd Maste    movq    _SP_OFFS_-8(%rsp),%rsi  #get back original %rdx (pushed on stack in macro call)
125224ed6f55SEd Maste    movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
125324ed6f55SEd Maste
    # Load the two block-size specific arguments, then tail-jump (no call)
    # so that the ret in the common routine returns to our caller.
125424ed6f55SEd Maste    movq    ctxPtr+_SP_OFFS_(%rsp),%rsi  #rsi = ctx_hdr_ptr
125524ed6f55SEd Maste    movq    $1024,%rdi                   #rdi = block size
125624ed6f55SEd Maste    jmp     Skein_Debug_Round_Common
125724ed6f55SEd Maste.endif
125824ed6f55SEd Maste#
125924ed6f55SEd Maste.if _SKEIN_CODE_SIZE
#
# Two small query routines, assembled only when _SKEIN_CODE_SIZE is nonzero.
# Both take no register inputs and return their result in rax.
# NOTE(review): C_label is a macro defined outside this view; presumably it
# emits a C-callable global label -- confirm against its definition.
#
# Skein1024_Process_Block_CodeSize:
#   rax = Skein1024_Process_Block_End - Skein1024_Process_Block, that is the
#   distance in bytes between those two labels.
#
126024ed6f55SEd MasteC_label Skein1024_Process_Block_CodeSize
126124ed6f55SEd Maste    movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
126224ed6f55SEd Maste    ret
126324ed6f55SEd Maste#
#
# Skein1024_Unroll_Cnt:
#   rax = _UNROLL_CNT when it differs from ROUNDS_1024/8, otherwise rax = 0.
#   The choice is made at assembly time, so only one of the two instructions
#   below is emitted. Zero matches the "0 --> fully unrolled" convention
#   used for the SKEIN_UNROLL_* settings.
#
126424ed6f55SEd MasteC_label Skein1024_Unroll_Cnt
126524ed6f55SEd Maste  .if _UNROLL_CNT <> (ROUNDS_1024/8)
126624ed6f55SEd Maste    movq    $_UNROLL_CNT,%rax
126724ed6f55SEd Maste  .else
126824ed6f55SEd Maste    xorq    %rax,%rax
126924ed6f55SEd Maste  .endif
127024ed6f55SEd Maste    ret
127124ed6f55SEd Maste.endif
127224ed6f55SEd Maste#
127324ed6f55SEd Maste.endif # _USE_ASM_ and 1024
127424ed6f55SEd Maste#
127524ed6f55SEd Maste.if _SKEIN_DEBUG
127624ed6f55SEd Maste#----------------------------------------------------------------
127724ed6f55SEd Maste#local debug routine to set up for calls to:
127824ed6f55SEd Maste#  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
127924ed6f55SEd Maste#                       [       rdi                        rsi   rdx              rcx]
128024ed6f55SEd Maste#
128124ed6f55SEd Maste# here with %rdx = round number
128224ed6f55SEd Maste#           %rsi = ctx_hdr_ptr
128324ed6f55SEd Maste#           %rdi = block size (256/512/1024)
128424ed6f55SEd Maste# on stack: saved rdi, saved rsi, retAddr, saved rdx
128524ed6f55SEd Maste#
# Summary of what the code below does:
#   1. pushes rax,rbx,rcx,rbp,r8-r15 so they survive the external call
#   2. computes rcx, the X[] pointer argument (see the selection rules below)
#   3. calls Skein_Show_Round with rdi/rsi/rdx unchanged from entry
#   4. pops the twelve registers in reverse order, then pops rdi and rsi
#      (pushed by the block-size specific caller) and returns
# rdx is neither saved nor restored here; the saved rdx word stays on the
# stack at return. NOTE(review): presumably the calling macro pops it -- confirm.
#
128624ed6f55SEd MasteSkein_Debug_Round_Common:
128724ed6f55SEd Maste_SP_OFFS_ = 32                        #account for four words on stack already
128824ed6f55SEd Maste  .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15  #save the rest of the regs
128924ed6f55SEd Maste    pushq %\_rr_
129024ed6f55SEd Maste_SP_OFFS_ = _SP_OFFS_+8
129124ed6f55SEd Maste  .endr
    # Twelve pushes bring _SP_OFFS_ to 32 + 12*8 = 128.
    # NOTE(review): the check below is an assembly-time test of the tracked
    # byte count only; the run-time alignment of rsp also depends on the
    # frame set up by the block function -- confirm there.
129224ed6f55SEd Maste  .if (_SP_OFFS_ % 16)                # make sure stack is still 16-byte aligned here
129324ed6f55SEd Maste    .error  "Debug_Round_Common: stack alignment"
129424ed6f55SEd Maste  .endif
129524ed6f55SEd Maste    # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
    # Default: rcx points at X_stk[] in the stack frame.
    # When rdx equals SKEIN_RND_FEED_FWD, rcx points at X_VARS inside the
    # context addressed by rsi instead.
129624ed6f55SEd Maste    leaq    X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
129724ed6f55SEd Maste    cmpq    $SKEIN_RND_FEED_FWD,%rdx   #special handling for feedforward "round"?
129824ed6f55SEd Maste    jnz     _got_rcxA
129924ed6f55SEd Maste    leaq    X_VARS(%rsi),%rcx
130024ed6f55SEd Maste_got_rcxA:
130124ed6f55SEd Maste  .if _USE_ASM_ & 1024
130224ed6f55SEd Maste    # special handling for 1024-bit case
130324ed6f55SEd Maste    #    (for rounds right before with key injection:
130424ed6f55SEd Maste    #        use xDebug_1024[] instead of X_stk[])
    # rcx is redirected to xDebug_1024[] only when ALL of these hold:
    #   rdx <  SKEIN_RND_SPECIAL  (unsigned compare; jae skips otherwise)
    #   rdx != 0
    #   rdx is a multiple of 4
    #   rdi == 1024
    # Any failed test branches to _got_rcxB and keeps the rcx chosen above.
130524ed6f55SEd Maste    cmpq    $SKEIN_RND_SPECIAL,%rdx
130624ed6f55SEd Maste    jae     _got_rcxB               #must be a normal round
130724ed6f55SEd Maste    orq     %rdx,%rdx
130824ed6f55SEd Maste    jz      _got_rcxB               #just before key injection
130924ed6f55SEd Maste    test    $3,%rdx
131024ed6f55SEd Maste    jne     _got_rcxB
131124ed6f55SEd Maste    cmp     $1024,%rdi              #only 1024-bit(s) for now
131224ed6f55SEd Maste    jne     _got_rcxB
131324ed6f55SEd Maste    leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx
131424ed6f55SEd Maste_got_rcxB:
131524ed6f55SEd Maste  .endif
131624ed6f55SEd Maste    call    Skein_Show_Round        #call external debug handler
131724ed6f55SEd Maste
    # Pop in exactly the reverse of the push order above; the .error below
    # fires at assembly time when the push and pop lists get out of step.
131824ed6f55SEd Maste  .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax  #restore regs
131924ed6f55SEd Maste    popq  %\_rr_
132024ed6f55SEd Maste_SP_OFFS_ = _SP_OFFS_-8
132124ed6f55SEd Maste  .endr
132224ed6f55SEd Maste  .if _SP_OFFS_ - 32
132324ed6f55SEd Maste    .error   "Debug_Round_Common: push/pop misalignment!"
132424ed6f55SEd Maste  .endif
    # rdi and rsi were pushed by the block-size specific entry routine
    # (rsi first, then rdi), so they come off in the opposite order here.
132524ed6f55SEd Maste    popq    %rdi
132624ed6f55SEd Maste    popq    %rsi
132724ed6f55SEd Maste    ret
132824ed6f55SEd Maste.endif
132924ed6f55SEd Maste#----------------------------------------------------------------
133024ed6f55SEd Maste    .section .note.GNU-stack,"",@progbits
133124ed6f55SEd Maste
133224ed6f55SEd Maste    .end
1333