#
#----------------------------------------------------------------
# 64-bit x86 assembler code (gnu as) for Skein block functions
#
# Author: Doug Whiting, Hifn/Exar
#
# This code is released to the public domain.
#----------------------------------------------------------------
#
# Syntax: AT&T operand order, GNU as with .altmacro enabled (the
# configuration loops below rely on the %expr argument form).
# The file is also run through the C preprocessor, so SKEIN_USE_ASM
# and SKEIN_LOOP are preprocessor macros, whereas SKEIN_ROUNDS,
# SKEIN_CODE_SIZE, SKEIN_PERF and SKEIN_DEBUG are tested as
# assembler symbols (.ifdef/.ifndef).
#
    .text
    .altmacro
#ifndef __clang__
    .psize 0,128                    #list file has no page boundaries
#endif
#
_MASK_ALL_  = (256+512+1024)        #all three algorithm bits
_MAX_FRAME_ = 240
#
#################
# _USE_ASM_ is a bit mask (256/512/1024) of the block sizes assembled here
#ifndef SKEIN_USE_ASM
_USE_ASM_   = _MASK_ALL_
#else
_USE_ASM_   = SKEIN_USE_ASM
#endif
#################
#configure loop unrolling
# SKEIN_LOOP is read as three decimal digits: hundreds --> Skein-256,
# tens --> Skein-512, ones --> Skein-1024 (see SKEIN_UNROLL_* below)
#ifndef SKEIN_LOOP
_SKEIN_LOOP = 2                     #default is fully unrolled for 256/512, twice for 1024
#else
_SKEIN_LOOP = SKEIN_LOOP
  .irp _NN_,%_SKEIN_LOOP            #only display loop unrolling if default changed on command line
#.print "+++ SKEIN_LOOP = \_NN_"
  .endr
#endif
# the unroll counts (0 --> fully unrolled)
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512  = (_SKEIN_LOOP / 10) % 10
SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10
#
# SKEIN_ASM_UNROLL collects, as a bit mask, the block sizes whose
# unroll count is 0 (fully unrolled)
SKEIN_ASM_UNROLL  = 0
  .irp _NN_,256,512,1024
    .if (SKEIN_UNROLL_\_NN_) == 0
SKEIN_ASM_UNROLL  = (SKEIN_ASM_UNROLL) + \_NN_
    .endif
  .endr
#################
#
.ifndef SKEIN_ROUNDS
ROUNDS_256  = 72
ROUNDS_512  = 72
ROUNDS_1024 = 80
.else
# SKEIN_ROUNDS is decoded positionally like SKEIN_LOOP; a selector s
# yields 8*(((s+5) % 10)+5) rounds
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512  = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)
ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)
# only display rounds if default size is changed on command line
.irp _NN_,256,512,1024
  .if _USE_ASM_ & \_NN_
    .irp _RR_,%(ROUNDS_\_NN_)
      .if _NN_ < 1024
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .else
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .endif
    .endr
  .endif
.endr
.endif
#################
#
# _SKEIN_CODE_SIZE: nonzero when either SKEIN_CODE_SIZE or SKEIN_PERF is defined
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE = (1)
.else
.ifdef SKEIN_PERF                   #use code size if SKEIN_PERF is defined
_SKEIN_CODE_SIZE = (1)
.else
_SKEIN_CODE_SIZE = (0)
.endif
.endif
#
#################
#
.ifndef SKEIN_DEBUG
_SKEIN_DEBUG = 0
.else
_SKEIN_DEBUG = 1
.endif
#################
#
# field offsets within the hash context structure
# (this comment must not begin with a preprocessor keyword such as
#  "define": cpp accepts blanks after the hash mark and would act on it)
#
HASH_BITS   = 0                     #bits of hash output
BCNT        = 8 + HASH_BITS         #number of bytes in BUFFER[]
TWEAK       = 8 + BCNT              #tweak values[0..1]
X_VARS      = 16 + TWEAK            #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
KW_PARITY   = 0x1BD11BDAA9FC1A22    #overall parity of key schedule words
FIRST_MASK  = ~ (1 << 6)
FIRST_MASK64= ~ (1 << 62)
#
# rotation constants for Skein
# naming: RC_<block bits>_<round number mod 8>_<mix index within the round>
#
RC_256_0_0 = 14
RC_256_0_1 = 16

RC_256_1_0 = 52
RC_256_1_1 = 57

RC_256_2_0 = 23
RC_256_2_1 = 40

RC_256_3_0 = 5
RC_256_3_1 = 37

RC_256_4_0 = 25
RC_256_4_1 = 33

RC_256_5_0 = 46
RC_256_5_1 = 12

RC_256_6_0 = 58
RC_256_6_1 = 22

RC_256_7_0 = 32
RC_256_7_1 = 32

RC_512_0_0 = 46
RC_512_0_1 = 36
RC_512_0_2 = 19
RC_512_0_3 = 37

RC_512_1_0 = 33
RC_512_1_1 = 27
RC_512_1_2 = 14
RC_512_1_3 = 42

RC_512_2_0 = 17
RC_512_2_1 = 49
RC_512_2_2 = 36
RC_512_2_3 = 39

RC_512_3_0 = 44
RC_512_3_1 = 9
RC_512_3_2 = 54
RC_512_3_3 = 56

RC_512_4_0 = 39
RC_512_4_1 = 30
RC_512_4_2 = 34
RC_512_4_3 = 24

RC_512_5_0 = 13
RC_512_5_1 = 50
RC_512_5_2 = 10
RC_512_5_3 = 17

RC_512_6_0 = 25
RC_512_6_1 = 29
RC_512_6_2 = 39
RC_512_6_3 = 43

RC_512_7_0 = 8
RC_512_7_1 = 35
RC_512_7_2 = 56
RC_512_7_3 = 22

RC_1024_0_0 = 24
RC_1024_0_1 = 13
RC_1024_0_2 = 8
RC_1024_0_3 = 47
RC_1024_0_4 = 8
RC_1024_0_5 = 17
RC_1024_0_6 = 22
RC_1024_0_7 = 37

RC_1024_1_0 = 38
RC_1024_1_1 = 19
RC_1024_1_2 = 10
RC_1024_1_3 = 55
RC_1024_1_4 = 49
RC_1024_1_5 = 18
RC_1024_1_6 = 23
RC_1024_1_7 = 52

RC_1024_2_0 = 33
RC_1024_2_1 = 4
RC_1024_2_2 = 51
RC_1024_2_3 = 13
RC_1024_2_4 = 34
RC_1024_2_5 = 41
RC_1024_2_6 = 59
RC_1024_2_7 = 17

RC_1024_3_0 = 5
RC_1024_3_1 = 20
RC_1024_3_2 = 48
RC_1024_3_3 = 41
RC_1024_3_4 = 47
RC_1024_3_5 = 28
RC_1024_3_6 = 16
RC_1024_3_7 = 25

RC_1024_4_0 = 41
RC_1024_4_1 = 9
RC_1024_4_2 = 37
RC_1024_4_3 = 31
RC_1024_4_4 = 12
RC_1024_4_5 = 47
RC_1024_4_6 = 44
RC_1024_4_7 = 30

RC_1024_5_0 = 16
RC_1024_5_1 = 34
RC_1024_5_2 = 56
RC_1024_5_3 = 51
RC_1024_5_4 = 4
RC_1024_5_5 = 53
RC_1024_5_6 = 42
RC_1024_5_7 = 41

RC_1024_6_0 = 31
RC_1024_6_1 = 44
RC_1024_6_2 = 47
RC_1024_6_3 = 46
RC_1024_6_4 = 19
RC_1024_6_5 = 42
RC_1024_6_6 = 44
RC_1024_6_7 = 25

RC_1024_7_0 = 9
RC_1024_7_1 = 48
RC_1024_7_2 = 35
RC_1024_7_3 = 52
RC_1024_7_4 = 23
RC_1024_7_5 = 31
RC_1024_7_6 = 37
RC_1024_7_7 = 20
#
# Input:  reg
# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
#
# The rotate count is looked up by pasting the three numeric arguments
# into an RC_* symbol name; no instruction is emitted for a zero count.
#
.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
  .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM    #is there anything to do?
    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
  .endif
.endm
#
#----------------------------------------------------------------
#
# MACROS: define local vars and configure stack
#
#----------------------------------------------------------------
# declare allocated space on the stack
# (localName becomes the current offset _STK_OFFS_, which then
#  advances by localSize bytes)
.macro StackVar localName,localSize
\localName = _STK_OFFS_
_STK_OFFS_ = _STK_OFFS_+(\localSize)
.endm #StackVar
#
#----------------------------------------------------------------
#
# MACRO: Configure stack frame, allocate local vars
#
# BLK_BITS = block size in bits (sets WCNT = BLK_BITS/64 words)
# KS_CNT   = number of 16-byte ksRot entries, reserved only when the
#            given block size is not fully unrolled
# debugCnt = optional count of 8-byte debug words (debug builds only)
#
# Emits: six pushes of the nonvolatile registers, rsp adjusted by
# LOCAL_SIZE, rbp set to rsp+FRAME_OFFS, and rdi/rsi/rdx/rcx stored
# into ctxPtr/blkPtr/blkCnt/bitAdd.  Locals are then addressed as
# name+F_O(%rbp), where F_O = -FRAME_OFFS.
#
.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
    WCNT    = (\BLK_BITS)/64
#
_PushCnt_   = 0                     #save nonvolatile regs on stack
  .irp _reg_,rbp,rbx,r12,r13,r14,r15
    pushq   %\_reg_
_PushCnt_   = _PushCnt_ + 1         #track count to keep alignment
  .endr
#
_STK_OFFS_  = 0                     #starting offset from rsp
    #---- local variables           #<-- rsp
    StackVar X_stk ,8*(WCNT)        #local context vars
    StackVar ksTwk ,8*3             #key schedule: tweak words
    StackVar ksKey ,8*(WCNT)+8      #key schedule: key words
  .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0
    StackVar ksRot ,16*(\KS_CNT)    #leave space for "rotation" to happen
  .endif
    StackVar Wcopy ,8*(WCNT)        #copy of input block
  .if _SKEIN_DEBUG
  .if \debugCnt + 0                 #temp location for debug X[] info
    StackVar xDebug_\BLK_BITS ,8*(\debugCnt)
  .endif
  .endif
    #NOTE(review): every size allocated above is a multiple of 8, so the
    #test below looks always true; confirm the intended modulus before changing it
  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
    StackVar align16,8              #keep 16-byte aligned (adjust for retAddr?)
tmpStk_\BLK_BITS = align16          #use this
  .endif
    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
    StackVar ctxPtr ,8              #context ptr
    StackVar blkPtr ,8              #pointer to block data
    StackVar blkCnt ,8              #number of full blocks to process
    StackVar bitAdd ,8              #bit count to add to tweak
LOCAL_SIZE  = _STK_OFFS_            #size of "local" vars
    #----
    StackVar savRegs,8*_PushCnt_    #saved registers
    StackVar retAddr,8              #return address
    #---- caller's stack frame (aligned mod 16)
#
# set up the stack frame pointer (rbp)
#
FRAME_OFFS  = ksTwk + 128           #allow short (negative) offset to ksTwk, ksKey
  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
FRAME_OFFS  = _STK_OFFS_
  .endif
F_O         = -FRAME_OFFS
#
    #put some useful defines in the .lst file (for grep)
__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
#
# Notes on stack frame setup:
#   * the most frequently used variable is X_stk[], based at [rsp+0]
#   * the next most used is the key schedule arrays, ksKey and ksTwk
#       so rbp is "centered" there, allowing short offsets to the key
#       schedule even in 1024-bit Skein case
#   * the Wcopy variables are infrequently accessed, but they have long
#       offsets from both rsp and rbp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed
#       with short offsets, except in the 1024-bit case
#
    subq    $LOCAL_SIZE,%rsp        #make room for the locals
    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
    movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
    movq    %rsi, blkPtr+F_O(%rbp)
    movq    %rdx, blkCnt+F_O(%rbp)
    movq    %rcx, bitAdd+F_O(%rbp)
#
.endm #Setup_Stack
#
#----------------------------------------------------------------
#
# MACRO: undo Setup_Stack (drop the locals, pop the saved registers in
# reverse push order); fails the assembly when the pop count does not
# match the recorded push count
#
.macro Reset_Stack
    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe?)
  .irp _reg_,r15,r14,r13,r12,rbx,rbp
    popq    %\_reg_                 #restore caller's regs
_PushCnt_   = _PushCnt_ - 1
  .endr
  .if _PushCnt_
    .error "Mismatched push/pops?"
  .endif
.endm # Reset_Stack
#
#----------------------------------------------------------------
# macros to help debug internals
#
.if _SKEIN_DEBUG
    .extern Skein_Show_Block        #calls to C routines
    .extern Skein_Show_Round
#
SKEIN_RND_SPECIAL       = 1000
SKEIN_RND_KEY_INITIAL   = SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT    = SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD      = SKEIN_RND_SPECIAL+2
#
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#                      const u08b_t *blkPtr, const u64b_t *wPtr,
#                      const u64b_t *ksPtr,const u64b_t *tsPtr)
#
_NN_ = 0
  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
    pushq   %\_reg_                 #save all volatile regs on stack before the call
_NN_ = _NN_ + 1
  .endr
    # get and push call parameters
    movq    $\BLK_BITS ,%rdi        #bits
    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
    leaq    X_VARS(%rsi),%rdx       #X (pointer)
    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
    leaq    Wcopy+F_O(%rbp),%r8     #wPtr
    leaq    ksKey+F_O(%rbp),%r9     #key pointer
    leaq    ksTwk+F_O(%rbp),%rax    #tweak pointer
    pushq   %rax                    # (pass on the stack)
    call    Skein_Show_Block        #call external debug handler
    addq    $8*1,%rsp               #discard parameters on stack
  .if (_NN_ % 2 ) == 0              #check stack alignment
    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
  .endif
  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
    popq    %\_reg_                 #restore regs
_NN_ = _NN_ - 1
  .endr
  .if _NN_
    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
  .endif
.endm # Skein_Debug_Block
#
# the macro to "call" to debug a round
#
# BLK_BITS = block size, selects the Skein_Debug_Round_<bits> helper
# R        = round number, or one of the SKEIN_RND_* special codes
# RDI_OFFS = optional constant added to the computed round number
# afterOp  = optional statement emitted after the call
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
    # call the appropriate (local) debug "function"
    pushq   %rdx                    #save rdx, so we can use it for round "number"
  .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movq    $\R,%rdx
  .else                             #compute round number using edi
_rOffs_ = \RDI_OFFS + 0
   .if \BLK_BITS == 1024
    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
   .else
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx
   .endif
  .endif
    call    Skein_Debug_Round_\BLK_BITS
    popq    %rdx                    #restore original rdx value
#
    afterOp
.endm # Skein_Debug_Round
.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
.macro Skein_Debug_Block BLK_BITS
.endm
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
.endm
#
.endif # _SKEIN_DEBUG
#
#----------------------------------------------------------------
#
# MACRO: dstReg += source register (plus optional constant immOffs)
#   The source register name is the concatenation srcReg_A srcReg_B,
#   so callers may pass a whole name (rbx) or a prefix and a number (r,8).
#   A nonzero immOffs always uses lea; otherwise lea is used unless
#   useAddOp is nonzero or ASM_NO_LEA is defined, in which case addq is used.
#
.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
  .if \immOffs + 0
    leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  .elseif ((\useAddOp + 0) == 0)
    .ifndef ASM_NO_LEA              #lea seems to be faster on Core 2 Duo CPUs!
    leaq    (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
    .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
    .endif
  .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
  .endif
.endm

# keep Intel-style ordering here, to match addReg
.macro xorReg dstReg,srcReg_A,srcReg_B
    xorq    %\srcReg_A\srcReg_B,%\dstReg
.endm
#
#----------------------------------------------------------------
#
# MACRO: define and export a code label under both lName and _lName
#
.macro C_label lName
 \lName:                            #use both "genders" to work across linkage conventions
_\lName:
    .global \lName
    .global _\lName
.endm
#
#=================================== Skein_256 =============================================
#
.if _USE_ASM_ & 256
#
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
# In:    rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd
#        (saved to the frame by Setup_Stack)
# Out:   ctx chaining vars (X_VARS) and tweak words (TWEAK) updated in place
# Saved: rbp, rbx, r12-r15 are pushed by Setup_Stack and popped by Reset_Stack
# Note:  the block loop decrements blkCnt after the first block is
#        processed, so the code as written expects blkCnt >= 1
#
#################
#
# code
#
C_label Skein_256_Process_Block
    Setup_Stack 256,((ROUNDS_256/8)+1)
    movq    TWEAK+8(%rdi),%r14
    jmp     Skein_256_block_loop
    .p2align 4
    # main hash loop for Skein_256
Skein_256_block_loop:
    #
    # general register usage:
    #   RAX..RDX = X0..X3
    #   R08..R12 = ks[0..4]
    #   R13..R15 = ts[0..2]
    #   RSP, RBP = stack/frame pointers
    #   RDI      = round counter or context pointer
    #   RSI      = temp
    #
    movq    TWEAK+0(%rdi) ,%r13
    addq    bitAdd+F_O(%rbp) ,%r13  #computed updated tweak value T0
    movq    %r14 ,%r15
    xorq    %r13 ,%r15              #now %r13.%r15 is set as the tweak

    movq    $KW_PARITY ,%r12
    movq    X_VARS+ 0(%rdi),%r8
    movq    X_VARS+ 8(%rdi),%r9
    movq    X_VARS+16(%rdi),%r10
    movq    X_VARS+24(%rdi),%r11
    movq    %r13,TWEAK+0(%rdi)      #save updated tweak value ctx->h.T[0]
    xorq    %r8 ,%r12               #start accumulating overall parity

    movq    blkPtr+F_O(%rbp) ,%rsi  #esi --> input block
    xorq    %r9 ,%r12
    movq     0(%rsi) ,%rax          #get X[0..3]
    xorq    %r10 ,%r12
    movq     8(%rsi) ,%rbx
    xorq    %r11 ,%r12
    movq    16(%rsi) ,%rcx
    movq    24(%rsi) ,%rdx

    movq    %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    addq    %r8 ,%rax               #initial key injection
    addq    %r9 ,%rbx
    addq    %r10,%rcx
    addq    %r11,%rdx
    addq    %r13,%rbx
    addq    %r14,%rcx

.if _SKEIN_DEBUG
    movq    %r14,TWEAK+ 8(%rdi)     #save updated tweak T[1] (start bit cleared?)
    movq    %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+ 0+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)

    movq    %rax,X_stk + 0(%rsp)    #save X[] on stack for Skein_Debug_Block
    movq    %rbx,X_stk + 8(%rsp)
    movq    %rcx,X_stk +16(%rsp)
    movq    %rdx,X_stk +24(%rsp)

    Skein_Debug_Block 256           #debug dump
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
.endif
#
.if (((SKEIN_ASM_UNROLL) & 256) == 0)
    # Looping build only.  Word 0 of each schedule is parked one slot past
    # the end (key slot 5, tweak slot 3): the round loop reads slots from
    # index rdi+1 upward and re-stores each consumed word 5 (key) or
    # 3 (tweak) slots further on, so the schedule rotates through memory.
    movq    %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+24+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)
.endif
    addq    $WCNT*8,%rsi            #skip the block
    movq    %rsi,blkPtr+F_O(%rbp)   #update block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 256
_UNROLL_CNT = ROUNDS_256/8
.else
_UNROLL_CNT = SKEIN_UNROLL_256
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256"
  .endif
    xorq    %rdi,%rdi               #rdi = iteration count
Skein_256_round_loop:
.endif
_Rbase_ = 0
.rept _UNROLL_CNT*2
    # all X and ks vars in regs     # (ops to "rotate" ks vars, via mem, if not unrolled)
    # round 4*_RBase_ + 0
    addReg rax, rbx
    RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0
    addReg rcx, rdx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*1+F_O(%rbp,%rdi,8),%r8
  .endif
    xorReg rbx, rax
    RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1
    xorReg rdx, rcx
  .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+3) % 5)
    .irp _r1_,%(13+(_Rbase_+2) % 3)
    leaq    (%r\_r0_,%r\_r1_),%rdi  #precompute key injection value for %rcx
    .endr
    .endr
  .endif
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
  .endif
    Skein_Debug_Round 256,%(4*_Rbase_+1)

    # round 4*_Rbase_ + 1
    addReg rax, rdx
    RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0
    xorReg rdx, rax
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*2+F_O(%rbp,%rdi,8),%r9
  .endif
    addReg rcx, rbx
    RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1
    xorReg rbx, rcx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*4+F_O(%rbp,%rdi,8),%r11
  .endif
    Skein_Debug_Round 256,%(4*_Rbase_+2)
  .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+2) % 5)
    .irp _r1_,%(13+(_Rbase_+1) % 3)
    leaq    (%r\_r0_,%r\_r1_),%rsi  #precompute key injection value for %rbx
    .endr
    .endr
  .endif
    # round 4*_Rbase_ + 2
    addReg rax, rbx
    RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0
    addReg rcx, rdx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    ksKey+8*3+F_O(%rbp,%rdi,8),%r10
  .endif
    xorReg rbx, rax
    RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1
    xorReg rdx, rcx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    movq    %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
    leaq    1(%r11,%rdi),%r11       #precompute key + tweak
  .endif
    Skein_Debug_Round 256,%(4*_Rbase_+3)
    # round 4*_Rbase_ + 3
    addReg rax, rdx
    RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0
    addReg rcx, rbx
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    addq    ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak
    movq    %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak
  .endif
    xorReg rdx, rax
    RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1
    xorReg rbx, rcx
    Skein_Debug_Round 256,%(4*_Rbase_+4)
  .if ((SKEIN_ASM_UNROLL) & 256) == 0
    addReg r9 ,r13                  #precompute key+tweak
  .endif
    #inject key schedule words
_Rbase_ = _Rbase_+1
  .if (SKEIN_ASM_UNROLL) & 256
    addReg rax,r,%(8+((_Rbase_+0) % 5))
    addReg rbx,rsi
    addReg rcx,rdi
    addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
  .else
    incq    %rdi
    addReg rax,r8
    addReg rcx,r10
    addReg rbx,r9
    addReg rdx,r11
  .endif
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
.endr #rept _UNROLL_CNT
#
.if ((SKEIN_ASM_UNROLL) & 256) == 0
    cmpq    $2*(ROUNDS_256/8),%rdi
    jb      Skein_256_round_loop
.endif # (SKEIN_ASM_UNROLL & 256) == 0
    movq    ctxPtr+F_O(%rbp),%rdi   #restore rdi --> context

    #----------------------------
    # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
    movq    $FIRST_MASK64 ,%r14
    xorq    Wcopy+ 0+F_O(%rbp),%rax
    xorq    Wcopy+ 8+F_O(%rbp),%rbx
    xorq    Wcopy+16+F_O(%rbp),%rcx
    xorq    Wcopy+24+F_O(%rbp),%rdx
    andq    TWEAK+ 8(%rdi),%r14     #r14 = T[1] with bit 62 cleared
    movq    %rax,X_VARS+ 0(%rdi)    #store final result
    movq    %rbx,X_VARS+ 8(%rdi)
    movq    %rcx,X_VARS+16(%rdi)
    movq    %rdx,X_VARS+24(%rdi)

    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_256_block_loop
    movq    %r14,TWEAK+ 8(%rdi)
    Reset_Stack
    ret
Skein_256_Process_Block_End:

  .if _SKEIN_DEBUG
Skein_Debug_Round_256:              #here with rdx == round "number" from macro
    pushq   %rsi                    #save two regs for BLK_BITS-specific parms
    pushq   %rdi
    movq    24(%rsp),%rdi           #get back original rdx (pushed on stack in macro call) to rdi
    movq    %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it
    movq    %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!)
    movq    %rcx,X_stk+16+F_O(%rbp)
    movq    %rdi,X_stk+24+F_O(%rbp)

    movq    ctxPtr+F_O(%rbp),%rsi   #ctx_hdr_ptr
    movq    $256,%rdi               #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
  .endif
#
.if _SKEIN_CODE_SIZE
# returns in rax the byte distance from Skein_256_Process_Block to
# Skein_256_Process_Block_End
C_label Skein_256_Process_Block_CodeSize
    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
    ret
#
# returns in rax the unroll count in effect, or 0 when fully unrolled
C_label Skein_256_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_256/8
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
#
.endif #_USE_ASM_ & 256
#
#=================================== Skein_512 =============================================
#
.if _USE_ASM_ & 512
#
#   Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
#
#################
# MACRO: one round for 512-bit blocks
#
#   rn0..rn7  register numbers (8..15) of the four word pairs mixed this round:
#             (rn0,rn1) (rn2,rn3) (rn4,rn5) (rn6,rn7)
#   _Rn_      round number; (_Rn_ % 8) and the pair index 0..3 are handed to RotL64
#   op1..op4  optional instructions, one emitted after each of the four mixes
#             (used by the caller to interleave key schedule loads)
#
.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
#
    addReg r\rn0, r\rn1
    RotL64 r\rn1, 512,%((\_Rn_) % 8),0
    xorReg r\rn1, r\rn0
    \op1
    addReg r\rn2, r\rn3
    RotL64 r\rn3, 512,%((\_Rn_) % 8),1
    xorReg r\rn3, r\rn2
    \op2
    addReg r\rn4, r\rn5
    RotL64 r\rn5, 512,%((\_Rn_) % 8),2
    xorReg r\rn5, r\rn4
    \op3
    addReg r\rn6, r\rn7
    RotL64 r\rn7, 512,%((\_Rn_) % 8),3
    xorReg r\rn7, r\rn6
    \op4
    Skein_Debug_Round 512,%(\_Rn_+1),-4
#
.endm   #R_512_OneRound
#
#################
# MACRO: four rounds (followed by one key injection) for 512-bit blocks
#
#   _RR_  base round number (0 mod 4)
#
#   Unrolled build: _II_ is the injection counter computed at assembly time;
#   the op slots of R_512_OneRound preload key/tweak words into rax..rdx,rsi.
#   Looping build:  rdi is the injection counter (bumped here) and indexes the
#   key/tweak schedule on the stack; the op slots also copy schedule words
#   further up the stack (the "rotate").
#
.macro R_512_FourRounds _RR_                    #RR = base round number (0 mod 4)
    .if ((SKEIN_ASM_UNROLL) & 512)
    # here for fully unrolled case.
    _II_ = ((\_RR_)/4) + 1                      #key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
    # inject the key schedule
    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
    addReg r11, rax
    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
    addReg r12, rbx
    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
    addReg r13, rcx
    addReg r14, rdx
    addReg r15, rsi,,,(_II_)
    .else
    # here for looping case                     #"rotate" key/tweak schedule (move up on stack)
    incq    %rdi                                #bump key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
    # inject the key schedule
    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
    addReg r11, rax
    addReg r12, rbx
    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
    addReg r13, rcx
    addReg r14, rdx
    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
    addReg r15, rsi
    addReg r15, rdi                             #inject the round number
    .endif

    #show the result of the key injection
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
.endm   #R_512_FourRounds
#
#################
# instantiated code
#
# Skein_512_Process_Block (C prototype in the comment at the top of this section)
#   On entry rdi is used as the context pointer: TWEAK and X_VARS are addressed
#   off it right after Setup_Stack. blkPtr, blkCnt, bitAdd and ctxPtr are read
#   back from the frame through rbp, so Setup_Stack is presumed to store the
#   arguments there (macro not shown here; confirm against its definition).
#   Per block: update T[0], build the key/tweak schedule on the stack, inject
#   the first key, run the rounds, feed forward into ctx->X[], repeat while
#   blkCnt is nonzero, then write the masked T[1] back to the context.
#
C_label Skein_512_Process_Block
    Setup_Stack 512,ROUNDS_512/8
    movq    TWEAK+ 8(%rdi),%rbx                 #rbx = T[1], carried in rbx from block to block
    jmp     Skein_512_block_loop
    .p2align 4
    # main hash loop for Skein_512
Skein_512_block_loop:
    # general register usage:
    #   RAX..RDX       = temps for key schedule pre-loads
    #   R8 ..R15       = X0..X7
    #   RSP, RBP       = stack/frame pointers
    #   RDI            = round counter or context pointer
    #   RSI            = temp
    #
    movq    TWEAK+ 0(%rdi),%rax
    addq    bitAdd+F_O(%rbp),%rax               #computed updated tweak value T0
    movq    %rbx,%rcx
    xorq    %rax,%rcx                           #%rax/%rbx/%rcx = tweak schedule
    movq    %rax,TWEAK+ 0(%rdi)                 #save updated tweak value ctx->h.T[0]
    movq    %rax,ksTwk+ 0+F_O(%rbp)
    movq    $KW_PARITY,%rdx
    movq    blkPtr+F_O(%rbp),%rsi               #%rsi --> input block
    movq    %rbx,ksTwk+ 8+F_O(%rbp)
    movq    %rcx,ksTwk+16+F_O(%rbp)
    .irp _Rn_,8,9,10,11,12,13,14,15
    movq    X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
    xorq    %r\_Rn_,%rdx                        #compute overall parity
    movq    %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
    .endr                                       #load state into %r8 ..%r15, compute parity
    movq    %rdx,ksKey+8*(8)+F_O(%rbp)          #save key schedule parity

    addReg r13,rax                              #precompute key injection for tweak
    addReg r14, rbx
.if _SKEIN_DEBUG
    movq    %rbx,TWEAK+ 8(%rdi)                 #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
.endif
    movq     0(%rsi),%rax                       #load input block
    movq     8(%rsi),%rbx
    movq    16(%rsi),%rcx
    movq    24(%rsi),%rdx
    addReg r8 , rax                             #do initial key injection
    addReg r9 , rbx
    movq    %rax,Wcopy+ 0+F_O(%rbp)             #keep local copy for feedforward
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    addReg r10, rcx
    addReg r11, rdx
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    movq    32(%rsi),%rax
    movq    40(%rsi),%rbx
    movq    48(%rsi),%rcx
    movq    56(%rsi),%rdx
    addReg r12, rax
    addReg r13, rbx
    addReg r14, rcx
    addReg r15, rdx
    movq    %rax,Wcopy+32+F_O(%rbp)
    movq    %rbx,Wcopy+40+F_O(%rbp)
    movq    %rcx,Wcopy+48+F_O(%rbp)
    movq    %rdx,Wcopy+56+F_O(%rbp)

.if _SKEIN_DEBUG
    .irp _Rn_,8,9,10,11,12,13,14,15             #save values on stack for debug output
    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
    .endr

    Skein_Debug_Block 512                       #debug dump
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
.endif
    addq    $8*WCNT,%rsi                        #skip the block
    movq    %rsi,blkPtr+F_O(%rbp)               #update block pointer
    #
    #################
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 512
_UNROLL_CNT = ROUNDS_512/8
.else
_UNROLL_CNT = SKEIN_UNROLL_512
    .if ((ROUNDS_512/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_512"
    .endif
    xorq    %rdi,%rdi                           #rdi = round counter
Skein_512_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2                             #four rounds per repetition
    R_512_FourRounds %(4*_Rbase_+00)
_Rbase_ = _Rbase_+1
.endr                                           #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 512) == 0
    cmpq    $2*(ROUNDS_512/8),%rdi
    jb      Skein_512_round_loop
    movq    ctxPtr+F_O(%rbp),%rdi               #restore rdi --> context
.endif
    # end of rounds
    #################
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
    # (rbx is rebuilt along the way as FIRST_MASK64 & T[1] for the next block)
    .irp _Rn_,8,9,10,11,12,13,14,15
    .if (\_Rn_ == 8)
    movq    $FIRST_MASK64,%rbx
    .endif
    xorq    Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
    movq    %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)    #and store result
    .if (\_Rn_ == 14)
    andq    TWEAK+ 8(%rdi),%rbx
    .endif
    .endr
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_512_block_loop
    movq    %rbx,TWEAK+ 8(%rdi)                 #last block done: write the masked T[1] back

    Reset_Stack
    ret
Skein_512_Process_Block_End:
#
    .if _SKEIN_DEBUG
#----------------------------------------------------------------
# Skein_Debug_Round_512: 512-bit front end for Skein_Debug_Round_Common
#   In:    rdx       = "round number"
#          r8..r15   = state words, stored here to X_stk[0..7]
#   Does:  pushes rsi and rdi, loads rsi = ctxPtr and rdi = 512, then
#          jumps (no return here) to Skein_Debug_Round_Common
#----------------------------------------------------------------
Skein_Debug_Round_512:
    pushq   %rsi                                #save two regs for BLK_BITS-specific parms
    pushq   %rdi
    .irp _Rn_,8,9,10,11,12,13,14,15             #save X[] state on stack so debug routines can access it
    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
    .endr
    movq    ctxPtr+F_O(%rbp),%rsi               #ctx_hdr_ptr
    movq    $512,%rdi                           #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
    .endif
#
.if _SKEIN_CODE_SIZE
# Skein_512_Process_Block_CodeSize: returns in rax the byte distance between
# the labels Skein_512_Process_Block and Skein_512_Process_Block_End
C_label Skein_512_Process_Block_CodeSize
    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
    ret
#
# Skein_512_Unroll_Cnt: returns _UNROLL_CNT in rax, or 0 when _UNROLL_CNT
# equals ROUNDS_512/8
C_label Skein_512_Unroll_Cnt
    .if _UNROLL_CNT <> (ROUNDS_512/8)
    movq    $_UNROLL_CNT,%rax
    .else
    xorq    %rax,%rax
    .endif
    ret
.endif
#
.endif                                          # _USE_ASM_ & 512
#
#=================================== Skein1024 =============================================
.if _USE_ASM_ & 1024
#
# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
#
#################
# use details of permutation to make register assignments
#
# o1K_<reg> = index of the X[] word that lives in <reg>
o1K_rdi =  0                                    #offsets in X[] associated with each register
o1K_rsi =  1
o1K_rbp =  2
o1K_rax =  3
o1K_rcx =  4                                    #rcx is "shared" with X6, since X4/X6 alternate
o1K_rbx =  5
o1K_rdx =  7
o1K_r8  =  8
o1K_r9  =  9
o1K_r10 = 10
o1K_r11 = 11
o1K_r12 = 12
o1K_r13 = 13
o1K_r14 = 14
o1K_r15 = 15
#
rIdx_offs = tmpStk_1024                         #frame slot holding the key injection index (looping build)
#
#################
# MACRO: one MIX of a word pair for 1024-bit blocks, with inline key
#        injection on the last round of each group of four
#
#   w0,w1      X[] indices of the pair (select key/tweak words and debug slots)
#   reg0,reg1  registers holding X[w0], X[w1]
#   _RN0_      round number expression; injection happens when ((_RN0_) & 3) == 3
#   _Rn1_      forwarded unchanged as the last argument of RotL64
#   op1        optional instruction emitted at the very end
#
#   Looping build, w0 == 0: rdi (X0) is parked in X_stk and rdi is reloaded
#   with the injection index from rIdx_offs; no key word is added to X0 here.
#
.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
    addReg \reg0 , \reg1                        #perform the MIX
    RotL64 \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
    xorReg \reg1 , \reg0
.if ((\_RN0_) & 3) == 3                         #time to do key injection?
    .if _SKEIN_DEBUG
    movq    %\reg0 , xDebug_1024+8*\w0(%rsp)    #save intermediate values for Debug_Round
    movq    %\reg1 , xDebug_1024+8*\w1(%rsp)    # (before inline key injection)
    .endif
_II_ = ((\_RN0_)/4)+1                           #injection count
    .if (SKEIN_ASM_UNROLL) & 1024               #here to do fully unrolled key injection
    addq    ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
    addq    ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
    .if     \w1 == 13                           #tweak injection
    addq    ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1
    .elseif \w0 == 14
    addq    ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0
    .elseif \w1 == 15
    addq    $_II_, %\reg1                       #(injection counter)
    .endif
    .else                                       #here to do looping key injection
    .if  (\w0 == 0)
    movq    %rdi, X_stk+8*\w0(%rsp)             #if so, store N0 so we can use reg as index
    movq    rIdx_offs(%rsp),%rdi                #get the injection counter index into rdi
    .else
    addq    ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0   #even key injection
    .endif
    .if     \w1 == 13                           #tweak injection
    addq    ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
    .elseif \w0 == 14
    addq    ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
    .elseif \w1 == 15
    addReg \reg1,rdi,,,1                        #(injection counter)
    .endif
    addq    ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1   #odd key injection
    .endif
.endif
    # insert the op provided, .if any
    \op1
.endm
#################
# MACRO: four rounds for 1024-bit blocks
#
#   _RR_  base round number (0 mod 4)
#
#   Each round issues eight r1024_Mix calls. rcx holds X4 and X6 in turn;
#   the op argument of the third and fourth mix stores one and loads the other
#   through X_stk. The fourth round does the inline key injection inside
#   r1024_Mix. In the looping build the tail below then copies one key word
#   and one tweak word further up the stack, bumps the injection index in
#   rIdx_offs, and finishes the X0 injection that r1024_Mix left out.
#
.macro r1024_FourRounds _RR_                    #RR = base round number (0 mod 4)
    # should be here with X4 set properly, X6 stored on stack
_Rn_ = (\_RR_) + 0
    r1024_Mix  0, 1,rdi,rsi,_Rn_,0
    r1024_Mix  2, 3,rbp,rax,_Rn_,1
    r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
    r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
    r1024_Mix 10,11,r10,r11,_Rn_,5
    r1024_Mix 12,13,r12,r13,_Rn_,6
    r1024_Mix  6, 7,rcx,rdx,_Rn_,3
    r1024_Mix 14,15,r14,r15,_Rn_,7
    .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 1
    r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
    r1024_Mix  2,13,rbp,r13,_Rn_,1
    r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
    r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
    r1024_Mix 12, 3,r12,rax,_Rn_,5
    r1024_Mix 14, 5,r14,rbx,_Rn_,6
    r1024_Mix  4,15,rcx,r15,_Rn_,3
    r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
    .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 2
    r1024_Mix  0, 7,rdi,rdx,_Rn_,0
    r1024_Mix  2, 5,rbp,rbx,_Rn_,1
    r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
    r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
    r1024_Mix 14,13,r14,r13,_Rn_,5
    r1024_Mix  8,11,r8 ,r11,_Rn_,6
    r1024_Mix  6, 1,rcx,rsi,_Rn_,3
    r1024_Mix 10, 9,r10,r9 ,_Rn_,7
    .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 3
    r1024_Mix  0,15,rdi,r15,_Rn_,0
    r1024_Mix  2,11,rbp,r11,_Rn_,1
    r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
    r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
    r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
    r1024_Mix 10, 3,r10,rax,_Rn_,6
    r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
    r1024_Mix 12, 7,r12,rdx,_Rn_,7
    .if _SKEIN_DEBUG
    Skein_Debug_Round 1024,%(_Rn_+1)
    .endif

    .if ((SKEIN_ASM_UNROLL) & 1024) == 0        #here with rdi == rIdx, X0 on stack
    #"rotate" the key schedule on the stack
i8 = o1K_r8
i0 = o1K_rdi
    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
    movq    ksKey+8* 0(%rsp,%rdi,8),%r8         #get  key  word
    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
    movq    ksTwk+8* 0(%rsp,%rdi,8),%r8         #get tweak word
    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
    movq    X_stk+8*i8(%rsp) ,%r8               #get the reg back
    incq    %rdi                                #bump the index
    movq    %rdi, rIdx_offs (%rsp)              #save rdi again
    movq    ksKey+8*i0(%rsp,%rdi,8),%rdi        #get the key schedule word for X0 back
    addq    X_stk+8*i0(%rsp) ,%rdi              #perform the X0 key injection
    .endif
    #show the result of the key injection
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
.endm   #r1024_FourRounds
#
################
# code
#
# Skein1024_Process_Block
#   void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#   On entry rdi is used as the context pointer (TWEAK and X_VARS are addressed
#   off it right after Setup_Stack). The remaining arguments are read back from
#   the frame, so Setup_Stack is presumed to store them there (macro not shown
#   here; confirm against its definition).
#   Per block: update T[0], build the key/tweak schedule on the stack, inject
#   the first key, run the rounds, feed forward into ctx->X[], repeat while
#   blkCnt is nonzero, then write the masked T[1] back to the context.
#
C_label Skein1024_Process_Block
#
    Setup_Stack 1024,ROUNDS_1024/8,WCNT
    movq    TWEAK+ 8(%rdi),%r9                  #r9 = T[1], carried in r9 from block to block
    jmp     Skein1024_block_loop
    # main hash loop for Skein1024
    .p2align 4
Skein1024_block_loop:
    # register usage inside the rounds (matches the o1K_ table and r1024_FourRounds):
    #   RDI,RSI,RBP,RAX = X0..X3
    #   RCX             = X4 or X6 (the other one is parked in X_stk)
    #   RBX,RDX         = X5,X7
    #   R8 ..R15        = X8..X15
    #   RSP             = stack pointer; the rounds address the frame through it
    # until the state is loaded below, RDI = context pointer and RBP = frame pointer
    #
    .if ((SKEIN_ASM_UNROLL) & 1024) == 0
    xorq    %rax,%rax                           #init loop index on the stack
    movq    %rax,rIdx_offs(%rsp)
    .endif
    movq    TWEAK+ 0(%rdi),%r8
    addq    bitAdd+ F_O(%rbp),%r8               #computed updated tweak value T0
    movq    %r9 ,%r10
    xorq    %r8 ,%r10                           #%r8/%r9/%r10 = tweak schedule
    movq    %r8 ,TWEAK+ 0(%rdi)                 #save updated tweak value ctx->h.T[0]
    movq    %r8 ,ksTwk+ 0+F_O(%rbp)
    movq    %r9 ,ksTwk+ 8+F_O(%rbp)             #keep values in %r8 ,%r9  for initial tweak injection below
    movq    %r10,ksTwk+16+F_O(%rbp)
    .if _SKEIN_DEBUG
    movq    %r9 ,TWEAK+ 8(%rdi)                 #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
    .endif
    movq    blkPtr +F_O(%rbp),%rsi              # rsi --> input block
    movq    $KW_PARITY ,%rax                    #overall key schedule parity

    # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
    .irp _rN_,0,1,2,3,4,6                       #process the "initial" words, using r14/r15 as temps
    movq    X_VARS+8*\_rN_(%rdi),%r14           #get state word
    movq    8*\_rN_(%rsi),%r15                  #get msg   word
    xorq    %r14,%rax                           #update key schedule overall parity
    movq    %r14,ksKey +8*\_rN_+F_O(%rbp)       #save key schedule word on stack
    movq    %r15,Wcopy +8*\_rN_+F_O(%rbp)       #save local msg Wcopy
    addq    %r15,%r14                           #do the initial key injection
    movq    %r14,X_stk +8*\_rN_ (%rsp)          #save initial state var on stack
    .endr
    # now process the rest, using the "real" registers
    #     (MUST do it in reverse order to inject tweaks r8/r9 first)
    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
_oo_ = o1K_\_rr_                                #offset associated with the register
    movq    X_VARS+8*_oo_(%rdi),%\_rr_          #get key schedule word from context
    movq    8*_oo_(%rsi),%rcx                   #get next input msg word
    movq    %\_rr_, ksKey +8*_oo_(%rsp)         #save key schedule on stack
    xorq    %\_rr_, %rax                        #accumulate key schedule parity
    movq    %rcx,Wcopy+8*_oo_+F_O(%rbp)         #save copy of msg word for feedforward
    addq    %rcx,%\_rr_                         #do the initial  key  injection
    .if    _oo_ == 13                           #do the initial tweak injection
    addReg \_rr_,r8                             #          (only in words 13/14)
    .elseif _oo_ == 14
    addReg \_rr_,r9
    .endif
    .endr
    movq    %rax,ksKey+8*WCNT+F_O(%rbp)         #save key schedule parity
.if _SKEIN_DEBUG
    Skein_Debug_Block 1024                      #initial debug dump
.endif
    addq    $8*WCNT,%rsi                        #bump the msg ptr
    movq    %rsi,blkPtr+F_O(%rbp)               #save bumped msg ptr
    # re-load words 0..4 from stack, enter the main loop
    .irp _rr_,rdi,rsi,rbp,rax,rcx               #(no need to re-load x6, already on stack)
    movq    X_stk+8*o1K_\_rr_(%rsp),%\_rr_      #re-load state and get ready to go!
    .endr
.if _SKEIN_DEBUG
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection
.endif
    #
    #################
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 1024
_UNROLL_CNT = ROUNDS_1024/8
.else
_UNROLL_CNT = SKEIN_UNROLL_1024
    .if ((ROUNDS_1024/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_1024"
    .endif
Skein1024_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2                             #implement the rounds, 4 at a time
    r1024_FourRounds %(4*_Rbase_+00)
_Rbase_ = _Rbase_+1
.endr                                           #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 1024) == 0
    cmpq    $2*(ROUNDS_1024/8),rIdx_offs(%rsp)  #see .if we are done (rIdx_offs is the tmpStk_1024 slot)
    jb      Skein1024_round_loop
.endif
    # end of rounds
    #################
    #
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
    # (r9 is rebuilt along the way as FIRST_MASK64 & T[1] for the next block)
    movq    %rdx,X_stk+8*o1K_rdx(%rsp)          #we need a register.  x6 already on stack
    movq    ctxPtr(%rsp),%rdx

    .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7
_oo_ = o1K_\_rr_
    xorq    Wcopy +8*_oo_(%rsp),%\_rr_          #feedforward XOR
    movq    %\_rr_,X_VARS+8*_oo_(%rdx)          #save result into context
    .if (_oo_ ==  9)
    movq    $FIRST_MASK64 ,%r9
    .endif
    .if (_oo_ == 14)
    andq    TWEAK+ 8(%rdx),%r9
    .endif
    .endr
    #
    movq    X_stk +8*6(%rsp),%rax               #now process x6,x7 (skipped in .irp above)
    movq    X_stk +8*7(%rsp),%rbx
    xorq    Wcopy +8*6(%rsp),%rax
    xorq    Wcopy +8*7(%rsp),%rbx
    movq    %rax,X_VARS+8*6(%rdx)
    decq    blkCnt(%rsp)                        #set zero flag iff done
    movq    %rbx,X_VARS+8*7(%rdx)

    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
    # go back for more blocks, if needed
    movq    ctxPtr(%rsp),%rdi                   #do not muck with the flags here!
    lea     FRAME_OFFS(%rsp),%rbp               #mov and lea leave the flags from decq intact
    jnz     Skein1024_block_loop
    movq    %r9 ,TWEAK+ 8(%rdx)                 #last block done: write the masked T[1] back
    Reset_Stack
    ret
#
Skein1024_Process_Block_End:
#
.if _SKEIN_DEBUG
#----------------------------------------------------------------
# Skein_Debug_Round_1024: 1024-bit front end for Skein_Debug_Round_Common
#   In:    rdx = "round number"; the original rdx sits on the stack just
#                above the return address (pushed in the macro call)
#   Does:  stores the live state registers into X_stk[] (rsp-relative,
#          corrected by _SP_OFFS_), picks the X_stk slot for rdi and rcx
#          from the round number, pushes rsi and rdi, loads rsi = ctxPtr
#          and rdi = 1024, then jumps to Skein_Debug_Round_Common
#----------------------------------------------------------------
Skein_Debug_Round_1024:
    # call here with rdx  = "round number",
_SP_OFFS_ = 8*2                                 #stack "offset" here: rdx, return addr
    #
    #save rest of X[] state on stack so debug routines can access it
    .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
    movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
    .endr
    # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it is already on stack
    cmpq    $SKEIN_RND_SPECIAL,%rdx             #special rounds always save
    jae     save_x0
    testq   $3,%rdx                             #otherwise only if rdx != 0 mod 4
    jz      save_x0_not
save_x0:
    movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
save_x0_not:
    #figure out the x4/x6 swapping state and save the correct one!
    cmpq    $SKEIN_RND_SPECIAL,%rdx             #special rounds always do x4
    jae     save_x4
    testq   $1,%rdx                             #and even ones have x4 as well
    jz      save_x4
    movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp)
    jmp     debug_1024_go
save_x4:
    movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
debug_1024_go:
    #now all is saved in Xstk[] except for rdx
    pushq   %rsi                                #save two regs for BLK_BITS-specific parms
    pushq   %rdi
_SP_OFFS_ = _SP_OFFS_ + 16                      #adjust stack offset accordingly (now 32)

    movq    _SP_OFFS_-8(%rsp),%rsi              #get back original %rdx (pushed on stack in macro call)
    movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]

    movq    ctxPtr+_SP_OFFS_(%rsp),%rsi         #rsi = ctx_hdr_ptr
    movq    $1024,%rdi                          #rdi = block size
    jmp     Skein_Debug_Round_Common
.endif
#
.if _SKEIN_CODE_SIZE
# Skein1024_Process_Block_CodeSize: returns in rax the byte distance between
# the labels Skein1024_Process_Block and Skein1024_Process_Block_End
C_label Skein1024_Process_Block_CodeSize
    movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
    ret
#
# Skein1024_Unroll_Cnt: returns _UNROLL_CNT in rax, or 0 when _UNROLL_CNT
# equals ROUNDS_1024/8
C_label Skein1024_Unroll_Cnt
    .if _UNROLL_CNT <> (ROUNDS_1024/8)
    movq    $_UNROLL_CNT,%rax
    .else
    xorq    %rax,%rax
    .endif
    ret
.endif
#
.endif                                          # _USE_ASM_ and 1024
#
.if _SKEIN_DEBUG
#----------------------------------------------------------------
#local debug routine to set up for calls to:
#  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
#                       [       rdi                        rsi    rdx              rcx]
#
# here with %rdx = round number
#           %rsi = ctx_hdr_ptr
#           %rdi = block size (256/512/1024)
# on stack: saved rdi, saved rsi, retAddr, saved rdx
#
# Saves rax,rbx,rcx,rbp,r8..r15 around the call and restores them afterwards,
# then pops rdi and rsi and returns to the address pushed by the original
# call; the saved rdx is left on the stack for the caller.
#
Skein_Debug_Round_Common:
_SP_OFFS_ = 32                                  #account for four words on stack already
    .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs
    pushq   %\_rr_
_SP_OFFS_ = _SP_OFFS_+8
    .endr
    .if (_SP_OFFS_ % 16)                        # make sure stack is still 16-byte aligned here
    .error  "Debug_Round_Common: stack alignment"
    .endif
    # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
    leaq    X_stk+_SP_OFFS_(%rsp),%rcx          #adjust for reg pushes, return address
    cmpq    $SKEIN_RND_FEED_FWD,%rdx            #special handling for feedforward "round"?
    jnz     _got_rcxA
    leaq    X_VARS(%rsi),%rcx                   #feedforward: show the context X[] instead
_got_rcxA:
    .if _USE_ASM_ & 1024
    # special handling for 1024-bit case
    #    (for rounds right before with key injection:
    #        use xDebug_1024[] instead of X_stk[])
    cmpq    $SKEIN_RND_SPECIAL,%rdx
    jae     _got_rcxB                           #special round: keep rcx as computed above
    testq   %rdx,%rdx
    jz      _got_rcxB                           #round number zero: keep rcx
    testq   $3,%rdx
    jne     _got_rcxB                           #not a multiple of 4: keep rcx
    cmpq    $1024,%rdi                          #only 1024-bit(s) for now
    jne     _got_rcxB
    leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx    #round just before key injection
_got_rcxB:
    .endif
    call    Skein_Show_Round                    #call external debug handler

    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs
    popq    %\_rr_
_SP_OFFS_ = _SP_OFFS_-8
    .endr
    .if _SP_OFFS_ - 32
    .error   "Debug_Round_Common: push/pop misalignment!"
    .endif
    popq    %rdi
    popq    %rsi
    ret
.endif
#----------------------------------------------------------------
    .section .note.GNU-stack,"",@progbits

    .end