xref: /freebsd/sys/crypto/skein/amd64/skein_block_asm.S (revision 0946e70a3b60dec23922cf3e0c313cb0917fee0a)
1#
2#----------------------------------------------------------------
3# 64-bit x86 assembler code (gnu as) for Skein block functions
4#
5# Author: Doug Whiting, Hifn/Exar
6#
7# This code is released to the public domain.
8#----------------------------------------------------------------
9# $FreeBSD$
10#
11    .text
12    .altmacro
13#ifndef __clang__
14    .psize 0,128                            #list file has no page boundaries
15#endif
16#
#
# Algorithm selection.  _USE_ASM_ is a bit mask made of the block sizes
# (256, 512, 1024); each Skein section further down is assembled only when
# its bit is set.  When SKEIN_USE_ASM is given to the C preprocessor it
# replaces the default of all three sizes.
#
17_MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
18_MAX_FRAME_ =  240
19#
20#################
21#ifndef SKEIN_USE_ASM
22_USE_ASM_         = _MASK_ALL_
23#else
24_USE_ASM_         = SKEIN_USE_ASM
25#endif
26#################
27#configure loop unrolling
#  _SKEIN_LOOP is read as three decimal digits, one per block size:
#  hundreds = Skein-256, tens = Skein-512, ones = Skein-1024.  A digit of 0
#  requests a fully unrolled round loop for that size.  SKEIN_ASM_UNROLL
#  collects the block sizes whose digit is 0, so later code can test it
#  as a bit mask (256/512/1024).
28#ifndef SKEIN_LOOP
29_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
30#else
31_SKEIN_LOOP       = SKEIN_LOOP
32  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
33#.print  "+++ SKEIN_LOOP = \_NN_"
34  .endr
35#endif
36# the unroll counts (0 --> fully unrolled)
37SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
38SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
39SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
40#
41SKEIN_ASM_UNROLL  = 0
42  .irp _NN_,256,512,1024
43    .if (SKEIN_UNROLL_\_NN_) == 0
44SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
45    .endif
46  .endr
47#################
48#
#
# Round counts.  Without the assembler symbol SKEIN_ROUNDS the counts are
# 72/72/80.  Otherwise SKEIN_ROUNDS is read as three decimal digits
# (hundreds = 256, tens = 512, ones = 1024) and each digit d selects
# 8*(((d+5) % 10)+5) rounds: d=0 gives 80, d=5 gives 40, d=9 gives 72.
# The chosen values are printed for every block size enabled in _USE_ASM_.
#
49.ifndef SKEIN_ROUNDS
50ROUNDS_256  =   72
51ROUNDS_512  =   72
52ROUNDS_1024 =   80
53.else
54ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
55ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
56ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
57# only display rounds if default size is changed on command line
58.irp _NN_,256,512,1024
59  .if _USE_ASM_ && \_NN_
60    .irp _RR_,%(ROUNDS_\_NN_)
61      .if _NN_ < 1024
62.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
63      .else
64.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
65      .endif
66    .endr
67  .endif
68.endr
69.endif
70#################
71#
#
# Optional features, both keyed off assembler symbols:
#   _SKEIN_CODE_SIZE = 1 when SKEIN_CODE_SIZE or SKEIN_PERF is defined; it
#                      enables the *_CodeSize and *_Unroll_Cnt helpers.
#   _SKEIN_DEBUG     = 1 when SKEIN_DEBUG is defined; it enables the calls
#                      to the external Skein_Show_* routines.
#
72.ifdef SKEIN_CODE_SIZE
73_SKEIN_CODE_SIZE = (1)
74.else
75.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
76_SKEIN_CODE_SIZE = (1)
77.else
78_SKEIN_CODE_SIZE = (0)
79.endif
80.endif
81#
82#################
83#
84.ifndef SKEIN_DEBUG
85_SKEIN_DEBUG      = 0
86.else
87_SKEIN_DEBUG      = 1
88.endif
89#################
90#
91# Offsets of fields in the hash context structure (bytes from the ctx pointer)
92#
93HASH_BITS   =   0                   #bits of hash output
94BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
95TWEAK       =   8 + BCNT            #tweak values[0..1]
96X_VARS      =  16 + TWEAK           #chaining vars
97#
98#(Note: buffer[] in context structure is NOT needed here :-)
99#
100KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
101FIRST_MASK  =   ~ (1 <<  6)
102FIRST_MASK64=   ~ (1 << 62)         #ANDed into T[1] after each block (clears bit 62)
103#
104# rotation constants for Skein
#   Naming: RC_<block bits>_<round number mod 8>_<mix index>.  The RotL64
#   macro pastes these three fields together to select the rotate count,
#   so the symbol names are part of the interface.  There are 2, 4 and 8
#   constants per round for the 256-, 512- and 1024-bit tables.
105#
106RC_256_0_0  = 14
107RC_256_0_1  = 16
108
109RC_256_1_0  = 52
110RC_256_1_1  = 57
111
112RC_256_2_0  = 23
113RC_256_2_1  = 40
114
115RC_256_3_0  =  5
116RC_256_3_1  = 37
117
118RC_256_4_0  = 25
119RC_256_4_1  = 33
120
121RC_256_5_0  = 46
122RC_256_5_1  = 12
123
124RC_256_6_0  = 58
125RC_256_6_1  = 22
126
127RC_256_7_0  = 32
128RC_256_7_1  = 32
129
130RC_512_0_0  = 46
131RC_512_0_1  = 36
132RC_512_0_2  = 19
133RC_512_0_3  = 37
134
135RC_512_1_0  = 33
136RC_512_1_1  = 27
137RC_512_1_2  = 14
138RC_512_1_3  = 42
139
140RC_512_2_0  = 17
141RC_512_2_1  = 49
142RC_512_2_2  = 36
143RC_512_2_3  = 39
144
145RC_512_3_0  = 44
146RC_512_3_1  =  9
147RC_512_3_2  = 54
148RC_512_3_3  = 56
149
150RC_512_4_0  = 39
151RC_512_4_1  = 30
152RC_512_4_2  = 34
153RC_512_4_3  = 24
154
155RC_512_5_0  = 13
156RC_512_5_1  = 50
157RC_512_5_2  = 10
158RC_512_5_3  = 17
159
160RC_512_6_0  = 25
161RC_512_6_1  = 29
162RC_512_6_2  = 39
163RC_512_6_3  = 43
164
165RC_512_7_0  =  8
166RC_512_7_1  = 35
167RC_512_7_2  = 56
168RC_512_7_3  = 22
169
170RC_1024_0_0 = 24
171RC_1024_0_1 = 13
172RC_1024_0_2 =  8
173RC_1024_0_3 = 47
174RC_1024_0_4 =  8
175RC_1024_0_5 = 17
176RC_1024_0_6 = 22
177RC_1024_0_7 = 37
178
179RC_1024_1_0 = 38
180RC_1024_1_1 = 19
181RC_1024_1_2 = 10
182RC_1024_1_3 = 55
183RC_1024_1_4 = 49
184RC_1024_1_5 = 18
185RC_1024_1_6 = 23
186RC_1024_1_7 = 52
187
188RC_1024_2_0 = 33
189RC_1024_2_1 =  4
190RC_1024_2_2 = 51
191RC_1024_2_3 = 13
192RC_1024_2_4 = 34
193RC_1024_2_5 = 41
194RC_1024_2_6 = 59
195RC_1024_2_7 = 17
196
197RC_1024_3_0 =  5
198RC_1024_3_1 = 20
199RC_1024_3_2 = 48
200RC_1024_3_3 = 41
201RC_1024_3_4 = 47
202RC_1024_3_5 = 28
203RC_1024_3_6 = 16
204RC_1024_3_7 = 25
205
206RC_1024_4_0 = 41
207RC_1024_4_1 =  9
208RC_1024_4_2 = 37
209RC_1024_4_3 = 31
210RC_1024_4_4 = 12
211RC_1024_4_5 = 47
212RC_1024_4_6 = 44
213RC_1024_4_7 = 30
214
215RC_1024_5_0 = 16
216RC_1024_5_1 = 34
217RC_1024_5_2 = 56
218RC_1024_5_3 = 51
219RC_1024_5_4 =  4
220RC_1024_5_5 = 53
221RC_1024_5_6 = 42
222RC_1024_5_7 = 41
223
224RC_1024_6_0 = 31
225RC_1024_6_1 = 44
226RC_1024_6_2 = 47
227RC_1024_6_3 = 46
228RC_1024_6_4 = 19
229RC_1024_6_5 = 42
230RC_1024_6_6 = 44
231RC_1024_6_7 = 25
232
233RC_1024_7_0 =  9
234RC_1024_7_1 = 48
235RC_1024_7_2 = 35
236RC_1024_7_3 = 52
237RC_1024_7_4 = 23
238RC_1024_7_5 = 31
239RC_1024_7_6 = 37
240RC_1024_7_7 = 20
241#
242#  Input:  reg
243# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
#
#  MACRO RotL64: rotate a 64-bit register left by a table constant.
#    reg        register name without the leading percent sign
#    BLK_SIZE   256, 512 or 1024 (selects the table)
#    ROUND_NUM  round number, already reduced mod 8 by the callers
#    MIX_NUM    index of the MIX within the round
#  No instruction is emitted when the selected constant is zero.
244#
245.macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
246  .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM  #is there anything to do?
247    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
248  .endif
249.endm
250#
251#----------------------------------------------------------------
252#
253# MACROS: define local vars and configure stack
254#
255#----------------------------------------------------------------
256# declare allocated space on the stack
#   StackVar name,size: sets symbol <name> to the current value of
#   _STK_OFFS_ and then advances _STK_OFFS_ by <size> bytes.  The caller
#   must have initialised _STK_OFFS_ beforehand (Setup_Stack does so).
257.macro StackVar localName,localSize
258\localName  =   _STK_OFFS_
259_STK_OFFS_  =   _STK_OFFS_+(\localSize)
260.endm #StackVar
261#
262#----------------------------------------------------------------
263#
264# MACRO: Configure stack frame, allocate local vars
#
#   Setup_Stack BLK_BITS,KS_CNT,debugCnt
#     BLK_BITS  block size in bits (WCNT = BLK_BITS/64 words)
#     KS_CNT    number of 16-byte "rotation" slots reserved after ksKey;
#               only allocated when this block size is not fully unrolled
#     debugCnt  optional; with _SKEIN_DEBUG, number of 8-byte words
#               reserved as xDebug_<BLK_BITS>
#
#   Emitted code: pushes rbp,rbx,r12..r15, subtracts LOCAL_SIZE from rsp,
#   points rbp at rsp+FRAME_OFFS and stores rdi/rsi/rdx/rcx into
#   ctxPtr/blkPtr/blkCnt/bitAdd.
#   Symbols (re)defined: WCNT, _PushCnt_, X_stk, ksTwk, ksKey, ksRot, Wcopy,
#   align16, tmpStk_<BLK_BITS>, ctxPtr, blkPtr, blkCnt, bitAdd, LOCAL_SIZE,
#   savRegs, retAddr, FRAME_OFFS, F_O and the three __STK_* listing symbols.
#   All StackVar offsets are relative to rsp; add F_O to use them with rbp.
265#
266.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
267    WCNT    =    (\BLK_BITS)/64
268#
269_PushCnt_   =   0                   #save nonvolatile regs on stack
270  .irp _reg_,rbp,rbx,r12,r13,r14,r15
271       pushq    %\_reg_
272_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
273  .endr
274#
275_STK_OFFS_  =   0                   #starting offset from rsp
276    #---- local  variables         #<-- rsp
277    StackVar    X_stk  ,8*(WCNT)    #local context vars
278    StackVar    ksTwk  ,8*3         #key schedule: tweak words
279    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
280  .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
281    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
282  .endif
283    StackVar    Wcopy  ,8*(WCNT)    #copy of input block
284  .if _SKEIN_DEBUG
285  .if \debugCnt + 0                 #temp location for debug X[] info
286    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
287  .endif
288  .endif
    # NOTE(review): both terms below are multiples of 8, so this test is
    # always true as written; the slot (and the tmpStk symbol) is therefore
    # always allocated.  Confirm before changing the modulus.
289  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
290    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
291tmpStk_\BLK_BITS = align16          #use this
292  .endif
293    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
294    StackVar    ctxPtr ,8           #context ptr
295    StackVar    blkPtr ,8           #pointer to block data
296    StackVar    blkCnt ,8           #number of full blocks to process
297    StackVar    bitAdd ,8           #bit count to add to tweak
298LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
299    #----
300    StackVar    savRegs,8*_PushCnt_ #saved registers
301    StackVar    retAddr,8           #return address
302    #---- caller's stack frame (aligned mod 16)
303#
304# set up the stack frame pointer (rbp)
305#
306FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
307  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
308FRAME_OFFS  =      _STK_OFFS_
309  .endif
310F_O         =   -FRAME_OFFS
311#
312  #put some useful defines in the .lst file (for grep)
313__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
314__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
315__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
316#
317# Notes on stack frame setup:
318#   * the most frequently used variable is X_stk[], based at [rsp+0]
319#   * the next most used is the key schedule arrays, ksKey and ksTwk
320#       so rbp is "centered" there, allowing short offsets to the key
321#       schedule even in 1024-bit Skein case
322#   * the Wcopy variables are infrequently accessed, but they have long
323#       offsets from both rsp and rbp only in the 1024-bit case.
324#   * all other local vars and calling parameters can be accessed
325#       with short offsets, except in the 1024-bit case
326#
327    subq    $LOCAL_SIZE,%rsp        #make room for the locals
328    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
329    movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
330    movq    %rsi, blkPtr+F_O(%rbp)
331    movq    %rdx, blkCnt+F_O(%rbp)
332    movq    %rcx, bitAdd+F_O(%rbp)
333#
334.endm #Setup_Stack
335#
336#----------------------------------------------------------------
337#
#
# MACRO Reset_Stack: undo Setup_Stack.  Releases LOCAL_SIZE bytes and pops
# the six saved registers in the reverse of the push order.  _PushCnt_ is
# counted back down and an assembly error is raised if it does not reach
# zero, i.e. if the push and pop lists have drifted apart.
#
338.macro Reset_Stack
339    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe?)
340  .irp _reg_,r15,r14,r13,r12,rbx,rbp
341    popq    %\_reg_                 #restore caller's regs
342_PushCnt_ = _PushCnt_ - 1
343  .endr
344  .if _PushCnt_
345    .error  "Mismatched push/pops?"
346  .endif
347.endm # Reset_Stack
348#
349#----------------------------------------------------------------
350# macros to help debug internals
351#
352.if _SKEIN_DEBUG
353    .extern  Skein_Show_Block     #calls to C routines
354    .extern  Skein_Show_Round
355#
#   Round "numbers" at or above SKEIN_RND_SPECIAL are not real rounds; the
#   Skein_Debug_Round macro passes them through unchanged as markers.
356SKEIN_RND_SPECIAL       =   1000
357SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
358SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
359SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
360#
360#
#
# MACRO Skein_Debug_Block BLK_BITS  (debug builds only)
#   Calls the external C routine Skein_Show_Block with the block size, the
#   context pointer, ctx X[], the block pointer, Wcopy, ksKey and ksTwk.
#   The first six arguments go in rdi,rsi,rdx,rcx,r8,r9 and the tweak
#   pointer is pushed as the seventh.  Nine volatile registers are saved
#   around the call and restored afterwards.
#   Expects the frame built by Setup_Stack (rbp and F_O valid).
#   The two .error checks are assembly-time only: one guards the push count
#   parity (alignment at the call), the other the push/pop pairing.
#
361.macro Skein_Debug_Block BLK_BITS
362#
363#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
364#                     const u08b_t *blkPtr, const u64b_t *wPtr,
365#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
366#
367_NN_ = 0
368  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
369    pushq   %\_reg_                 #save all volatile regs on stack before the call
370_NN_ = _NN_ + 1
371  .endr
372    # get and push call parameters
373    movq    $\BLK_BITS      ,%rdi   #bits
374    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
375    leaq    X_VARS    (%rsi),%rdx   #X (pointer)
376    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
377    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
378    leaq    ksKey +F_O(%rbp),%r9    #key pointer
379    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
380    pushq   %rax                    #   (pass on the stack)
381    call    Skein_Show_Block        #call external debug handler
382    addq    $8*1,%rsp               #discard parameters on stack
383  .if (_NN_ % 2 ) == 0              #check stack alignment
384    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
385  .endif
386  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
387    popq    %\_reg_                 #restore regs
388_NN_ = _NN_ - 1
389  .endr
390  .if _NN_
391    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
392  .endif
393.endm # Skein_Debug_Block
394#
395# the macro to "call" to debug a round
#
#   Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp  (debug builds only)
#     BLK_BITS  block size; selects the local Skein_Debug_Round_<bits> stub
#     R         round number, or one of the SKEIN_RND_* marker values
#     RDI_OFFS  optional constant added to the computed round number in
#               the looping (not fully unrolled) case
#     afterOp   optional text expanded after the call
#   rdx is saved, loaded with the round number and restored around the
#   call.  In the looping case the number is derived from the injection
#   counter: rdi for 256/512, the rIdx stack slot for 1024.
396#
397.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
398    # call the appropriate (local) debug "function"
399    pushq   %rdx                    #save rdx, so we can use it for round "number"
400  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
401    movq    $\R,%rdx
402  .else                             #compute round number using rdi
403_rOffs_ = \RDI_OFFS + 0
404   .if \BLK_BITS == 1024
405    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
406    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
407   .else
408    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
409   .endif
410  .endif
411    call    Skein_Debug_Round_\BLK_BITS
412    popq    %rdx                    #restore original rdx value
413#
414    afterOp
415.endm  #  Skein_Debug_Round
416.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
#   Non-debug stand-ins: same names and parameter lists as the debug
#   macros, but they expand to nothing, so call sites need no conditionals.
#   NOTE(review): the debug Skein_Debug_Round expands afterOp while this
#   stub drops it; no visible call site passes afterOp.
417.macro Skein_Debug_Block BLK_BITS
418.endm
419#
420.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
421.endm
422#
423.endif # _SKEIN_DEBUG
424#
425#----------------------------------------------------------------
426#
#
# MACRO addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
#   dstReg += source register (+ immOffs).  Register names are given
#   without the percent sign.  The source name is srcReg_A and srcReg_B
#   pasted together, so a caller can pass "r" plus a computed number.
#     immOffs  nonzero: emit leaq immOffs(src,dst),dst
#     useAddOp nonzero: emit addq
#     neither:  emit leaq (src,dst),dst, or addq when the assembler symbol
#               ASM_NO_LEA is defined
#   The leaq forms leave the flags untouched; the addq forms modify them.
#
427.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
428  .if \immOffs + 0
429       leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
430  .elseif ((\useAddOp + 0) == 0)
431    .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
432       leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
433    .else
434       addq    %\srcReg_A\srcReg_B,%\dstReg
435    .endif
436  .else
437       addq    %\srcReg_A\srcReg_B,%\dstReg
438  .endif
439.endm
440
441# keep Intel-style ordering here, to match addReg
#   xorReg dstReg,srcReg_A,srcReg_B: dstReg ^= source register, where the
#   source name is srcReg_A and srcReg_B pasted together (as in addReg).
442.macro  xorReg dstReg,srcReg_A,srcReg_B
443        xorq   %\srcReg_A\srcReg_B,%\dstReg
444.endm
445#
446#----------------------------------------------------------------
447#
#
# MACRO C_label lName: place an exported entry point at the current
# location under two names, lName and _lName, and mark both .global.
#
448.macro C_label lName
449 \lName:        #use both "genders" to work across linkage conventions
450_\lName:
451    .global  \lName
452    .global _\lName
453.endm
454#
455#=================================== Skein_256 =============================================
456#
457.if _USE_ASM_ & 256
458#
459# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
460#
461#################
462#
463# code
464#
#
# Skein_256_Process_Block: compress blkCnt blocks of WCNT*8 = 32 bytes.
#   In:   rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd
#         (saved into the frame by Setup_Stack)
#   Per block: T[0] += bitcntAdd and is stored back to the context; the key
#   schedule is built from ctx X[0..3], KW_PARITY and the tweak; ROUNDS_256
#   rounds run with a key injection after every fourth round; finally
#   X[i] ^ w[i] is stored into ctx X[0..3].
#   T[1] is read once into r14, reloaded from the context with FIRST_MASK64
#   applied at the end of each block, and written back after the last one.
#   The block count is decremented after a block has been processed, so
#   the routine always processes at least one block.
#   Callee-saved registers are preserved via Setup_Stack/Reset_Stack.
#
465C_label Skein_256_Process_Block
466    Setup_Stack 256,((ROUNDS_256/8)+1)
467    movq    TWEAK+8(%rdi),%r14
468    jmp     Skein_256_block_loop
469    .p2align 4
470    # main hash loop for Skein_256
471Skein_256_block_loop:
472    #
473    # general register usage:
474    #   RAX..RDX        = X0..X3
475    #   R08..R12        = ks[0..4]
476    #   R13..R15        = ts[0..2]
477    #   RSP, RBP        = stack/frame pointers
478    #   RDI             = round counter or context pointer
479    #   RSI             = temp
480    #
481    movq    TWEAK+0(%rdi)     ,%r13
482    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
483    movq    %r14              ,%r15
484    xorq    %r13              ,%r15  #now %r13.%r15 is set as the tweak
485
486    movq    $KW_PARITY        ,%r12
487    movq       X_VARS+ 0(%rdi),%r8
488    movq       X_VARS+ 8(%rdi),%r9
489    movq       X_VARS+16(%rdi),%r10
490    movq       X_VARS+24(%rdi),%r11
491    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
492    xorq    %r8               ,%r12  #start accumulating overall parity
493
494    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
495    xorq    %r9               ,%r12
496    movq     0(%rsi)          ,%rax  #get X[0..3]
497    xorq    %r10              ,%r12
498    movq     8(%rsi)          ,%rbx
499    xorq    %r11              ,%r12
500    movq    16(%rsi)          ,%rcx
501    movq    24(%rsi)          ,%rdx
502
503    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
504    movq    %rbx,Wcopy+ 8+F_O(%rbp)
505    movq    %rcx,Wcopy+16+F_O(%rbp)
506    movq    %rdx,Wcopy+24+F_O(%rbp)
507
508    addq    %r8 ,%rax                #initial key injection
509    addq    %r9 ,%rbx
510    addq    %r10,%rcx
511    addq    %r11,%rdx
512    addq    %r13,%rbx
513    addq    %r14,%rcx
514
515.if _SKEIN_DEBUG
516    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
517    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
518    movq    %r9 ,ksKey+ 8+F_O(%rbp)
519    movq    %r10,ksKey+16+F_O(%rbp)
520    movq    %r11,ksKey+24+F_O(%rbp)
521    movq    %r12,ksKey+32+F_O(%rbp)
522
523    movq    %r13,ksTwk+ 0+F_O(%rbp)
524    movq    %r14,ksTwk+ 8+F_O(%rbp)
525    movq    %r15,ksTwk+16+F_O(%rbp)
526
527    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
528    movq    %rbx,X_stk + 8(%rsp)
529    movq    %rcx,X_stk +16(%rsp)
530    movq    %rdx,X_stk +24(%rsp)
531
532    Skein_Debug_Block 256            #debug dump
533    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
534.endif
535#
    # Looping build only: ks[0] and ts[0] are stored one period further up
    # (slots 5 and 3) because the round loop starts reading at index 1 and
    # keeps appending "rotated" copies behind the entries it has used.
536.if ((SKEIN_ASM_UNROLL & 256) == 0)
537    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
538    movq    %r9 ,ksKey+ 8+F_O(%rbp)
539    movq    %r10,ksKey+16+F_O(%rbp)
540    movq    %r11,ksKey+24+F_O(%rbp)
541    movq    %r12,ksKey+32+F_O(%rbp)
542
543    movq    %r13,ksTwk+24+F_O(%rbp)
544    movq    %r14,ksTwk+ 8+F_O(%rbp)
545    movq    %r15,ksTwk+16+F_O(%rbp)
546.endif
547    addq    $WCNT*8,%rsi             #skip the block
548    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
549    #
550    # now the key schedule is computed. Start the rounds
551    #
    # Each pass of the .rept body below is four rounds plus one key
    # injection.  Fully unrolled: rdi and rsi are scratch for precomputed
    # key+tweak sums.  Looping: rdi is the injection counter and r8..r11,
    # r13 are reloaded from the stack copies of the schedule.
552.if SKEIN_ASM_UNROLL & 256
553_UNROLL_CNT =   ROUNDS_256/8
554.else
555_UNROLL_CNT =   SKEIN_UNROLL_256
556  .if ((ROUNDS_256/8) % _UNROLL_CNT)
557    .error "Invalid SKEIN_UNROLL_256"
558  .endif
559    xorq    %rdi,%rdi                #rdi = iteration count
560Skein_256_round_loop:
561.endif
562_Rbase_ = 0
563.rept _UNROLL_CNT*2
564    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
565    # round 4*_RBase_ + 0
566    addReg  rax, rbx
567    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
568    addReg  rcx, rdx
569                .if (SKEIN_ASM_UNROLL & 256) == 0
570                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
571                .endif
572    xorReg  rbx, rax
573    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
574    xorReg  rdx, rcx
575  .if SKEIN_ASM_UNROLL & 256
576    .irp _r0_,%( 8+(_Rbase_+3) % 5)
577    .irp _r1_,%(13+(_Rbase_+2) % 3)
578      leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
579    .endr
580    .endr
581  .endif
582                .if (SKEIN_ASM_UNROLL & 256) == 0
583                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
584                .endif
585    Skein_Debug_Round 256,%(4*_Rbase_+1)
586
587    # round 4*_Rbase_ + 1
588    addReg  rax, rdx
589    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
590    xorReg  rdx, rax
591                .if (SKEIN_ASM_UNROLL & 256) == 0
592                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
593                .endif
594    addReg  rcx, rbx
595    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
596    xorReg  rbx, rcx
597                .if (SKEIN_ASM_UNROLL & 256) == 0
598                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
599                .endif
600    Skein_Debug_Round 256,%(4*_Rbase_+2)
601 .if SKEIN_ASM_UNROLL & 256
602    .irp _r0_,%( 8+(_Rbase_+2) % 5)
603    .irp _r1_,%(13+(_Rbase_+1) % 3)
604      leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
605    .endr
606    .endr
607 .endif
608    # round 4*_Rbase_ + 2
609    addReg  rax, rbx
610    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
611    addReg  rcx, rdx
612                .if (SKEIN_ASM_UNROLL & 256) == 0
613                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
614                .endif
615    xorReg  rbx, rax
616    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
617    xorReg  rdx, rcx
618                .if (SKEIN_ASM_UNROLL & 256) == 0
619                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
620                    leaq 1(%r11,%rdi),%r11               #precompute key + tweak
621                .endif
622    Skein_Debug_Round 256,%(4*_Rbase_+3)
623    # round 4*_Rbase_ + 3
624    addReg  rax, rdx
625    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
626    addReg  rcx, rbx
627                .if (SKEIN_ASM_UNROLL & 256) == 0
628                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
629                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
630                .endif
631    xorReg  rdx, rax
632    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
633    xorReg  rbx, rcx
634    Skein_Debug_Round 256,%(4*_Rbase_+4)
635                .if (SKEIN_ASM_UNROLL & 256) == 0
636                    addReg r9 ,r13           #precompute key+tweak
637                .endif
638      #inject key schedule words
639_Rbase_ = _Rbase_+1
640  .if SKEIN_ASM_UNROLL & 256
641    addReg    rax,r,%(8+((_Rbase_+0) % 5))
642    addReg    rbx,rsi
643    addReg    rcx,rdi
644    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
645  .else
646    incq      %rdi
647    addReg    rax,r8
648    addReg    rcx,r10
649    addReg    rbx,r9
650    addReg    rdx,r11
651  .endif
652    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
653.endr #rept _UNROLL_CNT*2
654#
655.if (SKEIN_ASM_UNROLL & 256) == 0
656    cmpq    $2*(ROUNDS_256/8),%rdi
657    jb      Skein_256_round_loop
658.endif # (SKEIN_ASM_UNROLL & 256) == 0
659    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
660
661    #----------------------------
662    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
663    movq    $FIRST_MASK64 ,%r14
664    xorq    Wcopy + 0+F_O (%rbp),%rax
665    xorq    Wcopy + 8+F_O (%rbp),%rbx
666    xorq    Wcopy +16+F_O (%rbp),%rcx
667    xorq    Wcopy +24+F_O (%rbp),%rdx
668    andq    TWEAK + 8     (%rdi),%r14        #r14 = T[1] with bit 62 cleared
669    movq    %rax,X_VARS+ 0(%rdi)             #store final result
670    movq    %rbx,X_VARS+ 8(%rdi)
671    movq    %rcx,X_VARS+16(%rdi)
672    movq    %rdx,X_VARS+24(%rdi)
673
674    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
675
676    # go back for more blocks, if needed
677    decq    blkCnt+F_O(%rbp)
678    jnz     Skein_256_block_loop
679    movq    %r14,TWEAK + 8(%rdi)             #write the masked T[1] back once, after the last block
680    Reset_Stack
681    ret
682Skein_256_Process_Block_End:
683
#
# Skein_Debug_Round_256 (debug builds only): local stub reached by "call"
# from the Skein_Debug_Round macro.  It copies X0..X3 into X_stk, loads
# rdi = 256 and rsi = ctx, and jumps to Skein_Debug_Round_Common, which is
# defined outside the part of the file shown here.  X3 lives in rdx, which
# the macro has already overwritten with the round number, so its value is
# fetched from the stack slot where the macro pushed it.
#
684  .if _SKEIN_DEBUG
685Skein_Debug_Round_256:               #here with rdx == round "number" from macro
686    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
687    pushq   %rdi
688    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
689    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
690    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(use FP_ since rsp has changed!)
691    movq    %rcx,X_stk+16+F_O(%rbp)
692    movq    %rdi,X_stk+24+F_O(%rbp)
693
694    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
695    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
696    jmp     Skein_Debug_Round_Common
697  .endif
698#
#
# Optional helpers (only with _SKEIN_CODE_SIZE), both returning in rax:
#   Skein_256_Process_Block_CodeSize  byte distance from the entry label
#                                     to Skein_256_Process_Block_End
#   Skein_256_Unroll_Cnt              _UNROLL_CNT, or 0 when the round
#                                     loop is fully unrolled
#
699.if _SKEIN_CODE_SIZE
700C_label  Skein_256_Process_Block_CodeSize
701    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
702    ret
703#
704C_label Skein_256_Unroll_Cnt
705  .if _UNROLL_CNT <> ROUNDS_256/8
706    movq    $_UNROLL_CNT,%rax
707  .else
708    xorq    %rax,%rax
709  .endif
710    ret
711.endif
712#
713.endif #_USE_ASM_ & 256
714#
715#=================================== Skein_512 =============================================
716#
717.if _USE_ASM_ & 512
718#
719# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
720#
721# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
722#
723#################
724# MACRO: one round for 512-bit blocks
725#
#
#   R_512_OneRound rn0..rn7,_Rn_,op1,op2,op3,op4
#     rn0..rn7  register numbers (8..15); the four MIX pairs are
#               (rn0,rn1) (rn2,rn3) (rn4,rn5) (rn6,rn7)
#     _Rn_      round number; taken mod 8 to select the rotation constants
#     op1..op4  optional instructions, one emitted after each MIX, used by
#               the caller to interleave key schedule loads and stores
#   Each MIX is: first += second; second <<<= RC; second ^= first.
#
726.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
727#
728    addReg      r\rn0, r\rn1
729    RotL64      r\rn1, 512,%((\_Rn_) % 8),0
730    xorReg      r\rn1, r\rn0
731            \op1
732    addReg      r\rn2, r\rn3
733    RotL64      r\rn3, 512,%((\_Rn_) % 8),1
734    xorReg      r\rn3, r\rn2
735            \op2
736    addReg      r\rn4, r\rn5
737    RotL64      r\rn5, 512,%((\_Rn_) % 8),2
738    xorReg      r\rn5, r\rn4
739            \op3
740    addReg      r\rn6, r\rn7
741    RotL64      r\rn7, 512,%((\_Rn_) % 8),3
742    xorReg      r\rn7, r\rn6
743            \op4
744    Skein_Debug_Round 512,%(\_Rn_+1),-4
745#
746.endm #R_512_OneRound
747#
748#################
749# MACRO: four rounds for 512-bit blocks, followed by one key injection
#
#   R_512_FourRounds _RR_
#     _RR_  base round number; the call site passes multiples of 4
#   Fully unrolled: the injection number _II_ is an assembly-time constant
#   and the key/tweak words are read at fixed offsets, with rax,rbx,rcx,
#   rdx,rsi as scratch.
#   Looping: rdi is the injection counter (incremented first) and indexes
#   ksKey/ksTwk; one key word and one tweak word are copied further up the
#   stack on each pass to "rotate" the schedule.
#   Injection: X0..X4 get key words, X5 and X6 get key+tweak, X7 gets
#   key + injection number.
750#
751.macro R_512_FourRounds _RR_    #RR = base round number (0 % 8)
752  .if (SKEIN_ASM_UNROLL && 512)
753    # here for fully unrolled case.
754    _II_ = ((\_RR_)/4) + 1       #key injection counter
755    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
756    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
757    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
758    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
759    # inject the key schedule
760    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
761    addReg   r11, rax
762    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
763    addReg   r12, rbx
764    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
765    addReg   r13, rcx
766    addReg   r14, rdx
767    addReg   r15, rsi,,,(_II_)
768  .else
769    # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
770    incq    %rdi                 #bump key injection counter
771    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
772    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
773    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
774    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
775    # inject the key schedule
776    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
777    addReg   r11, rax
778    addReg   r12, rbx
779    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
780    addReg   r13, rcx
781    addReg   r14, rdx
782    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
783    addReg   r15, rsi
784    addReg   r15, rdi              #inject the round number
785  .endif
786
787    #show the result of the key injection
788    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
789.endm #R_512_FourRounds
790#
791#################
792# instantiated code
793#
#
# Skein_512_Process_Block: compress blkCnt blocks of 8*WCNT = 64 bytes.
#   In:   rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd
#         (saved into the frame by Setup_Stack)
#   Per block: T[0] += bitcntAdd and is stored back to the context; the
#   tweak and key schedules (including the parity word) are written to
#   ksTwk/ksKey on the stack; the input words are copied to Wcopy and added
#   to X0..X7 together with the tweak for X5/X6; ROUNDS_512 rounds run via
#   R_512_FourRounds; finally X[i] ^ w[i] is stored into ctx X[0..7].
#   T[1] is held in rbx at the top of each block.  rbx is scratch during
#   the rounds, so T[1] is reloaded from the context with FIRST_MASK64
#   applied during feedforward and written back after the last block.
#   The block count is decremented after a block has been processed, so
#   the routine always processes at least one block.
#
794C_label Skein_512_Process_Block
795    Setup_Stack 512,ROUNDS_512/8
796    movq    TWEAK+ 8(%rdi),%rbx
797    jmp     Skein_512_block_loop
798    .p2align 4
799    # main hash loop for Skein_512
800Skein_512_block_loop:
801    # general register usage:
802    #   RAX..RDX       = temps for key schedule pre-loads
803    #   R8 ..R15       = X0..X7
804    #   RSP, RBP       = stack/frame pointers
805    #   RDI            = round counter or context pointer
806    #   RSI            = temp
807    #
808    movq    TWEAK +  0(%rdi),%rax
809    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
810    movq    %rbx,%rcx
811    xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
812    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
813    movq    %rax,ksTwk+ 0+F_O(%rbp)
814    movq    $KW_PARITY,%rdx
815    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
816    movq    %rbx,ksTwk+ 8+F_O(%rbp)
817    movq    %rcx,ksTwk+16+F_O(%rbp)
818    .irp _Rn_,8,9,10,11,12,13,14,15
819      movq  X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
820      xorq  %r\_Rn_,%rdx              #compute overall parity
821      movq  %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
822    .endr                             #load state into %r8 ..%r15, compute parity
823      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
824
825    addReg   r13,rax                  #precompute key injection for tweak
826    addReg   r14, rbx
827.if _SKEIN_DEBUG
828    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
829.endif
830    movq     0(%rsi),%rax             #load input block
831    movq     8(%rsi),%rbx
832    movq    16(%rsi),%rcx
833    movq    24(%rsi),%rdx
834    addReg   r8 , rax                 #do initial key injection
835    addReg   r9 , rbx
836    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
837    movq    %rbx,Wcopy+ 8+F_O(%rbp)
838    addReg   r10, rcx
839    addReg   r11, rdx
840    movq    %rcx,Wcopy+16+F_O(%rbp)
841    movq    %rdx,Wcopy+24+F_O(%rbp)
842
843    movq    32(%rsi),%rax
844    movq    40(%rsi),%rbx
845    movq    48(%rsi),%rcx
846    movq    56(%rsi),%rdx
847    addReg   r12, rax
848    addReg   r13, rbx
849    addReg   r14, rcx
850    addReg   r15, rdx
851    movq    %rax,Wcopy+32+F_O(%rbp)
852    movq    %rbx,Wcopy+40+F_O(%rbp)
853    movq    %rcx,Wcopy+48+F_O(%rbp)
854    movq    %rdx,Wcopy+56+F_O(%rbp)
855
856.if _SKEIN_DEBUG
857    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
858      movq  %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
859    .endr
860
861    Skein_Debug_Block 512             #debug dump
862    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
863.endif
864    addq    $8*WCNT,%rsi              #skip the block
865    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
866    #
867    #################
868    # now the key schedule is computed. Start the rounds
869    #
870.if SKEIN_ASM_UNROLL & 512
871_UNROLL_CNT =   ROUNDS_512/8
872.else
873_UNROLL_CNT =   SKEIN_UNROLL_512
874  .if ((ROUNDS_512/8) % _UNROLL_CNT)
875    .error "Invalid SKEIN_UNROLL_512"
876  .endif
877    xorq    %rdi,%rdi                 #rdi = round counter
878Skein_512_round_loop:
879.endif
880#
881_Rbase_ = 0
882.rept _UNROLL_CNT*2
883      R_512_FourRounds %(4*_Rbase_+00)
884_Rbase_ = _Rbase_+1
885.endr #rept _UNROLL_CNT*2
886#
    # rdi is reloaded only in the looping build; the fully unrolled round
    # code never writes rdi, so it still holds the context pointer there.
887.if (SKEIN_ASM_UNROLL & 512) == 0
888    cmpq    $2*(ROUNDS_512/8),%rdi
889    jb      Skein_512_round_loop
890    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
891.endif
892    # end of rounds
893    #################
894    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
895    .irp _Rn_,8,9,10,11,12,13,14,15
896  .if (\_Rn_ == 8)
897    movq    $FIRST_MASK64,%rbx
898  .endif
899      xorq  Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
900      movq  %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)     #and store result
901  .if (\_Rn_ == 14)
902    andq    TWEAK+ 8(%rdi),%rbx
903  .endif
904    .endr
905    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
906
907    # go back for more blocks, if needed
908    decq    blkCnt+F_O(%rbp)
909    jnz     Skein_512_block_loop
910    movq    %rbx,TWEAK + 8(%rdi)
911
912    Reset_Stack
913    ret
914Skein_512_Process_Block_End:
915#
#
# Skein_Debug_Round_512 (debug builds only): local stub reached by "call"
# from the Skein_Debug_Round macro.  It copies r8..r15 (X0..X7) into X_stk,
# loads rdi = 512 and rsi = ctx, and jumps to Skein_Debug_Round_Common,
# which is defined outside the part of the file shown here.
#
916  .if _SKEIN_DEBUG
917# call here with rdx  = "round number"
918Skein_Debug_Round_512:
919    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
920    pushq   %rdi
921  .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
922    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
923  .endr
924    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
925    movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
926    jmp     Skein_Debug_Round_Common
927  .endif
928#
#
# Optional helpers (only with _SKEIN_CODE_SIZE), both returning in rax:
#   Skein_512_Process_Block_CodeSize  byte distance from the entry label
#                                     to Skein_512_Process_Block_End
#   Skein_512_Unroll_Cnt              _UNROLL_CNT, or 0 when the round
#                                     loop is fully unrolled
#
929.if _SKEIN_CODE_SIZE
930C_label Skein_512_Process_Block_CodeSize
931    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
932    ret
933#
934C_label Skein_512_Unroll_Cnt
935  .if _UNROLL_CNT <> (ROUNDS_512/8)
936    movq    $_UNROLL_CNT,%rax
937  .else
938    xorq    %rax,%rax
939  .endif
940    ret
941.endif
942#
943.endif # _USE_ASM_ & 512
944#
945#=================================== Skein1024 =============================================
946.if _USE_ASM_ & 1024
947#
948# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
949#
950#################
951# use details of permutation to make register assignments
#
#   o1K_<reg> is the index n of the state word X[n] that <reg> holds while
#   the rounds run.  The value is used to form offsets into X_stk[], ksKey[],
#   Wcopy[] and the X[] array of the context.
#   There is no entry for index 6: X6 lives in X_stk[6] and takes turns
#   with X4 in rcx.
952#
953o1K_rdi =  0        #offsets in X[] associated with each register
954o1K_rsi =  1
955o1K_rbp =  2
956o1K_rax =  3
957o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
958o1K_rbx =  5
959o1K_rdx =  7
960o1K_r8  =  8
961o1K_r9  =  9
962o1K_r10 = 10
963o1K_r11 = 11
964o1K_r12 = 12
965o1K_r13 = 13
966o1K_r14 = 14
967o1K_r15 = 15
968#
969rIdx_offs = tmpStk_1024          #stack slot for the key injection index (used only when the rounds are looped, not fully unrolled)
970#
#################
# MACRO: one MIX step on a pair of state words, followed (on the last round
#        of each group of four) by the key injection for that pair
#
#   w0, w1     = X[] indices of the two words being mixed
#   reg0, reg1 = names (without the percent sign) of the registers holding them
#   _RN0_      = round number: its value mod 8 is handed to RotL64, its two
#                low bits decide whether a key injection follows, and
#                (round/4)+1 becomes the injection count _II_
#   _Rn1_      = handed to RotL64 as its last argument (presumably the
#                position of this MIX within the round, selecting the
#                rotation constant -- TODO confirm against RotL64)
#   op1        = optional extra instruction, emitted after everything else
#
# addReg, xorReg and RotL64 are helper macros that are not defined in this
# part of the file.
#
# Key injection comes in two flavors:
#   fully unrolled: all offsets are assembly-time constants, namely
#       ksKey[(_II_+w) % 17] for both words, ksTwk[(_II_+0) % 3] for word 13,
#       ksTwk[(_II_+1) % 3] for word 14 and the constant _II_ for word 15.
#   looping: the index is read from rIdx_offs(%rsp) into rdi and the key
#       schedule is addressed with rdi as a scaled index.  Because rdi
#       normally holds X0, the pair that starts with word 0 first parks X0 in
#       X_stk[]; X0 gets its own key word at the end of r1024_FourRounds.
#
# NOTE(review): the bit tests inside this macro body are written with a
# doubled ampersand, while the same kind of test outside macro bodies in this
# file uses a single one.  Presumably this is tied to how the .altmacro mode
# selected at the top of the file treats that character inside macro bodies
# -- confirm with the target assembler before touching it.
#
971.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
972    addReg      \reg0 , \reg1                      #perform the MIX
973    RotL64      \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
974    xorReg      \reg1 , \reg0
975.if ((\_RN0_) && 3) == 3        #time to do key injection?
976 .if _SKEIN_DEBUG
977    movq       %\reg0 , xDebug_1024+8*\w0(%rsp)    #save intermediate values for Debug_Round
978    movq       %\reg1 , xDebug_1024+8*\w1(%rsp)    # (before inline key injection)
979 .endif
980_II_ = ((\_RN0_)/4)+1           #injection count
981 .if SKEIN_ASM_UNROLL && 1024   #here to do fully unrolled key injection
982    addq        ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
983    addq        ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
984  .if     \w1 == 13                                #tweak injection
985    addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
986  .elseif \w0 == 14
987    addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
988  .elseif \w1 == 15
989    addq        $_II_, %\reg1                      #(injection counter)
990  .endif
991 .else                          #here to do looping  key injection
992  .if  (\w0 == 0)
993    movq        %rdi, X_stk+8*\w0(%rsp)            #park X0 in X_stk[] so rdi can serve as the index register
994    movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
995  .else
996    addq         ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection
997  .endif
998  .if     \w1 == 13                                #tweak injection
999    addq         ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
1000  .elseif \w0 == 14
1001    addq         ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
1002  .elseif \w1 == 15
1003    addReg      \reg1,rdi,,,1                      #(injection counter)
1004  .endif
1005    addq         ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection
1006 .endif
1007.endif
1008    # emit the extra instruction supplied by the caller, if any
1009    \op1
1010.endm
1011#################
1012# MACRO: four rounds for 1024-bit blocks
#
#   Each round is eight r1024_Mix steps.  The word/register pairing of every
#   step encodes the word permutation, so no data is moved between rounds.
#   The last argument of each step (0,1,2,4,5,6,3,7) is handed on to RotL64.
#
#   rcx holds X4 and X6 in turn.  In every round the third step spills the
#   word currently in rcx to X_stk[] and the fourth step reloads the other
#   one; the step that uses the reloaded rcx is issued second to last.
#
#   The fourth round (round number 3 mod 4) makes r1024_Mix perform the key
#   injection.  When the rounds are looped rather than fully unrolled, the
#   tail of this macro then extends the key schedule on the stack by one
#   entry, bumps the index kept in rIdx_offs and gives X0 its key word.
1013#
1014.macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
1015    # on entry: rcx holds X4, X6 is in X_stk[]
1016_Rn_ = (\_RR_) + 0              #first round of the group
1017        r1024_Mix  0, 1,rdi,rsi,_Rn_,0
1018        r1024_Mix  2, 3,rbp,rax,_Rn_,1
1019        r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
1020        r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
1021        r1024_Mix 10,11,r10,r11,_Rn_,5
1022        r1024_Mix 12,13,r12,r13,_Rn_,6
1023        r1024_Mix  6, 7,rcx,rdx,_Rn_,3
1024        r1024_Mix 14,15,r14,r15,_Rn_,7
1025    .if _SKEIN_DEBUG
1026      Skein_Debug_Round 1024,%(_Rn_+1)
1027    .endif
1028_Rn_ = (\_RR_) + 1              #second round of the group
1029        r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
1030        r1024_Mix  2,13,rbp,r13,_Rn_,1
1031        r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
1032        r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
1033        r1024_Mix 12, 3,r12,rax,_Rn_,5
1034        r1024_Mix 14, 5,r14,rbx,_Rn_,6
1035        r1024_Mix  4,15,rcx,r15,_Rn_,3
1036        r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
1037    .if _SKEIN_DEBUG
1038      Skein_Debug_Round 1024,%(_Rn_+1)
1039    .endif
1040_Rn_ = (\_RR_) + 2              #third round of the group
1041        r1024_Mix  0, 7,rdi,rdx,_Rn_,0
1042        r1024_Mix  2, 5,rbp,rbx,_Rn_,1
1043        r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
1044        r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
1045        r1024_Mix 14,13,r14,r13,_Rn_,5
1046        r1024_Mix  8,11,r8 ,r11,_Rn_,6
1047        r1024_Mix  6, 1,rcx,rsi,_Rn_,3
1048        r1024_Mix 10, 9,r10,r9 ,_Rn_,7
1049    .if _SKEIN_DEBUG
1050      Skein_Debug_Round 1024,%(_Rn_+1)
1051    .endif
1052_Rn_ = (\_RR_) + 3              #fourth round of the group: every step also injects the key
1053        r1024_Mix  0,15,rdi,r15,_Rn_,0
1054        r1024_Mix  2,11,rbp,r11,_Rn_,1
1055        r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
1056        r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
1057        r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
1058        r1024_Mix 10, 3,r10,rax,_Rn_,6
1059        r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
1060        r1024_Mix 12, 7,r12,rdx,_Rn_,7
1061    .if _SKEIN_DEBUG
1062      Skein_Debug_Round 1024,%(_Rn_+1)
1063    .endif
1064
1065  .if (SKEIN_ASM_UNROLL && 1024) == 0           #here with rdi == rIdx, X0 on stack
1066    #"rotate" the key schedule on the stack: copy entry [idx] to the slot one full period later
1067i8 = o1K_r8
1068i0 = o1K_rdi
1069    movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
1070    movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
1071    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
1072    movq          ksTwk+8* 0(%rsp,%rdi,8),%r8   #get tweak word
1073    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
1074    movq          X_stk+8*i8(%rsp)       ,%r8   #get the reg back
1075    incq    %rdi                                #bump the index
1076    movq    %rdi, rIdx_offs (%rsp)              #save rdi again
1077    movq          ksKey+8*i0(%rsp,%rdi,8),%rdi  #get the key schedule word for X0 back
1078    addq          X_stk+8*i0(%rsp)       ,%rdi  #perform the X0 key injection (rdi holds X0 again)
1079  .endif
1080    #show the result of the key injection
1081    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
1082.endm #r1024_FourRounds
1083#
1084################
1085# code
1086#
#----------------------------------------------------------------
# Skein1024_Process_Block: run the 1024-bit block function over a sequence
# of message blocks and update the context in place.
#
# Per block the code below
#   - adds bitAdd to T[0], stores it back, and builds the three-word tweak
#     schedule (T0, T1, T0 xor T1) in ksTwk[]
#   - copies the chaining words X[] of the context into ksKey[], folding them
#     into a parity word (seeded with KW_PARITY) stored at ksKey[WCNT]
#   - copies the message words into Wcopy[] and adds them to the state
#     (initial key injection), adding T0 to word 13 and T1 to word 14
#   - runs the rounds via r1024_FourRounds
#   - stores state xor message copy back into X[] of the context (feedforward)
# After the last block T[1] and FIRST_MASK64 is written back to the context
# (presumably this clears the first-block flag -- TODO confirm).
#
# The message pointer advances by 8*WCNT bytes per block.
#
# NOTE(review): frame slots are addressed both as name+F_O(%rbp) and as
# name(%rsp); presumably both forms reach the same slot while rbp holds the
# frame pointer -- confirm against Setup_Stack.  Once the rounds start, rbp
# holds X2 and only the rsp form is usable.
#----------------------------------------------------------------
1087C_label Skein1024_Process_Block
1088#
1089    Setup_Stack 1024,ROUNDS_1024/8,WCNT
1090    movq    TWEAK+ 8(%rdi),%r9             #r9 = ctx->h.T[1], carried in r9 from block to block
1091    jmp     Skein1024_block_loop           #jump over the alignment padding
1092    # main hash loop for Skein1024
1093    .p2align 4
1094Skein1024_block_loop:
1095    # register usage while the rounds run (see the o1K_ table above):
1096    #   RSP              = stack pointer
1097    #   RDI,RSI,RBP,RAX  = X0..X3     (state words)
    #   RCX              = X4 or X6   (the other one is kept in X_stk[])
    #   RBX,RDX          = X5, X7     (state words)
1098    #   R8 ..R15         = X8..X15    (state words)
1099    # here at the loop top: RDI --> context, RBP = frame pointer, R9 = T[1]
1100    #
1101  .if (SKEIN_ASM_UNROLL & 1024) == 0
1102    xorq    %rax,%rax                      #init loop index on the stack
1103    movq    %rax,rIdx_offs(%rsp)
1104  .endif
1105    movq         TWEAK+     0(%rdi),%r8    #r8 = ctx->h.T[0]
1106    addq         bitAdd+  F_O(%rbp),%r8    #computed updated tweak value T0
1107    movq    %r9 ,%r10
1108    xorq    %r8 ,%r10                      #%r8/%r9/%r10 = tweak schedule (T0, T1, T0 xor T1)
1109    movq    %r8 ,TWEAK+     0(%rdi)        #save updated tweak value ctx->h.T[0]
1110    movq    %r8 ,ksTwk+ 0+F_O(%rbp)
1111    movq    %r9 ,ksTwk+ 8+F_O(%rbp)        #keep values in %r8 ,%r9  for initial tweak injection below
1112    movq    %r10,ksTwk+16+F_O(%rbp)
1113  .if _SKEIN_DEBUG
1114    movq    %r9 ,TWEAK+     8(%rdi)        #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
1115  .endif
1116    movq         blkPtr +F_O(%rbp),%rsi    # rsi --> input block
1117    movq        $KW_PARITY        ,%rax    #overall key schedule parity
1118
1119    # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
    # (those registers are still busy as pointers/parity here, and rcx is the
    #  message temp of the next loop, so words 0..4 and 6 go through X_stk[])
1120    .irp _rN_,0,1,2,3,4,6                  #process the "initial" words, using r14/r15 as temps
1121      movq       X_VARS+8*\_rN_(%rdi),%r14 #get state word
1122      movq              8*\_rN_(%rsi),%r15 #get msg   word
1123      xorq  %r14,%rax                      #update key schedule overall parity
1124      movq  %r14,ksKey +8*\_rN_+F_O(%rbp)  #save key schedule word on stack
1125      movq  %r15,Wcopy +8*\_rN_+F_O(%rbp)  #save local msg Wcopy
1126      addq  %r15,%r14                      #do the initial key injection
1127      movq  %r14,X_stk +8*\_rN_    (%rsp)  #save initial state var on stack
1128    .endr
1129    # now process the rest, using the "real" registers
1130    #     (MUST do it in reverse order to inject tweaks r8/r9 first)
1131    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
1132_oo_ = o1K_\_rr_                           #offset associated with the register
1133      movq  X_VARS+8*_oo_(%rdi),%\_rr_     #get key schedule word from context
1134      movq         8*_oo_(%rsi),%rcx       #get next input msg word
1135      movq  %\_rr_, ksKey +8*_oo_(%rsp)    #save key schedule on stack
1136      xorq  %\_rr_, %rax                   #accumulate key schedule parity
1137      movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)    #save copy of msg word for feedforward
1138      addq  %rcx,%\_rr_                    #do the initial  key  injection
1139      .if    _oo_ == 13                    #do the initial tweak injection
1140        addReg \_rr_,r8                    #          (only in words 13/14)
1141      .elseif _oo_ == 14
1142        addReg \_rr_,r9
1143      .endif
1144    .endr
1145    movq    %rax,ksKey+8*WCNT+F_O(%rbp)    #save key schedule parity
1146.if _SKEIN_DEBUG
1147    Skein_Debug_Block 1024                 #initial debug dump
1148.endif
1149    addq     $8*WCNT,%rsi                  #bump the msg ptr
1150    movq     %rsi,blkPtr+F_O(%rbp)         #save bumped msg ptr
1151    # re-load words 0..4 from stack, enter the main loop
    # (from here on rdi, rsi and rbp hold state words, not pointers)
1152    .irp _rr_,rdi,rsi,rbp,rax,rcx          #(no need to re-load x6, already on stack)
1153      movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
1154    .endr
1155.if _SKEIN_DEBUG
1156    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        #show state after initial key injection
1157.endif
1158    #
1159    #################
1160    # now the key schedule is computed. Start the rounds
1161    #
1162.if SKEIN_ASM_UNROLL & 1024
1163_UNROLL_CNT =   ROUNDS_1024/8              #fully unrolled: emit every round, no loop label
1164.else
1165_UNROLL_CNT =   SKEIN_UNROLL_1024          #looping: this many 8-round groups per pass
1166  .if ((ROUNDS_1024/8) % _UNROLL_CNT)
1167    .error "Invalid SKEIN_UNROLL_1024"
1168  .endif
1169Skein1024_round_loop:
1170.endif
1171#
1172_Rbase_ = 0
1173.rept _UNROLL_CNT*2                        #implement the rounds, 4 at a time
1174      r1024_FourRounds %(4*_Rbase_+00)
1175_Rbase_ = _Rbase_+1
1176.endr #rept _UNROLL_CNT*2
1177#
1178.if (SKEIN_ASM_UNROLL & 1024) == 0
1179    cmpq    $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #injection index (rIdx_offs) reached the number of 4-round groups?
1180    jb      Skein1024_round_loop
1181.endif
1182    # end of rounds
1183    #################
1184    #
1185    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
1186    movq    %rdx,X_stk+8*o1K_rdx(%rsp) #spill X7 to free up rdx; x6 is already on stack
1187    movq       ctxPtr(%rsp),%rdx       #rdx --> context
1188
1189    .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15   #do all but x6,x7
1190_oo_ = o1K_\_rr_
1191      xorq  Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
1192      movq  %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
1193      .if (_oo_ ==  9)
1194        movq   $FIRST_MASK64 ,%r9      #X9 has been stored, so r9 is free: start building the new T[1]
1195      .endif
1196      .if (_oo_ == 14)
1197        andq   TWEAK+ 8(%rdx),%r9      #r9 = ctx->h.T[1] masked with FIRST_MASK64
1198      .endif
1199    .endr
1200    #
1201    movq         X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
1202    movq         X_stk +8*7(%rsp),%rbx
1203    xorq         Wcopy +8*6(%rsp),%rax
1204    xorq         Wcopy +8*7(%rsp),%rbx
1205    movq    %rax,X_VARS+8*6(%rdx)
1206    decq             blkCnt(%rsp)      #set zero flag iff done
1207    movq    %rbx,X_VARS+8*7(%rdx)
1208
    # the extra instruction handed to the debug macro redoes the flag test on blkCnt
1209    Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
1210    # go back for more blocks, if needed
1211    movq             ctxPtr(%rsp),%rdi #rdi --> context again (movq and lea leave the flags alone)
1212    lea          FRAME_OFFS(%rsp),%rbp #rbp = frame pointer again
1213    jnz     Skein1024_block_loop
1214    movq    %r9 ,TWEAK+   8(%rdx)      #last block done: write the masked T[1] back to the context
1215    Reset_Stack
1216    ret
1217#
1218Skein1024_Process_Block_End:
1219#
1220.if _SKEIN_DEBUG
# debug hook for the 1024-bit rounds: copies the live state registers into
# X_stk[] (choosing the X4 or X6 slot for rcx from the round number), then
# sets up rsi/rdi and continues in Skein_Debug_Round_Common.
# _SP_OFFS_ tracks how many bytes sit on the stack above the frame, so that
# frame slots can still be addressed relative to rsp.
1221Skein_Debug_Round_1024:
1222    # call here with rdx  = "round number",
1223_SP_OFFS_ = 8*2                     #stack "offset" here: rdx, return addr
1224    #
1225  #save rest of X[] state on stack so debug routines can access it
1226  .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
1227    movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
1228  .endr
1229    # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it is already on stack
1230    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always save
1231    jae     save_x0
1232    testq   $3,%rdx                 #otherwise only if rdx != 0 mod 4
1233    jz      save_x0_not
1234save_x0:
1235    movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
1236save_x0_not:
1237    #figure out the x4/x6 swapping state and save the correct one!
1238    cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
1239    jae     save_x4
1240    testq   $1,%rdx                  #even round numbers have x4 in rcx as well
1241    jz      save_x4
1242    movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp)  #odd round number: rcx holds x6
1243    jmp     debug_1024_go
1244save_x4:
1245    movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
1246debug_1024_go:
1247    #now all is saved in Xstk[] except for rdx
1248    push    %rsi                    #save two regs for BLK_BITS-specific parms
1249    push    %rdi
1250_SP_OFFS_ = _SP_OFFS_ + 16          #adjust stack offset accordingly (now 32)
1251
1252    movq    _SP_OFFS_-8(%rsp),%rsi  #get back original %rdx (pushed on stack in macro call)
1253    movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
1254
1255    movq    ctxPtr+_SP_OFFS_(%rsp),%rsi  #rsi = ctx_hdr_ptr
1256    movq    $1024,%rdi                   #rdi = block size
1257    jmp     Skein_Debug_Round_Common
1258.endif
1259#
1260.if _SKEIN_CODE_SIZE
# query entry point: rax = size in bytes of the 1024-bit block function
# (distance from its entry label to its End label)
1261C_label Skein1024_Process_Block_CodeSize
1262    movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
1263    ret
1264# query entry point: rax = 0 when the rounds are fully unrolled, otherwise the unroll count
1265C_label Skein1024_Unroll_Cnt
1266  .if _UNROLL_CNT == (ROUNDS_1024/8)
1267    xorq    %rax,%rax
1268  .else
1269    movq    $_UNROLL_CNT,%rax
1270  .endif
1271    ret
1272.endif
1273#
1274.endif # _USE_ASM_ and 1024
1275#
1276.if _SKEIN_DEBUG
1277#----------------------------------------------------------------
1278#local debug routine to set up for calls to:
1279#  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
1280#                       [       rdi                        rsi   rdx              rcx]
1281#
1282# here with %rdx = round number
1283#           %rsi = ctx_hdr_ptr
1284#           %rdi = block size (256/512/1024)
1285# on stack: saved rdi, saved rsi, retAddr, saved rdx
1286#
# The routine pushes twelve more registers, picks the X[] pointer for rcx,
# calls Skein_Show_Round, pops the twelve registers in reverse order, then
# pops the saved rdi and rsi and returns through retAddr.  The saved rdx is
# left on the stack for the caller.  _SP_OFFS_ is assembly-time bookkeeping
# of the bytes on the stack above the frame; the two .error checks verify
# the 16-byte alignment at the call and that pushes and pops balance.
#
1287Skein_Debug_Round_Common:
1288_SP_OFFS_ = 32                        #account for four words on stack already
1289  .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15  #save the rest of the regs
1290    pushq %\_rr_
1291_SP_OFFS_ = _SP_OFFS_+8
1292  .endr
1293  .if (_SP_OFFS_ % 16)                # make sure stack is still 16-byte aligned here
1294    .error  "Debug_Round_Common: stack alignment"
1295  .endif
1296    # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
1297    leaq    X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
1298    cmpq    $SKEIN_RND_FEED_FWD,%rdx   #special handling for feedforward "round"?
1299    jnz     _got_rcxA
1300    leaq    X_VARS(%rsi),%rcx          #feedforward: show the X[] array of the context instead
1301_got_rcxA:
1302  .if _USE_ASM_ & 1024
1303    # special handling for 1024-bit case
1304    #    (for the round right before a key injection, that is a round number
1305    #        that is a nonzero multiple of 4: use xDebug_1024[] instead of X_stk[])
1306    cmpq    $SKEIN_RND_SPECIAL,%rdx
1307    jae     _got_rcxB               #special "round": keep the pointer chosen above
1308    orq     %rdx,%rdx               #sets the zero flag when the round number is 0
1309    jz      _got_rcxB               #round 0: keep X_stk[]
1310    test    $3,%rdx                 #round number a multiple of 4?
1311    jne     _got_rcxB               #no: keep X_stk[]
1312    cmp     $1024,%rdi              #only 1024-bit(s) for now
1313    jne     _got_rcxB
1314    leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx  #values saved by r1024_Mix before the inline key injection
1315_got_rcxB:
1316  .endif
1317    call    Skein_Show_Round        #call external debug handler
1318
1319  .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax  #restore regs
1320    popq  %\_rr_
1321_SP_OFFS_ = _SP_OFFS_-8
1322  .endr
1323  .if _SP_OFFS_ - 32
1324    .error   "Debug_Round_Common: push/pop misalignment!"
1325  .endif
1326    popq    %rdi                    #undo the two pushes made by the size-specific entry code
1327    popq    %rsi
1328    ret
1329.endif
1330#----------------------------------------------------------------
1331    .section .note.GNU-stack,"",@progbits
1332
1333    .end
1334