#
#----------------------------------------------------------------
# 64-bit x86 assembler code (gnu as) for Skein block functions
#
# Author: Doug Whiting, Hifn/Exar
#
# This code is released to the public domain.
#----------------------------------------------------------------
#
# Build notes:
#   * AT&T operand order throughout. .altmacro is enabled, so the
#     %expr and <...> macro-argument forms are used below.
#   * The file has to pass through the C preprocessor first:
#     __clang__, SKEIN_USE_ASM and SKEIN_LOOP are tested with cpp
#     conditionals.
#   * SKEIN_ROUNDS, SKEIN_CODE_SIZE, SKEIN_PERF and SKEIN_DEBUG are
#     tested with the assembler's own .ifdef/.ifndef, so they must be
#     assembler symbols.
#   * Because cpp sees this file, a comment line must never begin
#     with a preprocessor keyword; cpp would act on it.
#
        .text
        .altmacro
#ifndef __clang__
        .psize  0,128                   #list file has no page boundaries
#endif
#
_MASK_ALL_  = (256+512+1024)            #all three algorithm bits
_MAX_FRAME_ = 240
#
#################
# _USE_ASM_ is a bit mask (256, 512, 1024) selecting which block
# functions are assembled from this file.
#ifndef SKEIN_USE_ASM
_USE_ASM_         = _MASK_ALL_
#else
_USE_ASM_         = SKEIN_USE_ASM
#endif
#################
#configure loop unrolling
# SKEIN_LOOP is read as three decimal digits: hundreds --> Skein_256,
# tens --> Skein_512, ones --> Skein1024.
#ifndef SKEIN_LOOP
_SKEIN_LOOP       =   2                 #default is fully unrolled for 256/512, twice for 1024
#else
_SKEIN_LOOP       = SKEIN_LOOP
  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
#.print  "+++ SKEIN_LOOP = \_NN_"
  .endr
#endif
# the unroll counts (0 --> fully unrolled)
SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
#
# SKEIN_ASM_UNROLL accumulates the block sizes whose unroll count is 0,
# so "(SKEIN_ASM_UNROLL) & size" tests for the fully unrolled variant.
SKEIN_ASM_UNROLL  = 0
  .irp _NN_,256,512,1024
    .if (SKEIN_UNROLL_\_NN_) == 0
SKEIN_ASM_UNROLL  = (SKEIN_ASM_UNROLL) + \_NN_
    .endif
  .endr
#################
#
# Round counts. When SKEIN_ROUNDS is defined, each of its decimal
# digits d (hundreds/tens/ones --> 256/512/1024) selects
# 8*(((d+5) % 10)+5) rounds: digit 0 --> 80, digit 9 --> 72, digit 5 --> 40.
.ifndef SKEIN_ROUNDS
ROUNDS_256  =   72
ROUNDS_512  =   72
ROUNDS_1024 =   80
.else
ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
# only display rounds if default size is changed on command line
# NOTE(review): both .print branches below emit the same text as seen
# here; presumably one of them once carried extra padding so that the
# three report lines align -- confirm before merging the branches.
.irp _NN_,256,512,1024
  .if _USE_ASM_ & \_NN_
    .irp _RR_,%(ROUNDS_\_NN_)
      .if \_NN_ < 1024
.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .else
.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
      .endif
    .endr
  .endif
.endr
.endif
#################
#
# _SKEIN_CODE_SIZE = 1 assembles the small *_CodeSize / *_Unroll_Cnt
# helper functions that follow each block function.
.ifdef SKEIN_CODE_SIZE
_SKEIN_CODE_SIZE = (1)
.else
.ifdef  SKEIN_PERF                      #use code size if SKEIN_PERF is defined
_SKEIN_CODE_SIZE = (1)
.else
_SKEIN_CODE_SIZE = (0)
.endif
.endif
#
#################
#
.ifndef SKEIN_DEBUG
_SKEIN_DEBUG      = 0
.else
_SKEIN_DEBUG      = 1
.endif
#################
#
# Byte offsets of the fields in the hash context structure
# (each offset is derived from the one before it)
#
HASH_BITS   =   0                       #bits of hash output
BCNT        =   8 + HASH_BITS           #number of bytes in BUFFER[]
TWEAK       =   8 + BCNT                #tweak values[0..1]
X_VARS      =  16 + TWEAK               #chaining vars
#
#(Note: buffer[] in context structure is NOT needed here :-)
#
KW_PARITY   =   0x1BD11BDAA9FC1A22      #overall parity of key schedule words
FIRST_MASK  =   ~ (1 <<  6)
FIRST_MASK64=   ~ (1 << 62)
#
# rotation constants for Skein
# naming: RC_<block bits>_<round number mod 8>_<mix number within round>
#
RC_256_0_0  = 14
RC_256_0_1  = 16

RC_256_1_0  = 52
RC_256_1_1  = 57

RC_256_2_0  = 23
RC_256_2_1  = 40

RC_256_3_0  =  5
RC_256_3_1  = 37

RC_256_4_0  = 25
RC_256_4_1  = 33

RC_256_5_0  = 46
RC_256_5_1  = 12

RC_256_6_0  = 58
RC_256_6_1  = 22

RC_256_7_0  = 32
RC_256_7_1  = 32

RC_512_0_0  = 46
RC_512_0_1  = 36
RC_512_0_2  = 19
RC_512_0_3  = 37

RC_512_1_0  = 33
RC_512_1_1  = 27
RC_512_1_2  = 14
RC_512_1_3  = 42

RC_512_2_0  = 17
RC_512_2_1  = 49
RC_512_2_2  = 36
RC_512_2_3  = 39

RC_512_3_0  = 44
RC_512_3_1  =  9
RC_512_3_2  = 54
RC_512_3_3  = 56

RC_512_4_0  = 39
RC_512_4_1  = 30
RC_512_4_2  = 34
RC_512_4_3  = 24

RC_512_5_0  = 13
RC_512_5_1  = 50
RC_512_5_2  = 10
RC_512_5_3  = 17

RC_512_6_0  = 25
RC_512_6_1  = 29
RC_512_6_2  = 39
RC_512_6_3  = 43

RC_512_7_0  =  8
RC_512_7_1  = 35
RC_512_7_2  = 56
RC_512_7_3  = 22

RC_1024_0_0 = 24
RC_1024_0_1 = 13
RC_1024_0_2 =  8
RC_1024_0_3 = 47
RC_1024_0_4 =  8
RC_1024_0_5 = 17
RC_1024_0_6 = 22
RC_1024_0_7 = 37

RC_1024_1_0 = 38
RC_1024_1_1 = 19
RC_1024_1_2 = 10
RC_1024_1_3 = 55
RC_1024_1_4 = 49
RC_1024_1_5 = 18
RC_1024_1_6 = 23
RC_1024_1_7 = 52

RC_1024_2_0 = 33
RC_1024_2_1 =  4
RC_1024_2_2 = 51
RC_1024_2_3 = 13
RC_1024_2_4 = 34
RC_1024_2_5 = 41
RC_1024_2_6 = 59
RC_1024_2_7 = 17

RC_1024_3_0 =  5
RC_1024_3_1 = 20
RC_1024_3_2 = 48
RC_1024_3_3 = 41
RC_1024_3_4 = 47
RC_1024_3_5 = 28
RC_1024_3_6 = 16
RC_1024_3_7 = 25

RC_1024_4_0 = 41
RC_1024_4_1 =  9
RC_1024_4_2 = 37
RC_1024_4_3 = 31
RC_1024_4_4 = 12
RC_1024_4_5 = 47
RC_1024_4_6 = 44
RC_1024_4_7 = 30

RC_1024_5_0 = 16
RC_1024_5_1 = 34
RC_1024_5_2 = 56
RC_1024_5_3 = 51
RC_1024_5_4 =  4
RC_1024_5_5 = 53
RC_1024_5_6 = 42
RC_1024_5_7 = 41

RC_1024_6_0 = 31
RC_1024_6_1 = 44
RC_1024_6_2 = 47
RC_1024_6_3 = 46
RC_1024_6_4 = 19
RC_1024_6_5 = 42
RC_1024_6_6 = 44
RC_1024_6_7 = 25

RC_1024_7_0 =  9
RC_1024_7_1 = 48
RC_1024_7_2 = 35
RC_1024_7_3 = 52
RC_1024_7_4 = 23
RC_1024_7_5 = 31
RC_1024_7_6 = 37
RC_1024_7_7 = 20
#
# MACRO: rotate a 64-bit register left by a named rotation constant
#  Input:  reg
#  Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
#  Emits nothing when the selected constant is zero.
#
.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
  .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM        #is there anything to do?
    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
  .endif
.endm
#
#----------------------------------------------------------------
#
# MACROS: define local vars and configure stack
#
#----------------------------------------------------------------
# declare allocated space on the stack:
#   localName is set to the current running offset _STK_OFFS_,
#   which is then advanced by localSize bytes
.macro StackVar localName,localSize
\localName  =   _STK_OFFS_
_STK_OFFS_  =   _STK_OFFS_+(\localSize)
.endm #StackVar
#
#----------------------------------------------------------------
#
# MACRO: Configure stack frame, allocate local vars
#   BLK_BITS = block size in bits (sets WCNT = BLK_BITS/64)
#   KS_CNT   = sizes the ksRot area (only allocated for looping code)
#   debugCnt = optional, number of 8-byte xDebug_<BLK_BITS> slots
#              (only allocated when _SKEIN_DEBUG is set)
#   Pushes rbp,rbx,r12..r15, reserves LOCAL_SIZE bytes, points rbp at
#   rsp+FRAME_OFFS and stores rdi/rsi/rdx/rcx in ctxPtr/blkPtr/blkCnt/bitAdd.
#   Locals are addressed either as name(%rsp) or as name+F_O(%rbp).
#
.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
    WCNT    =    (\BLK_BITS)/64
#
_PushCnt_   =   0                   #save nonvolatile regs on stack
  .irp _reg_,rbp,rbx,r12,r13,r14,r15
       pushq    %\_reg_
_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
  .endr
#
_STK_OFFS_  =   0                   #starting offset from rsp
    #---- local  variables         #<-- rsp
    StackVar    X_stk  ,8*(WCNT)    #local context vars
    StackVar    ksTwk  ,8*3         #key schedule: tweak words
    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
  .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0
    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
  .endif
    StackVar    Wcopy  ,8*(WCNT)    #copy of input block
  .if _SKEIN_DEBUG
  .if \debugCnt + 0                 #temp location for debug X[] info
    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
  .endif
  .endif
    # NOTE(review): every term in the test below is a multiple of 8, so
    # the condition looks always true and align16/tmpStk_<BLK_BITS> is
    # always allocated (rIdx_offs depends on tmpStk_1024) -- confirm.
  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
tmpStk_\BLK_BITS = align16          #use this
  .endif
    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
    StackVar    ctxPtr ,8           #context ptr
    StackVar    blkPtr ,8           #pointer to block data
    StackVar    blkCnt ,8           #number of full blocks to process
    StackVar    bitAdd ,8           #bit count to add to tweak
LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
    #----
    StackVar    savRegs,8*_PushCnt_ #saved registers
    StackVar    retAddr,8           #return address
    #---- caller's stack frame (aligned mod 16)
#
# set up the stack frame pointer (rbp)
#
FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
FRAME_OFFS  =      _STK_OFFS_
  .endif
F_O         =   -FRAME_OFFS
#
  #put some useful defines in the .lst file (for grep)
__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
#
# Notes on stack frame setup:
#   * the most frequently used variable is X_stk[], based at [rsp+0]
#   * the next most used is the key schedule arrays, ksKey and ksTwk
#       so rbp is "centered" there, allowing short offsets to the key
#       schedule even in 1024-bit Skein case
#   * the Wcopy variables are infrequently accessed, but they have long
#       offsets from both rsp and rbp only in the 1024-bit case.
#   * all other local vars and calling parameters can be accessed
#       with short offsets, except in the 1024-bit case
#
    subq    $LOCAL_SIZE,%rsp        #make room for the locals
    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
    movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
    movq    %rsi, blkPtr+F_O(%rbp)
    movq    %rdx, blkCnt+F_O(%rbp)
    movq    %rcx, bitAdd+F_O(%rbp)
#
.endm #Setup_Stack
#
#----------------------------------------------------------------
#
# MACRO: undo Setup_Stack -- release the locals and pop the saved
#        registers in reverse push order. Fails the assembly when the
#        pop count does not match _PushCnt_ from Setup_Stack.
#
.macro Reset_Stack
    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe?)
  .irp _reg_,r15,r14,r13,r12,rbx,rbp
    popq    %\_reg_                 #restore caller's regs
_PushCnt_ = _PushCnt_ - 1
  .endr
  .if _PushCnt_
    .error  "Mismatched push/pops?"
  .endif
.endm # Reset_Stack
#
#----------------------------------------------------------------
# macros to help debug internals
#
.if _SKEIN_DEBUG
    .extern  Skein_Show_Block       #calls to C routines
    .extern  Skein_Show_Round
#
SKEIN_RND_SPECIAL       =   1000
SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
#
# MACRO: dump the block state through the external C routine.
#   Saves and restores rax,rcx,rdx,rsi,rdi,r8..r11 around the call.
#
.macro Skein_Debug_Block BLK_BITS
#
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
#                     const u08b_t *blkPtr, const u64b_t *wPtr,
#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
#
_NN_ = 0
  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
    pushq   %\_reg_                 #save all volatile regs on the stack before the call
_NN_ = _NN_ + 1
  .endr
    # get and push call parameters
    movq    $\BLK_BITS      ,%rdi   #bits
    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
    leaq    X_VARS   (%rsi),%rdx    #X (pointer)
    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
    leaq    ksKey +F_O(%rbp),%r9    #key pointer
    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
    pushq   %rax                    #   (pass on the stack)
    call    Skein_Show_Block        #call external debug handler
    addq    $8*1,%rsp               #discard parameters on stack
    # saved regs plus the one pushed parameter must total an even number
    # of 8-byte words, hence the saved-register count has to be odd
  .if (_NN_ % 2 ) == 0              #check stack alignment
  .error  "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
  .endif
  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
    popq    %\_reg_                 #restore regs
_NN_ = _NN_ - 1
  .endr
  .if _NN_
  .error  "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
  .endif
.endm # Skein_Debug_Block
#
# the macro to "call" to debug a round
#   R        = round number, or one of the SKEIN_RND_* special values
#   RDI_OFFS = optional constant added to the computed round number
#   afterOp  = optional text emitted after the call
#   rdx is preserved; the round "number" is handed over in rdx.
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
    # call the appropriate (local) debug "function"
    pushq   %rdx                    #save rdx, so we can use it for round "number"
  .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
    movq    $\R,%rdx
  .else                             #compute round number using rdi
_rOffs_ = \RDI_OFFS + 0
   .if \BLK_BITS == 1024
    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
   .else
    leaq    1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx
   .endif
  .endif
    call    Skein_Debug_Round_\BLK_BITS
    popq    %rdx                    #restore original rdx value
#
    # NOTE(review): the bare parameter name below presumably relies on
    # .altmacro substituting arguments without a backslash -- confirm.
    afterOp
.endm  #  Skein_Debug_Round
.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
.macro Skein_Debug_Block BLK_BITS
.endm
#
.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
.endm
#
.endif # _SKEIN_DEBUG
#
#----------------------------------------------------------------
#
# MACRO: dstReg += source register (+ immOffs)
#   The source register name is the concatenation srcReg_A srcReg_B, so a
#   computed register number can be passed, e.g. "addReg rax,r,%(8+n)".
#   immOffs nonzero          --> leaq immOffs(src,dst),dst
#   else useAddOp zero/empty --> leaq (src,dst),dst, unless ASM_NO_LEA
#                                is defined, then addq
#   else                     --> addq
#   The leaq forms leave the flags untouched; the addq forms do not.
#
.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
  .if \immOffs + 0
    leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
  .elseif ((\useAddOp + 0) == 0)
    .ifndef ASM_NO_LEA              #lea seems to be faster on Core 2 Duo CPUs!
    leaq    (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
    .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
    .endif
  .else
    addq    %\srcReg_A\srcReg_B,%\dstReg
  .endif
.endm

# keep Intel-style ordering here, to match addReg
#   dstReg ^= source register (name = srcReg_A srcReg_B concatenated)
.macro  xorReg dstReg,srcReg_A,srcReg_B
    xorq    %\srcReg_A\srcReg_B,%\dstReg
.endm
#
#----------------------------------------------------------------
#
# MACRO: define a global entry point under both lName and _lName
#
.macro C_label lName
 \lName:                            #use both "genders" to work across linkage conventions
_\lName:
    .global  \lName
    .global _\lName
.endm
#
#=================================== Skein_256 =============================================
#
.if _USE_ASM_ & 256
#
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
#   In:  rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd
#        (saved by Setup_Stack; rbp,rbx,r12..r15 are saved and restored)
#   The block loop decrements blkCnt after processing a block, so the
#   code as written expects blkCnt >= 1.
#
#################
#
# code
#
C_label Skein_256_Process_Block
    Setup_Stack 256,((ROUNDS_256/8)+1)
    movq    TWEAK+8(%rdi),%r14      #T[1] stays in r14 across blocks
    jmp     Skein_256_block_loop
    .p2align 4
    # main hash loop for Skein_256
Skein_256_block_loop:
    #
    # general register usage:
    #   RAX..RDX        = X0..X3
    #   R08..R12        = ks[0..4]
    #   R13..R15        = ts[0..2]
    #   RSP, RBP        = stack/frame pointers
    #   RDI             = round counter or context pointer
    #   RSI             = temp
    #
    movq    TWEAK+0(%rdi)     ,%r13
    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
    movq    %r14              ,%r15
    xorq    %r13              ,%r15  #now %r13..%r15 is set as the tweak

    movq    $KW_PARITY        ,%r12
    movq       X_VARS+ 0(%rdi),%r8
    movq       X_VARS+ 8(%rdi),%r9
    movq       X_VARS+16(%rdi),%r10
    movq       X_VARS+24(%rdi),%r11
    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
    xorq    %r8               ,%r12  #start accumulating overall parity

    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
    xorq    %r9               ,%r12
    movq     0(%rsi)          ,%rax  #get X[0..3]
    xorq    %r10              ,%r12
    movq     8(%rsi)          ,%rbx
    xorq    %r11              ,%r12
    movq    16(%rsi)          ,%rcx
    movq    24(%rsi)          ,%rdx

    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    addq    %r8 ,%rax                #initial key injection
    addq    %r9 ,%rbx
    addq    %r10,%rcx
    addq    %r11,%rdx
    addq    %r13,%rbx
    addq    %r14,%rcx

.if _SKEIN_DEBUG
    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+ 0+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)

    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
    movq    %rbx,X_stk + 8(%rsp)
    movq    %rcx,X_stk +16(%rsp)
    movq    %rdx,X_stk +24(%rsp)

    Skein_Debug_Block 256            #debug dump
    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
.endif
#
.if (((SKEIN_ASM_UNROLL) & 256) == 0)
    # ks[0] is written to slot 5 and ts[0] to slot 3 (not slot 0): the
    # round loop below only loads slots 1+rdi and up, and extends the
    # schedule by copying each loaded word 5 (key) / 3 (tweak) slots up.
    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
    movq    %r9 ,ksKey+ 8+F_O(%rbp)
    movq    %r10,ksKey+16+F_O(%rbp)
    movq    %r11,ksKey+24+F_O(%rbp)
    movq    %r12,ksKey+32+F_O(%rbp)

    movq    %r13,ksTwk+24+F_O(%rbp)
    movq    %r14,ksTwk+ 8+F_O(%rbp)
    movq    %r15,ksTwk+16+F_O(%rbp)
.endif
    addq    $WCNT*8,%rsi             #skip the block
    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
    #
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 256
_UNROLL_CNT =   ROUNDS_256/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_256
  .if ((ROUNDS_256/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_256"
  .endif
    xorq    %rdi,%rdi                #rdi = iteration count
Skein_256_round_loop:
.endif
_Rbase_ = 0
.rept _UNROLL_CNT*2
    # each repetition = four rounds followed by one key injection
    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
    # round 4*_Rbase_ + 0
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
    addReg  rcx, rdx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
    xorReg  rdx, rcx
  .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+3) % 5)
    .irp _r1_,%(13+(_Rbase_+2) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rdi  #precompute key injection value for %rcx
    .endr
    .endr
  .endif
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+1)

    # round 4*_Rbase_ + 1
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
    xorReg  rdx, rax
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
                .endif
    addReg  rcx, rbx
    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
    xorReg  rbx, rcx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+2)
 .if (SKEIN_ASM_UNROLL) & 256
    .irp _r0_,%( 8+(_Rbase_+2) % 5)
    .irp _r1_,%(13+(_Rbase_+1) % 3)
      leaq   (%r\_r0_,%r\_r1_),%rsi  #precompute key injection value for %rbx
    .endr
    .endr
 .endif
    # round 4*_Rbase_ + 2
    addReg  rax, rbx
    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
    addReg  rcx, rdx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
                .endif
    xorReg  rbx, rax
    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
    xorReg  rdx, rcx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
                    leaq 1(%r11,%rdi),%r11               #precompute key + tweak
                .endif
    Skein_Debug_Round 256,%(4*_Rbase_+3)
    # round 4*_Rbase_ + 3
    addReg  rax, rdx
    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
    addReg  rcx, rbx
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
                .endif
    xorReg  rdx, rax
    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
    xorReg  rbx, rcx
    Skein_Debug_Round 256,%(4*_Rbase_+4)
                .if ((SKEIN_ASM_UNROLL) & 256) == 0
                    addReg r9 ,r13           #precompute key+tweak
                .endif
      #inject key schedule words
_Rbase_ = _Rbase_+1
  .if (SKEIN_ASM_UNROLL) & 256
    addReg    rax,r,%(8+((_Rbase_+0) % 5))
    addReg    rbx,rsi
    addReg    rcx,rdi
    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
  .else
    incq      %rdi
    addReg    rax,r8
    addReg    rcx,r10
    addReg    rbx,r9
    addReg    rdx,r11
  .endif
    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
.endr #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 256) == 0
    cmpq    $2*(ROUNDS_256/8),%rdi
    jb      Skein_256_round_loop
.endif # (SKEIN_ASM_UNROLL & 256) == 0
    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context

    #----------------------------
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
    movq    $FIRST_MASK64 ,%r14
    xorq    Wcopy + 0+F_O (%rbp),%rax
    xorq    Wcopy + 8+F_O (%rbp),%rbx
    xorq    Wcopy +16+F_O (%rbp),%rcx
    xorq    Wcopy +24+F_O (%rbp),%rdx
    andq    TWEAK + 8     (%rdi),%r14        #r14 = T[1] with bit 62 cleared
    movq    %rax,X_VARS+ 0(%rdi)             #store final result
    movq    %rbx,X_VARS+ 8(%rdi)
    movq    %rcx,X_VARS+16(%rdi)
    movq    %rdx,X_VARS+24(%rdi)

    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_256_block_loop
    movq    %r14,TWEAK + 8(%rdi)             #write back the masked T[1]
    Reset_Stack
    ret
Skein_256_Process_Block_End:

  .if _SKEIN_DEBUG
Skein_Debug_Round_256:               #here with rdx == round "number" from macro
    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
    pushq   %rdi
    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(use FP_ since rsp has changed!)
    movq    %rcx,X_stk+16+F_O(%rbp)
    movq    %rdi,X_stk+24+F_O(%rbp)

    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
  .endif
#
.if _SKEIN_CODE_SIZE
# returns in rax the byte distance between the entry label and the _End label
C_label  Skein_256_Process_Block_CodeSize
    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
    ret
#
# returns in rax the unroll count, or 0 when fully unrolled
C_label  Skein_256_Unroll_Cnt
  .if _UNROLL_CNT <> ROUNDS_256/8
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
#
.endif #_USE_ASM_ & 256
#
#=================================== Skein_512 =============================================
#
.if _USE_ASM_ & 512
#
# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
#
#################
# MACRO: one round for 512-bit blocks
#   rn0..rn7 = register numbers (r8..r15) forming the four MIX pairs
#   _Rn_     = round number (selects the rotation constants, mod 8)
#   op1..op4 = optional instructions interleaved after each MIX
#
.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
#
    addReg      r\rn0, r\rn1
    RotL64      r\rn1, 512,%((\_Rn_) % 8),0
    xorReg      r\rn1, r\rn0
            \op1
    addReg      r\rn2, r\rn3
    RotL64      r\rn3, 512,%((\_Rn_) % 8),1
    xorReg      r\rn3, r\rn2
            \op2
    addReg      r\rn4, r\rn5
    RotL64      r\rn5, 512,%((\_Rn_) % 8),2
    xorReg      r\rn5, r\rn4
            \op3
    addReg      r\rn6, r\rn7
    RotL64      r\rn7, 512,%((\_Rn_) % 8),3
    xorReg      r\rn7, r\rn6
            \op4
    Skein_Debug_Round 512,%(\_Rn_+1),-4
#
.endm #R_512_OneRound
#
#################
# MACRO: four rounds for 512-bit blocks, followed by one key injection
#   The interleaved ops pre-load key/tweak words into rax,rbx,rcx,rdx,rsi.
#
.macro R_512_FourRounds _RR_    #RR = base round number (0 mod 4)
  .if ((SKEIN_ASM_UNROLL) & 512)
    # here for fully unrolled case.
    _II_ = ((\_RR_)/4) + 1       #key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
    # inject the key schedule
    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
    addReg   r11, rax
    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
    addReg   r12, rbx
    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
    addReg   r13, rcx
    addReg   r14, rdx
    addReg   r15, rsi,,,(_II_)
  .else
    # here for looping case      #"rotate" key/tweak schedule (move up on stack)
    incq    %rdi                 #bump key injection counter
    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
    # inject the key schedule
    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
    addReg   r11, rax
    addReg   r12, rbx
    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
    addReg   r13, rcx
    addReg   r14, rdx
    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
    addReg   r15, rsi
    addReg   r15, rdi              #inject the round number
  .endif

    #show the result of the key injection
    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
.endm #R_512_FourRounds
#
#################
# instantiated code
#   In:  rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd
#        (saved by Setup_Stack; rbp,rbx,r12..r15 are saved and restored)
#   The block loop decrements blkCnt after processing a block, so the
#   code as written expects blkCnt >= 1.
#
C_label Skein_512_Process_Block
    Setup_Stack 512,ROUNDS_512/8
    movq    TWEAK+ 8(%rdi),%rbx       #T[1] is carried in rbx between blocks
    jmp     Skein_512_block_loop
    .p2align 4
    # main hash loop for Skein_512
Skein_512_block_loop:
    # general register usage:
    #   RAX..RDX       = temps for key schedule pre-loads
    #   R8 ..R15       = X0..X7
    #   RSP, RBP       = stack/frame pointers
    #   RDI            = round counter or context pointer
    #   RSI            = temp
    #
    movq    TWEAK +  0(%rdi),%rax
    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
    movq    %rbx,%rcx
    xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
    movq    %rax,ksTwk+ 0+F_O(%rbp)
    movq    $KW_PARITY,%rdx
    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
    movq    %rbx,ksTwk+ 8+F_O(%rbp)
    movq    %rcx,ksTwk+16+F_O(%rbp)
    .irp _Rn_,8,9,10,11,12,13,14,15
      movq  X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
      xorq  %r\_Rn_,%rdx              #compute overall parity
      movq  %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
    .endr                             #load state into %r8 ..%r15, compute parity
      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity

    addReg   r13,rax                  #precompute key injection for tweak
    addReg   r14, rbx
.if _SKEIN_DEBUG
    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
.endif
    movq     0(%rsi),%rax             #load input block
    movq     8(%rsi),%rbx
    movq    16(%rsi),%rcx
    movq    24(%rsi),%rdx
    addReg   r8 , rax                 #do initial key injection
    addReg   r9 , rbx
    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
    movq    %rbx,Wcopy+ 8+F_O(%rbp)
    addReg   r10, rcx
    addReg   r11, rdx
    movq    %rcx,Wcopy+16+F_O(%rbp)
    movq    %rdx,Wcopy+24+F_O(%rbp)

    movq    32(%rsi),%rax
    movq    40(%rsi),%rbx
    movq    48(%rsi),%rcx
    movq    56(%rsi),%rdx
    addReg   r12, rax
    addReg   r13, rbx
    addReg   r14, rcx
    addReg   r15, rdx
    movq    %rax,Wcopy+32+F_O(%rbp)
    movq    %rbx,Wcopy+40+F_O(%rbp)
    movq    %rcx,Wcopy+48+F_O(%rbp)
    movq    %rdx,Wcopy+56+F_O(%rbp)

.if _SKEIN_DEBUG
    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
      movq  %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
    .endr

    Skein_Debug_Block 512             #debug dump
    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
.endif
    addq    $8*WCNT,%rsi              #skip the block
    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
    #
    #################
    # now the key schedule is computed. Start the rounds
    #
.if (SKEIN_ASM_UNROLL) & 512
_UNROLL_CNT =   ROUNDS_512/8
.else
_UNROLL_CNT =   SKEIN_UNROLL_512
  .if ((ROUNDS_512/8) % _UNROLL_CNT)
    .error "Invalid SKEIN_UNROLL_512"
  .endif
    xorq    %rdi,%rdi                 #rdi = round counter
Skein_512_round_loop:
.endif
#
_Rbase_ = 0
.rept _UNROLL_CNT*2
    R_512_FourRounds %(4*_Rbase_+00)
_Rbase_ = _Rbase_+1
.endr #rept _UNROLL_CNT*2
#
.if ((SKEIN_ASM_UNROLL) & 512) == 0
    cmpq    $2*(ROUNDS_512/8),%rdi
    jb      Skein_512_round_loop
    movq    ctxPtr +F_O(%rbp),%rdi    #restore rdi --> context
.endif
    # end of rounds
    #################
    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
    # rbx is rebuilt along the way as T[1] with bit 62 cleared
    .irp _Rn_,8,9,10,11,12,13,14,15
  .if (\_Rn_ == 8)
    movq    $FIRST_MASK64,%rbx
  .endif
      xorq  Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
      movq  %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)     #and store result
  .if (\_Rn_ == 14)
    andq    TWEAK+ 8(%rdi),%rbx
  .endif
    .endr
    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD

    # go back for more blocks, if needed
    decq    blkCnt+F_O(%rbp)
    jnz     Skein_512_block_loop
    movq    %rbx,TWEAK + 8(%rdi)      #write back the masked T[1]

    Reset_Stack
    ret
Skein_512_Process_Block_End:
#
  .if _SKEIN_DEBUG
# call here with rdx  = "round number"
Skein_Debug_Round_512:
    pushq   %rsi                      #save two regs for BLK_BITS-specific parms
    pushq   %rdi
  .irp _Rn_,8,9,10,11,12,13,14,15     #save X[] state on stack so debug routines can access it
    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
  .endr
    movq    ctxPtr+F_O(%rbp),%rsi     #ctx_hdr_ptr
    movq    $512,%rdi                 #now <rdi,rsi,rdx> are set for the call
    jmp     Skein_Debug_Round_Common
  .endif
#
.if _SKEIN_CODE_SIZE
# returns in rax the byte distance between the entry label and the _End label
C_label Skein_512_Process_Block_CodeSize
    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
    ret
#
# returns in rax the unroll count, or 0 when fully unrolled
C_label Skein_512_Unroll_Cnt
  .if _UNROLL_CNT <> (ROUNDS_512/8)
    movq    $_UNROLL_CNT,%rax
  .else
    xorq    %rax,%rax
  .endif
    ret
.endif
#
.endif # _USE_ASM_ & 512
#
#=================================== Skein1024 =============================================
.if _USE_ASM_ & 1024
#
# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
#
#################
# use details of permutation to make register assignments
#
o1K_rdi =  0        #offsets in X[] associated with each register
o1K_rsi =  1
o1K_rbp =  2
o1K_rax =  3
o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
o1K_rbx =  5
o1K_rdx =  7
o1K_r8  =  8
o1K_r9  =  9
o1K_r10 = 10
o1K_r11 = 11
o1K_r12 = 12
o1K_r13 = 13
o1K_r14 = 14
o1K_r15 = 15
#
rIdx_offs = tmpStk_1024                 #stack slot holding the key injection index
#
# MACRO: one MIX for 1024-bit blocks, with inline key injection on
#        every fourth round
#   w0,w1     = X[] word numbers held in reg0,reg1
#   reg0,reg1 = the two registers to mix
#   _RN0_     = round number, _Rn1_ = mix number (select the rotation)
#   op1       = optional instruction emitted at the end
#   All locals are addressed off rsp here: rbp carries X2 in this code.
#
.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
    addReg      \reg0 , \reg1                      #perform the MIX
    RotL64      \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
    xorReg      \reg1 , \reg0
.if ((\_RN0_) & 3) == 3         #time to do key injection?
 .if _SKEIN_DEBUG
    movq       %\reg0 , xDebug_1024+8*\w0(%rsp)     #save intermediate values for Debug_Round
    movq       %\reg1 , xDebug_1024+8*\w1(%rsp)     # (before inline key injection)
 .endif
_II_ = ((\_RN0_)/4)+1           #injection count
 .if (SKEIN_ASM_UNROLL) & 1024   #here to do fully unrolled key injection
    addq        ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
    addq        ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
  .if     \w1 == 13                                 #tweak injection
    addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
  .elseif \w0 == 14
    addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
  .elseif \w1 == 15
    addq        $_II_, %\reg1                       #(injection counter)
  .endif
 .else                          #here to do looping  key injection
  .if  (\w0 == 0)
    movq        %rdi, X_stk+8*\w0(%rsp)             #if so, store N0 so we can use reg as index
    movq         rIdx_offs(%rsp),%rdi               #get the injection counter index into rdi
  .else
    addq        ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0   #even key injection
  .endif
  .if     \w1 == 13                                 #tweak injection
    addq        ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
  .elseif \w0 == 14
    addq        ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
  .elseif \w1 == 15
    addReg      \reg1,rdi,,,1                       #(injection counter)
  .endif
    addq        ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1   #odd key injection
 .endif
.endif
    # insert the op provided, .if any
    \op1
.endm
#################
# MACRO: four rounds for 1024-bit blocks
#
.macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
    # should be here with X4 set properly, X6 stored on stack
_Rn_ = (\_RR_) + 0
        r1024_Mix  0, 1,rdi,rsi,_Rn_,0
        r1024_Mix  2, 3,rbp,rax,_Rn_,1
        r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
        r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
        r1024_Mix 10,11,r10,r11,_Rn_,5
        r1024_Mix 12,13,r12,r13,_Rn_,6
        r1024_Mix  6, 7,rcx,rdx,_Rn_,3
        r1024_Mix 14,15,r14,r15,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 1
        r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
        r1024_Mix  2,13,rbp,r13,_Rn_,1
        r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
        r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
        r1024_Mix 12, 3,r12,rax,_Rn_,5
        r1024_Mix 14, 5,r14,rbx,_Rn_,6
        r1024_Mix  4,15,rcx,r15,_Rn_,3
        r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 2
        r1024_Mix  0, 7,rdi,rdx,_Rn_,0
        r1024_Mix  2, 5,rbp,rbx,_Rn_,1
        r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
        r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
        r1024_Mix 14,13,r14,r13,_Rn_,5
        r1024_Mix  8,11,r8 ,r11,_Rn_,6
        r1024_Mix  6, 1,rcx,rsi,_Rn_,3
        r1024_Mix 10, 9,r10,r9 ,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif
_Rn_ = (\_RR_) + 3
        r1024_Mix  0,15,rdi,r15,_Rn_,0
        r1024_Mix  2,11,rbp,r11,_Rn_,1
        r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
        r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
        r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
        r1024_Mix 10, 3,r10,rax,_Rn_,6
        r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
        r1024_Mix 12, 7,r12,rdx,_Rn_,7
    .if _SKEIN_DEBUG
      Skein_Debug_Round 1024,%(_Rn_+1)
    .endif

  .if ((SKEIN_ASM_UNROLL) & 1024) == 0           #here with rdi == rIdx, X0 on stack
    #"rotate" the key schedule on the stack
i8 = o1K_r8
i0 = o1K_rdi
    movq    %r8 , X_stk+8*i8(%rsp)               #free up a register (save it on the stack)
    movq          ksKey+8* 0(%rsp,%rdi,8),%r8    #get  key  word
    movq    %r8 , ksKey+8*17(%rsp,%rdi,8)        #rotate key (must do key first or tweak clobbers it!)
    movq          ksTwk+8* 0(%rsp,%rdi,8),%r8    #get tweak word
    movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)        #rotate tweak (onto the stack)
    movq          X_stk+8*i8(%rsp)       ,%r8    #get the reg back
    incq    %rdi                                 #bump the index
    movq    %rdi, rIdx_offs (%rsp)               #save rdi again
    movq          ksKey+8*i0(%rsp,%rdi,8),%rdi   #get the key schedule word for X0 back
    addq          X_stk+8*i0(%rsp)       ,%rdi   #perform the X0 key injection
  .endif
    #show the result of the key injection
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
.endm #r1024_FourRounds
#
################
# code
#   In:  rdi = ctx, rsi = blkPtr, rdx = blkCnt, rcx = bitcntAdd
#        (saved by Setup_Stack; rbp,rbx,r12..r15 are saved and restored)
#
C_label Skein1024_Process_Block
#
    Setup_Stack 1024,ROUNDS_1024/8,WCNT
    movq    TWEAK+ 8(%rdi),%r9
    jmp     Skein1024_block_loop
    # main hash loop for Skein1024
    .p2align 4
Skein1024_block_loop:
    # general register usage during the rounds (see the o1K_* table):
    #   RSP              = stack pointer
    #   RDI,RSI,RBP,RAX  = X0..X3
    #   RCX              = X4 or X6 (they alternate; the other is on the stack)
    #   RBX, RDX         = X5, X7
    #   R8 ..R15         = X8..X15 (state words)
    #   RBP is the frame pointer only until the state is loaded below
    #
  .if ((SKEIN_ASM_UNROLL) & 1024) == 0
    xorq    %rax,%rax                 #init loop index on the stack
    movq    %rax,rIdx_offs(%rsp)
  .endif
    movq         TWEAK+     0(%rdi),%r8
    addq         bitAdd+  F_O(%rbp),%r8   #computed updated tweak value T0
    movq    %r9 ,%r10
    xorq    %r8 ,%r10                     #%r8/%r9/%r10 = tweak schedule
    movq    %r8 ,TWEAK+     0(%rdi)       #save updated tweak value ctx->h.T[0]
    movq    %r8 ,ksTwk+ 0+F_O(%rbp)
    movq    %r9 ,ksTwk+ 8+F_O(%rbp)       #keep values in %r8 ,%r9  for initial tweak injection below
    movq    %r10,ksTwk+16+F_O(%rbp)
  .if _SKEIN_DEBUG
    movq    %r9 ,TWEAK+     8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
  .endif
    movq         blkPtr  +F_O(%rbp),%rsi  # rsi --> input block
    movq        $KW_PARITY         ,%rax  #overall key schedule parity

    # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
    .irp _rN_,0,1,2,3,4,6                 #process the "initial" words, using r14/r15 as temps
      movq       X_VARS+8*\_rN_(%rdi),%r14      #get state word
      movq              8*\_rN_(%rsi),%r15      #get msg   word
      xorq  %r14,%rax                           #update key schedule overall parity
      movq  %r14,ksKey +8*\_rN_+F_O(%rbp)       #save key schedule word on stack
      movq  %r15,Wcopy +8*\_rN_+F_O(%rbp)       #save local msg Wcopy
      addq  %r15,%r14                           #do the initial key injection
      movq  %r14,X_stk +8*\_rN_    (%rsp)       #save initial state var on stack
    .endr
    # now process the rest, using the "real" registers
    #     (MUST do it in reverse order to inject tweaks r8/r9 first)
    .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
_oo_ = o1K_\_rr_                          #offset associated with the register
      movq  X_VARS+8*_oo_(%rdi),%\_rr_    #get key schedule word from context
      movq         8*_oo_(%rsi),%rcx      #get next input msg word
      movq  %\_rr_, ksKey +8*_oo_(%rsp)   #save key schedule on stack
      xorq  %\_rr_, %rax                  #accumulate key schedule parity
      movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)   #save copy of msg word for feedforward
      addq  %rcx,%\_rr_                   #do the initial  key  injection
      .if    _oo_ == 13                   #do the initial tweak injection
        addReg \_rr_,r8                   #          (only in words 13/14)
      .elseif _oo_ == 14
        addReg \_rr_,r9
      .endif
    .endr
    movq    %rax,ksKey+8*WCNT+F_O(%rbp)   #save key schedule parity
.if _SKEIN_DEBUG
    Skein_Debug_Block 1024                #initial debug dump
.endif
    addq     $8*WCNT,%rsi                 #bump the msg ptr
    movq     %rsi,blkPtr+F_O(%rbp)        #save bumped msg ptr
    # re-load words 0..4 from stack, enter the main loop
    .irp _rr_,rdi,rsi,rbp,rax,rcx         #(no need to re-load x6, already on stack)
      movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_    #re-load state and get ready to go!
    .endr
.if _SKEIN_DEBUG
    Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL    #show state after initial key injection
.endif
    #
    #################
    # now the key schedule is computed.
Start the rounds 1160 # 1161.if (SKEIN_ASM_UNROLL) & 1024 1162_UNROLL_CNT = ROUNDS_1024/8 1163.else 1164_UNROLL_CNT = SKEIN_UNROLL_1024 1165 .if ((ROUNDS_1024/8) % _UNROLL_CNT) 1166 .error "Invalid SKEIN_UNROLL_1024" 1167 .endif 1168Skein1024_round_loop: 1169.endif 1170# 1171_Rbase_ = 0 1172.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time 1173 r1024_FourRounds %(4*_Rbase_+00) 1174_Rbase_ = _Rbase_+1 1175.endr #rept _UNROLL_CNT 1176# 1177.if ((SKEIN_ASM_UNROLL) & 1024) == 0 1178 cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done 1179 jb Skein1024_round_loop 1180.endif 1181 # end of rounds 1182 ################# 1183 # 1184 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} 1185 movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack 1186 movq ctxPtr(%rsp),%rdx 1187 1188 .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7 1189_oo_ = o1K_\_rr_ 1190 xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR 1191 movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context 1192 .if (_oo_ == 9) 1193 movq $FIRST_MASK64 ,%r9 1194 .endif 1195 .if (_oo_ == 14) 1196 andq TWEAK+ 8(%rdx),%r9 1197 .endif 1198 .endr 1199 # 1200 movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above) 1201 movq X_stk +8*7(%rsp),%rbx 1202 xorq Wcopy +8*6(%rsp),%rax 1203 xorq Wcopy +8*7(%rsp),%rbx 1204 movq %rax,X_VARS+8*6(%rdx) 1205 decq blkCnt(%rsp) #set zero flag iff done 1206 movq %rbx,X_VARS+8*7(%rdx) 1207 1208 Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)> 1209 # go back for more blocks, if needed 1210 movq ctxPtr(%rsp),%rdi #don't muck with the flags here! 
1211 lea FRAME_OFFS(%rsp),%rbp 1212 jnz Skein1024_block_loop 1213 movq %r9 ,TWEAK+ 8(%rdx) 1214 Reset_Stack 1215 ret 1216# 1217Skein1024_Process_Block_End: 1218# 1219.if _SKEIN_DEBUG 1220Skein_Debug_Round_1024: 1221 # call here with rdx = "round number", 1222_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr 1223 # 1224 #save rest of X[] state on stack so debug routines can access it 1225 .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15 1226 movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp) 1227 .endr 1228 # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack 1229 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save 1230 jae save_x0 1231 testq $3,%rdx #otherwise only if rdx != 0 mod 4 1232 jz save_x0_not 1233save_x0: 1234 movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp) 1235save_x0_not: 1236 #figure out the x4/x6 swapping state and save the correct one! 1237 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4 1238 jae save_x4 1239 testq $1,%rdx #and even ones have r4 as well 1240 jz save_x4 1241 movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) 1242 jmp debug_1024_go 1243save_x4: 1244 movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp) 1245debug_1024_go: 1246 #now all is saved in Xstk[] except for rdx 1247 push %rsi #save two regs for BLK_BITS-specific parms 1248 push %rdi 1249_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32) 1250 1251 movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call) 1252 movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[] 1253 1254 movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr 1255 movq $1024,%rdi #rdi = block size 1256 jmp Skein_Debug_Round_Common 1257.endif 1258# 1259.if _SKEIN_CODE_SIZE 1260C_label Skein1024_Process_Block_CodeSize 1261 movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax 1262 ret 1263# 1264C_label Skein1024_Unroll_Cnt 1265 .if _UNROLL_CNT <> (ROUNDS_1024/8) 1266 movq $_UNROLL_CNT,%rax 1267 .else 1268 xorq 
%rax,%rax 1269 .endif 1270 ret 1271.endif 1272# 1273.endif # _USE_ASM_ and 1024 1274# 1275.if _SKEIN_DEBUG 1276#---------------------------------------------------------------- 1277#local debug routine to set up for calls to: 1278# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X) 1279# [ rdi rsi rdx rcx] 1280# 1281# here with %rdx = round number 1282# %rsi = ctx_hdr_ptr 1283# %rdi = block size (256/512/1024) 1284# on stack: saved rdi, saved rsi, retAddr, saved rdx 1285# 1286Skein_Debug_Round_Common: 1287_SP_OFFS_ = 32 #account for four words on stack already 1288 .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs 1289 pushq %\_rr_ 1290_SP_OFFS_ = _SP_OFFS_+8 1291 .endr 1292 .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here 1293 .error "Debug_Round_Common: stack alignment" 1294 .endif 1295 # compute %rcx = ptr to the X[] array on the stack (final parameter to call) 1296 leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address 1297 cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"? 1298 jnz _got_rcxA 1299 leaq X_VARS(%rsi),%rcx 1300_got_rcxA: 1301 .if _USE_ASM_ & 1024 1302 # special handling for 1024-bit case 1303 # (for rounds right before with key injection: 1304 # use xDebug_1024[] instead of X_stk[]) 1305 cmpq $SKEIN_RND_SPECIAL,%rdx 1306 jae _got_rcxB #must be a normal round 1307 orq %rdx,%rdx 1308 jz _got_rcxB #just before key injection 1309 test $3,%rdx 1310 jne _got_rcxB 1311 cmp $1024,%rdi #only 1024-bit(s) for now 1312 jne _got_rcxB 1313 leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx 1314_got_rcxB: 1315 .endif 1316 call Skein_Show_Round #call external debug handler 1317 1318 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs 1319 popq %\_rr_ 1320_SP_OFFS_ = _SP_OFFS_-8 1321 .endr 1322 .if _SP_OFFS_ - 32 1323 .error "Debug_Round_Common: push/pop misalignment!" 
1324 .endif 1325 popq %rdi 1326 popq %rsi 1327 ret 1328.endif 1329#---------------------------------------------------------------- 1330 .section .note.GNU-stack,"",@progbits 1331 1332 .end 1333