1# 2#---------------------------------------------------------------- 3# 64-bit x86 assembler code (gnu as) for Skein block functions 4# 5# Author: Doug Whiting, Hifn/Exar 6# 7# This code is released to the public domain. 8#---------------------------------------------------------------- 9# $FreeBSD$ 10# 11 .text 12 .altmacro 13#ifndef __clang__ 14 .psize 0,128 #list file has no page boundaries 15#endif 16# 17_MASK_ALL_ = (256+512+1024) #all three algorithm bits 18_MAX_FRAME_ = 240 19# 20################# 21#ifndef SKEIN_USE_ASM 22_USE_ASM_ = _MASK_ALL_ 23#else 24_USE_ASM_ = SKEIN_USE_ASM 25#endif 26################# 27#configure loop unrolling 28#ifndef SKEIN_LOOP 29_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024 30#else 31_SKEIN_LOOP = SKEIN_LOOP 32 .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line 33#.print "+++ SKEIN_LOOP = \_NN_" 34 .endr 35#endif 36# the unroll counts (0 --> fully unrolled) 37SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 38SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 39SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 40# 41SKEIN_ASM_UNROLL = 0 42 .irp _NN_,256,512,1024 43 .if (SKEIN_UNROLL_\_NN_) == 0 44SKEIN_ASM_UNROLL = (SKEIN_ASM_UNROLL) + \_NN_ 45 .endif 46 .endr 47################# 48# 49.ifndef SKEIN_ROUNDS 50ROUNDS_256 = 72 51ROUNDS_512 = 72 52ROUNDS_1024 = 80 53.else 54ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) 55ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) 56ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) 57# only display rounds if default size is changed on command line 58.irp _NN_,256,512,1024 59 .if _USE_ASM_ & \_NN_ 60 .irp _RR_,%(ROUNDS_\_NN_) 61 .if _NN_ < 1024 62.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" 63 .else 64.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" 65 .endif 66 .endr 67 .endif 68.endr 69.endif 70################# 71# 72.ifdef SKEIN_CODE_SIZE 73_SKEIN_CODE_SIZE = (1) 74.else 75.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined 76_SKEIN_CODE_SIZE = (1) 77.else 78_SKEIN_CODE_SIZE = (0) 79.endif 80.endif 81# 82################# 83# 84.ifndef SKEIN_DEBUG 85_SKEIN_DEBUG = 0 86.else 87_SKEIN_DEBUG = 1 88.endif 89################# 90# 91# define offsets of fields in hash context structure 92# 93HASH_BITS = 0 #bits of hash output 94BCNT = 8 + HASH_BITS #number of bytes in BUFFER[] 95TWEAK = 8 + BCNT #tweak values[0..1] 96X_VARS = 16 + TWEAK #chaining vars 97# 98#(Note: buffer[] in context structure is NOT needed here :-) 99# 100KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words 101FIRST_MASK = ~ (1 << 6) 102FIRST_MASK64= ~ (1 << 62) 103# 104# rotation constants for Skein 105# 106RC_256_0_0 = 14 107RC_256_0_1 = 16 108 109RC_256_1_0 = 52 110RC_256_1_1 = 57 111 112RC_256_2_0 = 23 113RC_256_2_1 = 40 114 115RC_256_3_0 = 5 116RC_256_3_1 = 37 117 118RC_256_4_0 = 25 119RC_256_4_1 = 33 120 121RC_256_5_0 = 46 122RC_256_5_1 = 12 123 124RC_256_6_0 = 58 125RC_256_6_1 = 22 126 127RC_256_7_0 = 32 128RC_256_7_1 = 32 129 130RC_512_0_0 = 46 131RC_512_0_1 = 36 132RC_512_0_2 = 19 133RC_512_0_3 = 37 134 135RC_512_1_0 = 33 136RC_512_1_1 = 27 137RC_512_1_2 = 14 138RC_512_1_3 = 42 139 140RC_512_2_0 = 17 141RC_512_2_1 = 49 142RC_512_2_2 = 36 143RC_512_2_3 = 39 144 145RC_512_3_0 = 44 146RC_512_3_1 = 9 147RC_512_3_2 = 54 148RC_512_3_3 = 56 149 150RC_512_4_0 = 39 151RC_512_4_1 = 30 152RC_512_4_2 = 34 153RC_512_4_3 = 24 154 155RC_512_5_0 = 13 156RC_512_5_1 = 50 157RC_512_5_2 = 10 158RC_512_5_3 = 17 159 160RC_512_6_0 = 25 161RC_512_6_1 = 29 162RC_512_6_2 = 39 163RC_512_6_3 = 43 164 165RC_512_7_0 = 8 166RC_512_7_1 = 35 167RC_512_7_2 = 56 168RC_512_7_3 = 22 169 170RC_1024_0_0 = 24 171RC_1024_0_1 = 13 172RC_1024_0_2 = 8 173RC_1024_0_3 = 47 174RC_1024_0_4 = 8 175RC_1024_0_5 = 17 176RC_1024_0_6 = 22 177RC_1024_0_7 = 37 178 179RC_1024_1_0 = 38 180RC_1024_1_1 = 19 181RC_1024_1_2 = 10 182RC_1024_1_3 = 55 183RC_1024_1_4 = 49 184RC_1024_1_5 = 18 185RC_1024_1_6 = 23 186RC_1024_1_7 = 52 187 188RC_1024_2_0 = 33 189RC_1024_2_1 = 4 190RC_1024_2_2 = 51 191RC_1024_2_3 = 13 192RC_1024_2_4 = 34 193RC_1024_2_5 = 41 194RC_1024_2_6 = 59 195RC_1024_2_7 = 17 196 197RC_1024_3_0 = 5 198RC_1024_3_1 = 20 199RC_1024_3_2 = 48 200RC_1024_3_3 = 41 201RC_1024_3_4 = 47 202RC_1024_3_5 = 28 203RC_1024_3_6 = 16 204RC_1024_3_7 = 25 205 206RC_1024_4_0 = 41 207RC_1024_4_1 = 9 208RC_1024_4_2 = 37 209RC_1024_4_3 = 31 210RC_1024_4_4 = 12 211RC_1024_4_5 = 47 212RC_1024_4_6 = 44 213RC_1024_4_7 = 30 214 215RC_1024_5_0 = 16 216RC_1024_5_1 = 34 217RC_1024_5_2 = 56 218RC_1024_5_3 = 51 219RC_1024_5_4 = 4 220RC_1024_5_5 = 53 221RC_1024_5_6 = 42 222RC_1024_5_7 = 41 223 224RC_1024_6_0 = 31 225RC_1024_6_1 = 44 226RC_1024_6_2 = 47 227RC_1024_6_3 = 46 228RC_1024_6_4 = 19 229RC_1024_6_5 = 42 230RC_1024_6_6 = 44 231RC_1024_6_7 = 25 232 233RC_1024_7_0 = 9 234RC_1024_7_1 = 48 235RC_1024_7_2 = 35 236RC_1024_7_3 = 52 237RC_1024_7_4 = 23 238RC_1024_7_5 = 31 239RC_1024_7_6 = 37 240RC_1024_7_7 = 20 241# 242# Input: reg 243# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024 244# 245.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM 246 .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM #is there anything to do? 247 rolq $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg 248 .endif 249.endm 250# 251#---------------------------------------------------------------- 252# 253# MACROS: define local vars and configure stack 254# 255#---------------------------------------------------------------- 256# declare allocated space on the stack 257.macro StackVar localName,localSize 258\localName = _STK_OFFS_ 259_STK_OFFS_ = _STK_OFFS_+(\localSize) 260.endm #StackVar 261# 262#---------------------------------------------------------------- 263# 264# MACRO: Configure stack frame, allocate local vars 265# 266.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt 267 WCNT = (\BLK_BITS)/64 268# 269_PushCnt_ = 0 #save nonvolatile regs on stack 270 .irp _reg_,rbp,rbx,r12,r13,r14,r15 271 pushq %\_reg_ 272_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment 273 .endr 274# 275_STK_OFFS_ = 0 #starting offset from rsp 276 #---- local variables #<-- rsp 277 StackVar X_stk ,8*(WCNT) #local context vars 278 StackVar ksTwk ,8*3 #key schedule: tweak words 279 StackVar ksKey ,8*(WCNT)+8 #key schedule: key words 280 .if ((SKEIN_ASM_UNROLL) & (\BLK_BITS)) == 0 281 StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen 282 .endif 283 StackVar Wcopy ,8*(WCNT) #copy of input block 284 .if _SKEIN_DEBUG 285 .if \debugCnt + 0 #temp location for debug X[] info 286 StackVar xDebug_\BLK_BITS ,8*(\debugCnt) 287 .endif 288 .endif 289 .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0 290 StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?) 291tmpStk_\BLK_BITS = align16 #use this 292 .endif 293 #---- saved caller parameters (from regs rdi, rsi, rdx, rcx) 294 StackVar ctxPtr ,8 #context ptr 295 StackVar blkPtr ,8 #pointer to block data 296 StackVar blkCnt ,8 #number of full blocks to process 297 StackVar bitAdd ,8 #bit count to add to tweak 298LOCAL_SIZE = _STK_OFFS_ #size of "local" vars 299 #---- 300 StackVar savRegs,8*_PushCnt_ #saved registers 301 StackVar retAddr,8 #return address 302 #---- caller's stack frame (aligned mod 16) 303# 304# set up the stack frame pointer (rbp) 305# 306FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, kwKey 307 .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range 308FRAME_OFFS = _STK_OFFS_ 309 .endif 310F_O = -FRAME_OFFS 311# 312 #put some useful defines in the .lst file (for grep) 313__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE 314__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_ 315__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS 316# 317# Notes on stack frame setup: 318# * the most frequently used variable is X_stk[], based at [rsp+0] 319# * the next most used is the key schedule arrays, ksKey and ksTwk 320# so rbp is "centered" there, allowing short offsets to the key 321# schedule even in 1024-bit Skein case 322# * the Wcopy variables are infrequently accessed, but they have long 323# offsets from both rsp and rbp only in the 1024-bit case. 324# * all other local vars and calling parameters can be accessed 325# with short offsets, except in the 1024-bit case 326# 327 subq $LOCAL_SIZE,%rsp #make room for the locals 328 leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets 329 movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack 330 movq %rsi, blkPtr+F_O(%rbp) 331 movq %rdx, blkCnt+F_O(%rbp) 332 movq %rcx, bitAdd+F_O(%rbp) 333# 334.endm #Setup_Stack 335# 336#---------------------------------------------------------------- 337# 338.macro Reset_Stack 339 addq $LOCAL_SIZE,%rsp #get rid of locals (wipe?) 340 .irp _reg_,r15,r14,r13,r12,rbx,rbp 341 popq %\_reg_ #restore caller's regs 342_PushCnt_ = _PushCnt_ - 1 343 .endr 344 .if _PushCnt_ 345 .error "Mismatched push/pops?" 346 .endif 347.endm # Reset_Stack 348# 349#---------------------------------------------------------------- 350# macros to help debug internals 351# 352.if _SKEIN_DEBUG 353 .extern Skein_Show_Block #calls to C routines 354 .extern Skein_Show_Round 355# 356SKEIN_RND_SPECIAL = 1000 357SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 358SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 359SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 360# 361.macro Skein_Debug_Block BLK_BITS 362# 363#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, 364# const u08b_t *blkPtr, const u64b_t *wPtr, 365# const u64b_t *ksPtr,const u64b_t *tsPtr) 366# 367_NN_ = 0 368 .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 369 pushq %\_reg_ #save all volatile regs on tack before the call 370_NN_ = _NN_ + 1 371 .endr 372 # get and push call parameters 373 movq $\BLK_BITS ,%rdi #bits 374 movq ctxPtr+F_O(%rbp),%rsi #h (pointer) 375 leaq X_VARS (%rsi),%rdx #X (pointer) 376 movq blkPtr+F_O(%rbp),%rcx #blkPtr 377 leaq Wcopy +F_O(%rbp),%r8 #wPtr 378 leaq ksKey +F_O(%rbp),%r9 #key pointer 379 leaq ksTwk +F_O(%rbp),%rax #tweak pointer 380 pushq %rax # (pass on the stack) 381 call Skein_Show_Block #call external debug handler 382 addq $8*1,%rsp #discard parameters on stack 383 .if (_NN_ % 2 ) == 0 #check stack alignment 384 .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS" 385 .endif 386 .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax 387 popq %\_reg_ #restore regs 388_NN_ = _NN_ - 1 389 .endr 390 .if _NN_ 391 .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS" 392 .endif 393.endm # Skein_Debug_Block 394# 395# the macro to "call" to debug a round 396# 397.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp 398 # call the appropriate (local) debug "function" 399 pushq %rdx #save rdx, so we can use it for round "number" 400 .if ((SKEIN_ASM_UNROLL) & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) 401 movq $\R,%rdx 402 .else #compute round number using edi 403_rOffs_ = \RDI_OFFS + 0 404 .if \BLK_BITS == 1024 405 movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above) 406 leaq 1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx 407 .else 408 leaq 1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx 409 .endif 410 .endif 411 call Skein_Debug_Round_\BLK_BITS 412 popq %rdx #restore origianl rdx value 413# 414 afterOp 415.endm # Skein_Debug_Round 416.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled) 417.macro Skein_Debug_Block BLK_BITS 418.endm 419# 420.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp 421.endm 422# 423.endif # _SKEIN_DEBUG 424# 425#---------------------------------------------------------------- 426# 427.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs 428 .if \immOffs + 0 429 leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg 430 .elseif ((\useAddOp + 0) == 0) 431 .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs! 432 leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg 433 .else 434 addq %\srcReg_A\srcReg_B,%\dstReg 435 .endif 436 .else 437 addq %\srcReg_A\srcReg_B,%\dstReg 438 .endif 439.endm 440 441# keep Intel-style ordering here, to match addReg 442.macro xorReg dstReg,srcReg_A,srcReg_B 443 xorq %\srcReg_A\srcReg_B,%\dstReg 444.endm 445# 446#---------------------------------------------------------------- 447# 448.macro C_label lName 449 \lName: #use both "genders" to work across linkage conventions 450_\lName: 451 .global \lName 452 .global _\lName 453.endm 454# 455#=================================== Skein_256 ============================================= 456# 457.if _USE_ASM_ & 256 458# 459# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# 460# 461################# 462# 463# code 464# 465C_label Skein_256_Process_Block 466 Setup_Stack 256,((ROUNDS_256/8)+1) 467 movq TWEAK+8(%rdi),%r14 468 jmp Skein_256_block_loop 469 .p2align 4 470 # main hash loop for Skein_256 471Skein_256_block_loop: 472 # 473 # general register usage: 474 # RAX..RDX = X0..X3 475 # R08..R12 = ks[0..4] 476 # R13..R15 = ts[0..2] 477 # RSP, RBP = stack/frame pointers 478 # RDI = round counter or context pointer 479 # RSI = temp 480 # 481 movq TWEAK+0(%rdi) ,%r13 482 addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0 483 movq %r14 ,%r15 484 xorq %r13 ,%r15 #now %r13.%r15 is set as the tweak 485 486 movq $KW_PARITY ,%r12 487 movq X_VARS+ 0(%rdi),%r8 488 movq X_VARS+ 8(%rdi),%r9 489 movq X_VARS+16(%rdi),%r10 490 movq X_VARS+24(%rdi),%r11 491 movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0] 492 xorq %r8 ,%r12 #start accumulating overall parity 493 494 movq blkPtr +F_O(%rbp) ,%rsi #esi --> input block 495 xorq %r9 ,%r12 496 movq 0(%rsi) ,%rax #get X[0..3] 497 xorq %r10 ,%r12 498 movq 8(%rsi) ,%rbx 499 xorq %r11 ,%r12 500 movq 16(%rsi) ,%rcx 501 movq 24(%rsi) ,%rdx 502 503 movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block 504 movq %rbx,Wcopy+ 8+F_O(%rbp) 505 movq %rcx,Wcopy+16+F_O(%rbp) 506 movq %rdx,Wcopy+24+F_O(%rbp) 507 508 addq %r8 ,%rax #initial key injection 509 addq %r9 ,%rbx 510 addq %r10,%rcx 511 addq %r11,%rdx 512 addq %r13,%rbx 513 addq %r14,%rcx 514 515.if _SKEIN_DEBUG 516 movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?) 517 movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block 518 movq %r9 ,ksKey+ 8+F_O(%rbp) 519 movq %r10,ksKey+16+F_O(%rbp) 520 movq %r11,ksKey+24+F_O(%rbp) 521 movq %r12,ksKey+32+F_O(%rbp) 522 523 movq %r13,ksTwk+ 0+F_O(%rbp) 524 movq %r14,ksTwk+ 8+F_O(%rbp) 525 movq %r15,ksTwk+16+F_O(%rbp) 526 527 movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block 528 movq %rbx,X_stk + 8(%rsp) 529 movq %rcx,X_stk +16(%rsp) 530 movq %rdx,X_stk +24(%rsp) 531 532 Skein_Debug_Block 256 #debug dump 533 Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL 534.endif 535# 536.if (((SKEIN_ASM_UNROLL) & 256) == 0) 537 movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code 538 movq %r9 ,ksKey+ 8+F_O(%rbp) 539 movq %r10,ksKey+16+F_O(%rbp) 540 movq %r11,ksKey+24+F_O(%rbp) 541 movq %r12,ksKey+32+F_O(%rbp) 542 543 movq %r13,ksTwk+24+F_O(%rbp) 544 movq %r14,ksTwk+ 8+F_O(%rbp) 545 movq %r15,ksTwk+16+F_O(%rbp) 546.endif 547 addq $WCNT*8,%rsi #skip the block 548 movq %rsi,blkPtr +F_O(%rbp) #update block pointer 549 # 550 # now the key schedule is computed. Start the rounds 551 # 552.if (SKEIN_ASM_UNROLL) & 256 553_UNROLL_CNT = ROUNDS_256/8 554.else 555_UNROLL_CNT = SKEIN_UNROLL_256 556 .if ((ROUNDS_256/8) % _UNROLL_CNT) 557 .error "Invalid SKEIN_UNROLL_256" 558 .endif 559 xorq %rdi,%rdi #rdi = iteration count 560Skein_256_round_loop: 561.endif 562_Rbase_ = 0 563.rept _UNROLL_CNT*2 564 # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled) 565 # round 4*_RBase_ + 0 566 addReg rax, rbx 567 RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0 568 addReg rcx, rdx 569 .if ((SKEIN_ASM_UNROLL) & 256) == 0 570 movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8 571 .endif 572 xorReg rbx, rax 573 RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1 574 xorReg rdx, rcx 575 .if (SKEIN_ASM_UNROLL) & 256 576 .irp _r0_,%( 8+(_Rbase_+3) % 5) 577 .irp _r1_,%(13+(_Rbase_+2) % 3) 578 leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx 579 .endr 580 .endr 581 .endif 582 .if ((SKEIN_ASM_UNROLL) & 256) == 0 583 movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13 584 .endif 585 Skein_Debug_Round 256,%(4*_Rbase_+1) 586 587 # round 4*_Rbase_ + 1 588 addReg rax, rdx 589 RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0 590 xorReg rdx, rax 591 .if ((SKEIN_ASM_UNROLL) & 256) == 0 592 movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9 593 .endif 594 addReg rcx, rbx 595 RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1 596 xorReg rbx, rcx 597 .if ((SKEIN_ASM_UNROLL) & 256) == 0 598 movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11 599 .endif 600 Skein_Debug_Round 256,%(4*_Rbase_+2) 601 .if (SKEIN_ASM_UNROLL) & 256 602 .irp _r0_,%( 8+(_Rbase_+2) % 5) 603 .irp _r1_,%(13+(_Rbase_+1) % 3) 604 leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx 605 .endr 606 .endr 607 .endif 608 # round 4*_Rbase_ + 2 609 addReg rax, rbx 610 RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0 611 addReg rcx, rdx 612 .if ((SKEIN_ASM_UNROLL) & 256) == 0 613 movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10 614 .endif 615 xorReg rbx, rax 616 RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1 617 xorReg rdx, rcx 618 .if ((SKEIN_ASM_UNROLL) & 256) == 0 619 movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key 620 leaq 1(%r11,%rdi),%r11 #precompute key + tweak 621 .endif 622 Skein_Debug_Round 256,%(4*_Rbase_+3) 623 # round 4*_Rbase_ + 3 624 addReg rax, rdx 625 RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0 626 addReg rcx, rbx 627 .if ((SKEIN_ASM_UNROLL) & 256) == 0 628 addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak 629 movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak 630 .endif 631 xorReg rdx, rax 632 RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1 633 xorReg rbx, rcx 634 Skein_Debug_Round 256,%(4*_Rbase_+4) 635 .if ((SKEIN_ASM_UNROLL) & 256) == 0 636 addReg r9 ,r13 #precompute key+tweak 637 .endif 638 #inject key schedule words 639_Rbase_ = _Rbase_+1 640 .if (SKEIN_ASM_UNROLL) & 256 641 addReg rax,r,%(8+((_Rbase_+0) % 5)) 642 addReg rbx,rsi 643 addReg rcx,rdi 644 addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_ 645 .else 646 incq %rdi 647 addReg rax,r8 648 addReg rcx,r10 649 addReg rbx,r9 650 addReg rdx,r11 651 .endif 652 Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT 653.endr #rept _UNROLL_CNT 654# 655.if ((SKEIN_ASM_UNROLL) & 256) == 0 656 cmpq $2*(ROUNDS_256/8),%rdi 657 jb Skein_256_round_loop 658.endif # (SKEIN_ASM_UNROLL & 256) == 0 659 movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context 660 661 #---------------------------- 662 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} 663 movq $FIRST_MASK64 ,%r14 664 xorq Wcopy + 0+F_O (%rbp),%rax 665 xorq Wcopy + 8+F_O (%rbp),%rbx 666 xorq Wcopy +16+F_O (%rbp),%rcx 667 xorq Wcopy +24+F_O (%rbp),%rdx 668 andq TWEAK + 8 (%rdi),%r14 669 movq %rax,X_VARS+ 0(%rdi) #store final result 670 movq %rbx,X_VARS+ 8(%rdi) 671 movq %rcx,X_VARS+16(%rdi) 672 movq %rdx,X_VARS+24(%rdi) 673 674 Skein_Debug_Round 256,SKEIN_RND_FEED_FWD 675 676 # go back for more blocks, if needed 677 decq blkCnt+F_O(%rbp) 678 jnz Skein_256_block_loop 679 movq %r14,TWEAK + 8(%rdi) 680 Reset_Stack 681 ret 682Skein_256_Process_Block_End: 683 684 .if _SKEIN_DEBUG 685Skein_Debug_Round_256: #here with rdx == round "number" from macro 686 pushq %rsi #save two regs for BLK_BITS-specific parms 687 pushq %rdi 688 movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi 689 movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it 690 movq %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!) 691 movq %rcx,X_stk+16+F_O(%rbp) 692 movq %rdi,X_stk+24+F_O(%rbp) 693 694 movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr 695 movq $256,%rdi #now <rdi,rsi,rdx> are set for the call 696 jmp Skein_Debug_Round_Common 697 .endif 698# 699.if _SKEIN_CODE_SIZE 700C_label Skein_256_Process_Block_CodeSize 701 movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax 702 ret 703# 704C_label Skein_256_Unroll_Cnt 705 .if _UNROLL_CNT <> ROUNDS_256/8 706 movq $_UNROLL_CNT,%rax 707 .else 708 xorq %rax,%rax 709 .endif 710 ret 711.endif 712# 713.endif #_USE_ASM_ & 256 714# 715#=================================== Skein_512 ============================================= 716# 717.if _USE_ASM_ & 512 718# 719# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd) 720# 721# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7) 722# 723################# 724# MACRO: one round for 512-bit blocks 725# 726.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4 727# 728 addReg r\rn0, r\rn1 729 RotL64 r\rn1, 512,%((\_Rn_) % 8),0 730 xorReg r\rn1, r\rn0 731 \op1 732 addReg r\rn2, r\rn3 733 RotL64 r\rn3, 512,%((\_Rn_) % 8),1 734 xorReg r\rn3, r\rn2 735 \op2 736 addReg r\rn4, r\rn5 737 RotL64 r\rn5, 512,%((\_Rn_) % 8),2 738 xorReg r\rn5, r\rn4 739 \op3 740 addReg r\rn6, r\rn7 741 RotL64 r\rn7, 512,%((\_Rn_) % 8),3 742 xorReg r\rn7, r\rn6 743 \op4 744 Skein_Debug_Round 512,%(\_Rn_+1),-4 745# 746.endm #R_512_OneRound 747# 748################# 749# MACRO: eight rounds for 512-bit blocks 750# 751.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8) 752 .if ((SKEIN_ASM_UNROLL) & 512) 753 # here for fully unrolled case. 754 _II_ = ((\_RR_)/4) + 1 #key injection counter 755 R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx> 756 R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx> 757 R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx> 758 R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>, 759 # inject the key schedule 760 addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8 761 addReg r11, rax 762 addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9 763 addReg r12, rbx 764 addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10 765 addReg r13, rcx 766 addReg r14, rdx 767 addReg r15, rsi,,,(_II_) 768 .else 769 # here for looping case #"rotate" key/tweak schedule (move up on stack) 770 incq %rdi #bump key injection counter 771 R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi> 772 R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)> 773 R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi> 774 R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx> 775 # inject the key schedule 776 addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8 777 addReg r11, rax 778 addReg r12, rbx 779 addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9 780 addReg r13, rcx 781 addReg r14, rdx 782 addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10 783 addReg r15, rsi 784 addReg r15, rdi #inject the round number 785 .endif 786 787 #show the result of the key injection 788 Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT 789.endm #R_512_EightRounds 790# 791################# 792# instantiated code 793# 794C_label Skein_512_Process_Block 795 Setup_Stack 512,ROUNDS_512/8 796 movq TWEAK+ 8(%rdi),%rbx 797 jmp Skein_512_block_loop 798 .p2align 4 799 # main hash loop for Skein_512 800Skein_512_block_loop: 801 # general register usage: 802 # RAX..RDX = temps for key schedule pre-loads 803 # R8 ..R15 = X0..X7 804 # RSP, RBP = stack/frame pointers 805 # RDI = round counter or context pointer 806 # RSI = temp 807 # 808 movq TWEAK + 0(%rdi),%rax 809 addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0 810 movq %rbx,%rcx 811 xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule 812 movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0] 813 movq %rax,ksTwk+ 0+F_O(%rbp) 814 movq $KW_PARITY,%rdx 815 movq blkPtr +F_O(%rbp),%rsi #%rsi --> input block 816 movq %rbx,ksTwk+ 8+F_O(%rbp) 817 movq %rcx,ksTwk+16+F_O(%rbp) 818 .irp _Rn_,8,9,10,11,12,13,14,15 819 movq X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_ 820 xorq %r\_Rn_,%rdx #compute overall parity 821 movq %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp) 822 .endr #load state into %r8 ..%r15, compute parity 823 movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity 824 825 addReg r13,rax #precompute key injection for tweak 826 addReg r14, rbx 827.if _SKEIN_DEBUG 828 movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below 829.endif 830 movq 0(%rsi),%rax #load input block 831 movq 8(%rsi),%rbx 832 movq 16(%rsi),%rcx 833 movq 24(%rsi),%rdx 834 addReg r8 , rax #do initial key injection 835 addReg r9 , rbx 836 movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward 837 movq %rbx,Wcopy+ 8+F_O(%rbp) 838 addReg r10, rcx 839 addReg r11, rdx 840 movq %rcx,Wcopy+16+F_O(%rbp) 841 movq %rdx,Wcopy+24+F_O(%rbp) 842 843 movq 32(%rsi),%rax 844 movq 40(%rsi),%rbx 845 movq 48(%rsi),%rcx 846 movq 56(%rsi),%rdx 847 addReg r12, rax 848 addReg r13, rbx 849 addReg r14, rcx 850 addReg r15, rdx 851 movq %rax,Wcopy+32+F_O(%rbp) 852 movq %rbx,Wcopy+40+F_O(%rbp) 853 movq %rcx,Wcopy+48+F_O(%rbp) 854 movq %rdx,Wcopy+56+F_O(%rbp) 855 856.if _SKEIN_DEBUG 857 .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output 858 movq %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp) 859 .endr 860 861 Skein_Debug_Block 512 #debug dump 862 Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL 863.endif 864 addq $8*WCNT,%rsi #skip the block 865 movq %rsi,blkPtr+F_O(%rbp) #update block pointer 866 # 867 ################# 868 # now the key schedule is computed. Start the rounds 869 # 870.if (SKEIN_ASM_UNROLL) & 512 871_UNROLL_CNT = ROUNDS_512/8 872.else 873_UNROLL_CNT = SKEIN_UNROLL_512 874 .if ((ROUNDS_512/8) % _UNROLL_CNT) 875 .error "Invalid SKEIN_UNROLL_512" 876 .endif 877 xorq %rdi,%rdi #rdi = round counter 878Skein_512_round_loop: 879.endif 880# 881_Rbase_ = 0 882.rept _UNROLL_CNT*2 883 R_512_FourRounds %(4*_Rbase_+00) 884_Rbase_ = _Rbase_+1 885.endr #rept _UNROLL_CNT 886# 887.if ((SKEIN_ASM_UNROLL) & 512) == 0 888 cmpq $2*(ROUNDS_512/8),%rdi 889 jb Skein_512_round_loop 890 movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context 891.endif 892 # end of rounds 893 ################# 894 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} 895 .irp _Rn_,8,9,10,11,12,13,14,15 896 .if (\_Rn_ == 8) 897 movq $FIRST_MASK64,%rbx 898 .endif 899 xorq Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR 900 movq %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi) #and store result 901 .if (\_Rn_ == 14) 902 andq TWEAK+ 8(%rdi),%rbx 903 .endif 904 .endr 905 Skein_Debug_Round 512,SKEIN_RND_FEED_FWD 906 907 # go back for more blocks, if needed 908 decq blkCnt+F_O(%rbp) 909 jnz Skein_512_block_loop 910 movq %rbx,TWEAK + 8(%rdi) 911 912 Reset_Stack 913 ret 914Skein_512_Process_Block_End: 915# 916 .if _SKEIN_DEBUG 917# call here with rdx = "round number" 918Skein_Debug_Round_512: 919 pushq %rsi #save two regs for BLK_BITS-specific parms 920 pushq %rdi 921 .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it 922 movq %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp) 923 .endr 924 movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr 925 movq $512,%rdi #now <rdi,rsi,rdx> are set for the call 926 jmp Skein_Debug_Round_Common 927 .endif 928# 929.if _SKEIN_CODE_SIZE 930C_label Skein_512_Process_Block_CodeSize 931 movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax 932 ret 933# 934C_label Skein_512_Unroll_Cnt 935 .if _UNROLL_CNT <> (ROUNDS_512/8) 936 movq $_UNROLL_CNT,%rax 937 .else 938 xorq %rax,%rax 939 .endif 940 ret 941.endif 942# 943.endif # _USE_ASM_ & 512 944# 945#=================================== Skein1024 ============================================= 946.if _USE_ASM_ & 1024 947# 948# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# 949# 950################# 951# use details of permutation to make register assignments 952# 953o1K_rdi = 0 #offsets in X[] associated with each register 954o1K_rsi = 1 955o1K_rbp = 2 956o1K_rax = 3 957o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate 958o1K_rbx = 5 959o1K_rdx = 7 960o1K_r8 = 8 961o1K_r9 = 9 962o1K_r10 = 10 963o1K_r11 = 11 964o1K_r12 = 12 965o1K_r13 = 13 966o1K_r14 = 14 967o1K_r15 = 15 968# 969rIdx_offs = tmpStk_1024 970# 971.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1 972 addReg \reg0 , \reg1 #perform the MIX 973 RotL64 \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_ 974 xorReg \reg1 , \reg0 975.if ((\_RN0_) & 3) == 3 #time to do key injection? 976 .if _SKEIN_DEBUG 977 movq %\reg0 , xDebug_1024+8*\w0(%rsp) #save intermediate values for Debug_Round 978 movq %\reg1 , xDebug_1024+8*\w1(%rsp) # (before inline key injection) 979 .endif 980_II_ = ((\_RN0_)/4)+1 #injection count 981 .if (SKEIN_ASM_UNROLL) & 1024 #here to do fully unrolled key injection 982 addq ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0 983 addq ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1 984 .if \w1 == 13 #tweak injection 985 addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1 986 .elseif \w0 == 14 987 addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0 988 .elseif \w1 == 15 989 addq $_II_, %\reg1 #(injection counter) 990 .endif 991 .else #here to do looping key injection 992 .if (\w0 == 0) 993 movq %rdi, X_stk+8*\w0(%rsp) #if so, store N0 so we can use reg as index 994 movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi 995 .else 996 addq ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection 997 .endif 998 .if \w1 == 13 #tweak injection 999 addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1 1000 .elseif \w0 == 14 1001 addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0 1002 .elseif \w1 == 15 1003 addReg \reg1,rdi,,,1 #(injection counter) 1004 .endif 1005 addq ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection 1006 .endif 1007.endif 1008 # insert the op provided, .if any 1009 \op1 1010.endm 1011################# 1012# MACRO: four rounds for 1024-bit blocks 1013# 1014.macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4) 1015 # should be here with X4 set properly, X6 stored on stack 1016_Rn_ = (\_RR_) + 0 1017 r1024_Mix 0, 1,rdi,rsi,_Rn_,0 1018 r1024_Mix 2, 3,rbp,rax,_Rn_,1 1019 r1024_Mix 4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate) 1020 r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack 1021 r1024_Mix 10,11,r10,r11,_Rn_,5 1022 r1024_Mix 12,13,r12,r13,_Rn_,6 1023 r1024_Mix 6, 7,rcx,rdx,_Rn_,3 1024 r1024_Mix 14,15,r14,r15,_Rn_,7 1025 .if _SKEIN_DEBUG 1026 Skein_Debug_Round 1024,%(_Rn_+1) 1027 .endif 1028_Rn_ = (\_RR_) + 1 1029 r1024_Mix 0, 9,rdi,r9 ,_Rn_,0 1030 r1024_Mix 2,13,rbp,r13,_Rn_,1 1031 r1024_Mix 6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate) 1032 r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack 1033 r1024_Mix 12, 3,r12,rax,_Rn_,5 1034 r1024_Mix 14, 5,r14,rbx,_Rn_,6 1035 r1024_Mix 4,15,rcx,r15,_Rn_,3 1036 r1024_Mix 8, 1,r8 ,rsi,_Rn_,7 1037 .if _SKEIN_DEBUG 1038 Skein_Debug_Round 1024,%(_Rn_+1) 1039 .endif 1040_Rn_ = (\_RR_) + 2 1041 r1024_Mix 0, 7,rdi,rdx,_Rn_,0 1042 r1024_Mix 2, 5,rbp,rbx,_Rn_,1 1043 r1024_Mix 4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate) 1044 r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack 1045 r1024_Mix 14,13,r14,r13,_Rn_,5 1046 r1024_Mix 8,11,r8 ,r11,_Rn_,6 1047 r1024_Mix 6, 1,rcx,rsi,_Rn_,3 1048 r1024_Mix 10, 9,r10,r9 ,_Rn_,7 1049 .if _SKEIN_DEBUG 1050 Skein_Debug_Round 1024,%(_Rn_+1) 1051 .endif 1052_Rn_ = (\_RR_) + 3 1053 r1024_Mix 0,15,rdi,r15,_Rn_,0 1054 r1024_Mix 2,11,rbp,r11,_Rn_,1 1055 r1024_Mix 6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate) 1056 r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack 1057 r1024_Mix 8, 5,r8 ,rbx,_Rn_,5 1058 r1024_Mix 10, 3,r10,rax,_Rn_,6 1059 r1024_Mix 4, 9,rcx,r9 ,_Rn_,3 1060 r1024_Mix 12, 7,r12,rdx,_Rn_,7 1061 .if _SKEIN_DEBUG 1062 Skein_Debug_Round 1024,%(_Rn_+1) 1063 .endif 1064 1065 .if ((SKEIN_ASM_UNROLL) & 1024) == 0 #here with rdi == rIdx, X0 on stack 1066 #"rotate" the key schedule on the stack 1067i8 = o1K_r8 1068i0 = o1K_rdi 1069 movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack) 1070 movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word 1071 movq %r8 , ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!) 1072 movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word 1073 movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack) 1074 movq X_stk+8*i8(%rsp) ,%r8 #get the reg back 1075 incq %rdi #bump the index 1076 movq %rdi, rIdx_offs (%rsp) #save rdi again 1077 movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back 1078 addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection 1079 .endif 1080 #show the result of the key injection 1081 Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT 1082.endm #r1024_FourRounds 1083# 1084################ 1085# code 1086# 1087C_label Skein1024_Process_Block 1088# 1089 Setup_Stack 1024,ROUNDS_1024/8,WCNT 1090 movq TWEAK+ 8(%rdi),%r9 1091 jmp Skein1024_block_loop 1092 # main hash loop for Skein1024 1093 .p2align 4 1094Skein1024_block_loop: 1095 # general register usage: 1096 # RSP = stack pointer 1097 # RAX..RDX,RSI,RDI = X1, X3..X7 (state words) 1098 # R8 ..R15 = X8..X15 (state words) 1099 # RBP = temp (used for X0 and X2) 1100 # 1101 .if ((SKEIN_ASM_UNROLL) & 1024) == 0 1102 xorq %rax,%rax #init loop index on the stack 1103 movq %rax,rIdx_offs(%rsp) 1104 .endif 1105 movq TWEAK+ 0(%rdi),%r8 1106 addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0 1107 movq %r9 ,%r10 1108 xorq %r8 ,%r10 #%rax/%rbx/%rcx = tweak schedule 1109 movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0] 1110 movq %r8 ,ksTwk+ 0+F_O(%rbp) 1111 movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below 1112 movq %r10,ksTwk+16+F_O(%rbp) 1113 .if _SKEIN_DEBUG 1114 movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block 1115 .endif 1116 movq blkPtr +F_O(%rbp),%rsi # rsi --> input block 1117 movq $KW_PARITY ,%rax #overall key schedule parity 1118 1119 # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3] 1120 .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps 1121 movq X_VARS+8*\_rN_(%rdi),%r14 #get state word 1122 movq 8*\_rN_(%rsi),%r15 #get msg word 1123 xorq %r14,%rax #update key schedule overall parity 1124 movq %r14,ksKey +8*\_rN_+F_O(%rbp) #save key schedule word on stack 1125 movq %r15,Wcopy +8*\_rN_+F_O(%rbp) #save local msg Wcopy 1126 addq %r15,%r14 #do the initial key injection 1127 movq %r14,X_stk +8*\_rN_ (%rsp) #save initial state var on stack 1128 .endr 1129 # now process the rest, using the "real" registers 1130 # (MUST do it in reverse order to inject tweaks r8/r9 first) 1131 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx 1132_oo_ = o1K_\_rr_ #offset assocated with the register 1133 movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context 1134 movq 8*_oo_(%rsi),%rcx #get next input msg word 1135 movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack 1136 xorq %\_rr_, %rax #accumulate key schedule parity 1137 movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward 1138 addq %rcx,%\_rr_ #do the initial key injection 1139 .if _oo_ == 13 #do the initial tweak injection 1140 addReg \_rr_,r8 # (only in words 13/14) 1141 .elseif _oo_ == 14 1142 addReg \_rr_,r9 1143 .endif 1144 .endr 1145 movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity 1146.if _SKEIN_DEBUG 1147 Skein_Debug_Block 1024 #initial debug dump 1148.endif 1149 addq $8*WCNT,%rsi #bump the msg ptr 1150 movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr 1151 # re-load words 0..4 from stack, enter the main loop 1152 .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack) 1153 movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go! 1154 .endr 1155.if _SKEIN_DEBUG 1156 Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection 1157.endif 1158 # 1159 ################# 1160 # now the key schedule is computed. Start the rounds 1161 # 1162.if (SKEIN_ASM_UNROLL) & 1024 1163_UNROLL_CNT = ROUNDS_1024/8 1164.else 1165_UNROLL_CNT = SKEIN_UNROLL_1024 1166 .if ((ROUNDS_1024/8) % _UNROLL_CNT) 1167 .error "Invalid SKEIN_UNROLL_1024" 1168 .endif 1169Skein1024_round_loop: 1170.endif 1171# 1172_Rbase_ = 0 1173.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time 1174 r1024_FourRounds %(4*_Rbase_+00) 1175_Rbase_ = _Rbase_+1 1176.endr #rept _UNROLL_CNT 1177# 1178.if ((SKEIN_ASM_UNROLL) & 1024) == 0 1179 cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done 1180 jb Skein1024_round_loop 1181.endif 1182 # end of rounds 1183 ################# 1184 # 1185 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} 1186 movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack 1187 movq ctxPtr(%rsp),%rdx 1188 1189 .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7 1190_oo_ = o1K_\_rr_ 1191 xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR 1192 movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context 1193 .if (_oo_ == 9) 1194 movq $FIRST_MASK64 ,%r9 1195 .endif 1196 .if (_oo_ == 14) 1197 andq TWEAK+ 8(%rdx),%r9 1198 .endif 1199 .endr 1200 # 1201 movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above) 1202 movq X_stk +8*7(%rsp),%rbx 1203 xorq Wcopy +8*6(%rsp),%rax 1204 xorq Wcopy +8*7(%rsp),%rbx 1205 movq %rax,X_VARS+8*6(%rdx) 1206 decq blkCnt(%rsp) #set zero flag iff done 1207 movq %rbx,X_VARS+8*7(%rdx) 1208 1209 Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)> 1210 # go back for more blocks, if needed 1211 movq ctxPtr(%rsp),%rdi #don't muck with the flags here! 1212 lea FRAME_OFFS(%rsp),%rbp 1213 jnz Skein1024_block_loop 1214 movq %r9 ,TWEAK+ 8(%rdx) 1215 Reset_Stack 1216 ret 1217# 1218Skein1024_Process_Block_End: 1219# 1220.if _SKEIN_DEBUG 1221Skein_Debug_Round_1024: 1222 # call here with rdx = "round number", 1223_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr 1224 # 1225 #save rest of X[] state on stack so debug routines can access it 1226 .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15 1227 movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp) 1228 .endr 1229 # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack 1230 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save 1231 jae save_x0 1232 testq $3,%rdx #otherwise only if rdx != 0 mod 4 1233 jz save_x0_not 1234save_x0: 1235 movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp) 1236save_x0_not: 1237 #figure out the x4/x6 swapping state and save the correct one! 1238 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4 1239 jae save_x4 1240 testq $1,%rdx #and even ones have r4 as well 1241 jz save_x4 1242 movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) 1243 jmp debug_1024_go 1244save_x4: 1245 movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp) 1246debug_1024_go: 1247 #now all is saved in Xstk[] except for rdx 1248 push %rsi #save two regs for BLK_BITS-specific parms 1249 push %rdi 1250_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32) 1251 1252 movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call) 1253 movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[] 1254 1255 movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr 1256 movq $1024,%rdi #rdi = block size 1257 jmp Skein_Debug_Round_Common 1258.endif 1259# 1260.if _SKEIN_CODE_SIZE 1261C_label Skein1024_Process_Block_CodeSize 1262 movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax 1263 ret 1264# 1265C_label Skein1024_Unroll_Cnt 1266 .if _UNROLL_CNT <> (ROUNDS_1024/8) 1267 movq $_UNROLL_CNT,%rax 1268 .else 1269 xorq %rax,%rax 1270 .endif 1271 ret 1272.endif 1273# 1274.endif # _USE_ASM_ and 1024 1275# 1276.if _SKEIN_DEBUG 1277#---------------------------------------------------------------- 1278#local debug routine to set up for calls to: 1279# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X) 1280# [ rdi rsi rdx rcx] 1281# 1282# here with %rdx = round number 1283# %rsi = ctx_hdr_ptr 1284# %rdi = block size (256/512/1024) 1285# on stack: saved rdi, saved rsi, retAddr, saved rdx 1286# 1287Skein_Debug_Round_Common: 1288_SP_OFFS_ = 32 #account for four words on stack already 1289 .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs 1290 pushq %\_rr_ 1291_SP_OFFS_ = _SP_OFFS_+8 1292 .endr 1293 .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here 1294 .error "Debug_Round_Common: stack alignment" 1295 .endif 1296 # compute %rcx = ptr to the X[] array on the stack (final parameter to call) 1297 leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address 1298 cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"? 1299 jnz _got_rcxA 1300 leaq X_VARS(%rsi),%rcx 1301_got_rcxA: 1302 .if _USE_ASM_ & 1024 1303 # special handling for 1024-bit case 1304 # (for rounds right before with key injection: 1305 # use xDebug_1024[] instead of X_stk[]) 1306 cmpq $SKEIN_RND_SPECIAL,%rdx 1307 jae _got_rcxB #must be a normal round 1308 orq %rdx,%rdx 1309 jz _got_rcxB #just before key injection 1310 test $3,%rdx 1311 jne _got_rcxB 1312 cmp $1024,%rdi #only 1024-bit(s) for now 1313 jne _got_rcxB 1314 leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx 1315_got_rcxB: 1316 .endif 1317 call Skein_Show_Round #call external debug handler 1318 1319 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs 1320 popq %\_rr_ 1321_SP_OFFS_ = _SP_OFFS_-8 1322 .endr 1323 .if _SP_OFFS_ - 32 1324 .error "Debug_Round_Common: push/pop misalignment!" 1325 .endif 1326 popq %rdi 1327 popq %rsi 1328 ret 1329.endif 1330#---------------------------------------------------------------- 1331 .section .note.GNU-stack,"",@progbits 1332 1333 .end 1334