/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * void sha1_transform_avx2(struct sha1_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */

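/*
 * The code below processes the input in pairs of 64-byte blocks and is
 * software pipelined: while the rounds for the current pair of blocks
 * execute, the PRECALC_* macros interleaved into the round code already
 * compute the K+W[] message schedule for the *next* pair.  Two stack
 * buffers (PRECALC_BUF/WK_BUF) are swapped after every pass, so the
 * rounds always consume values produced during the previous pass.
 */
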
#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)

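/*
 * Layout of the scheduling buffers: the precomputed K+W[] values of two
 * blocks are interleaved.  Each 32-byte slot holds four dwords for the
 * first block in its low 16 bytes and the corresponding four dwords for
 * the second block in its high 16 bytes, matching the 256-bit stores
 * issued by the PRECALC_* macros below.  WK(t) therefore addresses
 * round t of the first block for t < 80, and round t-80 of the second
 * block for 80 <= t < 160.
 */
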
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_32_79
	/*
	 * in the SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we use the equivalent:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * which allows more efficient vectorization,
	 * since the w[i]->w[i-3] dependency is broken
	 */

	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm

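/*
 * RR emits two rounds per invocation, and each ROUND_F* macro computes
 * F for the *following* round.  That is why ROUND_FUNC switches at
 * round_id 18, 38 and 58 rather than at the 20/40/60 boundaries of the
 * SHA-1 specification: the macro running round 19 must already use the
 * round-20 (XOR) formula when it prepares F, and likewise for rounds
 * 39 and 59.
 */
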
.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/*
	 * Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */
.endm

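/*
 * Cross-round pipelining used by ROUND_F1/F2/F3: each macro adds WK(r),
 * the F value prepared by the previous round (picked up as TB/RTB after
 * ROTATE_STATE renames the aliases) and a>>>5 into E, and in parallel
 * computes F for the *next* round into the register currently aliased
 * as A.  Round 79 has nothing left to prepare, hence the round_id < 79
 * guards in ROUND_F2.
 */
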
/* Add constant \d to \a only if \b >= \c (uses RTA as temp) */
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
.L_loop:
	/*
	 * code loops through more than one block
	 * we exit the loop once BLOCKS_CTR, decremented below, reaches zero
	 */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	.L_begin
	.align 32
	jmp	.L_end
	.align 32
.L_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */
	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET

SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

/* vpshufb control: byte-swap each 32-bit word (big-endian message load) */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2

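/*
 * Note: besides AVX2, the round code above relies on the BMI1/BMI2
 * instructions andn and rorx, so the code that selects this
 * implementation is expected to check for those CPU features as well.
 */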