/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX         %rdi    // arg1
#define BUF         %rsi    // arg2
#define CNT         %rdx    // arg3

#define REG_A       %ecx
#define REG_B       %esi
#define REG_C       %edi
#define REG_D       %r12d
#define REG_E       %edx

#define REG_T1      %eax
#define REG_T2      %ebx

#define K_BASE      %r8
#define HASH_PTR    %r9
#define BUFFER_PTR  %r10
#define BUFFER_END  %r11

#define W_TMP1      %xmm0
#define W_TMP2      %xmm9

#define W0          %xmm1
#define W4          %xmm2
#define W8          %xmm3
#define W12         %xmm4
#define W16         %xmm5
#define W20         %xmm6
#define W24         %xmm7
#define W28         %xmm8

#define XMM_SHUFB_BSWAP %xmm10

/* we keep a circular window of 16 pre-calculated w[i]+K values (64 bytes) */
#define WK(t)   (((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD 16

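/*
 * Illustrative sketch of the WK() addressing above (added for clarity,
 * C-style pseudocode, not part of the original sources): the workspace on
 * the stack holds 16 dword slots indexed modulo 16, so the pre-calculation
 * can run W_PRECALC_AHEAD (16) rounds ahead of the rounds consuming them:
 *
 *     u32 wk[16];                  // the 64-byte workspace at (%rsp)..63(%rsp)
 *     #define WK(t) wk[(t) & 15]   // slot reused every 16 rounds
 *
 *     WK(round + W_PRECALC_AHEAD) = w[round + 16] + K(round + 16); // producer
 *     e += WK(round);                                              // consumer
 */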
/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: the function's name
 */
.macro SHA1_VECTOR_ASM  name
SYM_FUNC_START(\name)

        push    %rbx
        push    %r12
        push    %rbp
        mov     %rsp, %rbp

        sub     $64, %rsp               # allocate workspace
        and     $~15, %rsp              # align stack

        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        shl     $6, CNT                 # multiply by 64
        add     BUF, CNT
        mov     CNT, BUFFER_END

        lea     K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        # cleanup workspace
        mov     $8, %ecx
        mov     %rsp, %rdi
        xor     %eax, %eax
        rep stosq

        mov     %rbp, %rsp              # deallocate workspace
        pop     %rbp
        pop     %r12
        pop     %rbx
        ret

SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov       (HASH_PTR), A
        mov      4(HASH_PTR), B
        mov      8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

        .set i, 0
        .rept W_PRECALC_AHEAD
                W_PRECALC i
                .set i, (i+1)
        .endr

.align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add     $64, BUFFER_PTR         # move to the next 64-byte block
        cmp     BUFFER_END, BUFFER_PTR  # if the current is the last one use
        cmovae  K_BASE, BUFFER_PTR      # dummy source to avoid buffer overrun

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH   (HASH_PTR), A
        UPDATE_HASH  4(HASH_PTR), B
        UPDATE_HASH  8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS
        cmp     K_BASE, BUFFER_PTR      # BUFFER_PTR == K_BASE means we reached the end
        jne     1b
.endm

.macro INIT_REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set T1, REG_T1
        .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
        # order is important (REG_C is where it should be)
        mov     B, REG_B
        mov     D, REG_D
        mov     A, REG_A
        mov     E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
        .set _T, \a
        .set \a, \b
        .set \b, _T
.endm
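/*
 * For reference, the round functions implemented by the F1-F4 macros below
 * (added C-style sketch for clarity; not part of the original sources):
 *
 *     F1(b,c,d) = (b & c) | (~b & d)           computed as d ^ (b & (c ^ d))
 *     F2(b,c,d) = b ^ c ^ d
 *     F3(b,c,d) = (b & c) | (b & d) | (c & d)  computed as ((b | c) & d) | (b & c)
 *     F4(b,c,d) = F2(b,c,d)
 *
 * Each macro leaves its result in T1; register names are rotated via
 * SWAP_REG_NAMES rather than moving data back.
 */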
.macro F1  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        xor     \d, T1
        and     \b, T1
        xor     \d, T1
.endm

.macro F2  b, c, d
        mov     \d, T1
        SWAP_REG_NAMES \d, T1
        xor     \c, T1
        xor     \b, T1
.endm

.macro F3  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        mov     \b, T2
        or      \b, T1
        and     \c, T2
        and     \d, T1
        or      T2, T1
.endm

.macro F4  b, c, d
        F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
        add     WK(\round), \e
        \F      \b, \c, \d      # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol     $30, \b
        add     T1, \e
        add     WK(\round + 1), \d

        \F      \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol     $5, \a
        add     \a, \e
        add     T1, \d
        ror     $7, \a          # (a rol 5) ror 7 => a rol 30

        mov     \e, T1
        SWAP_REG_NAMES \e, T1

        rol     $5, T1
        add     T1, \d

        # write:  \a, \b
        # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC  r
        .set i, \r

        .if (i < 20)
                .set K_XMM, 0
        .elseif (i < 40)
                .set K_XMM, 16
        .elseif (i < 60)
                .set K_XMM, 32
        .elseif (i < 80)
                .set K_XMM, 48
        .endif

        .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
                .set i, ((\r) % 80)     # pre-compute for the next iteration
                .if (i == 0)
                        W_PRECALC_RESET
                .endif
                W_PRECALC_00_15
        .elseif (i < 32)
                W_PRECALC_16_31
        .elseif (i < 80)        // rounds 32-79
                W_PRECALC_32_79
        .endif
.endm
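/*
 * Added note (not part of the original sources): W, W_minus_04 ... W_minus_32
 * are symbolic names for the eight XMM registers that hold the most recent
 * 32 w[] dwords, four per register. W_PRECALC_RESET binds the names to
 * concrete registers and W_PRECALC_ROTATE shifts the names down by one
 * register every four rounds, so "W" always refers to the four w[] values
 * currently being computed.
 */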
.macro W_PRECALC_RESET
        .set W,          W0
        .set W_minus_04, W4
        .set W_minus_08, W8
        .set W_minus_12, W12
        .set W_minus_16, W16
        .set W_minus_20, W20
        .set W_minus_24, W24
        .set W_minus_28, W28
        .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
        .set W_minus_32, W_minus_28
        .set W_minus_28, W_minus_24
        .set W_minus_24, W_minus_20
        .set W_minus_20, W_minus_16
        .set W_minus_16, W_minus_12
        .set W_minus_12, W_minus_08
        .set W_minus_08, W_minus_04
        .set W_minus_04, W
        .set W, W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
        .if ((i & 3) == 0)
                movdqu  (i*4)(BUFFER_PTR), W_TMP1
        .elseif ((i & 3) == 1)
                pshufb  XMM_SHUFB_BSWAP, W_TMP1
                movdqa  W_TMP1, W
        .elseif ((i & 3) == 2)
                paddd   (K_BASE), W_TMP1
        .elseif ((i & 3) == 3)
                movdqa  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm
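/*
 * Scalar reference for the message schedule computed below (added sketch,
 * not from the original sources): for i >= 16 the SHA-1 specification
 * defines
 *
 *     w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 * and every WK(i) stack slot ends up holding w[i] + K(i), so the round code
 * only needs a single add with a memory operand.
 */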
/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization is needed for rounds 16-31 due to the
 * w[i]->w[i-3] dependency, but it improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
        # blended scheduling of vector and scalar instruction streams, one 4-wide
        # vector iteration / 4 scalar rounds
        .if ((i & 3) == 0)
                movdqa  W_minus_12, W
                palignr $8, W_minus_16, W       # w[i-14]
                movdqa  W_minus_04, W_TMP1
                psrldq  $4, W_TMP1              # w[i-3]
                pxor    W_minus_08, W
        .elseif ((i & 3) == 1)
                pxor    W_minus_16, W_TMP1
                pxor    W_TMP1, W
                movdqa  W, W_TMP2
                movdqa  W, W_TMP1
                pslldq  $12, W_TMP2
        .elseif ((i & 3) == 2)
                psrld   $31, W
                pslld   $1, W_TMP1
                por     W, W_TMP1
                movdqa  W_TMP2, W
                psrld   $30, W_TMP2
                pslld   $2, W
        .elseif ((i & 3) == 3)
                pxor    W, W_TMP1
                pxor    W_TMP2, W_TMP1
                movdqa  W_TMP1, W
                paddd   K_XMM(K_BASE), W_TMP1
                movdqa  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification:    w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we compute the equivalent:
 *                            w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i]=>w[i-3] dependency
 * is broken
 */
.macro W_PRECALC_32_79_SSSE3
        .if ((i & 3) == 0)
                movdqa  W_minus_04, W_TMP1
                pxor    W_minus_28, W           # W is W_minus_32 before xor
                palignr $8, W_minus_08, W_TMP1
        .elseif ((i & 3) == 1)
                pxor    W_minus_16, W
                pxor    W_TMP1, W
                movdqa  W, W_TMP1
        .elseif ((i & 3) == 2)
                psrld   $30, W
                pslld   $2, W_TMP1
                por     W, W_TMP1
        .elseif ((i & 3) == 3)
                movdqa  W_TMP1, W
                paddd   K_XMM(K_BASE), W_TMP1
                movdqa  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

.endm           // W_PRECALC_SSSE3


#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu  \a,\b
.endm

/* SSSE3 optimized implementation:
 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                      unsigned int rounds);
 */
SHA1_VECTOR_ASM sha1_transform_ssse3

#ifdef CONFIG_AS_AVX
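/*
 * Added note (not part of the original sources): the AVX variants below are
 * functionally identical to the SSSE3 ones, but use the non-destructive
 * three-operand VEX forms (e.g. vpxor src1, src2, dst), which removes most of
 * the movdqa register-to-register copies needed by the SSSE3 code path.
 */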
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro W_PRECALC_00_15
        W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro W_PRECALC_16_31
        W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro W_PRECALC_32_79
        W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
        .if ((i & 3) == 0)
                vmovdqu (i*4)(BUFFER_PTR), W_TMP1
        .elseif ((i & 3) == 1)
                vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
        .elseif ((i & 3) == 2)
                vpaddd  (K_BASE), W, W_TMP1
        .elseif ((i & 3) == 3)
                vmovdqa W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

.macro W_PRECALC_16_31_AVX
        .if ((i & 3) == 0)
                vpalignr $8, W_minus_16, W_minus_12, W  # w[i-14]
                vpsrldq  $4, W_minus_04, W_TMP1         # w[i-3]
                vpxor    W_minus_08, W, W
                vpxor    W_minus_16, W_TMP1, W_TMP1
        .elseif ((i & 3) == 1)
                vpxor    W_TMP1, W, W
                vpslldq  $12, W, W_TMP2
                vpslld   $1, W, W_TMP1
        .elseif ((i & 3) == 2)
                vpsrld   $31, W, W
                vpor     W, W_TMP1, W_TMP1
                vpslld   $2, W_TMP2, W
                vpsrld   $30, W_TMP2, W_TMP2
        .elseif ((i & 3) == 3)
                vpxor    W, W_TMP1, W_TMP1
                vpxor    W_TMP2, W_TMP1, W
                vpaddd   K_XMM(K_BASE), W, W_TMP1
                vmovdqu  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

.macro W_PRECALC_32_79_AVX
        .if ((i & 3) == 0)
                vpalignr $8, W_minus_08, W_minus_04, W_TMP1
                vpxor    W_minus_28, W, W       # W is W_minus_32 before xor
        .elseif ((i & 3) == 1)
                vpxor    W_minus_16, W_TMP1, W_TMP1
                vpxor    W_TMP1, W, W
        .elseif ((i & 3) == 2)
                vpslld   $2, W, W_TMP1
                vpsrld   $30, W, W
                vpor     W, W_TMP1, W
        .elseif ((i & 3) == 3)
                vpaddd   K_XMM(K_BASE), W, W_TMP1
                vmovdqu  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

.endm           // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(u32 *digest, const char *data,
 *                                    unsigned int rounds);
 */
SHA1_VECTOR_ASM sha1_transform_avx

#endif
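/*
 * Added usage sketch (hedged; variable names are assumptions, not taken from
 * this file): both entry points process 'rounds' consecutive 64-byte blocks
 * and, like any in-kernel SIMD user, must run between kernel_fpu_begin() and
 * kernel_fpu_end(), e.g. from the C glue code:
 *
 *     kernel_fpu_begin();
 *     sha1_transform_ssse3(sctx->state, data, blocks);  // or sha1_transform_avx
 *     kernel_fpu_end();
 */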