/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 */
2066be8951SMathias Krause * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> 2166be8951SMathias Krause * Ronen Zohar <ronen.zohar@intel.com> 2266be8951SMathias Krause * 2366be8951SMathias Krause * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: 2466be8951SMathias Krause * Author: Mathias Krause <minipli@googlemail.com> 2566be8951SMathias Krause */ 2666be8951SMathias Krause 27ac9d55ddSJussi Kivilinna#include <linux/linkage.h> 28*32f34bf7SEric Biggers#include <linux/cfi_types.h> 29ac9d55ddSJussi Kivilinna 3066be8951SMathias Krause#define CTX %rdi // arg1 3166be8951SMathias Krause#define BUF %rsi // arg2 3266be8951SMathias Krause#define CNT %rdx // arg3 3366be8951SMathias Krause 3466be8951SMathias Krause#define REG_A %ecx 3566be8951SMathias Krause#define REG_B %esi 3666be8951SMathias Krause#define REG_C %edi 376488bce7SJosh Poimboeuf#define REG_D %r12d 3866be8951SMathias Krause#define REG_E %edx 3966be8951SMathias Krause 4066be8951SMathias Krause#define REG_T1 %eax 4166be8951SMathias Krause#define REG_T2 %ebx 4266be8951SMathias Krause 4366be8951SMathias Krause#define K_BASE %r8 4466be8951SMathias Krause#define HASH_PTR %r9 4566be8951SMathias Krause#define BUFFER_PTR %r10 4666be8951SMathias Krause#define BUFFER_END %r11 4766be8951SMathias Krause 4866be8951SMathias Krause#define W_TMP1 %xmm0 4966be8951SMathias Krause#define W_TMP2 %xmm9 5066be8951SMathias Krause 5166be8951SMathias Krause#define W0 %xmm1 5266be8951SMathias Krause#define W4 %xmm2 5366be8951SMathias Krause#define W8 %xmm3 5466be8951SMathias Krause#define W12 %xmm4 5566be8951SMathias Krause#define W16 %xmm5 5666be8951SMathias Krause#define W20 %xmm6 5766be8951SMathias Krause#define W24 %xmm7 5866be8951SMathias Krause#define W28 %xmm8 5966be8951SMathias Krause 6066be8951SMathias Krause#define XMM_SHUFB_BSWAP %xmm10 6166be8951SMathias Krause 6266be8951SMathias Krause/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ 6366be8951SMathias Krause#define 
WK(t) (((t) & 15) * 4)(%rsp) 6466be8951SMathias Krause#define W_PRECALC_AHEAD 16 6566be8951SMathias Krause 6666be8951SMathias Krause/* 6766be8951SMathias Krause * This macro implements the SHA-1 function's body for single 64-byte block 6866be8951SMathias Krause * param: function's name 6966be8951SMathias Krause */ 7066be8951SMathias Krause.macro SHA1_VECTOR_ASM name 71*32f34bf7SEric Biggers SYM_TYPED_FUNC_START(\name) 72ac9d55ddSJussi Kivilinna 7366be8951SMathias Krause push %rbx 7466be8951SMathias Krause push %r12 756488bce7SJosh Poimboeuf push %rbp 766488bce7SJosh Poimboeuf mov %rsp, %rbp 7766be8951SMathias Krause 7866be8951SMathias Krause sub $64, %rsp # allocate workspace 7966be8951SMathias Krause and $~15, %rsp # align stack 8066be8951SMathias Krause 8166be8951SMathias Krause mov CTX, HASH_PTR 8266be8951SMathias Krause mov BUF, BUFFER_PTR 8366be8951SMathias Krause 8466be8951SMathias Krause shl $6, CNT # multiply by 64 8566be8951SMathias Krause add BUF, CNT 8666be8951SMathias Krause mov CNT, BUFFER_END 8766be8951SMathias Krause 8866be8951SMathias Krause lea K_XMM_AR(%rip), K_BASE 8966be8951SMathias Krause xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP 9066be8951SMathias Krause 9166be8951SMathias Krause SHA1_PIPELINED_MAIN_BODY 9266be8951SMathias Krause 9366be8951SMathias Krause # cleanup workspace 9466be8951SMathias Krause mov $8, %ecx 9566be8951SMathias Krause mov %rsp, %rdi 96a7bea830SJan Beulich xor %eax, %eax 9766be8951SMathias Krause rep stosq 9866be8951SMathias Krause 996488bce7SJosh Poimboeuf mov %rbp, %rsp # deallocate workspace 10066be8951SMathias Krause pop %rbp 1016488bce7SJosh Poimboeuf pop %r12 10266be8951SMathias Krause pop %rbx 103f94909ceSPeter Zijlstra RET 10466be8951SMathias Krause 1056dcc5627SJiri Slaby SYM_FUNC_END(\name) 10666be8951SMathias Krause.endm 10766be8951SMathias Krause 10866be8951SMathias Krause/* 10966be8951SMathias Krause * This macro implements 80 rounds of SHA-1 for one 64-byte block 11066be8951SMathias Krause */ 
11166be8951SMathias Krause.macro SHA1_PIPELINED_MAIN_BODY 11266be8951SMathias Krause INIT_REGALLOC 11366be8951SMathias Krause 11466be8951SMathias Krause mov (HASH_PTR), A 11566be8951SMathias Krause mov 4(HASH_PTR), B 11666be8951SMathias Krause mov 8(HASH_PTR), C 11766be8951SMathias Krause mov 12(HASH_PTR), D 11866be8951SMathias Krause mov 16(HASH_PTR), E 11966be8951SMathias Krause 12066be8951SMathias Krause .set i, 0 12166be8951SMathias Krause .rept W_PRECALC_AHEAD 12266be8951SMathias Krause W_PRECALC i 12366be8951SMathias Krause .set i, (i+1) 12466be8951SMathias Krause .endr 12566be8951SMathias Krause 12666be8951SMathias Krause.align 4 12766be8951SMathias Krause1: 12866be8951SMathias Krause RR F1,A,B,C,D,E,0 12966be8951SMathias Krause RR F1,D,E,A,B,C,2 13066be8951SMathias Krause RR F1,B,C,D,E,A,4 13166be8951SMathias Krause RR F1,E,A,B,C,D,6 13266be8951SMathias Krause RR F1,C,D,E,A,B,8 13366be8951SMathias Krause 13466be8951SMathias Krause RR F1,A,B,C,D,E,10 13566be8951SMathias Krause RR F1,D,E,A,B,C,12 13666be8951SMathias Krause RR F1,B,C,D,E,A,14 13766be8951SMathias Krause RR F1,E,A,B,C,D,16 13866be8951SMathias Krause RR F1,C,D,E,A,B,18 13966be8951SMathias Krause 14066be8951SMathias Krause RR F2,A,B,C,D,E,20 14166be8951SMathias Krause RR F2,D,E,A,B,C,22 14266be8951SMathias Krause RR F2,B,C,D,E,A,24 14366be8951SMathias Krause RR F2,E,A,B,C,D,26 14466be8951SMathias Krause RR F2,C,D,E,A,B,28 14566be8951SMathias Krause 14666be8951SMathias Krause RR F2,A,B,C,D,E,30 14766be8951SMathias Krause RR F2,D,E,A,B,C,32 14866be8951SMathias Krause RR F2,B,C,D,E,A,34 14966be8951SMathias Krause RR F2,E,A,B,C,D,36 15066be8951SMathias Krause RR F2,C,D,E,A,B,38 15166be8951SMathias Krause 15266be8951SMathias Krause RR F3,A,B,C,D,E,40 15366be8951SMathias Krause RR F3,D,E,A,B,C,42 15466be8951SMathias Krause RR F3,B,C,D,E,A,44 15566be8951SMathias Krause RR F3,E,A,B,C,D,46 15666be8951SMathias Krause RR F3,C,D,E,A,B,48 15766be8951SMathias Krause 15866be8951SMathias Krause RR F3,A,B,C,D,E,50 
15966be8951SMathias Krause RR F3,D,E,A,B,C,52 16066be8951SMathias Krause RR F3,B,C,D,E,A,54 16166be8951SMathias Krause RR F3,E,A,B,C,D,56 16266be8951SMathias Krause RR F3,C,D,E,A,B,58 16366be8951SMathias Krause 16466be8951SMathias Krause add $64, BUFFER_PTR # move to the next 64-byte block 16566be8951SMathias Krause cmp BUFFER_END, BUFFER_PTR # if the current is the last one use 16666be8951SMathias Krause cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun 16766be8951SMathias Krause 16866be8951SMathias Krause RR F4,A,B,C,D,E,60 16966be8951SMathias Krause RR F4,D,E,A,B,C,62 17066be8951SMathias Krause RR F4,B,C,D,E,A,64 17166be8951SMathias Krause RR F4,E,A,B,C,D,66 17266be8951SMathias Krause RR F4,C,D,E,A,B,68 17366be8951SMathias Krause 17466be8951SMathias Krause RR F4,A,B,C,D,E,70 17566be8951SMathias Krause RR F4,D,E,A,B,C,72 17666be8951SMathias Krause RR F4,B,C,D,E,A,74 17766be8951SMathias Krause RR F4,E,A,B,C,D,76 17866be8951SMathias Krause RR F4,C,D,E,A,B,78 17966be8951SMathias Krause 18066be8951SMathias Krause UPDATE_HASH (HASH_PTR), A 18166be8951SMathias Krause UPDATE_HASH 4(HASH_PTR), B 18266be8951SMathias Krause UPDATE_HASH 8(HASH_PTR), C 18366be8951SMathias Krause UPDATE_HASH 12(HASH_PTR), D 18466be8951SMathias Krause UPDATE_HASH 16(HASH_PTR), E 18566be8951SMathias Krause 18666be8951SMathias Krause RESTORE_RENAMED_REGS 18766be8951SMathias Krause cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end 18866be8951SMathias Krause jne 1b 18966be8951SMathias Krause.endm 19066be8951SMathias Krause 19166be8951SMathias Krause.macro INIT_REGALLOC 19266be8951SMathias Krause .set A, REG_A 19366be8951SMathias Krause .set B, REG_B 19466be8951SMathias Krause .set C, REG_C 19566be8951SMathias Krause .set D, REG_D 19666be8951SMathias Krause .set E, REG_E 19766be8951SMathias Krause .set T1, REG_T1 19866be8951SMathias Krause .set T2, REG_T2 19966be8951SMathias Krause.endm 20066be8951SMathias Krause 20166be8951SMathias Krause.macro RESTORE_RENAMED_REGS 
20266be8951SMathias Krause # order is important (REG_C is where it should be) 20366be8951SMathias Krause mov B, REG_B 20466be8951SMathias Krause mov D, REG_D 20566be8951SMathias Krause mov A, REG_A 20666be8951SMathias Krause mov E, REG_E 20766be8951SMathias Krause.endm 20866be8951SMathias Krause 20966be8951SMathias Krause.macro SWAP_REG_NAMES a, b 21066be8951SMathias Krause .set _T, \a 21166be8951SMathias Krause .set \a, \b 21266be8951SMathias Krause .set \b, _T 21366be8951SMathias Krause.endm 21466be8951SMathias Krause 21566be8951SMathias Krause.macro F1 b, c, d 21666be8951SMathias Krause mov \c, T1 21766be8951SMathias Krause SWAP_REG_NAMES \c, T1 21866be8951SMathias Krause xor \d, T1 21966be8951SMathias Krause and \b, T1 22066be8951SMathias Krause xor \d, T1 22166be8951SMathias Krause.endm 22266be8951SMathias Krause 22366be8951SMathias Krause.macro F2 b, c, d 22466be8951SMathias Krause mov \d, T1 22566be8951SMathias Krause SWAP_REG_NAMES \d, T1 22666be8951SMathias Krause xor \c, T1 22766be8951SMathias Krause xor \b, T1 22866be8951SMathias Krause.endm 22966be8951SMathias Krause 23066be8951SMathias Krause.macro F3 b, c ,d 23166be8951SMathias Krause mov \c, T1 23266be8951SMathias Krause SWAP_REG_NAMES \c, T1 23366be8951SMathias Krause mov \b, T2 23466be8951SMathias Krause or \b, T1 23566be8951SMathias Krause and \c, T2 23666be8951SMathias Krause and \d, T1 23766be8951SMathias Krause or T2, T1 23866be8951SMathias Krause.endm 23966be8951SMathias Krause 24066be8951SMathias Krause.macro F4 b, c, d 24166be8951SMathias Krause F2 \b, \c, \d 24266be8951SMathias Krause.endm 24366be8951SMathias Krause 24466be8951SMathias Krause.macro UPDATE_HASH hash, val 24566be8951SMathias Krause add \hash, \val 24666be8951SMathias Krause mov \val, \hash 24766be8951SMathias Krause.endm 24866be8951SMathias Krause 24966be8951SMathias Krause/* 25066be8951SMathias Krause * RR does two rounds of SHA-1 back to back with W[] pre-calc 25166be8951SMathias Krause * t1 = F(b, c, d); e += w(i) 
25266be8951SMathias Krause * e += t1; b <<= 30; d += w(i+1); 25366be8951SMathias Krause * t1 = F(a, b, c); 25466be8951SMathias Krause * d += t1; a <<= 5; 25566be8951SMathias Krause * e += a; 25666be8951SMathias Krause * t1 = e; a >>= 7; 25766be8951SMathias Krause * t1 <<= 5; 25866be8951SMathias Krause * d += t1; 25966be8951SMathias Krause */ 26066be8951SMathias Krause.macro RR F, a, b, c, d, e, round 26166be8951SMathias Krause add WK(\round), \e 26266be8951SMathias Krause \F \b, \c, \d # t1 = F(b, c, d); 26366be8951SMathias Krause W_PRECALC (\round + W_PRECALC_AHEAD) 26466be8951SMathias Krause rol $30, \b 26566be8951SMathias Krause add T1, \e 26666be8951SMathias Krause add WK(\round + 1), \d 26766be8951SMathias Krause 26866be8951SMathias Krause \F \a, \b, \c 26966be8951SMathias Krause W_PRECALC (\round + W_PRECALC_AHEAD + 1) 27066be8951SMathias Krause rol $5, \a 27166be8951SMathias Krause add \a, \e 27266be8951SMathias Krause add T1, \d 27366be8951SMathias Krause ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) 27466be8951SMathias Krause 27566be8951SMathias Krause mov \e, T1 27666be8951SMathias Krause SWAP_REG_NAMES \e, T1 27766be8951SMathias Krause 27866be8951SMathias Krause rol $5, T1 27966be8951SMathias Krause add T1, \d 28066be8951SMathias Krause 28166be8951SMathias Krause # write: \a, \b 28266be8951SMathias Krause # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c 28366be8951SMathias Krause.endm 28466be8951SMathias Krause 28566be8951SMathias Krause.macro W_PRECALC r 28666be8951SMathias Krause .set i, \r 28766be8951SMathias Krause 28866be8951SMathias Krause .if (i < 20) 28966be8951SMathias Krause .set K_XMM, 0 29066be8951SMathias Krause .elseif (i < 40) 29166be8951SMathias Krause .set K_XMM, 16 29266be8951SMathias Krause .elseif (i < 60) 29366be8951SMathias Krause .set K_XMM, 32 29466be8951SMathias Krause .elseif (i < 80) 29566be8951SMathias Krause .set K_XMM, 48 29666be8951SMathias Krause .endif 29766be8951SMathias Krause 29866be8951SMathias Krause .if ((i < 16) || 
((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) 29966be8951SMathias Krause .set i, ((\r) % 80) # pre-compute for the next iteration 30066be8951SMathias Krause .if (i == 0) 30166be8951SMathias Krause W_PRECALC_RESET 30266be8951SMathias Krause .endif 30366be8951SMathias Krause W_PRECALC_00_15 30466be8951SMathias Krause .elseif (i<32) 30566be8951SMathias Krause W_PRECALC_16_31 30666be8951SMathias Krause .elseif (i < 80) // rounds 32-79 30766be8951SMathias Krause W_PRECALC_32_79 30866be8951SMathias Krause .endif 30966be8951SMathias Krause.endm 31066be8951SMathias Krause 31166be8951SMathias Krause.macro W_PRECALC_RESET 31266be8951SMathias Krause .set W, W0 31366be8951SMathias Krause .set W_minus_04, W4 31466be8951SMathias Krause .set W_minus_08, W8 31566be8951SMathias Krause .set W_minus_12, W12 31666be8951SMathias Krause .set W_minus_16, W16 31766be8951SMathias Krause .set W_minus_20, W20 31866be8951SMathias Krause .set W_minus_24, W24 31966be8951SMathias Krause .set W_minus_28, W28 32066be8951SMathias Krause .set W_minus_32, W 32166be8951SMathias Krause.endm 32266be8951SMathias Krause 32366be8951SMathias Krause.macro W_PRECALC_ROTATE 32466be8951SMathias Krause .set W_minus_32, W_minus_28 32566be8951SMathias Krause .set W_minus_28, W_minus_24 32666be8951SMathias Krause .set W_minus_24, W_minus_20 32766be8951SMathias Krause .set W_minus_20, W_minus_16 32866be8951SMathias Krause .set W_minus_16, W_minus_12 32966be8951SMathias Krause .set W_minus_12, W_minus_08 33066be8951SMathias Krause .set W_minus_08, W_minus_04 33166be8951SMathias Krause .set W_minus_04, W 33266be8951SMathias Krause .set W, W_minus_32 33366be8951SMathias Krause.endm 33466be8951SMathias Krause 33566be8951SMathias Krause.macro W_PRECALC_SSSE3 33666be8951SMathias Krause 33766be8951SMathias Krause.macro W_PRECALC_00_15 33866be8951SMathias Krause W_PRECALC_00_15_SSSE3 33966be8951SMathias Krause.endm 34066be8951SMathias Krause.macro W_PRECALC_16_31 34166be8951SMathias Krause W_PRECALC_16_31_SSSE3 
34266be8951SMathias Krause.endm 34366be8951SMathias Krause.macro W_PRECALC_32_79 34466be8951SMathias Krause W_PRECALC_32_79_SSSE3 34566be8951SMathias Krause.endm 34666be8951SMathias Krause 34766be8951SMathias Krause/* message scheduling pre-compute for rounds 0-15 */ 34866be8951SMathias Krause.macro W_PRECALC_00_15_SSSE3 34966be8951SMathias Krause .if ((i & 3) == 0) 35066be8951SMathias Krause movdqu (i*4)(BUFFER_PTR), W_TMP1 35166be8951SMathias Krause .elseif ((i & 3) == 1) 35266be8951SMathias Krause pshufb XMM_SHUFB_BSWAP, W_TMP1 35366be8951SMathias Krause movdqa W_TMP1, W 35466be8951SMathias Krause .elseif ((i & 3) == 2) 35566be8951SMathias Krause paddd (K_BASE), W_TMP1 35666be8951SMathias Krause .elseif ((i & 3) == 3) 35766be8951SMathias Krause movdqa W_TMP1, WK(i&~3) 35866be8951SMathias Krause W_PRECALC_ROTATE 35966be8951SMathias Krause .endif 36066be8951SMathias Krause.endm 36166be8951SMathias Krause 36266be8951SMathias Krause/* message scheduling pre-compute for rounds 16-31 36366be8951SMathias Krause * 36466be8951SMathias Krause * - calculating last 32 w[i] values in 8 XMM registers 36566be8951SMathias Krause * - pre-calculate K+w[i] values and store to mem, for later load by ALU add 36666be8951SMathias Krause * instruction 36766be8951SMathias Krause * 36866be8951SMathias Krause * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] 36966be8951SMathias Krause * dependency, but improves for 32-79 37066be8951SMathias Krause */ 37166be8951SMathias Krause.macro W_PRECALC_16_31_SSSE3 37266be8951SMathias Krause # blended scheduling of vector and scalar instruction streams, one 4-wide 37366be8951SMathias Krause # vector iteration / 4 scalar rounds 37466be8951SMathias Krause .if ((i & 3) == 0) 37566be8951SMathias Krause movdqa W_minus_12, W 37666be8951SMathias Krause palignr $8, W_minus_16, W # w[i-14] 37766be8951SMathias Krause movdqa W_minus_04, W_TMP1 37866be8951SMathias Krause psrldq $4, W_TMP1 # w[i-3] 37966be8951SMathias Krause pxor 
W_minus_08, W 38066be8951SMathias Krause .elseif ((i & 3) == 1) 38166be8951SMathias Krause pxor W_minus_16, W_TMP1 38266be8951SMathias Krause pxor W_TMP1, W 38366be8951SMathias Krause movdqa W, W_TMP2 38466be8951SMathias Krause movdqa W, W_TMP1 38566be8951SMathias Krause pslldq $12, W_TMP2 38666be8951SMathias Krause .elseif ((i & 3) == 2) 38766be8951SMathias Krause psrld $31, W 38866be8951SMathias Krause pslld $1, W_TMP1 38966be8951SMathias Krause por W, W_TMP1 39066be8951SMathias Krause movdqa W_TMP2, W 39166be8951SMathias Krause psrld $30, W_TMP2 39266be8951SMathias Krause pslld $2, W 39366be8951SMathias Krause .elseif ((i & 3) == 3) 39466be8951SMathias Krause pxor W, W_TMP1 39566be8951SMathias Krause pxor W_TMP2, W_TMP1 39666be8951SMathias Krause movdqa W_TMP1, W 39766be8951SMathias Krause paddd K_XMM(K_BASE), W_TMP1 39866be8951SMathias Krause movdqa W_TMP1, WK(i&~3) 39966be8951SMathias Krause W_PRECALC_ROTATE 40066be8951SMathias Krause .endif 40166be8951SMathias Krause.endm 40266be8951SMathias Krause 40366be8951SMathias Krause/* message scheduling pre-compute for rounds 32-79 40466be8951SMathias Krause * 40566be8951SMathias Krause * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 40666be8951SMathias Krause * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 40766be8951SMathias Krause * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken 40866be8951SMathias Krause */ 40966be8951SMathias Krause.macro W_PRECALC_32_79_SSSE3 41066be8951SMathias Krause .if ((i & 3) == 0) 41166be8951SMathias Krause movdqa W_minus_04, W_TMP1 41266be8951SMathias Krause pxor W_minus_28, W # W is W_minus_32 before xor 41366be8951SMathias Krause palignr $8, W_minus_08, W_TMP1 41466be8951SMathias Krause .elseif ((i & 3) == 1) 41566be8951SMathias Krause pxor W_minus_16, W 41666be8951SMathias Krause pxor W_TMP1, W 41766be8951SMathias Krause movdqa W, W_TMP1 41866be8951SMathias Krause .elseif ((i & 3) == 2) 
41966be8951SMathias Krause psrld $30, W 42066be8951SMathias Krause pslld $2, W_TMP1 42166be8951SMathias Krause por W, W_TMP1 42266be8951SMathias Krause .elseif ((i & 3) == 3) 42366be8951SMathias Krause movdqa W_TMP1, W 42466be8951SMathias Krause paddd K_XMM(K_BASE), W_TMP1 42566be8951SMathias Krause movdqa W_TMP1, WK(i&~3) 42666be8951SMathias Krause W_PRECALC_ROTATE 42766be8951SMathias Krause .endif 42866be8951SMathias Krause.endm 42966be8951SMathias Krause 43066be8951SMathias Krause.endm // W_PRECALC_SSSE3 43166be8951SMathias Krause 43266be8951SMathias Krause 43366be8951SMathias Krause#define K1 0x5a827999 43466be8951SMathias Krause#define K2 0x6ed9eba1 43566be8951SMathias Krause#define K3 0x8f1bbcdc 43666be8951SMathias Krause#define K4 0xca62c1d6 43766be8951SMathias Krause 43866be8951SMathias Krause.section .rodata 43966be8951SMathias Krause.align 16 44066be8951SMathias Krause 44166be8951SMathias KrauseK_XMM_AR: 44266be8951SMathias Krause .long K1, K1, K1, K1 44366be8951SMathias Krause .long K2, K2, K2, K2 44466be8951SMathias Krause .long K3, K3, K3, K3 44566be8951SMathias Krause .long K4, K4, K4, K4 44666be8951SMathias Krause 44766be8951SMathias KrauseBSWAP_SHUFB_CTL: 44866be8951SMathias Krause .long 0x00010203 44966be8951SMathias Krause .long 0x04050607 45066be8951SMathias Krause .long 0x08090a0b 45166be8951SMathias Krause .long 0x0c0d0e0f 45266be8951SMathias Krause 45366be8951SMathias Krause 45466be8951SMathias Krause.section .text 45566be8951SMathias Krause 45666be8951SMathias KrauseW_PRECALC_SSSE3 45766be8951SMathias Krause.macro xmm_mov a, b 45866be8951SMathias Krause movdqu \a,\b 45966be8951SMathias Krause.endm 46066be8951SMathias Krause 46141419a28SKees Cook/* 46241419a28SKees Cook * SSSE3 optimized implementation: 46341419a28SKees Cook * 46441419a28SKees Cook * extern "C" void sha1_transform_ssse3(struct sha1_state *state, 46541419a28SKees Cook * const u8 *data, int blocks); 46641419a28SKees Cook * 46741419a28SKees Cook * Note that struct sha1_state is 
assumed to begin with u32 state[5]. 46866be8951SMathias Krause */ 46966be8951SMathias KrauseSHA1_VECTOR_ASM sha1_transform_ssse3 47066be8951SMathias Krause 47166be8951SMathias Krause.macro W_PRECALC_AVX 47266be8951SMathias Krause 47366be8951SMathias Krause.purgem W_PRECALC_00_15 47466be8951SMathias Krause.macro W_PRECALC_00_15 47566be8951SMathias Krause W_PRECALC_00_15_AVX 47666be8951SMathias Krause.endm 47766be8951SMathias Krause.purgem W_PRECALC_16_31 47866be8951SMathias Krause.macro W_PRECALC_16_31 47966be8951SMathias Krause W_PRECALC_16_31_AVX 48066be8951SMathias Krause.endm 48166be8951SMathias Krause.purgem W_PRECALC_32_79 48266be8951SMathias Krause.macro W_PRECALC_32_79 48366be8951SMathias Krause W_PRECALC_32_79_AVX 48466be8951SMathias Krause.endm 48566be8951SMathias Krause 48666be8951SMathias Krause.macro W_PRECALC_00_15_AVX 48766be8951SMathias Krause .if ((i & 3) == 0) 48866be8951SMathias Krause vmovdqu (i*4)(BUFFER_PTR), W_TMP1 48966be8951SMathias Krause .elseif ((i & 3) == 1) 49066be8951SMathias Krause vpshufb XMM_SHUFB_BSWAP, W_TMP1, W 49166be8951SMathias Krause .elseif ((i & 3) == 2) 49266be8951SMathias Krause vpaddd (K_BASE), W, W_TMP1 49366be8951SMathias Krause .elseif ((i & 3) == 3) 49466be8951SMathias Krause vmovdqa W_TMP1, WK(i&~3) 49566be8951SMathias Krause W_PRECALC_ROTATE 49666be8951SMathias Krause .endif 49766be8951SMathias Krause.endm 49866be8951SMathias Krause 49966be8951SMathias Krause.macro W_PRECALC_16_31_AVX 50066be8951SMathias Krause .if ((i & 3) == 0) 50166be8951SMathias Krause vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] 50266be8951SMathias Krause vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] 50366be8951SMathias Krause vpxor W_minus_08, W, W 50466be8951SMathias Krause vpxor W_minus_16, W_TMP1, W_TMP1 50566be8951SMathias Krause .elseif ((i & 3) == 1) 50666be8951SMathias Krause vpxor W_TMP1, W, W 50766be8951SMathias Krause vpslldq $12, W, W_TMP2 50866be8951SMathias Krause vpslld $1, W, W_TMP1 50966be8951SMathias Krause .elseif ((i & 3) == 
2) 51066be8951SMathias Krause vpsrld $31, W, W 51166be8951SMathias Krause vpor W, W_TMP1, W_TMP1 51266be8951SMathias Krause vpslld $2, W_TMP2, W 51366be8951SMathias Krause vpsrld $30, W_TMP2, W_TMP2 51466be8951SMathias Krause .elseif ((i & 3) == 3) 51566be8951SMathias Krause vpxor W, W_TMP1, W_TMP1 51666be8951SMathias Krause vpxor W_TMP2, W_TMP1, W 51766be8951SMathias Krause vpaddd K_XMM(K_BASE), W, W_TMP1 51866be8951SMathias Krause vmovdqu W_TMP1, WK(i&~3) 51966be8951SMathias Krause W_PRECALC_ROTATE 52066be8951SMathias Krause .endif 52166be8951SMathias Krause.endm 52266be8951SMathias Krause 52366be8951SMathias Krause.macro W_PRECALC_32_79_AVX 52466be8951SMathias Krause .if ((i & 3) == 0) 52566be8951SMathias Krause vpalignr $8, W_minus_08, W_minus_04, W_TMP1 52666be8951SMathias Krause vpxor W_minus_28, W, W # W is W_minus_32 before xor 52766be8951SMathias Krause .elseif ((i & 3) == 1) 52866be8951SMathias Krause vpxor W_minus_16, W_TMP1, W_TMP1 52966be8951SMathias Krause vpxor W_TMP1, W, W 53066be8951SMathias Krause .elseif ((i & 3) == 2) 53166be8951SMathias Krause vpslld $2, W, W_TMP1 53266be8951SMathias Krause vpsrld $30, W, W 53366be8951SMathias Krause vpor W, W_TMP1, W 53466be8951SMathias Krause .elseif ((i & 3) == 3) 53566be8951SMathias Krause vpaddd K_XMM(K_BASE), W, W_TMP1 53666be8951SMathias Krause vmovdqu W_TMP1, WK(i&~3) 53766be8951SMathias Krause W_PRECALC_ROTATE 53866be8951SMathias Krause .endif 53966be8951SMathias Krause.endm 54066be8951SMathias Krause 54166be8951SMathias Krause.endm // W_PRECALC_AVX 54266be8951SMathias Krause 54366be8951SMathias KrauseW_PRECALC_AVX 54466be8951SMathias Krause.purgem xmm_mov 54566be8951SMathias Krause.macro xmm_mov a, b 54666be8951SMathias Krause vmovdqu \a,\b 54766be8951SMathias Krause.endm 54866be8951SMathias Krause 54966be8951SMathias Krause 55066be8951SMathias Krause/* AVX optimized implementation: 55141419a28SKees Cook * extern "C" void sha1_transform_avx(struct sha1_state *state, 55241419a28SKees Cook * const u8 
*data, int blocks); 55366be8951SMathias Krause */ 55466be8951SMathias KrauseSHA1_VECTOR_ASM sha1_transform_avx 555