/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a 64-byte circular buffer of 16 pre-calculated w[i]+K values */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
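
/*
 * For example, WK(0), WK(16), WK(32), ... all name 0(%rsp), and WK(21) is
 * the same slot as WK(5), i.e. 20(%rsp).  The 16 dword slots are reused as
 * the scalar rounds consume them, while W_PRECALC stays W_PRECALC_AHEAD (16)
 * rounds ahead of the round currently being computed.
 */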

/*
 * This macro emits the body of a SHA-1 transform function that processes
 * the input one 64-byte block at a time.
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_TYPED_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if this is the last block, use a
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid a buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm
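
/*
 * For reference, F1-F4 above compute the standard SHA-1 round functions,
 * using the usual operation-saving identities:
 *
 *   F1(b,c,d) = (b & c) | (~b & d)          = d ^ (b & (c ^ d))
 *   F2(b,c,d) = b ^ c ^ d
 *   F3(b,c,d) = (b & c) | (b & d) | (c & d) = (b & c) | ((b | c) & d)
 *   F4(b,c,d) = F2(b,c,d)
 *
 * The right-hand forms are what the macros implement, leaving the result
 * in (the register currently named) T1.
 */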

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# ((a <<r 5) >>r 7) => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
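
/*
 * For reference, one canonical SHA-1 round is
 *
 *   tmp = rol(a, 5) + F(b, c, d) + e + w[i] + K;
 *   e = d;  d = c;  c = rol(b, 30);  b = a;  a = tmp;
 *
 * RR performs two such rounds back to back.  The w[i] + K terms arrive
 * pre-added via the WK() buffer, and instead of moving values around the
 * macro relies on the register renaming noted above ("write"/"rotate")
 * together with SWAP_REG_NAMES.
 */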

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm
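
/*
 * For example, W_PRECALC 18 selects K_XMM = 0 (rounds 0-19 use K1) and
 * expands to W_PRECALC_16_31, while W_PRECALC 85 (issued from round 69,
 * i.e. 69 + W_PRECALC_AHEAD) wraps around to i = 5 and pre-computes the
 * first w[i]+K values of the *next* 64-byte block - which is why BUFFER_PTR
 * has already been advanced (or redirected to the dummy source) by then.
 */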

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm
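
/*
 * W, W_minus_04, ..., W_minus_32 are rotating names for the eight XMM
 * registers holding the most recent w[] values.  Note that W always aliases
 * W_minus_32 (both after W_PRECALC_RESET and after each W_PRECALC_ROTATE),
 * so every new group of four w[] values is computed in the register holding
 * the oldest group, which is no longer needed at that point.
 */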

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - the last 32 w[i] values are kept in 8 XMM registers
 * - the K+w[i] values are pre-calculated and stored to memory, to be picked
 *   up later by the scalar ALU add instruction
 *
 * vectorizing rounds 16-31 takes some "heavy lifting" because of the
 * w[i] -> w[i-3] dependency, but it pays off even more for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration per 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm
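
/*
 * Roughly what the four steps above do, working on w[i..i+3] at once:
 * lanes 0-2 can use the already known w[i-3..i-1], but lane 3 would need
 * w[i], which is only produced by this very computation.  So the w[i-3]
 * term of lane 3 is zeroed first (psrldq shifts in zeroes), all four lanes
 * are XORed and rotated left by 1, and lane 3 is then patched up by XORing
 * in lane 0's pre-rotation value rotated left by 2, i.e. rol(w[i], 1).
 * Finally K is added and the result is stored to the WK() buffer.
 */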

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent:      w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i] -> w[i-3]
 * dependency is broken
 */
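
/*
 * A sketch of why the two forms are equivalent: substitute the original
 * recurrence for each of w[i-3], w[i-8], w[i-14] and w[i-16] in the first
 * equation; since rotation distributes over XOR, the duplicated terms
 * cancel and exactly w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], rotated left
 * by 2, remains.  As indices down to i-32 are referenced, the rewritten
 * form is only valid from round 32 onwards.
 */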
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3
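
/*
 * A minimal usage sketch (the in-kernel callers live in the C glue code,
 * sha1_ssse3_glue.c); SSE registers may only be touched between
 * kernel_fpu_begin() and kernel_fpu_end():
 *
 *	struct sha1_state st = {
 *		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
 *	};
 *	kernel_fpu_begin();
 *	sha1_transform_ssse3(&st, data, nblocks);  // nblocks 64-byte blocks
 *	kernel_fpu_end();
 */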

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx