/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a 64-byte window of pre-calculated w[i]+K values (16 dwords) in a
 * circular buffer on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
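
/*
 * Each RR invocation consumes two w[i]+K slots and, W_PRECALC_AHEAD (16)
 * rounds in advance, schedules the pre-computation that refills them, so the
 * vector message scheduling overlaps with the scalar rounds.
 */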

/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack
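	# %rbp preserves the original %rsp; the 64-byte workspace must be
	# 16-byte aligned because the SSSE3 precalc macros spill w[i]+K with
	# aligned movdqa stores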

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	ret

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current block is the last one,
	cmovae	K_BASE, BUFFER_PTR	# use a dummy source to avoid a buffer overrun
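	# The pre-calculation below still runs for a (nonexistent) next block;
	# pointing BUFFER_PTR at the constant table keeps those speculative
	# loads inside mapped memory, and their results are simply discarded.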

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

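/*
 * SWAP_REG_NAMES swaps the assembler symbols \a and \b, i.e. which physical
 * register each name refers to.  The F macros and RR use it to "rename"
 * registers: the value just copied into T1 keeps its logical name while the
 * old register becomes the new scratch T1, avoiding extra moves.
 */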
.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

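/*
 * SHA-1 round functions, each leaving its result in T1:
 *   F1 (rounds  0-19): Ch(b,c,d)  = (b & c) | (~b & d), computed as
 *                      ((c ^ d) & b) ^ d
 *   F2 (rounds 20-39): Parity     = b ^ c ^ d
 *   F3 (rounds 40-59): Maj(b,c,d) = (b & c) | (b & d) | (c & d), computed as
 *                      ((b | c) & d) | (b & c)
 *   F4 (rounds 60-79): same as F2
 */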
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm

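/* add the working variable into the saved digest word and write it back */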
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7  =>  a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

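/*
 * W_PRECALC dispatches one step of message-schedule pre-computation for
 * "round" \r: it selects the 16-byte block of K_XMM_AR to add and, based on
 * the round index, uses the plain load/byte-swap path (rounds 0-15), the
 * w[i-3]-dependent path (16-31) or the rol-2 recurrence path (32-79).
 * For \r >= 80 it wraps around and pre-computes w[0..15] of the next block.
 */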
.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

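/*
 * The sliding window of the last 32 w[] values lives in eight XMM registers.
 * Instead of copying data, W_PRECALC_ROTATE renames the registers: after each
 * group of four values the symbols shift by one position and the oldest
 * register (W_minus_32) is reused as the new W.
 */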
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
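/*
 * One 16-byte group is handled in four steps, one per scalar round: load 16
 * message bytes, byte-swap them to big-endian words, add the round constant,
 * then spill w[i..i+3]+K to the stack slot that RR later reads via WK().
 */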
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - keep the last 32 w[i] values in 8 XMM registers
 * - pre-calculate the K+w[i] values and store them to memory, to be loaded
 *   later by the scalar ALU add instruction
 *
 * rounds 16-31 need some "heavy lifting" in the vectorization due to the
 * w[i] -> w[i-3] dependency, but it pays off for rounds 32-79
 */
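/*
 * Within one group of four, w[i+3] needs w[i] (its w[i-3]) from the same
 * vector.  The group is therefore computed with that lane's w[i-3] input
 * zeroed, and the missing contribution is patched in afterwards: the group's
 * first pre-rotate value is rotated left by 2 (rol 1 applied twice) and
 * XORed into the last lane.
 */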
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification:  w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent form:  w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i] => w[i-3] dependency
 * is broken
 */
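/*
 * The rol 2 form follows from expanding each term of the original recurrence
 * one step further; all intermediate terms cancel pairwise.  Its nearest
 * input is w[i-6], so a group of four w[] values no longer depends on itself
 * and can be computed with straight-line vector code.
 */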
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

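/* each SHA-1 round constant is replicated across all four lanes so a single
 * paddd adds K to four w[] values at once */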
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

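/* pshufb control mask: reverse the byte order within each 32-bit word */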
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

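/*
 * Instantiate the SSSE3 variants of the W pre-calc macros and an unaligned
 * load helper.  The AVX section below purges these definitions and replaces
 * them with three-operand VEX-encoded versions before emitting a second
 * transform function.
 */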
W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/* SSSE3 optimized implementation:
 *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
 *                                       unsigned int rounds);
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

#ifdef CONFIG_AS_AVX

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
 *                                     unsigned int rounds);
 */
SHA1_VECTOR_ASM     sha1_transform_avx

#endif