xref: /linux/arch/x86/crypto/nh-sse2-x86_64.S (revision 8795a739e5c72abeec51caf36b6df2b37e5720c5)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

/*
 * Register aliases used throughout this file (GAS, AT&T operand order:
 * "op src, dst").
 */

// Accumulators: one xmm register per pass, each holding two 64-bit sums
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3

// Key strides (16 bytes each); their roles rotate from stride to stride
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7

// Scratch: T1-T7 are used by _nh_stride, T0/T1 by the final reduction
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

// Arguments of nh_sse2(), in declaration order (first four System V
// AMD64 integer argument registers)
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride - fold one 16-byte message stride into all four pass sums
 *
 * Arguments:
 *   k0, k1, k2	xmm registers holding key strides that are already loaded
 *   k3		xmm register that receives the key stride at offset(KEY)
 *   offset	byte displacement applied to both MESSAGE and KEY
 *
 * For pass i (0..3) the message stride is added to key stride k<i> as four
 * 32-bit lanes w0..w3; the products w0*w2 and w1*w3 (32x32 => 64 bits) are
 * then added to the low and high 64-bit halves of PASS<i>_SUMS.
 *
 * Clobbers: k0 (left holding the pass-0 products, not a key stride) and
 * T1-T7.  k1, k2 and k3 still hold their key strides on exit, which is why
 * callers rotate the register arguments by one between invocations.
 * KEY, MESSAGE and MESSAGE_LEN are not modified.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate
	//
	// pmuludq only multiplies dwords 0 and 2 of its operands.  Shuffle
	// 0x10 places source dwords 0 and 1 there, shuffle 0x32 places source
	// dwords 2 and 3 there, so each pmuludq yields (w0*w2, w1*w3).
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:       KEY (%rdi) = key, MESSAGE (%rsi) = message,
 *           MESSAGE_LEN (%rdx) = message_len, HASH (%rcx) = hash
 * Out:      32 bytes written to hash: the 64-bit totals of passes 0-3,
 *           in that order
 * Clobbers: %xmm0-%xmm15, %rdi, %rsi, %rdx, flags
 *
 * All loads and stores use movdqu, so no pointer alignment is required.
 * The key is read 16 bytes per message stride, plus the 48 bytes preloaded
 * on entry.
 */
ENTRY(nh_sse2)

	// Preload the first three key strides.  KEY is then advanced past
	// them so that offset(KEY) inside _nh_stride addresses the stride
	// that is three ahead of the message stride at offset(MESSAGE).
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY

	// Clear the per-pass accumulators
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Skip the unrolled loop when fewer than 64 bytes are available.
	// NOTE(review): jl/jge below are signed tests, so this relies on
	// message_len being less than 2^63 - confirm against the callers.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done

	// Main loop: four strides (64 bytes) per iteration.  Each stride
	// destroys its k0 argument and loads a fresh k3, so the register
	// roles rotate by one and are back at K0, K1, K2 after four strides.
.Lloop4:
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN has been decremented once more than the number of
	// 64-byte blocks consumed; masking with 0x3f recovers the count of
	// leftover bytes.  With message_len % 16 == 0 that is 0, 16, 32 or
	// 48, i.e. at most three more strides.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	//
	// Each PASSn_SUMS holds two partial sums: A in the low qword and B in
	// the high qword.  Interleave the A halves and the B halves of
	// adjacent passes, then add them to get one 64-bit total per pass.
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	ret
ENDPROC(nh_sse2)
