xref: /linux/lib/crypto/arm64/nh-neon-core.S (revision c17ee635fd3a482b2ad2bf5e269755c2eae5f25e)
1*b4a8528dSEric Biggers/* SPDX-License-Identifier: GPL-2.0 */
2*b4a8528dSEric Biggers/*
3*b4a8528dSEric Biggers * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
4*b4a8528dSEric Biggers *
5*b4a8528dSEric Biggers * Copyright 2018 Google LLC
6*b4a8528dSEric Biggers *
7*b4a8528dSEric Biggers * Author: Eric Biggers <ebiggers@google.com>
8*b4a8528dSEric Biggers */
9*b4a8528dSEric Biggers
10*b4a8528dSEric Biggers#include <linux/linkage.h>
11*b4a8528dSEric Biggers
12*b4a8528dSEric Biggers	KEY		.req	x0
13*b4a8528dSEric Biggers	MESSAGE		.req	x1
14*b4a8528dSEric Biggers	MESSAGE_LEN	.req	x2
15*b4a8528dSEric Biggers	HASH		.req	x3
16*b4a8528dSEric Biggers
17*b4a8528dSEric Biggers	PASS0_SUMS	.req	v0
18*b4a8528dSEric Biggers	PASS1_SUMS	.req	v1
19*b4a8528dSEric Biggers	PASS2_SUMS	.req	v2
20*b4a8528dSEric Biggers	PASS3_SUMS	.req	v3
21*b4a8528dSEric Biggers	K0		.req	v4
22*b4a8528dSEric Biggers	K1		.req	v5
23*b4a8528dSEric Biggers	K2		.req	v6
24*b4a8528dSEric Biggers	K3		.req	v7
25*b4a8528dSEric Biggers	T0		.req	v8
26*b4a8528dSEric Biggers	T1		.req	v9
27*b4a8528dSEric Biggers	T2		.req	v10
28*b4a8528dSEric Biggers	T3		.req	v11
29*b4a8528dSEric Biggers	T4		.req	v12
30*b4a8528dSEric Biggers	T5		.req	v13
31*b4a8528dSEric Biggers	T6		.req	v14
32*b4a8528dSEric Biggers	T7		.req	v15
33*b4a8528dSEric Biggers
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Consume one 16-byte message stride and fold it into all four pass
 * accumulators.
 *
 * \k0, \k1, \k2: vector registers that already hold key strides.
 * \k3:           vector register that is overwritten with the next 16 bytes
 *                read from KEY.
 *
 * Pass i (i = 0..3) adds the four 32-bit message words to the key words in
 * \k<i>, then accumulates word0 * word2 into lane 0 and word1 * word3 into
 * lane 1 of PASS<i>_SUMS as 64-bit products.
 *
 * Side effects: MESSAGE and KEY are each post-incremented by 16; \k3 and
 * T0-T7 are clobbered.  Condition flags are not touched.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	ld1		{T3.16b}, [MESSAGE], #16

	// Load next key stride
	ld1		{\k3\().4s}, [KEY], #16

	// Add message words to key words
	// (T3 holds the message itself, so it must be the last one overwritten.)
	add		T0.4s, T3.4s, \k0\().4s
	add		T1.4s, T3.4s, \k1\().4s
	add		T2.4s, T3.4s, \k2\().4s
	add		T3.4s, T3.4s, \k3\().4s

	// Multiply 32x32 => 64 and accumulate
	// umlal with .2s sources only reads the low two words of each operand,
	// so first copy the high half (words 2 and 3) of every sum into the low
	// half of a second register.  The upper halves of T4-T7 are left as-is
	// and are never read.
	mov		T4.d[0], T0.d[1]
	mov		T5.d[0], T1.d[1]
	mov		T6.d[0], T2.d[1]
	mov		T7.d[0], T3.d[1]
	// sums.d[0] += w0 * w2;  sums.d[1] += w1 * w3  (per pass)
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm
58*b4a8528dSEric Biggers
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * The arguments arrive in KEY, MESSAGE, MESSAGE_LEN and HASH respectively.
 * The message is consumed 16 bytes (one stride) at a time.  For every message
 * stride one further 16-byte key stride is loaded, and the four passes use
 * four consecutive key strides, i.e. pass i sees the key shifted by 16*i
 * bytes.  In total 48 + message_len bytes are read from 'key', and 32 bytes
 * (four 64-bit sums, one per pass) are written to 'hash'.
 *
 * KEY, MESSAGE and MESSAGE_LEN are modified; the function returns nothing.
 */
SYM_FUNC_START(nh_neon)

	// Preload the first three key strides and zero the four accumulators.
	// (The extra-indented movi lines are independent of the loads around
	// them.)
	ld1		{K0.4s,K1.4s}, [KEY], #32
	  movi		PASS0_SUMS.2d, #0
	  movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	  movi		PASS2_SUMS.2d, #0
	  movi		PASS3_SUMS.2d, #0

	// Main loop: four strides (64 bytes) per iteration.  MESSAGE_LEN is
	// biased by -64 so that the sign of the result doubles as the loop
	// condition; skip the loop entirely if fewer than 64 bytes were given.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	// Instead of moving key data between registers, rotate which register
	// plays which role.  After four strides the roles are back where they
	// started, with the three most recent key strides in K0, K1 and K2.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN is now negative (original length minus a multiple of 64);
	// masking with 63 recovers the number of leftover bytes, which is 0, 16,
	// 32 or 48 given the multiple-of-16 guarantee.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	// A third leftover stride exists only if 48 bytes remained.
	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	// T0 = { pass0 total, pass1 total },  T1 = { pass2 total, pass3 total }
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)
104