xref: /linux/lib/crypto/arm/nh-neon-core.S (revision c17ee635fd3a482b2ad2bf5e269755c2eae5f25e)
1*29e39a11SEric Biggers/* SPDX-License-Identifier: GPL-2.0 */
2*29e39a11SEric Biggers/*
3*29e39a11SEric Biggers * NH - ε-almost-universal hash function, NEON accelerated version
4*29e39a11SEric Biggers *
5*29e39a11SEric Biggers * Copyright 2018 Google LLC
6*29e39a11SEric Biggers *
7*29e39a11SEric Biggers * Author: Eric Biggers <ebiggers@google.com>
8*29e39a11SEric Biggers */
9*29e39a11SEric Biggers
10*29e39a11SEric Biggers#include <linux/linkage.h>
11*29e39a11SEric Biggers
12*29e39a11SEric Biggers	.text
13*29e39a11SEric Biggers	.fpu		neon
14*29e39a11SEric Biggers
// Aliases for the four arguments of nh_neon(), in prototype order (r0-r3).
15*29e39a11SEric Biggers	KEY		.req	r0
16*29e39a11SEric Biggers	MESSAGE		.req	r1
17*29e39a11SEric Biggers	MESSAGE_LEN	.req	r2
18*29e39a11SEric Biggers	HASH		.req	r3
19*29e39a11SEric Biggers
// Accumulators, one q register per pass.  Each q register holds two 64-bit
// partial sums; the _A and _B names are the low and high d halves of that
// same q register and are only used for the final reduction.
20*29e39a11SEric Biggers	PASS0_SUMS	.req	q0
21*29e39a11SEric Biggers	PASS0_SUM_A	.req	d0
22*29e39a11SEric Biggers	PASS0_SUM_B	.req	d1
23*29e39a11SEric Biggers	PASS1_SUMS	.req	q1
24*29e39a11SEric Biggers	PASS1_SUM_A	.req	d2
25*29e39a11SEric Biggers	PASS1_SUM_B	.req	d3
26*29e39a11SEric Biggers	PASS2_SUMS	.req	q2
27*29e39a11SEric Biggers	PASS2_SUM_A	.req	d4
28*29e39a11SEric Biggers	PASS2_SUM_B	.req	d5
29*29e39a11SEric Biggers	PASS3_SUMS	.req	q3
30*29e39a11SEric Biggers	PASS3_SUM_A	.req	d6
31*29e39a11SEric Biggers	PASS3_SUM_B	.req	d7
// The four 16-byte key strides currently held in registers.  Which of them is
// the oldest and which is the newest rotates from one stride to the next.
32*29e39a11SEric Biggers	K0		.req	q4
33*29e39a11SEric Biggers	K1		.req	q5
34*29e39a11SEric Biggers	K2		.req	q6
35*29e39a11SEric Biggers	K3		.req	q7
// Scratch registers.  _L and _H are the low and high d halves of each one.
36*29e39a11SEric Biggers	T0		.req	q8
37*29e39a11SEric Biggers	T0_L		.req	d16
38*29e39a11SEric Biggers	T0_H		.req	d17
39*29e39a11SEric Biggers	T1		.req	q9
40*29e39a11SEric Biggers	T1_L		.req	d18
41*29e39a11SEric Biggers	T1_H		.req	d19
42*29e39a11SEric Biggers	T2		.req	q10
43*29e39a11SEric Biggers	T2_L		.req	d20
44*29e39a11SEric Biggers	T2_H		.req	d21
45*29e39a11SEric Biggers	T3		.req	q11
46*29e39a11SEric Biggers	T3_L		.req	d22
47*29e39a11SEric Biggers	T3_H		.req	d23
48*29e39a11SEric Biggers
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four passes at once.
 *
 * \k0, \k1 and \k2 name q registers that already hold key strides; \k3 names
 * the q register that receives the next 16 bytes of key.  Pass n adds the
 * message to key stride \kn.
 *
 * Side effects: MESSAGE and KEY are each advanced by 16 bytes (post-increment
 * addressing), \k3 is overwritten, T0-T3 are clobbered, and the four
 * PASSn_SUMS accumulators are updated.
 */
49*29e39a11SEric Biggers.macro _nh_stride	k0, k1, k2, k3
50*29e39a11SEric Biggers
51*29e39a11SEric Biggers	// Load next message stride
52*29e39a11SEric Biggers	vld1.8		{T3}, [MESSAGE]!
53*29e39a11SEric Biggers
54*29e39a11SEric Biggers	// Load next key stride
55*29e39a11SEric Biggers	vld1.32		{\k3}, [KEY]!
56*29e39a11SEric Biggers
57*29e39a11SEric Biggers	// Add message words to key words
	// (lane-wise 32-bit adds.  T3 holds the message itself, so it is the
	// last destination to be written.)
58*29e39a11SEric Biggers	vadd.u32	T0, T3, \k0
59*29e39a11SEric Biggers	vadd.u32	T1, T3, \k1
60*29e39a11SEric Biggers	vadd.u32	T2, T3, \k2
61*29e39a11SEric Biggers	vadd.u32	T3, T3, \k3
62*29e39a11SEric Biggers
63*29e39a11SEric Biggers	// Multiply 32x32 => 64 and accumulate
	// (the low half holds words 0,1 and the high half words 2,3, so each
	// vmlal adds word0*word2 and word1*word3 to the two 64-bit lanes of the
	// accumulator for that pass.)
64*29e39a11SEric Biggers	vmlal.u32	PASS0_SUMS, T0_L, T0_H
65*29e39a11SEric Biggers	vmlal.u32	PASS1_SUMS, T1_L, T1_H
66*29e39a11SEric Biggers	vmlal.u32	PASS2_SUMS, T2_L, T2_H
67*29e39a11SEric Biggers	vmlal.u32	PASS3_SUMS, T3_L, T3_H
68*29e39a11SEric Biggers.endm
69*29e39a11SEric Biggers
70*29e39a11SEric Biggers/*
71*29e39a11SEric Biggers * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
72*29e39a11SEric Biggers *		__le64 hash[NH_NUM_PASSES])
73*29e39a11SEric Biggers *
74*29e39a11SEric Biggers * It's guaranteed that message_len % 16 == 0.
 *
 * Runs four NH passes over the message in a single sweep and stores four
 * 64-bit sums (32 bytes) to hash.  Pass n uses the key starting one 16-byte
 * stride after the key used by pass n-1, which is why the roles of K0-K3
 * rotate from one stride to the next instead of the key being reloaded.
 *
 * Key bytes read: 48 up front plus 16 per message stride, i.e.
 * message_len + 48 bytes starting at key.
 *
 * NOTE(review): q4-q7 (K0-K3) are overwritten and not restored here, although
 * d8-d15 are callee-saved under the AAPCS.  Presumably every caller brackets
 * this with kernel_neon_begin()/kernel_neon_end() -- confirm against callers.
75*29e39a11SEric Biggers */
76*29e39a11SEric BiggersENTRY(nh_neon)
77*29e39a11SEric Biggers
	// Preload the first three key strides (K3 is loaded by the first
	// _nh_stride) and zero the accumulators.  The extra indentation marks
	// the vmov instructions placed between the two key loads.
78*29e39a11SEric Biggers	vld1.32		{K0,K1}, [KEY]!
79*29e39a11SEric Biggers	  vmov.u64	PASS0_SUMS, #0
80*29e39a11SEric Biggers	  vmov.u64	PASS1_SUMS, #0
81*29e39a11SEric Biggers	vld1.32		{K2}, [KEY]!
82*29e39a11SEric Biggers	  vmov.u64	PASS2_SUMS, #0
83*29e39a11SEric Biggers	  vmov.u64	PASS3_SUMS, #0
84*29e39a11SEric Biggers
	// Main loop: 64 bytes (four strides) per iteration.  MESSAGE_LEN is
	// biased by -64 so that the sign of the subs result tells whether a
	// full 64-byte group remains.  Skip the loop if fewer than 64 bytes.
85*29e39a11SEric Biggers	subs		MESSAGE_LEN, MESSAGE_LEN, #64
86*29e39a11SEric Biggers	blt		.Lloop4_done
87*29e39a11SEric Biggers.Lloop4:
	// After these four strides the key register roles are back where
	// they started (K0-K2 hold live key strides, K3 is loaded next).
88*29e39a11SEric Biggers	_nh_stride	K0, K1, K2, K3
89*29e39a11SEric Biggers	_nh_stride	K1, K2, K3, K0
90*29e39a11SEric Biggers	_nh_stride	K2, K3, K0, K1
91*29e39a11SEric Biggers	_nh_stride	K3, K0, K1, K2
92*29e39a11SEric Biggers	subs		MESSAGE_LEN, MESSAGE_LEN, #64
93*29e39a11SEric Biggers	bge		.Lloop4
94*29e39a11SEric Biggers
95*29e39a11SEric Biggers.Lloop4_done:
	// MESSAGE_LEN is negative here (leftover bytes minus 64).  Masking
	// with 63 recovers the leftover byte count, which is 0, 16, 32 or 48
	// because message_len is a multiple of 16.  Handle up to three more
	// strides, continuing the same key register rotation.
96*29e39a11SEric Biggers	ands		MESSAGE_LEN, MESSAGE_LEN, #63
97*29e39a11SEric Biggers	beq		.Ldone
98*29e39a11SEric Biggers	_nh_stride	K0, K1, K2, K3
99*29e39a11SEric Biggers
100*29e39a11SEric Biggers	subs		MESSAGE_LEN, MESSAGE_LEN, #16
101*29e39a11SEric Biggers	beq		.Ldone
102*29e39a11SEric Biggers	_nh_stride	K1, K2, K3, K0
103*29e39a11SEric Biggers
104*29e39a11SEric Biggers	subs		MESSAGE_LEN, MESSAGE_LEN, #16
105*29e39a11SEric Biggers	beq		.Ldone
	// Only the 48-byte case reaches this point, so no further length
	// check is needed after the third leftover stride.
106*29e39a11SEric Biggers	_nh_stride	K2, K3, K0, K1
107*29e39a11SEric Biggers
108*29e39a11SEric Biggers.Ldone:
109*29e39a11SEric Biggers	// Sum the accumulators for each pass, then store the sums to 'hash'
	// (T0 and T1 are reused as the output buffer: T0 = pass 0 and 1,
	// T1 = pass 2 and 3, written as 32 contiguous bytes.)
110*29e39a11SEric Biggers	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
111*29e39a11SEric Biggers	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
112*29e39a11SEric Biggers	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
113*29e39a11SEric Biggers	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
114*29e39a11SEric Biggers	vst1.8		{T0-T1}, [HASH]
115*29e39a11SEric Biggers	bx		lr
116*29e39a11SEric BiggersENDPROC(nh_neon)
117