/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	// Function arguments, in the order of the nh_neon() prototype below
	KEY		.req	x0
	MESSAGE		.req	x1
	MESSAGE_LEN	.req	x2
	HASH		.req	x3

	// Per-pass accumulators: two 64-bit partial sums per pass
	PASS0_SUMS	.req	v0
	PASS1_SUMS	.req	v1
	PASS2_SUMS	.req	v2
	PASS3_SUMS	.req	v3

	// Sliding window of four 16-byte key strides
	K0		.req	v4
	K1		.req	v5
	K2		.req	v6
	K3		.req	v7

	// Temporaries.
	// NOTE(review): these alias v8-v15, whose low 64 bits are callee-saved
	// under AAPCS64, and nothing here saves or restores them.  Presumably
	// the caller owns the whole SIMD register file while this runs --
	// confirm against the call sites.
	T0		.req	v8
	T1		.req	v9
	T2		.req	v10
	T3		.req	v11
	T4		.req	v12
	T5		.req	v13
	T6		.req	v14
	T7		.req	v15

/*
 * _nh_stride - process one 16-byte message stride for all four passes
 *
 * \k0, \k1, \k2: key vectors already loaded, used by passes 0, 1 and 2.
 * \k3:           register that receives the next 16 bytes read from KEY and
 *                is then used by pass 3 (its previous contents are lost).
 *
 * For each pass the four 32-bit message words are added to the four key
 * words, then word 0 is multiplied by word 2 and word 1 by word 3
 * (32x32 => 64, unsigned) and the two products are accumulated into that
 * pass's PASSn_SUMS register.
 *
 * Advances MESSAGE and KEY by 16 bytes each.  Clobbers T0-T7.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	ld1		{T3.16b}, [MESSAGE], #16

	// Load next key stride
	ld1		{\k3\().4s}, [KEY], #16

	// Add message words to key words.  The T3 sum must come last because
	// it overwrites the message words that the other three adds read.
	add		T0.4s, T3.4s, \k0\().4s
	add		T1.4s, T3.4s, \k1\().4s
	add		T2.4s, T3.4s, \k2\().4s
	add		T3.4s, T3.4s, \k3\().4s

	// Multiply 32x32 => 64 and accumulate.  Copy the upper half of each
	// sum (words 2 and 3) into the low half of a scratch register so that
	// umlal pairs word 0 with word 2 and word 1 with word 3.
	mov		T4.d[0], T0.d[1]
	mov		T5.d[0], T1.d[1]
	mov		T6.d[0], T2.d[1]
	mov		T7.d[0], T3.d[1]
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Reads message_len bytes from 'message' and 48 + message_len bytes from
 * 'key' (three key strides up front, then one more per message stride), and
 * writes 32 bytes (four 64-bit sums, pass 0 first) to 'hash'.
 */
SYM_FUNC_START(nh_neon)

	// Preload the first three key strides and zero the accumulators.
	// K3 is filled in by the first _nh_stride.
	ld1		{K0.4s,K1.4s}, [KEY], #32
	movi		PASS0_SUMS.2d, #0
	movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	movi		PASS2_SUMS.2d, #0
	movi		PASS3_SUMS.2d, #0

	// Main loop: 64 bytes (four strides) per iteration.  MESSAGE_LEN is
	// kept biased by -64 so the flags from subs decide whether another
	// full iteration is available.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	// Each stride rotates the key window by one register; after four
	// strides the K0-K3 naming lines up again for the next iteration.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN is now (bytes remaining - 64); its low six bits are the
	// number of bytes remaining, which is 0, 16, 32 or 48 given that
	// message_len is a multiple of 16.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)