/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	/*
	 * Function arguments, named after the parameters in the nh_neon()
	 * prototype comment below.  KEY and MESSAGE are advanced as data is
	 * consumed (post-increment addressing); MESSAGE_LEN is counted down.
	 */
	KEY		.req	r0
	MESSAGE		.req	r1
	MESSAGE_LEN	.req	r2
	HASH		.req	r3

	/*
	 * Per-pass accumulators.  Each PASSn_SUMS is a 128-bit register that
	 * holds two 64-bit partial sums; PASSn_SUM_A / PASSn_SUM_B name its two
	 * 64-bit halves so they can be added together at the end.
	 */
	PASS0_SUMS	.req	q0
	PASS0_SUM_A	.req	d0
	PASS0_SUM_B	.req	d1
	PASS1_SUMS	.req	q1
	PASS1_SUM_A	.req	d2
	PASS1_SUM_B	.req	d3
	PASS2_SUMS	.req	q2
	PASS2_SUM_A	.req	d4
	PASS2_SUM_B	.req	d5
	PASS3_SUMS	.req	q3
	PASS3_SUM_A	.req	d6
	PASS3_SUM_B	.req	d7

	/*
	 * Sliding window of four consecutive 16-byte key strides.  Which
	 * register plays which role rotates from one _nh_stride invocation to
	 * the next (see the argument lists used in nh_neon).
	 *
	 * NOTE(review): q4-q7 are callee-saved under the AAPCS and are not
	 * saved here; presumably the caller preserves NEON state around this
	 * function (e.g. kernel_neon_begin()/kernel_neon_end()) - confirm.
	 */
	K0		.req	q4
	K1		.req	q5
	K2		.req	q6
	K3		.req	q7

	/*
	 * Temporaries.  Tn_L / Tn_H are the low / high 64-bit halves of Tn.
	 * T3 doubles as the register the message stride is loaded into.
	 */
	T0		.req	q8
	T0_L		.req	d16
	T0_H		.req	d17
	T1		.req	q9
	T1_L		.req	d18
	T1_H		.req	d19
	T2		.req	q10
	T2_L		.req	d20
	T2_H		.req	d21
	T3		.req	q11
	T3_L		.req	d22
	T3_H		.req	d23

/*
 * _nh_stride - process one 16-byte message stride for all four passes.
 *
 * k0, k1, k2 must already hold the key strides used by passes 0, 1 and 2.
 * k3 is loaded here from [KEY] and is used by pass 3.  Both MESSAGE and KEY
 * are advanced by 16 bytes.  Clobbers T0-T3.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	vld1.8		{T3}, [MESSAGE]!

	// Load next key stride
	vld1.32		{\k3}, [KEY]!

	// Add message words to key words
	// (32-bit lane-wise adds; T3 is overwritten last because it is also
	// the source message stride for the three adds before it)
	vadd.u32	T0, T3, \k0
	vadd.u32	T1, T3, \k1
	vadd.u32	T2, T3, \k2
	vadd.u32	T3, T3, \k3

	// Multiply 32x32 => 64 and accumulate
	// (lane i of the low half times lane i of the high half, i.e. word 0
	// by word 2 and word 1 by word 3, added into the two 64-bit lanes of
	// the pass accumulator)
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Computes four 64-bit sums (one per pass) over the message and stores them
 * consecutively to 'hash' (32 bytes in total).  The first 48 bytes of 'key'
 * are read up front and a further 16 bytes are read for every 16 bytes of
 * message.
 */
ENTRY(nh_neon)

	// Preload the first three key strides (the fourth is loaded by the
	// first _nh_stride) and zero the four pass accumulators.
	vld1.32		{K0,K1}, [KEY]!
	vmov.u64	PASS0_SUMS, #0
	vmov.u64	PASS1_SUMS, #0
	vld1.32		{K2}, [KEY]!
	vmov.u64	PASS2_SUMS, #0
	vmov.u64	PASS3_SUMS, #0

	// Main loop: 4 strides (64 bytes) per iteration.  MESSAGE_LEN is
	// biased by -64 so that the sign of the result tells whether another
	// full 64-byte group remains.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	// The key registers rotate by one position per stride, so after four
	// strides every register is back in its original role.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN is negative here; its low 6 bits are the number of
	// message bytes still to process.  With message_len % 16 == 0 that is
	// 0, 16, 32 or 48, i.e. at most three more strides.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	// (T0 = {pass 0, pass 1}, T1 = {pass 2, pass 3})
	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0-T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)