/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON vmull.p8 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.fpu		neon

	/*
	 * Register aliases.
	 *
	 * XH and IN1 deliberately name the same register (q4): in each loop
	 * iteration IN1 is consumed before XH is first written.
	 */
	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	/* 64-bit halves of the quad registers named above */
	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	/* scratch registers of __pmull_p8: halves (d10-d19) ... */
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	/* ... and the same registers viewed as quads (q5-q9) */
	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9

	/*
	 * Byte-rotated copies of the hash key halves, filled in once by
	 * pmull_ghash_update_p8: sNl/sNh is SHASH_L/SHASH_H rotated by N bytes.
	 */
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	/* SHASH_L ^ SHASH_H, the key operand of the (a1+a0)(b1+b0) product */
	SHASH2_p8	.req	d28

	/* constant masks 0xffff, 0xffffffff and 0xffffffffffff */
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31

	/*
	 * T2 shares q7 with t2q. It is only used by __pmull_reduce_p8, which
	 * ghash_update invokes after all __pmull_p8 expansions have finished.
	 */
	T2		.req	q7

	.text

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 *
	 * Parameters:
	 *   rq      - quad register receiving the 128-bit product
	 *   ad, bd  - double registers holding the two 64-bit operands
	 *   b1..b4  - optional double registers holding bd rotated by 1, 2, 3
	 *             and 4 bytes. When an argument is left at its default
	 *             (t4l or t3l), the rotation is computed here with vext.8.
	 *
	 * Reads the k16, k32 and k48 masks and overwrites t0q-t4q.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	/*
	 * Fold the middle product XM into XL/XH and perform the shift-and-xor
	 * part of the reduction of XH:XL. NOTE(review): the shift counts
	 * (57/62/63 left, 1/2/7 right in total) look like the usual GHASH
	 * field polynomial reduction - confirm against the specification.
	 *
	 * The result is left split across XL, T1 and XH; the caller completes
	 * it with "veor T1, T1, XH" followed by "veor XL, XL, T1".
	 * Overwrites T1 and T2.
	 */
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	/*
	 * Reverse the bytes within each 64-bit element of register a, but only
	 * when building for CONFIG_CPU_BIG_ENDIAN; expands to nothing otherwise.
	 */
	.macro		vrev64_if_be a
#ifdef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	\a, \a
#endif
	.endm

	/*
	 * Main loop: load the accumulator from [r1] into XL, then for each
	 * 16-byte block at [r2] (post-incremented) xor the block into XL and
	 * multiply by SHASH. r0 holds the number of blocks.
	 *
	 * The count is decremented and tested only after a block has been
	 * loaded, so r0 must be non-zero on entry.
	 *
	 * Expects SHASH, SHASH2_p8, s1l-s4h and k16/k32/k48 to be set up by
	 * the caller. The final accumulator is left in XL; it is not stored.
	 */
	.macro		ghash_update
	vld1.64		{XL}, [r1]
	vrev64_if_be	XL

0:
	vld1.8		{T1}, [r2]!
	subs		r0, r0, #1

	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	/*
	 * IN1 is the byte-reversed block with its halves swapped; it is xored
	 * into XL. T1_L is first made equal to the new XL_H so that the later
	 * "veor T1, T1, XL" leaves a1 + a0 in T1_L.
	 */
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_p8	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_p8	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_p8	XM, T1_L, SHASH2_p8			@ (a1+a0)(b1+b0)

	/* XM ^= a0*b0 ^ a1*b1, leaving only the middle term in XM */
	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p8

	/* finish the reduction started by __pmull_reduce_p8 */
	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

	/*
	 * void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
	 *			      const u8 *src,
	 *			      const struct polyval_elem *h)
	 *
	 * Register usage in the code below: r0 = blocks, r1 = dg, r2 = src,
	 * r3 = h. The key is loaded from [r3], the accumulator is read from
	 * and written back to [r1], and blocks * 16 bytes are consumed from
	 * [r2]. blocks must be at least 1 (see ghash_update).
	 */
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	vrev64_if_be	SHASH
	veor		SHASH2_p8, SHASH_L, SHASH_H

	/* precompute the key rotations passed to __pmull_p8 as b1..b4 */
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	/* masks used by __pmull_p8 */
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update
	vrev64_if_be	XL
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)