1*4d3c5cbfSArd Biesheuvel // SPDX-License-Identifier: GPL-2.0-only 2*4d3c5cbfSArd Biesheuvel 3*4d3c5cbfSArd Biesheuvel #include <linux/cache.h> 4*4d3c5cbfSArd Biesheuvel #include <asm/neon-intrinsics.h> 5*4d3c5cbfSArd Biesheuvel #include "xor_impl.h" 6*4d3c5cbfSArd Biesheuvel #include "xor_arch.h" 7*4d3c5cbfSArd Biesheuvel #include "xor-neon.h" 8*4d3c5cbfSArd Biesheuvel 9*4d3c5cbfSArd Biesheuvel extern void __xor_eor3_2(unsigned long bytes, unsigned long * __restrict p1, 10*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p2); 11*4d3c5cbfSArd Biesheuvel 12*4d3c5cbfSArd Biesheuvel static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) 13*4d3c5cbfSArd Biesheuvel { 14*4d3c5cbfSArd Biesheuvel uint64x2_t res; 15*4d3c5cbfSArd Biesheuvel 16*4d3c5cbfSArd Biesheuvel asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" 17*4d3c5cbfSArd Biesheuvel "eor3 %0.16b, %1.16b, %2.16b, %3.16b" 18*4d3c5cbfSArd Biesheuvel : "=w"(res) : "w"(p), "w"(q), "w"(r)); 19*4d3c5cbfSArd Biesheuvel return res; 20*4d3c5cbfSArd Biesheuvel } 21*4d3c5cbfSArd Biesheuvel 22*4d3c5cbfSArd Biesheuvel static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, 23*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p2, 24*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p3) 25*4d3c5cbfSArd Biesheuvel { 26*4d3c5cbfSArd Biesheuvel uint64_t *dp1 = (uint64_t *)p1; 27*4d3c5cbfSArd Biesheuvel uint64_t *dp2 = (uint64_t *)p2; 28*4d3c5cbfSArd Biesheuvel uint64_t *dp3 = (uint64_t *)p3; 29*4d3c5cbfSArd Biesheuvel 30*4d3c5cbfSArd Biesheuvel register uint64x2_t v0, v1, v2, v3; 31*4d3c5cbfSArd Biesheuvel long lines = bytes / (sizeof(uint64x2_t) * 4); 32*4d3c5cbfSArd Biesheuvel 33*4d3c5cbfSArd Biesheuvel do { 34*4d3c5cbfSArd Biesheuvel /* p1 ^= p2 ^ p3 */ 35*4d3c5cbfSArd Biesheuvel v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), 36*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 0)); 37*4d3c5cbfSArd Biesheuvel v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), 38*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 2)); 39*4d3c5cbfSArd Biesheuvel v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), 40*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 4)); 41*4d3c5cbfSArd Biesheuvel v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), 42*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 6)); 43*4d3c5cbfSArd Biesheuvel 44*4d3c5cbfSArd Biesheuvel /* store */ 45*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 0, v0); 46*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 2, v1); 47*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 4, v2); 48*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 6, v3); 49*4d3c5cbfSArd Biesheuvel 50*4d3c5cbfSArd Biesheuvel dp1 += 8; 51*4d3c5cbfSArd Biesheuvel dp2 += 8; 52*4d3c5cbfSArd Biesheuvel dp3 += 8; 53*4d3c5cbfSArd Biesheuvel } while (--lines > 0); 54*4d3c5cbfSArd Biesheuvel } 55*4d3c5cbfSArd Biesheuvel 56*4d3c5cbfSArd Biesheuvel static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, 57*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p2, 58*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p3, 59*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p4) 60*4d3c5cbfSArd Biesheuvel { 61*4d3c5cbfSArd Biesheuvel uint64_t *dp1 = (uint64_t *)p1; 62*4d3c5cbfSArd Biesheuvel uint64_t *dp2 = (uint64_t *)p2; 63*4d3c5cbfSArd Biesheuvel uint64_t *dp3 = (uint64_t *)p3; 64*4d3c5cbfSArd Biesheuvel uint64_t *dp4 = (uint64_t *)p4; 65*4d3c5cbfSArd Biesheuvel 66*4d3c5cbfSArd Biesheuvel register uint64x2_t v0, v1, v2, v3; 67*4d3c5cbfSArd Biesheuvel long lines = bytes / (sizeof(uint64x2_t) * 4); 68*4d3c5cbfSArd Biesheuvel 69*4d3c5cbfSArd Biesheuvel do { 70*4d3c5cbfSArd Biesheuvel /* p1 ^= p2 ^ p3 */ 71*4d3c5cbfSArd Biesheuvel v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), 72*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 0)); 73*4d3c5cbfSArd Biesheuvel v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), 74*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 2)); 75*4d3c5cbfSArd Biesheuvel v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), 76*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 4)); 77*4d3c5cbfSArd Biesheuvel v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), 78*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 6)); 79*4d3c5cbfSArd Biesheuvel 80*4d3c5cbfSArd Biesheuvel /* p1 ^= p4 */ 81*4d3c5cbfSArd Biesheuvel v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 82*4d3c5cbfSArd Biesheuvel v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 83*4d3c5cbfSArd Biesheuvel v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 84*4d3c5cbfSArd Biesheuvel v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 85*4d3c5cbfSArd Biesheuvel 86*4d3c5cbfSArd Biesheuvel /* store */ 87*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 0, v0); 88*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 2, v1); 89*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 4, v2); 90*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 6, v3); 91*4d3c5cbfSArd Biesheuvel 92*4d3c5cbfSArd Biesheuvel dp1 += 8; 93*4d3c5cbfSArd Biesheuvel dp2 += 8; 94*4d3c5cbfSArd Biesheuvel dp3 += 8; 95*4d3c5cbfSArd Biesheuvel dp4 += 8; 96*4d3c5cbfSArd Biesheuvel } while (--lines > 0); 97*4d3c5cbfSArd Biesheuvel } 98*4d3c5cbfSArd Biesheuvel 99*4d3c5cbfSArd Biesheuvel static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, 100*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p2, 101*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p3, 102*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p4, 103*4d3c5cbfSArd Biesheuvel const unsigned long * __restrict p5) 104*4d3c5cbfSArd Biesheuvel { 105*4d3c5cbfSArd Biesheuvel uint64_t *dp1 = (uint64_t *)p1; 106*4d3c5cbfSArd Biesheuvel uint64_t *dp2 = (uint64_t *)p2; 107*4d3c5cbfSArd Biesheuvel uint64_t *dp3 = (uint64_t *)p3; 108*4d3c5cbfSArd Biesheuvel uint64_t *dp4 = (uint64_t *)p4; 109*4d3c5cbfSArd Biesheuvel uint64_t *dp5 = (uint64_t *)p5; 110*4d3c5cbfSArd Biesheuvel 111*4d3c5cbfSArd Biesheuvel register uint64x2_t v0, v1, v2, v3; 112*4d3c5cbfSArd Biesheuvel long lines = bytes / (sizeof(uint64x2_t) * 4); 113*4d3c5cbfSArd Biesheuvel 114*4d3c5cbfSArd Biesheuvel do { 115*4d3c5cbfSArd Biesheuvel /* p1 ^= p2 ^ p3 */ 116*4d3c5cbfSArd Biesheuvel v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), 117*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 0)); 118*4d3c5cbfSArd Biesheuvel v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), 119*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 2)); 120*4d3c5cbfSArd Biesheuvel v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), 121*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 4)); 122*4d3c5cbfSArd Biesheuvel v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), 123*4d3c5cbfSArd Biesheuvel vld1q_u64(dp3 + 6)); 124*4d3c5cbfSArd Biesheuvel 125*4d3c5cbfSArd Biesheuvel /* p1 ^= p4 ^ p5 */ 126*4d3c5cbfSArd Biesheuvel v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); 127*4d3c5cbfSArd Biesheuvel v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); 128*4d3c5cbfSArd Biesheuvel v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); 129*4d3c5cbfSArd Biesheuvel v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); 130*4d3c5cbfSArd Biesheuvel 131*4d3c5cbfSArd Biesheuvel /* store */ 132*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 0, v0); 133*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 2, v1); 134*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 4, v2); 135*4d3c5cbfSArd Biesheuvel vst1q_u64(dp1 + 6, v3); 136*4d3c5cbfSArd Biesheuvel 137*4d3c5cbfSArd Biesheuvel dp1 += 8; 138*4d3c5cbfSArd Biesheuvel dp2 += 8; 139*4d3c5cbfSArd Biesheuvel dp3 += 8; 140*4d3c5cbfSArd Biesheuvel dp4 += 8; 141*4d3c5cbfSArd Biesheuvel dp5 += 8; 142*4d3c5cbfSArd Biesheuvel } while (--lines > 0); 143*4d3c5cbfSArd Biesheuvel } 144*4d3c5cbfSArd Biesheuvel 145*4d3c5cbfSArd Biesheuvel __DO_XOR_BLOCKS(eor3_inner, __xor_eor3_2, __xor_eor3_3, __xor_eor3_4, 146*4d3c5cbfSArd Biesheuvel __xor_eor3_5); 147