1 // SPDX-License-Identifier: GPL-2.0-only 2 3 #include <linux/cache.h> 4 #include <asm/neon-intrinsics.h> 5 #include "xor_impl.h" 6 #include "xor_arch.h" 7 #include "xor-neon.h" 8 9 extern void __xor_eor3_2(unsigned long bytes, unsigned long * __restrict p1, 10 const unsigned long * __restrict p2); 11 12 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) 13 { 14 uint64x2_t res; 15 16 asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" 17 "eor3 %0.16b, %1.16b, %2.16b, %3.16b" 18 : "=w"(res) : "w"(p), "w"(q), "w"(r)); 19 return res; 20 } 21 22 static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, 23 const unsigned long * __restrict p2, 24 const unsigned long * __restrict p3) 25 { 26 uint64_t *dp1 = (uint64_t *)p1; 27 uint64_t *dp2 = (uint64_t *)p2; 28 uint64_t *dp3 = (uint64_t *)p3; 29 30 register uint64x2_t v0, v1, v2, v3; 31 long lines = bytes / (sizeof(uint64x2_t) * 4); 32 33 do { 34 /* p1 ^= p2 ^ p3 */ 35 v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), 36 vld1q_u64(dp3 + 0)); 37 v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), 38 vld1q_u64(dp3 + 2)); 39 v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), 40 vld1q_u64(dp3 + 4)); 41 v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), 42 vld1q_u64(dp3 + 6)); 43 44 /* store */ 45 vst1q_u64(dp1 + 0, v0); 46 vst1q_u64(dp1 + 2, v1); 47 vst1q_u64(dp1 + 4, v2); 48 vst1q_u64(dp1 + 6, v3); 49 50 dp1 += 8; 51 dp2 += 8; 52 dp3 += 8; 53 } while (--lines > 0); 54 } 55 56 static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, 57 const unsigned long * __restrict p2, 58 const unsigned long * __restrict p3, 59 const unsigned long * __restrict p4) 60 { 61 uint64_t *dp1 = (uint64_t *)p1; 62 uint64_t *dp2 = (uint64_t *)p2; 63 uint64_t *dp3 = (uint64_t *)p3; 64 uint64_t *dp4 = (uint64_t *)p4; 65 66 register uint64x2_t v0, v1, v2, v3; 67 long lines = bytes / (sizeof(uint64x2_t) * 4); 68 69 do { 70 /* p1 ^= p2 ^ p3 */ 71 v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), 72 vld1q_u64(dp3 + 0)); 73 v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), 74 vld1q_u64(dp3 + 2)); 75 v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), 76 vld1q_u64(dp3 + 4)); 77 v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), 78 vld1q_u64(dp3 + 6)); 79 80 /* p1 ^= p4 */ 81 v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 82 v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 83 v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 84 v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 85 86 /* store */ 87 vst1q_u64(dp1 + 0, v0); 88 vst1q_u64(dp1 + 2, v1); 89 vst1q_u64(dp1 + 4, v2); 90 vst1q_u64(dp1 + 6, v3); 91 92 dp1 += 8; 93 dp2 += 8; 94 dp3 += 8; 95 dp4 += 8; 96 } while (--lines > 0); 97 } 98 99 static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, 100 const unsigned long * __restrict p2, 101 const unsigned long * __restrict p3, 102 const unsigned long * __restrict p4, 103 const unsigned long * __restrict p5) 104 { 105 uint64_t *dp1 = (uint64_t *)p1; 106 uint64_t *dp2 = (uint64_t *)p2; 107 uint64_t *dp3 = (uint64_t *)p3; 108 uint64_t *dp4 = (uint64_t *)p4; 109 uint64_t *dp5 = (uint64_t *)p5; 110 111 register uint64x2_t v0, v1, v2, v3; 112 long lines = bytes / (sizeof(uint64x2_t) * 4); 113 114 do { 115 /* p1 ^= p2 ^ p3 */ 116 v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), 117 vld1q_u64(dp3 + 0)); 118 v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), 119 vld1q_u64(dp3 + 2)); 120 v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), 121 vld1q_u64(dp3 + 4)); 122 v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), 123 vld1q_u64(dp3 + 6)); 124 125 /* p1 ^= p4 ^ p5 */ 126 v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); 127 v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); 128 v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); 129 v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); 130 131 /* store */ 132 vst1q_u64(dp1 + 0, v0); 133 vst1q_u64(dp1 + 2, v1); 134 vst1q_u64(dp1 + 4, v2); 135 vst1q_u64(dp1 + 6, v3); 136 137 dp1 += 8; 138 dp2 += 8; 139 dp3 += 8; 140 dp4 += 8; 141 dp5 += 8; 142 } while (--lines > 0); 143 } 144 145 __DO_XOR_BLOCKS(eor3_inner, __xor_eor3_2, __xor_eor3_3, __xor_eor3_4, 146 __xor_eor3_5); 147