xref: /linux/lib/raid/xor/arm64/xor-eor3.c (revision 9611c0ce215a66770ccbe5c126bf57ba8c31bcad)
1 // SPDX-License-Identifier: GPL-2.0-only
2 
3 #include <linux/cache.h>
4 #include <asm/neon-intrinsics.h>
5 #include "xor_impl.h"
6 #include "xor_arch.h"
7 #include "xor-neon.h"
8 
/*
 * Two-buffer XOR helper. It is only declared here; the definition lives
 * outside this file, and the symbol is handed to __DO_XOR_BLOCKS() at the
 * bottom of this file as the two-buffer routine.
 *
 * NOTE(review): presumably computes p1 ^= p2 over @bytes bytes, mirroring the
 * three/four/five-buffer helpers in this file -- confirm against its
 * definition.
 */
extern void __xor_eor3_2(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2);
11 
12 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
13 {
14 	uint64x2_t res;
15 
16 	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
17 	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
18 	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
19 	return res;
20 }
21 
22 static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
23 		const unsigned long * __restrict p2,
24 		const unsigned long * __restrict p3)
25 {
26 	uint64_t *dp1 = (uint64_t *)p1;
27 	uint64_t *dp2 = (uint64_t *)p2;
28 	uint64_t *dp3 = (uint64_t *)p3;
29 
30 	register uint64x2_t v0, v1, v2, v3;
31 	long lines = bytes / (sizeof(uint64x2_t) * 4);
32 
33 	do {
34 		/* p1 ^= p2 ^ p3 */
35 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
36 			  vld1q_u64(dp3 + 0));
37 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
38 			  vld1q_u64(dp3 + 2));
39 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
40 			  vld1q_u64(dp3 + 4));
41 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
42 			  vld1q_u64(dp3 + 6));
43 
44 		/* store */
45 		vst1q_u64(dp1 + 0, v0);
46 		vst1q_u64(dp1 + 2, v1);
47 		vst1q_u64(dp1 + 4, v2);
48 		vst1q_u64(dp1 + 6, v3);
49 
50 		dp1 += 8;
51 		dp2 += 8;
52 		dp3 += 8;
53 	} while (--lines > 0);
54 }
55 
56 static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
57 		const unsigned long * __restrict p2,
58 		const unsigned long * __restrict p3,
59 		const unsigned long * __restrict p4)
60 {
61 	uint64_t *dp1 = (uint64_t *)p1;
62 	uint64_t *dp2 = (uint64_t *)p2;
63 	uint64_t *dp3 = (uint64_t *)p3;
64 	uint64_t *dp4 = (uint64_t *)p4;
65 
66 	register uint64x2_t v0, v1, v2, v3;
67 	long lines = bytes / (sizeof(uint64x2_t) * 4);
68 
69 	do {
70 		/* p1 ^= p2 ^ p3 */
71 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
72 			  vld1q_u64(dp3 + 0));
73 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
74 			  vld1q_u64(dp3 + 2));
75 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
76 			  vld1q_u64(dp3 + 4));
77 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
78 			  vld1q_u64(dp3 + 6));
79 
80 		/* p1 ^= p4 */
81 		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
82 		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
83 		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
84 		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
85 
86 		/* store */
87 		vst1q_u64(dp1 + 0, v0);
88 		vst1q_u64(dp1 + 2, v1);
89 		vst1q_u64(dp1 + 4, v2);
90 		vst1q_u64(dp1 + 6, v3);
91 
92 		dp1 += 8;
93 		dp2 += 8;
94 		dp3 += 8;
95 		dp4 += 8;
96 	} while (--lines > 0);
97 }
98 
99 static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
100 		const unsigned long * __restrict p2,
101 		const unsigned long * __restrict p3,
102 		const unsigned long * __restrict p4,
103 		const unsigned long * __restrict p5)
104 {
105 	uint64_t *dp1 = (uint64_t *)p1;
106 	uint64_t *dp2 = (uint64_t *)p2;
107 	uint64_t *dp3 = (uint64_t *)p3;
108 	uint64_t *dp4 = (uint64_t *)p4;
109 	uint64_t *dp5 = (uint64_t *)p5;
110 
111 	register uint64x2_t v0, v1, v2, v3;
112 	long lines = bytes / (sizeof(uint64x2_t) * 4);
113 
114 	do {
115 		/* p1 ^= p2 ^ p3 */
116 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
117 			  vld1q_u64(dp3 + 0));
118 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
119 			  vld1q_u64(dp3 + 2));
120 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
121 			  vld1q_u64(dp3 + 4));
122 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
123 			  vld1q_u64(dp3 + 6));
124 
125 		/* p1 ^= p4 ^ p5 */
126 		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
127 		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
128 		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
129 		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
130 
131 		/* store */
132 		vst1q_u64(dp1 + 0, v0);
133 		vst1q_u64(dp1 + 2, v1);
134 		vst1q_u64(dp1 + 4, v2);
135 		vst1q_u64(dp1 + 6, v3);
136 
137 		dp1 += 8;
138 		dp2 += 8;
139 		dp3 += 8;
140 		dp4 += 8;
141 		dp5 += 8;
142 	} while (--lines > 0);
143 }
144 
/*
 * Pass the name "eor3_inner" and the two- to five-buffer helpers to
 * __DO_XOR_BLOCKS(), which is defined outside this file.
 *
 * NOTE(review): presumably the macro generates an entry point that picks one
 * of these helpers according to the number of source buffers -- confirm
 * against the macro definition.
 */
__DO_XOR_BLOCKS(eor3_inner, __xor_eor3_2, __xor_eor3_3, __xor_eor3_4,
		__xor_eor3_5);
147