// SPDX-License-Identifier: GPL-2.0-only
/*
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/cache.h>
#include <asm/neon-intrinsics.h>
#include "xor_impl.h"
#include "xor_arch.h"
#include "xor-neon.h"

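/*
 * Each helper below XORs its source buffers into p1 in 64-byte chunks:
 * four 128-bit NEON vectors (uint64x2_t) are loaded, combined with
 * veorq_u64() and stored back per loop iteration. The do/while loops
 * assume 'bytes' is a non-zero multiple of 64, which presumably holds
 * for the page-sized buffers the XOR block layer hands in.
 */
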
static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

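/*
 * For reference, one iteration of the vectorised loop above is
 * equivalent to the following scalar sketch (illustration only, not
 * part of the build):
 *
 *	int i;
 *
 *	for (i = 0; i < 8; i++)
 *		dp1[i] ^= dp2[i];
 *
 * The wider helpers below repeat the same pattern with three, four and
 * five source buffers.
 */
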
static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2,
		const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2,
		const unsigned long * __restrict p3,
		const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2,
		const unsigned long * __restrict p3,
		const unsigned long * __restrict p4,
		const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

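/*
 * __DO_XOR_BLOCKS comes from xor_impl.h, which is not shown here; it
 * presumably expands to the boilerplate that binds the 2..5-source
 * helpers above into a dispatch table named after its first argument.
 */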
__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
		__xor_neon_5);

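/*
 * EOR3 is part of the ARMv8.2 SHA3 extension: it XORs three vector
 * operands in a single instruction, roughly halving the number of EORs
 * needed by the plain NEON helpers above. ACLE also offers a
 * veor3q_u64() intrinsic for this, but that would require building the
 * whole file with the sha3 target feature; the inline asm below, with
 * a local ".arch_extension sha3" directive, presumably sidesteps that
 * toolchain requirement. The eor3 helpers must only be dispatched on
 * CPUs that implement SHA3; that runtime check is assumed to live
 * behind xor_arch.h.
 */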
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2,
		const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2,
		const unsigned long * __restrict p3,
		const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
		const unsigned long * __restrict p2,
		const unsigned long * __restrict p3,
		const unsigned long * __restrict p4,
		const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

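/*
 * EOR3 needs three operands, so the two-source case gains nothing from
 * it; the eor3 dispatch table reuses the plain NEON __xor_neon_2
 * helper for that slot.
 */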
__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4,
		__xor_eor3_5);