xref: /linux/arch/arm64/lib/xor-neon.c (revision 64b14a184e83eb62ea0615e31a409956049d40e7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * arch/arm64/lib/xor-neon.c
4  *
5  * Authors: Jackie Liu <liuyun01@kylinos.cn>
6  * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
7  */
8 
9 #include <linux/raid/xor.h>
10 #include <linux/module.h>
11 #include <asm/neon-intrinsics.h>
12 
13 void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
14 	unsigned long *p2)
15 {
16 	uint64_t *dp1 = (uint64_t *)p1;
17 	uint64_t *dp2 = (uint64_t *)p2;
18 
19 	register uint64x2_t v0, v1, v2, v3;
20 	long lines = bytes / (sizeof(uint64x2_t) * 4);
21 
22 	do {
23 		/* p1 ^= p2 */
24 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
25 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
26 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
27 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
28 
29 		/* store */
30 		vst1q_u64(dp1 +  0, v0);
31 		vst1q_u64(dp1 +  2, v1);
32 		vst1q_u64(dp1 +  4, v2);
33 		vst1q_u64(dp1 +  6, v3);
34 
35 		dp1 += 8;
36 		dp2 += 8;
37 	} while (--lines > 0);
38 }
39 
40 void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
41 	unsigned long *p2, unsigned long *p3)
42 {
43 	uint64_t *dp1 = (uint64_t *)p1;
44 	uint64_t *dp2 = (uint64_t *)p2;
45 	uint64_t *dp3 = (uint64_t *)p3;
46 
47 	register uint64x2_t v0, v1, v2, v3;
48 	long lines = bytes / (sizeof(uint64x2_t) * 4);
49 
50 	do {
51 		/* p1 ^= p2 */
52 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
53 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
54 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
55 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
56 
57 		/* p1 ^= p3 */
58 		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
59 		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
60 		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
61 		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
62 
63 		/* store */
64 		vst1q_u64(dp1 +  0, v0);
65 		vst1q_u64(dp1 +  2, v1);
66 		vst1q_u64(dp1 +  4, v2);
67 		vst1q_u64(dp1 +  6, v3);
68 
69 		dp1 += 8;
70 		dp2 += 8;
71 		dp3 += 8;
72 	} while (--lines > 0);
73 }
74 
75 void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
76 	unsigned long *p2, unsigned long *p3, unsigned long *p4)
77 {
78 	uint64_t *dp1 = (uint64_t *)p1;
79 	uint64_t *dp2 = (uint64_t *)p2;
80 	uint64_t *dp3 = (uint64_t *)p3;
81 	uint64_t *dp4 = (uint64_t *)p4;
82 
83 	register uint64x2_t v0, v1, v2, v3;
84 	long lines = bytes / (sizeof(uint64x2_t) * 4);
85 
86 	do {
87 		/* p1 ^= p2 */
88 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
89 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
90 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
91 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
92 
93 		/* p1 ^= p3 */
94 		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
95 		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
96 		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
97 		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
98 
99 		/* p1 ^= p4 */
100 		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
101 		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
102 		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
103 		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
104 
105 		/* store */
106 		vst1q_u64(dp1 +  0, v0);
107 		vst1q_u64(dp1 +  2, v1);
108 		vst1q_u64(dp1 +  4, v2);
109 		vst1q_u64(dp1 +  6, v3);
110 
111 		dp1 += 8;
112 		dp2 += 8;
113 		dp3 += 8;
114 		dp4 += 8;
115 	} while (--lines > 0);
116 }
117 
118 void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
119 	unsigned long *p2, unsigned long *p3,
120 	unsigned long *p4, unsigned long *p5)
121 {
122 	uint64_t *dp1 = (uint64_t *)p1;
123 	uint64_t *dp2 = (uint64_t *)p2;
124 	uint64_t *dp3 = (uint64_t *)p3;
125 	uint64_t *dp4 = (uint64_t *)p4;
126 	uint64_t *dp5 = (uint64_t *)p5;
127 
128 	register uint64x2_t v0, v1, v2, v3;
129 	long lines = bytes / (sizeof(uint64x2_t) * 4);
130 
131 	do {
132 		/* p1 ^= p2 */
133 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
134 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
135 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
136 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
137 
138 		/* p1 ^= p3 */
139 		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
140 		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
141 		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
142 		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
143 
144 		/* p1 ^= p4 */
145 		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
146 		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
147 		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
148 		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
149 
150 		/* p1 ^= p5 */
151 		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
152 		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
153 		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
154 		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));
155 
156 		/* store */
157 		vst1q_u64(dp1 +  0, v0);
158 		vst1q_u64(dp1 +  2, v1);
159 		vst1q_u64(dp1 +  4, v2);
160 		vst1q_u64(dp1 +  6, v3);
161 
162 		dp1 += 8;
163 		dp2 += 8;
164 		dp3 += 8;
165 		dp4 += 8;
166 		dp5 += 8;
167 	} while (--lines > 0);
168 }
169 
170 struct xor_block_template xor_block_inner_neon __ro_after_init = {
171 	.name	= "__inner_neon__",
172 	.do_2	= xor_arm64_neon_2,
173 	.do_3	= xor_arm64_neon_3,
174 	.do_4	= xor_arm64_neon_4,
175 	.do_5	= xor_arm64_neon_5,
176 };
177 EXPORT_SYMBOL(xor_block_inner_neon);
178 
179 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
180 {
181 	uint64x2_t res;
182 
183 	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
184 	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
185 	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
186 	return res;
187 }
188 
189 static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
190 			     unsigned long *p2, unsigned long *p3)
191 {
192 	uint64_t *dp1 = (uint64_t *)p1;
193 	uint64_t *dp2 = (uint64_t *)p2;
194 	uint64_t *dp3 = (uint64_t *)p3;
195 
196 	register uint64x2_t v0, v1, v2, v3;
197 	long lines = bytes / (sizeof(uint64x2_t) * 4);
198 
199 	do {
200 		/* p1 ^= p2 ^ p3 */
201 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
202 			  vld1q_u64(dp3 + 0));
203 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
204 			  vld1q_u64(dp3 + 2));
205 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
206 			  vld1q_u64(dp3 + 4));
207 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
208 			  vld1q_u64(dp3 + 6));
209 
210 		/* store */
211 		vst1q_u64(dp1 + 0, v0);
212 		vst1q_u64(dp1 + 2, v1);
213 		vst1q_u64(dp1 + 4, v2);
214 		vst1q_u64(dp1 + 6, v3);
215 
216 		dp1 += 8;
217 		dp2 += 8;
218 		dp3 += 8;
219 	} while (--lines > 0);
220 }
221 
222 static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
223 			     unsigned long *p2, unsigned long *p3,
224 			     unsigned long *p4)
225 {
226 	uint64_t *dp1 = (uint64_t *)p1;
227 	uint64_t *dp2 = (uint64_t *)p2;
228 	uint64_t *dp3 = (uint64_t *)p3;
229 	uint64_t *dp4 = (uint64_t *)p4;
230 
231 	register uint64x2_t v0, v1, v2, v3;
232 	long lines = bytes / (sizeof(uint64x2_t) * 4);
233 
234 	do {
235 		/* p1 ^= p2 ^ p3 */
236 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
237 			  vld1q_u64(dp3 + 0));
238 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
239 			  vld1q_u64(dp3 + 2));
240 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
241 			  vld1q_u64(dp3 + 4));
242 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
243 			  vld1q_u64(dp3 + 6));
244 
245 		/* p1 ^= p4 */
246 		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
247 		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
248 		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
249 		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
250 
251 		/* store */
252 		vst1q_u64(dp1 + 0, v0);
253 		vst1q_u64(dp1 + 2, v1);
254 		vst1q_u64(dp1 + 4, v2);
255 		vst1q_u64(dp1 + 6, v3);
256 
257 		dp1 += 8;
258 		dp2 += 8;
259 		dp3 += 8;
260 		dp4 += 8;
261 	} while (--lines > 0);
262 }
263 
264 static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
265 			     unsigned long *p2, unsigned long *p3,
266 			     unsigned long *p4, unsigned long *p5)
267 {
268 	uint64_t *dp1 = (uint64_t *)p1;
269 	uint64_t *dp2 = (uint64_t *)p2;
270 	uint64_t *dp3 = (uint64_t *)p3;
271 	uint64_t *dp4 = (uint64_t *)p4;
272 	uint64_t *dp5 = (uint64_t *)p5;
273 
274 	register uint64x2_t v0, v1, v2, v3;
275 	long lines = bytes / (sizeof(uint64x2_t) * 4);
276 
277 	do {
278 		/* p1 ^= p2 ^ p3 */
279 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
280 			  vld1q_u64(dp3 + 0));
281 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
282 			  vld1q_u64(dp3 + 2));
283 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
284 			  vld1q_u64(dp3 + 4));
285 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
286 			  vld1q_u64(dp3 + 6));
287 
288 		/* p1 ^= p4 ^ p5 */
289 		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
290 		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
291 		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
292 		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
293 
294 		/* store */
295 		vst1q_u64(dp1 + 0, v0);
296 		vst1q_u64(dp1 + 2, v1);
297 		vst1q_u64(dp1 + 4, v2);
298 		vst1q_u64(dp1 + 6, v3);
299 
300 		dp1 += 8;
301 		dp2 += 8;
302 		dp3 += 8;
303 		dp4 += 8;
304 		dp5 += 8;
305 	} while (--lines > 0);
306 }
307 
308 static int __init xor_neon_init(void)
309 {
310 	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
311 		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
312 		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
313 		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
314 	}
315 	return 0;
316 }
317 module_init(xor_neon_init);
318 
319 static void __exit xor_neon_exit(void)
320 {
321 }
322 module_exit(xor_neon_exit);
323 
324 MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
325 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
326 MODULE_LICENSE("GPL");
327