xref: /linux/lib/raid6/loongarch_simd.c (revision 24168c5e6dfbdd5b414f048f47f75d64533296ca)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
4  *
5  * Copyright 2023 WANG Xuerui <git@xen0n.name>
6  *
7  * Based on the generic RAID-6 code (int.uc):
8  *
9  * Copyright 2002-2004 H. Peter Anvin
10  */
11 
12 #include <linux/raid/pq.h>
13 #include "loongarch.h"
14 
/*
 * The vector algorithms are currently priority 0, which means the generic
 * scalar algorithms are not disabled when vector support is present.
 * This mirrors the similar LoongArch RAID5 XOR code, and the main reason is
 * repeated here: it cannot be ruled out at this point in time that some
 * future (maybe reduced) models could run the vector algorithms slower than
 * the scalar ones, maybe for errata or micro-op reasons. It may be
 * appropriate to revisit this after one or two more uarch generations.
 */
24 
25 #ifdef CONFIG_CPU_HAS_LSX
26 #define NSIZE 16
27 
28 static int raid6_has_lsx(void)
29 {
30 	return cpu_has_lsx;
31 }
32 
33 static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
34 {
35 	u8 **dptr = (u8 **)ptrs;
36 	u8 *p, *q;
37 	int d, z, z0;
38 
39 	z0 = disks - 3;		/* Highest data disk */
40 	p = dptr[z0+1];		/* XOR parity */
41 	q = dptr[z0+2];		/* RS syndrome */
42 
43 	kernel_fpu_begin();
44 
45 	/*
46 	 * $vr0, $vr1, $vr2, $vr3: wp
47 	 * $vr4, $vr5, $vr6, $vr7: wq
48 	 * $vr8, $vr9, $vr10, $vr11: wd
49 	 * $vr12, $vr13, $vr14, $vr15: w2
50 	 * $vr16, $vr17, $vr18, $vr19: w1
51 	 */
52 	for (d = 0; d < bytes; d += NSIZE*4) {
53 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
54 		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
55 		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
56 		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
57 		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
58 		asm volatile("vori.b $vr4, $vr0, 0");
59 		asm volatile("vori.b $vr5, $vr1, 0");
60 		asm volatile("vori.b $vr6, $vr2, 0");
61 		asm volatile("vori.b $vr7, $vr3, 0");
62 		for (z = z0-1; z >= 0; z--) {
63 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
64 			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
65 			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
66 			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
67 			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
68 			/* wp$$ ^= wd$$; */
69 			asm volatile("vxor.v $vr0, $vr0, $vr8");
70 			asm volatile("vxor.v $vr1, $vr1, $vr9");
71 			asm volatile("vxor.v $vr2, $vr2, $vr10");
72 			asm volatile("vxor.v $vr3, $vr3, $vr11");
73 			/* w2$$ = MASK(wq$$); */
74 			asm volatile("vslti.b $vr12, $vr4, 0");
75 			asm volatile("vslti.b $vr13, $vr5, 0");
76 			asm volatile("vslti.b $vr14, $vr6, 0");
77 			asm volatile("vslti.b $vr15, $vr7, 0");
78 			/* w1$$ = SHLBYTE(wq$$); */
79 			asm volatile("vslli.b $vr16, $vr4, 1");
80 			asm volatile("vslli.b $vr17, $vr5, 1");
81 			asm volatile("vslli.b $vr18, $vr6, 1");
82 			asm volatile("vslli.b $vr19, $vr7, 1");
83 			/* w2$$ &= NBYTES(0x1d); */
84 			asm volatile("vandi.b $vr12, $vr12, 0x1d");
85 			asm volatile("vandi.b $vr13, $vr13, 0x1d");
86 			asm volatile("vandi.b $vr14, $vr14, 0x1d");
87 			asm volatile("vandi.b $vr15, $vr15, 0x1d");
88 			/* w1$$ ^= w2$$; */
89 			asm volatile("vxor.v $vr16, $vr16, $vr12");
90 			asm volatile("vxor.v $vr17, $vr17, $vr13");
91 			asm volatile("vxor.v $vr18, $vr18, $vr14");
92 			asm volatile("vxor.v $vr19, $vr19, $vr15");
93 			/* wq$$ = w1$$ ^ wd$$; */
94 			asm volatile("vxor.v $vr4, $vr16, $vr8");
95 			asm volatile("vxor.v $vr5, $vr17, $vr9");
96 			asm volatile("vxor.v $vr6, $vr18, $vr10");
97 			asm volatile("vxor.v $vr7, $vr19, $vr11");
98 		}
99 		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
100 		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
101 		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
102 		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
103 		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
104 		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
105 		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
106 		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
107 		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
108 		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
109 	}
110 
111 	kernel_fpu_end();
112 }
113 
114 static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
115 				   size_t bytes, void **ptrs)
116 {
117 	u8 **dptr = (u8 **)ptrs;
118 	u8 *p, *q;
119 	int d, z, z0;
120 
121 	z0 = stop;		/* P/Q right side optimization */
122 	p = dptr[disks-2];	/* XOR parity */
123 	q = dptr[disks-1];	/* RS syndrome */
124 
125 	kernel_fpu_begin();
126 
127 	/*
128 	 * $vr0, $vr1, $vr2, $vr3: wp
129 	 * $vr4, $vr5, $vr6, $vr7: wq
130 	 * $vr8, $vr9, $vr10, $vr11: wd
131 	 * $vr12, $vr13, $vr14, $vr15: w2
132 	 * $vr16, $vr17, $vr18, $vr19: w1
133 	 */
134 	for (d = 0; d < bytes; d += NSIZE*4) {
135 		/* P/Q data pages */
136 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
137 		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
138 		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
139 		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
140 		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
141 		asm volatile("vori.b $vr4, $vr0, 0");
142 		asm volatile("vori.b $vr5, $vr1, 0");
143 		asm volatile("vori.b $vr6, $vr2, 0");
144 		asm volatile("vori.b $vr7, $vr3, 0");
145 		for (z = z0-1; z >= start; z--) {
146 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
147 			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
148 			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
149 			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
150 			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
151 			/* wp$$ ^= wd$$; */
152 			asm volatile("vxor.v $vr0, $vr0, $vr8");
153 			asm volatile("vxor.v $vr1, $vr1, $vr9");
154 			asm volatile("vxor.v $vr2, $vr2, $vr10");
155 			asm volatile("vxor.v $vr3, $vr3, $vr11");
156 			/* w2$$ = MASK(wq$$); */
157 			asm volatile("vslti.b $vr12, $vr4, 0");
158 			asm volatile("vslti.b $vr13, $vr5, 0");
159 			asm volatile("vslti.b $vr14, $vr6, 0");
160 			asm volatile("vslti.b $vr15, $vr7, 0");
161 			/* w1$$ = SHLBYTE(wq$$); */
162 			asm volatile("vslli.b $vr16, $vr4, 1");
163 			asm volatile("vslli.b $vr17, $vr5, 1");
164 			asm volatile("vslli.b $vr18, $vr6, 1");
165 			asm volatile("vslli.b $vr19, $vr7, 1");
166 			/* w2$$ &= NBYTES(0x1d); */
167 			asm volatile("vandi.b $vr12, $vr12, 0x1d");
168 			asm volatile("vandi.b $vr13, $vr13, 0x1d");
169 			asm volatile("vandi.b $vr14, $vr14, 0x1d");
170 			asm volatile("vandi.b $vr15, $vr15, 0x1d");
171 			/* w1$$ ^= w2$$; */
172 			asm volatile("vxor.v $vr16, $vr16, $vr12");
173 			asm volatile("vxor.v $vr17, $vr17, $vr13");
174 			asm volatile("vxor.v $vr18, $vr18, $vr14");
175 			asm volatile("vxor.v $vr19, $vr19, $vr15");
176 			/* wq$$ = w1$$ ^ wd$$; */
177 			asm volatile("vxor.v $vr4, $vr16, $vr8");
178 			asm volatile("vxor.v $vr5, $vr17, $vr9");
179 			asm volatile("vxor.v $vr6, $vr18, $vr10");
180 			asm volatile("vxor.v $vr7, $vr19, $vr11");
181 		}
182 
183 		/* P/Q left side optimization */
184 		for (z = start-1; z >= 0; z--) {
185 			/* w2$$ = MASK(wq$$); */
186 			asm volatile("vslti.b $vr12, $vr4, 0");
187 			asm volatile("vslti.b $vr13, $vr5, 0");
188 			asm volatile("vslti.b $vr14, $vr6, 0");
189 			asm volatile("vslti.b $vr15, $vr7, 0");
190 			/* w1$$ = SHLBYTE(wq$$); */
191 			asm volatile("vslli.b $vr16, $vr4, 1");
192 			asm volatile("vslli.b $vr17, $vr5, 1");
193 			asm volatile("vslli.b $vr18, $vr6, 1");
194 			asm volatile("vslli.b $vr19, $vr7, 1");
195 			/* w2$$ &= NBYTES(0x1d); */
196 			asm volatile("vandi.b $vr12, $vr12, 0x1d");
197 			asm volatile("vandi.b $vr13, $vr13, 0x1d");
198 			asm volatile("vandi.b $vr14, $vr14, 0x1d");
199 			asm volatile("vandi.b $vr15, $vr15, 0x1d");
200 			/* wq$$ = w1$$ ^ w2$$; */
201 			asm volatile("vxor.v $vr4, $vr16, $vr12");
202 			asm volatile("vxor.v $vr5, $vr17, $vr13");
203 			asm volatile("vxor.v $vr6, $vr18, $vr14");
204 			asm volatile("vxor.v $vr7, $vr19, $vr15");
205 		}
206 		/*
207 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
208 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
209 		 */
210 		asm volatile(
211 			"vld $vr20, %0\n\t"
212 			"vld $vr21, %1\n\t"
213 			"vld $vr22, %2\n\t"
214 			"vld $vr23, %3\n\t"
215 			"vld $vr24, %4\n\t"
216 			"vld $vr25, %5\n\t"
217 			"vld $vr26, %6\n\t"
218 			"vld $vr27, %7\n\t"
219 			"vxor.v $vr20, $vr20, $vr0\n\t"
220 			"vxor.v $vr21, $vr21, $vr1\n\t"
221 			"vxor.v $vr22, $vr22, $vr2\n\t"
222 			"vxor.v $vr23, $vr23, $vr3\n\t"
223 			"vxor.v $vr24, $vr24, $vr4\n\t"
224 			"vxor.v $vr25, $vr25, $vr5\n\t"
225 			"vxor.v $vr26, $vr26, $vr6\n\t"
226 			"vxor.v $vr27, $vr27, $vr7\n\t"
227 			"vst $vr20, %0\n\t"
228 			"vst $vr21, %1\n\t"
229 			"vst $vr22, %2\n\t"
230 			"vst $vr23, %3\n\t"
231 			"vst $vr24, %4\n\t"
232 			"vst $vr25, %5\n\t"
233 			"vst $vr26, %6\n\t"
234 			"vst $vr27, %7\n\t"
235 			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
236 			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
237 			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
238 			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
239 		);
240 	}
241 
242 	kernel_fpu_end();
243 }
244 
245 const struct raid6_calls raid6_lsx = {
246 	raid6_lsx_gen_syndrome,
247 	raid6_lsx_xor_syndrome,
248 	raid6_has_lsx,
249 	"lsx",
250 	.priority = 0 /* see the comment near the top of the file for reason */
251 };
252 
253 #undef NSIZE
254 #endif /* CONFIG_CPU_HAS_LSX */
255 
256 #ifdef CONFIG_CPU_HAS_LASX
257 #define NSIZE 32
258 
259 static int raid6_has_lasx(void)
260 {
261 	return cpu_has_lasx;
262 }
263 
264 static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
265 {
266 	u8 **dptr = (u8 **)ptrs;
267 	u8 *p, *q;
268 	int d, z, z0;
269 
270 	z0 = disks - 3;		/* Highest data disk */
271 	p = dptr[z0+1];		/* XOR parity */
272 	q = dptr[z0+2];		/* RS syndrome */
273 
274 	kernel_fpu_begin();
275 
276 	/*
277 	 * $xr0, $xr1: wp
278 	 * $xr2, $xr3: wq
279 	 * $xr4, $xr5: wd
280 	 * $xr6, $xr7: w2
281 	 * $xr8, $xr9: w1
282 	 */
283 	for (d = 0; d < bytes; d += NSIZE*2) {
284 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
285 		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
286 		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
287 		asm volatile("xvori.b $xr2, $xr0, 0");
288 		asm volatile("xvori.b $xr3, $xr1, 0");
289 		for (z = z0-1; z >= 0; z--) {
290 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
291 			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
292 			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
293 			/* wp$$ ^= wd$$; */
294 			asm volatile("xvxor.v $xr0, $xr0, $xr4");
295 			asm volatile("xvxor.v $xr1, $xr1, $xr5");
296 			/* w2$$ = MASK(wq$$); */
297 			asm volatile("xvslti.b $xr6, $xr2, 0");
298 			asm volatile("xvslti.b $xr7, $xr3, 0");
299 			/* w1$$ = SHLBYTE(wq$$); */
300 			asm volatile("xvslli.b $xr8, $xr2, 1");
301 			asm volatile("xvslli.b $xr9, $xr3, 1");
302 			/* w2$$ &= NBYTES(0x1d); */
303 			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
304 			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
305 			/* w1$$ ^= w2$$; */
306 			asm volatile("xvxor.v $xr8, $xr8, $xr6");
307 			asm volatile("xvxor.v $xr9, $xr9, $xr7");
308 			/* wq$$ = w1$$ ^ wd$$; */
309 			asm volatile("xvxor.v $xr2, $xr8, $xr4");
310 			asm volatile("xvxor.v $xr3, $xr9, $xr5");
311 		}
312 		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
313 		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
314 		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
315 		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
316 		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
317 		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
318 	}
319 
320 	kernel_fpu_end();
321 }
322 
323 static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
324 				    size_t bytes, void **ptrs)
325 {
326 	u8 **dptr = (u8 **)ptrs;
327 	u8 *p, *q;
328 	int d, z, z0;
329 
330 	z0 = stop;		/* P/Q right side optimization */
331 	p = dptr[disks-2];	/* XOR parity */
332 	q = dptr[disks-1];	/* RS syndrome */
333 
334 	kernel_fpu_begin();
335 
336 	/*
337 	 * $xr0, $xr1: wp
338 	 * $xr2, $xr3: wq
339 	 * $xr4, $xr5: wd
340 	 * $xr6, $xr7: w2
341 	 * $xr8, $xr9: w1
342 	 */
343 	for (d = 0; d < bytes; d += NSIZE*2) {
344 		/* P/Q data pages */
345 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
346 		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
347 		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
348 		asm volatile("xvori.b $xr2, $xr0, 0");
349 		asm volatile("xvori.b $xr3, $xr1, 0");
350 		for (z = z0-1; z >= start; z--) {
351 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
352 			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
353 			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
354 			/* wp$$ ^= wd$$; */
355 			asm volatile("xvxor.v $xr0, $xr0, $xr4");
356 			asm volatile("xvxor.v $xr1, $xr1, $xr5");
357 			/* w2$$ = MASK(wq$$); */
358 			asm volatile("xvslti.b $xr6, $xr2, 0");
359 			asm volatile("xvslti.b $xr7, $xr3, 0");
360 			/* w1$$ = SHLBYTE(wq$$); */
361 			asm volatile("xvslli.b $xr8, $xr2, 1");
362 			asm volatile("xvslli.b $xr9, $xr3, 1");
363 			/* w2$$ &= NBYTES(0x1d); */
364 			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
365 			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
366 			/* w1$$ ^= w2$$; */
367 			asm volatile("xvxor.v $xr8, $xr8, $xr6");
368 			asm volatile("xvxor.v $xr9, $xr9, $xr7");
369 			/* wq$$ = w1$$ ^ wd$$; */
370 			asm volatile("xvxor.v $xr2, $xr8, $xr4");
371 			asm volatile("xvxor.v $xr3, $xr9, $xr5");
372 		}
373 
374 		/* P/Q left side optimization */
375 		for (z = start-1; z >= 0; z--) {
376 			/* w2$$ = MASK(wq$$); */
377 			asm volatile("xvslti.b $xr6, $xr2, 0");
378 			asm volatile("xvslti.b $xr7, $xr3, 0");
379 			/* w1$$ = SHLBYTE(wq$$); */
380 			asm volatile("xvslli.b $xr8, $xr2, 1");
381 			asm volatile("xvslli.b $xr9, $xr3, 1");
382 			/* w2$$ &= NBYTES(0x1d); */
383 			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
384 			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
385 			/* wq$$ = w1$$ ^ w2$$; */
386 			asm volatile("xvxor.v $xr2, $xr8, $xr6");
387 			asm volatile("xvxor.v $xr3, $xr9, $xr7");
388 		}
389 		/*
390 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
391 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
392 		 */
393 		asm volatile(
394 			"xvld $xr10, %0\n\t"
395 			"xvld $xr11, %1\n\t"
396 			"xvld $xr12, %2\n\t"
397 			"xvld $xr13, %3\n\t"
398 			"xvxor.v $xr10, $xr10, $xr0\n\t"
399 			"xvxor.v $xr11, $xr11, $xr1\n\t"
400 			"xvxor.v $xr12, $xr12, $xr2\n\t"
401 			"xvxor.v $xr13, $xr13, $xr3\n\t"
402 			"xvst $xr10, %0\n\t"
403 			"xvst $xr11, %1\n\t"
404 			"xvst $xr12, %2\n\t"
405 			"xvst $xr13, %3\n\t"
406 			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
407 			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
408 		);
409 	}
410 
411 	kernel_fpu_end();
412 }
413 
414 const struct raid6_calls raid6_lasx = {
415 	raid6_lasx_gen_syndrome,
416 	raid6_lasx_xor_syndrome,
417 	raid6_has_lasx,
418 	"lasx",
419 	.priority = 0 /* see the comment near the top of the file for reason */
420 };
421 #undef NSIZE
422 #endif /* CONFIG_CPU_HAS_LASX */
423