xref: /linux/lib/raid/raid6/loongarch/loongarch_simd.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
4  *
5  * Copyright 2023 WANG Xuerui <git@xen0n.name>
6  *
7  * Based on the generic RAID-6 code (int.uc):
8  *
9  * Copyright 2002-2004 H. Peter Anvin
10  */
11 
12 #include <asm/cpu-features.h>
13 #include <asm/fpu.h>
14 #include "algos.h"
15 
16 /*
17  * The vector algorithms are currently priority 0, which means the generic
18  * scalar algorithms are not being disabled if vector support is present.
19  * This is like the similar LoongArch RAID5 XOR code, with the main reason
20  * repeated here: it cannot be ruled out at this point in time that some
21  * future (maybe reduced) models could run the vector algorithms slower than
22  * the scalar ones, maybe for errata or micro-op reasons. It may be
23  * appropriate to revisit this after one or two more uarch generations.
24  */
25 
26 #ifdef CONFIG_CPU_HAS_LSX
/* Byte stride between the consecutive vld/vst accesses in the LSX routines. */
27 #define NSIZE 16
28 
/*
 * raid6_lsx_gen_syndrome - compute the P and Q blocks from scratch with LSX.
 * @disks: number of entries in @ptrs; indices 0..disks-3 are read as data,
 *         index disks-2 is written as P and index disks-1 as Q
 * @bytes: length of every buffer; it is consumed in steps of NSIZE*4 bytes,
 *         so it is presumably a multiple of that step (TODO confirm with
 *         the callers)
 * @ptrs:  array of buffer pointers, accessed as u8 *
 *
 * P is the plain XOR of all data buffers.  Q is accumulated from the highest
 * data disk down to disk 0: before each lower disk is folded in, every byte
 * of the running Q is shifted left by one and XORed with 0x1d if its top bit
 * was set.
 *
 * Every asm statement names its vector registers directly and declares no
 * register operands or clobbers, so the data flow between the statements is
 * carried only by the order in which they appear.
 */
29 static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
30 {
31 	u8 **dptr = (u8 **)ptrs;
32 	u8 *p, *q;
33 	int d, z, z0;
34 
35 	z0 = disks - 3;		/* Highest data disk */
36 	p = dptr[z0+1];		/* XOR parity */
37 	q = dptr[z0+2];		/* RS syndrome */
38 
	/*
	 * NOTE(review): presumably required before vector registers may be
	 * used in kernel context; paired with kernel_fpu_end() at the end of
	 * this function.
	 */
39 	kernel_fpu_begin();
40 
41 	/*
42 	 * $vr0, $vr1, $vr2, $vr3: wp
43 	 * $vr4, $vr5, $vr6, $vr7: wq
44 	 * $vr8, $vr9, $vr10, $vr11: wd
45 	 * $vr12, $vr13, $vr14, $vr15: w2
46 	 * $vr16, $vr17, $vr18, $vr19: w1
47 	 */
	/*
	 * Each pass handles NSIZE*4 bytes, i.e. four vector registers per
	 * variable.  NOTE(review): d is an int compared against the size_t
	 * bytes, so the comparison is carried out on unsigned values.
	 */
48 	for (d = 0; d < bytes; d += NSIZE*4) {
49 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
50 		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
51 		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
52 		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
53 		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		/* OR with immediate 0 acts as a register copy: wq starts as wp. */
54 		asm volatile("vori.b $vr4, $vr0, 0");
55 		asm volatile("vori.b $vr5, $vr1, 0");
56 		asm volatile("vori.b $vr6, $vr2, 0");
57 		asm volatile("vori.b $vr7, $vr3, 0");
		/* Fold in the remaining data disks, from z0-1 down to 0. */
58 		for (z = z0-1; z >= 0; z--) {
59 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
60 			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
61 			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
62 			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
63 			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
64 			/* wp$$ ^= wd$$; */
65 			asm volatile("vxor.v $vr0, $vr0, $vr8");
66 			asm volatile("vxor.v $vr1, $vr1, $vr9");
67 			asm volatile("vxor.v $vr2, $vr2, $vr10");
68 			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/*
			 * The signed "less than 0" compare serves as MASK():
			 * presumably 0xff in every byte whose top bit is set
			 * and 0x00 otherwise (confirm against the LSX manual).
			 */
69 			/* w2$$ = MASK(wq$$); */
70 			asm volatile("vslti.b $vr12, $vr4, 0");
71 			asm volatile("vslti.b $vr13, $vr5, 0");
72 			asm volatile("vslti.b $vr14, $vr6, 0");
73 			asm volatile("vslti.b $vr15, $vr7, 0");
74 			/* w1$$ = SHLBYTE(wq$$); */
75 			asm volatile("vslli.b $vr16, $vr4, 1");
76 			asm volatile("vslli.b $vr17, $vr5, 1");
77 			asm volatile("vslli.b $vr18, $vr6, 1");
78 			asm volatile("vslli.b $vr19, $vr7, 1");
79 			/* w2$$ &= NBYTES(0x1d); */
80 			asm volatile("vandi.b $vr12, $vr12, 0x1d");
81 			asm volatile("vandi.b $vr13, $vr13, 0x1d");
82 			asm volatile("vandi.b $vr14, $vr14, 0x1d");
83 			asm volatile("vandi.b $vr15, $vr15, 0x1d");
84 			/* w1$$ ^= w2$$; */
85 			asm volatile("vxor.v $vr16, $vr16, $vr12");
86 			asm volatile("vxor.v $vr17, $vr17, $vr13");
87 			asm volatile("vxor.v $vr18, $vr18, $vr14");
88 			asm volatile("vxor.v $vr19, $vr19, $vr15");
89 			/* wq$$ = w1$$ ^ wd$$; */
90 			asm volatile("vxor.v $vr4, $vr16, $vr8");
91 			asm volatile("vxor.v $vr5, $vr17, $vr9");
92 			asm volatile("vxor.v $vr6, $vr18, $vr10");
93 			asm volatile("vxor.v $vr7, $vr19, $vr11");
94 		}
		/* P and Q are overwritten here, not combined with old contents. */
95 		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
96 		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
97 		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
98 		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
99 		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
100 		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
101 		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
102 		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
103 		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
104 		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
105 	}
106 
107 	kernel_fpu_end();
108 }
109 
/*
 * raid6_lsx_xor_syndrome - XOR the P/Q contribution of data disks
 * start..stop into the existing P and Q blocks with LSX.
 * @disks: number of entries in @ptrs; index disks-2 is P, index disks-1 is Q
 * @start: lowest data disk index whose buffer is read
 * @stop:  highest data disk index whose buffer is read
 * @bytes: length of every buffer; it is consumed in steps of NSIZE*4 bytes,
 *         so it is presumably a multiple of that step (TODO confirm with
 *         the callers)
 * @ptrs:  array of buffer pointers, accessed as u8 *
 *
 * P and Q are not overwritten: the values computed here are XORed into the
 * buffers' current contents.  Data disks above @stop are never touched.  For
 * disks below @start no data is loaded; only the per-byte "shift left, XOR
 * with 0x1d if the top bit was set" step is applied to the running Q, once
 * per such disk.
 *
 * Every asm statement names its vector registers directly and declares no
 * register operands or clobbers, so the data flow between the statements is
 * carried only by the order in which they appear.
 */
110 static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
111 				   size_t bytes, void **ptrs)
112 {
113 	u8 **dptr = (u8 **)ptrs;
114 	u8 *p, *q;
115 	int d, z, z0;
116 
117 	z0 = stop;		/* P/Q right side optimization */
118 	p = dptr[disks-2];	/* XOR parity */
119 	q = dptr[disks-1];	/* RS syndrome */
120 
	/*
	 * NOTE(review): presumably required before vector registers may be
	 * used in kernel context; paired with kernel_fpu_end() at the end of
	 * this function.
	 */
121 	kernel_fpu_begin();
122 
123 	/*
124 	 * $vr0, $vr1, $vr2, $vr3: wp
125 	 * $vr4, $vr5, $vr6, $vr7: wq
126 	 * $vr8, $vr9, $vr10, $vr11: wd
127 	 * $vr12, $vr13, $vr14, $vr15: w2
128 	 * $vr16, $vr17, $vr18, $vr19: w1
129 	 */
	/*
	 * Each pass handles NSIZE*4 bytes, i.e. four vector registers per
	 * variable.  NOTE(review): d is an int compared against the size_t
	 * bytes, so the comparison is carried out on unsigned values.
	 */
130 	for (d = 0; d < bytes; d += NSIZE*4) {
131 		/* P/Q data pages */
132 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
133 		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
134 		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
135 		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
136 		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		/* OR with immediate 0 acts as a register copy: wq starts as wp. */
137 		asm volatile("vori.b $vr4, $vr0, 0");
138 		asm volatile("vori.b $vr5, $vr1, 0");
139 		asm volatile("vori.b $vr6, $vr2, 0");
140 		asm volatile("vori.b $vr7, $vr3, 0");
		/* Fold in the data disks from z0-1 down to start. */
141 		for (z = z0-1; z >= start; z--) {
142 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
143 			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
144 			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
145 			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
146 			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
147 			/* wp$$ ^= wd$$; */
148 			asm volatile("vxor.v $vr0, $vr0, $vr8");
149 			asm volatile("vxor.v $vr1, $vr1, $vr9");
150 			asm volatile("vxor.v $vr2, $vr2, $vr10");
151 			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/*
			 * The signed "less than 0" compare serves as MASK():
			 * presumably 0xff in every byte whose top bit is set
			 * and 0x00 otherwise (confirm against the LSX manual).
			 */
152 			/* w2$$ = MASK(wq$$); */
153 			asm volatile("vslti.b $vr12, $vr4, 0");
154 			asm volatile("vslti.b $vr13, $vr5, 0");
155 			asm volatile("vslti.b $vr14, $vr6, 0");
156 			asm volatile("vslti.b $vr15, $vr7, 0");
157 			/* w1$$ = SHLBYTE(wq$$); */
158 			asm volatile("vslli.b $vr16, $vr4, 1");
159 			asm volatile("vslli.b $vr17, $vr5, 1");
160 			asm volatile("vslli.b $vr18, $vr6, 1");
161 			asm volatile("vslli.b $vr19, $vr7, 1");
162 			/* w2$$ &= NBYTES(0x1d); */
163 			asm volatile("vandi.b $vr12, $vr12, 0x1d");
164 			asm volatile("vandi.b $vr13, $vr13, 0x1d");
165 			asm volatile("vandi.b $vr14, $vr14, 0x1d");
166 			asm volatile("vandi.b $vr15, $vr15, 0x1d");
167 			/* w1$$ ^= w2$$; */
168 			asm volatile("vxor.v $vr16, $vr16, $vr12");
169 			asm volatile("vxor.v $vr17, $vr17, $vr13");
170 			asm volatile("vxor.v $vr18, $vr18, $vr14");
171 			asm volatile("vxor.v $vr19, $vr19, $vr15");
172 			/* wq$$ = w1$$ ^ wd$$; */
173 			asm volatile("vxor.v $vr4, $vr16, $vr8");
174 			asm volatile("vxor.v $vr5, $vr17, $vr9");
175 			asm volatile("vxor.v $vr6, $vr18, $vr10");
176 			asm volatile("vxor.v $vr7, $vr19, $vr11");
177 		}
178 
179 		/* P/Q left side optimization */
		/*
		 * No data is loaded for disks below start: wp is left alone
		 * and wq only receives the shift/0x1d step once per disk.
		 */
180 		for (z = start-1; z >= 0; z--) {
181 			/* w2$$ = MASK(wq$$); */
182 			asm volatile("vslti.b $vr12, $vr4, 0");
183 			asm volatile("vslti.b $vr13, $vr5, 0");
184 			asm volatile("vslti.b $vr14, $vr6, 0");
185 			asm volatile("vslti.b $vr15, $vr7, 0");
186 			/* w1$$ = SHLBYTE(wq$$); */
187 			asm volatile("vslli.b $vr16, $vr4, 1");
188 			asm volatile("vslli.b $vr17, $vr5, 1");
189 			asm volatile("vslli.b $vr18, $vr6, 1");
190 			asm volatile("vslli.b $vr19, $vr7, 1");
191 			/* w2$$ &= NBYTES(0x1d); */
192 			asm volatile("vandi.b $vr12, $vr12, 0x1d");
193 			asm volatile("vandi.b $vr13, $vr13, 0x1d");
194 			asm volatile("vandi.b $vr14, $vr14, 0x1d");
195 			asm volatile("vandi.b $vr15, $vr15, 0x1d");
196 			/* wq$$ = w1$$ ^ w2$$; */
197 			asm volatile("vxor.v $vr4, $vr16, $vr12");
198 			asm volatile("vxor.v $vr5, $vr17, $vr13");
199 			asm volatile("vxor.v $vr6, $vr18, $vr14");
200 			asm volatile("vxor.v $vr7, $vr19, $vr15");
201 		}
202 		/*
203 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
204 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 *
		 * One asm statement does the whole read-modify-write: the
		 * current P/Q contents are loaded into $vr20..$vr27, XORed
		 * with wp/wq and stored back through the same "+m"
		 * (read-write memory) operands.
205 		 */
206 		asm volatile(
207 			"vld $vr20, %0\n\t"
208 			"vld $vr21, %1\n\t"
209 			"vld $vr22, %2\n\t"
210 			"vld $vr23, %3\n\t"
211 			"vld $vr24, %4\n\t"
212 			"vld $vr25, %5\n\t"
213 			"vld $vr26, %6\n\t"
214 			"vld $vr27, %7\n\t"
215 			"vxor.v $vr20, $vr20, $vr0\n\t"
216 			"vxor.v $vr21, $vr21, $vr1\n\t"
217 			"vxor.v $vr22, $vr22, $vr2\n\t"
218 			"vxor.v $vr23, $vr23, $vr3\n\t"
219 			"vxor.v $vr24, $vr24, $vr4\n\t"
220 			"vxor.v $vr25, $vr25, $vr5\n\t"
221 			"vxor.v $vr26, $vr26, $vr6\n\t"
222 			"vxor.v $vr27, $vr27, $vr7\n\t"
223 			"vst $vr20, %0\n\t"
224 			"vst $vr21, %1\n\t"
225 			"vst $vr22, %2\n\t"
226 			"vst $vr23, %3\n\t"
227 			"vst $vr24, %4\n\t"
228 			"vst $vr25, %5\n\t"
229 			"vst $vr26, %6\n\t"
230 			"vst $vr27, %7\n\t"
231 			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
232 			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
233 			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
234 			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
235 		);
236 	}
237 
238 	kernel_fpu_end();
239 }
240 
/*
 * Descriptor that exposes the two LSX routines under the name "lsx".
 * Members not listed in the initializer are zero-initialized.  The object is
 * not static, so it is presumably referenced from an algorithm table
 * elsewhere (TODO confirm).
 */
241 const struct raid6_calls raid6_lsx = {
242 	.gen_syndrome	= raid6_lsx_gen_syndrome,
243 	.xor_syndrome	= raid6_lsx_xor_syndrome,
244 	.name		= "lsx",
245 };
246 
247 #undef NSIZE
248 #endif /* CONFIG_CPU_HAS_LSX */
249 
250 #ifdef CONFIG_CPU_HAS_LASX
/* Byte stride between the consecutive xvld/xvst accesses in the LASX routines. */
251 #define NSIZE 32
252 
/*
 * raid6_lasx_gen_syndrome - compute the P and Q blocks from scratch with LASX.
 * @disks: number of entries in @ptrs; indices 0..disks-3 are read as data,
 *         index disks-2 is written as P and index disks-1 as Q
 * @bytes: length of every buffer; it is consumed in steps of NSIZE*2 bytes,
 *         so it is presumably a multiple of that step (TODO confirm with
 *         the callers)
 * @ptrs:  array of buffer pointers, accessed as u8 *
 *
 * P is the plain XOR of all data buffers.  Q is accumulated from the highest
 * data disk down to disk 0: before each lower disk is folded in, every byte
 * of the running Q is shifted left by one and XORed with 0x1d if its top bit
 * was set.
 *
 * Every asm statement names its vector registers directly and declares no
 * register operands or clobbers, so the data flow between the statements is
 * carried only by the order in which they appear.
 */
253 static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
254 {
255 	u8 **dptr = (u8 **)ptrs;
256 	u8 *p, *q;
257 	int d, z, z0;
258 
259 	z0 = disks - 3;		/* Highest data disk */
260 	p = dptr[z0+1];		/* XOR parity */
261 	q = dptr[z0+2];		/* RS syndrome */
262 
	/*
	 * NOTE(review): presumably required before vector registers may be
	 * used in kernel context; paired with kernel_fpu_end() at the end of
	 * this function.
	 */
263 	kernel_fpu_begin();
264 
265 	/*
266 	 * $xr0, $xr1: wp
267 	 * $xr2, $xr3: wq
268 	 * $xr4, $xr5: wd
269 	 * $xr6, $xr7: w2
270 	 * $xr8, $xr9: w1
271 	 */
	/*
	 * Each pass handles NSIZE*2 bytes, i.e. two vector registers per
	 * variable.  NOTE(review): d is an int compared against the size_t
	 * bytes, so the comparison is carried out on unsigned values.
	 */
272 	for (d = 0; d < bytes; d += NSIZE*2) {
273 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
274 		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
275 		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		/* OR with immediate 0 acts as a register copy: wq starts as wp. */
276 		asm volatile("xvori.b $xr2, $xr0, 0");
277 		asm volatile("xvori.b $xr3, $xr1, 0");
		/* Fold in the remaining data disks, from z0-1 down to 0. */
278 		for (z = z0-1; z >= 0; z--) {
279 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
280 			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
281 			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
282 			/* wp$$ ^= wd$$; */
283 			asm volatile("xvxor.v $xr0, $xr0, $xr4");
284 			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/*
			 * The signed "less than 0" compare serves as MASK():
			 * presumably 0xff in every byte whose top bit is set
			 * and 0x00 otherwise (confirm against the LASX manual).
			 */
285 			/* w2$$ = MASK(wq$$); */
286 			asm volatile("xvslti.b $xr6, $xr2, 0");
287 			asm volatile("xvslti.b $xr7, $xr3, 0");
288 			/* w1$$ = SHLBYTE(wq$$); */
289 			asm volatile("xvslli.b $xr8, $xr2, 1");
290 			asm volatile("xvslli.b $xr9, $xr3, 1");
291 			/* w2$$ &= NBYTES(0x1d); */
292 			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
293 			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
294 			/* w1$$ ^= w2$$; */
295 			asm volatile("xvxor.v $xr8, $xr8, $xr6");
296 			asm volatile("xvxor.v $xr9, $xr9, $xr7");
297 			/* wq$$ = w1$$ ^ wd$$; */
298 			asm volatile("xvxor.v $xr2, $xr8, $xr4");
299 			asm volatile("xvxor.v $xr3, $xr9, $xr5");
300 		}
		/* P and Q are overwritten here, not combined with old contents. */
301 		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
302 		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
303 		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
304 		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
305 		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
306 		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
307 	}
308 
309 	kernel_fpu_end();
310 }
311 
/*
 * raid6_lasx_xor_syndrome - XOR the P/Q contribution of data disks
 * start..stop into the existing P and Q blocks with LASX.
 * @disks: number of entries in @ptrs; index disks-2 is P, index disks-1 is Q
 * @start: lowest data disk index whose buffer is read
 * @stop:  highest data disk index whose buffer is read
 * @bytes: length of every buffer; it is consumed in steps of NSIZE*2 bytes,
 *         so it is presumably a multiple of that step (TODO confirm with
 *         the callers)
 * @ptrs:  array of buffer pointers, accessed as u8 *
 *
 * P and Q are not overwritten: the values computed here are XORed into the
 * buffers' current contents.  Data disks above @stop are never touched.  For
 * disks below @start no data is loaded; only the per-byte "shift left, XOR
 * with 0x1d if the top bit was set" step is applied to the running Q, once
 * per such disk.
 *
 * Every asm statement names its vector registers directly and declares no
 * register operands or clobbers, so the data flow between the statements is
 * carried only by the order in which they appear.
 */
312 static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
313 				    size_t bytes, void **ptrs)
314 {
315 	u8 **dptr = (u8 **)ptrs;
316 	u8 *p, *q;
317 	int d, z, z0;
318 
319 	z0 = stop;		/* P/Q right side optimization */
320 	p = dptr[disks-2];	/* XOR parity */
321 	q = dptr[disks-1];	/* RS syndrome */
322 
	/*
	 * NOTE(review): presumably required before vector registers may be
	 * used in kernel context; paired with kernel_fpu_end() at the end of
	 * this function.
	 */
323 	kernel_fpu_begin();
324 
325 	/*
326 	 * $xr0, $xr1: wp
327 	 * $xr2, $xr3: wq
328 	 * $xr4, $xr5: wd
329 	 * $xr6, $xr7: w2
330 	 * $xr8, $xr9: w1
331 	 */
	/*
	 * Each pass handles NSIZE*2 bytes, i.e. two vector registers per
	 * variable.  NOTE(review): d is an int compared against the size_t
	 * bytes, so the comparison is carried out on unsigned values.
	 */
332 	for (d = 0; d < bytes; d += NSIZE*2) {
333 		/* P/Q data pages */
334 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
335 		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
336 		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		/* OR with immediate 0 acts as a register copy: wq starts as wp. */
337 		asm volatile("xvori.b $xr2, $xr0, 0");
338 		asm volatile("xvori.b $xr3, $xr1, 0");
		/* Fold in the data disks from z0-1 down to start. */
339 		for (z = z0-1; z >= start; z--) {
340 			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
341 			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
342 			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
343 			/* wp$$ ^= wd$$; */
344 			asm volatile("xvxor.v $xr0, $xr0, $xr4");
345 			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/*
			 * The signed "less than 0" compare serves as MASK():
			 * presumably 0xff in every byte whose top bit is set
			 * and 0x00 otherwise (confirm against the LASX manual).
			 */
346 			/* w2$$ = MASK(wq$$); */
347 			asm volatile("xvslti.b $xr6, $xr2, 0");
348 			asm volatile("xvslti.b $xr7, $xr3, 0");
349 			/* w1$$ = SHLBYTE(wq$$); */
350 			asm volatile("xvslli.b $xr8, $xr2, 1");
351 			asm volatile("xvslli.b $xr9, $xr3, 1");
352 			/* w2$$ &= NBYTES(0x1d); */
353 			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
354 			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
355 			/* w1$$ ^= w2$$; */
356 			asm volatile("xvxor.v $xr8, $xr8, $xr6");
357 			asm volatile("xvxor.v $xr9, $xr9, $xr7");
358 			/* wq$$ = w1$$ ^ wd$$; */
359 			asm volatile("xvxor.v $xr2, $xr8, $xr4");
360 			asm volatile("xvxor.v $xr3, $xr9, $xr5");
361 		}
362 
363 		/* P/Q left side optimization */
		/*
		 * No data is loaded for disks below start: wp is left alone
		 * and wq only receives the shift/0x1d step once per disk.
		 */
364 		for (z = start-1; z >= 0; z--) {
365 			/* w2$$ = MASK(wq$$); */
366 			asm volatile("xvslti.b $xr6, $xr2, 0");
367 			asm volatile("xvslti.b $xr7, $xr3, 0");
368 			/* w1$$ = SHLBYTE(wq$$); */
369 			asm volatile("xvslli.b $xr8, $xr2, 1");
370 			asm volatile("xvslli.b $xr9, $xr3, 1");
371 			/* w2$$ &= NBYTES(0x1d); */
372 			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
373 			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
374 			/* wq$$ = w1$$ ^ w2$$; */
375 			asm volatile("xvxor.v $xr2, $xr8, $xr6");
376 			asm volatile("xvxor.v $xr3, $xr9, $xr7");
377 		}
378 		/*
379 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
380 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 *
		 * One asm statement does the whole read-modify-write: the
		 * current P/Q contents are loaded into $xr10..$xr13, XORed
		 * with wp/wq and stored back through the same "+m"
		 * (read-write memory) operands.
381 		 */
382 		asm volatile(
383 			"xvld $xr10, %0\n\t"
384 			"xvld $xr11, %1\n\t"
385 			"xvld $xr12, %2\n\t"
386 			"xvld $xr13, %3\n\t"
387 			"xvxor.v $xr10, $xr10, $xr0\n\t"
388 			"xvxor.v $xr11, $xr11, $xr1\n\t"
389 			"xvxor.v $xr12, $xr12, $xr2\n\t"
390 			"xvxor.v $xr13, $xr13, $xr3\n\t"
391 			"xvst $xr10, %0\n\t"
392 			"xvst $xr11, %1\n\t"
393 			"xvst $xr12, %2\n\t"
394 			"xvst $xr13, %3\n\t"
395 			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
396 			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
397 		);
398 	}
399 
400 	kernel_fpu_end();
401 }
402 
/*
 * Descriptor that exposes the two LASX routines under the name "lasx".
 * Members not listed in the initializer are zero-initialized.  The object is
 * not static, so it is presumably referenced from an algorithm table
 * elsewhere (TODO confirm).
 */
403 const struct raid6_calls raid6_lasx = {
404 	.gen_syndrome	= raid6_lasx_gen_syndrome,
405 	.xor_syndrome	= raid6_lasx_xor_syndrome,
406 	.name		= "lasx",
407 };
408 #undef NSIZE
409 #endif /* CONFIG_CPU_HAS_LASX */
410