// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RAID-6 syndrome calculation using RISC-V vector instructions
 *
 * Copyright 2024 Institute of Software, CAS.
 * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
 *
 * Based on neon.uc:
 *	Copyright 2002-2004 H. Peter Anvin
 */

#include "rvv.h"

#ifdef __riscv_vector
#error "This code must be built without compiler support for vector"
#endif
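
/*
 * All vector instructions are emitted through explicit inline asm with a
 * local ".option arch,+v", so the compiler itself never generates vector
 * code for this file.  The RAID6_RVV_WRAPPER() users of these *_real()
 * helpers (see rvv.h) are expected to run them with kernel-mode vector
 * state enabled.
 */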

static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];		/* XOR parity */
	q = dptr[z0 + 2];		/* RS syndrome */
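
	/*
	 * P is the plain XOR of all data blocks; Q is the GF(2^8) sum of
	 * data_z * {02}^z (polynomial 0x11d).  Both are evaluated with a
	 * Horner scheme, walking from the highest data disk down to disk 0.
	 */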

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

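	/*
	 * vsetvli with rs1 == x0 and a non-x0 rd requests VLMAX, so vl is
	 * the vector register length in bytes at e8/m1; each unrolled
	 * stream below handles vl bytes per loop iteration.
	 */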
	nsize = vl;

	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
	for (d = 0; d < bytes; d += nsize * 1) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
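			/*
			 * vsra.vi by 7 smears each byte's top bit into a
			 * 0x00/0xff mask; masking with 0x1d and XORing into
			 * the byte shifted left by one implements the
			 * GF(2^8) multiplication by {02} (polynomial 0x11d).
			 */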
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0])
		);
	}
}

static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */
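
	/*
	 * Compute the P/Q contribution of data disks start..stop only.  For
	 * disks below 'start' no data is read; Q is just multiplied by {02}
	 * once per remaining position.  The partial results are then XORed
	 * into the existing P/Q pages rather than overwriting them, as in
	 * the generic int.uc implementation.
	 */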

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	nsize = vl;

	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
	for (d = 0; d < bytes; d += nsize * 1) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0])
		);
	}
}

static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];		/* XOR parity */
	q = dptr[z0 + 2];		/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	nsize = vl;

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 */
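	/*
	 * Same algorithm as the rvv1 variant, but two independent vl-byte
	 * streams are processed per loop iteration (the rvv4/rvv8 variants
	 * extend this to four and eight streams) so more vector registers
	 * and memory accesses can be kept in flight.
	 */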
	for (d = 0; d < bytes; d += nsize * 2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
			      [wp1]"r"(&dptr[z0][d + 1 * nsize])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      "vse8.v	v4, (%[wp1])\n"
			      "vse8.v	v5, (%[wq1])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0]),
			      [wp1]"r"(&p[d + nsize * 1]),
			      [wq1]"r"(&q[d + nsize * 1])
		);
	}
}

static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	nsize = vl;

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 */
	for (d = 0; d < bytes; d += nsize * 2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
			      [wp1]"r"(&dptr[z0][d + 1 * nsize])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v5, v7, v6\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"

			      "vle8.v	v6, (%[wp1])\n"
			      "vle8.v	v7, (%[wq1])\n"
			      "vxor.vv	v6, v6, v4\n"
			      "vxor.vv	v7, v7, v5\n"
			      "vse8.v	v6, (%[wp1])\n"
			      "vse8.v	v7, (%[wq1])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0]),
			      [wp1]"r"(&p[d + nsize * 1]),
			      [wq1]"r"(&q[d + nsize * 1])
		);
	}
}

static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = disks - 3;	/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	nsize = vl;

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 */
	for (d = 0; d < bytes; d += nsize * 4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
			      [wp1]"r"(&dptr[z0][d + 1 * nsize]),
			      [wp2]"r"(&dptr[z0][d + 2 * nsize]),
			      [wp3]"r"(&dptr[z0][d + 3 * nsize])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
				      [wd2]"r"(&dptr[z][d + 2 * nsize]),
				      [wd3]"r"(&dptr[z][d + 3 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      "vse8.v	v4, (%[wp1])\n"
			      "vse8.v	v5, (%[wq1])\n"
			      "vse8.v	v8, (%[wp2])\n"
			      "vse8.v	v9, (%[wq2])\n"
			      "vse8.v	v12, (%[wp3])\n"
			      "vse8.v	v13, (%[wq3])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0]),
			      [wp1]"r"(&p[d + nsize * 1]),
			      [wq1]"r"(&q[d + nsize * 1]),
			      [wp2]"r"(&p[d + nsize * 2]),
			      [wq2]"r"(&q[d + nsize * 2]),
			      [wp3]"r"(&p[d + nsize * 3]),
			      [wq3]"r"(&q[d + nsize * 3])
		);
	}
}

static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	nsize = vl;

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 */
	for (d = 0; d < bytes; d += nsize * 4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
			      [wp1]"r"(&dptr[z0][d + 1 * nsize]),
			      [wp2]"r"(&dptr[z0][d + 2 * nsize]),
			      [wp3]"r"(&dptr[z0][d + 3 * nsize])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
				      [wd2]"r"(&dptr[z][d + 2 * nsize]),
				      [wd3]"r"(&dptr[z][d + 3 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v5, v7, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v9, v11, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v13, v15, v14\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 * v8:wp2, v9:wq2, v10:p2, v11:q2
		 * v12:wp3, v13:wq3, v14:p3, v15:q3
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"

			      "vle8.v	v6, (%[wp1])\n"
			      "vle8.v	v7, (%[wq1])\n"
			      "vxor.vv	v6, v6, v4\n"
			      "vxor.vv	v7, v7, v5\n"
			      "vse8.v	v6, (%[wp1])\n"
			      "vse8.v	v7, (%[wq1])\n"

			      "vle8.v	v10, (%[wp2])\n"
			      "vle8.v	v11, (%[wq2])\n"
			      "vxor.vv	v10, v10, v8\n"
			      "vxor.vv	v11, v11, v9\n"
			      "vse8.v	v10, (%[wp2])\n"
			      "vse8.v	v11, (%[wq2])\n"

			      "vle8.v	v14, (%[wp3])\n"
			      "vle8.v	v15, (%[wq3])\n"
			      "vxor.vv	v14, v14, v12\n"
			      "vxor.vv	v15, v15, v13\n"
			      "vse8.v	v14, (%[wp3])\n"
			      "vse8.v	v15, (%[wq3])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0]),
			      [wp1]"r"(&p[d + nsize * 1]),
			      [wq1]"r"(&q[d + nsize * 1]),
			      [wp2]"r"(&p[d + nsize * 2]),
			      [wq2]"r"(&q[d + nsize * 2]),
			      [wp3]"r"(&p[d + nsize * 3]),
			      [wq3]"r"(&q[d + nsize * 3])
		);
	}
}

static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = disks - 3;	/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	nsize = vl;

	/*
	 * v0:wp0,   v1:wq0,  v2:wd0/w20,  v3:w10
	 * v4:wp1,   v5:wq1,  v6:wd1/w21,  v7:w11
	 * v8:wp2,   v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
	 */
	for (d = 0; d < bytes; d += nsize * 8) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      "vle8.v	v16, (%[wp4])\n"
			      "vmv.v.v	v17, v16\n"
			      "vle8.v	v20, (%[wp5])\n"
			      "vmv.v.v	v21, v20\n"
			      "vle8.v	v24, (%[wp6])\n"
			      "vmv.v.v	v25, v24\n"
			      "vle8.v	v28, (%[wp7])\n"
			      "vmv.v.v	v29, v28\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
			      [wp1]"r"(&dptr[z0][d + 1 * nsize]),
			      [wp2]"r"(&dptr[z0][d + 2 * nsize]),
			      [wp3]"r"(&dptr[z0][d + 3 * nsize]),
			      [wp4]"r"(&dptr[z0][d + 4 * nsize]),
			      [wp5]"r"(&dptr[z0][d + 5 * nsize]),
			      [wp6]"r"(&dptr[z0][d + 6 * nsize]),
			      [wp7]"r"(&dptr[z0][d + 7 * nsize])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"

				      "vsra.vi	v18, v17, 7\n"
				      "vsll.vi	v19, v17, 1\n"
				      "vand.vx	v18, v18, %[x1d]\n"
				      "vxor.vv	v19, v19, v18\n"
				      "vle8.v	v18, (%[wd4])\n"
				      "vxor.vv	v17, v19, v18\n"
				      "vxor.vv	v16, v16, v18\n"

				      "vsra.vi	v22, v21, 7\n"
				      "vsll.vi	v23, v21, 1\n"
				      "vand.vx	v22, v22, %[x1d]\n"
				      "vxor.vv	v23, v23, v22\n"
				      "vle8.v	v22, (%[wd5])\n"
				      "vxor.vv	v21, v23, v22\n"
				      "vxor.vv	v20, v20, v22\n"

				      "vsra.vi	v26, v25, 7\n"
				      "vsll.vi	v27, v25, 1\n"
				      "vand.vx	v26, v26, %[x1d]\n"
				      "vxor.vv	v27, v27, v26\n"
				      "vle8.v	v26, (%[wd6])\n"
				      "vxor.vv	v25, v27, v26\n"
				      "vxor.vv	v24, v24, v26\n"

				      "vsra.vi	v30, v29, 7\n"
				      "vsll.vi	v31, v29, 1\n"
				      "vand.vx	v30, v30, %[x1d]\n"
				      "vxor.vv	v31, v31, v30\n"
				      "vle8.v	v30, (%[wd7])\n"
				      "vxor.vv	v29, v31, v30\n"
				      "vxor.vv	v28, v28, v30\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
				      [wd2]"r"(&dptr[z][d + 2 * nsize]),
				      [wd3]"r"(&dptr[z][d + 3 * nsize]),
				      [wd4]"r"(&dptr[z][d + 4 * nsize]),
				      [wd5]"r"(&dptr[z][d + 5 * nsize]),
				      [wd6]"r"(&dptr[z][d + 6 * nsize]),
				      [wd7]"r"(&dptr[z][d + 7 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      "vse8.v	v4, (%[wp1])\n"
			      "vse8.v	v5, (%[wq1])\n"
			      "vse8.v	v8, (%[wp2])\n"
			      "vse8.v	v9, (%[wq2])\n"
			      "vse8.v	v12, (%[wp3])\n"
			      "vse8.v	v13, (%[wq3])\n"
			      "vse8.v	v16, (%[wp4])\n"
			      "vse8.v	v17, (%[wq4])\n"
			      "vse8.v	v20, (%[wp5])\n"
			      "vse8.v	v21, (%[wq5])\n"
			      "vse8.v	v24, (%[wp6])\n"
			      "vse8.v	v25, (%[wq6])\n"
			      "vse8.v	v28, (%[wp7])\n"
			      "vse8.v	v29, (%[wq7])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0]),
			      [wp1]"r"(&p[d + nsize * 1]),
			      [wq1]"r"(&q[d + nsize * 1]),
			      [wp2]"r"(&p[d + nsize * 2]),
			      [wq2]"r"(&q[d + nsize * 2]),
			      [wp3]"r"(&p[d + nsize * 3]),
			      [wq3]"r"(&q[d + nsize * 3]),
			      [wp4]"r"(&p[d + nsize * 4]),
			      [wq4]"r"(&q[d + nsize * 4]),
			      [wp5]"r"(&p[d + nsize * 5]),
			      [wq5]"r"(&q[d + nsize * 5]),
			      [wp6]"r"(&p[d + nsize * 6]),
			      [wq6]"r"(&q[d + nsize * 6]),
			      [wp7]"r"(&p[d + nsize * 7]),
			      [wq7]"r"(&q[d + nsize * 7])
		);
	}
}

static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d, nsize;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	nsize = vl;

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
	 */
	for (d = 0; d < bytes; d += nsize * 8) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      "vle8.v	v16, (%[wp4])\n"
			      "vmv.v.v	v17, v16\n"
			      "vle8.v	v20, (%[wp5])\n"
			      "vmv.v.v	v21, v20\n"
			      "vle8.v	v24, (%[wp6])\n"
			      "vmv.v.v	v25, v24\n"
			      "vle8.v	v28, (%[wp7])\n"
			      "vmv.v.v	v29, v28\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
			      [wp1]"r"(&dptr[z0][d + 1 * nsize]),
			      [wp2]"r"(&dptr[z0][d + 2 * nsize]),
			      [wp3]"r"(&dptr[z0][d + 3 * nsize]),
			      [wp4]"r"(&dptr[z0][d + 4 * nsize]),
			      [wp5]"r"(&dptr[z0][d + 5 * nsize]),
			      [wp6]"r"(&dptr[z0][d + 6 * nsize]),
			      [wp7]"r"(&dptr[z0][d + 7 * nsize])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"

				      "vsra.vi	v18, v17, 7\n"
				      "vsll.vi	v19, v17, 1\n"
				      "vand.vx	v18, v18, %[x1d]\n"
				      "vxor.vv	v19, v19, v18\n"
				      "vle8.v	v18, (%[wd4])\n"
				      "vxor.vv	v17, v19, v18\n"
				      "vxor.vv	v16, v16, v18\n"

				      "vsra.vi	v22, v21, 7\n"
				      "vsll.vi	v23, v21, 1\n"
				      "vand.vx	v22, v22, %[x1d]\n"
				      "vxor.vv	v23, v23, v22\n"
				      "vle8.v	v22, (%[wd5])\n"
				      "vxor.vv	v21, v23, v22\n"
				      "vxor.vv	v20, v20, v22\n"

				      "vsra.vi	v26, v25, 7\n"
				      "vsll.vi	v27, v25, 1\n"
				      "vand.vx	v26, v26, %[x1d]\n"
				      "vxor.vv	v27, v27, v26\n"
				      "vle8.v	v26, (%[wd6])\n"
				      "vxor.vv	v25, v27, v26\n"
				      "vxor.vv	v24, v24, v26\n"

				      "vsra.vi	v30, v29, 7\n"
				      "vsll.vi	v31, v29, 1\n"
				      "vand.vx	v30, v30, %[x1d]\n"
				      "vxor.vv	v31, v31, v30\n"
				      "vle8.v	v30, (%[wd7])\n"
				      "vxor.vv	v29, v31, v30\n"
				      "vxor.vv	v28, v28, v30\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
				      [wd2]"r"(&dptr[z][d + 2 * nsize]),
				      [wd3]"r"(&dptr[z][d + 3 * nsize]),
				      [wd4]"r"(&dptr[z][d + 4 * nsize]),
				      [wd5]"r"(&dptr[z][d + 5 * nsize]),
				      [wd6]"r"(&dptr[z][d + 6 * nsize]),
				      [wd7]"r"(&dptr[z][d + 7 * nsize]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v5, v7, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v9, v11, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v13, v15, v14\n"

				      "vsra.vi	v18, v17, 7\n"
				      "vsll.vi	v19, v17, 1\n"
				      "vand.vx	v18, v18, %[x1d]\n"
				      "vxor.vv	v17, v19, v18\n"

				      "vsra.vi	v22, v21, 7\n"
				      "vsll.vi	v23, v21, 1\n"
				      "vand.vx	v22, v22, %[x1d]\n"
				      "vxor.vv	v21, v23, v22\n"

				      "vsra.vi	v26, v25, 7\n"
				      "vsll.vi	v27, v25, 1\n"
				      "vand.vx	v26, v26, %[x1d]\n"
				      "vxor.vv	v25, v27, v26\n"

				      "vsra.vi	v30, v29, 7\n"
				      "vsll.vi	v31, v29, 1\n"
				      "vand.vx	v30, v30, %[x1d]\n"
				      "vxor.vv	v29, v31, v30\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 * v8:wp2, v9:wq2, v10:p2, v11:q2
		 * v12:wp3, v13:wq3, v14:p3, v15:q3
		 * v16:wp4, v17:wq4, v18:p4, v19:q4
		 * v20:wp5, v21:wq5, v22:p5, v23:q5
		 * v24:wp6, v25:wq6, v26:p6, v27:q6
		 * v28:wp7, v29:wq7, v30:p7, v31:q7
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"

			      "vle8.v	v6, (%[wp1])\n"
			      "vle8.v	v7, (%[wq1])\n"
			      "vxor.vv	v6, v6, v4\n"
			      "vxor.vv	v7, v7, v5\n"
			      "vse8.v	v6, (%[wp1])\n"
			      "vse8.v	v7, (%[wq1])\n"

			      "vle8.v	v10, (%[wp2])\n"
			      "vle8.v	v11, (%[wq2])\n"
			      "vxor.vv	v10, v10, v8\n"
			      "vxor.vv	v11, v11, v9\n"
			      "vse8.v	v10, (%[wp2])\n"
			      "vse8.v	v11, (%[wq2])\n"

			      "vle8.v	v14, (%[wp3])\n"
			      "vle8.v	v15, (%[wq3])\n"
			      "vxor.vv	v14, v14, v12\n"
			      "vxor.vv	v15, v15, v13\n"
			      "vse8.v	v14, (%[wp3])\n"
			      "vse8.v	v15, (%[wq3])\n"

			      "vle8.v	v18, (%[wp4])\n"
			      "vle8.v	v19, (%[wq4])\n"
			      "vxor.vv	v18, v18, v16\n"
			      "vxor.vv	v19, v19, v17\n"
			      "vse8.v	v18, (%[wp4])\n"
			      "vse8.v	v19, (%[wq4])\n"

			      "vle8.v	v22, (%[wp5])\n"
			      "vle8.v	v23, (%[wq5])\n"
			      "vxor.vv	v22, v22, v20\n"
			      "vxor.vv	v23, v23, v21\n"
			      "vse8.v	v22, (%[wp5])\n"
			      "vse8.v	v23, (%[wq5])\n"

			      "vle8.v	v26, (%[wp6])\n"
			      "vle8.v	v27, (%[wq6])\n"
			      "vxor.vv	v26, v26, v24\n"
			      "vxor.vv	v27, v27, v25\n"
			      "vse8.v	v26, (%[wp6])\n"
			      "vse8.v	v27, (%[wq6])\n"

			      "vle8.v	v30, (%[wp7])\n"
			      "vle8.v	v31, (%[wq7])\n"
			      "vxor.vv	v30, v30, v28\n"
			      "vxor.vv	v31, v31, v29\n"
			      "vse8.v	v30, (%[wp7])\n"
			      "vse8.v	v31, (%[wq7])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + nsize * 0]),
			      [wq0]"r"(&q[d + nsize * 0]),
			      [wp1]"r"(&p[d + nsize * 1]),
			      [wq1]"r"(&q[d + nsize * 1]),
			      [wp2]"r"(&p[d + nsize * 2]),
			      [wq2]"r"(&q[d + nsize * 2]),
			      [wp3]"r"(&p[d + nsize * 3]),
			      [wq3]"r"(&q[d + nsize * 3]),
			      [wp4]"r"(&p[d + nsize * 4]),
			      [wq4]"r"(&q[d + nsize * 4]),
			      [wp5]"r"(&p[d + nsize * 5]),
			      [wq5]"r"(&q[d + nsize * 5]),
			      [wp6]"r"(&p[d + nsize * 6]),
			      [wq6]"r"(&q[d + nsize * 6]),
			      [wp7]"r"(&p[d + nsize * 7]),
			      [wq7]"r"(&q[d + nsize * 7])
		);
	}
}

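/*
 * RAID6_RVV_WRAPPER() is defined in rvv.h; it is expected to wrap each pair
 * of *_real() helpers above in a kernel-mode vector context and to expose
 * the resulting gen_syndrome/xor_syndrome callbacks through a
 * struct raid6_calls instance for the given unroll factor.
 */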
RAID6_RVV_WRAPPER(1);
RAID6_RVV_WRAPPER(2);
RAID6_RVV_WRAPPER(4);
RAID6_RVV_WRAPPER(8);