1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * RAID-6 syndrome calculation using RISC-V vector instructions
4  *
5  * Copyright 2024 Institute of Software, CAS.
6  * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
7  *
8  * Based on neon.uc:
9  *	Copyright 2002-2004 H. Peter Anvin
10  */
11 
12 #include <asm/simd.h>
13 #include <asm/vector.h>
14 #include <crypto/internal/simd.h>
15 #include <linux/raid/pq.h>
16 #include <linux/types.h>
17 #include "rvv.h"
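/*
 * <asm/vector.h> supplies has_vector() and riscv_v_vsize; <asm/simd.h> and
 * <crypto/internal/simd.h> are pulled in for the crypto_simd_usable() test
 * that the RAID6_RVV_WRAPPER() glue in rvv.h is expected to apply before
 * calling into the vector routines below.
 */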
18 
19 #define NSIZE	(riscv_v_vsize / 32) /* NSIZE = vlenb */
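/*
 * riscv_v_vsize is the footprint of the full 32-register vector file
 * (32 * vlenb), so NSIZE is the byte length of one vector register.  Every
 * unrolled lane below loads, transforms and stores exactly NSIZE bytes per
 * iteration, and the loops stride by NSIZE times the unroll factor, so the
 * byte count passed in is assumed to be a multiple of that stride.
 */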
20 
21 static int rvv_has_vector(void)
22 {
23 	return has_vector();
24 }
25 
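/*
 * All of the *_gen_syndrome_real() / *_xor_syndrome_real() variants below
 * compute the RAID-6 Q syndrome by Horner's rule,
 *
 *	Q = D_0 ^ 2*(D_1 ^ 2*(D_2 ^ ... ^ 2*D_z0))
 *
 * where "2*" is multiplication by x in GF(2^8) modulo x^8+x^4+x^3+x^2+1
 * (0x11d).  The vsra.vi/vsll.vi/vand.vx/vxor.vv quartet in each inner loop
 * is the branch-free form of that multiply; a scalar sketch (illustrative
 * only, gf_mul2() is not a real helper in this file):
 *
 *	static inline u8 gf_mul2(u8 wq)
 *	{
 *		u8 mask = (wq & 0x80) ? 0x1d : 0;	// MASK(wq) & NBYTES(0x1d)
 *		return (u8)(wq << 1) ^ mask;		// SHLBYTE(wq) ^ mask
 *	}
 *
 * so one Horner step per data disk is wq = gf_mul2(wq) ^ wd; wp ^= wd;
 * with P simply accumulating the plain XOR parity.
 */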
26 static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
27 {
28 	u8 **dptr = (u8 **)ptrs;
29 	u8 *p, *q;
30 	unsigned long vl, d;
31 	int z, z0;
32 
33 	z0 = disks - 3;		/* Highest data disk */
34 	p = dptr[z0 + 1];		/* XOR parity */
35 	q = dptr[z0 + 2];		/* RS syndrome */
36 
37 	asm volatile (".option	push\n"
38 		      ".option	arch,+v\n"
39 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
40 		      ".option	pop\n"
41 		      : "=&r" (vl)
42 	);
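
	/*
	 * vsetvli with rs1=x0 and a non-x0 rd requests VLMAX, so with e8/m1
	 * every vle8.v/vse8.v below moves exactly VLENB (= NSIZE) bytes.
	 * The returned vl is only captured to satisfy the output constraint;
	 * it is not used again.
	 */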
43 
44 	 /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
45 	for (d = 0; d < bytes; d += NSIZE * 1) {
46 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
47 		asm volatile (".option	push\n"
48 			      ".option	arch,+v\n"
49 			      "vle8.v	v0, (%[wp0])\n"
50 			      "vle8.v	v1, (%[wp0])\n"
51 			      ".option	pop\n"
52 			      : :
53 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE])
54 		);
55 
56 		for (z = z0 - 1 ; z >= 0 ; z--) {
57 			/*
58 			 * w2$$ = MASK(wq$$);
59 			 * w1$$ = SHLBYTE(wq$$);
60 			 * w2$$ &= NBYTES(0x1d);
61 			 * w1$$ ^= w2$$;
62 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
63 			 * wq$$ = w1$$ ^ wd$$;
64 			 * wp$$ ^= wd$$;
65 			 */
66 			asm volatile (".option	push\n"
67 				      ".option	arch,+v\n"
68 				      "vsra.vi	v2, v1, 7\n"
69 				      "vsll.vi	v3, v1, 1\n"
70 				      "vand.vx	v2, v2, %[x1d]\n"
71 				      "vxor.vv	v3, v3, v2\n"
72 				      "vle8.v	v2, (%[wd0])\n"
73 				      "vxor.vv	v1, v3, v2\n"
74 				      "vxor.vv	v0, v0, v2\n"
75 				      ".option	pop\n"
76 				      : :
77 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
78 				      [x1d]"r"(0x1d)
79 			);
80 		}
81 
82 		/*
83 		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
84 		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
85 		 */
86 		asm volatile (".option	push\n"
87 			      ".option	arch,+v\n"
88 			      "vse8.v	v0, (%[wp0])\n"
89 			      "vse8.v	v1, (%[wq0])\n"
90 			      ".option	pop\n"
91 			      : :
92 			      [wp0]"r"(&p[d + NSIZE * 0]),
93 			      [wq0]"r"(&q[d + NSIZE * 0])
94 		);
95 	}
96 }
97 
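/*
 * xor_syndrome computes the P/Q contribution of data disks start..stop only
 * and XORs that delta into the existing P and Q pages, so a caller can fold
 * a partial-stripe update in without touching the other data disks.  Disks
 * below 'start' contribute nothing new, but each of them still accounts for
 * one more multiplication by 2 in Q's Horner evaluation - hence the
 * load-free "left side" loop.  In scalar terms (gf_mul2() as sketched
 * above, again purely illustrative):
 *
 *	wq = wp = data[stop][d];
 *	for (z = stop - 1; z >= start; z--) {
 *		wq = gf_mul2(wq) ^ data[z][d];
 *		wp ^= data[z][d];
 *	}
 *	for (z = start - 1; z >= 0; z--)
 *		wq = gf_mul2(wq);
 *	p[d] ^= wp;
 *	q[d] ^= wq;
 */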
98 static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
99 					 unsigned long bytes, void **ptrs)
100 {
101 	u8 **dptr = (u8 **)ptrs;
102 	u8 *p, *q;
103 	unsigned long vl, d;
104 	int z, z0;
105 
106 	z0 = stop;		/* P/Q right side optimization */
107 	p = dptr[disks - 2];	/* XOR parity */
108 	q = dptr[disks - 1];	/* RS syndrome */
109 
110 	asm volatile (".option	push\n"
111 		      ".option	arch,+v\n"
112 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
113 		      ".option	pop\n"
114 		      : "=&r" (vl)
115 	);
116 
117 	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
118 	for (d = 0 ; d < bytes ; d += NSIZE * 1) {
119 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
120 		asm volatile (".option	push\n"
121 			      ".option	arch,+v\n"
122 			      "vle8.v	v0, (%[wp0])\n"
123 			      "vle8.v	v1, (%[wp0])\n"
124 			      ".option	pop\n"
125 			      : :
126 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE])
127 		);
128 
129 		/* P/Q data pages */
130 		for (z = z0 - 1; z >= start; z--) {
131 			/*
132 			 * w2$$ = MASK(wq$$);
133 			 * w1$$ = SHLBYTE(wq$$);
134 			 * w2$$ &= NBYTES(0x1d);
135 			 * w1$$ ^= w2$$;
136 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
137 			 * wq$$ = w1$$ ^ wd$$;
138 			 * wp$$ ^= wd$$;
139 			 */
140 			asm volatile (".option	push\n"
141 				      ".option	arch,+v\n"
142 				      "vsra.vi	v2, v1, 7\n"
143 				      "vsll.vi	v3, v1, 1\n"
144 				      "vand.vx	v2, v2, %[x1d]\n"
145 				      "vxor.vv	v3, v3, v2\n"
146 				      "vle8.v	v2, (%[wd0])\n"
147 				      "vxor.vv	v1, v3, v2\n"
148 				      "vxor.vv	v0, v0, v2\n"
149 				      ".option	pop\n"
150 				      : :
151 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
152 				      [x1d]"r"(0x1d)
153 			);
154 		}
155 
156 		/* P/Q left side optimization */
157 		for (z = start - 1; z >= 0; z--) {
158 			/*
159 			 * w2$$ = MASK(wq$$);
160 			 * w1$$ = SHLBYTE(wq$$);
161 			 * w2$$ &= NBYTES(0x1d);
162 			 * wq$$ = w1$$ ^ w2$$;
163 			 */
164 			asm volatile (".option	push\n"
165 				      ".option	arch,+v\n"
166 				      "vsra.vi	v2, v1, 7\n"
167 				      "vsll.vi	v3, v1, 1\n"
168 				      "vand.vx	v2, v2, %[x1d]\n"
169 				      "vxor.vv	v1, v3, v2\n"
170 				      ".option	pop\n"
171 				      : :
172 				      [x1d]"r"(0x1d)
173 			);
174 		}
175 
176 		/*
177 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
178 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
179 		 * v0:wp0, v1:wq0, v2:p0, v3:q0
180 		 */
181 		asm volatile (".option	push\n"
182 			      ".option	arch,+v\n"
183 			      "vle8.v	v2, (%[wp0])\n"
184 			      "vle8.v	v3, (%[wq0])\n"
185 			      "vxor.vv	v2, v2, v0\n"
186 			      "vxor.vv	v3, v3, v1\n"
187 			      "vse8.v	v2, (%[wp0])\n"
188 			      "vse8.v	v3, (%[wq0])\n"
189 			      ".option	pop\n"
190 			      : :
191 			      [wp0]"r"(&p[d + NSIZE * 0]),
192 			      [wq0]"r"(&q[d + NSIZE * 0])
193 		);
194 	}
195 }
196 
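/*
 * The rvv2/rvv4/rvv8 pairs that follow are 2-, 4- and 8-way unrolls of the
 * two functions above: each additional lane claims another group of four
 * vector registers (v4-v7, v8-v11, ... up to v28-v31) and d advances by
 * NSIZE * 2/4/8 per iteration.  The register-to-variable mapping is spelled
 * out in the comment at the top of each loop.
 */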
197 static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
198 {
199 	u8 **dptr = (u8 **)ptrs;
200 	u8 *p, *q;
201 	unsigned long vl, d;
202 	int z, z0;
203 
204 	z0 = disks - 3;		/* Highest data disk */
205 	p = dptr[z0 + 1];		/* XOR parity */
206 	q = dptr[z0 + 2];		/* RS syndrome */
207 
208 	asm volatile (".option	push\n"
209 		      ".option	arch,+v\n"
210 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
211 		      ".option	pop\n"
212 		      : "=&r" (vl)
213 	);
214 
215 	/*
216 	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
217 	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
218 	 */
219 	for (d = 0; d < bytes; d += NSIZE * 2) {
220 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
221 		asm volatile (".option	push\n"
222 			      ".option	arch,+v\n"
223 			      "vle8.v	v0, (%[wp0])\n"
224 			      "vle8.v	v1, (%[wp0])\n"
225 			      "vle8.v	v4, (%[wp1])\n"
226 			      "vle8.v	v5, (%[wp1])\n"
227 			      ".option	pop\n"
228 			      : :
229 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
230 			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE])
231 		);
232 
233 		for (z = z0 - 1; z >= 0; z--) {
234 			/*
235 			 * w2$$ = MASK(wq$$);
236 			 * w1$$ = SHLBYTE(wq$$);
237 			 * w2$$ &= NBYTES(0x1d);
238 			 * w1$$ ^= w2$$;
239 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
240 			 * wq$$ = w1$$ ^ wd$$;
241 			 * wp$$ ^= wd$$;
242 			 */
243 			asm volatile (".option	push\n"
244 				      ".option	arch,+v\n"
245 				      "vsra.vi	v2, v1, 7\n"
246 				      "vsll.vi	v3, v1, 1\n"
247 				      "vand.vx	v2, v2, %[x1d]\n"
248 				      "vxor.vv	v3, v3, v2\n"
249 				      "vle8.v	v2, (%[wd0])\n"
250 				      "vxor.vv	v1, v3, v2\n"
251 				      "vxor.vv	v0, v0, v2\n"
252 
253 				      "vsra.vi	v6, v5, 7\n"
254 				      "vsll.vi	v7, v5, 1\n"
255 				      "vand.vx	v6, v6, %[x1d]\n"
256 				      "vxor.vv	v7, v7, v6\n"
257 				      "vle8.v	v6, (%[wd1])\n"
258 				      "vxor.vv	v5, v7, v6\n"
259 				      "vxor.vv	v4, v4, v6\n"
260 				      ".option	pop\n"
261 				      : :
262 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
263 				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
264 				      [x1d]"r"(0x1d)
265 			);
266 		}
267 
268 		/*
269 		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
270 		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
271 		 */
272 		asm volatile (".option	push\n"
273 			      ".option	arch,+v\n"
274 			      "vse8.v	v0, (%[wp0])\n"
275 			      "vse8.v	v1, (%[wq0])\n"
276 			      "vse8.v	v4, (%[wp1])\n"
277 			      "vse8.v	v5, (%[wq1])\n"
278 			      ".option	pop\n"
279 			      : :
280 			      [wp0]"r"(&p[d + NSIZE * 0]),
281 			      [wq0]"r"(&q[d + NSIZE * 0]),
282 			      [wp1]"r"(&p[d + NSIZE * 1]),
283 			      [wq1]"r"(&q[d + NSIZE * 1])
284 		);
285 	}
286 }
287 
288 static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
289 					 unsigned long bytes, void **ptrs)
290 {
291 	u8 **dptr = (u8 **)ptrs;
292 	u8 *p, *q;
293 	unsigned long vl, d;
294 	int z, z0;
295 
296 	z0 = stop;		/* P/Q right side optimization */
297 	p = dptr[disks - 2];	/* XOR parity */
298 	q = dptr[disks - 1];	/* RS syndrome */
299 
300 	asm volatile (".option	push\n"
301 		      ".option	arch,+v\n"
302 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
303 		      ".option	pop\n"
304 		      : "=&r" (vl)
305 	);
306 
307 	/*
308 	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
309 	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
310 	 */
311 	for (d = 0; d < bytes; d += NSIZE * 2) {
312 		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
313 		asm volatile (".option	push\n"
314 			      ".option	arch,+v\n"
315 			      "vle8.v	v0, (%[wp0])\n"
316 			      "vle8.v	v1, (%[wp0])\n"
317 			      "vle8.v	v4, (%[wp1])\n"
318 			      "vle8.v	v5, (%[wp1])\n"
319 			      ".option	pop\n"
320 			      : :
321 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
322 			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE])
323 		);
324 
325 		/* P/Q data pages */
326 		for (z = z0 - 1; z >= start; z--) {
327 			/*
328 			 * w2$$ = MASK(wq$$);
329 			 * w1$$ = SHLBYTE(wq$$);
330 			 * w2$$ &= NBYTES(0x1d);
331 			 * w1$$ ^= w2$$;
332 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
333 			 * wq$$ = w1$$ ^ wd$$;
334 			 * wp$$ ^= wd$$;
335 			 */
336 			asm volatile (".option	push\n"
337 				      ".option	arch,+v\n"
338 				      "vsra.vi	v2, v1, 7\n"
339 				      "vsll.vi	v3, v1, 1\n"
340 				      "vand.vx	v2, v2, %[x1d]\n"
341 				      "vxor.vv	v3, v3, v2\n"
342 				      "vle8.v	v2, (%[wd0])\n"
343 				      "vxor.vv	v1, v3, v2\n"
344 				      "vxor.vv	v0, v0, v2\n"
345 
346 				      "vsra.vi	v6, v5, 7\n"
347 				      "vsll.vi	v7, v5, 1\n"
348 				      "vand.vx	v6, v6, %[x1d]\n"
349 				      "vxor.vv	v7, v7, v6\n"
350 				      "vle8.v	v6, (%[wd1])\n"
351 				      "vxor.vv	v5, v7, v6\n"
352 				      "vxor.vv	v4, v4, v6\n"
353 				      ".option	pop\n"
354 				      : :
355 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
356 				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
357 				      [x1d]"r"(0x1d)
358 			);
359 		}
360 
361 		/* P/Q left side optimization */
362 		for (z = start - 1; z >= 0; z--) {
363 			/*
364 			 * w2$$ = MASK(wq$$);
365 			 * w1$$ = SHLBYTE(wq$$);
366 			 * w2$$ &= NBYTES(0x1d);
367 			 * wq$$ = w1$$ ^ w2$$;
368 			 */
369 			asm volatile (".option	push\n"
370 				      ".option	arch,+v\n"
371 				      "vsra.vi	v2, v1, 7\n"
372 				      "vsll.vi	v3, v1, 1\n"
373 				      "vand.vx	v2, v2, %[x1d]\n"
374 				      "vxor.vv	v1, v3, v2\n"
375 
376 				      "vsra.vi	v6, v5, 7\n"
377 				      "vsll.vi	v7, v5, 1\n"
378 				      "vand.vx	v6, v6, %[x1d]\n"
379 				      "vxor.vv	v5, v7, v6\n"
380 				      ".option	pop\n"
381 				      : :
382 				      [x1d]"r"(0x1d)
383 			);
384 		}
385 
386 		/*
387 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
388 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
389 		 * v0:wp0, v1:wq0, v2:p0, v3:q0
390 		 * v4:wp1, v5:wq1, v6:p1, v7:q1
391 		 */
392 		asm volatile (".option	push\n"
393 			      ".option	arch,+v\n"
394 			      "vle8.v	v2, (%[wp0])\n"
395 			      "vle8.v	v3, (%[wq0])\n"
396 			      "vxor.vv	v2, v2, v0\n"
397 			      "vxor.vv	v3, v3, v1\n"
398 			      "vse8.v	v2, (%[wp0])\n"
399 			      "vse8.v	v3, (%[wq0])\n"
400 
401 			      "vle8.v	v6, (%[wp1])\n"
402 			      "vle8.v	v7, (%[wq1])\n"
403 			      "vxor.vv	v6, v6, v4\n"
404 			      "vxor.vv	v7, v7, v5\n"
405 			      "vse8.v	v6, (%[wp1])\n"
406 			      "vse8.v	v7, (%[wq1])\n"
407 			      ".option	pop\n"
408 			      : :
409 			      [wp0]"r"(&p[d + NSIZE * 0]),
410 			      [wq0]"r"(&q[d + NSIZE * 0]),
411 			      [wp1]"r"(&p[d + NSIZE * 1]),
412 			      [wq1]"r"(&q[d + NSIZE * 1])
413 		);
414 	}
415 }
416 
417 static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
418 {
419 	u8 **dptr = (u8 **)ptrs;
420 	u8 *p, *q;
421 	unsigned long vl, d;
422 	int z, z0;
423 
424 	z0 = disks - 3;	/* Highest data disk */
425 	p = dptr[z0 + 1];	/* XOR parity */
426 	q = dptr[z0 + 2];	/* RS syndrome */
427 
428 	asm volatile (".option	push\n"
429 		      ".option	arch,+v\n"
430 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
431 		      ".option	pop\n"
432 		      : "=&r" (vl)
433 	);
434 
435 	/*
436 	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
437 	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
438 	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
439 	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
440 	 */
441 	for (d = 0; d < bytes; d += NSIZE * 4) {
442 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
443 		asm volatile (".option	push\n"
444 			      ".option	arch,+v\n"
445 			      "vle8.v	v0, (%[wp0])\n"
446 			      "vle8.v	v1, (%[wp0])\n"
447 			      "vle8.v	v4, (%[wp1])\n"
448 			      "vle8.v	v5, (%[wp1])\n"
449 			      "vle8.v	v8, (%[wp2])\n"
450 			      "vle8.v	v9, (%[wp2])\n"
451 			      "vle8.v	v12, (%[wp3])\n"
452 			      "vle8.v	v13, (%[wp3])\n"
453 			      ".option	pop\n"
454 			      : :
455 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
456 			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
457 			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
458 			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE])
459 		);
460 
461 		for (z = z0 - 1; z >= 0; z--) {
462 			/*
463 			 * w2$$ = MASK(wq$$);
464 			 * w1$$ = SHLBYTE(wq$$);
465 			 * w2$$ &= NBYTES(0x1d);
466 			 * w1$$ ^= w2$$;
467 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
468 			 * wq$$ = w1$$ ^ wd$$;
469 			 * wp$$ ^= wd$$;
470 			 */
471 			asm volatile (".option	push\n"
472 				      ".option	arch,+v\n"
473 				      "vsra.vi	v2, v1, 7\n"
474 				      "vsll.vi	v3, v1, 1\n"
475 				      "vand.vx	v2, v2, %[x1d]\n"
476 				      "vxor.vv	v3, v3, v2\n"
477 				      "vle8.v	v2, (%[wd0])\n"
478 				      "vxor.vv	v1, v3, v2\n"
479 				      "vxor.vv	v0, v0, v2\n"
480 
481 				      "vsra.vi	v6, v5, 7\n"
482 				      "vsll.vi	v7, v5, 1\n"
483 				      "vand.vx	v6, v6, %[x1d]\n"
484 				      "vxor.vv	v7, v7, v6\n"
485 				      "vle8.v	v6, (%[wd1])\n"
486 				      "vxor.vv	v5, v7, v6\n"
487 				      "vxor.vv	v4, v4, v6\n"
488 
489 				      "vsra.vi	v10, v9, 7\n"
490 				      "vsll.vi	v11, v9, 1\n"
491 				      "vand.vx	v10, v10, %[x1d]\n"
492 				      "vxor.vv	v11, v11, v10\n"
493 				      "vle8.v	v10, (%[wd2])\n"
494 				      "vxor.vv	v9, v11, v10\n"
495 				      "vxor.vv	v8, v8, v10\n"
496 
497 				      "vsra.vi	v14, v13, 7\n"
498 				      "vsll.vi	v15, v13, 1\n"
499 				      "vand.vx	v14, v14, %[x1d]\n"
500 				      "vxor.vv	v15, v15, v14\n"
501 				      "vle8.v	v14, (%[wd3])\n"
502 				      "vxor.vv	v13, v15, v14\n"
503 				      "vxor.vv	v12, v12, v14\n"
504 				      ".option	pop\n"
505 				      : :
506 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
507 				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
508 				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
509 				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
510 				      [x1d]"r"(0x1d)
511 			);
512 		}
513 
514 		/*
515 		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
516 		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
517 		 */
518 		asm volatile (".option	push\n"
519 			      ".option	arch,+v\n"
520 			      "vse8.v	v0, (%[wp0])\n"
521 			      "vse8.v	v1, (%[wq0])\n"
522 			      "vse8.v	v4, (%[wp1])\n"
523 			      "vse8.v	v5, (%[wq1])\n"
524 			      "vse8.v	v8, (%[wp2])\n"
525 			      "vse8.v	v9, (%[wq2])\n"
526 			      "vse8.v	v12, (%[wp3])\n"
527 			      "vse8.v	v13, (%[wq3])\n"
528 			      ".option	pop\n"
529 			      : :
530 			      [wp0]"r"(&p[d + NSIZE * 0]),
531 			      [wq0]"r"(&q[d + NSIZE * 0]),
532 			      [wp1]"r"(&p[d + NSIZE * 1]),
533 			      [wq1]"r"(&q[d + NSIZE * 1]),
534 			      [wp2]"r"(&p[d + NSIZE * 2]),
535 			      [wq2]"r"(&q[d + NSIZE * 2]),
536 			      [wp3]"r"(&p[d + NSIZE * 3]),
537 			      [wq3]"r"(&q[d + NSIZE * 3])
538 		);
539 	}
540 }
541 
542 static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
543 					 unsigned long bytes, void **ptrs)
544 {
545 	u8 **dptr = (u8 **)ptrs;
546 	u8 *p, *q;
547 	unsigned long vl, d;
548 	int z, z0;
549 
550 	z0 = stop;		/* P/Q right side optimization */
551 	p = dptr[disks - 2];	/* XOR parity */
552 	q = dptr[disks - 1];	/* RS syndrome */
553 
554 	asm volatile (".option	push\n"
555 		      ".option	arch,+v\n"
556 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
557 		      ".option	pop\n"
558 		      : "=&r" (vl)
559 	);
560 
561 	/*
562 	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
563 	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
564 	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
565 	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
566 	 */
567 	for (d = 0; d < bytes; d += NSIZE * 4) {
568 		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
569 		asm volatile (".option	push\n"
570 			      ".option	arch,+v\n"
571 			      "vle8.v	v0, (%[wp0])\n"
572 			      "vle8.v	v1, (%[wp0])\n"
573 			      "vle8.v	v4, (%[wp1])\n"
574 			      "vle8.v	v5, (%[wp1])\n"
575 			      "vle8.v	v8, (%[wp2])\n"
576 			      "vle8.v	v9, (%[wp2])\n"
577 			      "vle8.v	v12, (%[wp3])\n"
578 			      "vle8.v	v13, (%[wp3])\n"
579 			      ".option	pop\n"
580 			      : :
581 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
582 			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
583 			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
584 			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE])
585 		);
586 
587 		/* P/Q data pages */
588 		for (z = z0 - 1; z >= start; z--) {
589 			/*
590 			 * w2$$ = MASK(wq$$);
591 			 * w1$$ = SHLBYTE(wq$$);
592 			 * w2$$ &= NBYTES(0x1d);
593 			 * w1$$ ^= w2$$;
594 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
595 			 * wq$$ = w1$$ ^ wd$$;
596 			 * wp$$ ^= wd$$;
597 			 */
598 			asm volatile (".option	push\n"
599 				      ".option	arch,+v\n"
600 				      "vsra.vi	v2, v1, 7\n"
601 				      "vsll.vi	v3, v1, 1\n"
602 				      "vand.vx	v2, v2, %[x1d]\n"
603 				      "vxor.vv	v3, v3, v2\n"
604 				      "vle8.v	v2, (%[wd0])\n"
605 				      "vxor.vv	v1, v3, v2\n"
606 				      "vxor.vv	v0, v0, v2\n"
607 
608 				      "vsra.vi	v6, v5, 7\n"
609 				      "vsll.vi	v7, v5, 1\n"
610 				      "vand.vx	v6, v6, %[x1d]\n"
611 				      "vxor.vv	v7, v7, v6\n"
612 				      "vle8.v	v6, (%[wd1])\n"
613 				      "vxor.vv	v5, v7, v6\n"
614 				      "vxor.vv	v4, v4, v6\n"
615 
616 				      "vsra.vi	v10, v9, 7\n"
617 				      "vsll.vi	v11, v9, 1\n"
618 				      "vand.vx	v10, v10, %[x1d]\n"
619 				      "vxor.vv	v11, v11, v10\n"
620 				      "vle8.v	v10, (%[wd2])\n"
621 				      "vxor.vv	v9, v11, v10\n"
622 				      "vxor.vv	v8, v8, v10\n"
623 
624 				      "vsra.vi	v14, v13, 7\n"
625 				      "vsll.vi	v15, v13, 1\n"
626 				      "vand.vx	v14, v14, %[x1d]\n"
627 				      "vxor.vv	v15, v15, v14\n"
628 				      "vle8.v	v14, (%[wd3])\n"
629 				      "vxor.vv	v13, v15, v14\n"
630 				      "vxor.vv	v12, v12, v14\n"
631 				      ".option	pop\n"
632 				      : :
633 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
634 				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
635 				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
636 				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
637 				      [x1d]"r"(0x1d)
638 			);
639 		}
640 
641 		/* P/Q left side optimization */
642 		for (z = start - 1; z >= 0; z--) {
643 			/*
644 			 * w2$$ = MASK(wq$$);
645 			 * w1$$ = SHLBYTE(wq$$);
646 			 * w2$$ &= NBYTES(0x1d);
647 			 * wq$$ = w1$$ ^ w2$$;
648 			 */
649 			asm volatile (".option	push\n"
650 				      ".option	arch,+v\n"
651 				      "vsra.vi	v2, v1, 7\n"
652 				      "vsll.vi	v3, v1, 1\n"
653 				      "vand.vx	v2, v2, %[x1d]\n"
654 				      "vxor.vv	v1, v3, v2\n"
655 
656 				      "vsra.vi	v6, v5, 7\n"
657 				      "vsll.vi	v7, v5, 1\n"
658 				      "vand.vx	v6, v6, %[x1d]\n"
659 				      "vxor.vv	v5, v7, v6\n"
660 
661 				      "vsra.vi	v10, v9, 7\n"
662 				      "vsll.vi	v11, v9, 1\n"
663 				      "vand.vx	v10, v10, %[x1d]\n"
664 				      "vxor.vv	v9, v11, v10\n"
665 
666 				      "vsra.vi	v14, v13, 7\n"
667 				      "vsll.vi	v15, v13, 1\n"
668 				      "vand.vx	v14, v14, %[x1d]\n"
669 				      "vxor.vv	v13, v15, v14\n"
670 				      ".option	pop\n"
671 				      : :
672 				      [x1d]"r"(0x1d)
673 			);
674 		}
675 
676 		/*
677 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
678 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
679 		 * v0:wp0, v1:wq0, v2:p0, v3:q0
680 		 * v4:wp1, v5:wq1, v6:p1, v7:q1
681 		 * v8:wp2, v9:wq2, v10:p2, v11:q2
682 		 * v12:wp3, v13:wq3, v14:p3, v15:q3
683 		 */
684 		asm volatile (".option	push\n"
685 			      ".option	arch,+v\n"
686 			      "vle8.v	v2, (%[wp0])\n"
687 			      "vle8.v	v3, (%[wq0])\n"
688 			      "vxor.vv	v2, v2, v0\n"
689 			      "vxor.vv	v3, v3, v1\n"
690 			      "vse8.v	v2, (%[wp0])\n"
691 			      "vse8.v	v3, (%[wq0])\n"
692 
693 			      "vle8.v	v6, (%[wp1])\n"
694 			      "vle8.v	v7, (%[wq1])\n"
695 			      "vxor.vv	v6, v6, v4\n"
696 			      "vxor.vv	v7, v7, v5\n"
697 			      "vse8.v	v6, (%[wp1])\n"
698 			      "vse8.v	v7, (%[wq1])\n"
699 
700 			      "vle8.v	v10, (%[wp2])\n"
701 			      "vle8.v	v11, (%[wq2])\n"
702 			      "vxor.vv	v10, v10, v8\n"
703 			      "vxor.vv	v11, v11, v9\n"
704 			      "vse8.v	v10, (%[wp2])\n"
705 			      "vse8.v	v11, (%[wq2])\n"
706 
707 			      "vle8.v	v14, (%[wp3])\n"
708 			      "vle8.v	v15, (%[wq3])\n"
709 			      "vxor.vv	v14, v14, v12\n"
710 			      "vxor.vv	v15, v15, v13\n"
711 			      "vse8.v	v14, (%[wp3])\n"
712 			      "vse8.v	v15, (%[wq3])\n"
713 			      ".option	pop\n"
714 			      : :
715 			      [wp0]"r"(&p[d + NSIZE * 0]),
716 			      [wq0]"r"(&q[d + NSIZE * 0]),
717 			      [wp1]"r"(&p[d + NSIZE * 1]),
718 			      [wq1]"r"(&q[d + NSIZE * 1]),
719 			      [wp2]"r"(&p[d + NSIZE * 2]),
720 			      [wq2]"r"(&q[d + NSIZE * 2]),
721 			      [wp3]"r"(&p[d + NSIZE * 3]),
722 			      [wq3]"r"(&q[d + NSIZE * 3])
723 		);
724 	}
725 }
726 
727 static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
728 {
729 	u8 **dptr = (u8 **)ptrs;
730 	u8 *p, *q;
731 	unsigned long vl, d;
732 	int z, z0;
733 
734 	z0 = disks - 3;	/* Highest data disk */
735 	p = dptr[z0 + 1];	/* XOR parity */
736 	q = dptr[z0 + 2];	/* RS syndrome */
737 
738 	asm volatile (".option	push\n"
739 		      ".option	arch,+v\n"
740 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
741 		      ".option	pop\n"
742 		      : "=&r" (vl)
743 	);
744 
745 	/*
746 	 * v0:wp0,   v1:wq0,  v2:wd0/w20,  v3:w10
747 	 * v4:wp1,   v5:wq1,  v6:wd1/w21,  v7:w11
748 	 * v8:wp2,   v9:wq2, v10:wd2/w22, v11:w12
749 	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
750 	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
751 	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
752 	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
753 	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
754 	 */
755 	for (d = 0; d < bytes; d += NSIZE * 8) {
756 		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
757 		asm volatile (".option	push\n"
758 			      ".option	arch,+v\n"
759 			      "vle8.v	v0, (%[wp0])\n"
760 			      "vle8.v	v1, (%[wp0])\n"
761 			      "vle8.v	v4, (%[wp1])\n"
762 			      "vle8.v	v5, (%[wp1])\n"
763 			      "vle8.v	v8, (%[wp2])\n"
764 			      "vle8.v	v9, (%[wp2])\n"
765 			      "vle8.v	v12, (%[wp3])\n"
766 			      "vle8.v	v13, (%[wp3])\n"
767 			      "vle8.v	v16, (%[wp4])\n"
768 			      "vle8.v	v17, (%[wp4])\n"
769 			      "vle8.v	v20, (%[wp5])\n"
770 			      "vle8.v	v21, (%[wp5])\n"
771 			      "vle8.v	v24, (%[wp6])\n"
772 			      "vle8.v	v25, (%[wp6])\n"
773 			      "vle8.v	v28, (%[wp7])\n"
774 			      "vle8.v	v29, (%[wp7])\n"
775 			      ".option	pop\n"
776 			      : :
777 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
778 			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
779 			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
780 			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE]),
781 			      [wp4]"r"(&dptr[z0][d + 4 * NSIZE]),
782 			      [wp5]"r"(&dptr[z0][d + 5 * NSIZE]),
783 			      [wp6]"r"(&dptr[z0][d + 6 * NSIZE]),
784 			      [wp7]"r"(&dptr[z0][d + 7 * NSIZE])
785 		);
786 
787 		for (z = z0 - 1; z >= 0; z--) {
788 			/*
789 			 * w2$$ = MASK(wq$$);
790 			 * w1$$ = SHLBYTE(wq$$);
791 			 * w2$$ &= NBYTES(0x1d);
792 			 * w1$$ ^= w2$$;
793 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
794 			 * wq$$ = w1$$ ^ wd$$;
795 			 * wp$$ ^= wd$$;
796 			 */
797 			asm volatile (".option	push\n"
798 				      ".option	arch,+v\n"
799 				      "vsra.vi	v2, v1, 7\n"
800 				      "vsll.vi	v3, v1, 1\n"
801 				      "vand.vx	v2, v2, %[x1d]\n"
802 				      "vxor.vv	v3, v3, v2\n"
803 				      "vle8.v	v2, (%[wd0])\n"
804 				      "vxor.vv	v1, v3, v2\n"
805 				      "vxor.vv	v0, v0, v2\n"
806 
807 				      "vsra.vi	v6, v5, 7\n"
808 				      "vsll.vi	v7, v5, 1\n"
809 				      "vand.vx	v6, v6, %[x1d]\n"
810 				      "vxor.vv	v7, v7, v6\n"
811 				      "vle8.v	v6, (%[wd1])\n"
812 				      "vxor.vv	v5, v7, v6\n"
813 				      "vxor.vv	v4, v4, v6\n"
814 
815 				      "vsra.vi	v10, v9, 7\n"
816 				      "vsll.vi	v11, v9, 1\n"
817 				      "vand.vx	v10, v10, %[x1d]\n"
818 				      "vxor.vv	v11, v11, v10\n"
819 				      "vle8.v	v10, (%[wd2])\n"
820 				      "vxor.vv	v9, v11, v10\n"
821 				      "vxor.vv	v8, v8, v10\n"
822 
823 				      "vsra.vi	v14, v13, 7\n"
824 				      "vsll.vi	v15, v13, 1\n"
825 				      "vand.vx	v14, v14, %[x1d]\n"
826 				      "vxor.vv	v15, v15, v14\n"
827 				      "vle8.v	v14, (%[wd3])\n"
828 				      "vxor.vv	v13, v15, v14\n"
829 				      "vxor.vv	v12, v12, v14\n"
830 
831 				      "vsra.vi	v18, v17, 7\n"
832 				      "vsll.vi	v19, v17, 1\n"
833 				      "vand.vx	v18, v18, %[x1d]\n"
834 				      "vxor.vv	v19, v19, v18\n"
835 				      "vle8.v	v18, (%[wd4])\n"
836 				      "vxor.vv	v17, v19, v18\n"
837 				      "vxor.vv	v16, v16, v18\n"
838 
839 				      "vsra.vi	v22, v21, 7\n"
840 				      "vsll.vi	v23, v21, 1\n"
841 				      "vand.vx	v22, v22, %[x1d]\n"
842 				      "vxor.vv	v23, v23, v22\n"
843 				      "vle8.v	v22, (%[wd5])\n"
844 				      "vxor.vv	v21, v23, v22\n"
845 				      "vxor.vv	v20, v20, v22\n"
846 
847 				      "vsra.vi	v26, v25, 7\n"
848 				      "vsll.vi	v27, v25, 1\n"
849 				      "vand.vx	v26, v26, %[x1d]\n"
850 				      "vxor.vv	v27, v27, v26\n"
851 				      "vle8.v	v26, (%[wd6])\n"
852 				      "vxor.vv	v25, v27, v26\n"
853 				      "vxor.vv	v24, v24, v26\n"
854 
855 				      "vsra.vi	v30, v29, 7\n"
856 				      "vsll.vi	v31, v29, 1\n"
857 				      "vand.vx	v30, v30, %[x1d]\n"
858 				      "vxor.vv	v31, v31, v30\n"
859 				      "vle8.v	v30, (%[wd7])\n"
860 				      "vxor.vv	v29, v31, v30\n"
861 				      "vxor.vv	v28, v28, v30\n"
862 				      ".option	pop\n"
863 				      : :
864 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
865 				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
866 				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
867 				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
868 				      [wd4]"r"(&dptr[z][d + 4 * NSIZE]),
869 				      [wd5]"r"(&dptr[z][d + 5 * NSIZE]),
870 				      [wd6]"r"(&dptr[z][d + 6 * NSIZE]),
871 				      [wd7]"r"(&dptr[z][d + 7 * NSIZE]),
872 				      [x1d]"r"(0x1d)
873 			);
874 		}
875 
876 		/*
877 		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
878 		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
879 		 */
880 		asm volatile (".option	push\n"
881 			      ".option	arch,+v\n"
882 			      "vse8.v	v0, (%[wp0])\n"
883 			      "vse8.v	v1, (%[wq0])\n"
884 			      "vse8.v	v4, (%[wp1])\n"
885 			      "vse8.v	v5, (%[wq1])\n"
886 			      "vse8.v	v8, (%[wp2])\n"
887 			      "vse8.v	v9, (%[wq2])\n"
888 			      "vse8.v	v12, (%[wp3])\n"
889 			      "vse8.v	v13, (%[wq3])\n"
890 			      "vse8.v	v16, (%[wp4])\n"
891 			      "vse8.v	v17, (%[wq4])\n"
892 			      "vse8.v	v20, (%[wp5])\n"
893 			      "vse8.v	v21, (%[wq5])\n"
894 			      "vse8.v	v24, (%[wp6])\n"
895 			      "vse8.v	v25, (%[wq6])\n"
896 			      "vse8.v	v28, (%[wp7])\n"
897 			      "vse8.v	v29, (%[wq7])\n"
898 			      ".option	pop\n"
899 			      : :
900 			      [wp0]"r"(&p[d + NSIZE * 0]),
901 			      [wq0]"r"(&q[d + NSIZE * 0]),
902 			      [wp1]"r"(&p[d + NSIZE * 1]),
903 			      [wq1]"r"(&q[d + NSIZE * 1]),
904 			      [wp2]"r"(&p[d + NSIZE * 2]),
905 			      [wq2]"r"(&q[d + NSIZE * 2]),
906 			      [wp3]"r"(&p[d + NSIZE * 3]),
907 			      [wq3]"r"(&q[d + NSIZE * 3]),
908 			      [wp4]"r"(&p[d + NSIZE * 4]),
909 			      [wq4]"r"(&q[d + NSIZE * 4]),
910 			      [wp5]"r"(&p[d + NSIZE * 5]),
911 			      [wq5]"r"(&q[d + NSIZE * 5]),
912 			      [wp6]"r"(&p[d + NSIZE * 6]),
913 			      [wq6]"r"(&q[d + NSIZE * 6]),
914 			      [wp7]"r"(&p[d + NSIZE * 7]),
915 			      [wq7]"r"(&q[d + NSIZE * 7])
916 		);
917 	}
918 }
919 
920 static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
921 					 unsigned long bytes, void **ptrs)
922 {
923 	u8 **dptr = (u8 **)ptrs;
924 	u8 *p, *q;
925 	unsigned long vl, d;
926 	int z, z0;
927 
928 	z0 = stop;		/* P/Q right side optimization */
929 	p = dptr[disks - 2];	/* XOR parity */
930 	q = dptr[disks - 1];	/* RS syndrome */
931 
932 	asm volatile (".option	push\n"
933 		      ".option	arch,+v\n"
934 		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
935 		      ".option	pop\n"
936 		      : "=&r" (vl)
937 	);
938 
939 	/*
940 	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
941 	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
942 	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
943 	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
944 	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
945 	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
946 	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
947 	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
948 	 */
949 	for (d = 0; d < bytes; d += NSIZE * 8) {
950 		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
951 		asm volatile (".option	push\n"
952 			      ".option	arch,+v\n"
953 			      "vle8.v	v0, (%[wp0])\n"
954 			      "vle8.v	v1, (%[wp0])\n"
955 			      "vle8.v	v4, (%[wp1])\n"
956 			      "vle8.v	v5, (%[wp1])\n"
957 			      "vle8.v	v8, (%[wp2])\n"
958 			      "vle8.v	v9, (%[wp2])\n"
959 			      "vle8.v	v12, (%[wp3])\n"
960 			      "vle8.v	v13, (%[wp3])\n"
961 			      "vle8.v	v16, (%[wp4])\n"
962 			      "vle8.v	v17, (%[wp4])\n"
963 			      "vle8.v	v20, (%[wp5])\n"
964 			      "vle8.v	v21, (%[wp5])\n"
965 			      "vle8.v	v24, (%[wp6])\n"
966 			      "vle8.v	v25, (%[wp6])\n"
967 			      "vle8.v	v28, (%[wp7])\n"
968 			      "vle8.v	v29, (%[wp7])\n"
969 			      ".option	pop\n"
970 			      : :
971 			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
972 			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
973 			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
974 			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE]),
975 			      [wp4]"r"(&dptr[z0][d + 4 * NSIZE]),
976 			      [wp5]"r"(&dptr[z0][d + 5 * NSIZE]),
977 			      [wp6]"r"(&dptr[z0][d + 6 * NSIZE]),
978 			      [wp7]"r"(&dptr[z0][d + 7 * NSIZE])
979 		);
980 
981 		/* P/Q data pages */
982 		for (z = z0 - 1; z >= start; z--) {
983 			/*
984 			 * w2$$ = MASK(wq$$);
985 			 * w1$$ = SHLBYTE(wq$$);
986 			 * w2$$ &= NBYTES(0x1d);
987 			 * w1$$ ^= w2$$;
988 			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
989 			 * wq$$ = w1$$ ^ wd$$;
990 			 * wp$$ ^= wd$$;
991 			 */
992 			asm volatile (".option	push\n"
993 				      ".option	arch,+v\n"
994 				      "vsra.vi	v2, v1, 7\n"
995 				      "vsll.vi	v3, v1, 1\n"
996 				      "vand.vx	v2, v2, %[x1d]\n"
997 				      "vxor.vv	v3, v3, v2\n"
998 				      "vle8.v	v2, (%[wd0])\n"
999 				      "vxor.vv	v1, v3, v2\n"
1000 				      "vxor.vv	v0, v0, v2\n"
1001 
1002 				      "vsra.vi	v6, v5, 7\n"
1003 				      "vsll.vi	v7, v5, 1\n"
1004 				      "vand.vx	v6, v6, %[x1d]\n"
1005 				      "vxor.vv	v7, v7, v6\n"
1006 				      "vle8.v	v6, (%[wd1])\n"
1007 				      "vxor.vv	v5, v7, v6\n"
1008 				      "vxor.vv	v4, v4, v6\n"
1009 
1010 				      "vsra.vi	v10, v9, 7\n"
1011 				      "vsll.vi	v11, v9, 1\n"
1012 				      "vand.vx	v10, v10, %[x1d]\n"
1013 				      "vxor.vv	v11, v11, v10\n"
1014 				      "vle8.v	v10, (%[wd2])\n"
1015 				      "vxor.vv	v9, v11, v10\n"
1016 				      "vxor.vv	v8, v8, v10\n"
1017 
1018 				      "vsra.vi	v14, v13, 7\n"
1019 				      "vsll.vi	v15, v13, 1\n"
1020 				      "vand.vx	v14, v14, %[x1d]\n"
1021 				      "vxor.vv	v15, v15, v14\n"
1022 				      "vle8.v	v14, (%[wd3])\n"
1023 				      "vxor.vv	v13, v15, v14\n"
1024 				      "vxor.vv	v12, v12, v14\n"
1025 
1026 				      "vsra.vi	v18, v17, 7\n"
1027 				      "vsll.vi	v19, v17, 1\n"
1028 				      "vand.vx	v18, v18, %[x1d]\n"
1029 				      "vxor.vv	v19, v19, v18\n"
1030 				      "vle8.v	v18, (%[wd4])\n"
1031 				      "vxor.vv	v17, v19, v18\n"
1032 				      "vxor.vv	v16, v16, v18\n"
1033 
1034 				      "vsra.vi	v22, v21, 7\n"
1035 				      "vsll.vi	v23, v21, 1\n"
1036 				      "vand.vx	v22, v22, %[x1d]\n"
1037 				      "vxor.vv	v23, v23, v22\n"
1038 				      "vle8.v	v22, (%[wd5])\n"
1039 				      "vxor.vv	v21, v23, v22\n"
1040 				      "vxor.vv	v20, v20, v22\n"
1041 
1042 				      "vsra.vi	v26, v25, 7\n"
1043 				      "vsll.vi	v27, v25, 1\n"
1044 				      "vand.vx	v26, v26, %[x1d]\n"
1045 				      "vxor.vv	v27, v27, v26\n"
1046 				      "vle8.v	v26, (%[wd6])\n"
1047 				      "vxor.vv	v25, v27, v26\n"
1048 				      "vxor.vv	v24, v24, v26\n"
1049 
1050 				      "vsra.vi	v30, v29, 7\n"
1051 				      "vsll.vi	v31, v29, 1\n"
1052 				      "vand.vx	v30, v30, %[x1d]\n"
1053 				      "vxor.vv	v31, v31, v30\n"
1054 				      "vle8.v	v30, (%[wd7])\n"
1055 				      "vxor.vv	v29, v31, v30\n"
1056 				      "vxor.vv	v28, v28, v30\n"
1057 				      ".option	pop\n"
1058 				      : :
1059 				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
1060 				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
1061 				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
1062 				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
1063 				      [wd4]"r"(&dptr[z][d + 4 * NSIZE]),
1064 				      [wd5]"r"(&dptr[z][d + 5 * NSIZE]),
1065 				      [wd6]"r"(&dptr[z][d + 6 * NSIZE]),
1066 				      [wd7]"r"(&dptr[z][d + 7 * NSIZE]),
1067 				      [x1d]"r"(0x1d)
1068 			);
1069 		}
1070 
1071 		/* P/Q left side optimization */
1072 		for (z = start - 1; z >= 0; z--) {
1073 			/*
1074 			 * w2$$ = MASK(wq$$);
1075 			 * w1$$ = SHLBYTE(wq$$);
1076 			 * w2$$ &= NBYTES(0x1d);
1077 			 * wq$$ = w1$$ ^ w2$$;
1078 			 */
1079 			asm volatile (".option	push\n"
1080 				      ".option	arch,+v\n"
1081 				      "vsra.vi	v2, v1, 7\n"
1082 				      "vsll.vi	v3, v1, 1\n"
1083 				      "vand.vx	v2, v2, %[x1d]\n"
1084 				      "vxor.vv	v1, v3, v2\n"
1085 
1086 				      "vsra.vi	v6, v5, 7\n"
1087 				      "vsll.vi	v7, v5, 1\n"
1088 				      "vand.vx	v6, v6, %[x1d]\n"
1089 				      "vxor.vv	v5, v7, v6\n"
1090 
1091 				      "vsra.vi	v10, v9, 7\n"
1092 				      "vsll.vi	v11, v9, 1\n"
1093 				      "vand.vx	v10, v10, %[x1d]\n"
1094 				      "vxor.vv	v9, v11, v10\n"
1095 
1096 				      "vsra.vi	v14, v13, 7\n"
1097 				      "vsll.vi	v15, v13, 1\n"
1098 				      "vand.vx	v14, v14, %[x1d]\n"
1099 				      "vxor.vv	v13, v15, v14\n"
1100 
1101 				      "vsra.vi	v18, v17, 7\n"
1102 				      "vsll.vi	v19, v17, 1\n"
1103 				      "vand.vx	v18, v18, %[x1d]\n"
1104 				      "vxor.vv	v17, v19, v18\n"
1105 
1106 				      "vsra.vi	v22, v21, 7\n"
1107 				      "vsll.vi	v23, v21, 1\n"
1108 				      "vand.vx	v22, v22, %[x1d]\n"
1109 				      "vxor.vv	v21, v23, v22\n"
1110 
1111 				      "vsra.vi	v26, v25, 7\n"
1112 				      "vsll.vi	v27, v25, 1\n"
1113 				      "vand.vx	v26, v26, %[x1d]\n"
1114 				      "vxor.vv	v25, v27, v26\n"
1115 
1116 				      "vsra.vi	v30, v29, 7\n"
1117 				      "vsll.vi	v31, v29, 1\n"
1118 				      "vand.vx	v30, v30, %[x1d]\n"
1119 				      "vxor.vv	v29, v31, v30\n"
1120 				      ".option	pop\n"
1121 				      : :
1122 				      [x1d]"r"(0x1d)
1123 			);
1124 		}
1125 
1126 		/*
1127 		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
1128 		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
1129 		 * v0:wp0, v1:wq0, v2:p0, v3:q0
1130 		 * v4:wp1, v5:wq1, v6:p1, v7:q1
1131 		 * v8:wp2, v9:wq2, v10:p2, v11:q2
1132 		 * v12:wp3, v13:wq3, v14:p3, v15:q3
1133 		 * v16:wp4, v17:wq4, v18:p4, v19:q4
1134 		 * v20:wp5, v21:wq5, v22:p5, v23:q5
1135 		 * v24:wp6, v25:wq6, v26:p6, v27:q6
1136 		 * v28:wp7, v29:wq7, v30:p7, v31:q7
1137 		 */
1138 		asm volatile (".option	push\n"
1139 			      ".option	arch,+v\n"
1140 			      "vle8.v	v2, (%[wp0])\n"
1141 			      "vle8.v	v3, (%[wq0])\n"
1142 			      "vxor.vv	v2, v2, v0\n"
1143 			      "vxor.vv	v3, v3, v1\n"
1144 			      "vse8.v	v2, (%[wp0])\n"
1145 			      "vse8.v	v3, (%[wq0])\n"
1146 
1147 			      "vle8.v	v6, (%[wp1])\n"
1148 			      "vle8.v	v7, (%[wq1])\n"
1149 			      "vxor.vv	v6, v6, v4\n"
1150 			      "vxor.vv	v7, v7, v5\n"
1151 			      "vse8.v	v6, (%[wp1])\n"
1152 			      "vse8.v	v7, (%[wq1])\n"
1153 
1154 			      "vle8.v	v10, (%[wp2])\n"
1155 			      "vle8.v	v11, (%[wq2])\n"
1156 			      "vxor.vv	v10, v10, v8\n"
1157 			      "vxor.vv	v11, v11, v9\n"
1158 			      "vse8.v	v10, (%[wp2])\n"
1159 			      "vse8.v	v11, (%[wq2])\n"
1160 
1161 			      "vle8.v	v14, (%[wp3])\n"
1162 			      "vle8.v	v15, (%[wq3])\n"
1163 			      "vxor.vv	v14, v14, v12\n"
1164 			      "vxor.vv	v15, v15, v13\n"
1165 			      "vse8.v	v14, (%[wp3])\n"
1166 			      "vse8.v	v15, (%[wq3])\n"
1167 
1168 			      "vle8.v	v18, (%[wp4])\n"
1169 			      "vle8.v	v19, (%[wq4])\n"
1170 			      "vxor.vv	v18, v18, v16\n"
1171 			      "vxor.vv	v19, v19, v17\n"
1172 			      "vse8.v	v18, (%[wp4])\n"
1173 			      "vse8.v	v19, (%[wq4])\n"
1174 
1175 			      "vle8.v	v22, (%[wp5])\n"
1176 			      "vle8.v	v23, (%[wq5])\n"
1177 			      "vxor.vv	v22, v22, v20\n"
1178 			      "vxor.vv	v23, v23, v21\n"
1179 			      "vse8.v	v22, (%[wp5])\n"
1180 			      "vse8.v	v23, (%[wq5])\n"
1181 
1182 			      "vle8.v	v26, (%[wp6])\n"
1183 			      "vle8.v	v27, (%[wq6])\n"
1184 			      "vxor.vv	v26, v26, v24\n"
1185 			      "vxor.vv	v27, v27, v25\n"
1186 			      "vse8.v	v26, (%[wp6])\n"
1187 			      "vse8.v	v27, (%[wq6])\n"
1188 
1189 			      "vle8.v	v30, (%[wp7])\n"
1190 			      "vle8.v	v31, (%[wq7])\n"
1191 			      "vxor.vv	v30, v30, v28\n"
1192 			      "vxor.vv	v31, v31, v29\n"
1193 			      "vse8.v	v30, (%[wp7])\n"
1194 			      "vse8.v	v31, (%[wq7])\n"
1195 			      ".option	pop\n"
1196 			      : :
1197 			      [wp0]"r"(&p[d + NSIZE * 0]),
1198 			      [wq0]"r"(&q[d + NSIZE * 0]),
1199 			      [wp1]"r"(&p[d + NSIZE * 1]),
1200 			      [wq1]"r"(&q[d + NSIZE * 1]),
1201 			      [wp2]"r"(&p[d + NSIZE * 2]),
1202 			      [wq2]"r"(&q[d + NSIZE * 2]),
1203 			      [wp3]"r"(&p[d + NSIZE * 3]),
1204 			      [wq3]"r"(&q[d + NSIZE * 3]),
1205 			      [wp4]"r"(&p[d + NSIZE * 4]),
1206 			      [wq4]"r"(&q[d + NSIZE * 4]),
1207 			      [wp5]"r"(&p[d + NSIZE * 5]),
1208 			      [wq5]"r"(&q[d + NSIZE * 5]),
1209 			      [wp6]"r"(&p[d + NSIZE * 6]),
1210 			      [wq6]"r"(&q[d + NSIZE * 6]),
1211 			      [wp7]"r"(&p[d + NSIZE * 7]),
1212 			      [wq7]"r"(&q[d + NSIZE * 7])
1213 		);
1214 	}
1215 }
1216 
1217 RAID6_RVV_WRAPPER(1);
1218 RAID6_RVV_WRAPPER(2);
1219 RAID6_RVV_WRAPPER(4);
1220 RAID6_RVV_WRAPPER(8);
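
/*
 * RAID6_RVV_WRAPPER(_n) is defined in rvv.h.  For each unroll factor it is
 * expected to expand to roughly the following (a sketch, not the literal
 * macro body - see rvv.h for the real thing):
 *
 *	static void raid6_rvvN_gen_syndrome(int disks, size_t bytes, void **ptrs)
 *	{
 *		if (crypto_simd_usable()) {
 *			kernel_vector_begin();
 *			raid6_rvvN_gen_syndrome_real(disks, bytes, ptrs);
 *			kernel_vector_end();
 *		}
 *	}
 *
 * plus a matching xor_syndrome wrapper and a struct raid6_calls instance
 * ("rvvx1" ... "rvvx8"), presumably using rvv_has_vector() as its .valid
 * hook.  The raid6 core benchmarks the available implementations at boot
 * and picks the fastest, so all four unroll factors are provided and the
 * winner depends on the CPU's VLEN and memory system.
 */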
1221