// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RAID-6 syndrome calculation using RISC-V vector instructions
 *
 * Copyright 2024 Institute of Software, CAS.
 * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
 *
 * Based on neon.uc:
 *	Copyright 2002-2004 H. Peter Anvin
 */

#include <asm/vector.h>
#include <linux/raid/pq.h>
#include "rvv.h"

#define NSIZE	(riscv_v_vsize / 32) /* NSIZE = vlenb */
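/*
 * riscv_v_vsize is the space needed to save all 32 vector registers,
 * i.e. 32 * vlenb, so NSIZE is the byte width of a single vector
 * register at LMUL=1 and plays the role of sizeof(unative_t) in the
 * generic unrolled RAID-6 code.
 */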

static int rvv_has_vector(void)
{
	return has_vector();
}

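/*
 * Generate both syndromes over data disks 0..disks-3: dptr[disks-2]
 * receives the plain XOR parity P and dptr[disks-1] the GF(2^8)
 * Reed-Solomon syndrome Q, NSIZE bytes (one vector register) per loop
 * iteration.
 */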
static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];		/* XOR parity */
	q = dptr[z0 + 2];		/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);
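
	/*
	 * Each inner-loop step multiplies the running Q value by 2 in
	 * GF(2^8) (generator polynomial 0x11d) before folding in the next
	 * data block:
	 *
	 *	w2 = wq >> 7 (arithmetic, per byte)	-> MASK()
	 *	w1 = wq << 1				-> SHLBYTE()
	 *	w2 &= 0x1d; wq = w1 ^ w2		-> reduction
	 *
	 * which is what the vsra.vi/vsll.vi/vand.vx/vxor.vv sequences below
	 * implement.
	 */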

	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
	for (d = 0; d < bytes; d += NSIZE * 1) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0])
		);
	}
}

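/*
 * Partial syndrome update (the ->xor_syndrome hook): XOR the P/Q
 * contribution of data disks @start..@stop into the existing parity
 * pages instead of regenerating them from all disks, which is what the
 * RAID-6 read-modify-write path wants.
 */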
static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
	for (d = 0; d < bytes; d += NSIZE * 1) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
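		/*
		 * Data disks below @start are untouched, so they contribute
		 * nothing new to P; only the Q weighting still has to
		 * advance, i.e. wq is multiplied by 2 once per remaining
		 * lower disk.
		 */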
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0])
		);
	}
}

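/*
 * The rvv2/rvv4/rvv8 variants below follow exactly the same scheme as the
 * rvv1 pair, but each loop iteration handles 2, 4 or 8 vector registers'
 * worth of data (NSIZE * N bytes), using more of the register file to cut
 * down the number of loop iterations.
 */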
static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];		/* XOR parity */
	q = dptr[z0 + 2];		/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 */
	for (d = 0; d < bytes; d += NSIZE * 2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      "vse8.v	v4, (%[wp1])\n"
			      "vse8.v	v5, (%[wq1])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0]),
			      [wp1]"r"(&p[d + NSIZE * 1]),
			      [wq1]"r"(&q[d + NSIZE * 1])
		);
	}
}

static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 */
	for (d = 0; d < bytes; d += NSIZE * 2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v5, v7, v6\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"

			      "vle8.v	v6, (%[wp1])\n"
			      "vle8.v	v7, (%[wq1])\n"
			      "vxor.vv	v6, v6, v4\n"
			      "vxor.vv	v7, v7, v5\n"
			      "vse8.v	v6, (%[wp1])\n"
			      "vse8.v	v7, (%[wq1])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0]),
			      [wp1]"r"(&p[d + NSIZE * 1]),
			      [wq1]"r"(&q[d + NSIZE * 1])
		);
	}
}

static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = disks - 3;	/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 */
	for (d = 0; d < bytes; d += NSIZE * 4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      "vse8.v	v4, (%[wp1])\n"
			      "vse8.v	v5, (%[wq1])\n"
			      "vse8.v	v8, (%[wp2])\n"
			      "vse8.v	v9, (%[wq2])\n"
			      "vse8.v	v12, (%[wp3])\n"
			      "vse8.v	v13, (%[wq3])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0]),
			      [wp1]"r"(&p[d + NSIZE * 1]),
			      [wq1]"r"(&q[d + NSIZE * 1]),
			      [wp2]"r"(&p[d + NSIZE * 2]),
			      [wq2]"r"(&q[d + NSIZE * 2]),
			      [wp3]"r"(&p[d + NSIZE * 3]),
			      [wq3]"r"(&q[d + NSIZE * 3])
		);
	}
}

static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 */
	for (d = 0; d < bytes; d += NSIZE * 4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v5, v7, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v9, v11, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v13, v15, v14\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 * v8:wp2, v9:wq2, v10:p2, v11:q2
		 * v12:wp3, v13:wq3, v14:p3, v15:q3
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"

			      "vle8.v	v6, (%[wp1])\n"
			      "vle8.v	v7, (%[wq1])\n"
			      "vxor.vv	v6, v6, v4\n"
			      "vxor.vv	v7, v7, v5\n"
			      "vse8.v	v6, (%[wp1])\n"
			      "vse8.v	v7, (%[wq1])\n"

			      "vle8.v	v10, (%[wp2])\n"
			      "vle8.v	v11, (%[wq2])\n"
			      "vxor.vv	v10, v10, v8\n"
			      "vxor.vv	v11, v11, v9\n"
			      "vse8.v	v10, (%[wp2])\n"
			      "vse8.v	v11, (%[wq2])\n"

			      "vle8.v	v14, (%[wp3])\n"
			      "vle8.v	v15, (%[wq3])\n"
			      "vxor.vv	v14, v14, v12\n"
			      "vxor.vv	v15, v15, v13\n"
			      "vse8.v	v14, (%[wp3])\n"
			      "vse8.v	v15, (%[wq3])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0]),
			      [wp1]"r"(&p[d + NSIZE * 1]),
			      [wq1]"r"(&q[d + NSIZE * 1]),
			      [wp2]"r"(&p[d + NSIZE * 2]),
			      [wq2]"r"(&q[d + NSIZE * 2]),
			      [wp3]"r"(&p[d + NSIZE * 3]),
			      [wq3]"r"(&q[d + NSIZE * 3])
		);
	}
}

static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = disks - 3;	/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	/*
	 * v0:wp0,   v1:wq0,  v2:wd0/w20,  v3:w10
	 * v4:wp1,   v5:wq1,  v6:wd1/w21,  v7:w11
	 * v8:wp2,   v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
	 */
	for (d = 0; d < bytes; d += NSIZE * 8) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      "vle8.v	v16, (%[wp4])\n"
			      "vmv.v.v	v17, v16\n"
			      "vle8.v	v20, (%[wp5])\n"
			      "vmv.v.v	v21, v20\n"
			      "vle8.v	v24, (%[wp6])\n"
			      "vmv.v.v	v25, v24\n"
			      "vle8.v	v28, (%[wp7])\n"
			      "vmv.v.v	v29, v28\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE]),
			      [wp4]"r"(&dptr[z0][d + 4 * NSIZE]),
			      [wp5]"r"(&dptr[z0][d + 5 * NSIZE]),
			      [wp6]"r"(&dptr[z0][d + 6 * NSIZE]),
			      [wp7]"r"(&dptr[z0][d + 7 * NSIZE])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"

				      "vsra.vi	v18, v17, 7\n"
				      "vsll.vi	v19, v17, 1\n"
				      "vand.vx	v18, v18, %[x1d]\n"
				      "vxor.vv	v19, v19, v18\n"
				      "vle8.v	v18, (%[wd4])\n"
				      "vxor.vv	v17, v19, v18\n"
				      "vxor.vv	v16, v16, v18\n"

				      "vsra.vi	v22, v21, 7\n"
				      "vsll.vi	v23, v21, 1\n"
				      "vand.vx	v22, v22, %[x1d]\n"
				      "vxor.vv	v23, v23, v22\n"
				      "vle8.v	v22, (%[wd5])\n"
				      "vxor.vv	v21, v23, v22\n"
				      "vxor.vv	v20, v20, v22\n"

				      "vsra.vi	v26, v25, 7\n"
				      "vsll.vi	v27, v25, 1\n"
				      "vand.vx	v26, v26, %[x1d]\n"
				      "vxor.vv	v27, v27, v26\n"
				      "vle8.v	v26, (%[wd6])\n"
				      "vxor.vv	v25, v27, v26\n"
				      "vxor.vv	v24, v24, v26\n"

				      "vsra.vi	v30, v29, 7\n"
				      "vsll.vi	v31, v29, 1\n"
				      "vand.vx	v30, v30, %[x1d]\n"
				      "vxor.vv	v31, v31, v30\n"
				      "vle8.v	v30, (%[wd7])\n"
				      "vxor.vv	v29, v31, v30\n"
				      "vxor.vv	v28, v28, v30\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				      [wd4]"r"(&dptr[z][d + 4 * NSIZE]),
				      [wd5]"r"(&dptr[z][d + 5 * NSIZE]),
				      [wd6]"r"(&dptr[z][d + 6 * NSIZE]),
				      [wd7]"r"(&dptr[z][d + 7 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vse8.v	v0, (%[wp0])\n"
			      "vse8.v	v1, (%[wq0])\n"
			      "vse8.v	v4, (%[wp1])\n"
			      "vse8.v	v5, (%[wq1])\n"
			      "vse8.v	v8, (%[wp2])\n"
			      "vse8.v	v9, (%[wq2])\n"
			      "vse8.v	v12, (%[wp3])\n"
			      "vse8.v	v13, (%[wq3])\n"
			      "vse8.v	v16, (%[wp4])\n"
			      "vse8.v	v17, (%[wq4])\n"
			      "vse8.v	v20, (%[wp5])\n"
			      "vse8.v	v21, (%[wq5])\n"
			      "vse8.v	v24, (%[wp6])\n"
			      "vse8.v	v25, (%[wq6])\n"
			      "vse8.v	v28, (%[wp7])\n"
			      "vse8.v	v29, (%[wq7])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0]),
			      [wp1]"r"(&p[d + NSIZE * 1]),
			      [wq1]"r"(&q[d + NSIZE * 1]),
			      [wp2]"r"(&p[d + NSIZE * 2]),
			      [wq2]"r"(&q[d + NSIZE * 2]),
			      [wp3]"r"(&p[d + NSIZE * 3]),
			      [wq3]"r"(&q[d + NSIZE * 3]),
			      [wp4]"r"(&p[d + NSIZE * 4]),
			      [wq4]"r"(&q[d + NSIZE * 4]),
			      [wp5]"r"(&p[d + NSIZE * 5]),
			      [wq5]"r"(&q[d + NSIZE * 5]),
			      [wp6]"r"(&p[d + NSIZE * 6]),
			      [wq6]"r"(&q[d + NSIZE * 6]),
			      [wp7]"r"(&p[d + NSIZE * 7]),
			      [wq7]"r"(&q[d + NSIZE * 7])
		);
	}
}

static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long vl, d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option	push\n"
		      ".option	arch,+v\n"
		      "vsetvli	%0, x0, e8, m1, ta, ma\n"
		      ".option	pop\n"
		      : "=&r" (vl)
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
	 */
	for (d = 0; d < bytes; d += NSIZE * 8) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v0, (%[wp0])\n"
			      "vmv.v.v	v1, v0\n"
			      "vle8.v	v4, (%[wp1])\n"
			      "vmv.v.v	v5, v4\n"
			      "vle8.v	v8, (%[wp2])\n"
			      "vmv.v.v	v9, v8\n"
			      "vle8.v	v12, (%[wp3])\n"
			      "vmv.v.v	v13, v12\n"
			      "vle8.v	v16, (%[wp4])\n"
			      "vmv.v.v	v17, v16\n"
			      "vle8.v	v20, (%[wp5])\n"
			      "vmv.v.v	v21, v20\n"
			      "vle8.v	v24, (%[wp6])\n"
			      "vmv.v.v	v25, v24\n"
			      "vle8.v	v28, (%[wp7])\n"
			      "vmv.v.v	v29, v28\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			      [wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			      [wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			      [wp3]"r"(&dptr[z0][d + 3 * NSIZE]),
			      [wp4]"r"(&dptr[z0][d + 4 * NSIZE]),
			      [wp5]"r"(&dptr[z0][d + 5 * NSIZE]),
			      [wp6]"r"(&dptr[z0][d + 6 * NSIZE]),
			      [wp7]"r"(&dptr[z0][d + 7 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v3, v3, v2\n"
				      "vle8.v	v2, (%[wd0])\n"
				      "vxor.vv	v1, v3, v2\n"
				      "vxor.vv	v0, v0, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v7, v7, v6\n"
				      "vle8.v	v6, (%[wd1])\n"
				      "vxor.vv	v5, v7, v6\n"
				      "vxor.vv	v4, v4, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v11, v11, v10\n"
				      "vle8.v	v10, (%[wd2])\n"
				      "vxor.vv	v9, v11, v10\n"
				      "vxor.vv	v8, v8, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v15, v15, v14\n"
				      "vle8.v	v14, (%[wd3])\n"
				      "vxor.vv	v13, v15, v14\n"
				      "vxor.vv	v12, v12, v14\n"

				      "vsra.vi	v18, v17, 7\n"
				      "vsll.vi	v19, v17, 1\n"
				      "vand.vx	v18, v18, %[x1d]\n"
				      "vxor.vv	v19, v19, v18\n"
				      "vle8.v	v18, (%[wd4])\n"
				      "vxor.vv	v17, v19, v18\n"
				      "vxor.vv	v16, v16, v18\n"

				      "vsra.vi	v22, v21, 7\n"
				      "vsll.vi	v23, v21, 1\n"
				      "vand.vx	v22, v22, %[x1d]\n"
				      "vxor.vv	v23, v23, v22\n"
				      "vle8.v	v22, (%[wd5])\n"
				      "vxor.vv	v21, v23, v22\n"
				      "vxor.vv	v20, v20, v22\n"

				      "vsra.vi	v26, v25, 7\n"
				      "vsll.vi	v27, v25, 1\n"
				      "vand.vx	v26, v26, %[x1d]\n"
				      "vxor.vv	v27, v27, v26\n"
				      "vle8.v	v26, (%[wd6])\n"
				      "vxor.vv	v25, v27, v26\n"
				      "vxor.vv	v24, v24, v26\n"

				      "vsra.vi	v30, v29, 7\n"
				      "vsll.vi	v31, v29, 1\n"
				      "vand.vx	v30, v30, %[x1d]\n"
				      "vxor.vv	v31, v31, v30\n"
				      "vle8.v	v30, (%[wd7])\n"
				      "vxor.vv	v29, v31, v30\n"
				      "vxor.vv	v28, v28, v30\n"
				      ".option	pop\n"
				      : :
				      [wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				      [wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				      [wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				      [wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				      [wd4]"r"(&dptr[z][d + 4 * NSIZE]),
				      [wd5]"r"(&dptr[z][d + 5 * NSIZE]),
				      [wd6]"r"(&dptr[z][d + 6 * NSIZE]),
				      [wd7]"r"(&dptr[z][d + 7 * NSIZE]),
				      [x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option	push\n"
				      ".option	arch,+v\n"
				      "vsra.vi	v2, v1, 7\n"
				      "vsll.vi	v3, v1, 1\n"
				      "vand.vx	v2, v2, %[x1d]\n"
				      "vxor.vv	v1, v3, v2\n"

				      "vsra.vi	v6, v5, 7\n"
				      "vsll.vi	v7, v5, 1\n"
				      "vand.vx	v6, v6, %[x1d]\n"
				      "vxor.vv	v5, v7, v6\n"

				      "vsra.vi	v10, v9, 7\n"
				      "vsll.vi	v11, v9, 1\n"
				      "vand.vx	v10, v10, %[x1d]\n"
				      "vxor.vv	v9, v11, v10\n"

				      "vsra.vi	v14, v13, 7\n"
				      "vsll.vi	v15, v13, 1\n"
				      "vand.vx	v14, v14, %[x1d]\n"
				      "vxor.vv	v13, v15, v14\n"

				      "vsra.vi	v18, v17, 7\n"
				      "vsll.vi	v19, v17, 1\n"
				      "vand.vx	v18, v18, %[x1d]\n"
				      "vxor.vv	v17, v19, v18\n"

				      "vsra.vi	v22, v21, 7\n"
				      "vsll.vi	v23, v21, 1\n"
				      "vand.vx	v22, v22, %[x1d]\n"
				      "vxor.vv	v21, v23, v22\n"

				      "vsra.vi	v26, v25, 7\n"
				      "vsll.vi	v27, v25, 1\n"
				      "vand.vx	v26, v26, %[x1d]\n"
				      "vxor.vv	v25, v27, v26\n"

				      "vsra.vi	v30, v29, 7\n"
				      "vsll.vi	v31, v29, 1\n"
				      "vand.vx	v30, v30, %[x1d]\n"
				      "vxor.vv	v29, v31, v30\n"
				      ".option	pop\n"
				      : :
				      [x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 * v8:wp2, v9:wq2, v10:p2, v11:q2
		 * v12:wp3, v13:wq3, v14:p3, v15:q3
		 * v16:wp4, v17:wq4, v18:p4, v19:q4
		 * v20:wp5, v21:wq5, v22:p5, v23:q5
		 * v24:wp6, v25:wq6, v26:p6, v27:q6
		 * v28:wp7, v29:wq7, v30:p7, v31:q7
		 */
		asm volatile (".option	push\n"
			      ".option	arch,+v\n"
			      "vle8.v	v2, (%[wp0])\n"
			      "vle8.v	v3, (%[wq0])\n"
			      "vxor.vv	v2, v2, v0\n"
			      "vxor.vv	v3, v3, v1\n"
			      "vse8.v	v2, (%[wp0])\n"
			      "vse8.v	v3, (%[wq0])\n"

			      "vle8.v	v6, (%[wp1])\n"
			      "vle8.v	v7, (%[wq1])\n"
			      "vxor.vv	v6, v6, v4\n"
			      "vxor.vv	v7, v7, v5\n"
			      "vse8.v	v6, (%[wp1])\n"
			      "vse8.v	v7, (%[wq1])\n"

			      "vle8.v	v10, (%[wp2])\n"
			      "vle8.v	v11, (%[wq2])\n"
			      "vxor.vv	v10, v10, v8\n"
			      "vxor.vv	v11, v11, v9\n"
			      "vse8.v	v10, (%[wp2])\n"
			      "vse8.v	v11, (%[wq2])\n"

			      "vle8.v	v14, (%[wp3])\n"
			      "vle8.v	v15, (%[wq3])\n"
			      "vxor.vv	v14, v14, v12\n"
			      "vxor.vv	v15, v15, v13\n"
			      "vse8.v	v14, (%[wp3])\n"
			      "vse8.v	v15, (%[wq3])\n"

			      "vle8.v	v18, (%[wp4])\n"
			      "vle8.v	v19, (%[wq4])\n"
			      "vxor.vv	v18, v18, v16\n"
			      "vxor.vv	v19, v19, v17\n"
			      "vse8.v	v18, (%[wp4])\n"
			      "vse8.v	v19, (%[wq4])\n"

			      "vle8.v	v22, (%[wp5])\n"
			      "vle8.v	v23, (%[wq5])\n"
			      "vxor.vv	v22, v22, v20\n"
			      "vxor.vv	v23, v23, v21\n"
			      "vse8.v	v22, (%[wp5])\n"
			      "vse8.v	v23, (%[wq5])\n"

			      "vle8.v	v26, (%[wp6])\n"
			      "vle8.v	v27, (%[wq6])\n"
			      "vxor.vv	v26, v26, v24\n"
			      "vxor.vv	v27, v27, v25\n"
			      "vse8.v	v26, (%[wp6])\n"
			      "vse8.v	v27, (%[wq6])\n"

			      "vle8.v	v30, (%[wp7])\n"
			      "vle8.v	v31, (%[wq7])\n"
			      "vxor.vv	v30, v30, v28\n"
			      "vxor.vv	v31, v31, v29\n"
			      "vse8.v	v30, (%[wp7])\n"
			      "vse8.v	v31, (%[wq7])\n"
			      ".option	pop\n"
			      : :
			      [wp0]"r"(&p[d + NSIZE * 0]),
			      [wq0]"r"(&q[d + NSIZE * 0]),
			      [wp1]"r"(&p[d + NSIZE * 1]),
			      [wq1]"r"(&q[d + NSIZE * 1]),
			      [wp2]"r"(&p[d + NSIZE * 2]),
			      [wq2]"r"(&q[d + NSIZE * 2]),
			      [wp3]"r"(&p[d + NSIZE * 3]),
			      [wq3]"r"(&q[d + NSIZE * 3]),
			      [wp4]"r"(&p[d + NSIZE * 4]),
			      [wq4]"r"(&q[d + NSIZE * 4]),
			      [wp5]"r"(&p[d + NSIZE * 5]),
			      [wq5]"r"(&q[d + NSIZE * 5]),
			      [wp6]"r"(&p[d + NSIZE * 6]),
			      [wq6]"r"(&q[d + NSIZE * 6]),
			      [wp7]"r"(&p[d + NSIZE * 7]),
			      [wq7]"r"(&q[d + NSIZE * 7])
		);
	}
}

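/*
 * Instantiate the exported benchmark/dispatch entries.  RAID6_RVV_WRAPPER()
 * (defined in rvv.h) is expected to wrap each *_real routine in
 * kernel_vector_begin()/kernel_vector_end() and to hook rvv_has_vector()
 * up as the validity check for the resulting raid6_calls descriptor.
 */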
RAID6_RVV_WRAPPER(1);
RAID6_RVV_WRAPPER(2);
RAID6_RVV_WRAPPER(4);
RAID6_RVV_WRAPPER(8);