// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RAID-6 syndrome calculation using RISC-V vector instructions
 *
 * Copyright 2024 Institute of Software, CAS.
 * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
 *
 * Based on neon.uc:
 *	Copyright 2002-2004 H. Peter Anvin
 */

#include <asm/simd.h>
#include <asm/vector.h>
#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
#include <linux/types.h>
#include "rvv.h"

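/*
 * Note: riscv_v_vsize is the size of the saved state of all 32 vector
 * registers, so dividing it by 32 yields VLENB, the number of bytes one
 * vector register holds at LMUL=1 and therefore the number of bytes each
 * register "lane" below processes per iteration.
 */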
#define NSIZE	(riscv_v_vsize / 32) /* NSIZE = vlenb */

static int rvv_has_vector(void)
{
	return has_vector();
}

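/*
 * P is the plain XOR of all data blocks.  Q is the Reed-Solomon syndrome,
 * accumulated with Horner's rule: before each data block is folded in, the
 * running Q is multiplied by {02} in GF(2^8).  That multiply is the usual
 * shift-and-reduce trick spelled out in the $$-template comments below:
 * vsra.vi by 7 turns each byte's top bit into an all-ones/all-zeroes mask,
 * vsll.vi by 1 doubles the byte, and vand.vx/vxor.vv conditionally XOR in
 * 0x1d, the low byte of the field polynomial 0x11d.
 */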
static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	unsigned long d;
	int z, z0;
	u8 *p, *q;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
	for (d = 0; d < bytes; d += NSIZE * 1) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
		);

		for (z = z0 - 1 ; z >= 0 ; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vse8.v v0, (%[wp0])\n"
			"vse8.v v1, (%[wq0])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0])
		);
	}
}

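/*
 * Partial-update variant: only the data disks in [start, stop] carry new
 * contents.  Those blocks are folded into both wp and wq ("right side"),
 * while for the untouched lower-numbered disks wq merely keeps being
 * multiplied by {02} once per remaining disk so the powers line up
 * ("left side").  The results are then XORed into the existing P and Q
 * pages instead of overwriting them.
 */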
static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
	for (d = 0 ; d < bytes ; d += NSIZE * 1) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v1, v3, v2\n"
				".option pop\n"
				: :
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v2, (%[wp0])\n"
			"vle8.v v3, (%[wq0])\n"
			"vxor.vv v2, v2, v0\n"
			"vxor.vv v3, v3, v1\n"
			"vse8.v v2, (%[wp0])\n"
			"vse8.v v3, (%[wq0])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0])
		);
	}
}

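/*
 * The rvv2/rvv4/rvv8 variants below are the same computation unrolled
 * across 2/4/8 groups of vector registers, processing 2/4/8 * NSIZE bytes
 * of each block per loop iteration.
 */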
static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	unsigned long d;
	int z, z0;
	u8 *p, *q;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 */
	for (d = 0; d < bytes; d += NSIZE * 2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			"vle8.v v4, (%[wp1])\n"
			"vle8.v v5, (%[wp1])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			[wp1]"r"(&dptr[z0][d + 1 * NSIZE])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v7, v7, v6\n"
				"vle8.v v6, (%[wd1])\n"
				"vxor.vv v5, v7, v6\n"
				"vxor.vv v4, v4, v6\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vse8.v v0, (%[wp0])\n"
			"vse8.v v1, (%[wq0])\n"
			"vse8.v v4, (%[wp1])\n"
			"vse8.v v5, (%[wq1])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0]),
			[wp1]"r"(&p[d + NSIZE * 1]),
			[wq1]"r"(&q[d + NSIZE * 1])
		);
	}
}

static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 */
	for (d = 0; d < bytes; d += NSIZE * 2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			"vle8.v v4, (%[wp1])\n"
			"vle8.v v5, (%[wp1])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			[wp1]"r"(&dptr[z0][d + 1 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v7, v7, v6\n"
				"vle8.v v6, (%[wd1])\n"
				"vxor.vv v5, v7, v6\n"
				"vxor.vv v4, v4, v6\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v1, v3, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v5, v7, v6\n"
				".option pop\n"
				: :
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v2, (%[wp0])\n"
			"vle8.v v3, (%[wq0])\n"
			"vxor.vv v2, v2, v0\n"
			"vxor.vv v3, v3, v1\n"
			"vse8.v v2, (%[wp0])\n"
			"vse8.v v3, (%[wq0])\n"

			"vle8.v v6, (%[wp1])\n"
			"vle8.v v7, (%[wq1])\n"
			"vxor.vv v6, v6, v4\n"
			"vxor.vv v7, v7, v5\n"
			"vse8.v v6, (%[wp1])\n"
			"vse8.v v7, (%[wq1])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0]),
			[wp1]"r"(&p[d + NSIZE * 1]),
			[wq1]"r"(&q[d + NSIZE * 1])
		);
	}
}

static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	unsigned long d;
	int z, z0;
	u8 *p, *q;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 */
	for (d = 0; d < bytes; d += NSIZE * 4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			"vle8.v v4, (%[wp1])\n"
			"vle8.v v5, (%[wp1])\n"
			"vle8.v v8, (%[wp2])\n"
			"vle8.v v9, (%[wp2])\n"
			"vle8.v v12, (%[wp3])\n"
			"vle8.v v13, (%[wp3])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			[wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			[wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			[wp3]"r"(&dptr[z0][d + 3 * NSIZE])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v7, v7, v6\n"
				"vle8.v v6, (%[wd1])\n"
				"vxor.vv v5, v7, v6\n"
				"vxor.vv v4, v4, v6\n"

				"vsra.vi v10, v9, 7\n"
				"vsll.vi v11, v9, 1\n"
				"vand.vx v10, v10, %[x1d]\n"
				"vxor.vv v11, v11, v10\n"
				"vle8.v v10, (%[wd2])\n"
				"vxor.vv v9, v11, v10\n"
				"vxor.vv v8, v8, v10\n"

				"vsra.vi v14, v13, 7\n"
				"vsll.vi v15, v13, 1\n"
				"vand.vx v14, v14, %[x1d]\n"
				"vxor.vv v15, v15, v14\n"
				"vle8.v v14, (%[wd3])\n"
				"vxor.vv v13, v15, v14\n"
				"vxor.vv v12, v12, v14\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				[wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				[wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vse8.v v0, (%[wp0])\n"
			"vse8.v v1, (%[wq0])\n"
			"vse8.v v4, (%[wp1])\n"
			"vse8.v v5, (%[wq1])\n"
			"vse8.v v8, (%[wp2])\n"
			"vse8.v v9, (%[wq2])\n"
			"vse8.v v12, (%[wp3])\n"
			"vse8.v v13, (%[wq3])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0]),
			[wp1]"r"(&p[d + NSIZE * 1]),
			[wq1]"r"(&q[d + NSIZE * 1]),
			[wp2]"r"(&p[d + NSIZE * 2]),
			[wq2]"r"(&q[d + NSIZE * 2]),
			[wp3]"r"(&p[d + NSIZE * 3]),
			[wq3]"r"(&q[d + NSIZE * 3])
		);
	}
}

static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 */
	for (d = 0; d < bytes; d += NSIZE * 4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			"vle8.v v4, (%[wp1])\n"
			"vle8.v v5, (%[wp1])\n"
			"vle8.v v8, (%[wp2])\n"
			"vle8.v v9, (%[wp2])\n"
			"vle8.v v12, (%[wp3])\n"
			"vle8.v v13, (%[wp3])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			[wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			[wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			[wp3]"r"(&dptr[z0][d + 3 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v7, v7, v6\n"
				"vle8.v v6, (%[wd1])\n"
				"vxor.vv v5, v7, v6\n"
				"vxor.vv v4, v4, v6\n"

				"vsra.vi v10, v9, 7\n"
				"vsll.vi v11, v9, 1\n"
				"vand.vx v10, v10, %[x1d]\n"
				"vxor.vv v11, v11, v10\n"
				"vle8.v v10, (%[wd2])\n"
				"vxor.vv v9, v11, v10\n"
				"vxor.vv v8, v8, v10\n"

				"vsra.vi v14, v13, 7\n"
				"vsll.vi v15, v13, 1\n"
				"vand.vx v14, v14, %[x1d]\n"
				"vxor.vv v15, v15, v14\n"
				"vle8.v v14, (%[wd3])\n"
				"vxor.vv v13, v15, v14\n"
				"vxor.vv v12, v12, v14\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				[wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				[wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v1, v3, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v5, v7, v6\n"

				"vsra.vi v10, v9, 7\n"
				"vsll.vi v11, v9, 1\n"
				"vand.vx v10, v10, %[x1d]\n"
				"vxor.vv v9, v11, v10\n"

				"vsra.vi v14, v13, 7\n"
				"vsll.vi v15, v13, 1\n"
				"vand.vx v14, v14, %[x1d]\n"
				"vxor.vv v13, v15, v14\n"
				".option pop\n"
				: :
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 * v8:wp2, v9:wq2, v10:p2, v11:q2
		 * v12:wp3, v13:wq3, v14:p3, v15:q3
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v2, (%[wp0])\n"
			"vle8.v v3, (%[wq0])\n"
			"vxor.vv v2, v2, v0\n"
			"vxor.vv v3, v3, v1\n"
			"vse8.v v2, (%[wp0])\n"
			"vse8.v v3, (%[wq0])\n"

			"vle8.v v6, (%[wp1])\n"
			"vle8.v v7, (%[wq1])\n"
			"vxor.vv v6, v6, v4\n"
			"vxor.vv v7, v7, v5\n"
			"vse8.v v6, (%[wp1])\n"
			"vse8.v v7, (%[wq1])\n"

			"vle8.v v10, (%[wp2])\n"
			"vle8.v v11, (%[wq2])\n"
			"vxor.vv v10, v10, v8\n"
			"vxor.vv v11, v11, v9\n"
			"vse8.v v10, (%[wp2])\n"
			"vse8.v v11, (%[wq2])\n"

			"vle8.v v14, (%[wp3])\n"
			"vle8.v v15, (%[wq3])\n"
			"vxor.vv v14, v14, v12\n"
			"vxor.vv v15, v15, v13\n"
			"vse8.v v14, (%[wp3])\n"
			"vse8.v v15, (%[wq3])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0]),
			[wp1]"r"(&p[d + NSIZE * 1]),
			[wq1]"r"(&q[d + NSIZE * 1]),
			[wp2]"r"(&p[d + NSIZE * 2]),
			[wq2]"r"(&q[d + NSIZE * 2]),
			[wp3]"r"(&p[d + NSIZE * 3]),
			[wq3]"r"(&q[d + NSIZE * 3])
		);
	}
}

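/* The 8-way unroll uses all 32 architectural vector registers (v0-v31). */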
static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	unsigned long d;
	int z, z0;
	u8 *p, *q;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0 + 1];	/* XOR parity */
	q = dptr[z0 + 2];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
	 */
	for (d = 0; d < bytes; d += NSIZE * 8) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			"vle8.v v4, (%[wp1])\n"
			"vle8.v v5, (%[wp1])\n"
			"vle8.v v8, (%[wp2])\n"
			"vle8.v v9, (%[wp2])\n"
			"vle8.v v12, (%[wp3])\n"
			"vle8.v v13, (%[wp3])\n"
			"vle8.v v16, (%[wp4])\n"
			"vle8.v v17, (%[wp4])\n"
			"vle8.v v20, (%[wp5])\n"
			"vle8.v v21, (%[wp5])\n"
			"vle8.v v24, (%[wp6])\n"
			"vle8.v v25, (%[wp6])\n"
			"vle8.v v28, (%[wp7])\n"
			"vle8.v v29, (%[wp7])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			[wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			[wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			[wp3]"r"(&dptr[z0][d + 3 * NSIZE]),
			[wp4]"r"(&dptr[z0][d + 4 * NSIZE]),
			[wp5]"r"(&dptr[z0][d + 5 * NSIZE]),
			[wp6]"r"(&dptr[z0][d + 6 * NSIZE]),
			[wp7]"r"(&dptr[z0][d + 7 * NSIZE])
		);

		for (z = z0 - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v7, v7, v6\n"
				"vle8.v v6, (%[wd1])\n"
				"vxor.vv v5, v7, v6\n"
				"vxor.vv v4, v4, v6\n"

				"vsra.vi v10, v9, 7\n"
				"vsll.vi v11, v9, 1\n"
				"vand.vx v10, v10, %[x1d]\n"
				"vxor.vv v11, v11, v10\n"
				"vle8.v v10, (%[wd2])\n"
				"vxor.vv v9, v11, v10\n"
				"vxor.vv v8, v8, v10\n"

				"vsra.vi v14, v13, 7\n"
				"vsll.vi v15, v13, 1\n"
				"vand.vx v14, v14, %[x1d]\n"
				"vxor.vv v15, v15, v14\n"
				"vle8.v v14, (%[wd3])\n"
				"vxor.vv v13, v15, v14\n"
				"vxor.vv v12, v12, v14\n"

				"vsra.vi v18, v17, 7\n"
				"vsll.vi v19, v17, 1\n"
				"vand.vx v18, v18, %[x1d]\n"
				"vxor.vv v19, v19, v18\n"
				"vle8.v v18, (%[wd4])\n"
				"vxor.vv v17, v19, v18\n"
				"vxor.vv v16, v16, v18\n"

				"vsra.vi v22, v21, 7\n"
				"vsll.vi v23, v21, 1\n"
				"vand.vx v22, v22, %[x1d]\n"
				"vxor.vv v23, v23, v22\n"
				"vle8.v v22, (%[wd5])\n"
				"vxor.vv v21, v23, v22\n"
				"vxor.vv v20, v20, v22\n"

				"vsra.vi v26, v25, 7\n"
				"vsll.vi v27, v25, 1\n"
				"vand.vx v26, v26, %[x1d]\n"
				"vxor.vv v27, v27, v26\n"
				"vle8.v v26, (%[wd6])\n"
				"vxor.vv v25, v27, v26\n"
				"vxor.vv v24, v24, v26\n"

				"vsra.vi v30, v29, 7\n"
				"vsll.vi v31, v29, 1\n"
				"vand.vx v30, v30, %[x1d]\n"
				"vxor.vv v31, v31, v30\n"
				"vle8.v v30, (%[wd7])\n"
				"vxor.vv v29, v31, v30\n"
				"vxor.vv v28, v28, v30\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				[wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				[wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				[wd4]"r"(&dptr[z][d + 4 * NSIZE]),
				[wd5]"r"(&dptr[z][d + 5 * NSIZE]),
				[wd6]"r"(&dptr[z][d + 6 * NSIZE]),
				[wd7]"r"(&dptr[z][d + 7 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vse8.v v0, (%[wp0])\n"
			"vse8.v v1, (%[wq0])\n"
			"vse8.v v4, (%[wp1])\n"
			"vse8.v v5, (%[wq1])\n"
			"vse8.v v8, (%[wp2])\n"
			"vse8.v v9, (%[wq2])\n"
			"vse8.v v12, (%[wp3])\n"
			"vse8.v v13, (%[wq3])\n"
			"vse8.v v16, (%[wp4])\n"
			"vse8.v v17, (%[wq4])\n"
			"vse8.v v20, (%[wp5])\n"
			"vse8.v v21, (%[wq5])\n"
			"vse8.v v24, (%[wp6])\n"
			"vse8.v v25, (%[wq6])\n"
			"vse8.v v28, (%[wp7])\n"
			"vse8.v v29, (%[wq7])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0]),
			[wp1]"r"(&p[d + NSIZE * 1]),
			[wq1]"r"(&q[d + NSIZE * 1]),
			[wp2]"r"(&p[d + NSIZE * 2]),
			[wq2]"r"(&q[d + NSIZE * 2]),
			[wp3]"r"(&p[d + NSIZE * 3]),
			[wq3]"r"(&q[d + NSIZE * 3]),
			[wp4]"r"(&p[d + NSIZE * 4]),
			[wq4]"r"(&q[d + NSIZE * 4]),
			[wp5]"r"(&p[d + NSIZE * 5]),
			[wq5]"r"(&q[d + NSIZE * 5]),
			[wp6]"r"(&p[d + NSIZE * 6]),
			[wq6]"r"(&q[d + NSIZE * 6]),
			[wp7]"r"(&p[d + NSIZE * 7]),
			[wq7]"r"(&q[d + NSIZE * 7])
		);
	}
}

static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
					 unsigned long bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	unsigned long d;
	int z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks - 2];	/* XOR parity */
	q = dptr[disks - 1];	/* RS syndrome */

	asm volatile (".option push\n"
		".option arch,+v\n"
		"vsetvli t0, x0, e8, m1, ta, ma\n"
		".option pop\n"
	);

	/*
	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
	 */
	for (d = 0; d < bytes; d += NSIZE * 8) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v0, (%[wp0])\n"
			"vle8.v v1, (%[wp0])\n"
			"vle8.v v4, (%[wp1])\n"
			"vle8.v v5, (%[wp1])\n"
			"vle8.v v8, (%[wp2])\n"
			"vle8.v v9, (%[wp2])\n"
			"vle8.v v12, (%[wp3])\n"
			"vle8.v v13, (%[wp3])\n"
			"vle8.v v16, (%[wp4])\n"
			"vle8.v v17, (%[wp4])\n"
			"vle8.v v20, (%[wp5])\n"
			"vle8.v v21, (%[wp5])\n"
			"vle8.v v24, (%[wp6])\n"
			"vle8.v v25, (%[wp6])\n"
			"vle8.v v28, (%[wp7])\n"
			"vle8.v v29, (%[wp7])\n"
			".option pop\n"
			: :
			[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
			[wp1]"r"(&dptr[z0][d + 1 * NSIZE]),
			[wp2]"r"(&dptr[z0][d + 2 * NSIZE]),
			[wp3]"r"(&dptr[z0][d + 3 * NSIZE]),
			[wp4]"r"(&dptr[z0][d + 4 * NSIZE]),
			[wp5]"r"(&dptr[z0][d + 5 * NSIZE]),
			[wp6]"r"(&dptr[z0][d + 6 * NSIZE]),
			[wp7]"r"(&dptr[z0][d + 7 * NSIZE])
		);

		/* P/Q data pages */
		for (z = z0 - 1; z >= start; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * w1$$ ^= w2$$;
			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
			 * wq$$ = w1$$ ^ wd$$;
			 * wp$$ ^= wd$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v3, v3, v2\n"
				"vle8.v v2, (%[wd0])\n"
				"vxor.vv v1, v3, v2\n"
				"vxor.vv v0, v0, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v7, v7, v6\n"
				"vle8.v v6, (%[wd1])\n"
				"vxor.vv v5, v7, v6\n"
				"vxor.vv v4, v4, v6\n"

				"vsra.vi v10, v9, 7\n"
				"vsll.vi v11, v9, 1\n"
				"vand.vx v10, v10, %[x1d]\n"
				"vxor.vv v11, v11, v10\n"
				"vle8.v v10, (%[wd2])\n"
				"vxor.vv v9, v11, v10\n"
				"vxor.vv v8, v8, v10\n"

				"vsra.vi v14, v13, 7\n"
				"vsll.vi v15, v13, 1\n"
				"vand.vx v14, v14, %[x1d]\n"
				"vxor.vv v15, v15, v14\n"
				"vle8.v v14, (%[wd3])\n"
				"vxor.vv v13, v15, v14\n"
				"vxor.vv v12, v12, v14\n"

				"vsra.vi v18, v17, 7\n"
				"vsll.vi v19, v17, 1\n"
				"vand.vx v18, v18, %[x1d]\n"
				"vxor.vv v19, v19, v18\n"
				"vle8.v v18, (%[wd4])\n"
				"vxor.vv v17, v19, v18\n"
				"vxor.vv v16, v16, v18\n"

				"vsra.vi v22, v21, 7\n"
				"vsll.vi v23, v21, 1\n"
				"vand.vx v22, v22, %[x1d]\n"
				"vxor.vv v23, v23, v22\n"
				"vle8.v v22, (%[wd5])\n"
				"vxor.vv v21, v23, v22\n"
				"vxor.vv v20, v20, v22\n"

				"vsra.vi v26, v25, 7\n"
				"vsll.vi v27, v25, 1\n"
				"vand.vx v26, v26, %[x1d]\n"
				"vxor.vv v27, v27, v26\n"
				"vle8.v v26, (%[wd6])\n"
				"vxor.vv v25, v27, v26\n"
				"vxor.vv v24, v24, v26\n"

				"vsra.vi v30, v29, 7\n"
				"vsll.vi v31, v29, 1\n"
				"vand.vx v30, v30, %[x1d]\n"
				"vxor.vv v31, v31, v30\n"
				"vle8.v v30, (%[wd7])\n"
				"vxor.vv v29, v31, v30\n"
				"vxor.vv v28, v28, v30\n"
				".option pop\n"
				: :
				[wd0]"r"(&dptr[z][d + 0 * NSIZE]),
				[wd1]"r"(&dptr[z][d + 1 * NSIZE]),
				[wd2]"r"(&dptr[z][d + 2 * NSIZE]),
				[wd3]"r"(&dptr[z][d + 3 * NSIZE]),
				[wd4]"r"(&dptr[z][d + 4 * NSIZE]),
				[wd5]"r"(&dptr[z][d + 5 * NSIZE]),
				[wd6]"r"(&dptr[z][d + 6 * NSIZE]),
				[wd7]"r"(&dptr[z][d + 7 * NSIZE]),
				[x1d]"r"(0x1d)
			);
		}

		/* P/Q left side optimization */
		for (z = start - 1; z >= 0; z--) {
			/*
			 * w2$$ = MASK(wq$$);
			 * w1$$ = SHLBYTE(wq$$);
			 * w2$$ &= NBYTES(0x1d);
			 * wq$$ = w1$$ ^ w2$$;
			 */
			asm volatile (".option push\n"
				".option arch,+v\n"
				"vsra.vi v2, v1, 7\n"
				"vsll.vi v3, v1, 1\n"
				"vand.vx v2, v2, %[x1d]\n"
				"vxor.vv v1, v3, v2\n"

				"vsra.vi v6, v5, 7\n"
				"vsll.vi v7, v5, 1\n"
				"vand.vx v6, v6, %[x1d]\n"
				"vxor.vv v5, v7, v6\n"

				"vsra.vi v10, v9, 7\n"
				"vsll.vi v11, v9, 1\n"
				"vand.vx v10, v10, %[x1d]\n"
				"vxor.vv v9, v11, v10\n"

				"vsra.vi v14, v13, 7\n"
				"vsll.vi v15, v13, 1\n"
				"vand.vx v14, v14, %[x1d]\n"
				"vxor.vv v13, v15, v14\n"

				"vsra.vi v18, v17, 7\n"
				"vsll.vi v19, v17, 1\n"
				"vand.vx v18, v18, %[x1d]\n"
				"vxor.vv v17, v19, v18\n"

				"vsra.vi v22, v21, 7\n"
				"vsll.vi v23, v21, 1\n"
				"vand.vx v22, v22, %[x1d]\n"
				"vxor.vv v21, v23, v22\n"

				"vsra.vi v26, v25, 7\n"
				"vsll.vi v27, v25, 1\n"
				"vand.vx v26, v26, %[x1d]\n"
				"vxor.vv v25, v27, v26\n"

				"vsra.vi v30, v29, 7\n"
				"vsll.vi v31, v29, 1\n"
				"vand.vx v30, v30, %[x1d]\n"
				"vxor.vv v29, v31, v30\n"
				".option pop\n"
				: :
				[x1d]"r"(0x1d)
			);
		}

		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 * v0:wp0, v1:wq0, v2:p0, v3:q0
		 * v4:wp1, v5:wq1, v6:p1, v7:q1
		 * v8:wp2, v9:wq2, v10:p2, v11:q2
		 * v12:wp3, v13:wq3, v14:p3, v15:q3
		 * v16:wp4, v17:wq4, v18:p4, v19:q4
		 * v20:wp5, v21:wq5, v22:p5, v23:q5
		 * v24:wp6, v25:wq6, v26:p6, v27:q6
		 * v28:wp7, v29:wq7, v30:p7, v31:q7
		 */
		asm volatile (".option push\n"
			".option arch,+v\n"
			"vle8.v v2, (%[wp0])\n"
			"vle8.v v3, (%[wq0])\n"
			"vxor.vv v2, v2, v0\n"
			"vxor.vv v3, v3, v1\n"
			"vse8.v v2, (%[wp0])\n"
			"vse8.v v3, (%[wq0])\n"

			"vle8.v v6, (%[wp1])\n"
			"vle8.v v7, (%[wq1])\n"
			"vxor.vv v6, v6, v4\n"
			"vxor.vv v7, v7, v5\n"
			"vse8.v v6, (%[wp1])\n"
			"vse8.v v7, (%[wq1])\n"

			"vle8.v v10, (%[wp2])\n"
			"vle8.v v11, (%[wq2])\n"
			"vxor.vv v10, v10, v8\n"
			"vxor.vv v11, v11, v9\n"
			"vse8.v v10, (%[wp2])\n"
			"vse8.v v11, (%[wq2])\n"

			"vle8.v v14, (%[wp3])\n"
			"vle8.v v15, (%[wq3])\n"
			"vxor.vv v14, v14, v12\n"
			"vxor.vv v15, v15, v13\n"
			"vse8.v v14, (%[wp3])\n"
			"vse8.v v15, (%[wq3])\n"

			"vle8.v v18, (%[wp4])\n"
			"vle8.v v19, (%[wq4])\n"
			"vxor.vv v18, v18, v16\n"
			"vxor.vv v19, v19, v17\n"
			"vse8.v v18, (%[wp4])\n"
			"vse8.v v19, (%[wq4])\n"

			"vle8.v v22, (%[wp5])\n"
			"vle8.v v23, (%[wq5])\n"
			"vxor.vv v22, v22, v20\n"
			"vxor.vv v23, v23, v21\n"
			"vse8.v v22, (%[wp5])\n"
			"vse8.v v23, (%[wq5])\n"

			"vle8.v v26, (%[wp6])\n"
			"vle8.v v27, (%[wq6])\n"
			"vxor.vv v26, v26, v24\n"
			"vxor.vv v27, v27, v25\n"
			"vse8.v v26, (%[wp6])\n"
			"vse8.v v27, (%[wq6])\n"

			"vle8.v v30, (%[wp7])\n"
			"vle8.v v31, (%[wq7])\n"
			"vxor.vv v30, v30, v28\n"
			"vxor.vv v31, v31, v29\n"
			"vse8.v v30, (%[wp7])\n"
			"vse8.v v31, (%[wq7])\n"
			".option pop\n"
			: :
			[wp0]"r"(&p[d + NSIZE * 0]),
			[wq0]"r"(&q[d + NSIZE * 0]),
			[wp1]"r"(&p[d + NSIZE * 1]),
			[wq1]"r"(&q[d + NSIZE * 1]),
			[wp2]"r"(&p[d + NSIZE * 2]),
			[wq2]"r"(&q[d + NSIZE * 2]),
			[wp3]"r"(&p[d + NSIZE * 3]),
			[wq3]"r"(&q[d + NSIZE * 3]),
			[wp4]"r"(&p[d + NSIZE * 4]),
			[wq4]"r"(&q[d + NSIZE * 4]),
			[wp5]"r"(&p[d + NSIZE * 5]),
			[wq5]"r"(&q[d + NSIZE * 5]),
			[wp6]"r"(&p[d + NSIZE * 6]),
			[wq6]"r"(&q[d + NSIZE * 6]),
			[wp7]"r"(&p[d + NSIZE * 7]),
			[wq7]"r"(&q[d + NSIZE * 7])
		);
	}
}

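/*
 * RAID6_RVV_WRAPPER() (see rvv.h) is expected to wrap each *_real routine
 * above with kernel-mode vector enable/disable and to emit the raid6_calls
 * descriptor that the RAID-6 core benchmarks and selects at boot; the
 * rvv_has_vector() helper above feeds its .valid check.
 */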
RAID6_RVV_WRAPPER(1);
RAID6_RVV_WRAPPER(2);
RAID6_RVV_WRAPPER(4);
RAID6_RVV_WRAPPER(8);