xref: /linux/lib/raid/raid6/loongarch/recov_loongarch_simd.c (revision 2e05544060b9fef5d4d0e0172944e6956c55080f)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
4  *
5  * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
6  *
7  * Originally based on recov_avx2.c and recov_ssse3.c:
8  *
9  * Copyright (C) 2012 Intel Corporation
10  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
11  */
12 
13 #include <linux/mm.h>
14 #include <linux/raid/pq.h>
15 #include <asm/cpu-features.h>
16 #include <asm/fpu.h>
17 #include "algos.h"
18 
19 /*
20  * Unlike with the syndrome calculation algorithms, there's no boot-time
21  * selection of recovery algorithms by benchmarking, so we have to specify
22  * the priorities and hope the future cores will all have decent vector
23  * support (i.e. no LASX slower than LSX, or even scalar code).
24  */
25 
26 #ifdef CONFIG_CPU_HAS_LSX
/*
 * raid6_2data_recov_lsx - recover two failed data blocks using LSX
 * (16-byte vector loads/stores, four vectors per loop pass).
 *
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is treated as P and
 *         ptrs[disks - 1] as Q.
 * @bytes: number of bytes to process in every buffer.  The loop consumes
 *         64 bytes per pass and only stops when the unsigned count reaches
 *         exactly 0, so this has to be a multiple of 64.
 * @faila: index in @ptrs of the first failed data block.
 * @failb: index in @ptrs of the second failed data block.
 * @ptrs:  block pointer table.  Four entries are swapped around the
 *         raid6_gen_syndrome() call and put back before the SIMD loop.
 *
 * The loop stores what its comments call Dx into the ptrs[failb] buffer
 * and Dy into the ptrs[faila] buffer.
 *
 * The asm statements hand values to each other through hard-coded vector
 * registers ($vr0 - $vr23); no operand or clobber list describes those
 * registers, so the relative order of the statements must not change.
 */
27 static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
28 				  int failb, void **ptrs)
29 {
30 	u8 *p, *q, *dp, *dq;
31 	const u8 *pbmul;	/* P multiplier table for B data */
32 	const u8 *qmul;		/* Q multiplier table (for both) */
33 
34 	p = (u8 *)ptrs[disks - 2];
35 	q = (u8 *)ptrs[disks - 1];
36 
37 	/*
38 	 * Compute syndrome with zero for the missing data pages
39 	 * Use the dead data pages as temporary storage for
40 	 * delta p and delta q
41 	 */
42 	dp = (u8 *)ptrs[faila];
43 	ptrs[faila] = page_address(ZERO_PAGE(0));
44 	ptrs[disks - 2] = dp;
45 	dq = (u8 *)ptrs[failb];
46 	ptrs[failb] = page_address(ZERO_PAGE(0));
47 	ptrs[disks - 1] = dq;
48 
	/*
	 * At this point the P and Q slots point at dp and dq, and the real
	 * P and Q buffers (p, q) are not in the table at all; the call
	 * below presumably writes its P/Q output into dp and dq.
	 */
49 	raid6_gen_syndrome(disks, bytes, ptrs);
50 
51 	/* Restore pointer table */
52 	ptrs[faila] = dp;
53 	ptrs[failb] = dq;
54 	ptrs[disks - 2] = p;
55 	ptrs[disks - 1] = q;
56 
57 	/* Now, pick the proper data tables */
	/*
	 * NOTE(review): raid6_gfexi[] is indexed with failb - faila, which
	 * presumably requires faila < failb - confirm against the callers.
	 */
58 	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
59 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
60 
	/* All vector instructions sit between kernel_fpu_begin()/_end(). */
61 	kernel_fpu_begin();
62 
63 	/*
64 	 * vr20, vr21: qmul
65 	 * vr22, vr23: pbmul
	 *
	 * Each table is read as two 16-byte halves (offsets 0 and 16); the
	 * first half is later indexed by low nibbles, the second by high
	 * nibbles.  The four registers stay loaded for the whole loop.
66 	 */
67 	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
68 	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
69 	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
70 	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
71 
	/* One pass handles 64 bytes (offsets 0, 16, 32, 48) of each buffer. */
72 	while (bytes) {
73 		/* vr4 - vr7: Q */
74 		asm volatile("vld $vr4, %0" : : "m" (q[0]));
75 		asm volatile("vld $vr5, %0" : : "m" (q[16]));
76 		asm volatile("vld $vr6, %0" : : "m" (q[32]));
77 		asm volatile("vld $vr7, %0" : : "m" (q[48]));
78 		/*  vr4 - vr7: Q + Qxy */
79 		asm volatile("vld $vr8, %0" : : "m" (dq[0]));
80 		asm volatile("vld $vr9, %0" : : "m" (dq[16]));
81 		asm volatile("vld $vr10, %0" : : "m" (dq[32]));
82 		asm volatile("vld $vr11, %0" : : "m" (dq[48]));
83 		asm volatile("vxor.v $vr4, $vr4, $vr8");
84 		asm volatile("vxor.v $vr5, $vr5, $vr9");
85 		asm volatile("vxor.v $vr6, $vr6, $vr10");
86 		asm volatile("vxor.v $vr7, $vr7, $vr11");
87 		/* vr0 - vr3: P */
88 		asm volatile("vld $vr0, %0" : : "m" (p[0]));
89 		asm volatile("vld $vr1, %0" : : "m" (p[16]));
90 		asm volatile("vld $vr2, %0" : : "m" (p[32]));
91 		asm volatile("vld $vr3, %0" : : "m" (p[48]));
92 		/* vr0 - vr3: P + Pxy */
93 		asm volatile("vld $vr8, %0" : : "m" (dp[0]));
94 		asm volatile("vld $vr9, %0" : : "m" (dp[16]));
95 		asm volatile("vld $vr10, %0" : : "m" (dp[32]));
96 		asm volatile("vld $vr11, %0" : : "m" (dp[48]));
97 		asm volatile("vxor.v $vr0, $vr0, $vr8");
98 		asm volatile("vxor.v $vr1, $vr1, $vr9");
99 		asm volatile("vxor.v $vr2, $vr2, $vr10");
100 		asm volatile("vxor.v $vr3, $vr3, $vr11");
101 
		/*
		 * Table multiply of (Q + Qxy) by qmul: split every byte into
		 * its two nibbles, use each nibble to select a byte from the
		 * matching table half, then XOR the two selections.
		 */
102 		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
103 		asm volatile("vsrli.b $vr8, $vr4, 4");
104 		asm volatile("vsrli.b $vr9, $vr5, 4");
105 		asm volatile("vsrli.b $vr10, $vr6, 4");
106 		asm volatile("vsrli.b $vr11, $vr7, 4");
107 		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
108 		asm volatile("vandi.b $vr4, $vr4, 0x0f");
109 		asm volatile("vandi.b $vr5, $vr5, 0x0f");
110 		asm volatile("vandi.b $vr6, $vr6, 0x0f");
111 		asm volatile("vandi.b $vr7, $vr7, 0x0f");
112 		/* lookup from qmul[0] */
		/* (both table operands of vshuf.b are the same register) */
113 		asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
114 		asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
115 		asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
116 		asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
117 		/* lookup from qmul[16] */
118 		asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
119 		asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
120 		asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
121 		asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
122 		/* vr16 - vr19: B(Q + Qxy) */
123 		asm volatile("vxor.v $vr16, $vr8, $vr4");
124 		asm volatile("vxor.v $vr17, $vr9, $vr5");
125 		asm volatile("vxor.v $vr18, $vr10, $vr6");
126 		asm volatile("vxor.v $vr19, $vr11, $vr7");
127 
		/*
		 * Same nibble-split table multiply, now of (P + Pxy) by
		 * pbmul.  vr0 - vr3 are left untouched because they are
		 * needed again for the final XOR below.
		 */
128 		/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
129 		asm volatile("vsrli.b $vr4, $vr0, 4");
130 		asm volatile("vsrli.b $vr5, $vr1, 4");
131 		asm volatile("vsrli.b $vr6, $vr2, 4");
132 		asm volatile("vsrli.b $vr7, $vr3, 4");
133 		/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
134 		asm volatile("vandi.b $vr12, $vr0, 0x0f");
135 		asm volatile("vandi.b $vr13, $vr1, 0x0f");
136 		asm volatile("vandi.b $vr14, $vr2, 0x0f");
137 		asm volatile("vandi.b $vr15, $vr3, 0x0f");
138 		/* lookup from pbmul[0] */
139 		asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
140 		asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
141 		asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
142 		asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
143 		/* lookup from pbmul[16] */
144 		asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
145 		asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
146 		asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
147 		asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
148 		/* vr4 - vr7: A(P + Pxy) */
149 		asm volatile("vxor.v $vr4, $vr4, $vr12");
150 		asm volatile("vxor.v $vr5, $vr5, $vr13");
151 		asm volatile("vxor.v $vr6, $vr6, $vr14");
152 		asm volatile("vxor.v $vr7, $vr7, $vr15");
153 
154 		/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
155 		asm volatile("vxor.v $vr4, $vr4, $vr16");
156 		asm volatile("vxor.v $vr5, $vr5, $vr17");
157 		asm volatile("vxor.v $vr6, $vr6, $vr18");
158 		asm volatile("vxor.v $vr7, $vr7, $vr19");
		/* Dx overwrites the scratch data in the ptrs[failb] buffer. */
159 		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
160 		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
161 		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
162 		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
163 
164 		/* vr0 - vr3: P + Pxy + Dx = Dy */
165 		asm volatile("vxor.v $vr0, $vr0, $vr4");
166 		asm volatile("vxor.v $vr1, $vr1, $vr5");
167 		asm volatile("vxor.v $vr2, $vr2, $vr6");
168 		asm volatile("vxor.v $vr3, $vr3, $vr7");
		/* Dy overwrites the scratch data in the ptrs[faila] buffer. */
169 		asm volatile("vst $vr0, %0" : "=m" (dp[0]));
170 		asm volatile("vst $vr1, %0" : "=m" (dp[16]));
171 		asm volatile("vst $vr2, %0" : "=m" (dp[32]));
172 		asm volatile("vst $vr3, %0" : "=m" (dp[48]));
173 
174 		bytes -= 64;
175 		p += 64;
176 		q += 64;
177 		dp += 64;
178 		dq += 64;
179 	}
180 
181 	kernel_fpu_end();
182 }
183 
/*
 * raid6_datap_recov_lsx - recover one failed data block and fix up the P
 * buffer, using LSX (16-byte vector loads/stores, four vectors per pass).
 *
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is treated as P and
 *         ptrs[disks - 1] as Q.
 * @bytes: number of bytes to process in every buffer.  The loop consumes
 *         64 bytes per pass and only stops when the unsigned count reaches
 *         exactly 0, so this has to be a multiple of 64.
 * @faila: index in @ptrs of the failed data block.
 * @ptrs:  block pointer table.  Two entries are swapped around the
 *         raid6_gen_syndrome() call and put back before the SIMD loop.
 *
 * The loop stores what its comments call Dx into the ptrs[faila] buffer
 * and XORs the same value into the P buffer in place.  The Q buffer is
 * only read.
 *
 * The asm statements hand values to each other through hard-coded vector
 * registers; no operand or clobber list describes those registers, so the
 * relative order of the statements must not change.
 */
184 static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
185 				  void **ptrs)
186 {
187 	u8 *p, *q, *dq;
188 	const u8 *qmul;		/* Q multiplier table */
189 
190 	p = (u8 *)ptrs[disks - 2];
191 	q = (u8 *)ptrs[disks - 1];
192 
193 	/*
194 	 * Compute syndrome with zero for the missing data page
195 	 * Use the dead data page as temporary storage for delta q
196 	 */
197 	dq = (u8 *)ptrs[faila];
198 	ptrs[faila] = page_address(ZERO_PAGE(0));
199 	ptrs[disks - 1] = dq;
200 
	/*
	 * Only the Q slot is redirected (to dq).  The P slot still points
	 * at the real P buffer, so the call below presumably rewrites P as
	 * well; the loop comments treat its content as "P + Dx" afterwards.
	 */
201 	raid6_gen_syndrome(disks, bytes, ptrs);
202 
203 	/* Restore pointer table */
204 	ptrs[faila] = dq;
205 	ptrs[disks - 1] = q;
206 
207 	/* Now, pick the proper data tables */
208 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
209 
	/* All vector instructions sit between kernel_fpu_begin()/_end(). */
210 	kernel_fpu_begin();
211 
212 	/* vr22, vr23: qmul */
	/* (two 16-byte table halves, kept loaded for the whole loop) */
213 	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
214 	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
215 
	/* One pass handles 64 bytes (offsets 0, 16, 32, 48) of each buffer. */
216 	while (bytes) {
217 		/* vr0 - vr3: P + Dx */
218 		asm volatile("vld $vr0, %0" : : "m" (p[0]));
219 		asm volatile("vld $vr1, %0" : : "m" (p[16]));
220 		asm volatile("vld $vr2, %0" : : "m" (p[32]));
221 		asm volatile("vld $vr3, %0" : : "m" (p[48]));
222 		/* vr4 - vr7: Qx */
223 		asm volatile("vld $vr4, %0" : : "m" (dq[0]));
224 		asm volatile("vld $vr5, %0" : : "m" (dq[16]));
225 		asm volatile("vld $vr6, %0" : : "m" (dq[32]));
226 		asm volatile("vld $vr7, %0" : : "m" (dq[48]));
227 		/* vr4 - vr7: Q + Qx */
228 		asm volatile("vld $vr8, %0" : : "m" (q[0]));
229 		asm volatile("vld $vr9, %0" : : "m" (q[16]));
230 		asm volatile("vld $vr10, %0" : : "m" (q[32]));
231 		asm volatile("vld $vr11, %0" : : "m" (q[48]));
232 		asm volatile("vxor.v $vr4, $vr4, $vr8");
233 		asm volatile("vxor.v $vr5, $vr5, $vr9");
234 		asm volatile("vxor.v $vr6, $vr6, $vr10");
235 		asm volatile("vxor.v $vr7, $vr7, $vr11");
236 
		/*
		 * Table multiply of (Q + Qx) by qmul: split every byte into
		 * its two nibbles, use each nibble to select a byte from the
		 * matching table half, then XOR the two selections.
		 */
237 		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
238 		asm volatile("vsrli.b $vr8, $vr4, 4");
239 		asm volatile("vsrli.b $vr9, $vr5, 4");
240 		asm volatile("vsrli.b $vr10, $vr6, 4");
241 		asm volatile("vsrli.b $vr11, $vr7, 4");
242 		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
243 		asm volatile("vandi.b $vr4, $vr4, 0x0f");
244 		asm volatile("vandi.b $vr5, $vr5, 0x0f");
245 		asm volatile("vandi.b $vr6, $vr6, 0x0f");
246 		asm volatile("vandi.b $vr7, $vr7, 0x0f");
247 		/* lookup from qmul[0] */
		/* (both table operands of vshuf.b are the same register) */
248 		asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
249 		asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
250 		asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
251 		asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
252 		/* lookup from qmul[16] */
253 		asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
254 		asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
255 		asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
256 		asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
257 		/* vr4 - vr7: qmul(Q + Qx) = Dx */
258 		asm volatile("vxor.v $vr4, $vr4, $vr8");
259 		asm volatile("vxor.v $vr5, $vr5, $vr9");
260 		asm volatile("vxor.v $vr6, $vr6, $vr10");
261 		asm volatile("vxor.v $vr7, $vr7, $vr11");
		/* Dx overwrites the scratch data in the ptrs[faila] buffer. */
262 		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
263 		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
264 		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
265 		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
266 
267 		/* vr0 - vr3: P + Dx + Dx = P */
268 		asm volatile("vxor.v $vr0, $vr0, $vr4");
269 		asm volatile("vxor.v $vr1, $vr1, $vr5");
270 		asm volatile("vxor.v $vr2, $vr2, $vr6");
271 		asm volatile("vxor.v $vr3, $vr3, $vr7");
		/* The P buffer is updated in place. */
272 		asm volatile("vst $vr0, %0" : "=m" (p[0]));
273 		asm volatile("vst $vr1, %0" : "=m" (p[16]));
274 		asm volatile("vst $vr2, %0" : "=m" (p[32]));
275 		asm volatile("vst $vr3, %0" : "=m" (p[48]));
276 
277 		bytes -= 64;
278 		p += 64;
279 		q += 64;
280 		dq += 64;
281 	}
282 
283 	kernel_fpu_end();
284 }
285 
286 const struct raid6_recov_calls raid6_recov_lsx = {
287 	.data2 = raid6_2data_recov_lsx,
288 	.datap = raid6_datap_recov_lsx,
289 	.name = "lsx",
290 };
291 #endif /* CONFIG_CPU_HAS_LSX */
292 
293 #ifdef CONFIG_CPU_HAS_LASX
/*
 * raid6_2data_recov_lasx - recover two failed data blocks using LASX
 * (32-byte vector loads/stores, two vectors per loop pass).
 *
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is treated as P and
 *         ptrs[disks - 1] as Q.
 * @bytes: number of bytes to process in every buffer.  The loop consumes
 *         64 bytes per pass and only stops when the unsigned count reaches
 *         exactly 0, so this has to be a multiple of 64.
 * @faila: index in @ptrs of the first failed data block.
 * @failb: index in @ptrs of the second failed data block.
 * @ptrs:  block pointer table.  Four entries are swapped around the
 *         raid6_gen_syndrome() call and put back before the SIMD loop.
 *
 * The loop stores what its comments call Dx into the ptrs[failb] buffer
 * and Dy into the ptrs[faila] buffer.
 *
 * The asm statements hand values to each other through hard-coded vector
 * registers; no operand or clobber list describes those registers, so the
 * relative order of the statements must not change.
 */
294 static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
295 				   int failb, void **ptrs)
296 {
297 	u8 *p, *q, *dp, *dq;
298 	const u8 *pbmul;	/* P multiplier table for B data */
299 	const u8 *qmul;		/* Q multiplier table (for both) */
300 
301 	p = (u8 *)ptrs[disks - 2];
302 	q = (u8 *)ptrs[disks - 1];
303 
304 	/*
305 	 * Compute syndrome with zero for the missing data pages
306 	 * Use the dead data pages as temporary storage for
307 	 * delta p and delta q
308 	 */
309 	dp = (u8 *)ptrs[faila];
310 	ptrs[faila] = page_address(ZERO_PAGE(0));
311 	ptrs[disks - 2] = dp;
312 	dq = (u8 *)ptrs[failb];
313 	ptrs[failb] = page_address(ZERO_PAGE(0));
314 	ptrs[disks - 1] = dq;
315 
	/*
	 * At this point the P and Q slots point at dp and dq, and the real
	 * P and Q buffers (p, q) are not in the table at all; the call
	 * below presumably writes its P/Q output into dp and dq.
	 */
316 	raid6_gen_syndrome(disks, bytes, ptrs);
317 
318 	/* Restore pointer table */
319 	ptrs[faila] = dp;
320 	ptrs[failb] = dq;
321 	ptrs[disks - 2] = p;
322 	ptrs[disks - 1] = q;
323 
324 	/* Now, pick the proper data tables */
	/*
	 * NOTE(review): raid6_gfexi[] is indexed with failb - faila, which
	 * presumably requires faila < failb - confirm against the callers.
	 */
325 	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
326 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
327 
	/* All vector instructions sit between kernel_fpu_begin()/_end(). */
328 	kernel_fpu_begin();
329 
330 	/*
331 	 * xr20, xr21: qmul
332 	 * xr22, xr23: pbmul
	 *
	 * Each 16-byte table half is loaded with vld through the $vrN name
	 * and xvreplve0.q is then applied to the same-numbered $xrN.
	 * NOTE(review): presumably this copies the loaded 16 bytes into both
	 * 128-bit halves of the register so the xvshuf.b lookups below see
	 * the table in each half - confirm against the ISA manual.
333 	 */
334 	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
335 	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
336 	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
337 	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
338 	asm volatile("xvreplve0.q $xr20, $xr20");
339 	asm volatile("xvreplve0.q $xr21, $xr21");
340 	asm volatile("xvreplve0.q $xr22, $xr22");
341 	asm volatile("xvreplve0.q $xr23, $xr23");
342 
	/* One pass handles 64 bytes (offsets 0 and 32) of each buffer. */
343 	while (bytes) {
344 		/* xr0, xr1: Q */
345 		asm volatile("xvld $xr0, %0" : : "m" (q[0]));
346 		asm volatile("xvld $xr1, %0" : : "m" (q[32]));
347 		/* xr0, xr1: Q + Qxy */
348 		asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
349 		asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
350 		asm volatile("xvxor.v $xr0, $xr0, $xr4");
351 		asm volatile("xvxor.v $xr1, $xr1, $xr5");
352 		/* xr2, xr3: P */
353 		asm volatile("xvld $xr2, %0" : : "m" (p[0]));
354 		asm volatile("xvld $xr3, %0" : : "m" (p[32]));
355 		/* xr2, xr3: P + Pxy */
356 		asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
357 		asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
358 		asm volatile("xvxor.v $xr2, $xr2, $xr4");
359 		asm volatile("xvxor.v $xr3, $xr3, $xr5");
360 
		/*
		 * Table multiply of (Q + Qxy) by qmul: split every byte into
		 * its two nibbles, use each nibble to select a byte from the
		 * matching table half, then XOR the two selections.
		 */
361 		/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
362 		asm volatile("xvsrli.b $xr4, $xr0, 4");
363 		asm volatile("xvsrli.b $xr5, $xr1, 4");
364 		/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
365 		asm volatile("xvandi.b $xr0, $xr0, 0x0f");
366 		asm volatile("xvandi.b $xr1, $xr1, 0x0f");
367 		/* lookup from qmul[0] */
		/* (both table operands of xvshuf.b are the same register) */
368 		asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
369 		asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
370 		/* lookup from qmul[16] */
371 		asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
372 		asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
373 		/* xr6, xr7: B(Q + Qxy) */
374 		asm volatile("xvxor.v $xr6, $xr4, $xr0");
375 		asm volatile("xvxor.v $xr7, $xr5, $xr1");
376 
		/*
		 * Same nibble-split table multiply, now of (P + Pxy) by
		 * pbmul.  xr2, xr3 are left untouched because they are
		 * needed again for the final XOR below.
		 */
377 		/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
378 		asm volatile("xvsrli.b $xr4, $xr2, 4");
379 		asm volatile("xvsrli.b $xr5, $xr3, 4");
380 		/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
381 		asm volatile("xvandi.b $xr0, $xr2, 0x0f");
382 		asm volatile("xvandi.b $xr1, $xr3, 0x0f");
383 		/* lookup from pbmul[0] */
384 		asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
385 		asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
386 		/* lookup from pbmul[16] */
387 		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
388 		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
389 		/* xr0, xr1: A(P + Pxy) */
390 		asm volatile("xvxor.v $xr0, $xr0, $xr4");
391 		asm volatile("xvxor.v $xr1, $xr1, $xr5");
392 
393 		/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
394 		asm volatile("xvxor.v $xr0, $xr0, $xr6");
395 		asm volatile("xvxor.v $xr1, $xr1, $xr7");
396 
397 		/* xr2, xr3: P + Pxy + Dx = Dy */
398 		asm volatile("xvxor.v $xr2, $xr2, $xr0");
399 		asm volatile("xvxor.v $xr3, $xr3, $xr1");
400 
		/*
		 * Dx goes to the ptrs[failb] buffer, Dy to the ptrs[faila]
		 * buffer, both replacing the scratch data held there.
		 */
401 		asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
402 		asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
403 		asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
404 		asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
405 
406 		bytes -= 64;
407 		p += 64;
408 		q += 64;
409 		dp += 64;
410 		dq += 64;
411 	}
412 
413 	kernel_fpu_end();
414 }
415 
/*
 * raid6_datap_recov_lasx - recover one failed data block and fix up the P
 * buffer, using LASX (32-byte vector loads/stores, two vectors per pass).
 *
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is treated as P and
 *         ptrs[disks - 1] as Q.
 * @bytes: number of bytes to process in every buffer.  The loop consumes
 *         64 bytes per pass and only stops when the unsigned count reaches
 *         exactly 0, so this has to be a multiple of 64.
 * @faila: index in @ptrs of the failed data block.
 * @ptrs:  block pointer table.  Two entries are swapped around the
 *         raid6_gen_syndrome() call and put back before the SIMD loop.
 *
 * The loop stores what its comments call Dx into the ptrs[faila] buffer
 * and XORs the same value into the P buffer in place.  The Q buffer is
 * only read.
 *
 * The asm statements hand values to each other through hard-coded vector
 * registers; no operand or clobber list describes those registers, so the
 * relative order of the statements must not change.
 */
416 static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
417 				   void **ptrs)
418 {
419 	u8 *p, *q, *dq;
420 	const u8 *qmul;		/* Q multiplier table */
421 
422 	p = (u8 *)ptrs[disks - 2];
423 	q = (u8 *)ptrs[disks - 1];
424 
425 	/*
426 	 * Compute syndrome with zero for the missing data page
427 	 * Use the dead data page as temporary storage for delta q
428 	 */
429 	dq = (u8 *)ptrs[faila];
430 	ptrs[faila] = page_address(ZERO_PAGE(0));
431 	ptrs[disks - 1] = dq;
432 
	/*
	 * Only the Q slot is redirected (to dq).  The P slot still points
	 * at the real P buffer, so the call below presumably rewrites P as
	 * well; the loop comments treat its content as "P + Dx" afterwards.
	 */
433 	raid6_gen_syndrome(disks, bytes, ptrs);
434 
435 	/* Restore pointer table */
436 	ptrs[faila] = dq;
437 	ptrs[disks - 1] = q;
438 
439 	/* Now, pick the proper data tables */
440 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
441 
	/* All vector instructions sit between kernel_fpu_begin()/_end(). */
442 	kernel_fpu_begin();
443 
444 	/* xr22, xr23: qmul */
	/*
	 * Each 16-byte table half is loaded with vld through the $vrN name
	 * and xvreplve0.q is then applied to the same-numbered $xrN.
	 * NOTE(review): presumably this copies the loaded 16 bytes into both
	 * 128-bit halves of the register - confirm against the ISA manual.
	 */
445 	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
446 	asm volatile("xvreplve0.q $xr22, $xr22");
447 	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
448 	asm volatile("xvreplve0.q $xr23, $xr23");
449 
	/* One pass handles 64 bytes (offsets 0 and 32) of each buffer. */
450 	while (bytes) {
451 		/* xr0, xr1: P + Dx */
452 		asm volatile("xvld $xr0, %0" : : "m" (p[0]));
453 		asm volatile("xvld $xr1, %0" : : "m" (p[32]));
454 		/* xr2, xr3: Qx */
455 		asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
456 		asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
457 		/* xr2, xr3: Q + Qx */
458 		asm volatile("xvld $xr4, %0" : : "m" (q[0]));
459 		asm volatile("xvld $xr5, %0" : : "m" (q[32]));
460 		asm volatile("xvxor.v $xr2, $xr2, $xr4");
461 		asm volatile("xvxor.v $xr3, $xr3, $xr5");
462 
		/*
		 * Table multiply of (Q + Qx) by qmul: split every byte into
		 * its two nibbles, use each nibble to select a byte from the
		 * matching table half, then XOR the two selections.
		 */
463 		/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
464 		asm volatile("xvsrli.b $xr4, $xr2, 4");
465 		asm volatile("xvsrli.b $xr5, $xr3, 4");
466 		/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
467 		asm volatile("xvandi.b $xr2, $xr2, 0x0f");
468 		asm volatile("xvandi.b $xr3, $xr3, 0x0f");
469 		/* lookup from qmul[0] */
		/* (both table operands of xvshuf.b are the same register) */
470 		asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
471 		asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
472 		/* lookup from qmul[16] */
473 		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
474 		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
475 		/* xr2, xr3: qmul(Q + Qx) = Dx */
476 		asm volatile("xvxor.v $xr2, $xr2, $xr4");
477 		asm volatile("xvxor.v $xr3, $xr3, $xr5");
478 
479 		/* xr0, xr1: P + Dx + Dx = P */
480 		asm volatile("xvxor.v $xr0, $xr0, $xr2");
481 		asm volatile("xvxor.v $xr1, $xr1, $xr3");
482 
		/*
		 * Dx replaces the scratch data in the ptrs[faila] buffer;
		 * the P buffer is updated in place.
		 */
483 		asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
484 		asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
485 		asm volatile("xvst $xr0, %0" : "=m" (p[0]));
486 		asm volatile("xvst $xr1, %0" : "=m" (p[32]));
487 
488 		bytes -= 64;
489 		p += 64;
490 		q += 64;
491 		dq += 64;
492 	}
493 
494 	kernel_fpu_end();
495 }
496 
497 const struct raid6_recov_calls raid6_recov_lasx = {
498 	.data2 = raid6_2data_recov_lasx,
499 	.datap = raid6_datap_recov_lasx,
500 	.name = "lasx",
501 };
502 #endif /* CONFIG_CPU_HAS_LASX */
503