xref: /linux/lib/raid6/recov_loongarch_simd.c (revision 086c6cbcc563c81d55257f9b27e14faf1d0963d3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
4  *
5  * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
6  *
7  * Originally based on recov_avx2.c and recov_ssse3.c:
8  *
9  * Copyright (C) 2012 Intel Corporation
10  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
11  */
12 
13 #include <linux/raid/pq.h>
14 #include "loongarch.h"
15 
16 /*
17  * Unlike with the syndrome calculation algorithms, there's no boot-time
18  * selection of recovery algorithms by benchmarking, so we have to specify
19  * the priorities and hope the future cores will all have decent vector
20  * support (i.e. no LASX slower than LSX, or even scalar code).
21  */
22 
23 #ifdef CONFIG_CPU_HAS_LSX
/*
 * Availability check installed as the .valid hook of raid6_recov_lsx:
 * returns the value of cpu_has_lsx.
 */
24 static int raid6_has_lsx(void)
25 {
26 	return cpu_has_lsx;
27 }
28 
/*
 * raid6_2data_recov_lsx - recover two failed data blocks using LSX
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is used as P and
 *         ptrs[disks - 1] as Q
 * @bytes: number of bytes to process in each block
 * @faila: index in @ptrs of one failed data block
 * @failb: index in @ptrs of the other failed data block
 * @ptrs:  table of block pointers
 *
 * The syndrome is first regenerated with both failed blocks read as the
 * zero page and with the failed blocks' own buffers standing in as the
 * P and Q outputs.  The vector loop then combines those values with the
 * stored P and Q, 64 bytes per iteration, and writes the results back
 * into the two failed buffers.  @ptrs is modified temporarily but holds
 * its original contents again before the vector loop starts.
 *
 * NOTE(review): the loop ends only when @bytes reaches exactly zero, so
 * @bytes is presumably always a multiple of 64 - confirm against callers.
 * NOTE(review): the pbmul lookup is indexed by failb - faila, which
 * presumably means callers pass faila < failb - confirm.
 */
29 static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
30 				  int failb, void **ptrs)
31 {
32 	u8 *p, *q, *dp, *dq;
33 	const u8 *pbmul;	/* P multiplier table for B data */
34 	const u8 *qmul;		/* Q multiplier table (for both) */
35 
36 	p = (u8 *)ptrs[disks - 2];
37 	q = (u8 *)ptrs[disks - 1];
38 
39 	/*
40 	 * Compute syndrome with zero for the missing data pages
41 	 * Use the dead data pages as temporary storage for
42 	 * delta p and delta q
43 	 */
44 	dp = (u8 *)ptrs[faila];
45 	ptrs[faila] = (void *)raid6_empty_zero_page;
46 	ptrs[disks - 2] = dp;
47 	dq = (u8 *)ptrs[failb];
48 	ptrs[failb] = (void *)raid6_empty_zero_page;
49 	ptrs[disks - 1] = dq;
50 
51 	raid6_call.gen_syndrome(disks, bytes, ptrs);
52 
53 	/* Restore pointer table */
54 	ptrs[faila] = dp;
55 	ptrs[failb] = dq;
56 	ptrs[disks - 2] = p;
57 	ptrs[disks - 1] = q;
58 
59 	/* Now, pick the proper data tables */
60 	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
61 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
62 
	/*
	 * NOTE(review): the asm statements below name vector registers
	 * directly and declare no clobbers; this presumably relies on the
	 * kernel_fpu_begin()/kernel_fpu_end() bracket - confirm.
	 */
63 	kernel_fpu_begin();
64 
65 	/*
66 	 * vr20, vr21: qmul
67 	 * vr22, vr23: pbmul
68 	 */
	/*
	 * Each table is read as two 16-byte halves: as the lookups in the
	 * loop show, the half at offset 0 is indexed by the low nibble of a
	 * byte and the half at offset 16 by the high nibble.
	 */
69 	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
70 	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
71 	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
72 	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
73 
	/* One iteration handles 64 bytes as four 16-byte vectors per block. */
74 	while (bytes) {
75 		/* vr4 - vr7: Q */
76 		asm volatile("vld $vr4, %0" : : "m" (q[0]));
77 		asm volatile("vld $vr5, %0" : : "m" (q[16]));
78 		asm volatile("vld $vr6, %0" : : "m" (q[32]));
79 		asm volatile("vld $vr7, %0" : : "m" (q[48]));
80 		/*  vr4 - vr7: Q + Qxy */
81 		asm volatile("vld $vr8, %0" : : "m" (dq[0]));
82 		asm volatile("vld $vr9, %0" : : "m" (dq[16]));
83 		asm volatile("vld $vr10, %0" : : "m" (dq[32]));
84 		asm volatile("vld $vr11, %0" : : "m" (dq[48]));
85 		asm volatile("vxor.v $vr4, $vr4, $vr8");
86 		asm volatile("vxor.v $vr5, $vr5, $vr9");
87 		asm volatile("vxor.v $vr6, $vr6, $vr10");
88 		asm volatile("vxor.v $vr7, $vr7, $vr11");
89 		/* vr0 - vr3: P */
90 		asm volatile("vld $vr0, %0" : : "m" (p[0]));
91 		asm volatile("vld $vr1, %0" : : "m" (p[16]));
92 		asm volatile("vld $vr2, %0" : : "m" (p[32]));
93 		asm volatile("vld $vr3, %0" : : "m" (p[48]));
94 		/* vr0 - vr3: P + Pxy */
95 		asm volatile("vld $vr8, %0" : : "m" (dp[0]));
96 		asm volatile("vld $vr9, %0" : : "m" (dp[16]));
97 		asm volatile("vld $vr10, %0" : : "m" (dp[32]));
98 		asm volatile("vld $vr11, %0" : : "m" (dp[48]));
99 		asm volatile("vxor.v $vr0, $vr0, $vr8");
100 		asm volatile("vxor.v $vr1, $vr1, $vr9");
101 		asm volatile("vxor.v $vr2, $vr2, $vr10");
102 		asm volatile("vxor.v $vr3, $vr3, $vr11");
103 
		/*
		 * Table multiply of (Q + Qxy) by qmul: split every byte into
		 * its two nibbles, look each nibble up in its own 16-byte
		 * half-table, and XOR the two partial results.
		 */
104 		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
105 		asm volatile("vsrli.b $vr8, $vr4, 4");
106 		asm volatile("vsrli.b $vr9, $vr5, 4");
107 		asm volatile("vsrli.b $vr10, $vr6, 4");
108 		asm volatile("vsrli.b $vr11, $vr7, 4");
109 		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
110 		asm volatile("vandi.b $vr4, $vr4, 0x0f");
111 		asm volatile("vandi.b $vr5, $vr5, 0x0f");
112 		asm volatile("vandi.b $vr6, $vr6, 0x0f");
113 		asm volatile("vandi.b $vr7, $vr7, 0x0f");
		/*
		 * vshuf.b is given the same table register for both of its
		 * source operands; the nibble held in the last operand is
		 * the index of the table byte to fetch.
		 */
114 		/* lookup from qmul[0] */
115 		asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
116 		asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
117 		asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
118 		asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
119 		/* lookup from qmul[16] */
120 		asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
121 		asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
122 		asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
123 		asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
124 		/* vr16 - vr19: B(Q + Qxy) */
125 		asm volatile("vxor.v $vr16, $vr8, $vr4");
126 		asm volatile("vxor.v $vr17, $vr9, $vr5");
127 		asm volatile("vxor.v $vr18, $vr10, $vr6");
128 		asm volatile("vxor.v $vr19, $vr11, $vr7");
129 
		/*
		 * Same nibble-table multiply for (P + Pxy), using pbmul.
		 * vr0 - vr3 are only read here because (P + Pxy) is needed
		 * again for the final XOR below.
		 */
130 		/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
131 		asm volatile("vsrli.b $vr4, $vr0, 4");
132 		asm volatile("vsrli.b $vr5, $vr1, 4");
133 		asm volatile("vsrli.b $vr6, $vr2, 4");
134 		asm volatile("vsrli.b $vr7, $vr3, 4");
135 		/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
136 		asm volatile("vandi.b $vr12, $vr0, 0x0f");
137 		asm volatile("vandi.b $vr13, $vr1, 0x0f");
138 		asm volatile("vandi.b $vr14, $vr2, 0x0f");
139 		asm volatile("vandi.b $vr15, $vr3, 0x0f");
140 		/* lookup from pbmul[0] */
141 		asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
142 		asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
143 		asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
144 		asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
145 		/* lookup from pbmul[16] */
146 		asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
147 		asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
148 		asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
149 		asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
150 		/* vr4 - vr7: A(P + Pxy) */
151 		asm volatile("vxor.v $vr4, $vr4, $vr12");
152 		asm volatile("vxor.v $vr5, $vr5, $vr13");
153 		asm volatile("vxor.v $vr6, $vr6, $vr14");
154 		asm volatile("vxor.v $vr7, $vr7, $vr15");
155 
156 		/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
157 		asm volatile("vxor.v $vr4, $vr4, $vr16");
158 		asm volatile("vxor.v $vr5, $vr5, $vr17");
159 		asm volatile("vxor.v $vr6, $vr6, $vr18");
160 		asm volatile("vxor.v $vr7, $vr7, $vr19");
		/*
		 * Dx replaces the scratch values in dq, i.e. it lands in
		 * the buffer that was passed in as ptrs[failb].
		 */
161 		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
162 		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
163 		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
164 		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
165 
166 		/* vr0 - vr3: P + Pxy + Dx = Dy */
167 		asm volatile("vxor.v $vr0, $vr0, $vr4");
168 		asm volatile("vxor.v $vr1, $vr1, $vr5");
169 		asm volatile("vxor.v $vr2, $vr2, $vr6");
170 		asm volatile("vxor.v $vr3, $vr3, $vr7");
		/* Dy lands in the buffer that was passed in as ptrs[faila]. */
171 		asm volatile("vst $vr0, %0" : "=m" (dp[0]));
172 		asm volatile("vst $vr1, %0" : "=m" (dp[16]));
173 		asm volatile("vst $vr2, %0" : "=m" (dp[32]));
174 		asm volatile("vst $vr3, %0" : "=m" (dp[48]));
175 
176 		bytes -= 64;
177 		p += 64;
178 		q += 64;
179 		dp += 64;
180 		dq += 64;
181 	}
182 
183 	kernel_fpu_end();
184 }
185 
/*
 * raid6_datap_recov_lsx - recover one failed data block and P using LSX
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is used as P and
 *         ptrs[disks - 1] as Q
 * @bytes: number of bytes to process in each block
 * @faila: index in @ptrs of the failed data block
 * @ptrs:  table of block pointers
 *
 * The syndrome is regenerated with the failed block read as the zero page
 * and with the failed block's own buffer standing in as the Q output.
 * The vector loop then XORs that value with the stored Q, multiplies the
 * result through the qmul table to obtain the missing data, stores it in
 * the failed block's buffer and XORs it into the P buffer.  @ptrs is
 * modified temporarily but holds its original contents again before the
 * vector loop starts.
 *
 * NOTE(review): ptrs[disks - 2] keeps pointing at the P buffer during the
 * gen_syndrome call; the "P + Dx" loop comment suggests that call leaves
 * there a P computed without the failed block - confirm against
 * gen_syndrome.
 * NOTE(review): the loop ends only when @bytes reaches exactly zero, so
 * @bytes is presumably always a multiple of 64 - confirm against callers.
 */
186 static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
187 				  void **ptrs)
188 {
189 	u8 *p, *q, *dq;
190 	const u8 *qmul;		/* Q multiplier table */
191 
192 	p = (u8 *)ptrs[disks - 2];
193 	q = (u8 *)ptrs[disks - 1];
194 
195 	/*
196 	 * Compute syndrome with zero for the missing data page
197 	 * Use the dead data page as temporary storage for delta q
198 	 */
199 	dq = (u8 *)ptrs[faila];
200 	ptrs[faila] = (void *)raid6_empty_zero_page;
201 	ptrs[disks - 1] = dq;
202 
203 	raid6_call.gen_syndrome(disks, bytes, ptrs);
204 
205 	/* Restore pointer table */
206 	ptrs[faila] = dq;
207 	ptrs[disks - 1] = q;
208 
209 	/* Now, pick the proper data tables */
210 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
211 
	/*
	 * NOTE(review): the asm statements below name vector registers
	 * directly and declare no clobbers; this presumably relies on the
	 * kernel_fpu_begin()/kernel_fpu_end() bracket - confirm.
	 */
212 	kernel_fpu_begin();
213 
214 	/* vr22, vr23: qmul */
	/*
	 * As the lookups in the loop show, the half-table at offset 0 is
	 * indexed by the low nibble and the one at offset 16 by the high
	 * nibble of each byte.
	 */
215 	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
216 	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
217 
	/* One iteration handles 64 bytes as four 16-byte vectors per block. */
218 	while (bytes) {
219 		/* vr0 - vr3: P + Dx */
220 		asm volatile("vld $vr0, %0" : : "m" (p[0]));
221 		asm volatile("vld $vr1, %0" : : "m" (p[16]));
222 		asm volatile("vld $vr2, %0" : : "m" (p[32]));
223 		asm volatile("vld $vr3, %0" : : "m" (p[48]));
224 		/* vr4 - vr7: Qx */
225 		asm volatile("vld $vr4, %0" : : "m" (dq[0]));
226 		asm volatile("vld $vr5, %0" : : "m" (dq[16]));
227 		asm volatile("vld $vr6, %0" : : "m" (dq[32]));
228 		asm volatile("vld $vr7, %0" : : "m" (dq[48]));
229 		/* vr4 - vr7: Q + Qx */
230 		asm volatile("vld $vr8, %0" : : "m" (q[0]));
231 		asm volatile("vld $vr9, %0" : : "m" (q[16]));
232 		asm volatile("vld $vr10, %0" : : "m" (q[32]));
233 		asm volatile("vld $vr11, %0" : : "m" (q[48]));
234 		asm volatile("vxor.v $vr4, $vr4, $vr8");
235 		asm volatile("vxor.v $vr5, $vr5, $vr9");
236 		asm volatile("vxor.v $vr6, $vr6, $vr10");
237 		asm volatile("vxor.v $vr7, $vr7, $vr11");
238 
		/*
		 * Table multiply of (Q + Qx) by qmul: split every byte into
		 * its two nibbles, look each nibble up in its own 16-byte
		 * half-table, and XOR the two partial results.
		 */
239 		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
240 		asm volatile("vsrli.b $vr8, $vr4, 4");
241 		asm volatile("vsrli.b $vr9, $vr5, 4");
242 		asm volatile("vsrli.b $vr10, $vr6, 4");
243 		asm volatile("vsrli.b $vr11, $vr7, 4");
244 		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
245 		asm volatile("vandi.b $vr4, $vr4, 0x0f");
246 		asm volatile("vandi.b $vr5, $vr5, 0x0f");
247 		asm volatile("vandi.b $vr6, $vr6, 0x0f");
248 		asm volatile("vandi.b $vr7, $vr7, 0x0f");
		/*
		 * vshuf.b is given the same table register for both of its
		 * source operands; the nibble held in the last operand is
		 * the index of the table byte to fetch.
		 */
249 		/* lookup from qmul[0] */
250 		asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
251 		asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
252 		asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
253 		asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
254 		/* lookup from qmul[16] */
255 		asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
256 		asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
257 		asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
258 		asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
259 		/* vr4 - vr7: qmul(Q + Qx) = Dx */
260 		asm volatile("vxor.v $vr4, $vr4, $vr8");
261 		asm volatile("vxor.v $vr5, $vr5, $vr9");
262 		asm volatile("vxor.v $vr6, $vr6, $vr10");
263 		asm volatile("vxor.v $vr7, $vr7, $vr11");
		/*
		 * Dx replaces the scratch values in dq, i.e. it lands in
		 * the buffer that was passed in as ptrs[faila].
		 */
264 		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
265 		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
266 		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
267 		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
268 
269 		/* vr0 - vr3: P + Dx + Dx = P */
270 		asm volatile("vxor.v $vr0, $vr0, $vr4");
271 		asm volatile("vxor.v $vr1, $vr1, $vr5");
272 		asm volatile("vxor.v $vr2, $vr2, $vr6");
273 		asm volatile("vxor.v $vr3, $vr3, $vr7");
		/* The P buffer is updated in place. */
274 		asm volatile("vst $vr0, %0" : "=m" (p[0]));
275 		asm volatile("vst $vr1, %0" : "=m" (p[16]));
276 		asm volatile("vst $vr2, %0" : "=m" (p[32]));
277 		asm volatile("vst $vr3, %0" : "=m" (p[48]));
278 
279 		bytes -= 64;
280 		p += 64;
281 		q += 64;
282 		dq += 64;
283 	}
284 
285 	kernel_fpu_end();
286 }
287 
/*
 * Recovery method table for the LSX implementation.  .valid gates it on
 * cpu_has_lsx.  Its priority of 1 is lower than the 2 given to
 * raid6_recov_lasx (built under CONFIG_CPU_HAS_LASX later in this file);
 * see the note near the top of the file on why priorities are fixed
 * rather than benchmarked.
 */
288 const struct raid6_recov_calls raid6_recov_lsx = {
289 	.data2 = raid6_2data_recov_lsx,
290 	.datap = raid6_datap_recov_lsx,
291 	.valid = raid6_has_lsx,
292 	.name = "lsx",
293 	.priority = 1,
294 };
295 #endif /* CONFIG_CPU_HAS_LSX */
296 
297 #ifdef CONFIG_CPU_HAS_LASX
/*
 * Availability check installed as the .valid hook of raid6_recov_lasx:
 * returns the value of cpu_has_lasx.
 */
298 static int raid6_has_lasx(void)
299 {
300 	return cpu_has_lasx;
301 }
302 
/*
 * raid6_2data_recov_lasx - recover two failed data blocks using LASX
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is used as P and
 *         ptrs[disks - 1] as Q
 * @bytes: number of bytes to process in each block
 * @faila: index in @ptrs of one failed data block
 * @failb: index in @ptrs of the other failed data block
 * @ptrs:  table of block pointers
 *
 * Same procedure as raid6_2data_recov_lsx, but each 64-byte step is done
 * with two 32-byte $xr vectors per block instead of four 16-byte ones.
 * The syndrome is first regenerated with both failed blocks read as the
 * zero page and with the failed blocks' own buffers standing in as the
 * P and Q outputs; the vector loop then combines those values with the
 * stored P and Q and writes the results back into the two failed
 * buffers.  @ptrs is modified temporarily but holds its original contents
 * again before the vector loop starts.
 *
 * NOTE(review): the loop ends only when @bytes reaches exactly zero, so
 * @bytes is presumably always a multiple of 64 - confirm against callers.
 * NOTE(review): the pbmul lookup is indexed by failb - faila, which
 * presumably means callers pass faila < failb - confirm.
 */
303 static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
304 				   int failb, void **ptrs)
305 {
306 	u8 *p, *q, *dp, *dq;
307 	const u8 *pbmul;	/* P multiplier table for B data */
308 	const u8 *qmul;		/* Q multiplier table (for both) */
309 
310 	p = (u8 *)ptrs[disks - 2];
311 	q = (u8 *)ptrs[disks - 1];
312 
313 	/*
314 	 * Compute syndrome with zero for the missing data pages
315 	 * Use the dead data pages as temporary storage for
316 	 * delta p and delta q
317 	 */
318 	dp = (u8 *)ptrs[faila];
319 	ptrs[faila] = (void *)raid6_empty_zero_page;
320 	ptrs[disks - 2] = dp;
321 	dq = (u8 *)ptrs[failb];
322 	ptrs[failb] = (void *)raid6_empty_zero_page;
323 	ptrs[disks - 1] = dq;
324 
325 	raid6_call.gen_syndrome(disks, bytes, ptrs);
326 
327 	/* Restore pointer table */
328 	ptrs[faila] = dp;
329 	ptrs[failb] = dq;
330 	ptrs[disks - 2] = p;
331 	ptrs[disks - 1] = q;
332 
333 	/* Now, pick the proper data tables */
334 	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
335 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
336 
	/*
	 * NOTE(review): the asm statements below name vector registers
	 * directly and declare no clobbers; this presumably relies on the
	 * kernel_fpu_begin()/kernel_fpu_end() bracket - confirm.
	 */
337 	kernel_fpu_begin();
338 
339 	/*
340 	 * xr20, xr21: qmul
341 	 * xr22, xr23: pbmul
342 	 */
	/*
	 * The 16-byte half-tables are loaded with the 128-bit vld under the
	 * $vr names and then widened under the matching $xr names.
	 * NOTE(review): this presumably relies on $vrN being the low half
	 * of $xrN and on xvreplve0.q copying that low 128-bit lane into the
	 * high lane, so that both lanes hold the same table - confirm
	 * against the LoongArch ISA manual.
	 */
343 	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
344 	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
345 	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
346 	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
347 	asm volatile("xvreplve0.q $xr20, $xr20");
348 	asm volatile("xvreplve0.q $xr21, $xr21");
349 	asm volatile("xvreplve0.q $xr22, $xr22");
350 	asm volatile("xvreplve0.q $xr23, $xr23");
351 
	/* One iteration handles 64 bytes as two 32-byte vectors per block. */
352 	while (bytes) {
353 		/* xr0, xr1: Q */
354 		asm volatile("xvld $xr0, %0" : : "m" (q[0]));
355 		asm volatile("xvld $xr1, %0" : : "m" (q[32]));
356 		/* xr0, xr1: Q + Qxy */
357 		asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
358 		asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
359 		asm volatile("xvxor.v $xr0, $xr0, $xr4");
360 		asm volatile("xvxor.v $xr1, $xr1, $xr5");
361 		/* xr2, xr3: P */
362 		asm volatile("xvld $xr2, %0" : : "m" (p[0]));
363 		asm volatile("xvld $xr3, %0" : : "m" (p[32]));
364 		/* xr2, xr3: P + Pxy */
365 		asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
366 		asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
367 		asm volatile("xvxor.v $xr2, $xr2, $xr4");
368 		asm volatile("xvxor.v $xr3, $xr3, $xr5");
369 
		/*
		 * Table multiply of (Q + Qxy) by qmul: split every byte into
		 * its two nibbles, look each nibble up in its own half-table,
		 * and XOR the two partial results.
		 */
370 		/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
371 		asm volatile("xvsrli.b $xr4, $xr0, 4");
372 		asm volatile("xvsrli.b $xr5, $xr1, 4");
373 		/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
374 		asm volatile("xvandi.b $xr0, $xr0, 0x0f");
375 		asm volatile("xvandi.b $xr1, $xr1, 0x0f");
376 		/* lookup from qmul[0] */
377 		asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
378 		asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
379 		/* lookup from qmul[16] */
380 		asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
381 		asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
382 		/* xr6, xr7: B(Q + Qxy) */
383 		asm volatile("xvxor.v $xr6, $xr4, $xr0");
384 		asm volatile("xvxor.v $xr7, $xr5, $xr1");
385 
		/*
		 * Same nibble-table multiply for (P + Pxy), using pbmul.
		 * xr2, xr3 are only read here because (P + Pxy) is needed
		 * again for the final XOR below.
		 */
386 		/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
387 		asm volatile("xvsrli.b $xr4, $xr2, 4");
388 		asm volatile("xvsrli.b $xr5, $xr3, 4");
389 		/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
390 		asm volatile("xvandi.b $xr0, $xr2, 0x0f");
391 		asm volatile("xvandi.b $xr1, $xr3, 0x0f");
392 		/* lookup from pbmul[0] */
393 		asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
394 		asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
395 		/* lookup from pbmul[16] */
396 		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
397 		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
398 		/* xr0, xr1: A(P + Pxy) */
399 		asm volatile("xvxor.v $xr0, $xr0, $xr4");
400 		asm volatile("xvxor.v $xr1, $xr1, $xr5");
401 
402 		/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
403 		asm volatile("xvxor.v $xr0, $xr0, $xr6");
404 		asm volatile("xvxor.v $xr1, $xr1, $xr7");
405 
406 		/* xr2, xr3: P + Pxy + Dx = Dy */
407 		asm volatile("xvxor.v $xr2, $xr2, $xr0");
408 		asm volatile("xvxor.v $xr3, $xr3, $xr1");
409 
		/*
		 * Dx goes to dq (the buffer passed in as ptrs[failb]) and
		 * Dy to dp (the buffer passed in as ptrs[faila]), replacing
		 * the scratch values held there.
		 */
410 		asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
411 		asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
412 		asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
413 		asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
414 
415 		bytes -= 64;
416 		p += 64;
417 		q += 64;
418 		dp += 64;
419 		dq += 64;
420 	}
421 
422 	kernel_fpu_end();
423 }
424 
/*
 * raid6_datap_recov_lasx - recover one failed data block and P using LASX
 * @disks: number of entries in @ptrs; ptrs[disks - 2] is used as P and
 *         ptrs[disks - 1] as Q
 * @bytes: number of bytes to process in each block
 * @faila: index in @ptrs of the failed data block
 * @ptrs:  table of block pointers
 *
 * Same procedure as raid6_datap_recov_lsx, but each 64-byte step is done
 * with two 32-byte $xr vectors per block instead of four 16-byte ones.
 * The syndrome is regenerated with the failed block read as the zero page
 * and with the failed block's own buffer standing in as the Q output; the
 * vector loop then XORs that value with the stored Q, multiplies the
 * result through the qmul table to obtain the missing data, stores it in
 * the failed block's buffer and XORs it into the P buffer.  @ptrs is
 * modified temporarily but holds its original contents again before the
 * vector loop starts.
 *
 * NOTE(review): ptrs[disks - 2] keeps pointing at the P buffer during the
 * gen_syndrome call; the "P + Dx" loop comment suggests that call leaves
 * there a P computed without the failed block - confirm against
 * gen_syndrome.
 * NOTE(review): the loop ends only when @bytes reaches exactly zero, so
 * @bytes is presumably always a multiple of 64 - confirm against callers.
 */
425 static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
426 				   void **ptrs)
427 {
428 	u8 *p, *q, *dq;
429 	const u8 *qmul;		/* Q multiplier table */
430 
431 	p = (u8 *)ptrs[disks - 2];
432 	q = (u8 *)ptrs[disks - 1];
433 
434 	/*
435 	 * Compute syndrome with zero for the missing data page
436 	 * Use the dead data page as temporary storage for delta q
437 	 */
438 	dq = (u8 *)ptrs[faila];
439 	ptrs[faila] = (void *)raid6_empty_zero_page;
440 	ptrs[disks - 1] = dq;
441 
442 	raid6_call.gen_syndrome(disks, bytes, ptrs);
443 
444 	/* Restore pointer table */
445 	ptrs[faila] = dq;
446 	ptrs[disks - 1] = q;
447 
448 	/* Now, pick the proper data tables */
449 	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
450 
	/*
	 * NOTE(review): the asm statements below name vector registers
	 * directly and declare no clobbers; this presumably relies on the
	 * kernel_fpu_begin()/kernel_fpu_end() bracket - confirm.
	 */
451 	kernel_fpu_begin();
452 
453 	/* xr22, xr23: qmul */
	/*
	 * Each 16-byte half-table is loaded with the 128-bit vld under its
	 * $vr name and then widened under the matching $xr name.
	 * NOTE(review): this presumably relies on $vrN being the low half
	 * of $xrN and on xvreplve0.q copying that low 128-bit lane into the
	 * high lane - confirm against the LoongArch ISA manual.
	 */
454 	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
455 	asm volatile("xvreplve0.q $xr22, $xr22");
456 	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
457 	asm volatile("xvreplve0.q $xr23, $xr23");
458 
	/* One iteration handles 64 bytes as two 32-byte vectors per block. */
459 	while (bytes) {
460 		/* xr0, xr1: P + Dx */
461 		asm volatile("xvld $xr0, %0" : : "m" (p[0]));
462 		asm volatile("xvld $xr1, %0" : : "m" (p[32]));
463 		/* xr2, xr3: Qx */
464 		asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
465 		asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
466 		/* xr2, xr3: Q + Qx */
467 		asm volatile("xvld $xr4, %0" : : "m" (q[0]));
468 		asm volatile("xvld $xr5, %0" : : "m" (q[32]));
469 		asm volatile("xvxor.v $xr2, $xr2, $xr4");
470 		asm volatile("xvxor.v $xr3, $xr3, $xr5");
471 
		/*
		 * Table multiply of (Q + Qx) by qmul: split every byte into
		 * its two nibbles, look each nibble up in its own half-table,
		 * and XOR the two partial results.
		 */
472 		/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
473 		asm volatile("xvsrli.b $xr4, $xr2, 4");
474 		asm volatile("xvsrli.b $xr5, $xr3, 4");
475 		/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
476 		asm volatile("xvandi.b $xr2, $xr2, 0x0f");
477 		asm volatile("xvandi.b $xr3, $xr3, 0x0f");
478 		/* lookup from qmul[0] */
479 		asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
480 		asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
481 		/* lookup from qmul[16] */
482 		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
483 		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
484 		/* xr2, xr3: qmul(Q + Qx) = Dx */
485 		asm volatile("xvxor.v $xr2, $xr2, $xr4");
486 		asm volatile("xvxor.v $xr3, $xr3, $xr5");
487 
488 		/* xr0, xr1: P + Dx + Dx = P */
489 		asm volatile("xvxor.v $xr0, $xr0, $xr2");
490 		asm volatile("xvxor.v $xr1, $xr1, $xr3");
491 
		/*
		 * Dx goes to dq (the buffer passed in as ptrs[faila]),
		 * replacing the scratch values held there; the P buffer is
		 * updated in place.
		 */
492 		asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
493 		asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
494 		asm volatile("xvst $xr0, %0" : "=m" (p[0]));
495 		asm volatile("xvst $xr1, %0" : "=m" (p[32]));
496 
497 		bytes -= 64;
498 		p += 64;
499 		q += 64;
500 		dq += 64;
501 	}
502 
503 	kernel_fpu_end();
504 }
505 
/*
 * Recovery method table for the LASX implementation.  .valid gates it on
 * cpu_has_lasx.  Its priority of 2 is higher than the 1 given to
 * raid6_recov_lsx (built under CONFIG_CPU_HAS_LSX earlier in this file);
 * see the note near the top of the file on why priorities are fixed
 * rather than benchmarked.
 */
506 const struct raid6_recov_calls raid6_recov_lasx = {
507 	.data2 = raid6_2data_recov_lasx,
508 	.datap = raid6_datap_recov_lasx,
509 	.valid = raid6_has_lasx,
510 	.name = "lasx",
511 	.priority = 2,
512 };
513 #endif /* CONFIG_CPU_HAS_LASX */
514