xref: /linux/lib/raid6/avx2.c (revision 15a1fbdcfb519c2bd291ed01c6c94e0b89537a77)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- linux-c -*- ------------------------------------------------------- *
3  *
4  *   Copyright (C) 2012 Intel Corporation
5  *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
6  *
7  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
8  *
9  * ----------------------------------------------------------------------- */
10 
11 /*
12  * AVX2 implementation of RAID-6 syndrome functions
13  *
14  */
15 
16 #ifdef CONFIG_AS_AVX2
17 
18 #include <linux/raid/pq.h>
19 #include "x86.h"
20 
21 static const struct raid6_avx2_constants {
22 	u64 x1d[4];
23 } raid6_avx2_constants __aligned(32) = {
24 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
25 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
26 };
27 
28 static int raid6_have_avx2(void)
29 {
30 	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
31 }
32 
33 /*
34  * Plain AVX2 implementation
35  */
36 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
37 {
38 	u8 **dptr = (u8 **)ptrs;
39 	u8 *p, *q;
40 	int d, z, z0;
41 
42 	z0 = disks - 3;		/* Highest data disk */
43 	p = dptr[z0+1];		/* XOR parity */
44 	q = dptr[z0+2];		/* RS syndrome */
45 
46 	kernel_fpu_begin();
47 
48 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
49 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
50 
51 	for (d = 0; d < bytes; d += 32) {
52 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
53 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
54 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
55 		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
56 		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
57 		for (z = z0-2; z >= 0; z--) {
58 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
59 			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
60 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
61 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
62 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
63 			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
64 			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
65 			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
66 		}
67 		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
68 		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
69 		asm volatile("vpand %ymm0,%ymm5,%ymm5");
70 		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
71 		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
72 		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
73 
74 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
75 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
76 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
77 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
78 	}
79 
80 	asm volatile("sfence" : : : "memory");
81 	kernel_fpu_end();
82 }
83 
84 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
85 				     size_t bytes, void **ptrs)
86 {
87 	u8 **dptr = (u8 **)ptrs;
88 	u8 *p, *q;
89 	int d, z, z0;
90 
91 	z0 = stop;		/* P/Q right side optimization */
92 	p = dptr[disks-2];	/* XOR parity */
93 	q = dptr[disks-1];	/* RS syndrome */
94 
95 	kernel_fpu_begin();
96 
97 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
98 
99 	for (d = 0 ; d < bytes ; d += 32) {
100 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
101 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
102 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
103 		/* P/Q data pages */
104 		for (z = z0-1 ; z >= start ; z--) {
105 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
106 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
107 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
108 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
109 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
110 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
111 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
112 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
113 		}
114 		/* P/Q left side optimization */
115 		for (z = start-1 ; z >= 0 ; z--) {
116 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
117 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
118 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
119 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
120 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
121 		}
122 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
123 		/* Don't use movntdq for r/w memory area < cache line */
124 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
125 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
126 	}
127 
128 	asm volatile("sfence" : : : "memory");
129 	kernel_fpu_end();
130 }
131 
132 const struct raid6_calls raid6_avx2x1 = {
133 	raid6_avx21_gen_syndrome,
134 	raid6_avx21_xor_syndrome,
135 	raid6_have_avx2,
136 	"avx2x1",
137 	1			/* Has cache hints */
138 };
139 
140 /*
141  * Unrolled-by-2 AVX2 implementation
142  */
143 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
144 {
145 	u8 **dptr = (u8 **)ptrs;
146 	u8 *p, *q;
147 	int d, z, z0;
148 
149 	z0 = disks - 3;		/* Highest data disk */
150 	p = dptr[z0+1];		/* XOR parity */
151 	q = dptr[z0+2];		/* RS syndrome */
152 
153 	kernel_fpu_begin();
154 
155 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
156 	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
157 
158 	/* We uniformly assume a single prefetch covers at least 32 bytes */
159 	for (d = 0; d < bytes; d += 64) {
160 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
161 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
162 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
163 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
164 		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
165 		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
166 		for (z = z0-1; z >= 0; z--) {
167 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
168 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
169 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
170 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
171 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
172 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
173 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
174 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
175 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
176 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
177 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
178 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
179 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
180 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
181 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
182 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
183 		}
184 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
185 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
186 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
187 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
188 	}
189 
190 	asm volatile("sfence" : : : "memory");
191 	kernel_fpu_end();
192 }
193 
194 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
195 				     size_t bytes, void **ptrs)
196 {
197 	u8 **dptr = (u8 **)ptrs;
198 	u8 *p, *q;
199 	int d, z, z0;
200 
201 	z0 = stop;		/* P/Q right side optimization */
202 	p = dptr[disks-2];	/* XOR parity */
203 	q = dptr[disks-1];	/* RS syndrome */
204 
205 	kernel_fpu_begin();
206 
207 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
208 
209 	for (d = 0 ; d < bytes ; d += 64) {
210 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
211 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
212 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
213 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
214 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
215 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
216 		/* P/Q data pages */
217 		for (z = z0-1 ; z >= start ; z--) {
218 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
219 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
220 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
221 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
222 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
223 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
224 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
225 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
226 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
227 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
228 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
229 			asm volatile("vmovdqa %0,%%ymm7"
230 				     :: "m" (dptr[z][d+32]));
231 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
232 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
233 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
234 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
235 		}
236 		/* P/Q left side optimization */
237 		for (z = start-1 ; z >= 0 ; z--) {
238 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
239 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
240 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
241 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
242 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
243 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
244 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
245 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
246 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
247 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
248 		}
249 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
250 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
251 		/* Don't use movntdq for r/w memory area < cache line */
252 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
253 		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
254 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
255 		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
256 	}
257 
258 	asm volatile("sfence" : : : "memory");
259 	kernel_fpu_end();
260 }
261 
262 const struct raid6_calls raid6_avx2x2 = {
263 	raid6_avx22_gen_syndrome,
264 	raid6_avx22_xor_syndrome,
265 	raid6_have_avx2,
266 	"avx2x2",
267 	1			/* Has cache hints */
268 };
269 
270 #ifdef CONFIG_X86_64
271 
272 /*
273  * Unrolled-by-4 AVX2 implementation
274  */
275 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
276 {
277 	u8 **dptr = (u8 **)ptrs;
278 	u8 *p, *q;
279 	int d, z, z0;
280 
281 	z0 = disks - 3;		/* Highest data disk */
282 	p = dptr[z0+1];		/* XOR parity */
283 	q = dptr[z0+2];		/* RS syndrome */
284 
285 	kernel_fpu_begin();
286 
287 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
288 	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
289 	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
290 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
291 	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
292 	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
293 	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
294 	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
295 	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
296 	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
297 
298 	for (d = 0; d < bytes; d += 128) {
299 		for (z = z0; z >= 0; z--) {
300 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
301 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
302 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
303 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
304 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
305 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
306 			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
307 			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
308 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
309 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
310 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
311 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
312 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
313 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
314 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
315 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
316 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
317 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
318 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
319 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
320 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
321 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
322 			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
323 			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
324 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
325 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
326 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
327 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
328 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
329 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
330 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
331 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
332 		}
333 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
334 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
335 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
336 		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
337 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
338 		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
339 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
340 		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
341 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
342 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
343 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
344 		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
345 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
346 		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
347 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
348 		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
349 	}
350 
351 	asm volatile("sfence" : : : "memory");
352 	kernel_fpu_end();
353 }
354 
355 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
356 				     size_t bytes, void **ptrs)
357 {
358 	u8 **dptr = (u8 **)ptrs;
359 	u8 *p, *q;
360 	int d, z, z0;
361 
362 	z0 = stop;		/* P/Q right side optimization */
363 	p = dptr[disks-2];	/* XOR parity */
364 	q = dptr[disks-1];	/* RS syndrome */
365 
366 	kernel_fpu_begin();
367 
368 	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
369 
370 	for (d = 0 ; d < bytes ; d += 128) {
371 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
372 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
373 		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
374 		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
375 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
376 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
377 		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
378 		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
379 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
380 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
381 		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
382 		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
383 		/* P/Q data pages */
384 		for (z = z0-1 ; z >= start ; z--) {
385 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
386 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
387 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
388 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
389 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
390 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
391 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
392 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
393 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
394 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
395 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
396 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
397 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
398 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
399 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
400 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
401 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
402 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
403 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
404 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
405 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
406 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
407 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
408 			asm volatile("vmovdqa %0,%%ymm7"
409 				     :: "m" (dptr[z][d+32]));
410 			asm volatile("vmovdqa %0,%%ymm13"
411 				     :: "m" (dptr[z][d+64]));
412 			asm volatile("vmovdqa %0,%%ymm15"
413 				     :: "m" (dptr[z][d+96]));
414 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
415 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
416 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
417 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
418 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
419 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
420 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
421 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
422 		}
423 		asm volatile("prefetchnta %0" :: "m" (q[d]));
424 		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
425 		/* P/Q left side optimization */
426 		for (z = start-1 ; z >= 0 ; z--) {
427 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
428 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
429 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
430 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
431 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
432 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
433 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
434 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
435 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
436 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
437 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
438 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
439 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
440 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
441 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
442 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
443 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
444 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
445 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
446 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
447 		}
448 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
449 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
450 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
451 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
452 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
453 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
454 		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
455 		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
456 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
457 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
458 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
459 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
460 	}
461 	asm volatile("sfence" : : : "memory");
462 	kernel_fpu_end();
463 }
464 
465 const struct raid6_calls raid6_avx2x4 = {
466 	raid6_avx24_gen_syndrome,
467 	raid6_avx24_xor_syndrome,
468 	raid6_have_avx2,
469 	"avx2x4",
470 	1			/* Has cache hints */
471 };
472 #endif
473 
474 #endif /* CONFIG_AS_AVX2 */
475