xref: /linux/lib/raid/raid6/x86/avx2.c (revision 769d603fc44f896e7f61de7f0cdb8b78d46bc8c8)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- linux-c -*- ------------------------------------------------------- *
3  *
4  *   Copyright (C) 2012 Intel Corporation
5  *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
6  *
7  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
8  *
9  * ----------------------------------------------------------------------- */
10 
11 /*
12  * AVX2 implementation of RAID-6 syndrome functions
13  *
14  */
15 
16 #include <asm/cpufeature.h>
17 #include <asm/fpu/api.h>
18 #include "algos.h"
19 
20 static const struct raid6_avx2_constants {
21 	u64 x1d[4];
22 } raid6_avx2_constants __aligned(32) = {
23 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
24 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
25 };
26 
27 static int raid6_have_avx2(void)
28 {
29 	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
30 }
31 
32 /*
33  * Plain AVX2 implementation
34  */
35 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
36 {
37 	u8 **dptr = (u8 **)ptrs;
38 	u8 *p, *q;
39 	int d, z, z0;
40 
41 	z0 = disks - 3;		/* Highest data disk */
42 	p = dptr[z0+1];		/* XOR parity */
43 	q = dptr[z0+2];		/* RS syndrome */
44 
45 	kernel_fpu_begin();
46 
47 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
48 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
49 
50 	for (d = 0; d < bytes; d += 32) {
51 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
52 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
53 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
54 		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
55 		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
56 		for (z = z0-2; z >= 0; z--) {
57 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
58 			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
59 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
60 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
61 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
62 			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
63 			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
64 			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
65 		}
66 		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
67 		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
68 		asm volatile("vpand %ymm0,%ymm5,%ymm5");
69 		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
70 		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
71 		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
72 
73 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
74 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
75 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
76 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
77 	}
78 
79 	asm volatile("sfence" : : : "memory");
80 	kernel_fpu_end();
81 }
82 
83 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
84 				     size_t bytes, void **ptrs)
85 {
86 	u8 **dptr = (u8 **)ptrs;
87 	u8 *p, *q;
88 	int d, z, z0;
89 
90 	z0 = stop;		/* P/Q right side optimization */
91 	p = dptr[disks-2];	/* XOR parity */
92 	q = dptr[disks-1];	/* RS syndrome */
93 
94 	kernel_fpu_begin();
95 
96 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
97 
98 	for (d = 0 ; d < bytes ; d += 32) {
99 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
100 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
101 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
102 		/* P/Q data pages */
103 		for (z = z0-1 ; z >= start ; z--) {
104 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
105 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
106 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
107 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
108 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
109 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
110 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
111 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
112 		}
113 		/* P/Q left side optimization */
114 		for (z = start-1 ; z >= 0 ; z--) {
115 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
116 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
117 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
118 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
119 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
120 		}
121 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
122 		/* Don't use movntdq for r/w memory area < cache line */
123 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
124 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
125 	}
126 
127 	asm volatile("sfence" : : : "memory");
128 	kernel_fpu_end();
129 }
130 
131 const struct raid6_calls raid6_avx2x1 = {
132 	.gen_syndrome	= raid6_avx21_gen_syndrome,
133 	.xor_syndrome	= raid6_avx21_xor_syndrome,
134 	.valid		= raid6_have_avx2,
135 	.name		= "avx2x1",
136 	/* Prefer AVX2 over priority 1 (SSE2 and others) */
137 	.priority	= 2,
138 };
139 
140 /*
141  * Unrolled-by-2 AVX2 implementation
142  */
143 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
144 {
145 	u8 **dptr = (u8 **)ptrs;
146 	u8 *p, *q;
147 	int d, z, z0;
148 
149 	z0 = disks - 3;		/* Highest data disk */
150 	p = dptr[z0+1];		/* XOR parity */
151 	q = dptr[z0+2];		/* RS syndrome */
152 
153 	kernel_fpu_begin();
154 
155 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
156 	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
157 
158 	/* We uniformly assume a single prefetch covers at least 32 bytes */
159 	for (d = 0; d < bytes; d += 64) {
160 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
161 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
162 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
163 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
164 		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
165 		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
166 		for (z = z0-1; z >= 0; z--) {
167 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
168 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
169 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
170 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
171 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
172 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
173 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
174 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
175 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
176 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
177 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
178 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
179 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
180 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
181 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
182 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
183 		}
184 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
185 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
186 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
187 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
188 	}
189 
190 	asm volatile("sfence" : : : "memory");
191 	kernel_fpu_end();
192 }
193 
194 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
195 				     size_t bytes, void **ptrs)
196 {
197 	u8 **dptr = (u8 **)ptrs;
198 	u8 *p, *q;
199 	int d, z, z0;
200 
201 	z0 = stop;		/* P/Q right side optimization */
202 	p = dptr[disks-2];	/* XOR parity */
203 	q = dptr[disks-1];	/* RS syndrome */
204 
205 	kernel_fpu_begin();
206 
207 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
208 
209 	for (d = 0 ; d < bytes ; d += 64) {
210 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
211 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
212 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
213 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
214 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
215 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
216 		/* P/Q data pages */
217 		for (z = z0-1 ; z >= start ; z--) {
218 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
219 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
220 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
221 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
222 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
223 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
224 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
225 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
226 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
227 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
228 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
229 			asm volatile("vmovdqa %0,%%ymm7"
230 				     :: "m" (dptr[z][d+32]));
231 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
232 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
233 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
234 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
235 		}
236 		/* P/Q left side optimization */
237 		for (z = start-1 ; z >= 0 ; z--) {
238 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
239 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
240 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
241 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
242 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
243 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
244 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
245 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
246 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
247 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
248 		}
249 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
250 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
251 		/* Don't use movntdq for r/w memory area < cache line */
252 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
253 		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
254 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
255 		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
256 	}
257 
258 	asm volatile("sfence" : : : "memory");
259 	kernel_fpu_end();
260 }
261 
262 const struct raid6_calls raid6_avx2x2 = {
263 	.gen_syndrome	= raid6_avx22_gen_syndrome,
264 	.xor_syndrome	= raid6_avx22_xor_syndrome,
265 	.valid		= raid6_have_avx2,
266 	.name		= "avx2x2",
267 	/* Prefer AVX2 over priority 1 (SSE2 and others) */
268 	.priority	= 2,
269 };
270 
271 #ifdef CONFIG_X86_64
272 
273 /*
274  * Unrolled-by-4 AVX2 implementation
275  */
276 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
277 {
278 	u8 **dptr = (u8 **)ptrs;
279 	u8 *p, *q;
280 	int d, z, z0;
281 
282 	z0 = disks - 3;		/* Highest data disk */
283 	p = dptr[z0+1];		/* XOR parity */
284 	q = dptr[z0+2];		/* RS syndrome */
285 
286 	kernel_fpu_begin();
287 
288 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
289 	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
290 	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
291 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
292 	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
293 	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
294 	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
295 	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
296 	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
297 	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
298 
299 	for (d = 0; d < bytes; d += 128) {
300 		for (z = z0; z >= 0; z--) {
301 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
302 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
303 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
304 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
305 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
306 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
307 			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
308 			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
309 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
310 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
311 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
312 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
313 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
314 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
315 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
316 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
317 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
318 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
319 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
320 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
321 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
322 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
323 			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
324 			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
325 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
326 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
327 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
328 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
329 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
330 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
331 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
332 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
333 		}
334 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
335 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
336 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
337 		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
338 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
339 		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
340 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
341 		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
342 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
343 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
344 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
345 		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
346 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
347 		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
348 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
349 		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
350 	}
351 
352 	asm volatile("sfence" : : : "memory");
353 	kernel_fpu_end();
354 }
355 
356 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
357 				     size_t bytes, void **ptrs)
358 {
359 	u8 **dptr = (u8 **)ptrs;
360 	u8 *p, *q;
361 	int d, z, z0;
362 
363 	z0 = stop;		/* P/Q right side optimization */
364 	p = dptr[disks-2];	/* XOR parity */
365 	q = dptr[disks-1];	/* RS syndrome */
366 
367 	kernel_fpu_begin();
368 
369 	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
370 
371 	for (d = 0 ; d < bytes ; d += 128) {
372 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
373 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
374 		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
375 		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
376 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
377 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
378 		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
379 		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
380 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
381 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
382 		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
383 		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
384 		/* P/Q data pages */
385 		for (z = z0-1 ; z >= start ; z--) {
386 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
387 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
388 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
389 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
390 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
391 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
392 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
393 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
394 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
395 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
396 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
397 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
398 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
399 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
400 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
401 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
402 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
403 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
404 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
405 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
406 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
407 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
408 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
409 			asm volatile("vmovdqa %0,%%ymm7"
410 				     :: "m" (dptr[z][d+32]));
411 			asm volatile("vmovdqa %0,%%ymm13"
412 				     :: "m" (dptr[z][d+64]));
413 			asm volatile("vmovdqa %0,%%ymm15"
414 				     :: "m" (dptr[z][d+96]));
415 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
416 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
417 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
418 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
419 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
420 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
421 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
422 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
423 		}
424 		asm volatile("prefetchnta %0" :: "m" (q[d]));
425 		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
426 		/* P/Q left side optimization */
427 		for (z = start-1 ; z >= 0 ; z--) {
428 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
429 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
430 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
431 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
432 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
433 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
434 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
435 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
436 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
437 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
438 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
439 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
440 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
441 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
442 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
443 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
444 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
445 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
446 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
447 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
448 		}
449 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
450 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
451 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
452 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
453 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
454 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
455 		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
456 		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
457 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
458 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
459 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
460 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
461 	}
462 	asm volatile("sfence" : : : "memory");
463 	kernel_fpu_end();
464 }
465 
466 const struct raid6_calls raid6_avx2x4 = {
467 	.gen_syndrome	= raid6_avx24_gen_syndrome,
468 	.xor_syndrome	= raid6_avx24_xor_syndrome,
469 	.valid		= raid6_have_avx2,
470 	.name		= "avx2x4",
471 	/* Prefer AVX2 over priority 1 (SSE2 and others) */
472 	.priority	= 2,
473 };
474 #endif /* CONFIG_X86_64 */
475