xref: /linux/lib/raid/raid6/x86/avx2.c (revision 7e91f76a96686b3341c96e1e7f3e86c0f51e2cff)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- linux-c -*- ------------------------------------------------------- *
3  *
4  *   Copyright (C) 2012 Intel Corporation
5  *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
6  *
7  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
8  *
9  * ----------------------------------------------------------------------- */
10 
11 /*
12  * AVX2 implementation of RAID-6 syndrome functions
13  *
14  */
15 
16 #include <linux/raid/pq.h>
17 #include <asm/fpu/api.h>
18 
19 static const struct raid6_avx2_constants {
20 	u64 x1d[4];
21 } raid6_avx2_constants __aligned(32) = {
22 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
23 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
24 };
25 
26 static int raid6_have_avx2(void)
27 {
28 	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
29 }
30 
31 /*
32  * Plain AVX2 implementation
33  */
34 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
35 {
36 	u8 **dptr = (u8 **)ptrs;
37 	u8 *p, *q;
38 	int d, z, z0;
39 
40 	z0 = disks - 3;		/* Highest data disk */
41 	p = dptr[z0+1];		/* XOR parity */
42 	q = dptr[z0+2];		/* RS syndrome */
43 
44 	kernel_fpu_begin();
45 
46 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
47 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
48 
49 	for (d = 0; d < bytes; d += 32) {
50 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
51 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
52 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
53 		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
54 		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
55 		for (z = z0-2; z >= 0; z--) {
56 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
57 			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
58 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
59 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
60 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
61 			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
62 			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
63 			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
64 		}
65 		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
66 		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
67 		asm volatile("vpand %ymm0,%ymm5,%ymm5");
68 		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
69 		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
70 		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
71 
72 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
73 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
74 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
75 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
76 	}
77 
78 	asm volatile("sfence" : : : "memory");
79 	kernel_fpu_end();
80 }
81 
82 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
83 				     size_t bytes, void **ptrs)
84 {
85 	u8 **dptr = (u8 **)ptrs;
86 	u8 *p, *q;
87 	int d, z, z0;
88 
89 	z0 = stop;		/* P/Q right side optimization */
90 	p = dptr[disks-2];	/* XOR parity */
91 	q = dptr[disks-1];	/* RS syndrome */
92 
93 	kernel_fpu_begin();
94 
95 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
96 
97 	for (d = 0 ; d < bytes ; d += 32) {
98 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
99 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
100 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
101 		/* P/Q data pages */
102 		for (z = z0-1 ; z >= start ; z--) {
103 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
104 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
105 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
106 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
107 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
108 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
109 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
110 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
111 		}
112 		/* P/Q left side optimization */
113 		for (z = start-1 ; z >= 0 ; z--) {
114 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
115 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
116 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
117 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
118 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
119 		}
120 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
121 		/* Don't use movntdq for r/w memory area < cache line */
122 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
123 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
124 	}
125 
126 	asm volatile("sfence" : : : "memory");
127 	kernel_fpu_end();
128 }
129 
130 const struct raid6_calls raid6_avx2x1 = {
131 	.gen_syndrome	= raid6_avx21_gen_syndrome,
132 	.xor_syndrome	= raid6_avx21_xor_syndrome,
133 	.valid		= raid6_have_avx2,
134 	.name		= "avx2x1",
135 	/* Prefer AVX2 over priority 1 (SSE2 and others) */
136 	.priority	= 2,
137 };
138 
139 /*
140  * Unrolled-by-2 AVX2 implementation
141  */
142 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
143 {
144 	u8 **dptr = (u8 **)ptrs;
145 	u8 *p, *q;
146 	int d, z, z0;
147 
148 	z0 = disks - 3;		/* Highest data disk */
149 	p = dptr[z0+1];		/* XOR parity */
150 	q = dptr[z0+2];		/* RS syndrome */
151 
152 	kernel_fpu_begin();
153 
154 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
155 	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
156 
157 	/* We uniformly assume a single prefetch covers at least 32 bytes */
158 	for (d = 0; d < bytes; d += 64) {
159 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
160 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
161 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
162 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
163 		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
164 		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
165 		for (z = z0-1; z >= 0; z--) {
166 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
167 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
168 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
169 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
170 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
171 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
172 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
173 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
174 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
175 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
176 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
177 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
178 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
179 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
180 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
181 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
182 		}
183 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
184 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
185 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
186 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
187 	}
188 
189 	asm volatile("sfence" : : : "memory");
190 	kernel_fpu_end();
191 }
192 
193 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
194 				     size_t bytes, void **ptrs)
195 {
196 	u8 **dptr = (u8 **)ptrs;
197 	u8 *p, *q;
198 	int d, z, z0;
199 
200 	z0 = stop;		/* P/Q right side optimization */
201 	p = dptr[disks-2];	/* XOR parity */
202 	q = dptr[disks-1];	/* RS syndrome */
203 
204 	kernel_fpu_begin();
205 
206 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
207 
208 	for (d = 0 ; d < bytes ; d += 64) {
209 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
210 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
211 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
212 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
213 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
214 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
215 		/* P/Q data pages */
216 		for (z = z0-1 ; z >= start ; z--) {
217 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
218 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
219 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
220 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
221 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
222 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
223 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
224 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
225 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
226 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
227 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
228 			asm volatile("vmovdqa %0,%%ymm7"
229 				     :: "m" (dptr[z][d+32]));
230 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
231 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
232 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
233 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
234 		}
235 		/* P/Q left side optimization */
236 		for (z = start-1 ; z >= 0 ; z--) {
237 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
238 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
239 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
240 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
241 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
242 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
243 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
244 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
245 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
246 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
247 		}
248 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
249 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
250 		/* Don't use movntdq for r/w memory area < cache line */
251 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
252 		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
253 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
254 		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
255 	}
256 
257 	asm volatile("sfence" : : : "memory");
258 	kernel_fpu_end();
259 }
260 
261 const struct raid6_calls raid6_avx2x2 = {
262 	.gen_syndrome	= raid6_avx22_gen_syndrome,
263 	.xor_syndrome	= raid6_avx22_xor_syndrome,
264 	.valid		= raid6_have_avx2,
265 	.name		= "avx2x2",
266 	/* Prefer AVX2 over priority 1 (SSE2 and others) */
267 	.priority	= 2,
268 };
269 
270 #ifdef CONFIG_X86_64
271 
272 /*
273  * Unrolled-by-4 AVX2 implementation
274  */
275 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
276 {
277 	u8 **dptr = (u8 **)ptrs;
278 	u8 *p, *q;
279 	int d, z, z0;
280 
281 	z0 = disks - 3;		/* Highest data disk */
282 	p = dptr[z0+1];		/* XOR parity */
283 	q = dptr[z0+2];		/* RS syndrome */
284 
285 	kernel_fpu_begin();
286 
287 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
288 	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
289 	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
290 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
291 	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
292 	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
293 	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
294 	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
295 	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
296 	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
297 
298 	for (d = 0; d < bytes; d += 128) {
299 		for (z = z0; z >= 0; z--) {
300 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
301 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
302 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
303 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
304 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
305 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
306 			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
307 			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
308 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
309 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
310 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
311 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
312 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
313 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
314 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
315 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
316 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
317 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
318 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
319 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
320 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
321 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
322 			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
323 			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
324 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
325 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
326 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
327 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
328 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
329 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
330 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
331 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
332 		}
333 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
334 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
335 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
336 		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
337 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
338 		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
339 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
340 		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
341 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
342 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
343 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
344 		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
345 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
346 		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
347 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
348 		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
349 	}
350 
351 	asm volatile("sfence" : : : "memory");
352 	kernel_fpu_end();
353 }
354 
355 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
356 				     size_t bytes, void **ptrs)
357 {
358 	u8 **dptr = (u8 **)ptrs;
359 	u8 *p, *q;
360 	int d, z, z0;
361 
362 	z0 = stop;		/* P/Q right side optimization */
363 	p = dptr[disks-2];	/* XOR parity */
364 	q = dptr[disks-1];	/* RS syndrome */
365 
366 	kernel_fpu_begin();
367 
368 	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
369 
370 	for (d = 0 ; d < bytes ; d += 128) {
371 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
372 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
373 		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
374 		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
375 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
376 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
377 		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
378 		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
379 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
380 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
381 		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
382 		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
383 		/* P/Q data pages */
384 		for (z = z0-1 ; z >= start ; z--) {
385 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
386 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
387 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
388 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
389 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
390 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
391 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
392 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
393 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
394 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
395 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
396 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
397 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
398 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
399 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
400 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
401 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
402 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
403 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
404 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
405 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
406 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
407 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
408 			asm volatile("vmovdqa %0,%%ymm7"
409 				     :: "m" (dptr[z][d+32]));
410 			asm volatile("vmovdqa %0,%%ymm13"
411 				     :: "m" (dptr[z][d+64]));
412 			asm volatile("vmovdqa %0,%%ymm15"
413 				     :: "m" (dptr[z][d+96]));
414 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
415 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
416 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
417 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
418 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
419 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
420 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
421 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
422 		}
423 		asm volatile("prefetchnta %0" :: "m" (q[d]));
424 		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
425 		/* P/Q left side optimization */
426 		for (z = start-1 ; z >= 0 ; z--) {
427 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
428 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
429 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
430 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
431 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
432 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
433 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
434 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
435 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
436 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
437 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
438 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
439 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
440 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
441 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
442 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
443 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
444 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
445 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
446 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
447 		}
448 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
449 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
450 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
451 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
452 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
453 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
454 		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
455 		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
456 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
457 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
458 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
459 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
460 	}
461 	asm volatile("sfence" : : : "memory");
462 	kernel_fpu_end();
463 }
464 
465 const struct raid6_calls raid6_avx2x4 = {
466 	.gen_syndrome	= raid6_avx24_gen_syndrome,
467 	.xor_syndrome	= raid6_avx24_xor_syndrome,
468 	.valid		= raid6_have_avx2,
469 	.name		= "avx2x4",
470 	/* Prefer AVX2 over priority 1 (SSE2 and others) */
471 	.priority	= 2,
472 };
473 #endif /* CONFIG_X86_64 */
474