xref: /linux/lib/raid/raid6/x86/avx2.c (revision adfcf6e89bc1322c4a6cc37ad32e411cf0500622)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- linux-c -*- ------------------------------------------------------- *
3  *
4  *   Copyright (C) 2012 Intel Corporation
5  *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
6  *
7  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
8  *
9  * ----------------------------------------------------------------------- */
10 
11 /*
12  * AVX2 implementation of RAID-6 syndrome functions
13  *
14  */
15 
16 #include <asm/cpufeature.h>
17 #include <asm/fpu/api.h>
18 #include "algos.h"
19 
/*
 * 32 bytes of 0x1d, declared __aligned(32) so the whole pattern can be
 * fetched with one aligned 256-bit load (vmovdqa).  The syndrome routines
 * in this file load it into %ymm0 and AND it with a per-byte 0x00/0xff
 * mask, so that 0x1d is XORed only into selected bytes.
 *
 * The asm operands name only x1d[0], but the load reads all four words;
 * keep the array contiguous and 32 bytes long.
 */
20 static const struct raid6_avx2_constants {
21 	u64 x1d[4];
22 } raid6_avx2_constants __aligned(32) = {
23 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
24 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
25 };
26 
27 /*
28  * Plain AVX2 implementation
29  */
/*
 * raid6_avx21_gen_syndrome - compute P and Q from scratch, 32 bytes per step
 * @disks: number of entries in @ptrs
 * @bytes: number of bytes to process in each block
 * @ptrs:  block pointers: ptrs[0..disks-3] are data blocks, ptrs[disks-2]
 *         receives P and ptrs[disks-1] receives Q
 *
 * For each 32-byte column, P is the XOR of all data blocks.  Q is
 * accumulated from the highest data block down as Q = 2*Q ^ D[z], where
 * "2*Q" doubles every byte and XORs 0x1d into the bytes whose top bit was
 * set (multiplication by 2 in GF(2^8) with the 0x1d reduction constant).
 *
 * Register roles: ymm0 = 0x1d pattern, ymm3 = zero, ymm2 = P, ymm4 = Q,
 * ymm5 = mask temporary, ymm6 = data block loaded one step ahead.  The
 * registers carry state from one asm statement to the next, so the order
 * of the statements must not be changed.
 *
 * Each iteration processes a full 32 bytes with vmovdqa loads and vmovntdq
 * stores, so @bytes is expected to be a multiple of 32 and all buffers
 * 32-byte aligned.  dptr[z0-1] is read unconditionally, so at least two
 * data blocks (disks >= 4) are needed.  All ymm use sits between
 * kernel_fpu_begin() and kernel_fpu_end().
 */
30 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
31 {
32 	u8 **dptr = (u8 **)ptrs;
33 	u8 *p, *q;
34 	int d, z, z0;
35 
36 	z0 = disks - 3;		/* Highest data disk */
37 	p = dptr[z0+1];		/* XOR parity */
38 	q = dptr[z0+2];		/* RS syndrome */
39 
40 	kernel_fpu_begin();
41 
	/* ymm0 = 0x1d in every byte; stays loaded for the whole function */
42 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
43 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
44 
	/* NOTE(review): d is int while bytes is size_t; the compare is unsigned */
45 	for (d = 0; d < bytes; d += 32) {
		/*
		 * Seed P and Q with the highest data block and preload the
		 * next lower block into ymm6.
		 */
46 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
47 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
48 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
49 		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
50 		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
51 		for (z = z0-2; z >= 0; z--) {
			/*
			 * Q = 2*Q: ymm5 becomes 0xff in each byte where Q has
			 * its top bit set (signed 0 > Q), Q is doubled
			 * bytewise, and 0x1d is XORed into the flagged bytes.
			 * Then the block held in ymm6 is folded into P and Q
			 * and block z is loaded for the next round.
			 */
52 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
53 			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
54 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
55 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
56 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
57 			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
58 			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
59 			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
60 		}
		/* Final round for the block still pending in ymm6 (data block 0) */
61 		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
62 		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
63 		asm volatile("vpand %ymm0,%ymm5,%ymm5");
64 		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
65 		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
66 		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
67 
		/*
		 * Non-temporal stores of the finished column.  ymm2 and ymm4
		 * are overwritten at the top of the next iteration, so the
		 * clears below do not influence the result.
		 */
68 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
69 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
70 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
71 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
72 	}
73 
	/* Store fence after the non-temporal stores; also a compiler barrier */
74 	asm volatile("sfence" : : : "memory");
75 	kernel_fpu_end();
76 }
77 
/*
 * raid6_avx21_xor_syndrome - fold data blocks start..stop into existing P/Q
 * @disks: number of entries in @ptrs; P is ptrs[disks-2], Q is ptrs[disks-1]
 * @start: lowest data block index whose contents are read
 * @stop:  highest data block index whose contents are read
 * @bytes: number of bytes to process in each block, in 32-byte steps
 * @ptrs:  block pointers
 *
 * P and Q are read, updated and written back in place:
 *   P ^= D[stop] ^ ... ^ D[start]
 *   Q ^= the Q contribution of D[start..stop]
 * The Q contribution is seeded with D[stop], then for each lower block down
 * to @start it is doubled and XORed with that block, and finally doubled
 * @start more times without adding data.  "Doubling" means: double every
 * byte and XOR 0x1d into the bytes whose top bit was set.
 *
 * Register roles: ymm0 = 0x1d pattern, ymm2 = P, ymm4 = Q contribution,
 * ymm5 = mask/data temporary.  Register contents are carried between the
 * asm statements, so their order must not be changed.  All accesses use
 * vmovdqa, so buffers are expected to be 32-byte aligned and @bytes a
 * multiple of 32.
 */
78 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
79 				     size_t bytes, void **ptrs)
80 {
81 	u8 **dptr = (u8 **)ptrs;
82 	u8 *p, *q;
83 	int d, z, z0;
84 
85 	z0 = stop;		/* P/Q right side optimization */
86 	p = dptr[disks-2];	/* XOR parity */
87 	q = dptr[disks-1];	/* RS syndrome */
88 
89 	kernel_fpu_begin();
90 
	/* ymm0 = 0x1d in every byte */
91 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
92 
93 	for (d = 0 ; d < bytes ; d += 32) {
		/* Q contribution starts as D[stop]; P = old P ^ D[stop] */
94 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
95 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
96 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
97 		/* P/Q data pages */
98 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * ymm5 = 0, then 0xff where the Q byte has its top bit
			 * set; double Q and XOR 0x1d into the flagged bytes.
			 * ymm5 is then reused for block z, which is XORed
			 * into both P and Q.
			 */
99 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
100 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
101 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
102 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
103 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
104 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
105 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
106 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
107 		}
108 		/* P/Q left side optimization */
		/* Blocks below start are not read: only the doubling is applied */
109 		for (z = start-1 ; z >= 0 ; z--) {
110 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
111 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
112 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
113 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
114 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
115 		}
		/* Merge the contribution into the Q already in memory */
116 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
117 		/* Don't use movntdq for r/w memory area < cache line */
118 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
119 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
120 	}
121 
	/* Store fence plus compiler memory barrier before leaving the FPU section */
122 	asm volatile("sfence" : : : "memory");
123 	kernel_fpu_end();
124 }
125 
/*
 * Method table for the plain (32 bytes per step) AVX2 routines, published
 * under the name "avx2x1".  struct raid6_calls is declared outside this
 * file (presumably in algos.h - confirm).
 */
126 const struct raid6_calls raid6_avx2x1 = {
127 	.gen_syndrome	= raid6_avx21_gen_syndrome,
128 	.xor_syndrome	= raid6_avx21_xor_syndrome,
129 	.name		= "avx2x1",
130 };
131 
132 /*
133  * Unrolled-by-2 AVX2 implementation
134  */
/*
 * raid6_avx22_gen_syndrome - compute P and Q from scratch, 64 bytes per step
 * @disks: number of entries in @ptrs
 * @bytes: number of bytes to process in each block
 * @ptrs:  block pointers: ptrs[0..disks-3] are data blocks, ptrs[disks-2]
 *         receives P and ptrs[disks-1] receives Q
 *
 * Two independent 32-byte lanes (offsets d and d+32) are processed per
 * iteration.  In each lane P is the XOR of all data blocks and Q is
 * accumulated from the highest data block down as Q = 2*Q ^ D[z], where
 * "2*Q" doubles every byte and XORs 0x1d into bytes whose top bit was set.
 * With a single data block the inner loop never runs and P = Q = that block.
 *
 * Register roles: ymm0 = 0x1d pattern, ymm1 = zero, ymm2/ymm3 = P lanes,
 * ymm4/ymm6 = Q lanes, ymm5/ymm7 = temporaries (mask, then data).  Register
 * contents are carried between asm statements; do not reorder them.
 *
 * vmovdqa/vmovntdq are used throughout, so buffers are expected to be
 * 32-byte aligned and @bytes a multiple of 64.
 */
135 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
136 {
137 	u8 **dptr = (u8 **)ptrs;
138 	u8 *p, *q;
139 	int d, z, z0;
140 
141 	z0 = disks - 3;		/* Highest data disk */
142 	p = dptr[z0+1];		/* XOR parity */
143 	q = dptr[z0+2];		/* RS syndrome */
144 
145 	kernel_fpu_begin();
146 
	/* ymm0 = 0x1d in every byte */
147 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
148 	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
149 
150 	/* We uniformly assume a single prefetch covers at least 32 bytes */
151 	for (d = 0; d < bytes; d += 64) {
		/* Seed both P lanes and both Q lanes with the highest data block */
152 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
153 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
154 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
155 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
156 		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
157 		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
158 		for (z = z0-1; z >= 0; z--) {
			/*
			 * Per lane: mask = 0xff where Q's top bit is set
			 * (signed 0 > Q), Q is doubled bytewise, 0x1d is XORed
			 * into the flagged bytes, then block z is loaded and
			 * XORed into both P and Q.
			 */
159 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
160 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
161 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
162 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
163 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
164 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
165 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
166 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
167 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
168 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
169 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
170 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
171 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
172 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
173 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
174 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
175 		}
		/* Non-temporal stores of the finished 64 bytes of P and Q */
176 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
177 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
178 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
179 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
180 	}
181 
	/* Store fence after the non-temporal stores; also a compiler barrier */
182 	asm volatile("sfence" : : : "memory");
183 	kernel_fpu_end();
184 }
185 
/*
 * raid6_avx22_xor_syndrome - fold data blocks start..stop into existing P/Q
 * @disks: number of entries in @ptrs; P is ptrs[disks-2], Q is ptrs[disks-1]
 * @start: lowest data block index whose contents are read
 * @stop:  highest data block index whose contents are read
 * @bytes: number of bytes to process in each block, in 64-byte steps
 * @ptrs:  block pointers
 *
 * Two 32-byte lanes (offsets d and d+32) are handled per iteration.  In each
 * lane P is XORed with D[start..stop], and a Q contribution is built by
 * seeding with D[stop], doubling and XORing in each lower block down to
 * @start, then doubling @start more times without data; the result is XORed
 * into the Q already in memory.  "Doubling" doubles every byte and XORs
 * 0x1d into the bytes whose top bit was set.
 *
 * Register roles: ymm0 = 0x1d pattern, ymm2/ymm3 = P lanes, ymm4/ymm6 = Q
 * lanes, ymm5/ymm7 = temporaries.  Register contents are carried between
 * asm statements; do not reorder them.  All accesses use vmovdqa, so
 * buffers are expected to be 32-byte aligned and @bytes a multiple of 64.
 */
186 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
187 				     size_t bytes, void **ptrs)
188 {
189 	u8 **dptr = (u8 **)ptrs;
190 	u8 *p, *q;
191 	int d, z, z0;
192 
193 	z0 = stop;		/* P/Q right side optimization */
194 	p = dptr[disks-2];	/* XOR parity */
195 	q = dptr[disks-1];	/* RS syndrome */
196 
197 	kernel_fpu_begin();
198 
	/* ymm0 = 0x1d in every byte */
199 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
200 
201 	for (d = 0 ; d < bytes ; d += 64) {
		/* Q lanes start as D[stop]; P lanes = old P ^ D[stop] */
202 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
203 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
204 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
205 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
206 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
207 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
208 		/* P/Q data pages */
209 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * Per lane: clear the temporary, set it to 0xff where
			 * Q's top bit is set, double Q and XOR 0x1d into the
			 * flagged bytes; then reuse the temporary for block z
			 * and XOR it into P and Q.
			 */
210 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
211 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
212 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
213 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
214 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
215 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
216 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
217 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
218 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
219 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
220 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
221 			asm volatile("vmovdqa %0,%%ymm7"
222 				     :: "m" (dptr[z][d+32]));
223 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
224 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
225 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
226 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
227 		}
228 		/* P/Q left side optimization */
		/* Blocks below start are not read: only the doubling is applied */
229 		for (z = start-1 ; z >= 0 ; z--) {
230 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
231 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
232 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
233 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
234 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
235 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
236 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
237 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
238 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
239 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
240 		}
		/* Merge the contribution into the Q already in memory */
241 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
242 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
243 		/* Don't use movntdq for r/w memory area < cache line */
244 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
245 		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
246 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
247 		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
248 	}
249 
	/* Store fence plus compiler memory barrier before leaving the FPU section */
250 	asm volatile("sfence" : : : "memory");
251 	kernel_fpu_end();
252 }
253 
/*
 * Method table for the unrolled-by-2 (64 bytes per step) AVX2 routines,
 * published under the name "avx2x2".  struct raid6_calls is declared
 * outside this file (presumably in algos.h - confirm).
 */
254 const struct raid6_calls raid6_avx2x2 = {
255 	.gen_syndrome	= raid6_avx22_gen_syndrome,
256 	.xor_syndrome	= raid6_avx22_xor_syndrome,
257 	.name		= "avx2x2",
258 };
259 
260 #ifdef CONFIG_X86_64
261 
262 /*
263  * Unrolled-by-4 AVX2 implementation
264  */
/*
 * raid6_avx24_gen_syndrome - compute P and Q from scratch, 128 bytes per step
 * @disks: number of entries in @ptrs
 * @bytes: number of bytes to process in each block
 * @ptrs:  block pointers: ptrs[0..disks-3] are data blocks, ptrs[disks-2]
 *         receives P and ptrs[disks-1] receives Q
 *
 * Four independent 32-byte lanes (offsets d, d+32, d+64, d+96) are processed
 * per iteration.  The P and Q accumulators start at zero and the inner loop
 * visits every data block from the highest down to block 0, doing
 * Q = 2*Q ^ D[z] and P ^= D[z]; the first doubling therefore acts on zero.
 * "2*Q" doubles every byte and XORs 0x1d into bytes whose top bit was set.
 * After each store the accumulators are cleared again for the next column.
 *
 * Register roles: ymm0 = 0x1d pattern, ymm1 = zero,
 * P lanes = ymm2, ymm3, ymm10, ymm11; Q lanes = ymm4, ymm6, ymm12, ymm14;
 * temporaries (mask, then data) = ymm5, ymm7, ymm13, ymm15.
 * Register contents are carried between asm statements; do not reorder
 * them.  This routine uses ymm10-ymm15 and is built only under
 * CONFIG_X86_64.
 *
 * vmovdqa/vmovntdq are used throughout, so buffers are expected to be
 * 32-byte aligned and @bytes a multiple of 128.
 */
265 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
266 {
267 	u8 **dptr = (u8 **)ptrs;
268 	u8 *p, *q;
269 	int d, z, z0;
270 
271 	z0 = disks - 3;		/* Highest data disk */
272 	p = dptr[z0+1];		/* XOR parity */
273 	q = dptr[z0+2];		/* RS syndrome */
274 
275 	kernel_fpu_begin();
276 
	/* ymm0 = 0x1d in every byte; all accumulators start at zero */
277 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
278 	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
279 	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
280 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
281 	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
282 	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
283 	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
284 	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
285 	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
286 	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
287 
288 	for (d = 0; d < bytes; d += 128) {
289 		for (z = z0; z >= 0; z--) {
			/*
			 * Per lane: mask = 0xff where Q's top bit is set
			 * (signed 0 > Q), Q is doubled bytewise, 0x1d is XORed
			 * into the flagged bytes, then block z is loaded and
			 * XORed into both P and Q.
			 */
290 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
291 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
292 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
293 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
294 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
295 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
296 			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
297 			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
298 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
299 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
300 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
301 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
302 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
303 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
304 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
305 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
306 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
307 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
308 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
309 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
310 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
311 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
312 			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
313 			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
314 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
315 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
316 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
317 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
318 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
319 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
320 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
321 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
322 		}
		/*
		 * Non-temporal store of each finished lane, followed by a
		 * clear: unlike a reload, the next column relies on the
		 * accumulators being zero when the inner loop starts.
		 */
323 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
324 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
325 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
326 		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
327 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
328 		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
329 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
330 		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
331 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
332 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
333 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
334 		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
335 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
336 		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
337 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
338 		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
339 	}
340 
	/* Store fence after the non-temporal stores; also a compiler barrier */
341 	asm volatile("sfence" : : : "memory");
342 	kernel_fpu_end();
343 }
344 
/*
 * raid6_avx24_xor_syndrome - fold data blocks start..stop into existing P/Q
 * @disks: number of entries in @ptrs; P is ptrs[disks-2], Q is ptrs[disks-1]
 * @start: lowest data block index whose contents are read
 * @stop:  highest data block index whose contents are read
 * @bytes: number of bytes to process in each block, in 128-byte steps
 * @ptrs:  block pointers
 *
 * Four 32-byte lanes (offsets d, d+32, d+64, d+96) are handled per
 * iteration.  In each lane P is XORed with D[start..stop], and a Q
 * contribution is built by seeding with D[stop], doubling and XORing in each
 * lower block down to @start, then doubling @start more times without data;
 * the result is XORed into the Q already in memory.  "Doubling" doubles
 * every byte and XORs 0x1d into the bytes whose top bit was set.
 *
 * Register roles: ymm0 = 0x1d pattern,
 * P lanes = ymm2, ymm3, ymm10, ymm11; Q lanes = ymm4, ymm6, ymm12, ymm14;
 * temporaries = ymm5, ymm7, ymm13, ymm15.
 * Register contents are carried between asm statements; do not reorder
 * them.  This routine uses ymm10-ymm15 and is built only under
 * CONFIG_X86_64.
 *
 * Loads use vmovdqa and stores use vmovntdq, so buffers are expected to be
 * 32-byte aligned and @bytes a multiple of 128.
 */
345 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
346 				     size_t bytes, void **ptrs)
347 {
348 	u8 **dptr = (u8 **)ptrs;
349 	u8 *p, *q;
350 	int d, z, z0;
351 
352 	z0 = stop;		/* P/Q right side optimization */
353 	p = dptr[disks-2];	/* XOR parity */
354 	q = dptr[disks-1];	/* RS syndrome */
355 
356 	kernel_fpu_begin();
357 
	/* ymm0 = 0x1d in every byte */
358 	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
359 
360 	for (d = 0 ; d < bytes ; d += 128) {
		/* Q lanes start as D[stop]; P lanes = old P ^ D[stop] */
361 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
362 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
363 		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
364 		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
365 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
366 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
367 		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
368 		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
369 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
370 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
371 		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
372 		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
373 		/* P/Q data pages */
374 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * NOTE(review): only offsets d and d+64 are prefetched
			 * here, i.e. one prefetch per 64 bytes rather than per
			 * 32-byte lane.
			 *
			 * Per lane: clear the temporary, set it to 0xff where
			 * Q's top bit is set, double Q and XOR 0x1d into the
			 * flagged bytes; then reuse the temporary for block z
			 * and XOR it into P and Q.
			 */
375 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
376 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
377 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
378 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
379 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
380 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
381 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
382 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
383 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
384 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
385 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
386 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
387 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
388 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
389 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
390 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
391 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
392 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
393 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
394 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
395 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
396 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
397 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
398 			asm volatile("vmovdqa %0,%%ymm7"
399 				     :: "m" (dptr[z][d+32]));
400 			asm volatile("vmovdqa %0,%%ymm13"
401 				     :: "m" (dptr[z][d+64]));
402 			asm volatile("vmovdqa %0,%%ymm15"
403 				     :: "m" (dptr[z][d+96]));
404 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
405 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
406 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
407 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
408 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
409 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
410 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
411 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
412 		}
		/* Start fetching the existing Q, which is read further below */
413 		asm volatile("prefetchnta %0" :: "m" (q[d]));
414 		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
415 		/* P/Q left side optimization */
		/* Blocks below start are not read: only the doubling is applied */
416 		for (z = start-1 ; z >= 0 ; z--) {
417 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
418 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
419 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
420 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
421 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
422 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
423 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
424 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
425 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
426 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
427 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
428 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
429 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
430 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
431 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
432 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
433 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
434 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
435 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
436 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
437 		}
		/*
		 * P is written back first; the Q contribution is then XORed
		 * with the Q in memory and written back.  Both use
		 * non-temporal stores over 128 contiguous bytes
		 * (NOTE(review): presumably acceptable here because a full
		 * iteration covers whole cache lines - confirm).
		 */
438 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
439 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
440 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
441 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
442 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
443 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
444 		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
445 		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
446 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
447 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
448 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
449 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
450 	}
	/* Store fence after the non-temporal stores; also a compiler barrier */
451 	asm volatile("sfence" : : : "memory");
452 	kernel_fpu_end();
453 }
454 
/*
 * Method table for the unrolled-by-4 (128 bytes per step) AVX2 routines,
 * published under the name "avx2x4".  Only built under CONFIG_X86_64.
 * struct raid6_calls is declared outside this file (presumably in
 * algos.h - confirm).
 */
455 const struct raid6_calls raid6_avx2x4 = {
456 	.gen_syndrome	= raid6_avx24_gen_syndrome,
457 	.xor_syndrome	= raid6_avx24_xor_syndrome,
458 	.name		= "avx2x4",
459 };
460 #endif /* CONFIG_X86_64 */
461