xref: /linux/lib/raid/raid6/x86/avx2.c (revision 30bf04bd13a58cd9b877589569aa0abd06f04e52)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2012 Intel Corporation
4  * Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
5  *
6  * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
7  *
8  * AVX2 implementation of RAID-6 syndrome functions
9  */
10 
11 #include <asm/cpufeature.h>
12 #include <asm/fpu/api.h>
13 #include "algos.h"
14 
/*
 * 32-byte constant with every byte set to 0x1d.  Each routine in this file
 * loads it into %ymm0 with an aligned vmovdqa (hence __aligned(32)) and ANDs
 * it with a per-byte mask before XORing the result into a Q accumulator.
 * NOTE(review): 0x1d looks like the low byte of the RAID-6 GF(2^8)
 * polynomial 0x11d - confirm against the RAID-6 math reference.
 */
15 static const struct raid6_avx2_constants {
16 	u64 x1d[4];
17 } raid6_avx2_constants __aligned(32) = {
18 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
19 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
20 };
21 
22 /*
23  * Plain AVX2 implementation
24  */
/*
 * raid6_avx21_gen_syndrome - compute P and Q from scratch, 32 bytes per pass
 * @disks: number of pointers in @ptrs; the last two are P and Q
 * @bytes: number of bytes to process in every buffer
 * @ptrs:  ptrs[0..disks-3] are the data buffers, ptrs[disks-2] is P (XOR
 *         parity) and ptrs[disks-1] is Q (RS syndrome)
 *
 * The data disks are walked from the highest index down to 0.  P is the
 * running XOR of all data blocks; Q is doubled per byte (the
 * vpcmpgtb/vpaddb/vpand/vpxor group) before each further data block is
 * XORed into it.
 *
 * The loop advances in whole 32-byte steps without tail handling and uses
 * the aligned load/store forms, so @bytes is presumably a multiple of 32 and
 * every buffer 32-byte aligned - TODO confirm against the callers.
 * NOTE(review): dptr[z0-1] is read unconditionally, so this presumes at
 * least two data disks (disks >= 4) - confirm.
 */
25 static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
26 {
27 	u8 **dptr = (u8 **)ptrs;
28 	u8 *p, *q;
29 	int d, z, z0;
30 
31 	z0 = disks - 3;		/* Highest data disk */
32 	p = dptr[z0+1];		/* XOR parity */
33 	q = dptr[z0+2];		/* RS syndrome */
34 
	/*
	 * The asm statements below hand values to each other through fixed
	 * YMM registers rather than through operands, so their relative
	 * order must be kept.
	 */
35 	kernel_fpu_begin();
36 
	/* ymm0 = 0x1d in every byte, ymm3 = zero for the sign compare */
37 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
38 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
39 
40 	for (d = 0; d < bytes; d += 32) {
		/* Seed P (ymm2) and Q (ymm4) from the highest data disk */
41 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
42 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
43 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
44 		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		/* ymm6 always holds the next data block still to be folded in */
45 		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
46 		for (z = z0-2; z >= 0; z--) {
47 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			/*
			 * Double Q per byte: ymm5 = 0xff where the top bit
			 * of Q is set (0 > Q as signed bytes), Q += Q, then
			 * XOR 0x1d into the bytes whose top bit was set.
			 */
48 			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
49 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
50 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
51 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			/* Fold the pending block into P and Q, then load disk z */
52 			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
53 			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
54 			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
55 		}
		/* Same double-and-fold step for the last block left in ymm6 */
56 		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
57 		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
58 		asm volatile("vpand %ymm0,%ymm5,%ymm5");
59 		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
60 		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
61 		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
62 
		/*
		 * Write the finished chunk with non-temporal stores.  The
		 * registers are cleared afterwards; both are reloaded at the
		 * top of the next pass anyway.
		 */
63 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
64 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
65 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
66 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
67 	}
68 
	/* Fence after the vmovntdq stores before leaving the FPU section */
69 	asm volatile("sfence" : : : "memory");
70 	kernel_fpu_end();
71 }
72 
/*
 * raid6_avx21_xor_syndrome - fold data disks start..stop into existing P/Q
 * @disks: number of pointers in @ptrs; the last two are P and Q
 * @start: lowest data disk index whose data is read
 * @stop:  highest data disk index whose data is read
 * @bytes: number of bytes to process in every buffer
 * @ptrs:  data buffers followed by P at ptrs[disks-2] and Q at ptrs[disks-1]
 *
 * P is read, XORed with the data of disks @stop down to @start and written
 * back.  For Q a partial value is built from disks @stop down to @start,
 * doubled once more for every disk index below @start (no data is read for
 * those), and finally XORed with the Q already in memory.
 *
 * Works in 32-byte steps with aligned loads/stores and no tail handling, so
 * @bytes is presumably a multiple of 32 and the buffers 32-byte aligned -
 * TODO confirm against the callers.
 */
73 static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
74 				     size_t bytes, void **ptrs)
75 {
76 	u8 **dptr = (u8 **)ptrs;
77 	u8 *p, *q;
78 	int d, z, z0;
79 
80 	z0 = stop;		/* P/Q right side optimization */
81 	p = dptr[disks-2];	/* XOR parity */
82 	q = dptr[disks-1];	/* RS syndrome */
83 
	/*
	 * The asm statements below communicate through fixed YMM registers,
	 * not through operands, so their relative order must be kept.
	 */
84 	kernel_fpu_begin();
85 
	/* ymm0 = 0x1d in every byte */
86 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
87 
88 	for (d = 0 ; d < bytes ; d += 32) {
		/* ymm4 = Q delta seeded from disk z0; ymm2 = old P ^ disk z0 */
89 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
90 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
91 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
92 		/* P/Q data pages */
93 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * Double the Q delta per byte: ymm5 = 0xff where its
			 * top bit is set, delta += delta, then XOR 0x1d into
			 * the bytes whose top bit was set.
			 */
94 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
95 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
96 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
97 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
98 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			/* Fold disk z into both P and the Q delta */
99 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
100 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
101 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
102 		}
103 		/* P/Q left side optimization */
		/* Disks below start: only the doubling step, no data is loaded */
104 		for (z = start-1 ; z >= 0 ; z--) {
105 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
106 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
107 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
108 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
109 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
110 		}
		/* Merge the delta with the Q currently in memory */
111 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
112 		/* Don't use movntdq for r/w memory area < cache line */
113 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
114 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
115 	}
116 
	/* No non-temporal stores are issued above; the fence is still emitted */
117 	asm volatile("sfence" : : : "memory");
118 	kernel_fpu_end();
119 }
120 
/*
 * Method table for the non-unrolled variant (32 bytes per pass), published
 * under the name "avx2x1".  struct raid6_calls is not defined in this file;
 * presumably it comes from "algos.h" - confirm.
 */
121 const struct raid6_calls raid6_avx2x1 = {
122 	.gen_syndrome	= raid6_avx21_gen_syndrome,
123 	.xor_syndrome	= raid6_avx21_xor_syndrome,
124 	.name		= "avx2x1",
125 };
126 
127 /*
128  * Unrolled-by-2 AVX2 implementation
129  */
/*
 * raid6_avx22_gen_syndrome - compute P and Q from scratch, 64 bytes per pass
 * @disks: number of pointers in @ptrs; the last two are P and Q
 * @bytes: number of bytes to process in every buffer
 * @ptrs:  ptrs[0..disks-3] are the data buffers, ptrs[disks-2] is P (XOR
 *         parity) and ptrs[disks-1] is Q (RS syndrome)
 *
 * Two independent 32-byte lanes are processed side by side:
 * P[0]/Q[0] live in ymm2/ymm4 and P[1]/Q[1] in ymm3/ymm6, with ymm5/ymm7 as
 * scratch.  Both lanes are seeded from the highest data disk, then for each
 * lower disk Q is doubled per byte and the disk's data is XORed into P and Q.
 *
 * Works in 64-byte steps with aligned loads/stores and no tail handling, so
 * @bytes is presumably a multiple of 64 and the buffers 32-byte aligned -
 * TODO confirm against the callers.
 */
130 static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
131 {
132 	u8 **dptr = (u8 **)ptrs;
133 	u8 *p, *q;
134 	int d, z, z0;
135 
136 	z0 = disks - 3;		/* Highest data disk */
137 	p = dptr[z0+1];		/* XOR parity */
138 	q = dptr[z0+2];		/* RS syndrome */
139 
	/*
	 * The asm statements below communicate through fixed YMM registers,
	 * not through operands, so their relative order must be kept.
	 */
140 	kernel_fpu_begin();
141 
	/* ymm0 = 0x1d in every byte, ymm1 = zero for the sign compare */
142 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
143 	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
144 
145 	/* We uniformly assume a single prefetch covers at least 32 bytes */
146 	for (d = 0; d < bytes; d += 64) {
		/* Seed both P and Q lanes from the highest data disk */
147 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
148 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
149 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
150 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
151 		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
152 		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
153 		for (z = z0-1; z >= 0; z--) {
154 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
155 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			/*
			 * Double Q per byte in both lanes: mask = 0xff where
			 * the top bit of Q is set (0 > Q as signed bytes),
			 * Q += Q, then XOR 0x1d into the bytes whose top bit
			 * was set.
			 */
156 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
157 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
158 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
159 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
160 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
161 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
162 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
163 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			/* Load disk z and fold it into P and Q */
164 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
165 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
166 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
167 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
168 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
169 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
170 		}
		/* Write the finished 64-byte chunk with non-temporal stores */
171 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
172 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
173 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
174 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
175 	}
176 
	/* Fence after the vmovntdq stores before leaving the FPU section */
177 	asm volatile("sfence" : : : "memory");
178 	kernel_fpu_end();
179 }
180 
/*
 * raid6_avx22_xor_syndrome - fold data disks start..stop into existing P/Q,
 *                            64 bytes per pass
 * @disks: number of pointers in @ptrs; the last two are P and Q
 * @start: lowest data disk index whose data is read
 * @stop:  highest data disk index whose data is read
 * @bytes: number of bytes to process in every buffer
 * @ptrs:  data buffers followed by P at ptrs[disks-2] and Q at ptrs[disks-1]
 *
 * Two 32-byte lanes are handled side by side (P in ymm2/ymm3, Q delta in
 * ymm4/ymm6, scratch in ymm5/ymm7).  P is read, XORed with the data of disks
 * @stop down to @start and written back.  The Q delta is built from the same
 * disks, doubled once more for every disk index below @start, then XORed
 * with the Q already in memory.
 *
 * Works in 64-byte steps with aligned loads/stores and no tail handling, so
 * @bytes is presumably a multiple of 64 and the buffers 32-byte aligned -
 * TODO confirm against the callers.
 */
181 static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
182 				     size_t bytes, void **ptrs)
183 {
184 	u8 **dptr = (u8 **)ptrs;
185 	u8 *p, *q;
186 	int d, z, z0;
187 
188 	z0 = stop;		/* P/Q right side optimization */
189 	p = dptr[disks-2];	/* XOR parity */
190 	q = dptr[disks-1];	/* RS syndrome */
191 
	/*
	 * The asm statements below communicate through fixed YMM registers,
	 * not through operands, so their relative order must be kept.
	 */
192 	kernel_fpu_begin();
193 
	/* ymm0 = 0x1d in every byte */
194 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
195 
196 	for (d = 0 ; d < bytes ; d += 64) {
		/* Q delta seeded from disk z0; P = old P ^ disk z0 */
197 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
198 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
199 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
200 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
201 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
202 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
203 		/* P/Q data pages */
204 		for (z = z0-1 ; z >= start ; z--) {
			/*
			 * Double the Q delta per byte in both lanes: mask =
			 * 0xff where its top bit is set, delta += delta, then
			 * XOR 0x1d into the bytes whose top bit was set.
			 */
205 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
206 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
207 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
208 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
209 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
210 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
211 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
212 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
213 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
214 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			/* Fold disk z into both P and the Q delta */
215 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
216 			asm volatile("vmovdqa %0,%%ymm7"
217 				     :: "m" (dptr[z][d+32]));
218 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
219 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
220 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
221 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
222 		}
223 		/* P/Q left side optimization */
		/* Disks below start: only the doubling step, no data is loaded */
224 		for (z = start-1 ; z >= 0 ; z--) {
225 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
226 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
227 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
228 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
229 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
230 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
231 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
232 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
233 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
234 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
235 		}
		/* Merge the delta with the Q currently in memory */
236 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
237 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
238 		/* Don't use movntdq for r/w memory area < cache line */
239 		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
240 		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
241 		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
242 		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
243 	}
244 
	/* No non-temporal stores are issued above; the fence is still emitted */
245 	asm volatile("sfence" : : : "memory");
246 	kernel_fpu_end();
247 }
248 
/*
 * Method table for the unrolled-by-2 variant (64 bytes per pass), published
 * under the name "avx2x2".  struct raid6_calls is not defined in this file;
 * presumably it comes from "algos.h" - confirm.
 */
249 const struct raid6_calls raid6_avx2x2 = {
250 	.gen_syndrome	= raid6_avx22_gen_syndrome,
251 	.xor_syndrome	= raid6_avx22_xor_syndrome,
252 	.name		= "avx2x2",
253 };
254 
255 #ifdef CONFIG_X86_64
256 
257 /*
258  * Unrolled-by-4 AVX2 implementation
259  */
/*
 * raid6_avx24_gen_syndrome - compute P and Q from scratch, 128 bytes per pass
 * @disks: number of pointers in @ptrs; the last two are P and Q
 * @bytes: number of bytes to process in every buffer
 * @ptrs:  ptrs[0..disks-3] are the data buffers, ptrs[disks-2] is P (XOR
 *         parity) and ptrs[disks-1] is Q (RS syndrome)
 *
 * Four 32-byte lanes are processed side by side:
 *   P[0..3] in ymm2, ymm3, ymm10, ymm11
 *   Q[0..3] in ymm4, ymm6, ymm12, ymm14
 *   scratch in ymm5, ymm7, ymm13, ymm15
 * Unlike the narrower variants, the accumulators start at zero and the
 * inner loop begins at the highest data disk itself, so every disk goes
 * through the same double-Q-then-XOR step.
 *
 * Built only under CONFIG_X86_64 (the enclosing #ifdef); it uses
 * ymm10-ymm15.  Works in 128-byte steps with aligned loads/stores and no
 * tail handling, so @bytes is presumably a multiple of 128 and the buffers
 * 32-byte aligned - TODO confirm against the callers.
 */
260 static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
261 {
262 	u8 **dptr = (u8 **)ptrs;
263 	u8 *p, *q;
264 	int d, z, z0;
265 
266 	z0 = disks - 3;		/* Highest data disk */
267 	p = dptr[z0+1];		/* XOR parity */
268 	q = dptr[z0+2];		/* RS syndrome */
269 
	/*
	 * The asm statements below communicate through fixed YMM registers,
	 * not through operands, so their relative order must be kept.
	 */
270 	kernel_fpu_begin();
271 
	/* ymm0 = 0x1d in every byte; everything else starts out as zero */
272 	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
273 	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
274 	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
275 	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
276 	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
277 	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
278 	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
279 	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
280 	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
281 	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
282 
283 	for (d = 0; d < bytes; d += 128) {
284 		for (z = z0; z >= 0; z--) {
285 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
286 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
287 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
288 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			/*
			 * Double Q per byte in all four lanes: mask = 0xff
			 * where the top bit of Q is set (0 > Q as signed
			 * bytes), Q += Q, then XOR 0x1d into the bytes whose
			 * top bit was set.  On the first pass Q is zero, so
			 * this leaves it at zero.
			 */
289 			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
290 			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
291 			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
292 			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
293 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
294 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
295 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
296 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
297 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
298 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
299 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
300 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
301 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
302 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
303 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
304 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			/* Load disk z and fold it into P and Q */
305 			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
306 			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
307 			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
308 			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
309 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
310 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
311 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
312 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
313 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
314 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
315 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
316 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
317 		}
		/*
		 * Non-temporal store of each lane, then reset the register
		 * to zero: the next pass relies on zeroed accumulators
		 * because nothing reloads them before the inner loop.
		 */
318 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
319 		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
320 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
321 		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
322 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
323 		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
324 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
325 		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
326 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
327 		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
328 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
329 		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
330 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
331 		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
332 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
333 		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
334 	}
335 
	/* Fence after the vmovntdq stores before leaving the FPU section */
336 	asm volatile("sfence" : : : "memory");
337 	kernel_fpu_end();
338 }
339 
/*
 * raid6_avx24_xor_syndrome - fold data disks start..stop into existing P/Q,
 *                            128 bytes per pass
 * @disks: number of pointers in @ptrs; the last two are P and Q
 * @start: lowest data disk index whose data is read
 * @stop:  highest data disk index whose data is read
 * @bytes: number of bytes to process in every buffer
 * @ptrs:  data buffers followed by P at ptrs[disks-2] and Q at ptrs[disks-1]
 *
 * Four 32-byte lanes are handled side by side:
 *   P[0..3]       in ymm2, ymm3, ymm10, ymm11
 *   Q delta[0..3] in ymm4, ymm6, ymm12, ymm14
 *   scratch       in ymm5, ymm7, ymm13, ymm15
 * P is read, XORed with the data of disks @stop down to @start and written
 * back.  The Q delta is built from the same disks, doubled once more for
 * every disk index below @start, then XORed with the Q already in memory.
 * Unlike the x1/x2 xor routines, the results here are written with
 * non-temporal stores.
 *
 * Built only under CONFIG_X86_64 (the enclosing #ifdef); it uses
 * ymm10-ymm15.  Works in 128-byte steps with aligned loads/stores and no
 * tail handling, so @bytes is presumably a multiple of 128 and the buffers
 * 32-byte aligned - TODO confirm against the callers.
 */
340 static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
341 				     size_t bytes, void **ptrs)
342 {
343 	u8 **dptr = (u8 **)ptrs;
344 	u8 *p, *q;
345 	int d, z, z0;
346 
347 	z0 = stop;		/* P/Q right side optimization */
348 	p = dptr[disks-2];	/* XOR parity */
349 	q = dptr[disks-1];	/* RS syndrome */
350 
	/*
	 * The asm statements below communicate through fixed YMM registers,
	 * not through operands, so their relative order must be kept.
	 */
351 	kernel_fpu_begin();
352 
	/* ymm0 = 0x1d in every byte */
353 	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
354 
355 	for (d = 0 ; d < bytes ; d += 128) {
		/* Q delta seeded from disk z0; P = old P ^ disk z0 */
356 		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
357 		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
358 		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
359 		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
360 		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
361 		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
362 		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
363 		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
364 		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
365 		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
366 		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
367 		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
368 		/* P/Q data pages */
369 		for (z = z0-1 ; z >= start ; z--) {
			/* Prefetch is issued at d and d+64 only in this routine */
370 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
371 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			/*
			 * Double the Q delta per byte in all four lanes:
			 * mask = 0xff where its top bit is set, delta +=
			 * delta, then XOR 0x1d into the bytes whose top bit
			 * was set.
			 */
372 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
373 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
374 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
375 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
376 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
377 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
378 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
379 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
380 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
381 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
382 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
383 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
384 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
385 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
386 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
387 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
388 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
389 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
390 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
391 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			/* Fold disk z into both P and the Q delta */
392 			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
393 			asm volatile("vmovdqa %0,%%ymm7"
394 				     :: "m" (dptr[z][d+32]));
395 			asm volatile("vmovdqa %0,%%ymm13"
396 				     :: "m" (dptr[z][d+64]));
397 			asm volatile("vmovdqa %0,%%ymm15"
398 				     :: "m" (dptr[z][d+96]));
399 			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
400 			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
401 			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
402 			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
403 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
404 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
405 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
406 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
407 		}
		/* Start pulling in the old Q while the doubling loop runs */
408 		asm volatile("prefetchnta %0" :: "m" (q[d]));
409 		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
410 		/* P/Q left side optimization */
		/* Disks below start: only the doubling step, no data is loaded */
411 		for (z = start-1 ; z >= 0 ; z--) {
412 			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
413 			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
414 			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
415 			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
416 			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
417 			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
418 			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
419 			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
420 			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
421 			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
422 			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
423 			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
424 			asm volatile("vpand %ymm0,%ymm5,%ymm5");
425 			asm volatile("vpand %ymm0,%ymm7,%ymm7");
426 			asm volatile("vpand %ymm0,%ymm13,%ymm13");
427 			asm volatile("vpand %ymm0,%ymm15,%ymm15");
428 			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
429 			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
430 			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
431 			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
432 		}
		/* Updated P goes out with non-temporal stores */
433 		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
434 		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
435 		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
436 		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		/* Merge the delta with the Q currently in memory, then store */
437 		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
438 		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
439 		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
440 		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
441 		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
442 		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
443 		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
444 		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
445 	}
	/* Fence after the vmovntdq stores before leaving the FPU section */
446 	asm volatile("sfence" : : : "memory");
447 	kernel_fpu_end();
448 }
449 
/*
 * Method table for the unrolled-by-4 variant (128 bytes per pass), published
 * under the name "avx2x4".  Only built under CONFIG_X86_64, like the two
 * routines it points to.  struct raid6_calls is not defined in this file;
 * presumably it comes from "algos.h" - confirm.
 */
450 const struct raid6_calls raid6_avx2x4 = {
451 	.gen_syndrome	= raid6_avx24_gen_syndrome,
452 	.xor_syndrome	= raid6_avx24_xor_syndrome,
453 	.name		= "avx2x4",
454 };
455 #endif /* CONFIG_X86_64 */
456