xref: /linux/lib/raid/raid6/x86/avx512.c (revision 30bf04bd13a58cd9b877589569aa0abd06f04e52)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2016 Intel Corporation
4  *
5  * Author: Gayatri Kammela <gayatri.kammela@intel.com>
6  * Author: Megha Dey <megha.dey@linux.intel.com>
7  *
8  * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
9  * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
10  *
11  * AVX512 implementation of RAID-6 syndrome functions
12  */
13 
14 #include <asm/cpufeature.h>
15 #include <asm/fpu/api.h>
16 #include "algos.h"
17 
18 static const struct raid6_avx512_constants {
19 	u64 x1d[8];
20 } raid6_avx512_constants __aligned(512/8) = {
21 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
22 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
23 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
24 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
25 };
26 
27 static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
28 {
29 	u8 **dptr = (u8 **)ptrs;
30 	u8 *p, *q;
31 	int d, z, z0;
32 
33 	z0 = disks - 3;         /* Highest data disk */
34 	p = dptr[z0+1];         /* XOR parity */
35 	q = dptr[z0+2];         /* RS syndrome */
36 
37 	kernel_fpu_begin();
38 
39 	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
40 		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
41 		     :
42 		     : "m" (raid6_avx512_constants.x1d[0]));
43 
44 	for (d = 0; d < bytes; d += 64) {
45 		asm volatile("prefetchnta %0\n\t"
46 			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
47 			     "prefetchnta %1\n\t"
48 			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
49 			     "vmovdqa64 %1,%%zmm6"
50 			     :
51 			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
52 		for (z = z0-2; z >= 0; z--) {
53 			asm volatile("prefetchnta %0\n\t"
54 				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
55 				     "vpmovm2b %%k1,%%zmm5\n\t"
56 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
57 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
58 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
59 				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
60 				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
61 				     "vmovdqa64 %0,%%zmm6"
62 				     :
63 				     : "m" (dptr[z][d]));
64 		}
65 		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
66 			     "vpmovm2b %%k1,%%zmm5\n\t"
67 			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
68 			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
69 			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
70 			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
71 			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
72 			     "vmovntdq %%zmm2,%0\n\t"
73 			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
74 			     "vmovntdq %%zmm4,%1\n\t"
75 			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
76 			     :
77 			     : "m" (p[d]), "m" (q[d]));
78 	}
79 
80 	asm volatile("sfence" : : : "memory");
81 	kernel_fpu_end();
82 }
83 
84 static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
85 				       size_t bytes, void **ptrs)
86 {
87 	u8 **dptr = (u8 **)ptrs;
88 	u8 *p, *q;
89 	int d, z, z0;
90 
91 	z0 = stop;		/* P/Q right side optimization */
92 	p = dptr[disks-2];	/* XOR parity */
93 	q = dptr[disks-1];	/* RS syndrome */
94 
95 	kernel_fpu_begin();
96 
97 	asm volatile("vmovdqa64 %0,%%zmm0"
98 		     : : "m" (raid6_avx512_constants.x1d[0]));
99 
100 	for (d = 0 ; d < bytes ; d += 64) {
101 		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
102 			     "vmovdqa64 %1,%%zmm2\n\t"
103 			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
104 			     :
105 			     : "m" (dptr[z0][d]),  "m" (p[d]));
106 		/* P/Q data pages */
107 		for (z = z0-1 ; z >= start ; z--) {
108 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
109 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
110 				     "vpmovm2b %%k1,%%zmm5\n\t"
111 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
112 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
113 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
114 				     "vmovdqa64 %0,%%zmm5\n\t"
115 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
116 				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
117 				     :
118 				     : "m" (dptr[z][d]));
119 		}
120 		/* P/Q left side optimization */
121 		for (z = start-1 ; z >= 0 ; z--) {
122 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
123 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
124 				     "vpmovm2b %%k1,%%zmm5\n\t"
125 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
126 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
127 				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
128 				     :
129 				     : );
130 		}
131 		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
132 		/* Don't use movntdq for r/w memory area < cache line */
133 			     "vmovdqa64 %%zmm4,%0\n\t"
134 			     "vmovdqa64 %%zmm2,%1"
135 			     :
136 			     : "m" (q[d]), "m" (p[d]));
137 	}
138 
139 	asm volatile("sfence" : : : "memory");
140 	kernel_fpu_end();
141 }
142 
143 const struct raid6_calls raid6_avx512x1 = {
144 	.gen_syndrome	= raid6_avx5121_gen_syndrome,
145 	.xor_syndrome	= raid6_avx5121_xor_syndrome,
146 	.name		= "avx512x1",
147 };
148 
149 /*
150  * Unrolled-by-2 AVX512 implementation
151  */
152 static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
153 {
154 	u8 **dptr = (u8 **)ptrs;
155 	u8 *p, *q;
156 	int d, z, z0;
157 
158 	z0 = disks - 3;         /* Highest data disk */
159 	p = dptr[z0+1];         /* XOR parity */
160 	q = dptr[z0+2];         /* RS syndrome */
161 
162 	kernel_fpu_begin();
163 
164 	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
165 		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
166 		     :
167 		     : "m" (raid6_avx512_constants.x1d[0]));
168 
169 	/* We uniformly assume a single prefetch covers at least 64 bytes */
170 	for (d = 0; d < bytes; d += 128) {
171 		asm volatile("prefetchnta %0\n\t"
172 			     "prefetchnta %1\n\t"
173 			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
174 			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
175 			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
176 			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
177 			     :
178 			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
179 		for (z = z0-1; z >= 0; z--) {
180 			asm volatile("prefetchnta %0\n\t"
181 				     "prefetchnta %1\n\t"
182 				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
183 				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
184 				     "vpmovm2b %%k1,%%zmm5\n\t"
185 				     "vpmovm2b %%k2,%%zmm7\n\t"
186 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
187 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
188 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
189 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
190 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
191 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
192 				     "vmovdqa64 %0,%%zmm5\n\t"
193 				     "vmovdqa64 %1,%%zmm7\n\t"
194 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
195 				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
196 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
197 				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
198 				     :
199 				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
200 		}
201 		asm volatile("vmovntdq %%zmm2,%0\n\t"
202 			     "vmovntdq %%zmm3,%1\n\t"
203 			     "vmovntdq %%zmm4,%2\n\t"
204 			     "vmovntdq %%zmm6,%3"
205 			     :
206 			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
207 			       "m" (q[d+64]));
208 	}
209 
210 	asm volatile("sfence" : : : "memory");
211 	kernel_fpu_end();
212 }
213 
214 static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
215 				       size_t bytes, void **ptrs)
216 {
217 	u8 **dptr = (u8 **)ptrs;
218 	u8 *p, *q;
219 	int d, z, z0;
220 
221 	z0 = stop;		/* P/Q right side optimization */
222 	p = dptr[disks-2];	/* XOR parity */
223 	q = dptr[disks-1];	/* RS syndrome */
224 
225 	kernel_fpu_begin();
226 
227 	asm volatile("vmovdqa64 %0,%%zmm0"
228 		     : : "m" (raid6_avx512_constants.x1d[0]));
229 
230 	for (d = 0 ; d < bytes ; d += 128) {
231 		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
232 			     "vmovdqa64 %1,%%zmm6\n\t"
233 			     "vmovdqa64 %2,%%zmm2\n\t"
234 			     "vmovdqa64 %3,%%zmm3\n\t"
235 			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
236 			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
237 			     :
238 			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
239 			       "m" (p[d]), "m" (p[d+64]));
240 		/* P/Q data pages */
241 		for (z = z0-1 ; z >= start ; z--) {
242 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
243 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
244 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
245 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
246 				     "vpmovm2b %%k1,%%zmm5\n\t"
247 				     "vpmovm2b %%k2,%%zmm7\n\t"
248 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
249 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
250 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
251 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
252 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
253 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
254 				     "vmovdqa64 %0,%%zmm5\n\t"
255 				     "vmovdqa64 %1,%%zmm7\n\t"
256 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
257 				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
258 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
259 				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
260 				     :
261 				     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
262 		}
263 		/* P/Q left side optimization */
264 		for (z = start-1 ; z >= 0 ; z--) {
265 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
266 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
267 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
268 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
269 				     "vpmovm2b %%k1,%%zmm5\n\t"
270 				     "vpmovm2b %%k2,%%zmm7\n\t"
271 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
272 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
273 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
274 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
275 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
276 				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
277 				     :
278 				     : );
279 		}
280 		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
281 			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
282 			     /* Don't use movntdq for r/w
283 			      * memory area < cache line
284 			      */
285 			     "vmovdqa64 %%zmm4,%0\n\t"
286 			     "vmovdqa64 %%zmm6,%1\n\t"
287 			     "vmovdqa64 %%zmm2,%2\n\t"
288 			     "vmovdqa64 %%zmm3,%3"
289 			     :
290 			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
291 			       "m" (p[d+64]));
292 	}
293 
294 	asm volatile("sfence" : : : "memory");
295 	kernel_fpu_end();
296 }
297 
298 const struct raid6_calls raid6_avx512x2 = {
299 	.gen_syndrome	= raid6_avx5122_gen_syndrome,
300 	.xor_syndrome	= raid6_avx5122_xor_syndrome,
301 	.name		= "avx512x2",
302 };
303 
304 #ifdef CONFIG_X86_64
305 
/*
 * Unrolled-by-4 AVX512 implementation
 */
309 static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
310 {
311 	u8 **dptr = (u8 **)ptrs;
312 	u8 *p, *q;
313 	int d, z, z0;
314 
315 	z0 = disks - 3;         /* Highest data disk */
316 	p = dptr[z0+1];         /* XOR parity */
317 	q = dptr[z0+2];         /* RS syndrome */
318 
319 	kernel_fpu_begin();
320 
321 	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
322 		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
323 		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
324 		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
325 		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
326 		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
327 		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
328 		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
329 		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
330 		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
331 		     :
332 		     : "m" (raid6_avx512_constants.x1d[0]));
333 
334 	for (d = 0; d < bytes; d += 256) {
335 		for (z = z0; z >= 0; z--) {
336 		asm volatile("prefetchnta %0\n\t"
337 			     "prefetchnta %1\n\t"
338 			     "prefetchnta %2\n\t"
339 			     "prefetchnta %3\n\t"
340 			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
341 			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
342 			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
343 			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
344 			     "vpmovm2b %%k1,%%zmm5\n\t"
345 			     "vpmovm2b %%k2,%%zmm7\n\t"
346 			     "vpmovm2b %%k3,%%zmm13\n\t"
347 			     "vpmovm2b %%k4,%%zmm15\n\t"
348 			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
349 			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
350 			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
351 			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
352 			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
353 			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
354 			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
355 			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
356 			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
357 			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
358 			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
359 			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
360 			     "vmovdqa64 %0,%%zmm5\n\t"
361 			     "vmovdqa64 %1,%%zmm7\n\t"
362 			     "vmovdqa64 %2,%%zmm13\n\t"
363 			     "vmovdqa64 %3,%%zmm15\n\t"
364 			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
365 			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
366 			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
367 			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
368 			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
369 			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
370 			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
371 			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
372 			     :
373 			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
374 			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
375 		}
376 		asm volatile("vmovntdq %%zmm2,%0\n\t"
377 			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
378 			     "vmovntdq %%zmm3,%1\n\t"
379 			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
380 			     "vmovntdq %%zmm10,%2\n\t"
381 			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
382 			     "vmovntdq %%zmm11,%3\n\t"
383 			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
384 			     "vmovntdq %%zmm4,%4\n\t"
385 			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
386 			     "vmovntdq %%zmm6,%5\n\t"
387 			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
388 			     "vmovntdq %%zmm12,%6\n\t"
389 			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
390 			     "vmovntdq %%zmm14,%7\n\t"
391 			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
392 			     :
393 			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
394 			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
395 			       "m" (q[d+128]), "m" (q[d+192]));
396 	}
397 
398 	asm volatile("sfence" : : : "memory");
399 	kernel_fpu_end();
400 }
401 
402 static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
403 				       size_t bytes, void **ptrs)
404 {
405 	u8 **dptr = (u8 **)ptrs;
406 	u8 *p, *q;
407 	int d, z, z0;
408 
409 	z0 = stop;		/* P/Q right side optimization */
410 	p = dptr[disks-2];	/* XOR parity */
411 	q = dptr[disks-1];	/* RS syndrome */
412 
413 	kernel_fpu_begin();
414 
415 	asm volatile("vmovdqa64 %0,%%zmm0"
416 		     :: "m" (raid6_avx512_constants.x1d[0]));
417 
418 	for (d = 0 ; d < bytes ; d += 256) {
419 		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
420 			     "vmovdqa64 %1,%%zmm6\n\t"
421 			     "vmovdqa64 %2,%%zmm12\n\t"
422 			     "vmovdqa64 %3,%%zmm14\n\t"
423 			     "vmovdqa64 %4,%%zmm2\n\t"
424 			     "vmovdqa64 %5,%%zmm3\n\t"
425 			     "vmovdqa64 %6,%%zmm10\n\t"
426 			     "vmovdqa64 %7,%%zmm11\n\t"
427 			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
428 			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
429 			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
430 			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
431 			     :
432 			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
433 			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
434 			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
435 			       "m" (p[d+192]));
436 		/* P/Q data pages */
437 		for (z = z0-1 ; z >= start ; z--) {
438 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
439 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
440 				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
441 				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
442 				     "prefetchnta %0\n\t"
443 				     "prefetchnta %2\n\t"
444 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
445 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
446 				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
447 				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
448 				     "vpmovm2b %%k1,%%zmm5\n\t"
449 				     "vpmovm2b %%k2,%%zmm7\n\t"
450 				     "vpmovm2b %%k3,%%zmm13\n\t"
451 				     "vpmovm2b %%k4,%%zmm15\n\t"
452 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
453 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
454 				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
455 				     "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
456 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
457 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
458 				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
459 				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
460 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
461 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
462 				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
463 				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
464 				     "vmovdqa64 %0,%%zmm5\n\t"
465 				     "vmovdqa64 %1,%%zmm7\n\t"
466 				     "vmovdqa64 %2,%%zmm13\n\t"
467 				     "vmovdqa64 %3,%%zmm15\n\t"
468 				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
469 				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
470 				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
471 				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
472 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
473 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
474 				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
475 				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
476 				     :
477 				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
478 				       "m" (dptr[z][d+128]),
479 				       "m" (dptr[z][d+192]));
480 		}
481 		asm volatile("prefetchnta %0\n\t"
482 			     "prefetchnta %1\n\t"
483 			     :
484 			     : "m" (q[d]), "m" (q[d+128]));
485 		/* P/Q left side optimization */
486 		for (z = start-1 ; z >= 0 ; z--) {
487 			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
488 				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
489 				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
490 				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
491 				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
492 				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
493 				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
494 				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
495 				     "vpmovm2b %%k1,%%zmm5\n\t"
496 				     "vpmovm2b %%k2,%%zmm7\n\t"
497 				     "vpmovm2b %%k3,%%zmm13\n\t"
498 				     "vpmovm2b %%k4,%%zmm15\n\t"
499 				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
500 				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
501 				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
502 				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
503 				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
504 				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
505 				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
506 				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
507 				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
508 				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
509 				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
510 				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
511 				     :
512 				     : );
513 		}
514 		asm volatile("vmovntdq %%zmm2,%0\n\t"
515 			     "vmovntdq %%zmm3,%1\n\t"
516 			     "vmovntdq %%zmm10,%2\n\t"
517 			     "vmovntdq %%zmm11,%3\n\t"
518 			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
519 			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
520 			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
521 			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
522 			     "vmovntdq %%zmm4,%4\n\t"
523 			     "vmovntdq %%zmm6,%5\n\t"
524 			     "vmovntdq %%zmm12,%6\n\t"
525 			     "vmovntdq %%zmm14,%7"
526 			     :
527 			     : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
528 			       "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
529 			       "m" (q[d+128]), "m" (q[d+192]));
530 	}
531 	asm volatile("sfence" : : : "memory");
532 	kernel_fpu_end();
533 }
534 const struct raid6_calls raid6_avx512x4 = {
535 	.gen_syndrome	= raid6_avx5124_gen_syndrome,
536 	.xor_syndrome	= raid6_avx5124_xor_syndrome,
537 	.name		= "avx512x4",
538 };
539 #endif
540