// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx512_constants {
	u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
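
/*
 * Editorial note: 0x1d is the low byte of the RAID-6 field polynomial
 * 0x11d, replicated across a whole 64-byte ZMM register.  A minimal
 * scalar sketch of the GF(2^8) multiply-by-2 that the vector loops
 * below implement with vpcmpgtb/vpmovm2b/vpaddb/vpandq/vpxorq
 * (gf_mul2() is a hypothetical helper for illustration only):
 *
 *	static inline u8 gf_mul2(u8 q)
 *	{
 *		// Shift left; if the top bit fell out, reduce by 0x1d.
 *		return (q << 1) ^ ((q & 0x80) ? 0x1d : 0);
 *	}
 *
 * The vector code derives the (q & 0x80) test from a signed compare:
 * bytes with the top bit set are negative, so "0 > q" (vpcmpgtb against
 * a zero register) selects exactly those lanes.
 */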

static int raid6_have_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}
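
/*
 * Editorial note: AVX512F alone does not cover the loops below; the
 * byte-granular compare-into-mask (vpcmpgtb) and vpmovm2b forms come
 * from AVX512BW.  The full F/BW/VL/DQ check above is presumably meant
 * to match the complete feature set the kernel expects before using
 * AVX-512 here.
 */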

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
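
/*
 * Editorial note: a minimal scalar sketch of what the vector loop above
 * computes per byte lane, reusing the hypothetical gf_mul2() helper
 * from the note near the constants (illustrative only):
 *
 *	u8 p = 0, q = 0;
 *	for (z = z0; z >= 0; z--) {
 *		q = gf_mul2(q) ^ dptr[z][d];	// Horner step for Q
 *		p ^= dptr[z][d];		// plain XOR parity for P
 *	}
 *
 * i.e. P is the XOR of all data disks and Q is the sum of g^z * D_z
 * with g = 2, evaluated by Horner's rule from the highest data disk
 * down.
 */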

static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm2\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
			     :
			     : "m" (dptr[z0][d]), "m" (p[d]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : "m" (dptr[z][d]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     /* Don't use movntdq for r/w memory area < cache line */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm2,%1"
			     :
			     : "m" (q[d]), "m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
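
/*
 * Editorial note, per 64-byte slice (my reading of the code): with
 * g = 2, the function XOR-toggles the contribution of data disks
 * start..stop into the existing parity, i.e.
 *
 *	P' = P ^ (D_start ^ ... ^ D_stop)
 *	Q' = Q ^ (g^start * D_start ^ ... ^ g^stop * D_stop)
 *
 * Disks above 'stop' need no work at all (the "right side"
 * optimization), and disks below 'start' only require multiplying the
 * Q accumulator by g again, with no data load or XOR (the "left side"
 * loop, whose asm has an empty operand list).
 */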

const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,
	raid6_avx5121_xor_syndrome,
	raid6_have_avx512,
	"avx512x1",
	.priority = 2		/* Prefer AVX512 over priority 1 (SSE2 and others) */
};
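
/*
 * Editorial note: raid6_calls tables like this one are collected in
 * lib/raid6/algos.c, which benchmarks the candidates whose ->valid()
 * hook (raid6_have_avx512() here) succeeds and installs the fastest
 * one among the highest available priority as raid6_call.
 */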

/*
 * Unrolled-by-2 AVX512 implementation
 */
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
	for (d = 0; d < bytes; d += 128) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
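
/*
 * Editorial note: the unroll-by-2 processes two independent 64-byte
 * columns per iteration (accumulator pairs zmm2/zmm4 and zmm3/zmm6),
 * giving the out-of-order core two dependency chains to interleave
 * instead of one.
 */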

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm2\n\t"
			     "vmovdqa64 %3,%%zmm3\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (p[d]), "m" (p[d+64]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
			     /* Don't use movntdq for r/w
			      * memory area < cache line
			      */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm6,%1\n\t"
			     "vmovdqa64 %%zmm2,%2\n\t"
			     "vmovdqa64 %%zmm3,%3"
			     :
			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
			       "m" (p[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,
	raid6_avx5122_xor_syndrome,
	raid6_have_avx512,
	"avx512x2",
	.priority = 2		/* Prefer AVX512 over priority 1 (SSE2 and others) */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation (x86-64 only: the wider unroll
 * uses zmm8-zmm15, which cannot be encoded in 32-bit mode)
 */
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "prefetchnta %2\n\t"
				     "prefetchnta %3\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
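
/*
 * Editorial note: unlike the x1/x2 variants, the unroll-by-4
 * gen_syndrome keeps its eight P/Q accumulators zeroed across
 * iterations (re-zeroing each one right after its non-temporal store),
 * so the inner z-loop can run from z0 straight down without a separate
 * first-disk load.
 */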

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 256) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm12\n\t"
			     "vmovdqa64 %3,%%zmm14\n\t"
			     "vmovdqa64 %4,%%zmm2\n\t"
			     "vmovdqa64 %5,%%zmm3\n\t"
			     "vmovdqa64 %6,%%zmm10\n\t"
			     "vmovdqa64 %7,%%zmm11\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "prefetchnta %0\n\t"
				     "prefetchnta %2\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]),
				       "m" (dptr[z][d+192]));
		}
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     :
			     : "m" (q[d]), "m" (q[d+128]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : );
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vmovntdq %%zmm14,%7"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,
	raid6_avx5124_xor_syndrome,
	raid6_have_avx512,
	"avx512x4",
	.priority = 2		/* Prefer AVX512 over priority 1 (SSE2 and others) */
};
#endif