// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 * Copyright (C) 2016 Intel Corporation
 *
 * Author: Gayatri Kammela <gayatri.kammela@intel.com>
 * Author: Megha Dey <megha.dey@linux.intel.com>
 *
 * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx512_constants {
        u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
        { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

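/*
 * All variants below share one core trick: multiplying a whole vector of
 * GF(2^8) bytes by 2, as needed to accumulate the Q syndrome.  The signed
 * compare against zero (vpcmpgtb) plus vpmovm2b turns each byte's top bit
 * into an all-ones/all-zeroes byte mask, vpaddb shifts each byte left by
 * one, and vpandq/vpxorq conditionally fold in 0x1d, the low byte of the
 * RAID-6 reduction polynomial 0x11d broadcast in raid6_avx512_constants
 * above.
 *
 * A rough scalar sketch of what each 64-byte lane computes per data disk
 * (illustrative only, not part of the build):
 *
 *	u8 gf_mul2(u8 v)
 *	{
 *		return (v << 1) ^ ((v & 0x80) ? 0x1d : 0);
 *	}
 *
 *	p ^= data[z];			// XOR parity
 *	q  = gf_mul2(q) ^ data[z];	// Horner step for the RS syndrome
 */
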
static int raid6_have_avx512(void)
{
        return boot_cpu_has(X86_FEATURE_AVX2) &&
                boot_cpu_has(X86_FEATURE_AVX) &&
                boot_cpu_has(X86_FEATURE_AVX512F) &&
                boot_cpu_has(X86_FEATURE_AVX512BW) &&
                boot_cpu_has(X86_FEATURE_AVX512VL) &&
                boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

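        /*
         * Per 64-byte slice: zmm0 holds the 0x1d mask, zmm1 stays zero for
         * the sign compare, zmm2 accumulates P, zmm4 accumulates Q, and
         * zmm6 carries the next data block while the current one is folded
         * in.
         */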
        for (d = 0; d < bytes; d += 64) {
                asm volatile("prefetchnta %0\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
                             "vmovdqa64 %1,%%zmm6"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
                for (z = z0-2; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]));
                }
                asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm4,%1\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4"
                             :
                             : "m" (p[d]), "m" (q[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}
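
/*
 * The ptrs[] layout expected above (and by the other variants below):
 * ptrs[0] through ptrs[disks-3] are the data blocks, ptrs[disks-2] is P and
 * ptrs[disks-1] is Q, each 'bytes' long and 64-byte aligned, with 'bytes' a
 * multiple of the per-iteration stride (64 here).  A hypothetical caller:
 *
 *	void *ptrs[NDISKS];	// NDISKS-2 data blocks, then P, then Q
 *
 *	raid6_avx512x1.gen_syndrome(NDISKS, PAGE_SIZE, ptrs);
 */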

static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

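        /*
         * Fold the contribution of data disks start..stop into the P/Q
         * already in memory: P gains the plain xor of those blocks, while
         * the Q accumulator is built by Horner over the changed range
         * ("right side"), then just multiplied by 2 once per remaining
         * lower-numbered disk ("left side"), and finally xored into the
         * old Q.
         */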
        for (d = 0 ; d < bytes ; d += 64) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm2\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2"
                             :
                             : "m" (dptr[z0][d]), "m" (p[d]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : "m" (dptr[z][d]));
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                             /* Don't use movntdq for r/w memory area < cache line */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm2,%1"
                             :
                             : "m" (q[d]), "m" (p[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x1 = {
        raid6_avx5121_gen_syndrome,
        raid6_avx5121_xor_syndrome,
        raid6_have_avx512,
        "avx512x1",
        .priority = 2           /* Prefer AVX512 over priority 1 (SSE2 and others) */
};
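
/*
 * The positional initializers above map to struct raid6_calls' gen_syndrome,
 * xor_syndrome, valid and name members; only priority is set by designator.
 * The pq core probes valid() and benchmarks the candidate implementations
 * at boot before picking one.
 */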

/*
 * Unrolled-by-2 AVX512 implementation
 */
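/*
 * Same algorithm as the x1 variant, but each iteration handles two
 * independent 64-byte slices (128 bytes): P in zmm2/zmm3, Q in zmm4/zmm6,
 * sign masks and data in zmm5/zmm7.  The extra slice mostly helps hide
 * instruction latency.
 */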
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        /* We uniformly assume a single prefetch covers at least 64 bytes */
        for (d = 0; d < bytes; d += 128) {
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
                             "vmovdqa64 %1,%%zmm3\n\t"     /* P[1] */
                             "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
                             "vmovdqa64 %%zmm3,%%zmm6"     /* Q[1] */
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
                for (z = z0-1; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "prefetchnta %1\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm4,%2\n\t"
                             "vmovntdq %%zmm6,%3"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
                               "m" (q[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 128) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm2\n\t"
                             "vmovdqa64 %3,%%zmm3\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (p[d]), "m" (p[d+64]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                             "vpxorq %1,%%zmm6,%%zmm6\n\t"
                             /* Don't use movntdq for r/w
                              * memory area < cache line
                              */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm6,%1\n\t"
                             "vmovdqa64 %%zmm2,%2\n\t"
                             "vmovdqa64 %%zmm3,%3"
                             :
                             : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
                               "m" (p[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
        raid6_avx5122_gen_syndrome,
        raid6_avx5122_xor_syndrome,
        raid6_have_avx512,
        "avx512x2",
        .priority = 2           /* Prefer AVX512 over priority 1 (SSE2 and others) */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
 */
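/*
 * Four 64-byte slices per iteration (256 bytes): P lives in
 * zmm2/zmm3/zmm10/zmm11, Q in zmm4/zmm6/zmm12/zmm14, with
 * zmm5/zmm7/zmm13/zmm15 used for sign masks and data.  zmm8-zmm31 are only
 * architecturally available in 64-bit mode, hence the CONFIG_X86_64 guard.
 */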
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"    /* Zero temp */
                     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"    /* P[0] */
                     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"    /* P[1] */
                     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"    /* Q[0] */
                     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"    /* Q[1] */
                     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */
                     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */
                     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */
                     "vpxorq %%zmm14,%%zmm14,%%zmm14"     /* Q[3] */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

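        /*
         * Unlike the x1/x2 variants, the P/Q accumulators start out zeroed
         * here, so the inner loop below runs over every data disk (z0 down
         * to 0) and the registers are re-zeroed right after each group of
         * non-temporal stores.
         */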
        for (d = 0; d < bytes; d += 256) {
                for (z = z0; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "prefetchnta %1\n\t"
                                     "prefetchnta %2\n\t"
                                     "prefetchnta %3\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vmovdqa64 %2,%%zmm13\n\t"
                                     "vmovdqa64 %3,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                                     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                                       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
                             "vmovntdq %%zmm14,%7\n\t"
                             "vpxorq %%zmm14,%%zmm14,%%zmm14"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     :: "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 256) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm12\n\t"
                             "vmovdqa64 %3,%%zmm14\n\t"
                             "vmovdqa64 %4,%%zmm2\n\t"
                             "vmovdqa64 %5,%%zmm3\n\t"
                             "vmovdqa64 %6,%%zmm10\n\t"
                             "vmovdqa64 %7,%%zmm11\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
                             "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
                             "vpxorq %%zmm14,%%zmm11,%%zmm11"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
                               "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
                                     "prefetchnta %0\n\t"
                                     "prefetchnta %2\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vmovdqa64 %2,%%zmm13\n\t"
                                     "vmovdqa64 %3,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                                     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                                       "m" (dptr[z][d+128]),
                                       "m" (dptr[z][d+192]));
                }
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             :
                             : "m" (q[d]), "m" (q[d+128]));
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : );
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %4,%%zmm4,%%zmm4\n\t"
                             "vpxorq %5,%%zmm6,%%zmm6\n\t"
                             "vpxorq %6,%%zmm12,%%zmm12\n\t"
                             "vpxorq %7,%%zmm14,%%zmm14\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vmovntdq %%zmm14,%7"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
        raid6_avx5124_gen_syndrome,
        raid6_avx5124_xor_syndrome,
        raid6_have_avx512,
        "avx512x4",
        .priority = 2           /* Prefer AVX512 over priority 1 (SSE2 and others) */
};
#endif