1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* -*- linux-c -*- -------------------------------------------------------- 3 * 4 * Copyright (C) 2016 Intel Corporation 5 * 6 * Author: Gayatri Kammela <gayatri.kammela@intel.com> 7 * Author: Megha Dey <megha.dey@linux.intel.com> 8 * 9 * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved 10 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved 11 * 12 * ----------------------------------------------------------------------- 13 */ 14 15 /* 16 * AVX512 implementation of RAID-6 syndrome functions 17 * 18 */ 19 20 #include <linux/raid/pq.h> 21 #include "x86.h" 22 23 static const struct raid6_avx512_constants { 24 u64 x1d[8]; 25 } raid6_avx512_constants __aligned(512/8) = { 26 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 27 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 28 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 29 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, 30 }; 31 32 static int raid6_have_avx512(void) 33 { 34 return boot_cpu_has(X86_FEATURE_AVX2) && 35 boot_cpu_has(X86_FEATURE_AVX) && 36 boot_cpu_has(X86_FEATURE_AVX512F) && 37 boot_cpu_has(X86_FEATURE_AVX512BW) && 38 boot_cpu_has(X86_FEATURE_AVX512VL) && 39 boot_cpu_has(X86_FEATURE_AVX512DQ); 40 } 41 42 static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) 43 { 44 u8 **dptr = (u8 **)ptrs; 45 u8 *p, *q; 46 int d, z, z0; 47 48 z0 = disks - 3; /* Highest data disk */ 49 p = dptr[z0+1]; /* XOR parity */ 50 q = dptr[z0+2]; /* RS syndrome */ 51 52 kernel_fpu_begin(); 53 54 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 55 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 56 : 57 : "m" (raid6_avx512_constants.x1d[0])); 58 59 for (d = 0; d < bytes; d += 64) { 60 asm volatile("prefetchnta %0\n\t" 61 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 62 "prefetchnta %1\n\t" 63 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 64 "vmovdqa64 %1,%%zmm6" 65 : 66 : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); 67 for (z = z0-2; z >= 0; z--) { 68 asm volatile("prefetchnta %0\n\t" 69 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 70 "vpmovm2b %%k1,%%zmm5\n\t" 71 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 72 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 73 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 74 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 75 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 76 "vmovdqa64 %0,%%zmm6" 77 : 78 : "m" (dptr[z][d])); 79 } 80 asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 81 "vpmovm2b %%k1,%%zmm5\n\t" 82 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 83 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 84 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 85 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 86 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 87 "vmovntdq %%zmm2,%0\n\t" 88 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 89 "vmovntdq %%zmm4,%1\n\t" 90 "vpxorq %%zmm4,%%zmm4,%%zmm4" 91 : 92 : "m" (p[d]), "m" (q[d])); 93 } 94 95 asm volatile("sfence" : : : "memory"); 96 kernel_fpu_end(); 97 } 98 99 static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, 100 size_t bytes, void **ptrs) 101 { 102 u8 **dptr = (u8 **)ptrs; 103 u8 *p, *q; 104 int d, z, z0; 105 106 z0 = stop; /* P/Q right side optimization */ 107 p = dptr[disks-2]; /* XOR parity */ 108 q = dptr[disks-1]; /* RS syndrome */ 109 110 kernel_fpu_begin(); 111 112 asm volatile("vmovdqa64 %0,%%zmm0" 113 : : "m" (raid6_avx512_constants.x1d[0])); 114 115 for (d = 0 ; d < bytes ; d += 64) { 116 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 117 "vmovdqa64 %1,%%zmm2\n\t" 118 "vpxorq %%zmm4,%%zmm2,%%zmm2" 119 : 120 : "m" (dptr[z0][d]), "m" (p[d])); 121 /* P/Q data pages */ 122 for (z = z0-1 ; z >= start ; z--) { 123 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 124 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 125 "vpmovm2b %%k1,%%zmm5\n\t" 126 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 127 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 128 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 129 "vmovdqa64 %0,%%zmm5\n\t" 130 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 131 "vpxorq %%zmm5,%%zmm4,%%zmm4" 132 : 133 : "m" (dptr[z][d])); 134 } 135 /* P/Q left side optimization */ 136 for (z = start-1 ; z >= 0 ; z--) { 137 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 138 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 139 "vpmovm2b %%k1,%%zmm5\n\t" 140 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 141 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 142 "vpxorq %%zmm5,%%zmm4,%%zmm4" 143 : 144 : ); 145 } 146 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 147 /* Don't use movntdq for r/w memory area < cache line */ 148 "vmovdqa64 %%zmm4,%0\n\t" 149 "vmovdqa64 %%zmm2,%1" 150 : 151 : "m" (q[d]), "m" (p[d])); 152 } 153 154 asm volatile("sfence" : : : "memory"); 155 kernel_fpu_end(); 156 } 157 158 const struct raid6_calls raid6_avx512x1 = { 159 raid6_avx5121_gen_syndrome, 160 raid6_avx5121_xor_syndrome, 161 raid6_have_avx512, 162 "avx512x1", 163 .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ 164 }; 165 166 /* 167 * Unrolled-by-2 AVX512 implementation 168 */ 169 static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) 170 { 171 u8 **dptr = (u8 **)ptrs; 172 u8 *p, *q; 173 int d, z, z0; 174 175 z0 = disks - 3; /* Highest data disk */ 176 p = dptr[z0+1]; /* XOR parity */ 177 q = dptr[z0+2]; /* RS syndrome */ 178 179 kernel_fpu_begin(); 180 181 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 182 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 183 : 184 : "m" (raid6_avx512_constants.x1d[0])); 185 186 /* We uniformly assume a single prefetch covers at least 64 bytes */ 187 for (d = 0; d < bytes; d += 128) { 188 asm volatile("prefetchnta %0\n\t" 189 "prefetchnta %1\n\t" 190 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 191 "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ 192 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 193 "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ 194 : 195 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); 196 for (z = z0-1; z >= 0; z--) { 197 asm volatile("prefetchnta %0\n\t" 198 "prefetchnta %1\n\t" 199 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 200 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 201 "vpmovm2b %%k1,%%zmm5\n\t" 202 "vpmovm2b %%k2,%%zmm7\n\t" 203 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 204 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 205 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 206 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 207 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 208 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 209 "vmovdqa64 %0,%%zmm5\n\t" 210 "vmovdqa64 %1,%%zmm7\n\t" 211 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 212 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 213 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 214 "vpxorq %%zmm7,%%zmm6,%%zmm6" 215 : 216 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 217 } 218 asm volatile("vmovntdq %%zmm2,%0\n\t" 219 "vmovntdq %%zmm3,%1\n\t" 220 "vmovntdq %%zmm4,%2\n\t" 221 "vmovntdq %%zmm6,%3" 222 : 223 : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), 224 "m" (q[d+64])); 225 } 226 227 asm volatile("sfence" : : : "memory"); 228 kernel_fpu_end(); 229 } 230 231 static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, 232 size_t bytes, void **ptrs) 233 { 234 u8 **dptr = (u8 **)ptrs; 235 u8 *p, *q; 236 int d, z, z0; 237 238 z0 = stop; /* P/Q right side optimization */ 239 p = dptr[disks-2]; /* XOR parity */ 240 q = dptr[disks-1]; /* RS syndrome */ 241 242 kernel_fpu_begin(); 243 244 asm volatile("vmovdqa64 %0,%%zmm0" 245 : : "m" (raid6_avx512_constants.x1d[0])); 246 247 for (d = 0 ; d < bytes ; d += 128) { 248 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 249 "vmovdqa64 %1,%%zmm6\n\t" 250 "vmovdqa64 %2,%%zmm2\n\t" 251 "vmovdqa64 %3,%%zmm3\n\t" 252 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 253 "vpxorq %%zmm6,%%zmm3,%%zmm3" 254 : 255 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 256 "m" (p[d]), "m" (p[d+64])); 257 /* P/Q data pages */ 258 for (z = z0-1 ; z >= start ; z--) { 259 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 260 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 261 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 262 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 263 "vpmovm2b %%k1,%%zmm5\n\t" 264 "vpmovm2b %%k2,%%zmm7\n\t" 265 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 266 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 267 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 268 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 269 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 270 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 271 "vmovdqa64 %0,%%zmm5\n\t" 272 "vmovdqa64 %1,%%zmm7\n\t" 273 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 274 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 275 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 276 "vpxorq %%zmm7,%%zmm6,%%zmm6" 277 : 278 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 279 } 280 /* P/Q left side optimization */ 281 for (z = start-1 ; z >= 0 ; z--) { 282 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 283 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 284 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 285 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 286 "vpmovm2b %%k1,%%zmm5\n\t" 287 "vpmovm2b %%k2,%%zmm7\n\t" 288 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 289 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 290 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 291 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 292 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 293 "vpxorq %%zmm7,%%zmm6,%%zmm6" 294 : 295 : ); 296 } 297 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 298 "vpxorq %1,%%zmm6,%%zmm6\n\t" 299 /* Don't use movntdq for r/w 300 * memory area < cache line 301 */ 302 "vmovdqa64 %%zmm4,%0\n\t" 303 "vmovdqa64 %%zmm6,%1\n\t" 304 "vmovdqa64 %%zmm2,%2\n\t" 305 "vmovdqa64 %%zmm3,%3" 306 : 307 : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), 308 "m" (p[d+64])); 309 } 310 311 asm volatile("sfence" : : : "memory"); 312 kernel_fpu_end(); 313 } 314 315 const struct raid6_calls raid6_avx512x2 = { 316 raid6_avx5122_gen_syndrome, 317 raid6_avx5122_xor_syndrome, 318 raid6_have_avx512, 319 "avx512x2", 320 .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ 321 }; 322 323 #ifdef CONFIG_X86_64 324 325 /* 326 * Unrolled-by-4 AVX2 implementation 327 */ 328 static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) 329 { 330 u8 **dptr = (u8 **)ptrs; 331 u8 *p, *q; 332 int d, z, z0; 333 334 z0 = disks - 3; /* Highest data disk */ 335 p = dptr[z0+1]; /* XOR parity */ 336 q = dptr[z0+2]; /* RS syndrome */ 337 338 kernel_fpu_begin(); 339 340 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 341 "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ 342 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ 343 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ 344 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ 345 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ 346 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ 347 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ 348 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ 349 "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ 350 : 351 : "m" (raid6_avx512_constants.x1d[0])); 352 353 for (d = 0; d < bytes; d += 256) { 354 for (z = z0; z >= 0; z--) { 355 asm volatile("prefetchnta %0\n\t" 356 "prefetchnta %1\n\t" 357 "prefetchnta %2\n\t" 358 "prefetchnta %3\n\t" 359 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 360 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 361 "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" 362 "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" 363 "vpmovm2b %%k1,%%zmm5\n\t" 364 "vpmovm2b %%k2,%%zmm7\n\t" 365 "vpmovm2b %%k3,%%zmm13\n\t" 366 "vpmovm2b %%k4,%%zmm15\n\t" 367 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 368 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 369 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 370 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 371 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 372 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 373 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 374 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 375 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 376 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 377 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 378 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 379 "vmovdqa64 %0,%%zmm5\n\t" 380 "vmovdqa64 %1,%%zmm7\n\t" 381 "vmovdqa64 %2,%%zmm13\n\t" 382 "vmovdqa64 %3,%%zmm15\n\t" 383 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 384 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 385 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 386 "vpxorq %%zmm15,%%zmm11,%%zmm11\n" 387 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 388 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 389 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 390 "vpxorq %%zmm15,%%zmm14,%%zmm14" 391 : 392 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 393 "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); 394 } 395 asm volatile("vmovntdq %%zmm2,%0\n\t" 396 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 397 "vmovntdq %%zmm3,%1\n\t" 398 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" 399 "vmovntdq %%zmm10,%2\n\t" 400 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" 401 "vmovntdq %%zmm11,%3\n\t" 402 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" 403 "vmovntdq %%zmm4,%4\n\t" 404 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" 405 "vmovntdq %%zmm6,%5\n\t" 406 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" 407 "vmovntdq %%zmm12,%6\n\t" 408 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" 409 "vmovntdq %%zmm14,%7\n\t" 410 "vpxorq %%zmm14,%%zmm14,%%zmm14" 411 : 412 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 413 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 414 "m" (q[d+128]), "m" (q[d+192])); 415 } 416 417 asm volatile("sfence" : : : "memory"); 418 kernel_fpu_end(); 419 } 420 421 static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, 422 size_t bytes, void **ptrs) 423 { 424 u8 **dptr = (u8 **)ptrs; 425 u8 *p, *q; 426 int d, z, z0; 427 428 z0 = stop; /* P/Q right side optimization */ 429 p = dptr[disks-2]; /* XOR parity */ 430 q = dptr[disks-1]; /* RS syndrome */ 431 432 kernel_fpu_begin(); 433 434 asm volatile("vmovdqa64 %0,%%zmm0" 435 :: "m" (raid6_avx512_constants.x1d[0])); 436 437 for (d = 0 ; d < bytes ; d += 256) { 438 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 439 "vmovdqa64 %1,%%zmm6\n\t" 440 "vmovdqa64 %2,%%zmm12\n\t" 441 "vmovdqa64 %3,%%zmm14\n\t" 442 "vmovdqa64 %4,%%zmm2\n\t" 443 "vmovdqa64 %5,%%zmm3\n\t" 444 "vmovdqa64 %6,%%zmm10\n\t" 445 "vmovdqa64 %7,%%zmm11\n\t" 446 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 447 "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" 448 "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" 449 "vpxorq %%zmm14,%%zmm11,%%zmm11" 450 : 451 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 452 "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), 453 "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 454 "m" (p[d+192])); 455 /* P/Q data pages */ 456 for (z = z0-1 ; z >= start ; z--) { 457 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 458 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 459 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 460 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 461 "prefetchnta %0\n\t" 462 "prefetchnta %2\n\t" 463 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 464 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 465 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 466 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 467 "vpmovm2b %%k1,%%zmm5\n\t" 468 "vpmovm2b %%k2,%%zmm7\n\t" 469 "vpmovm2b %%k3,%%zmm13\n\t" 470 "vpmovm2b %%k4,%%zmm15\n\t" 471 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 472 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 473 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 474 "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" 475 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 476 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 477 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 478 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 479 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 480 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 481 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 482 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 483 "vmovdqa64 %0,%%zmm5\n\t" 484 "vmovdqa64 %1,%%zmm7\n\t" 485 "vmovdqa64 %2,%%zmm13\n\t" 486 "vmovdqa64 %3,%%zmm15\n\t" 487 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 488 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 489 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 490 "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" 491 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 492 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 493 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 494 "vpxorq %%zmm15,%%zmm14,%%zmm14" 495 : 496 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 497 "m" (dptr[z][d+128]), 498 "m" (dptr[z][d+192])); 499 } 500 asm volatile("prefetchnta %0\n\t" 501 "prefetchnta %1\n\t" 502 : 503 : "m" (q[d]), "m" (q[d+128])); 504 /* P/Q left side optimization */ 505 for (z = start-1 ; z >= 0 ; z--) { 506 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 507 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 508 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 509 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 510 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 511 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 512 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 513 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 514 "vpmovm2b %%k1,%%zmm5\n\t" 515 "vpmovm2b %%k2,%%zmm7\n\t" 516 "vpmovm2b %%k3,%%zmm13\n\t" 517 "vpmovm2b %%k4,%%zmm15\n\t" 518 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 519 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 520 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 521 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 522 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 523 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 524 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 525 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 526 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 527 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 528 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 529 "vpxorq %%zmm15,%%zmm14,%%zmm14" 530 : 531 : ); 532 } 533 asm volatile("vmovntdq %%zmm2,%0\n\t" 534 "vmovntdq %%zmm3,%1\n\t" 535 "vmovntdq %%zmm10,%2\n\t" 536 "vmovntdq %%zmm11,%3\n\t" 537 "vpxorq %4,%%zmm4,%%zmm4\n\t" 538 "vpxorq %5,%%zmm6,%%zmm6\n\t" 539 "vpxorq %6,%%zmm12,%%zmm12\n\t" 540 "vpxorq %7,%%zmm14,%%zmm14\n\t" 541 "vmovntdq %%zmm4,%4\n\t" 542 "vmovntdq %%zmm6,%5\n\t" 543 "vmovntdq %%zmm12,%6\n\t" 544 "vmovntdq %%zmm14,%7" 545 : 546 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 547 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 548 "m" (q[d+128]), "m" (q[d+192])); 549 } 550 asm volatile("sfence" : : : "memory"); 551 kernel_fpu_end(); 552 } 553 const struct raid6_calls raid6_avx512x4 = { 554 raid6_avx5124_gen_syndrome, 555 raid6_avx5124_xor_syndrome, 556 raid6_have_avx512, 557 "avx512x4", 558 .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ 559 }; 560 #endif 561