1 /* -*- linux-c -*- -------------------------------------------------------- 2 * 3 * Copyright (C) 2016 Intel Corporation 4 * 5 * Author: Gayatri Kammela <gayatri.kammela@intel.com> 6 * Author: Megha Dey <megha.dey@linux.intel.com> 7 * 8 * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved 9 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 14 * Boston MA 02111-1307, USA; either version 2 of the License, or 15 * (at your option) any later version; incorporated herein by reference. 16 * 17 * ----------------------------------------------------------------------- 18 */ 19 20 /* 21 * AVX512 implementation of RAID-6 syndrome functions 22 * 23 */ 24 25 #ifdef CONFIG_AS_AVX512 26 27 #include <linux/raid/pq.h> 28 #include "x86.h" 29 30 static const struct raid6_avx512_constants { 31 u64 x1d[8]; 32 } raid6_avx512_constants __aligned(512) = { 33 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 34 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 35 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 36 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, 37 }; 38 39 static int raid6_have_avx512(void) 40 { 41 return boot_cpu_has(X86_FEATURE_AVX2) && 42 boot_cpu_has(X86_FEATURE_AVX) && 43 boot_cpu_has(X86_FEATURE_AVX512F) && 44 boot_cpu_has(X86_FEATURE_AVX512BW) && 45 boot_cpu_has(X86_FEATURE_AVX512VL) && 46 boot_cpu_has(X86_FEATURE_AVX512DQ); 47 } 48 49 static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) 50 { 51 u8 **dptr = (u8 **)ptrs; 52 u8 *p, *q; 53 int d, z, z0; 54 55 z0 = disks - 3; /* Highest data disk */ 56 p = dptr[z0+1]; /* XOR parity */ 57 q = dptr[z0+2]; /* RS syndrome */ 58 59 kernel_fpu_begin(); 60 61 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 62 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 63 : 64 : "m" (raid6_avx512_constants.x1d[0])); 65 66 for (d = 0; d < bytes; d += 64) { 67 asm volatile("prefetchnta %0\n\t" 68 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 69 "prefetchnta %1\n\t" 70 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 71 "vmovdqa64 %1,%%zmm6" 72 : 73 : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); 74 for (z = z0-2; z >= 0; z--) { 75 asm volatile("prefetchnta %0\n\t" 76 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 77 "vpmovm2b %%k1,%%zmm5\n\t" 78 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 79 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 80 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 81 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 82 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 83 "vmovdqa64 %0,%%zmm6" 84 : 85 : "m" (dptr[z][d])); 86 } 87 asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 88 "vpmovm2b %%k1,%%zmm5\n\t" 89 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 90 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 91 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 92 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 93 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 94 "vmovntdq %%zmm2,%0\n\t" 95 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 96 "vmovntdq %%zmm4,%1\n\t" 97 "vpxorq %%zmm4,%%zmm4,%%zmm4" 98 : 99 : "m" (p[d]), "m" (q[d])); 100 } 101 102 asm volatile("sfence" : : : "memory"); 103 kernel_fpu_end(); 104 } 105 106 static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, 107 size_t bytes, void **ptrs) 108 { 109 u8 **dptr = (u8 **)ptrs; 110 u8 *p, *q; 111 int d, z, z0; 112 113 z0 = stop; /* P/Q right side optimization */ 114 p = dptr[disks-2]; /* XOR parity */ 115 q = dptr[disks-1]; /* RS syndrome */ 116 117 kernel_fpu_begin(); 118 119 asm volatile("vmovdqa64 %0,%%zmm0" 120 : : "m" (raid6_avx512_constants.x1d[0])); 121 122 for (d = 0 ; d < bytes ; d += 64) { 123 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 124 "vmovdqa64 %1,%%zmm2\n\t" 125 "vpxorq %%zmm4,%%zmm2,%%zmm2" 126 : 127 : "m" (dptr[z0][d]), "m" (p[d])); 128 /* P/Q data pages */ 129 for (z = z0-1 ; z >= start ; z--) { 130 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 131 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 132 "vpmovm2b %%k1,%%zmm5\n\t" 133 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 134 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 135 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 136 "vmovdqa64 %0,%%zmm5\n\t" 137 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 138 "vpxorq %%zmm5,%%zmm4,%%zmm4" 139 : 140 : "m" (dptr[z][d])); 141 } 142 /* P/Q left side optimization */ 143 for (z = start-1 ; z >= 0 ; z--) { 144 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 145 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 146 "vpmovm2b %%k1,%%zmm5\n\t" 147 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 148 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 149 "vpxorq %%zmm5,%%zmm4,%%zmm4" 150 : 151 : ); 152 } 153 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 154 /* Don't use movntdq for r/w memory area < cache line */ 155 "vmovdqa64 %%zmm4,%0\n\t" 156 "vmovdqa64 %%zmm2,%1" 157 : 158 : "m" (q[d]), "m" (p[d])); 159 } 160 161 asm volatile("sfence" : : : "memory"); 162 kernel_fpu_end(); 163 } 164 165 const struct raid6_calls raid6_avx512x1 = { 166 raid6_avx5121_gen_syndrome, 167 raid6_avx5121_xor_syndrome, 168 raid6_have_avx512, 169 "avx512x1", 170 1 /* Has cache hints */ 171 }; 172 173 /* 174 * Unrolled-by-2 AVX512 implementation 175 */ 176 static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) 177 { 178 u8 **dptr = (u8 **)ptrs; 179 u8 *p, *q; 180 int d, z, z0; 181 182 z0 = disks - 3; /* Highest data disk */ 183 p = dptr[z0+1]; /* XOR parity */ 184 q = dptr[z0+2]; /* RS syndrome */ 185 186 kernel_fpu_begin(); 187 188 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 189 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 190 : 191 : "m" (raid6_avx512_constants.x1d[0])); 192 193 /* We uniformly assume a single prefetch covers at least 64 bytes */ 194 for (d = 0; d < bytes; d += 128) { 195 asm volatile("prefetchnta %0\n\t" 196 "prefetchnta %1\n\t" 197 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 198 "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ 199 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 200 "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ 201 : 202 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); 203 for (z = z0-1; z >= 0; z--) { 204 asm volatile("prefetchnta %0\n\t" 205 "prefetchnta %1\n\t" 206 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 207 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 208 "vpmovm2b %%k1,%%zmm5\n\t" 209 "vpmovm2b %%k2,%%zmm7\n\t" 210 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 211 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 212 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 213 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 214 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 215 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 216 "vmovdqa64 %0,%%zmm5\n\t" 217 "vmovdqa64 %1,%%zmm7\n\t" 218 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 219 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 220 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 221 "vpxorq %%zmm7,%%zmm6,%%zmm6" 222 : 223 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 224 } 225 asm volatile("vmovntdq %%zmm2,%0\n\t" 226 "vmovntdq %%zmm3,%1\n\t" 227 "vmovntdq %%zmm4,%2\n\t" 228 "vmovntdq %%zmm6,%3" 229 : 230 : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), 231 "m" (q[d+64])); 232 } 233 234 asm volatile("sfence" : : : "memory"); 235 kernel_fpu_end(); 236 } 237 238 static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, 239 size_t bytes, void **ptrs) 240 { 241 u8 **dptr = (u8 **)ptrs; 242 u8 *p, *q; 243 int d, z, z0; 244 245 z0 = stop; /* P/Q right side optimization */ 246 p = dptr[disks-2]; /* XOR parity */ 247 q = dptr[disks-1]; /* RS syndrome */ 248 249 kernel_fpu_begin(); 250 251 asm volatile("vmovdqa64 %0,%%zmm0" 252 : : "m" (raid6_avx512_constants.x1d[0])); 253 254 for (d = 0 ; d < bytes ; d += 128) { 255 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 256 "vmovdqa64 %1,%%zmm6\n\t" 257 "vmovdqa64 %2,%%zmm2\n\t" 258 "vmovdqa64 %3,%%zmm3\n\t" 259 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 260 "vpxorq %%zmm6,%%zmm3,%%zmm3" 261 : 262 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 263 "m" (p[d]), "m" (p[d+64])); 264 /* P/Q data pages */ 265 for (z = z0-1 ; z >= start ; z--) { 266 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 267 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 268 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 269 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 270 "vpmovm2b %%k1,%%zmm5\n\t" 271 "vpmovm2b %%k2,%%zmm7\n\t" 272 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 273 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 274 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 275 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 276 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 277 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 278 "vmovdqa64 %0,%%zmm5\n\t" 279 "vmovdqa64 %1,%%zmm7\n\t" 280 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 281 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 282 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 283 "vpxorq %%zmm7,%%zmm6,%%zmm6" 284 : 285 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 286 } 287 /* P/Q left side optimization */ 288 for (z = start-1 ; z >= 0 ; z--) { 289 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 290 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 291 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 292 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 293 "vpmovm2b %%k1,%%zmm5\n\t" 294 "vpmovm2b %%k2,%%zmm7\n\t" 295 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 296 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 297 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 298 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 299 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 300 "vpxorq %%zmm7,%%zmm6,%%zmm6" 301 : 302 : ); 303 } 304 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 305 "vpxorq %1,%%zmm6,%%zmm6\n\t" 306 /* Don't use movntdq for r/w 307 * memory area < cache line 308 */ 309 "vmovdqa64 %%zmm4,%0\n\t" 310 "vmovdqa64 %%zmm6,%1\n\t" 311 "vmovdqa64 %%zmm2,%2\n\t" 312 "vmovdqa64 %%zmm3,%3" 313 : 314 : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), 315 "m" (p[d+64])); 316 } 317 318 asm volatile("sfence" : : : "memory"); 319 kernel_fpu_end(); 320 } 321 322 const struct raid6_calls raid6_avx512x2 = { 323 raid6_avx5122_gen_syndrome, 324 raid6_avx5122_xor_syndrome, 325 raid6_have_avx512, 326 "avx512x2", 327 1 /* Has cache hints */ 328 }; 329 330 #ifdef CONFIG_X86_64 331 332 /* 333 * Unrolled-by-4 AVX2 implementation 334 */ 335 static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) 336 { 337 u8 **dptr = (u8 **)ptrs; 338 u8 *p, *q; 339 int d, z, z0; 340 341 z0 = disks - 3; /* Highest data disk */ 342 p = dptr[z0+1]; /* XOR parity */ 343 q = dptr[z0+2]; /* RS syndrome */ 344 345 kernel_fpu_begin(); 346 347 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 348 "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ 349 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ 350 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ 351 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ 352 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ 353 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ 354 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ 355 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ 356 "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ 357 : 358 : "m" (raid6_avx512_constants.x1d[0])); 359 360 for (d = 0; d < bytes; d += 256) { 361 for (z = z0; z >= 0; z--) { 362 asm volatile("prefetchnta %0\n\t" 363 "prefetchnta %1\n\t" 364 "prefetchnta %2\n\t" 365 "prefetchnta %3\n\t" 366 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 367 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 368 "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" 369 "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" 370 "vpmovm2b %%k1,%%zmm5\n\t" 371 "vpmovm2b %%k2,%%zmm7\n\t" 372 "vpmovm2b %%k3,%%zmm13\n\t" 373 "vpmovm2b %%k4,%%zmm15\n\t" 374 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 375 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 376 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 377 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 378 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 379 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 380 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 381 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 382 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 383 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 384 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 385 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 386 "vmovdqa64 %0,%%zmm5\n\t" 387 "vmovdqa64 %1,%%zmm7\n\t" 388 "vmovdqa64 %2,%%zmm13\n\t" 389 "vmovdqa64 %3,%%zmm15\n\t" 390 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 391 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 392 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 393 "vpxorq %%zmm15,%%zmm11,%%zmm11\n" 394 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 395 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 396 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 397 "vpxorq %%zmm15,%%zmm14,%%zmm14" 398 : 399 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 400 "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); 401 } 402 asm volatile("vmovntdq %%zmm2,%0\n\t" 403 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 404 "vmovntdq %%zmm3,%1\n\t" 405 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" 406 "vmovntdq %%zmm10,%2\n\t" 407 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" 408 "vmovntdq %%zmm11,%3\n\t" 409 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" 410 "vmovntdq %%zmm4,%4\n\t" 411 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" 412 "vmovntdq %%zmm6,%5\n\t" 413 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" 414 "vmovntdq %%zmm12,%6\n\t" 415 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" 416 "vmovntdq %%zmm14,%7\n\t" 417 "vpxorq %%zmm14,%%zmm14,%%zmm14" 418 : 419 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 420 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 421 "m" (q[d+128]), "m" (q[d+192])); 422 } 423 424 asm volatile("sfence" : : : "memory"); 425 kernel_fpu_end(); 426 } 427 428 static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, 429 size_t bytes, void **ptrs) 430 { 431 u8 **dptr = (u8 **)ptrs; 432 u8 *p, *q; 433 int d, z, z0; 434 435 z0 = stop; /* P/Q right side optimization */ 436 p = dptr[disks-2]; /* XOR parity */ 437 q = dptr[disks-1]; /* RS syndrome */ 438 439 kernel_fpu_begin(); 440 441 asm volatile("vmovdqa64 %0,%%zmm0" 442 :: "m" (raid6_avx512_constants.x1d[0])); 443 444 for (d = 0 ; d < bytes ; d += 256) { 445 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 446 "vmovdqa64 %1,%%zmm6\n\t" 447 "vmovdqa64 %2,%%zmm12\n\t" 448 "vmovdqa64 %3,%%zmm14\n\t" 449 "vmovdqa64 %4,%%zmm2\n\t" 450 "vmovdqa64 %5,%%zmm3\n\t" 451 "vmovdqa64 %6,%%zmm10\n\t" 452 "vmovdqa64 %7,%%zmm11\n\t" 453 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 454 "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" 455 "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" 456 "vpxorq %%zmm14,%%zmm11,%%zmm11" 457 : 458 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 459 "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), 460 "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 461 "m" (p[d+192])); 462 /* P/Q data pages */ 463 for (z = z0-1 ; z >= start ; z--) { 464 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 465 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 466 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 467 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 468 "prefetchnta %0\n\t" 469 "prefetchnta %2\n\t" 470 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 471 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 472 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 473 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 474 "vpmovm2b %%k1,%%zmm5\n\t" 475 "vpmovm2b %%k2,%%zmm7\n\t" 476 "vpmovm2b %%k3,%%zmm13\n\t" 477 "vpmovm2b %%k4,%%zmm15\n\t" 478 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 479 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 480 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 481 "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" 482 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 483 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 484 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 485 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 486 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 487 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 488 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 489 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 490 "vmovdqa64 %0,%%zmm5\n\t" 491 "vmovdqa64 %1,%%zmm7\n\t" 492 "vmovdqa64 %2,%%zmm13\n\t" 493 "vmovdqa64 %3,%%zmm15\n\t" 494 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 495 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 496 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 497 "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" 498 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 499 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 500 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 501 "vpxorq %%zmm15,%%zmm14,%%zmm14" 502 : 503 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 504 "m" (dptr[z][d+128]), 505 "m" (dptr[z][d+192])); 506 } 507 asm volatile("prefetchnta %0\n\t" 508 "prefetchnta %1\n\t" 509 : 510 : "m" (q[d]), "m" (q[d+128])); 511 /* P/Q left side optimization */ 512 for (z = start-1 ; z >= 0 ; z--) { 513 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 514 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 515 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 516 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 517 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 518 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 519 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 520 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 521 "vpmovm2b %%k1,%%zmm5\n\t" 522 "vpmovm2b %%k2,%%zmm7\n\t" 523 "vpmovm2b %%k3,%%zmm13\n\t" 524 "vpmovm2b %%k4,%%zmm15\n\t" 525 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 526 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 527 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 528 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 529 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 530 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 531 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 532 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 533 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 534 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 535 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 536 "vpxorq %%zmm15,%%zmm14,%%zmm14" 537 : 538 : ); 539 } 540 asm volatile("vmovntdq %%zmm2,%0\n\t" 541 "vmovntdq %%zmm3,%1\n\t" 542 "vmovntdq %%zmm10,%2\n\t" 543 "vmovntdq %%zmm11,%3\n\t" 544 "vpxorq %4,%%zmm4,%%zmm4\n\t" 545 "vpxorq %5,%%zmm6,%%zmm6\n\t" 546 "vpxorq %6,%%zmm12,%%zmm12\n\t" 547 "vpxorq %7,%%zmm14,%%zmm14\n\t" 548 "vmovntdq %%zmm4,%4\n\t" 549 "vmovntdq %%zmm6,%5\n\t" 550 "vmovntdq %%zmm12,%6\n\t" 551 "vmovntdq %%zmm14,%7" 552 : 553 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 554 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 555 "m" (q[d+128]), "m" (q[d+192])); 556 } 557 asm volatile("sfence" : : : "memory"); 558 kernel_fpu_end(); 559 } 560 const struct raid6_calls raid6_avx512x4 = { 561 raid6_avx5124_gen_syndrome, 562 raid6_avx5124_xor_syndrome, 563 raid6_have_avx512, 564 "avx512x4", 565 1 /* Has cache hints */ 566 }; 567 #endif 568 569 #endif /* CONFIG_AS_AVX512 */ 570