/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

#define BLK64(pf, op, i)		\
		pf(i)			\
		op(i, 0)		\
		op(i + 1, 1)		\
		op(i + 2, 2)		\
		op(i + 3, 3)

static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF2(i)			\
		PF2(i + 2)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		XO2(i, 0)		\
		XO2(i + 1, 1)		\
		XO2(i + 2, 2)		\
		XO2(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF2(i)			\
		PF2(i + 2)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		PF3(i)			\
		PF3(i + 2)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO2(i, 0)		\
		XO2(i + 1, 1)		\
		XO2(i + 2, 2)		\
		XO2(i + 3, 3)		\
		XO3(i, 0)		\
		XO3(i + 1, 1)		\
		XO3(i + 2, 2)		\
		XO3(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4,
	  const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF2(i)			\
		PF2(i + 2)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		PF3(i)			\
		PF3(i + 2)		\
		XO2(i, 0)		\
		XO2(i + 1, 1)		\
		XO2(i + 2, 2)		\
		XO2(i + 3, 3)		\
		PF4(i)			\
		PF4(i + 2)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO3(i, 0)		\
		XO3(i + 1, 1)		\
		XO3(i + 2, 2)		\
		XO3(i + 3, 3)		\
		XO4(i, 0)		\
		XO4(i + 1, 1)		\
		XO4(i + 2, 2)		\
		XO4(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */