// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Optimized XOR parity functions for SSE.
 *
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 *
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 *
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 */
#include <asm/fpu/api.h>
#include "xor_impl.h"
#include "xor_arch.h"

#ifdef CONFIG_X86_32
/*
 * On 32-bit, force the loop increment into an immediate ("i") to
 * reduce register pressure; on 64-bit any register or immediate
 * ("re") is fine.
 */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/*
 * Asm-string building blocks.  Each XMM register holds 16 bytes, so
 * chunk index x lives at byte offset 16*x within the current 256-byte
 * cache-line group.  PF_OFFS prefetches 256 bytes (one full loop
 * iteration) ahead of the current position.
 *
 * PF0..PF4  - non-temporal prefetch from p1..p5 (prefetchnta bypasses
 *             the cache hierarchy's lower levels, avoiding pollution).
 * LD/ST     - 16-byte aligned load/store of chunk x to/from %xmm<y>
 *             on the destination p1.  NOTE(review): movaps faults on
 *             unaligned addresses, so all buffers are assumed 16-byte
 *             aligned -- guaranteed by the callers' page-sized buffers,
 *             presumably; confirm against xor_impl.h users.
 * XO1..XO4  - XOR chunk x of p2..p5 into %xmm<y>.
 * NOP       - placeholder so BLK64 can skip the prefetch slot.
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)

/*
 * One prefetch plus four 16-byte operations = one 64-byte chunk.
 * Used by the *_pf64 variants, which issue exactly one prefetch per
 * 64 bytes per source instead of the denser schedule below.
 */
#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)

/*
 * xor_sse_2 - p1[i] ^= p2[i] over @bytes bytes.
 *
 * Processes 256 bytes per loop iteration (lines = bytes >> 8), four
 * XMM registers per 64-byte BLOCK, prefetching 256 bytes ahead.
 * @bytes is assumed to be a non-zero multiple of 256; any remainder
 * is silently ignored.  Caller must own the FPU (kernel_fpu_begin(),
 * done by xor_gen_sse() below).
 */
static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetcher for the first iteration's p1 data. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/*
 * xor_sse_2_pf64 - as xor_sse_2, but with the sparser "one prefetch
 * per 64-byte chunk per source" schedule (BLK64).  No priming
 * prefetches are issued before the loop in the pf64 variants.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/*
 * xor_sse_3 - p1[i] ^= p2[i] ^ p3[i].  Same 256-bytes-per-iteration
 * structure as xor_sse_2; source prefetches are interleaved between
 * the loads and XORs to hide memory latency.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetcher for the first iteration's p1 data. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/* xor_sse_3_pf64 - three-source XOR, one prefetch per 64-byte chunk. */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/* xor_sse_4 - p1[i] ^= p2[i] ^ p3[i] ^ p4[i]. */
static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetcher for the first iteration's p1 data. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/* xor_sse_4_pf64 - four-source XOR, one prefetch per 64-byte chunk. */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/* xor_sse_5 - p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i]. */
static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
	  const unsigned long * __restrict p2,
	  const unsigned long * __restrict p3,
	  const unsigned long * __restrict p4,
	  const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetcher for the first iteration's p1 data. */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/* xor_sse_5_pf64 - five-source XOR, one prefetch per 64-byte chunk. */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 8;

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(PF4, XO4, i)	\
	BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");
}

/*
 * NOTE(review): DO_XOR_BLOCKS presumably expands (from xor_impl.h) to a
 * dispatcher named xor_gen_<name>() that selects among the 2..5-source
 * kernels by src_cnt -- confirm against xor_impl.h.
 */
DO_XOR_BLOCKS(sse_inner, xor_sse_2, xor_sse_3, xor_sse_4, xor_sse_5);

/*
 * Public entry point for the "sse" template: bracket the raw SSE
 * kernels with kernel_fpu_begin()/kernel_fpu_end() so XMM state is
 * saved/restored around their use.
 */
static void xor_gen_sse(void *dest, void **srcs, unsigned int src_cnt,
			unsigned int bytes)
{
	kernel_fpu_begin();
	xor_gen_sse_inner(dest, srcs, src_cnt, bytes);
	kernel_fpu_end();
}

struct xor_block_template xor_block_sse = {
	.name = "sse",
	.xor_gen = xor_gen_sse,
};

DO_XOR_BLOCKS(sse_pf64_inner, xor_sse_2_pf64, xor_sse_3_pf64, xor_sse_4_pf64,
	      xor_sse_5_pf64);

/* Entry point for the prefetch-per-64-bytes variant; same FPU bracketing. */
static void xor_gen_sse_pf64(void *dest, void **srcs, unsigned int src_cnt,
			     unsigned int bytes)
{
	kernel_fpu_begin();
	xor_gen_sse_pf64_inner(dest, srcs, src_cnt, bytes);
	kernel_fpu_end();
}

struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.xor_gen = xor_gen_sse_pf64,
};