// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Optimized XOR parity functions for MMX.
 *
 * Copyright (C) 1998 Ingo Molnar.
 */
#include <asm/fpu/api.h>
#include "xor_impl.h"
#include "xor_arch.h"

/*
 * One-instruction asm fragments used by the BLOCK() macros of the
 * "pII" routines below.  "x" is a quadword index within the current
 * 128-byte line, "y" is the MMX register number (0..3 here).
 * Asm operand %1 is the destination buffer p1; %2..%5 are the source
 * buffers p2..p5.
 */
#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"	/* mm<y> = p1[x] */
#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"	/* p1[x] = mm<y> */
#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"	/* mm<y> ^= p2[x] */
#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"	/* mm<y> ^= p3[x] */
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"	/* mm<y> ^= p4[x] */
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"	/* mm<y> ^= p5[x] */

/*
 * p1 ^= p2 over "bytes" bytes.  Each loop iteration handles one
 * 128-byte line (16 quadwords) with four MMX registers in flight;
 * loads are issued ahead of the matching xor/store pairs.
 *
 * Only whole 128-byte lines are processed (bytes >> 7); any remainder
 * is silently ignored.
 * NOTE(review): the decl/jnz loop is post-test, so the caller must
 * guarantee bytes >= 128 (lines != 0) — confirm at the call sites.
 * MMX is used, so the caller must own the FPU; see the
 * kernel_fpu_begin()/kernel_fpu_end() wrappers at the end of this
 * file.  The 32-bit "addl" pointer increments make this i386-only.
 */
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	ST(i, 0) \
	XO1(i+1, 1) \
	ST(i+1, 1) \
	XO1(i + 2, 2) \
	ST(i + 2, 2) \
	XO1(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* advance both pointers by one 128-byte line, count down */
	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");
}

/*
 * p1 ^= p2 ^ p3 over "bytes" bytes; same 128-byte-line structure and
 * caller requirements as xor_pII_mmx_2().
 */
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	ST(i, 0) \
	XO2(i + 1, 1) \
	ST(i + 1, 1) \
	XO2(i + 2, 2) \
	ST(i + 2, 2) \
	XO2(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");
}

/*
 * p1 ^= p2 ^ p3 ^ p4 over "bytes" bytes; same 128-byte-line structure
 * and caller requirements as xor_pII_mmx_2().
 */
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	ST(i, 0) \
	XO3(i + 1, 1) \
	ST(i + 1, 1) \
	XO3(i + 2, 2) \
	ST(i + 2, 2) \
	XO3(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");
}


/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over "bytes" bytes; same 128-byte-line
 * structure and caller requirements as xor_pII_mmx_2().
 *
 * p4 and p5 are passed as input-only asm operands even though the asm
 * advances them with "addl" — see the two empty asm() barriers below
 * for why this is safe despite lying to the compiler.
 */
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4,
	      const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 7;	/* number of 128-byte lines */

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1. */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	XO3(i + 1, 1) \
	XO3(i + 2, 2) \
	XO3(i + 3, 3) \
	XO4(i, 0) \
	ST(i, 0) \
	XO4(i + 1, 1) \
	ST(i + 1, 1) \
	XO4(i + 2, 2) \
	ST(i + 2, 2) \
	XO4(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value. */
	asm("" : "=r" (p4), "=r" (p5));
}

/* The LD/ST/XO* fragments and BLOCK are private to the pII routines. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

/*
 * p1 ^= p2 over "bytes" bytes — the "p5" (Pentium-classic) variants
 * below use fully hand-written, differently scheduled asm and work on
 * 64-byte lines (bytes >> 6; 8 quadwords, all eight MMX registers).
 *
 * Same caller requirements as the pII routines: whole lines only,
 * lines != 0 (post-test decl/jnz), FPU owned, i386-only "addl".
 */
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte lines */

	asm volatile(
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");
}

/*
 * p1 ^= p2 ^ p3 over "bytes" bytes; 64-byte lines, same caller
 * requirements as xor_p5_mmx_2().
 */
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte lines */

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );
}

/*
 * p1 ^= p2 ^ p3 ^ p4 over "bytes" bytes; 64-byte lines, same caller
 * requirements as xor_p5_mmx_2().
 */
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte lines */

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");
}

/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5 over "bytes" bytes; 64-byte lines, same
 * caller requirements as xor_p5_mmx_2().
 *
 * As in xor_pII_mmx_5(), p4/p5 are input-only asm operands although
 * the asm advances them — see the empty asm() barriers.
 */
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4,
	     const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 6;	/* number of 64-byte lines */

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1. */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value. */
	asm("" : "=r" (p4), "=r" (p5));
}

/*
 * DO_XOR_BLOCKS (from xor_impl.h) presumably generates a
 * xor_gen_<name>() dispatcher over the 2..5-source worker functions —
 * verify against xor_impl.h.
 */
DO_XOR_BLOCKS(pII_mmx_inner, xor_pII_mmx_2, xor_pII_mmx_3, xor_pII_mmx_4,
	      xor_pII_mmx_5);

/*
 * Public entry point: wrap the MMX-using dispatcher in
 * kernel_fpu_begin()/kernel_fpu_end() so the MMX register state is
 * saved/restored around the asm above.
 */
static void xor_gen_pII_mmx(void *dest, void **srcs, unsigned int src_cnt,
			    unsigned int bytes)
{
	kernel_fpu_begin();
	xor_gen_pII_mmx_inner(dest, srcs, src_cnt, bytes);
	kernel_fpu_end();
}

/* Template registered with the xor-speed framework under "pII_mmx". */
struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.xor_gen = xor_gen_pII_mmx,
};

DO_XOR_BLOCKS(p5_mmx_inner, xor_p5_mmx_2, xor_p5_mmx_3, xor_p5_mmx_4,
	      xor_p5_mmx_5);

/* Same FPU-guarded wrapper for the "p5" variants. */
static void xor_gen_p5_mmx(void *dest, void **srcs, unsigned int src_cnt,
			   unsigned int bytes)
{
	kernel_fpu_begin();
	xor_gen_p5_mmx_inner(dest, srcs, src_cnt, bytes);
	kernel_fpu_end();
}

/* Template registered with the xor-speed framework under "p5_mmx". */
struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.xor_gen = xor_gen_p5_mmx,
};