1af1a8899SThomas Gleixner /* SPDX-License-Identifier: GPL-2.0-or-later */ 249502766SLevin, Alexander (Sasha Levin) #ifndef _ASM_X86_XOR_H 3e8f6e3f8SJan Beulich #define _ASM_X86_XOR_H 4e8f6e3f8SJan Beulich 5e8f6e3f8SJan Beulich /* 6e8f6e3f8SJan Beulich * Optimized RAID-5 checksumming functions for SSE. 7e8f6e3f8SJan Beulich */ 8e8f6e3f8SJan Beulich 9e8f6e3f8SJan Beulich /* 10e8f6e3f8SJan Beulich * Cache avoiding checksumming functions utilizing KNI instructions 11e8f6e3f8SJan Beulich * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) 12e8f6e3f8SJan Beulich */ 13e8f6e3f8SJan Beulich 14e8f6e3f8SJan Beulich /* 15e8f6e3f8SJan Beulich * Based on 16e8f6e3f8SJan Beulich * High-speed RAID5 checksumming functions utilizing SSE instructions. 17e8f6e3f8SJan Beulich * Copyright (C) 1998 Ingo Molnar. 18e8f6e3f8SJan Beulich */ 19e8f6e3f8SJan Beulich 20e8f6e3f8SJan Beulich /* 21e8f6e3f8SJan Beulich * x86-64 changes / gcc fixes from Andi Kleen. 22e8f6e3f8SJan Beulich * Copyright 2002 Andi Kleen, SuSE Labs. 23e8f6e3f8SJan Beulich * 24e8f6e3f8SJan Beulich * This hasn't been optimized for the hammer yet, but there are likely 25e8f6e3f8SJan Beulich * no advantages to be gotten from x86-64 here anyways. 26e8f6e3f8SJan Beulich */ 27e8f6e3f8SJan Beulich 28df6b35f4SIngo Molnar #include <asm/fpu/api.h> 29e8f6e3f8SJan Beulich 30e8f6e3f8SJan Beulich #ifdef CONFIG_X86_32 31e8f6e3f8SJan Beulich /* reduce register pressure */ 32e8f6e3f8SJan Beulich # define XOR_CONSTANT_CONSTRAINT "i" 33f8561296SVegard Nossum #else 34e8f6e3f8SJan Beulich # define XOR_CONSTANT_CONSTRAINT "re" 35e8f6e3f8SJan Beulich #endif 36e8f6e3f8SJan Beulich 37e8f6e3f8SJan Beulich #define OFFS(x) "16*("#x")" 38e8f6e3f8SJan Beulich #define PF_OFFS(x) "256+16*("#x")" 39e8f6e3f8SJan Beulich #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" 40e8f6e3f8SJan Beulich #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" 41e8f6e3f8SJan Beulich #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" 42e8f6e3f8SJan Beulich #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" 43e8f6e3f8SJan Beulich #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" 44e8f6e3f8SJan Beulich #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" 45e8f6e3f8SJan Beulich #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" 46e8f6e3f8SJan Beulich #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" 47e8f6e3f8SJan Beulich #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" 48e8f6e3f8SJan Beulich #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" 49e8f6e3f8SJan Beulich #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" 50f317820cSJan Beulich #define NOP(x) 51f317820cSJan Beulich 52f317820cSJan Beulich #define BLK64(pf, op, i) \ 53f317820cSJan Beulich pf(i) \ 54f317820cSJan Beulich op(i, 0) \ 55f317820cSJan Beulich op(i + 1, 1) \ 56f317820cSJan Beulich op(i + 2, 2) \ 57f317820cSJan Beulich op(i + 3, 3) 58e8f6e3f8SJan Beulich 59e8f6e3f8SJan Beulich static void 60*297565aaSArd Biesheuvel xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, 61*297565aaSArd Biesheuvel const unsigned long * __restrict p2) 62e8f6e3f8SJan Beulich { 63e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8; 64e8f6e3f8SJan Beulich 65e8f6e3f8SJan Beulich kernel_fpu_begin(); 66e8f6e3f8SJan Beulich 67e8f6e3f8SJan Beulich asm volatile( 68e8f6e3f8SJan Beulich #undef BLOCK 69e8f6e3f8SJan Beulich #define BLOCK(i) \ 70e8f6e3f8SJan Beulich LD(i, 0) \ 71e8f6e3f8SJan Beulich LD(i + 1, 1) \ 72e8f6e3f8SJan Beulich PF1(i) \ 73e8f6e3f8SJan Beulich PF1(i + 2) \ 74e8f6e3f8SJan Beulich LD(i + 2, 2) \ 75e8f6e3f8SJan Beulich LD(i + 3, 3) \ 76e8f6e3f8SJan Beulich PF0(i + 4) \ 77e8f6e3f8SJan Beulich PF0(i + 6) \ 78e8f6e3f8SJan Beulich XO1(i, 0) \ 79e8f6e3f8SJan Beulich XO1(i + 1, 1) \ 80e8f6e3f8SJan Beulich XO1(i + 2, 2) \ 81e8f6e3f8SJan Beulich XO1(i + 3, 3) \ 82e8f6e3f8SJan Beulich ST(i, 0) \ 83e8f6e3f8SJan Beulich ST(i + 1, 1) \ 84e8f6e3f8SJan Beulich ST(i + 2, 2) \ 85e8f6e3f8SJan Beulich ST(i + 3, 3) \ 86e8f6e3f8SJan Beulich 87e8f6e3f8SJan Beulich 88e8f6e3f8SJan Beulich PF0(0) 89e8f6e3f8SJan Beulich PF0(2) 90e8f6e3f8SJan Beulich 91e8f6e3f8SJan Beulich " .align 32 ;\n" 92e8f6e3f8SJan Beulich " 1: ;\n" 93e8f6e3f8SJan Beulich 94e8f6e3f8SJan Beulich BLOCK(0) 95e8f6e3f8SJan Beulich BLOCK(4) 96e8f6e3f8SJan Beulich BLOCK(8) 97e8f6e3f8SJan Beulich BLOCK(12) 98e8f6e3f8SJan Beulich 99e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n" 100e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n" 101e8f6e3f8SJan Beulich " dec %[cnt] ;\n" 102e8f6e3f8SJan Beulich " jnz 1b ;\n" 103e8f6e3f8SJan Beulich : [cnt] "+r" (lines), 104e8f6e3f8SJan Beulich [p1] "+r" (p1), [p2] "+r" (p2) 105e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 106e8f6e3f8SJan Beulich : "memory"); 107e8f6e3f8SJan Beulich 108e8f6e3f8SJan Beulich kernel_fpu_end(); 109e8f6e3f8SJan Beulich } 110e8f6e3f8SJan Beulich 111e8f6e3f8SJan Beulich static void 112*297565aaSArd Biesheuvel xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, 113*297565aaSArd Biesheuvel const unsigned long * __restrict p2) 114f317820cSJan Beulich { 115f317820cSJan Beulich unsigned long lines = bytes >> 8; 116f317820cSJan Beulich 117f317820cSJan Beulich kernel_fpu_begin(); 118f317820cSJan Beulich 119f317820cSJan Beulich asm volatile( 120f317820cSJan Beulich #undef BLOCK 121f317820cSJan Beulich #define BLOCK(i) \ 122f317820cSJan Beulich BLK64(PF0, LD, i) \ 123f317820cSJan Beulich BLK64(PF1, XO1, i) \ 124f317820cSJan Beulich BLK64(NOP, ST, i) \ 125f317820cSJan Beulich 126f317820cSJan Beulich " .align 32 ;\n" 127f317820cSJan Beulich " 1: ;\n" 128f317820cSJan Beulich 129f317820cSJan Beulich BLOCK(0) 130f317820cSJan Beulich BLOCK(4) 131f317820cSJan Beulich BLOCK(8) 132f317820cSJan Beulich BLOCK(12) 133f317820cSJan Beulich 134f317820cSJan Beulich " add %[inc], %[p1] ;\n" 135f317820cSJan Beulich " add %[inc], %[p2] ;\n" 136f317820cSJan Beulich " dec %[cnt] ;\n" 137f317820cSJan Beulich " jnz 1b ;\n" 138f317820cSJan Beulich : [cnt] "+r" (lines), 139f317820cSJan Beulich [p1] "+r" (p1), [p2] "+r" (p2) 140f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 141f317820cSJan Beulich : "memory"); 142f317820cSJan Beulich 143f317820cSJan Beulich kernel_fpu_end(); 144f317820cSJan Beulich } 145f317820cSJan Beulich 146f317820cSJan Beulich static void 147*297565aaSArd Biesheuvel xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, 148*297565aaSArd Biesheuvel const unsigned long * __restrict p2, 149*297565aaSArd Biesheuvel const unsigned long * __restrict p3) 150e8f6e3f8SJan Beulich { 151e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8; 152e8f6e3f8SJan Beulich 153e8f6e3f8SJan Beulich kernel_fpu_begin(); 154e8f6e3f8SJan Beulich 155e8f6e3f8SJan Beulich asm volatile( 156e8f6e3f8SJan Beulich #undef BLOCK 157e8f6e3f8SJan Beulich #define BLOCK(i) \ 158e8f6e3f8SJan Beulich PF1(i) \ 159e8f6e3f8SJan Beulich PF1(i + 2) \ 160e8f6e3f8SJan Beulich LD(i, 0) \ 161e8f6e3f8SJan Beulich LD(i + 1, 1) \ 162e8f6e3f8SJan Beulich LD(i + 2, 2) \ 163e8f6e3f8SJan Beulich LD(i + 3, 3) \ 164e8f6e3f8SJan Beulich PF2(i) \ 165e8f6e3f8SJan Beulich PF2(i + 2) \ 166e8f6e3f8SJan Beulich PF0(i + 4) \ 167e8f6e3f8SJan Beulich PF0(i + 6) \ 168e8f6e3f8SJan Beulich XO1(i, 0) \ 169e8f6e3f8SJan Beulich XO1(i + 1, 1) \ 170e8f6e3f8SJan Beulich XO1(i + 2, 2) \ 171e8f6e3f8SJan Beulich XO1(i + 3, 3) \ 172e8f6e3f8SJan Beulich XO2(i, 0) \ 173e8f6e3f8SJan Beulich XO2(i + 1, 1) \ 174e8f6e3f8SJan Beulich XO2(i + 2, 2) \ 175e8f6e3f8SJan Beulich XO2(i + 3, 3) \ 176e8f6e3f8SJan Beulich ST(i, 0) \ 177e8f6e3f8SJan Beulich ST(i + 1, 1) \ 178e8f6e3f8SJan Beulich ST(i + 2, 2) \ 179e8f6e3f8SJan Beulich ST(i + 3, 3) \ 180e8f6e3f8SJan Beulich 181e8f6e3f8SJan Beulich 182e8f6e3f8SJan Beulich PF0(0) 183e8f6e3f8SJan Beulich PF0(2) 184e8f6e3f8SJan Beulich 185e8f6e3f8SJan Beulich " .align 32 ;\n" 186e8f6e3f8SJan Beulich " 1: ;\n" 187e8f6e3f8SJan Beulich 188e8f6e3f8SJan Beulich BLOCK(0) 189e8f6e3f8SJan Beulich BLOCK(4) 190e8f6e3f8SJan Beulich BLOCK(8) 191e8f6e3f8SJan Beulich BLOCK(12) 192e8f6e3f8SJan Beulich 193e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n" 194e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n" 195e8f6e3f8SJan Beulich " add %[inc], %[p3] ;\n" 196e8f6e3f8SJan Beulich " dec %[cnt] ;\n" 197e8f6e3f8SJan Beulich " jnz 1b ;\n" 198e8f6e3f8SJan Beulich : [cnt] "+r" (lines), 199e8f6e3f8SJan Beulich [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 200e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 201e8f6e3f8SJan Beulich : "memory"); 202e8f6e3f8SJan Beulich 203e8f6e3f8SJan Beulich kernel_fpu_end(); 204e8f6e3f8SJan Beulich } 205e8f6e3f8SJan Beulich 206e8f6e3f8SJan Beulich static void 207*297565aaSArd Biesheuvel xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, 208*297565aaSArd Biesheuvel const unsigned long * __restrict p2, 209*297565aaSArd Biesheuvel const unsigned long * __restrict p3) 210f317820cSJan Beulich { 211f317820cSJan Beulich unsigned long lines = bytes >> 8; 212f317820cSJan Beulich 213f317820cSJan Beulich kernel_fpu_begin(); 214f317820cSJan Beulich 215f317820cSJan Beulich asm volatile( 216f317820cSJan Beulich #undef BLOCK 217f317820cSJan Beulich #define BLOCK(i) \ 218f317820cSJan Beulich BLK64(PF0, LD, i) \ 219f317820cSJan Beulich BLK64(PF1, XO1, i) \ 220f317820cSJan Beulich BLK64(PF2, XO2, i) \ 221f317820cSJan Beulich BLK64(NOP, ST, i) \ 222f317820cSJan Beulich 223f317820cSJan Beulich " .align 32 ;\n" 224f317820cSJan Beulich " 1: ;\n" 225f317820cSJan Beulich 226f317820cSJan Beulich BLOCK(0) 227f317820cSJan Beulich BLOCK(4) 228f317820cSJan Beulich BLOCK(8) 229f317820cSJan Beulich BLOCK(12) 230f317820cSJan Beulich 231f317820cSJan Beulich " add %[inc], %[p1] ;\n" 232f317820cSJan Beulich " add %[inc], %[p2] ;\n" 233f317820cSJan Beulich " add %[inc], %[p3] ;\n" 234f317820cSJan Beulich " dec %[cnt] ;\n" 235f317820cSJan Beulich " jnz 1b ;\n" 236f317820cSJan Beulich : [cnt] "+r" (lines), 237f317820cSJan Beulich [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 238f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 239f317820cSJan Beulich : "memory"); 240f317820cSJan Beulich 241f317820cSJan Beulich kernel_fpu_end(); 242f317820cSJan Beulich } 243f317820cSJan Beulich 244f317820cSJan Beulich static void 245*297565aaSArd Biesheuvel xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, 246*297565aaSArd Biesheuvel const unsigned long * __restrict p2, 247*297565aaSArd Biesheuvel const unsigned long * __restrict p3, 248*297565aaSArd Biesheuvel const unsigned long * __restrict p4) 249e8f6e3f8SJan Beulich { 250e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8; 251e8f6e3f8SJan Beulich 252e8f6e3f8SJan Beulich kernel_fpu_begin(); 253e8f6e3f8SJan Beulich 254e8f6e3f8SJan Beulich asm volatile( 255e8f6e3f8SJan Beulich #undef BLOCK 256e8f6e3f8SJan Beulich #define BLOCK(i) \ 257e8f6e3f8SJan Beulich PF1(i) \ 258e8f6e3f8SJan Beulich PF1(i + 2) \ 259e8f6e3f8SJan Beulich LD(i, 0) \ 260e8f6e3f8SJan Beulich LD(i + 1, 1) \ 261e8f6e3f8SJan Beulich LD(i + 2, 2) \ 262e8f6e3f8SJan Beulich LD(i + 3, 3) \ 263e8f6e3f8SJan Beulich PF2(i) \ 264e8f6e3f8SJan Beulich PF2(i + 2) \ 265e8f6e3f8SJan Beulich XO1(i, 0) \ 266e8f6e3f8SJan Beulich XO1(i + 1, 1) \ 267e8f6e3f8SJan Beulich XO1(i + 2, 2) \ 268e8f6e3f8SJan Beulich XO1(i + 3, 3) \ 269e8f6e3f8SJan Beulich PF3(i) \ 270e8f6e3f8SJan Beulich PF3(i + 2) \ 271e8f6e3f8SJan Beulich PF0(i + 4) \ 272e8f6e3f8SJan Beulich PF0(i + 6) \ 273e8f6e3f8SJan Beulich XO2(i, 0) \ 274e8f6e3f8SJan Beulich XO2(i + 1, 1) \ 275e8f6e3f8SJan Beulich XO2(i + 2, 2) \ 276e8f6e3f8SJan Beulich XO2(i + 3, 3) \ 277e8f6e3f8SJan Beulich XO3(i, 0) \ 278e8f6e3f8SJan Beulich XO3(i + 1, 1) \ 279e8f6e3f8SJan Beulich XO3(i + 2, 2) \ 280e8f6e3f8SJan Beulich XO3(i + 3, 3) \ 281e8f6e3f8SJan Beulich ST(i, 0) \ 282e8f6e3f8SJan Beulich ST(i + 1, 1) \ 283e8f6e3f8SJan Beulich ST(i + 2, 2) \ 284e8f6e3f8SJan Beulich ST(i + 3, 3) \ 285e8f6e3f8SJan Beulich 286e8f6e3f8SJan Beulich 287e8f6e3f8SJan Beulich PF0(0) 288e8f6e3f8SJan Beulich PF0(2) 289e8f6e3f8SJan Beulich 290e8f6e3f8SJan Beulich " .align 32 ;\n" 291e8f6e3f8SJan Beulich " 1: ;\n" 292e8f6e3f8SJan Beulich 293e8f6e3f8SJan Beulich BLOCK(0) 294e8f6e3f8SJan Beulich BLOCK(4) 295e8f6e3f8SJan Beulich BLOCK(8) 296e8f6e3f8SJan Beulich BLOCK(12) 297e8f6e3f8SJan Beulich 298e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n" 299e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n" 300e8f6e3f8SJan Beulich " add %[inc], %[p3] ;\n" 301e8f6e3f8SJan Beulich " add %[inc], %[p4] ;\n" 302e8f6e3f8SJan Beulich " dec %[cnt] ;\n" 303e8f6e3f8SJan Beulich " jnz 1b ;\n" 304e8f6e3f8SJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1), 305e8f6e3f8SJan Beulich [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 306e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 307e8f6e3f8SJan Beulich : "memory"); 308e8f6e3f8SJan Beulich 309e8f6e3f8SJan Beulich kernel_fpu_end(); 310e8f6e3f8SJan Beulich } 311e8f6e3f8SJan Beulich 312e8f6e3f8SJan Beulich static void 313*297565aaSArd Biesheuvel xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, 314*297565aaSArd Biesheuvel const unsigned long * __restrict p2, 315*297565aaSArd Biesheuvel const unsigned long * __restrict p3, 316*297565aaSArd Biesheuvel const unsigned long * __restrict p4) 317f317820cSJan Beulich { 318f317820cSJan Beulich unsigned long lines = bytes >> 8; 319f317820cSJan Beulich 320f317820cSJan Beulich kernel_fpu_begin(); 321f317820cSJan Beulich 322f317820cSJan Beulich asm volatile( 323f317820cSJan Beulich #undef BLOCK 324f317820cSJan Beulich #define BLOCK(i) \ 325f317820cSJan Beulich BLK64(PF0, LD, i) \ 326f317820cSJan Beulich BLK64(PF1, XO1, i) \ 327f317820cSJan Beulich BLK64(PF2, XO2, i) \ 328f317820cSJan Beulich BLK64(PF3, XO3, i) \ 329f317820cSJan Beulich BLK64(NOP, ST, i) \ 330f317820cSJan Beulich 331f317820cSJan Beulich " .align 32 ;\n" 332f317820cSJan Beulich " 1: ;\n" 333f317820cSJan Beulich 334f317820cSJan Beulich BLOCK(0) 335f317820cSJan Beulich BLOCK(4) 336f317820cSJan Beulich BLOCK(8) 337f317820cSJan Beulich BLOCK(12) 338f317820cSJan Beulich 339f317820cSJan Beulich " add %[inc], %[p1] ;\n" 340f317820cSJan Beulich " add %[inc], %[p2] ;\n" 341f317820cSJan Beulich " add %[inc], %[p3] ;\n" 342f317820cSJan Beulich " add %[inc], %[p4] ;\n" 343f317820cSJan Beulich " dec %[cnt] ;\n" 344f317820cSJan Beulich " jnz 1b ;\n" 345f317820cSJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1), 346f317820cSJan Beulich [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 347f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 348f317820cSJan Beulich : "memory"); 349f317820cSJan Beulich 350f317820cSJan Beulich kernel_fpu_end(); 351f317820cSJan Beulich } 352f317820cSJan Beulich 353f317820cSJan Beulich static void 354*297565aaSArd Biesheuvel xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, 355*297565aaSArd Biesheuvel const unsigned long * __restrict p2, 356*297565aaSArd Biesheuvel const unsigned long * __restrict p3, 357*297565aaSArd Biesheuvel const unsigned long * __restrict p4, 358*297565aaSArd Biesheuvel const unsigned long * __restrict p5) 359e8f6e3f8SJan Beulich { 360e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8; 361e8f6e3f8SJan Beulich 362e8f6e3f8SJan Beulich kernel_fpu_begin(); 363e8f6e3f8SJan Beulich 364e8f6e3f8SJan Beulich asm volatile( 365e8f6e3f8SJan Beulich #undef BLOCK 366e8f6e3f8SJan Beulich #define BLOCK(i) \ 367e8f6e3f8SJan Beulich PF1(i) \ 368e8f6e3f8SJan Beulich PF1(i + 2) \ 369e8f6e3f8SJan Beulich LD(i, 0) \ 370e8f6e3f8SJan Beulich LD(i + 1, 1) \ 371e8f6e3f8SJan Beulich LD(i + 2, 2) \ 372e8f6e3f8SJan Beulich LD(i + 3, 3) \ 373e8f6e3f8SJan Beulich PF2(i) \ 374e8f6e3f8SJan Beulich PF2(i + 2) \ 375e8f6e3f8SJan Beulich XO1(i, 0) \ 376e8f6e3f8SJan Beulich XO1(i + 1, 1) \ 377e8f6e3f8SJan Beulich XO1(i + 2, 2) \ 378e8f6e3f8SJan Beulich XO1(i + 3, 3) \ 379e8f6e3f8SJan Beulich PF3(i) \ 380e8f6e3f8SJan Beulich PF3(i + 2) \ 381e8f6e3f8SJan Beulich XO2(i, 0) \ 382e8f6e3f8SJan Beulich XO2(i + 1, 1) \ 383e8f6e3f8SJan Beulich XO2(i + 2, 2) \ 384e8f6e3f8SJan Beulich XO2(i + 3, 3) \ 385e8f6e3f8SJan Beulich PF4(i) \ 386e8f6e3f8SJan Beulich PF4(i + 2) \ 387e8f6e3f8SJan Beulich PF0(i + 4) \ 388e8f6e3f8SJan Beulich PF0(i + 6) \ 389e8f6e3f8SJan Beulich XO3(i, 0) \ 390e8f6e3f8SJan Beulich XO3(i + 1, 1) \ 391e8f6e3f8SJan Beulich XO3(i + 2, 2) \ 392e8f6e3f8SJan Beulich XO3(i + 3, 3) \ 393e8f6e3f8SJan Beulich XO4(i, 0) \ 394e8f6e3f8SJan Beulich XO4(i + 1, 1) \ 395e8f6e3f8SJan Beulich XO4(i + 2, 2) \ 396e8f6e3f8SJan Beulich XO4(i + 3, 3) \ 397e8f6e3f8SJan Beulich ST(i, 0) \ 398e8f6e3f8SJan Beulich ST(i + 1, 1) \ 399e8f6e3f8SJan Beulich ST(i + 2, 2) \ 400e8f6e3f8SJan Beulich ST(i + 3, 3) \ 401e8f6e3f8SJan Beulich 402e8f6e3f8SJan Beulich 403e8f6e3f8SJan Beulich PF0(0) 404e8f6e3f8SJan Beulich PF0(2) 405e8f6e3f8SJan Beulich 406e8f6e3f8SJan Beulich " .align 32 ;\n" 407e8f6e3f8SJan Beulich " 1: ;\n" 408e8f6e3f8SJan Beulich 409e8f6e3f8SJan Beulich BLOCK(0) 410e8f6e3f8SJan Beulich BLOCK(4) 411e8f6e3f8SJan Beulich BLOCK(8) 412e8f6e3f8SJan Beulich BLOCK(12) 413e8f6e3f8SJan Beulich 414e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n" 415e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n" 416e8f6e3f8SJan Beulich " add %[inc], %[p3] ;\n" 417e8f6e3f8SJan Beulich " add %[inc], %[p4] ;\n" 418e8f6e3f8SJan Beulich " add %[inc], %[p5] ;\n" 419e8f6e3f8SJan Beulich " dec %[cnt] ;\n" 420e8f6e3f8SJan Beulich " jnz 1b ;\n" 421e8f6e3f8SJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 422e8f6e3f8SJan Beulich [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 423e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 424e8f6e3f8SJan Beulich : "memory"); 425e8f6e3f8SJan Beulich 426e8f6e3f8SJan Beulich kernel_fpu_end(); 427e8f6e3f8SJan Beulich } 428e8f6e3f8SJan Beulich 429f317820cSJan Beulich static void 430*297565aaSArd Biesheuvel xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, 431*297565aaSArd Biesheuvel const unsigned long * __restrict p2, 432*297565aaSArd Biesheuvel const unsigned long * __restrict p3, 433*297565aaSArd Biesheuvel const unsigned long * __restrict p4, 434*297565aaSArd Biesheuvel const unsigned long * __restrict p5) 435f317820cSJan Beulich { 436f317820cSJan Beulich unsigned long lines = bytes >> 8; 437f317820cSJan Beulich 438f317820cSJan Beulich kernel_fpu_begin(); 439f317820cSJan Beulich 440f317820cSJan Beulich asm volatile( 441f317820cSJan Beulich #undef BLOCK 442f317820cSJan Beulich #define BLOCK(i) \ 443f317820cSJan Beulich BLK64(PF0, LD, i) \ 444f317820cSJan Beulich BLK64(PF1, XO1, i) \ 445f317820cSJan Beulich BLK64(PF2, XO2, i) \ 446f317820cSJan Beulich BLK64(PF3, XO3, i) \ 447f317820cSJan Beulich BLK64(PF4, XO4, i) \ 448f317820cSJan Beulich BLK64(NOP, ST, i) \ 449f317820cSJan Beulich 450f317820cSJan Beulich " .align 32 ;\n" 451f317820cSJan Beulich " 1: ;\n" 452f317820cSJan Beulich 453f317820cSJan Beulich BLOCK(0) 454f317820cSJan Beulich BLOCK(4) 455f317820cSJan Beulich BLOCK(8) 456f317820cSJan Beulich BLOCK(12) 457f317820cSJan Beulich 458f317820cSJan Beulich " add %[inc], %[p1] ;\n" 459f317820cSJan Beulich " add %[inc], %[p2] ;\n" 460f317820cSJan Beulich " add %[inc], %[p3] ;\n" 461f317820cSJan Beulich " add %[inc], %[p4] ;\n" 462f317820cSJan Beulich " add %[inc], %[p5] ;\n" 463f317820cSJan Beulich " dec %[cnt] ;\n" 464f317820cSJan Beulich " jnz 1b ;\n" 465f317820cSJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), 466f317820cSJan Beulich [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) 467f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL) 468f317820cSJan Beulich : "memory"); 469f317820cSJan Beulich 470f317820cSJan Beulich kernel_fpu_end(); 471f317820cSJan Beulich } 472f317820cSJan Beulich 473f317820cSJan Beulich static struct xor_block_template xor_block_sse_pf64 = { 474f317820cSJan Beulich .name = "prefetch64-sse", 475f317820cSJan Beulich .do_2 = xor_sse_2_pf64, 476f317820cSJan Beulich .do_3 = xor_sse_3_pf64, 477f317820cSJan Beulich .do_4 = xor_sse_4_pf64, 478f317820cSJan Beulich .do_5 = xor_sse_5_pf64, 479f317820cSJan Beulich }; 480f317820cSJan Beulich 481e8f6e3f8SJan Beulich #undef LD 482e8f6e3f8SJan Beulich #undef XO1 483e8f6e3f8SJan Beulich #undef XO2 484e8f6e3f8SJan Beulich #undef XO3 485e8f6e3f8SJan Beulich #undef XO4 486e8f6e3f8SJan Beulich #undef ST 487f317820cSJan Beulich #undef NOP 488f317820cSJan Beulich #undef BLK64 489e8f6e3f8SJan Beulich #undef BLOCK 490e8f6e3f8SJan Beulich 491e8f6e3f8SJan Beulich #undef XOR_CONSTANT_CONSTRAINT 492e8f6e3f8SJan Beulich 493bb898558SAl Viro #ifdef CONFIG_X86_32 494a1ce3928SDavid Howells # include <asm/xor_32.h> 495bb898558SAl Viro #else 496a1ce3928SDavid Howells # include <asm/xor_64.h> 497bb898558SAl Viro #endif 498e8f6e3f8SJan Beulich 499f317820cSJan Beulich #define XOR_SELECT_TEMPLATE(FASTEST) \ 500f317820cSJan Beulich AVX_SELECT(FASTEST) 501f317820cSJan Beulich 502e8f6e3f8SJan Beulich #endif /* _ASM_X86_XOR_H */ 503