#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x)		" prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x, y)	" xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"

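/*
 * Each xor_sse_N() below XORs N buffers together, accumulating the
 * result into p1 (p1 ^= p2 [^ p3 ...]).  One loop iteration handles
 * 256 bytes: four BLOCK()s of four 16-byte movaps/xorps operations,
 * which is why the byte count is divided by 256 ("bytes >> 8").  The
 * movaps loads/stores require 16-byte aligned buffers, prefetchnta
 * pulls in upcoming data with a non-temporal hint to limit cache
 * pollution, and kernel_fpu_begin()/kernel_fpu_end() make it safe to
 * clobber the SSE registers from kernel context.
 */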
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		LD(i, 0)	\
		LD(i + 1, 1)	\
		PF1(i)		\
		PF1(i + 2)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();
	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)		\
		PF1(i + 2)	\
		LD(i, 0)	\
		LD(i + 1, 1)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF2(i)		\
		PF2(i + 2)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		XO2(i, 0)	\
		XO2(i + 1, 1)	\
		XO2(i + 2, 2)	\
		XO2(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");
	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)		\
		PF1(i + 2)	\
		LD(i, 0)	\
		LD(i + 1, 1)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF2(i)		\
		PF2(i + 2)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		PF3(i)		\
		PF3(i + 2)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO2(i, 0)	\
		XO2(i + 1, 1)	\
		XO2(i + 2, 2)	\
		XO2(i + 3, 3)	\
		XO3(i, 0)	\
		XO3(i + 1, 1)	\
		XO3(i + 2, 2)	\
		XO3(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" addq %[inc], %[p4] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory" );

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)		\
		PF1(i + 2)	\
		LD(i, 0)	\
		LD(i + 1, 1)	\
		LD(i + 2, 2)	\
		LD(i + 3, 3)	\
		PF2(i)		\
		PF2(i + 2)	\
		XO1(i, 0)	\
		XO1(i + 1, 1)	\
		XO1(i + 2, 2)	\
		XO1(i + 3, 3)	\
		PF3(i)		\
		PF3(i + 2)	\
		XO2(i, 0)	\
		XO2(i + 1, 1)	\
		XO2(i + 2, 2)	\
		XO2(i + 3, 3)	\
		PF4(i)		\
		PF4(i + 2)	\
		PF0(i + 4)	\
		PF0(i + 6)	\
		XO3(i, 0)	\
		XO3(i + 1, 1)	\
		XO3(i + 2, 2)	\
		XO3(i + 3, 3)	\
		XO4(i, 0)	\
		XO4(i + 1, 1)	\
		XO4(i + 2, 2)	\
		XO4(i + 3, 3)	\
		ST(i, 0)	\
		ST(i + 1, 1)	\
		ST(i + 2, 2)	\
		ST(i + 3, 3)	\


	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addq %[inc], %[p1] ;\n"
	" addq %[inc], %[p2] ;\n"
	" addq %[inc], %[p3] ;\n"
	" addq %[inc], %[p4] ;\n"
	" addq %[inc], %[p5] ;\n"
	" decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};


/* Also try the AVX routines */
#include <asm/xor_avx.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	AVX_XOR_SPEED;				\
	xor_speed(&xor_block_sse);		\
} while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
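/*
 * AVX_SELECT() (from <asm/xor_avx.h>) resolves to the AVX template when
 * AVX is usable on this CPU and otherwise returns its argument, so the
 * benchmarked FASTEST result is deliberately ignored here in favor of
 * xor_block_sse, as forced above.
 */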
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */