/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */

#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};

static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	return boot_cpu_has(X86_FEATURE_MMX) &&
		boot_cpu_has(X86_FEATURE_FXSR) &&
		boot_cpu_has(X86_FEATURE_XMM) &&
		boot_cpu_has(X86_FEATURE_XMM2);
}

/*
 * Plain SSE2 implementation
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4");	/* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 SSE2 implementation
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
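	/*
	 * xmm0 holds 0x1d in every byte: the low eight bits of the RAID-6
	 * generator polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d).  In the
	 * loops of these routines, each pcmpgtb/paddb/pand/pxor group
	 * multiplies the running Q syndrome by 2 in GF(2^8): pcmpgtb against
	 * a zeroed register builds a 0xff mask of bytes whose top bit is set,
	 * paddb doubles every byte, and the masked XOR with 0x1d performs the
	 * polynomial reduction.
	 */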
116 asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ 117 asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ 118 119 /* We uniformly assume a single prefetch covers at least 32 bytes */ 120 for ( d = 0 ; d < bytes ; d += 32 ) { 121 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); 122 asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ 123 asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ 124 asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ 125 asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ 126 for ( z = z0-1 ; z >= 0 ; z-- ) { 127 asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); 128 asm volatile("pcmpgtb %xmm4,%xmm5"); 129 asm volatile("pcmpgtb %xmm6,%xmm7"); 130 asm volatile("paddb %xmm4,%xmm4"); 131 asm volatile("paddb %xmm6,%xmm6"); 132 asm volatile("pand %xmm0,%xmm5"); 133 asm volatile("pand %xmm0,%xmm7"); 134 asm volatile("pxor %xmm5,%xmm4"); 135 asm volatile("pxor %xmm7,%xmm6"); 136 asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); 137 asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); 138 asm volatile("pxor %xmm5,%xmm2"); 139 asm volatile("pxor %xmm7,%xmm3"); 140 asm volatile("pxor %xmm5,%xmm4"); 141 asm volatile("pxor %xmm7,%xmm6"); 142 asm volatile("pxor %xmm5,%xmm5"); 143 asm volatile("pxor %xmm7,%xmm7"); 144 } 145 asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); 146 asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); 147 asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); 148 asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); 149 } 150 151 asm volatile("sfence" : : : "memory"); 152 kernel_fpu_end(); 153 } 154 155 const struct raid6_calls raid6_sse2x2 = { 156 raid6_sse22_gen_syndrome, 157 raid6_have_sse2, 158 "sse2x2", 159 1 /* Has cache hints */ 160 }; 161 162 #endif 163 164 #if defined(__x86_64__) && !defined(__arch_um__) 165 166 /* 167 * Unrolled-by-4 SSE2 implementation 168 */ 169 static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) 170 { 171 u8 **dptr = (u8 **)ptrs; 172 u8 *p, *q; 173 int d, z, z0; 174 175 z0 = disks - 3; /* Highest data disk */ 176 p = dptr[z0+1]; /* XOR parity */ 177 q = dptr[z0+2]; /* RS syndrome */ 178 179 kernel_fpu_begin(); 180 181 asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); 182 asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ 183 asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ 184 asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ 185 asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ 186 asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ 187 asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ 188 asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ 189 asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ 190 asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ 191 asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ 192 asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ 193 asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ 194 195 for ( d = 0 ; d < bytes ; d += 64 ) { 196 for ( z = z0 ; z >= 0 ; z-- ) { 197 /* The second prefetch seems to improve performance... 
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};

#endif