#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/i387.h>

#define ALIGN32 __aligned(32)

#define YMM_SAVED_REGS 4

/*
 * Save/restore the four clobbered ymm registers around the XOR loops.
 * Callers provide the 32-byte aligned ymm_save buffer and the cr0
 * temporary; clts()/write_cr0() clear and restore CR0.TS so the AVX
 * instructions below do not fault with #NM.
 */
#define YMMS_SAVE \
do { \
	preempt_disable(); \
	cr0 = read_cr0(); \
	clts(); \
	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
} while (0);

#define YMMS_RESTORE \
do { \
	asm volatile("sfence" : : : "memory"); \
	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
	write_cr0(cr0); \
	preempt_enable(); \
} while (0);

/*
 * BLOCK16() expands to sixteen BLOCK()s of 32 bytes each, so each loop
 * iteration below handles 512 bytes (hence lines = bytes >> 9).
 */
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
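/*
 * XOR one 32-byte chunk each of p1, p2 and p3 into p0 via register
 * ymm<reg>; vmovdqa requires the buffers to be 32-byte aligned.
 */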
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	YMMS_RESTORE
}

static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(cpu_has_avx ? &xor_block_avx : FASTEST)

#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif