/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

#define BLOCK4(i) \
                BLOCK(32 * i, 0) \
                BLOCK(32 * (i + 1), 1) \
                BLOCK(32 * (i + 2), 2) \
                BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
                BLOCK4(0) \
                BLOCK4(4) \
                BLOCK4(8) \
                BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
                      const unsigned long * __restrict p1)
{
        unsigned long lines = bytes >> 9;

        kernel_fpu_begin();

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
        }

        kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
                      const unsigned long * __restrict p1,
                      const unsigned long * __restrict p2)
{
        unsigned long lines = bytes >> 9;

        kernel_fpu_begin();

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
        }

        kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
                      const unsigned long * __restrict p1,
                      const unsigned long * __restrict p2,
                      const unsigned long * __restrict p3)
{
        unsigned long lines = bytes >> 9;

        kernel_fpu_begin();

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16();

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
        }

        kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
                      const unsigned long * __restrict p1,
                      const unsigned long * __restrict p2,
                      const unsigned long * __restrict p3,
                      const unsigned long * __restrict p4)
{
        unsigned long lines = bytes >> 9;

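        /*
         * As in the variants above, each loop iteration XORs one 512-byte
         * line (16 blocks of 32 bytes, i.e. one YMM register each), which
         * is why lines = bytes >> 9.  kernel_fpu_begin()/kernel_fpu_end()
         * save and restore the FPU/SIMD state so the YMM registers may be
         * used in kernel context.
         */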
        kernel_fpu_begin();

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p3[i / sizeof(*p3)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
                p4 = (unsigned long *)((uintptr_t)p4 + 512);
        }

        kernel_fpu_end();
}

static struct xor_block_template xor_block_avx = {
        .name = "avx",
        .do_2 = xor_avx_2,
        .do_3 = xor_avx_3,
        .do_4 = xor_avx_4,
        .do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
        if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
                xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
        (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif
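
/*
 * Illustrative sketch (not part of the upstream header): a plain-C reference
 * for what xor_avx_2() computes, i.e. p0[i] ^= p1[i] across a region whose
 * size is a multiple of 512 bytes.  The name xor_scalar_2 is made up for
 * illustration; the kernel's portable fallbacks live in
 * include/asm-generic/xor.h.  Guarded out with #if 0 so the header's
 * behaviour is unchanged.
 */
#if 0
static void xor_scalar_2(unsigned long bytes, unsigned long * __restrict p0,
                         const unsigned long * __restrict p1)
{
        unsigned long i;

        /* One unsigned long at a time; bytes is assumed a multiple of 512. */
        for (i = 0; i < bytes / sizeof(*p0); i++)
                p0[i] ^= p1[i];
}
#endif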