1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Optimized XOR parity functions for AVX 4 * 5 * Copyright (C) 2012 Intel Corporation 6 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> 7 * 8 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines 9 */ 10 #include <linux/compiler.h> 11 #include <asm/fpu/api.h> 12 #include "xor_impl.h" 13 #include "xor_arch.h" 14 15 #define BLOCK4(i) \ 16 BLOCK(32 * i, 0) \ 17 BLOCK(32 * (i + 1), 1) \ 18 BLOCK(32 * (i + 2), 2) \ 19 BLOCK(32 * (i + 3), 3) 20 21 #define BLOCK16() \ 22 BLOCK4(0) \ 23 BLOCK4(4) \ 24 BLOCK4(8) \ 25 BLOCK4(12) 26 27 static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0, 28 const unsigned long * __restrict p1) 29 { 30 unsigned long lines = bytes >> 9; 31 32 while (lines--) { 33 #undef BLOCK 34 #define BLOCK(i, reg) \ 35 do { \ 36 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ 37 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 38 "m" (p0[i / sizeof(*p0)])); \ 39 asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 40 "=m" (p0[i / sizeof(*p0)])); \ 41 } while (0); 42 43 BLOCK16() 44 45 p0 = (unsigned long *)((uintptr_t)p0 + 512); 46 p1 = (unsigned long *)((uintptr_t)p1 + 512); 47 } 48 } 49 50 static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0, 51 const unsigned long * __restrict p1, 52 const unsigned long * __restrict p2) 53 { 54 unsigned long lines = bytes >> 9; 55 56 while (lines--) { 57 #undef BLOCK 58 #define BLOCK(i, reg) \ 59 do { \ 60 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ 61 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 62 "m" (p1[i / sizeof(*p1)])); \ 63 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 64 "m" (p0[i / sizeof(*p0)])); \ 65 asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 66 "=m" (p0[i / sizeof(*p0)])); \ 67 } while (0); 68 69 BLOCK16() 70 71 p0 = (unsigned long *)((uintptr_t)p0 + 512); 72 p1 = (unsigned long *)((uintptr_t)p1 + 512); 73 p2 = (unsigned long *)((uintptr_t)p2 + 512); 74 } 75 } 76 77 static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0, 78 const unsigned long * __restrict p1, 79 const unsigned long * __restrict p2, 80 const unsigned long * __restrict p3) 81 { 82 unsigned long lines = bytes >> 9; 83 84 while (lines--) { 85 #undef BLOCK 86 #define BLOCK(i, reg) \ 87 do { \ 88 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ 89 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 90 "m" (p2[i / sizeof(*p2)])); \ 91 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 92 "m" (p1[i / sizeof(*p1)])); \ 93 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 94 "m" (p0[i / sizeof(*p0)])); \ 95 asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 96 "=m" (p0[i / sizeof(*p0)])); \ 97 } while (0); 98 99 BLOCK16(); 100 101 p0 = (unsigned long *)((uintptr_t)p0 + 512); 102 p1 = (unsigned long *)((uintptr_t)p1 + 512); 103 p2 = (unsigned long *)((uintptr_t)p2 + 512); 104 p3 = (unsigned long *)((uintptr_t)p3 + 512); 105 } 106 } 107 108 static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0, 109 const unsigned long * __restrict p1, 110 const unsigned long * __restrict p2, 111 const unsigned long * __restrict p3, 112 const unsigned long * __restrict p4) 113 { 114 unsigned long lines = bytes >> 9; 115 116 while (lines--) { 117 #undef BLOCK 118 #define BLOCK(i, reg) \ 119 do { \ 120 asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ 121 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 122 "m" (p3[i / sizeof(*p3)])); \ 123 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 124 "m" (p2[i / sizeof(*p2)])); \ 125 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 126 "m" (p1[i / sizeof(*p1)])); \ 127 asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 128 "m" (p0[i / sizeof(*p0)])); \ 129 asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 130 "=m" (p0[i / sizeof(*p0)])); \ 131 } while (0); 132 133 BLOCK16() 134 135 p0 = (unsigned long *)((uintptr_t)p0 + 512); 136 p1 = (unsigned long *)((uintptr_t)p1 + 512); 137 p2 = (unsigned long *)((uintptr_t)p2 + 512); 138 p3 = (unsigned long *)((uintptr_t)p3 + 512); 139 p4 = (unsigned long *)((uintptr_t)p4 + 512); 140 } 141 } 142 143 DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5); 144 145 static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt, 146 unsigned int bytes) 147 { 148 kernel_fpu_begin(); 149 xor_gen_avx_inner(dest, srcs, src_cnt, bytes); 150 kernel_fpu_end(); 151 } 152 153 struct xor_block_template xor_block_avx = { 154 .name = "avx", 155 .xor_gen = xor_gen_avx, 156 }; 157