xref: /linux/lib/raid/xor/x86/xor-avx.c (revision 440d6635b20037bc9ad46b20817d7b61cef0fc1b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Optimized XOR parity functions for AVX
4  *
5  * Copyright (C) 2012 Intel Corporation
6  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
7  *
8  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
9  */
10 #include <linux/compiler.h>
11 #include <asm/fpu/api.h>
12 #include "xor_impl.h"
13 #include "xor_arch.h"
14 
/*
 * BLOCK(offset, reg) is (re)defined inside each xor_avx_* routine below
 * to emit one 32-byte (one ymm register) XOR step at byte offset
 * 'offset' using register ymm'reg'.  BLOCK4 emits four consecutive
 * 32-byte steps across ymm0-ymm3; BLOCK16 emits sixteen, covering one
 * full 512-byte cache-line-sized chunk per loop iteration.
 */
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)
26 
/*
 * xor_avx_2 - p0 ^= p1, processed in 512-byte lines (16 ymm-sized blocks).
 *
 * Only (bytes >> 9) full 512-byte lines are processed, so @bytes should
 * be a multiple of 512.  vmovdqa requires the buffers to be 32-byte
 * aligned.  The ymm registers are clobbered without a clobber list, so
 * the caller must hold the FPU via kernel_fpu_begin()/kernel_fpu_end()
 * (see xor_gen_avx() below).  vxorps (the float-domain XOR) is used so
 * the 256-bit operation also assembles for AVX-only (pre-AVX2) parts.
 */
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;	/* 512 bytes per iteration */

	while (lines--) {
#undef BLOCK
/* One 32-byte step: load p1 chunk, XOR in p0 chunk, store back to p0. */
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}
}
49 
/*
 * xor_avx_3 - p0 ^= p1 ^ p2, processed in 512-byte lines.
 *
 * Same contract as xor_avx_2(): @bytes effectively a multiple of 512,
 * 32-byte-aligned buffers (vmovdqa), and the caller must bracket the
 * call with kernel_fpu_begin()/kernel_fpu_end() since the ymm registers
 * are clobbered without a clobber list.
 */
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;	/* 512 bytes per iteration */

	while (lines--) {
#undef BLOCK
/* One 32-byte step: load p2, fold in p1 and p0, store result to p0. */
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}
}
76 
77 static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
78 		      const unsigned long * __restrict p1,
79 		      const unsigned long * __restrict p2,
80 		      const unsigned long * __restrict p3)
81 {
82 	unsigned long lines = bytes >> 9;
83 
84 	while (lines--) {
85 #undef BLOCK
86 #define BLOCK(i, reg) \
87 do { \
88 	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
89 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
90 		"m" (p2[i / sizeof(*p2)])); \
91 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
92 		"m" (p1[i / sizeof(*p1)])); \
93 	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
94 		"m" (p0[i / sizeof(*p0)])); \
95 	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
96 		"=m" (p0[i / sizeof(*p0)])); \
97 } while (0);
98 
99 		BLOCK16();
100 
101 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
102 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
103 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
104 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
105 	}
106 }
107 
/*
 * xor_avx_5 - p0 ^= p1 ^ p2 ^ p3 ^ p4, processed in 512-byte lines.
 *
 * Same contract as xor_avx_2(): @bytes effectively a multiple of 512,
 * 32-byte-aligned buffers (vmovdqa), and the caller must bracket the
 * call with kernel_fpu_begin()/kernel_fpu_end() since the ymm registers
 * are clobbered without a clobber list.
 */
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
	     const unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;	/* 512 bytes per iteration */

	while (lines--) {
#undef BLOCK
/* One 32-byte step: load p4, fold in p3..p0, store result to p0. */
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}
}
142 
/*
 * Generates xor_gen_avx_inner() (used below), dispatching on source
 * count to the 2..5-source routines above.  NOTE(review): the exact
 * expansion comes from xor_impl.h — confirm the generated signature
 * there.
 */
DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5);
144 
/*
 * Public entry point for the "avx" template: run the raw AVX inner
 * routine with the FPU held, so the ymm register state the asm blocks
 * clobber is properly saved and restored around the SIMD work.
 */
static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt,
			unsigned int bytes)
{
	kernel_fpu_begin();

	xor_gen_avx_inner(dest, srcs, src_cnt, bytes);

	kernel_fpu_end();
}
152 
/*
 * Template exported under the name "avx"; non-static because it is
 * referenced by the XOR implementation selection machinery elsewhere.
 */
struct xor_block_template xor_block_avx = {
	.name		= "avx",
	.xor_gen	= xor_gen_avx,
};
157