#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gained from x86-64 here anyway.
 */

#include <asm/i387.h>

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"       movaps %%xmm"#y",   "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])		;\n"
#define XO1(x, y)	"       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define XO5(x, y)	"       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"
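
/*
 * The macros above assemble the AT&T-syntax SSE instruction strings that
 * get pasted into the inline asm blocks below.  OFFS(x) is the byte offset
 * of the x'th 16-byte chunk inside the current 256-byte stride, and
 * PF_OFFS(x) is the same offset one stride (256 bytes) ahead, used for the
 * prefetches.  As an illustration only (not part of the build), LD(0, 0)
 * expands to
 *
 *	"       movaps   16*(0)(%[p1]), %%xmm0	;\n"
 *
 * i.e. an aligned 16-byte load from p1 into %xmm0, while PF0(4) expands to
 *
 *	"	prefetchnta 256+16*(4)(%[p1])		;\n"
 *
 * a non-temporal prefetch of the next stride of p1, which minimizes
 * pollution of the outer cache levels by the streamed source data.
 */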


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
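	/*
	 * Each pass of the unrolled loop below consumes 256 bytes per
	 * buffer (4 BLOCKs of 4 x 16-byte xmm operations), hence the
	 * count of bytes >> 8; any tail smaller than 256 bytes is ignored.
	 */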
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}
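
#if 0
/*
 * For reference only, never compiled: a minimal portable sketch of the
 * result xor_sse_2() produces, without the prefetching and the xmm
 * register blocking.  The helper name is illustrative and not part of
 * this header.
 */
static void xor_ref_2(unsigned long bytes, unsigned long *p1,
		      unsigned long *p2)
{
	unsigned long i;

	/* XOR the second source into the destination, one word at a time. */
	for (i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}
#endif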

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();
	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");
	kernel_fpu_end();
}
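
/*
 * xor_sse_4() and xor_sse_5() below extend the same scheme: each extra
 * source buffer adds one prefetch stream (PF3, PF4) and one xorps stage
 * (XO3, XO4) to the unrolled block, plus one more addq to advance that
 * pointer by the 256-byte stride at the bottom of the loop.
 */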

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"       addq %[inc], %[p4]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"       addq %[inc], %[p4]           ;\n"
	"       addq %[inc], %[p5]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};


/* Also try the AVX routines */
#include <asm/xor_avx.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	AVX_XOR_SPEED;				\
	xor_speed(&xor_block_sse);		\
} while (0)
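
/*
 * XOR_TRY_TEMPLATES is expanded by the generic xor calibration code (see
 * crypto/xor.c): xor_speed() benchmarks a template at boot so that the
 * fastest one can be picked.  AVX_XOR_SPEED, provided by <asm/xor_avx.h>,
 * is expected to do the same for the AVX template when the CPU supports it.
 */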

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(&xor_block_sse)
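
/*
 * AVX_SELECT(), also from <asm/xor_avx.h>, is expected to return the AVX
 * template when AVX is usable and to fall back to its argument otherwise,
 * so the benchmarked FASTEST template is deliberately ignored here in
 * favor of the cache-avoiding SSE/AVX routines.
 */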

#endif /* _ASM_X86_XOR_64_H */