/* linux/arch/x86/include/asm/xor.h (revision ca55b2fef3a9373fcfc30f82fd26bc7fccbda732) */
#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache-avoiding checksumming functions utilizing KNI (i.e. SSE)
 * instructions.
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on:
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer (K8) yet, but there are
 * likely no advantages to be gained from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* Force the increment to be an immediate, to reduce register pressure. */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
/* On 64-bit, allow a register or a sign-extended 32-bit immediate. */
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

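/*
 * Assembly building blocks for the unrolled loops below.  Offsets are
 * counted in 16-byte xmm words: OFFS(x) is the byte offset of word x,
 * and PF_OFFS(x) the same offset 256 bytes (one loop iteration) ahead.
 * LD/ST move a word between source 1 and an xmm register, XOn XORs in
 * the corresponding word of source n+1, and PFn issues a non-temporal
 * prefetch (prefetchnta) of source n+1 so that the streamed RAID data
 * does not pollute the caches.
 */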
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

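/*
 * One prefetch hint plus four 16-byte wide operations: each BLK64()
 * expansion covers exactly one 64-byte chunk.  The *_pf64 loop bodies
 * below are composed entirely of these.
 */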
#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)

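/*
 * p1[i] ^= p2[i] over 'bytes' bytes.  Each loop iteration handles a
 * 256-byte stretch (sixteen xmm words), unrolled four words at a time,
 * with prefetches running ahead of the loads.  The xmm registers are
 * only usable between kernel_fpu_begin() and kernel_fpu_end(), which
 * preserve the current task's FPU/SIMD state.
 */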
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

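/*
 * Same computation as xor_sse_2(), but built from BLK64() so that one
 * prefetch is issued per 64-byte chunk rather than per 32 bytes; this
 * variant targets CPUs whose prefetch instructions fetch a full
 * 64-byte line per hint.
 */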
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

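/*
 * The three-, four- and five-source versions below repeat the same
 * pattern: each extra source adds one more XOn() pass (plus its PFn()
 * prefetch stream) to the block and one more pointer increment at the
 * bottom of the loop.
 */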
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

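/*
 * Describe the prefetch64 routines to the generic XOR code;
 * calibrate_xor_blocks() benchmarks the registered templates at boot
 * and keeps the fastest one.
 */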
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};

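/*
 * Usage sketch (illustrative only, not an interface defined here):
 * callers go through the template chosen by the calibration code, e.g.
 *
 *	active_template->do_2(PAGE_SIZE, dst, src);	(dst[i] ^= src[i])
 *
 * 'bytes' must be a multiple of 256 and every buffer 16-byte aligned,
 * since each loop iteration issues sixteen aligned movaps accesses.
 */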
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

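/*
 * AVX_SELECT() is pulled in via the xor_32.h/xor_64.h headers above
 * (through <asm/xor_avx.h>): it returns the AVX template when AVX is
 * usable on this CPU and falls back to the benchmarked FASTEST
 * template otherwise.
 */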
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */