xref: /linux/arch/x86/include/asm/xor.h (revision 402eb8ec54b36f8fc0649768c01abb57062d6f8b)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _ASM_X86_XOR_H
3 #define _ASM_X86_XOR_H
4 
5 /*
6  * Optimized RAID-5 checksumming functions for SSE.
7  */
8 
9 /*
10  * Cache avoiding checksumming functions utilizing KNI instructions
11  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
12  */
13 
14 /*
15  * Based on
16  * High-speed RAID5 checksumming functions utilizing SSE instructions.
17  * Copyright (C) 1998 Ingo Molnar.
18  */
19 
20 /*
21  * x86-64 changes / gcc fixes from Andi Kleen.
22  * Copyright 2002 Andi Kleen, SuSE Labs.
23  *
24  * This hasn't been optimized for the hammer yet, but there are likely
25  * no advantages to be gotten from x86-64 here anyways.
26  */
27 
28 #include <asm/fpu/api.h>
29 
/*
 * Constraint string used for the [inc] asm operand of the routines below
 * (the constant 256UL that is added to every pointer once per loop
 * iteration).  With CONFIG_X86_32 only an immediate ("i") is accepted;
 * otherwise the operand may be a register or an immediate ("re" - see
 * GCC's x86 machine constraints for the exact meaning of "e").
 */
30 #ifdef CONFIG_X86_32
31 /* reduce register pressure */
32 # define XOR_CONSTANT_CONSTRAINT "i"
33 #else
34 # define XOR_CONSTANT_CONSTRAINT "re"
35 #endif
36 
/*
 * String-building helpers for the inline asm in this header.  Each one
 * (except OFFS/PF_OFFS/NOP) expands to a single instruction line.  x is an
 * index in 16-byte units, y is the number of the XMM register to use.
 *
 *   OFFS(x)     offset text "16*(x)"
 *   PF_OFFS(x)  offset text "256+16*(x)", used by the prefetches
 *   PF0..PF4    prefetchnta at PF_OFFS(x) from p1..p5 respectively
 *   LD(x, y)    movaps from OFFS(x)(p1) into %xmm<y>
 *   ST(x, y)    movaps from %xmm<y> to OFFS(x)(p1)
 *   XO1..XO4    xorps of OFFS(x)(p2..p5) into %xmm<y>
 *   NOP(x)      expands to nothing; passed as the "prefetch" argument of
 *               BLK64 when no prefetch is wanted
 */
37 #define OFFS(x)		"16*("#x")"
38 #define PF_OFFS(x)	"256+16*("#x")"
39 #define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
40 #define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
41 #define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
42 #define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
43 #define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
44 #define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
45 #define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
46 #define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
47 #define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
48 #define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
49 #define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
50 #define NOP(x)
51 
/*
 * BLK64(pf, op, i): emit pf(i) once, followed by op on the four consecutive
 * 16-byte slots i..i+3 using xmm0..xmm3, i.e. one 64-byte group.  pf is one
 * of the PFn helpers (or NOP), op is LD, ST or one of the XOn helpers.
 */
52 #define BLK64(pf, op, i)				\
53 		pf(i)					\
54 		op(i, 0)				\
55 			op(i + 1, 1)			\
56 				op(i + 2, 2)		\
57 					op(i + 3, 3)
58 
/*
 * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 using SSE.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source; loaded and stored with movaps.
 * @p2:    second source; read as the memory operand of xorps.
 *
 * Every loop iteration handles 256 bytes as four 64-byte BLOCKs, each held
 * in xmm0-xmm3.  prefetchnta hints are interleaved with the loads and target
 * addresses 256 bytes or more past the data currently being processed, in
 * 32-byte steps.
 *
 * The counter is decremented and tested only after the body, so with
 * bytes < 256 it would start at 0 and wrap on the first dec; the code
 * relies on bytes >= 256.
 *
 * NOTE(review): movaps and xorps with a memory operand are the aligned SSE
 * forms, so both buffers are presumably 16-byte aligned - confirm against
 * callers.
 */
59 static void
60 xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
61 	  const unsigned long * __restrict p2)
62 {
63 	unsigned long lines = bytes >> 8;
64 
	/* The asm below uses xmm0-xmm3; the whole region is bracketed by
	 * kernel_fpu_begin()/kernel_fpu_end(). */
65 	kernel_fpu_begin();
66 
67 	asm volatile(
68 #undef BLOCK
69 #define BLOCK(i)					\
70 		LD(i, 0)				\
71 			LD(i + 1, 1)			\
72 		PF1(i)					\
73 				PF1(i + 2)		\
74 				LD(i + 2, 2)		\
75 					LD(i + 3, 3)	\
76 		PF0(i + 4)				\
77 				PF0(i + 6)		\
78 		XO1(i, 0)				\
79 			XO1(i + 1, 1)			\
80 				XO1(i + 2, 2)		\
81 					XO1(i + 3, 3)	\
82 		ST(i, 0)				\
83 			ST(i + 1, 1)			\
84 				ST(i + 2, 2)		\
85 					ST(i + 3, 3)	\
86 
87 
	/* Issued once before the loop: prefetch p1 at offsets 256 and 288. */
88 		PF0(0)
89 				PF0(2)
90 
91 	" .align 32			;\n"
92 	" 1:                            ;\n"
93 
94 		BLOCK(0)
95 		BLOCK(4)
96 		BLOCK(8)
97 		BLOCK(12)
98 
99 	"       add %[inc], %[p1]       ;\n"
100 	"       add %[inc], %[p2]       ;\n"
101 	"       dec %[cnt]              ;\n"
102 	"       jnz 1b                  ;\n"
	/* Counter and pointers are read-write operands: the asm modifies them
	 * in place. */
103 	: [cnt] "+r" (lines),
104 	  [p1] "+r" (p1), [p2] "+r" (p2)
105 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
106 	: "memory");
107 
108 	kernel_fpu_end();
109 }
110 
/*
 * xor_sse_2_pf64 - XOR the buffer at p2 into the buffer at p1 using SSE,
 * issuing one prefetch per 64 bytes of each buffer.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source.
 * @p2:    second source.
 *
 * Same data flow as xor_sse_2 (load p1, xor p2, store p1, 256 bytes per
 * iteration), but BLOCK is built from BLK64: each 64-byte group is preceded
 * by a single prefetchnta 256 bytes ahead in the buffer about to be read,
 * and there is no prefetch before the loop.
 *
 * The counter is tested only after the body, so the code relies on
 * bytes >= 256.
 */
111 static void
112 xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
113 	       const unsigned long * __restrict p2)
114 {
115 	unsigned long lines = bytes >> 8;
116 
	/* xmm0-xmm3 are used between kernel_fpu_begin() and kernel_fpu_end(). */
117 	kernel_fpu_begin();
118 
119 	asm volatile(
120 #undef BLOCK
121 #define BLOCK(i)			\
122 		BLK64(PF0, LD, i)	\
123 		BLK64(PF1, XO1, i)	\
124 		BLK64(NOP, ST, i)	\
125 
126 	" .align 32			;\n"
127 	" 1:                            ;\n"
128 
129 		BLOCK(0)
130 		BLOCK(4)
131 		BLOCK(8)
132 		BLOCK(12)
133 
134 	"       add %[inc], %[p1]       ;\n"
135 	"       add %[inc], %[p2]       ;\n"
136 	"       dec %[cnt]              ;\n"
137 	"       jnz 1b                  ;\n"
	/* Counter and pointers are updated in place by the asm ("+r"). */
138 	: [cnt] "+r" (lines),
139 	  [p1] "+r" (p1), [p2] "+r" (p2)
140 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
141 	: "memory");
142 
143 	kernel_fpu_end();
144 }
145 
/*
 * xor_sse_3 - XOR the buffers at p2 and p3 into the buffer at p1 using SSE.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source; loaded and stored with movaps.
 * @p2:    second source (xorps memory operand).
 * @p3:    third source (xorps memory operand).
 *
 * Every loop iteration handles 256 bytes as four 64-byte BLOCKs held in
 * xmm0-xmm3.  prefetchnta hints for all three buffers are interleaved with
 * the loads and target addresses 256 bytes or more past the data currently
 * being processed, in 32-byte steps.
 *
 * The counter is tested only after the body, so the code relies on
 * bytes >= 256.
 */
146 static void
147 xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
148 	  const unsigned long * __restrict p2,
149 	  const unsigned long * __restrict p3)
150 {
151 	unsigned long lines = bytes >> 8;
152 
	/* xmm0-xmm3 are used between kernel_fpu_begin() and kernel_fpu_end(). */
153 	kernel_fpu_begin();
154 
155 	asm volatile(
156 #undef BLOCK
157 #define BLOCK(i) \
158 		PF1(i)					\
159 				PF1(i + 2)		\
160 		LD(i, 0)				\
161 			LD(i + 1, 1)			\
162 				LD(i + 2, 2)		\
163 					LD(i + 3, 3)	\
164 		PF2(i)					\
165 				PF2(i + 2)		\
166 		PF0(i + 4)				\
167 				PF0(i + 6)		\
168 		XO1(i, 0)				\
169 			XO1(i + 1, 1)			\
170 				XO1(i + 2, 2)		\
171 					XO1(i + 3, 3)	\
172 		XO2(i, 0)				\
173 			XO2(i + 1, 1)			\
174 				XO2(i + 2, 2)		\
175 					XO2(i + 3, 3)	\
176 		ST(i, 0)				\
177 			ST(i + 1, 1)			\
178 				ST(i + 2, 2)		\
179 					ST(i + 3, 3)	\
180 
181 
	/* Issued once before the loop: prefetch p1 at offsets 256 and 288. */
182 		PF0(0)
183 				PF0(2)
184 
185 	" .align 32			;\n"
186 	" 1:                            ;\n"
187 
188 		BLOCK(0)
189 		BLOCK(4)
190 		BLOCK(8)
191 		BLOCK(12)
192 
193 	"       add %[inc], %[p1]       ;\n"
194 	"       add %[inc], %[p2]       ;\n"
195 	"       add %[inc], %[p3]       ;\n"
196 	"       dec %[cnt]              ;\n"
197 	"       jnz 1b                  ;\n"
	/* Counter and pointers are updated in place by the asm ("+r"). */
198 	: [cnt] "+r" (lines),
199 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
200 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
201 	: "memory");
202 
203 	kernel_fpu_end();
204 }
205 
/*
 * xor_sse_3_pf64 - XOR the buffers at p2 and p3 into the buffer at p1 using
 * SSE, issuing one prefetch per 64 bytes of each buffer.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source.
 * @p2:    second source.
 * @p3:    third source.
 *
 * BLOCK is built from BLK64: for each 64-byte group, p1 is loaded into
 * xmm0-xmm3, p2 and p3 are xor-ed in, and the result is stored back to p1.
 * Each load/xor group is preceded by a single prefetchnta 256 bytes ahead in
 * the buffer being read; there is no prefetch before the loop.
 *
 * The counter is tested only after the body, so the code relies on
 * bytes >= 256.
 */
206 static void
207 xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
208 	       const unsigned long * __restrict p2,
209 	       const unsigned long * __restrict p3)
210 {
211 	unsigned long lines = bytes >> 8;
212 
	/* xmm0-xmm3 are used between kernel_fpu_begin() and kernel_fpu_end(). */
213 	kernel_fpu_begin();
214 
215 	asm volatile(
216 #undef BLOCK
217 #define BLOCK(i)			\
218 		BLK64(PF0, LD, i)	\
219 		BLK64(PF1, XO1, i)	\
220 		BLK64(PF2, XO2, i)	\
221 		BLK64(NOP, ST, i)	\
222 
223 	" .align 32			;\n"
224 	" 1:                            ;\n"
225 
226 		BLOCK(0)
227 		BLOCK(4)
228 		BLOCK(8)
229 		BLOCK(12)
230 
231 	"       add %[inc], %[p1]       ;\n"
232 	"       add %[inc], %[p2]       ;\n"
233 	"       add %[inc], %[p3]       ;\n"
234 	"       dec %[cnt]              ;\n"
235 	"       jnz 1b                  ;\n"
	/* Counter and pointers are updated in place by the asm ("+r"). */
236 	: [cnt] "+r" (lines),
237 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
238 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
239 	: "memory");
240 
241 	kernel_fpu_end();
242 }
243 
/*
 * xor_sse_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1 using
 * SSE.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source; loaded and stored with movaps.
 * @p2:    second source (xorps memory operand).
 * @p3:    third source (xorps memory operand).
 * @p4:    fourth source (xorps memory operand).
 *
 * Every loop iteration handles 256 bytes as four 64-byte BLOCKs held in
 * xmm0-xmm3.  prefetchnta hints for all four buffers are interleaved with
 * the loads and xors and target addresses 256 bytes or more past the data
 * currently being processed, in 32-byte steps.
 *
 * The counter is tested only after the body, so the code relies on
 * bytes >= 256.
 */
244 static void
245 xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
246 	  const unsigned long * __restrict p2,
247 	  const unsigned long * __restrict p3,
248 	  const unsigned long * __restrict p4)
249 {
250 	unsigned long lines = bytes >> 8;
251 
	/* xmm0-xmm3 are used between kernel_fpu_begin() and kernel_fpu_end(). */
252 	kernel_fpu_begin();
253 
254 	asm volatile(
255 #undef BLOCK
256 #define BLOCK(i) \
257 		PF1(i)					\
258 				PF1(i + 2)		\
259 		LD(i, 0)				\
260 			LD(i + 1, 1)			\
261 				LD(i + 2, 2)		\
262 					LD(i + 3, 3)	\
263 		PF2(i)					\
264 				PF2(i + 2)		\
265 		XO1(i, 0)				\
266 			XO1(i + 1, 1)			\
267 				XO1(i + 2, 2)		\
268 					XO1(i + 3, 3)	\
269 		PF3(i)					\
270 				PF3(i + 2)		\
271 		PF0(i + 4)				\
272 				PF0(i + 6)		\
273 		XO2(i, 0)				\
274 			XO2(i + 1, 1)			\
275 				XO2(i + 2, 2)		\
276 					XO2(i + 3, 3)	\
277 		XO3(i, 0)				\
278 			XO3(i + 1, 1)			\
279 				XO3(i + 2, 2)		\
280 					XO3(i + 3, 3)	\
281 		ST(i, 0)				\
282 			ST(i + 1, 1)			\
283 				ST(i + 2, 2)		\
284 					ST(i + 3, 3)	\
285 
286 
	/* Issued once before the loop: prefetch p1 at offsets 256 and 288. */
287 		PF0(0)
288 				PF0(2)
289 
290 	" .align 32			;\n"
291 	" 1:                            ;\n"
292 
293 		BLOCK(0)
294 		BLOCK(4)
295 		BLOCK(8)
296 		BLOCK(12)
297 
298 	"       add %[inc], %[p1]       ;\n"
299 	"       add %[inc], %[p2]       ;\n"
300 	"       add %[inc], %[p3]       ;\n"
301 	"       add %[inc], %[p4]       ;\n"
302 	"       dec %[cnt]              ;\n"
303 	"       jnz 1b                  ;\n"
	/* Counter and pointers are updated in place by the asm ("+r"). */
304 	: [cnt] "+r" (lines), [p1] "+r" (p1),
305 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
306 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
307 	: "memory");
308 
309 	kernel_fpu_end();
310 }
311 
/*
 * xor_sse_4_pf64 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * using SSE, issuing one prefetch per 64 bytes of each buffer.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source.
 * @p2:    second source.
 * @p3:    third source.
 * @p4:    fourth source.
 *
 * BLOCK is built from BLK64: for each 64-byte group, p1 is loaded into
 * xmm0-xmm3, p2..p4 are xor-ed in, and the result is stored back to p1.
 * Each load/xor group is preceded by a single prefetchnta 256 bytes ahead in
 * the buffer being read; there is no prefetch before the loop.
 *
 * The counter is tested only after the body, so the code relies on
 * bytes >= 256.
 */
312 static void
313 xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
314 	       const unsigned long * __restrict p2,
315 	       const unsigned long * __restrict p3,
316 	       const unsigned long * __restrict p4)
317 {
318 	unsigned long lines = bytes >> 8;
319 
	/* xmm0-xmm3 are used between kernel_fpu_begin() and kernel_fpu_end(). */
320 	kernel_fpu_begin();
321 
322 	asm volatile(
323 #undef BLOCK
324 #define BLOCK(i)			\
325 		BLK64(PF0, LD, i)	\
326 		BLK64(PF1, XO1, i)	\
327 		BLK64(PF2, XO2, i)	\
328 		BLK64(PF3, XO3, i)	\
329 		BLK64(NOP, ST, i)	\
330 
331 	" .align 32			;\n"
332 	" 1:                            ;\n"
333 
334 		BLOCK(0)
335 		BLOCK(4)
336 		BLOCK(8)
337 		BLOCK(12)
338 
339 	"       add %[inc], %[p1]       ;\n"
340 	"       add %[inc], %[p2]       ;\n"
341 	"       add %[inc], %[p3]       ;\n"
342 	"       add %[inc], %[p4]       ;\n"
343 	"       dec %[cnt]              ;\n"
344 	"       jnz 1b                  ;\n"
	/* Counter and pointers are updated in place by the asm ("+r"). */
345 	: [cnt] "+r" (lines), [p1] "+r" (p1),
346 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
347 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
348 	: "memory");
349 
350 	kernel_fpu_end();
351 }
352 
/*
 * xor_sse_5 - XOR the buffers at p2, p3, p4 and p5 into the buffer at p1
 * using SSE.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source; loaded and stored with movaps.
 * @p2:    second source (xorps memory operand).
 * @p3:    third source (xorps memory operand).
 * @p4:    fourth source (xorps memory operand).
 * @p5:    fifth source (xorps memory operand).
 *
 * Every loop iteration handles 256 bytes as four 64-byte BLOCKs held in
 * xmm0-xmm3.  prefetchnta hints for all five buffers are interleaved with
 * the loads and xors and target addresses 256 bytes or more past the data
 * currently being processed, in 32-byte steps.
 *
 * The counter is tested only after the body, so the code relies on
 * bytes >= 256.
 */
353 static void
354 xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
355 	  const unsigned long * __restrict p2,
356 	  const unsigned long * __restrict p3,
357 	  const unsigned long * __restrict p4,
358 	  const unsigned long * __restrict p5)
359 {
360 	unsigned long lines = bytes >> 8;
361 
	/* xmm0-xmm3 are used between kernel_fpu_begin() and kernel_fpu_end(). */
362 	kernel_fpu_begin();
363 
364 	asm volatile(
365 #undef BLOCK
366 #define BLOCK(i) \
367 		PF1(i)					\
368 				PF1(i + 2)		\
369 		LD(i, 0)				\
370 			LD(i + 1, 1)			\
371 				LD(i + 2, 2)		\
372 					LD(i + 3, 3)	\
373 		PF2(i)					\
374 				PF2(i + 2)		\
375 		XO1(i, 0)				\
376 			XO1(i + 1, 1)			\
377 				XO1(i + 2, 2)		\
378 					XO1(i + 3, 3)	\
379 		PF3(i)					\
380 				PF3(i + 2)		\
381 		XO2(i, 0)				\
382 			XO2(i + 1, 1)			\
383 				XO2(i + 2, 2)		\
384 					XO2(i + 3, 3)	\
385 		PF4(i)					\
386 				PF4(i + 2)		\
387 		PF0(i + 4)				\
388 				PF0(i + 6)		\
389 		XO3(i, 0)				\
390 			XO3(i + 1, 1)			\
391 				XO3(i + 2, 2)		\
392 					XO3(i + 3, 3)	\
393 		XO4(i, 0)				\
394 			XO4(i + 1, 1)			\
395 				XO4(i + 2, 2)		\
396 					XO4(i + 3, 3)	\
397 		ST(i, 0)				\
398 			ST(i + 1, 1)			\
399 				ST(i + 2, 2)		\
400 					ST(i + 3, 3)	\
401 
402 
	/* Issued once before the loop: prefetch p1 at offsets 256 and 288. */
403 		PF0(0)
404 				PF0(2)
405 
406 	" .align 32			;\n"
407 	" 1:                            ;\n"
408 
409 		BLOCK(0)
410 		BLOCK(4)
411 		BLOCK(8)
412 		BLOCK(12)
413 
414 	"       add %[inc], %[p1]       ;\n"
415 	"       add %[inc], %[p2]       ;\n"
416 	"       add %[inc], %[p3]       ;\n"
417 	"       add %[inc], %[p4]       ;\n"
418 	"       add %[inc], %[p5]       ;\n"
419 	"       dec %[cnt]              ;\n"
420 	"       jnz 1b                  ;\n"
	/* Counter and pointers are updated in place by the asm ("+r"). */
421 	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
422 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
423 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
424 	: "memory");
425 
426 	kernel_fpu_end();
427 }
428 
/*
 * xor_sse_5_pf64 - XOR the buffers at p2, p3, p4 and p5 into the buffer at
 * p1 using SSE, issuing one prefetch per 64 bytes of each buffer.
 * @bytes: length in bytes; only whole 256-byte chunks are processed
 *         (the loop runs bytes >> 8 times).
 * @p1:    destination and first source.
 * @p2:    second source.
 * @p3:    third source.
 * @p4:    fourth source.
 * @p5:    fifth source.
 *
 * BLOCK is built from BLK64: for each 64-byte group, p1 is loaded into
 * xmm0-xmm3, p2..p5 are xor-ed in, and the result is stored back to p1.
 * Each load/xor group is preceded by a single prefetchnta 256 bytes ahead in
 * the buffer being read; there is no prefetch before the loop.
 *
 * The counter is tested only after the body, so the code relies on
 * bytes >= 256.
 */
429 static void
430 xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
431 	       const unsigned long * __restrict p2,
432 	       const unsigned long * __restrict p3,
433 	       const unsigned long * __restrict p4,
434 	       const unsigned long * __restrict p5)
435 {
436 	unsigned long lines = bytes >> 8;
437 
	/* xmm0-xmm3 are used between kernel_fpu_begin() and kernel_fpu_end(). */
438 	kernel_fpu_begin();
439 
440 	asm volatile(
441 #undef BLOCK
442 #define BLOCK(i)			\
443 		BLK64(PF0, LD, i)	\
444 		BLK64(PF1, XO1, i)	\
445 		BLK64(PF2, XO2, i)	\
446 		BLK64(PF3, XO3, i)	\
447 		BLK64(PF4, XO4, i)	\
448 		BLK64(NOP, ST, i)	\
449 
450 	" .align 32			;\n"
451 	" 1:                            ;\n"
452 
453 		BLOCK(0)
454 		BLOCK(4)
455 		BLOCK(8)
456 		BLOCK(12)
457 
458 	"       add %[inc], %[p1]       ;\n"
459 	"       add %[inc], %[p2]       ;\n"
460 	"       add %[inc], %[p3]       ;\n"
461 	"       add %[inc], %[p4]       ;\n"
462 	"       add %[inc], %[p5]       ;\n"
463 	"       dec %[cnt]              ;\n"
464 	"       jnz 1b                  ;\n"
	/* Counter and pointers are updated in place by the asm ("+r"). */
465 	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
466 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
467 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
468 	: "memory");
469 
470 	kernel_fpu_end();
471 }
472 
/*
 * Template named "prefetch64-sse" that bundles the *_pf64 routines defined
 * in this header; .do_N points at the routine taking N buffers.
 * struct xor_block_template itself is not declared in this header.
 */
473 static struct xor_block_template xor_block_sse_pf64 = {
474 	.name = "prefetch64-sse",
475 	.do_2 = xor_sse_2_pf64,
476 	.do_3 = xor_sse_3_pf64,
477 	.do_4 = xor_sse_4_pf64,
478 	.do_5 = xor_sse_5_pf64,
479 };
480 
/*
 * Drop the asm-building helpers before the per-bitness header is included
 * below.  OFFS, PF_OFFS and PF0..PF4 are not undefined here and remain
 * defined.
 * NOTE(review): presumably the undefs avoid clashes with same-named macros
 * in the headers included afterwards - confirm.
 */
481 #undef LD
482 #undef XO1
483 #undef XO2
484 #undef XO3
485 #undef XO4
486 #undef ST
487 #undef NOP
488 #undef BLK64
489 #undef BLOCK
490 
491 #undef XOR_CONSTANT_CONSTRAINT
492 
493 #ifdef CONFIG_X86_32
494 # include <asm/xor_32.h>
495 #else
496 # include <asm/xor_64.h>
497 #endif
498 
/*
 * XOR_SELECT_TEMPLATE(FASTEST) forwards its argument unchanged to
 * AVX_SELECT.  AVX_SELECT is not defined in this header; presumably it is
 * provided through the xor_32.h / xor_64.h include above - confirm.
 */
499 #define XOR_SELECT_TEMPLATE(FASTEST) \
500 	AVX_SELECT(FASTEST)
501 
502 #endif /* _ASM_X86_XOR_H */
503