/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX.
 *
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

#define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
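
/*
 * LD(x, y) loads the quadword at byte offset 8*x of operand %1 into
 * register %%mm<y>, ST(x, y) stores it back, and XO1(x, y)..XO4(x, y)
 * XOR in the quadword at the same offset of operands %2..%5.  For
 * example, LD(2, 1) expands to "movq 8*(2)(%1), %%mm1", i.e. load the
 * quadword at byte offset 16 of the destination buffer into %%mm1.
 */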

#include <asm/fpu/api.h>
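
/*
 * kernel_fpu_begin()/kernel_fpu_end() (declared in <asm/fpu/api.h>)
 * bracket every routine below: the MMX registers alias the x87 FPU
 * state, which must be saved and restored around the inline assembly.
 */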

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
	ST(i, 0)				\
		XO1(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO1(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO1(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
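
/*
 * For reference (a plain-C sketch of the semantics only, not code the
 * kernel uses): xor_pII_mmx_2() computes
 *
 *	for (i = 0; i < bytes / sizeof(long); i++)
 *		p1[i] ^= p2[i];
 *
 * but processes 128 bytes per loop iteration, four quadwords at a time
 * in %%mm0-%%mm3.  The _3/_4/_5 variants below additionally XOR in the
 * corresponding words of p3, p4 and p5.
 */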

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
	ST(i, 0)				\
		XO2(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO2(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO2(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
	ST(i, 0)				\
		XO3(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO3(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO3(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
	      const unsigned long * __restrict p2,
	      const unsigned long * __restrict p3,
	      const unsigned long * __restrict p4,
	      const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	/*
	 * Make sure GCC forgets anything it knows about p4 or p5, such
	 * that it won't pass to the asm volatile below a register that
	 * is shared with any other variable.  That's because we modify
	 * p4 and p5 there, but we can't mark them as read/write,
	 * otherwise we'd overflow the 10-asm-operands limit of
	 * GCC < 3.1.
	 */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
		XO3(i + 1, 1)			\
			XO3(i + 2, 2)		\
				XO3(i + 3, 3)	\
	XO4(i, 0)				\
	ST(i, 0)				\
		XO4(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO4(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO4(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       addl $128, %5         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/*
	 * p4 and p5 were modified, and now the variables are dead.
	 * Clobber them just to be sure nobody does something stupid
	 * like assuming they have some legal value.
	 */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

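/*
 * The p5_mmx variants below unroll the same XOR by hand instead of via
 * the BLOCK() macros: each loop iteration handles 64 bytes (bytes >> 6)
 * using all eight MMX registers, with loads, XORs and stores interleaved,
 * presumably to keep the original Pentium's two pipelines busy.
 */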
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32	             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
	     const unsigned long * __restrict p2,
	     const unsigned long * __restrict p3,
	     const unsigned long * __restrict p4,
	     const unsigned long * __restrict p5)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	/*
	 * Make sure GCC forgets anything it knows about p4 or p5, such
	 * that it won't pass to the asm volatile below a register that
	 * is shared with any other variable.  That's because we modify
	 * p4 and p5 there, but we can't mark them as read/write,
	 * otherwise we'd overflow the 10-asm-operands limit of
	 * GCC < 3.1.
	 */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor   (%5), %%mm0   ;\n"
	"       pxor  8(%5), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%5), %%mm2   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%5), %%mm3   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 32(%5), %%mm4   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       pxor 40(%5), %%mm5   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%5), %%mm6   ;\n"
	"       pxor 56(%5), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       addl $64, %5         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/*
	 * p4 and p5 were modified, and now the variables are dead.
	 * Clobber them just to be sure nobody does something stupid
	 * like assuming they have some legal value.
	 */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};

static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
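
/*
 * The xor_sse_* routines referenced above are not defined in this file;
 * they are expected to be provided by <asm/xor.h>, which defines the SSE
 * routines before pulling this header in.
 */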

/* Also try the AVX routines. */
#include <asm/xor_avx.h>

/* Also try the generic routines. */
#include <asm-generic/xor.h>

/*
 * We force the use of the SSE xor block because it can write around the
 * L2 cache.  We may also be able to load into the L1 cache only,
 * depending on how the CPU deals with a load to a line that is being
 * prefetched.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	AVX_XOR_SPEED;					\
	if (boot_cpu_has(X86_FEATURE_XMM)) {		\
		xor_speed(&xor_block_pIII_sse);		\
		xor_speed(&xor_block_sse_pf64);		\
	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	} else {					\
		xor_speed(&xor_block_8regs);		\
		xor_speed(&xor_block_8regs_p);		\
		xor_speed(&xor_block_32regs);		\
		xor_speed(&xor_block_32regs_p);		\
	}						\
} while (0)
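
/*
 * A sketch of how this macro is consumed (see the generic xor
 * calibration code, e.g. crypto/xor.c): at boot, XOR_TRY_TEMPLATES is
 * expanded, each xor_speed() call benchmarks one candidate template,
 * and the fastest one is used for xor_blocks()/RAID-5 checksumming
 * from then on.
 */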

#endif /* _ASM_X86_XOR_32_H */