xref: /linux/arch/x86/include/asm/xor_32.h (revision b8265621f4888af9494e1d685620871ec81bc33d)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _ASM_X86_XOR_32_H
3 #define _ASM_X86_XOR_32_H
4 
5 /*
6  * Optimized RAID-5 checksumming functions for MMX.
7  */
8 
9 /*
10  * High-speed RAID5 checksumming functions utilizing MMX instructions.
11  * Copyright (C) 1998 Ingo Molnar.
12  */
13 
/*
 * Instruction-string helpers for the BLOCK() macros of the
 * xor_pII_mmx_*() routines.  x is a quadword index (scaled by 8 bytes
 * in the address), y is an MMX register number.
 *   LD(x, y):  load quadword x of asm operand %1 into %mm<y>.
 *   ST(x, y):  store %mm<y> back to quadword x of asm operand %1.
 *   XOn(x, y): pxor quadword x of asm operand %(n+1) into %mm<y>.
 */
14 #define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
15 #define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
16 #define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
17 #define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
18 #define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
19 #define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
20 
21 #include <asm/fpu/api.h>
22 
/*
 * xor_pII_mmx_2 - XOR the buffer at p2 into the buffer at p1 using MMX.
 * @bytes: buffer length in bytes; work is done in 128-byte chunks
 *         (bytes >> 7), so a remainder below 128 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 *
 * The chunk counter is decremented and tested only at the bottom of the
 * loop, so the loop body always runs before the first check.
 * NOTE(review): this assumes bytes >= 128 -- confirm against callers.
 */
23 static void
24 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
25 {
26 	unsigned long lines = bytes >> 7;
27 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
28 	kernel_fpu_begin();
29 
30 	asm volatile(
31 #undef BLOCK
32 #define BLOCK(i)				\
33 	LD(i, 0)				\
34 		LD(i + 1, 1)			\
35 			LD(i + 2, 2)		\
36 				LD(i + 3, 3)	\
37 	XO1(i, 0)				\
38 	ST(i, 0)				\
39 		XO1(i+1, 1)			\
40 		ST(i+1, 1)			\
41 			XO1(i + 2, 2)		\
42 			ST(i + 2, 2)		\
43 				XO1(i + 3, 3)	\
44 				ST(i + 3, 3)
45 
46 	" .align 32			;\n"
47 	" 1:                            ;\n"
48 
	/* Four blocks of four quadwords each: 16 * 8 = 128 bytes per pass. */
49 	BLOCK(0)
50 	BLOCK(4)
51 	BLOCK(8)
52 	BLOCK(12)
53 
	/* Advance both pointers by one chunk and loop while lines != 0. */
54 	"       addl $128, %1         ;\n"
55 	"       addl $128, %2         ;\n"
56 	"       decl %0               ;\n"
57 	"       jnz 1b                ;\n"
	/*
	 * %0 = lines, %1 = p1, %2 = p2, all read/write.  The buffers are
	 * accessed through the pointers rather than as memory operands,
	 * hence the "memory" clobber.
	 */
58 	: "+r" (lines),
59 	  "+r" (p1), "+r" (p2)
60 	:
61 	: "memory");
62 
63 	kernel_fpu_end();
64 }
65 
/*
 * xor_pII_mmx_3 - XOR the buffers at p2 and p3 into the buffer at p1
 * using MMX.
 * @bytes: buffer length in bytes; work is done in 128-byte chunks
 *         (bytes >> 7), so a remainder below 128 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 * @p3:    third source; only read.
 *
 * The chunk counter is tested only at the bottom of the loop.
 * NOTE(review): this assumes bytes >= 128 -- confirm against callers.
 */
66 static void
67 xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
68 	      unsigned long *p3)
69 {
70 	unsigned long lines = bytes >> 7;
71 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
72 	kernel_fpu_begin();
73 
74 	asm volatile(
75 #undef BLOCK
76 #define BLOCK(i)				\
77 	LD(i, 0)				\
78 		LD(i + 1, 1)			\
79 			LD(i + 2, 2)		\
80 				LD(i + 3, 3)	\
81 	XO1(i, 0)				\
82 		XO1(i + 1, 1)			\
83 			XO1(i + 2, 2)		\
84 				XO1(i + 3, 3)	\
85 	XO2(i, 0)				\
86 	ST(i, 0)				\
87 		XO2(i + 1, 1)			\
88 		ST(i + 1, 1)			\
89 			XO2(i + 2, 2)		\
90 			ST(i + 2, 2)		\
91 				XO2(i + 3, 3)	\
92 				ST(i + 3, 3)
93 
94 	" .align 32			;\n"
95 	" 1:                            ;\n"
96 
	/* Four blocks of four quadwords each: 16 * 8 = 128 bytes per pass. */
97 	BLOCK(0)
98 	BLOCK(4)
99 	BLOCK(8)
100 	BLOCK(12)
101 
	/* Advance all three pointers by one chunk; loop while lines != 0. */
102 	"       addl $128, %1         ;\n"
103 	"       addl $128, %2         ;\n"
104 	"       addl $128, %3         ;\n"
105 	"       decl %0               ;\n"
106 	"       jnz 1b                ;\n"
	/* %0 = lines, %1..%3 = p1..p3, all read/write. */
107 	: "+r" (lines),
108 	  "+r" (p1), "+r" (p2), "+r" (p3)
109 	:
110 	: "memory");
111 
112 	kernel_fpu_end();
113 }
114 
/*
 * xor_pII_mmx_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * using MMX.
 * @bytes: buffer length in bytes; work is done in 128-byte chunks
 *         (bytes >> 7), so a remainder below 128 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 * @p3:    third source; only read.
 * @p4:    fourth source; only read.
 *
 * The chunk counter is tested only at the bottom of the loop.
 * NOTE(review): this assumes bytes >= 128 -- confirm against callers.
 */
115 static void
116 xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
117 	      unsigned long *p3, unsigned long *p4)
118 {
119 	unsigned long lines = bytes >> 7;
120 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
121 	kernel_fpu_begin();
122 
123 	asm volatile(
124 #undef BLOCK
125 #define BLOCK(i)				\
126 	LD(i, 0)				\
127 		LD(i + 1, 1)			\
128 			LD(i + 2, 2)		\
129 				LD(i + 3, 3)	\
130 	XO1(i, 0)				\
131 		XO1(i + 1, 1)			\
132 			XO1(i + 2, 2)		\
133 				XO1(i + 3, 3)	\
134 	XO2(i, 0)				\
135 		XO2(i + 1, 1)			\
136 			XO2(i + 2, 2)		\
137 				XO2(i + 3, 3)	\
138 	XO3(i, 0)				\
139 	ST(i, 0)				\
140 		XO3(i + 1, 1)			\
141 		ST(i + 1, 1)			\
142 			XO3(i + 2, 2)		\
143 			ST(i + 2, 2)		\
144 				XO3(i + 3, 3)	\
145 				ST(i + 3, 3)
146 
147 	" .align 32			;\n"
148 	" 1:                            ;\n"
149 
	/* Four blocks of four quadwords each: 16 * 8 = 128 bytes per pass. */
150 	BLOCK(0)
151 	BLOCK(4)
152 	BLOCK(8)
153 	BLOCK(12)
154 
	/* Advance all four pointers by one chunk; loop while lines != 0. */
155 	"       addl $128, %1         ;\n"
156 	"       addl $128, %2         ;\n"
157 	"       addl $128, %3         ;\n"
158 	"       addl $128, %4         ;\n"
159 	"       decl %0               ;\n"
160 	"       jnz 1b                ;\n"
	/* %0 = lines, %1..%4 = p1..p4, all read/write. */
161 	: "+r" (lines),
162 	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
163 	:
164 	: "memory");
165 
166 	kernel_fpu_end();
167 }
168 
169 
/*
 * xor_pII_mmx_5 - XOR the buffers at p2..p5 into the buffer at p1
 * using MMX.
 * @bytes: buffer length in bytes; work is done in 128-byte chunks
 *         (bytes >> 7), so a remainder below 128 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 * @p3:    third source; only read.
 * @p4:    fourth source; only read.
 * @p5:    fifth source; only read.
 *
 * Unlike the 2- to 4-buffer variants, p4 and p5 are passed to the main
 * asm as input-only operands even though the asm advances them; the two
 * empty asm statements around it exist for that reason (see the comments
 * in the body).
 *
 * The chunk counter is tested only at the bottom of the loop.
 * NOTE(review): this assumes bytes >= 128 -- confirm against callers.
 */
170 static void
171 xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
172 	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
173 {
174 	unsigned long lines = bytes >> 7;
175 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
176 	kernel_fpu_begin();
177 
178 	/* Make sure GCC forgets anything it knows about p4 or p5,
179 	   such that it won't pass to the asm volatile below a
180 	   register that is shared with any other variable.  That's
181 	   because we modify p4 and p5 there, but we can't mark them
182 	   as read/write, otherwise we'd overflow the 10-asm-operands
183 	   limit of GCC < 3.1.  */
184 	asm("" : "+r" (p4), "+r" (p5));
185 
186 	asm volatile(
187 #undef BLOCK
188 #define BLOCK(i)				\
189 	LD(i, 0)				\
190 		LD(i + 1, 1)			\
191 			LD(i + 2, 2)		\
192 				LD(i + 3, 3)	\
193 	XO1(i, 0)				\
194 		XO1(i + 1, 1)			\
195 			XO1(i + 2, 2)		\
196 				XO1(i + 3, 3)	\
197 	XO2(i, 0)				\
198 		XO2(i + 1, 1)			\
199 			XO2(i + 2, 2)		\
200 				XO2(i + 3, 3)	\
201 	XO3(i, 0)				\
202 		XO3(i + 1, 1)			\
203 			XO3(i + 2, 2)		\
204 				XO3(i + 3, 3)	\
205 	XO4(i, 0)				\
206 	ST(i, 0)				\
207 		XO4(i + 1, 1)			\
208 		ST(i + 1, 1)			\
209 			XO4(i + 2, 2)		\
210 			ST(i + 2, 2)		\
211 				XO4(i + 3, 3)	\
212 				ST(i + 3, 3)
213 
214 	" .align 32			;\n"
215 	" 1:                            ;\n"
216 
	/* Four blocks of four quadwords each: 16 * 8 = 128 bytes per pass. */
217 	BLOCK(0)
218 	BLOCK(4)
219 	BLOCK(8)
220 	BLOCK(12)
221 
	/* Advance all five pointers by one chunk; loop while lines != 0. */
222 	"       addl $128, %1         ;\n"
223 	"       addl $128, %2         ;\n"
224 	"       addl $128, %3         ;\n"
225 	"       addl $128, %4         ;\n"
226 	"       addl $128, %5         ;\n"
227 	"       decl %0               ;\n"
228 	"       jnz 1b                ;\n"
	/*
	 * %0 = lines and %1..%3 = p1..p3 are read/write; %4 = p4 and
	 * %5 = p5 are declared as inputs only although they are advanced
	 * by the addl instructions above.
	 */
229 	: "+r" (lines),
230 	  "+r" (p1), "+r" (p2), "+r" (p3)
231 	: "r" (p4), "r" (p5)
232 	: "memory");
233 
234 	/* p4 and p5 were modified, and now the variables are dead.
235 	   Clobber them just to be sure nobody does something stupid
236 	   like assuming they have some legal value.  */
237 	asm("" : "=r" (p4), "=r" (p5));
238 
239 	kernel_fpu_end();
240 }
241 
/*
 * The instruction-string helpers are only needed by the xor_pII_mmx_*()
 * routines above; the xor_p5_mmx_*() routines below spell out every
 * instruction literally.
 */
242 #undef LD
243 #undef XO1
244 #undef XO2
245 #undef XO3
246 #undef XO4
247 #undef ST
248 #undef BLOCK
249 
/*
 * xor_p5_mmx_2 - XOR the buffer at p2 into the buffer at p1 using MMX.
 * @bytes: buffer length in bytes; work is done in 64-byte chunks
 *         (bytes >> 6), so a remainder below 64 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 *
 * Each pass handles eight quadwords, one per register %mm0..%mm7, with
 * the loads, pxors and stores of neighbouring quadwords interleaved.
 * NOTE(review): the interleaving is presumably an instruction-scheduling
 * choice for the target CPU -- not stated in the code.
 *
 * The chunk counter is tested only at the bottom of the loop.
 * NOTE(review): this assumes bytes >= 64 -- confirm against callers.
 */
250 static void
251 xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
252 {
253 	unsigned long lines = bytes >> 6;
254 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
255 	kernel_fpu_begin();
256 
257 	asm volatile(
258 	" .align 32	             ;\n"
259 	" 1:                         ;\n"
260 	"       movq   (%1), %%mm0   ;\n"
261 	"       movq  8(%1), %%mm1   ;\n"
262 	"       pxor   (%2), %%mm0   ;\n"
263 	"       movq 16(%1), %%mm2   ;\n"
264 	"       movq %%mm0,   (%1)   ;\n"
265 	"       pxor  8(%2), %%mm1   ;\n"
266 	"       movq 24(%1), %%mm3   ;\n"
267 	"       movq %%mm1,  8(%1)   ;\n"
268 	"       pxor 16(%2), %%mm2   ;\n"
269 	"       movq 32(%1), %%mm4   ;\n"
270 	"       movq %%mm2, 16(%1)   ;\n"
271 	"       pxor 24(%2), %%mm3   ;\n"
272 	"       movq 40(%1), %%mm5   ;\n"
273 	"       movq %%mm3, 24(%1)   ;\n"
274 	"       pxor 32(%2), %%mm4   ;\n"
275 	"       movq 48(%1), %%mm6   ;\n"
276 	"       movq %%mm4, 32(%1)   ;\n"
277 	"       pxor 40(%2), %%mm5   ;\n"
278 	"       movq 56(%1), %%mm7   ;\n"
279 	"       movq %%mm5, 40(%1)   ;\n"
280 	"       pxor 48(%2), %%mm6   ;\n"
281 	"       pxor 56(%2), %%mm7   ;\n"
282 	"       movq %%mm6, 48(%1)   ;\n"
283 	"       movq %%mm7, 56(%1)   ;\n"
284 
	/* Advance both pointers by one chunk and loop while lines != 0. */
285 	"       addl $64, %1         ;\n"
286 	"       addl $64, %2         ;\n"
287 	"       decl %0              ;\n"
288 	"       jnz 1b               ;\n"
	/* %0 = lines, %1 = p1, %2 = p2, all read/write. */
289 	: "+r" (lines),
290 	  "+r" (p1), "+r" (p2)
291 	:
292 	: "memory");
293 
294 	kernel_fpu_end();
295 }
296 
/*
 * xor_p5_mmx_3 - XOR the buffers at p2 and p3 into the buffer at p1
 * using MMX.
 * @bytes: buffer length in bytes; work is done in 64-byte chunks
 *         (bytes >> 6), so a remainder below 64 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 * @p3:    third source; only read.
 *
 * Each pass handles eight quadwords, one per register %mm0..%mm7, with
 * the loads, pxors and stores of different quadwords interleaved.
 *
 * The chunk counter is tested only at the bottom of the loop.
 * NOTE(review): this assumes bytes >= 64 -- confirm against callers.
 */
297 static void
298 xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
299 	     unsigned long *p3)
300 {
301 	unsigned long lines = bytes >> 6;
302 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
303 	kernel_fpu_begin();
304 
305 	asm volatile(
306 	" .align 32,0x90             ;\n"
307 	" 1:                         ;\n"
308 	"       movq   (%1), %%mm0   ;\n"
309 	"       movq  8(%1), %%mm1   ;\n"
310 	"       pxor   (%2), %%mm0   ;\n"
311 	"       movq 16(%1), %%mm2   ;\n"
312 	"       pxor  8(%2), %%mm1   ;\n"
313 	"       pxor   (%3), %%mm0   ;\n"
314 	"       pxor 16(%2), %%mm2   ;\n"
315 	"       movq %%mm0,   (%1)   ;\n"
316 	"       pxor  8(%3), %%mm1   ;\n"
317 	"       pxor 16(%3), %%mm2   ;\n"
318 	"       movq 24(%1), %%mm3   ;\n"
319 	"       movq %%mm1,  8(%1)   ;\n"
320 	"       movq 32(%1), %%mm4   ;\n"
321 	"       movq 40(%1), %%mm5   ;\n"
322 	"       pxor 24(%2), %%mm3   ;\n"
323 	"       movq %%mm2, 16(%1)   ;\n"
324 	"       pxor 32(%2), %%mm4   ;\n"
325 	"       pxor 24(%3), %%mm3   ;\n"
326 	"       pxor 40(%2), %%mm5   ;\n"
327 	"       movq %%mm3, 24(%1)   ;\n"
328 	"       pxor 32(%3), %%mm4   ;\n"
329 	"       pxor 40(%3), %%mm5   ;\n"
330 	"       movq 48(%1), %%mm6   ;\n"
331 	"       movq %%mm4, 32(%1)   ;\n"
332 	"       movq 56(%1), %%mm7   ;\n"
333 	"       pxor 48(%2), %%mm6   ;\n"
334 	"       movq %%mm5, 40(%1)   ;\n"
335 	"       pxor 56(%2), %%mm7   ;\n"
336 	"       pxor 48(%3), %%mm6   ;\n"
337 	"       pxor 56(%3), %%mm7   ;\n"
338 	"       movq %%mm6, 48(%1)   ;\n"
339 	"       movq %%mm7, 56(%1)   ;\n"
340 
	/* Advance all three pointers by one chunk; loop while lines != 0. */
341 	"       addl $64, %1         ;\n"
342 	"       addl $64, %2         ;\n"
343 	"       addl $64, %3         ;\n"
344 	"       decl %0              ;\n"
345 	"       jnz 1b               ;\n"
	/* %0 = lines, %1..%3 = p1..p3, all read/write. */
346 	: "+r" (lines),
347 	  "+r" (p1), "+r" (p2), "+r" (p3)
348 	:
349 	: "memory" );
350 
351 	kernel_fpu_end();
352 }
353 
/*
 * xor_p5_mmx_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * using MMX.
 * @bytes: buffer length in bytes; work is done in 64-byte chunks
 *         (bytes >> 6), so a remainder below 64 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 * @p3:    third source; only read.
 * @p4:    fourth source; only read.
 *
 * Each pass handles eight quadwords, one per register %mm0..%mm7, with
 * the loads, pxors and stores of different quadwords interleaved.
 *
 * The chunk counter is tested only at the bottom of the loop.
 * NOTE(review): this assumes bytes >= 64 -- confirm against callers.
 */
354 static void
355 xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
356 	     unsigned long *p3, unsigned long *p4)
357 {
358 	unsigned long lines = bytes >> 6;
359 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
360 	kernel_fpu_begin();
361 
362 	asm volatile(
363 	" .align 32,0x90             ;\n"
364 	" 1:                         ;\n"
365 	"       movq   (%1), %%mm0   ;\n"
366 	"       movq  8(%1), %%mm1   ;\n"
367 	"       pxor   (%2), %%mm0   ;\n"
368 	"       movq 16(%1), %%mm2   ;\n"
369 	"       pxor  8(%2), %%mm1   ;\n"
370 	"       pxor   (%3), %%mm0   ;\n"
371 	"       pxor 16(%2), %%mm2   ;\n"
372 	"       pxor  8(%3), %%mm1   ;\n"
373 	"       pxor   (%4), %%mm0   ;\n"
374 	"       movq 24(%1), %%mm3   ;\n"
375 	"       pxor 16(%3), %%mm2   ;\n"
376 	"       pxor  8(%4), %%mm1   ;\n"
377 	"       movq %%mm0,   (%1)   ;\n"
378 	"       movq 32(%1), %%mm4   ;\n"
379 	"       pxor 24(%2), %%mm3   ;\n"
380 	"       pxor 16(%4), %%mm2   ;\n"
381 	"       movq %%mm1,  8(%1)   ;\n"
382 	"       movq 40(%1), %%mm5   ;\n"
383 	"       pxor 32(%2), %%mm4   ;\n"
384 	"       pxor 24(%3), %%mm3   ;\n"
385 	"       movq %%mm2, 16(%1)   ;\n"
386 	"       pxor 40(%2), %%mm5   ;\n"
387 	"       pxor 32(%3), %%mm4   ;\n"
388 	"       pxor 24(%4), %%mm3   ;\n"
389 	"       movq %%mm3, 24(%1)   ;\n"
390 	"       movq 56(%1), %%mm7   ;\n"
391 	"       movq 48(%1), %%mm6   ;\n"
392 	"       pxor 40(%3), %%mm5   ;\n"
393 	"       pxor 32(%4), %%mm4   ;\n"
394 	"       pxor 48(%2), %%mm6   ;\n"
395 	"       movq %%mm4, 32(%1)   ;\n"
396 	"       pxor 56(%2), %%mm7   ;\n"
397 	"       pxor 40(%4), %%mm5   ;\n"
398 	"       pxor 48(%3), %%mm6   ;\n"
399 	"       pxor 56(%3), %%mm7   ;\n"
400 	"       movq %%mm5, 40(%1)   ;\n"
401 	"       pxor 48(%4), %%mm6   ;\n"
402 	"       pxor 56(%4), %%mm7   ;\n"
403 	"       movq %%mm6, 48(%1)   ;\n"
404 	"       movq %%mm7, 56(%1)   ;\n"
405 
	/* Advance all four pointers by one chunk; loop while lines != 0. */
406 	"       addl $64, %1         ;\n"
407 	"       addl $64, %2         ;\n"
408 	"       addl $64, %3         ;\n"
409 	"       addl $64, %4         ;\n"
410 	"       decl %0              ;\n"
411 	"       jnz 1b               ;\n"
	/* %0 = lines, %1..%4 = p1..p4, all read/write. */
412 	: "+r" (lines),
413 	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
414 	:
415 	: "memory");
416 
417 	kernel_fpu_end();
418 }
419 
/*
 * xor_p5_mmx_5 - XOR the buffers at p2..p5 into the buffer at p1
 * using MMX.
 * @bytes: buffer length in bytes; work is done in 64-byte chunks
 *         (bytes >> 6), so a remainder below 64 bytes is not processed.
 * @p1:    first source and destination; updated in place.
 * @p2:    second source; only read.
 * @p3:    third source; only read.
 * @p4:    fourth source; only read.
 * @p5:    fifth source; only read.
 *
 * Each pass handles eight quadwords, one per register %mm0..%mm7, with
 * the loads, pxors and stores of different quadwords interleaved.
 *
 * p4 and p5 are passed to the main asm as input-only operands even
 * though the asm advances them; the two empty asm statements around it
 * exist for that reason (see the comments in the body).
 *
 * The chunk counter is tested only at the bottom of the loop.
 * NOTE(review): this assumes bytes >= 64 -- confirm against callers.
 */
420 static void
421 xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
422 	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
423 {
424 	unsigned long lines = bytes >> 6;
425 
	/* All MMX register use sits between kernel_fpu_begin()/_end(). */
426 	kernel_fpu_begin();
427 
428 	/* Make sure GCC forgets anything it knows about p4 or p5,
429 	   such that it won't pass to the asm volatile below a
430 	   register that is shared with any other variable.  That's
431 	   because we modify p4 and p5 there, but we can't mark them
432 	   as read/write, otherwise we'd overflow the 10-asm-operands
433 	   limit of GCC < 3.1.  */
434 	asm("" : "+r" (p4), "+r" (p5));
435 
436 	asm volatile(
437 	" .align 32,0x90             ;\n"
438 	" 1:                         ;\n"
439 	"       movq   (%1), %%mm0   ;\n"
440 	"       movq  8(%1), %%mm1   ;\n"
441 	"       pxor   (%2), %%mm0   ;\n"
442 	"       pxor  8(%2), %%mm1   ;\n"
443 	"       movq 16(%1), %%mm2   ;\n"
444 	"       pxor   (%3), %%mm0   ;\n"
445 	"       pxor  8(%3), %%mm1   ;\n"
446 	"       pxor 16(%2), %%mm2   ;\n"
447 	"       pxor   (%4), %%mm0   ;\n"
448 	"       pxor  8(%4), %%mm1   ;\n"
449 	"       pxor 16(%3), %%mm2   ;\n"
450 	"       movq 24(%1), %%mm3   ;\n"
451 	"       pxor   (%5), %%mm0   ;\n"
452 	"       pxor  8(%5), %%mm1   ;\n"
453 	"       movq %%mm0,   (%1)   ;\n"
454 	"       pxor 16(%4), %%mm2   ;\n"
455 	"       pxor 24(%2), %%mm3   ;\n"
456 	"       movq %%mm1,  8(%1)   ;\n"
457 	"       pxor 16(%5), %%mm2   ;\n"
458 	"       pxor 24(%3), %%mm3   ;\n"
459 	"       movq 32(%1), %%mm4   ;\n"
460 	"       movq %%mm2, 16(%1)   ;\n"
461 	"       pxor 24(%4), %%mm3   ;\n"
462 	"       pxor 32(%2), %%mm4   ;\n"
463 	"       movq 40(%1), %%mm5   ;\n"
464 	"       pxor 24(%5), %%mm3   ;\n"
465 	"       pxor 32(%3), %%mm4   ;\n"
466 	"       pxor 40(%2), %%mm5   ;\n"
467 	"       movq %%mm3, 24(%1)   ;\n"
468 	"       pxor 32(%4), %%mm4   ;\n"
469 	"       pxor 40(%3), %%mm5   ;\n"
470 	"       movq 48(%1), %%mm6   ;\n"
471 	"       movq 56(%1), %%mm7   ;\n"
472 	"       pxor 32(%5), %%mm4   ;\n"
473 	"       pxor 40(%4), %%mm5   ;\n"
474 	"       pxor 48(%2), %%mm6   ;\n"
475 	"       pxor 56(%2), %%mm7   ;\n"
476 	"       movq %%mm4, 32(%1)   ;\n"
477 	"       pxor 48(%3), %%mm6   ;\n"
478 	"       pxor 56(%3), %%mm7   ;\n"
479 	"       pxor 40(%5), %%mm5   ;\n"
480 	"       pxor 48(%4), %%mm6   ;\n"
481 	"       pxor 56(%4), %%mm7   ;\n"
482 	"       movq %%mm5, 40(%1)   ;\n"
483 	"       pxor 48(%5), %%mm6   ;\n"
484 	"       pxor 56(%5), %%mm7   ;\n"
485 	"       movq %%mm6, 48(%1)   ;\n"
486 	"       movq %%mm7, 56(%1)   ;\n"
487 
	/* Advance all five pointers by one chunk; loop while lines != 0. */
488 	"       addl $64, %1         ;\n"
489 	"       addl $64, %2         ;\n"
490 	"       addl $64, %3         ;\n"
491 	"       addl $64, %4         ;\n"
492 	"       addl $64, %5         ;\n"
493 	"       decl %0              ;\n"
494 	"       jnz 1b               ;\n"
	/*
	 * %0 = lines and %1..%3 = p1..p3 are read/write; %4 = p4 and
	 * %5 = p5 are declared as inputs only although they are advanced
	 * by the addl instructions above.
	 */
495 	: "+r" (lines),
496 	  "+r" (p1), "+r" (p2), "+r" (p3)
497 	: "r" (p4), "r" (p5)
498 	: "memory");
499 
500 	/* p4 and p5 were modified, and now the variables are dead.
501 	   Clobber them just to be sure nobody does something stupid
502 	   like assuming they have some legal value.  */
503 	asm("" : "=r" (p4), "=r" (p5));
504 
505 	kernel_fpu_end();
506 }
507 
/*
 * Template named "pII_mmx": bundles the 128-bytes-per-pass
 * xor_pII_mmx_*() routines for 2, 3, 4 and 5 buffers.
 */
508 static struct xor_block_template xor_block_pII_mmx = {
509 	.name = "pII_mmx",
510 	.do_2 = xor_pII_mmx_2,
511 	.do_3 = xor_pII_mmx_3,
512 	.do_4 = xor_pII_mmx_4,
513 	.do_5 = xor_pII_mmx_5,
514 };
515 
/*
 * Template named "p5_mmx": bundles the 64-bytes-per-pass
 * xor_p5_mmx_*() routines for 2, 3, 4 and 5 buffers.
 */
516 static struct xor_block_template xor_block_p5_mmx = {
517 	.name = "p5_mmx",
518 	.do_2 = xor_p5_mmx_2,
519 	.do_3 = xor_p5_mmx_3,
520 	.do_4 = xor_p5_mmx_4,
521 	.do_5 = xor_p5_mmx_5,
522 };
523 
/*
 * Template named "pIII_sse": bundles the xor_sse_*() routines for
 * 2, 3, 4 and 5 buffers.
 * NOTE(review): xor_sse_2..xor_sse_5 are not defined in the visible part
 * of this header -- presumably they are provided before this point by
 * whatever includes it; verify.
 */
524 static struct xor_block_template xor_block_pIII_sse = {
525 	.name = "pIII_sse",
526 	.do_2 = xor_sse_2,
527 	.do_3 = xor_sse_3,
528 	.do_4 = xor_sse_4,
529 	.do_5 = xor_sse_5,
530 };
531 
532 /* Also try the AVX routines */
533 #include <asm/xor_avx.h>
534 
535 /* Also try the generic routines.  */
536 #include <asm-generic/xor.h>
537 
538 /* We force the use of the SSE xor block because it can write around L2.
539    We may also be able to load into the L1 only depending on how the cpu
540    deals with a load to a line that is being prefetched.  */
/*
 * XOR_TRY_TEMPLATES: hand the candidate templates to xor_speed().
 * AVX_XOR_SPEED is expanded first, unconditionally.  After that exactly
 * one group is tried: the two SSE templates if the boot CPU has
 * X86_FEATURE_XMM, otherwise the two MMX templates if it has
 * X86_FEATURE_MMX, otherwise the four generic register templates.
 * Any earlier definition of the macro is dropped by the #undef.
 * NOTE(review): xor_speed(), AVX_XOR_SPEED, xor_block_sse_pf64 and the
 * xor_block_*regs* templates are defined outside this part of the file.
 */
541 #undef XOR_TRY_TEMPLATES
542 #define XOR_TRY_TEMPLATES				\
543 do {							\
544 	AVX_XOR_SPEED;					\
545 	if (boot_cpu_has(X86_FEATURE_XMM)) {				\
546 		xor_speed(&xor_block_pIII_sse);		\
547 		xor_speed(&xor_block_sse_pf64);		\
548 	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
549 		xor_speed(&xor_block_pII_mmx);		\
550 		xor_speed(&xor_block_p5_mmx);		\
551 	} else {					\
552 		xor_speed(&xor_block_8regs);		\
553 		xor_speed(&xor_block_8regs_p);		\
554 		xor_speed(&xor_block_32regs);		\
555 		xor_speed(&xor_block_32regs_p);		\
556 	}						\
557 } while (0)
558 
559 #endif /* _ASM_X86_XOR_32_H */
560