xref: /linux/lib/raid/xor/x86/xor-mmx.c (revision 440d6635b20037bc9ad46b20817d7b61cef0fc1b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Optimized XOR parity functions for MMX.
4  *
5  * Copyright (C) 1998 Ingo Molnar.
6  */
7 #include <asm/fpu/api.h>
8 #include "xor_impl.h"
9 #include "xor_arch.h"
10 
/*
 * Inline-asm fragment helpers for the unrolled pII loops below.  Each
 * expands to one MMX instruction string acting on quadword @x (byte
 * offset 8*x) of an asm operand, with MMX register %mm<y>:
 *   LD(x, y)      - load  quadword x of operand %1 (p1) into %mm<y>
 *   ST(x, y)      - store %mm<y> back to quadword x of operand %1 (p1)
 *   XO1..XO4(x,y) - pxor quadword x of operand %2..%5 (p2..p5) into %mm<y>
 */
11 #define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
12 #define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
13 #define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
14 #define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
15 #define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
16 #define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
17 
/*
 * xor_pII_mmx_2 - compute p1 ^= p2 over @bytes bytes using MMX.
 * @bytes: buffer length; the loop count is bytes >> 7, so this assumes
 *         @bytes is a non-zero multiple of 128 (a zero count would make
 *         the decl/jnz loop underflow).
 * @p1:    destination buffer, receives p1 ^ p2.
 * @p2:    source buffer, read-only.
 *
 * Each loop iteration processes 128 bytes, keeping four quadwords in
 * flight in %mm0-%mm3 per BLOCK().  Uses 32-bit addl/decl on the
 * pointer operands, i.e. this is i386-only code.  The caller must
 * bracket this with kernel_fpu_begin()/kernel_fpu_end() (as
 * xor_gen_pII_mmx() does) since it clobbers MMX/FPU state.
 */
18 static void
19 xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
20 	      const unsigned long * __restrict p2)
21 {
22 	unsigned long lines = bytes >> 7;

	/*
	 * BLOCK(i) handles quadwords i..i+3: load four from p1, xor in
	 * the corresponding four from p2, store back.  The staggered
	 * indentation mirrors the four interleaved register streams.
	 */
24 	asm volatile(
25 #undef BLOCK
26 #define BLOCK(i)				\
27 	LD(i, 0)				\
28 		LD(i + 1, 1)			\
29 			LD(i + 2, 2)		\
30 				LD(i + 3, 3)	\
31 	XO1(i, 0)				\
32 	ST(i, 0)				\
33 		XO1(i+1, 1)			\
34 		ST(i+1, 1)			\
35 			XO1(i + 2, 2)		\
36 			ST(i + 2, 2)		\
37 				XO1(i + 3, 3)	\
38 				ST(i + 3, 3)
39 
40 	" .align 32			;\n"
41 	" 1:                            ;\n"
42 
43 	BLOCK(0)
44 	BLOCK(4)
45 	BLOCK(8)
46 	BLOCK(12)
47 
48 	"       addl $128, %1         ;\n"
49 	"       addl $128, %2         ;\n"
50 	"       decl %0               ;\n"
51 	"       jnz 1b                ;\n"
52 	: "+r" (lines),
53 	  "+r" (p1), "+r" (p2)
54 	:
55 	: "memory");
56 }
57 
/*
 * xor_pII_mmx_3 - compute p1 ^= p2 ^ p3 over @bytes bytes using MMX.
 * @bytes: buffer length; loop count is bytes >> 7, so @bytes is assumed
 *         to be a non-zero multiple of 128.
 * @p1:    destination buffer.
 * @p2, @p3: source buffers, read-only.
 *
 * Same structure as xor_pII_mmx_2() with one extra pxor pass (XO2)
 * against p3 before each store.  i386-only (32-bit addl on pointers);
 * caller must hold kernel_fpu_begin().
 */
58 static void
59 xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
60 	      const unsigned long * __restrict p2,
61 	      const unsigned long * __restrict p3)
62 {
63 	unsigned long lines = bytes >> 7;

	/* BLOCK(i): load quadwords i..i+3 of p1, xor in p2 then p3, store. */
65 	asm volatile(
66 #undef BLOCK
67 #define BLOCK(i)				\
68 	LD(i, 0)				\
69 		LD(i + 1, 1)			\
70 			LD(i + 2, 2)		\
71 				LD(i + 3, 3)	\
72 	XO1(i, 0)				\
73 		XO1(i + 1, 1)			\
74 			XO1(i + 2, 2)		\
75 				XO1(i + 3, 3)	\
76 	XO2(i, 0)				\
77 	ST(i, 0)				\
78 		XO2(i + 1, 1)			\
79 		ST(i + 1, 1)			\
80 			XO2(i + 2, 2)		\
81 			ST(i + 2, 2)		\
82 				XO2(i + 3, 3)	\
83 				ST(i + 3, 3)
84 
85 	" .align 32			;\n"
86 	" 1:                            ;\n"
87 
88 	BLOCK(0)
89 	BLOCK(4)
90 	BLOCK(8)
91 	BLOCK(12)
92 
93 	"       addl $128, %1         ;\n"
94 	"       addl $128, %2         ;\n"
95 	"       addl $128, %3         ;\n"
96 	"       decl %0               ;\n"
97 	"       jnz 1b                ;\n"
98 	: "+r" (lines),
99 	  "+r" (p1), "+r" (p2), "+r" (p3)
100 	:
101 	: "memory");
102 }
103 
/*
 * xor_pII_mmx_4 - compute p1 ^= p2 ^ p3 ^ p4 over @bytes bytes using MMX.
 * @bytes: buffer length; loop count is bytes >> 7, so @bytes is assumed
 *         to be a non-zero multiple of 128.
 * @p1:    destination buffer.
 * @p2..@p4: source buffers, read-only.
 *
 * Same structure as xor_pII_mmx_2()/_3() with xor passes XO1..XO3
 * against p2..p4 before each store.  i386-only; caller must hold
 * kernel_fpu_begin().
 */
104 static void
105 xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
106 	      const unsigned long * __restrict p2,
107 	      const unsigned long * __restrict p3,
108 	      const unsigned long * __restrict p4)
109 {
110 	unsigned long lines = bytes >> 7;

	/* BLOCK(i): load quadwords i..i+3 of p1, xor in p2, p3, p4, store. */
112 	asm volatile(
113 #undef BLOCK
114 #define BLOCK(i)				\
115 	LD(i, 0)				\
116 		LD(i + 1, 1)			\
117 			LD(i + 2, 2)		\
118 				LD(i + 3, 3)	\
119 	XO1(i, 0)				\
120 		XO1(i + 1, 1)			\
121 			XO1(i + 2, 2)		\
122 				XO1(i + 3, 3)	\
123 	XO2(i, 0)				\
124 		XO2(i + 1, 1)			\
125 			XO2(i + 2, 2)		\
126 				XO2(i + 3, 3)	\
127 	XO3(i, 0)				\
128 	ST(i, 0)				\
129 		XO3(i + 1, 1)			\
130 		ST(i + 1, 1)			\
131 			XO3(i + 2, 2)		\
132 			ST(i + 2, 2)		\
133 				XO3(i + 3, 3)	\
134 				ST(i + 3, 3)
135 
136 	" .align 32			;\n"
137 	" 1:                            ;\n"
138 
139 	BLOCK(0)
140 	BLOCK(4)
141 	BLOCK(8)
142 	BLOCK(12)
143 
144 	"       addl $128, %1         ;\n"
145 	"       addl $128, %2         ;\n"
146 	"       addl $128, %3         ;\n"
147 	"       addl $128, %4         ;\n"
148 	"       decl %0               ;\n"
149 	"       jnz 1b                ;\n"
150 	: "+r" (lines),
151 	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
152 	:
153 	: "memory");
154 }
155 
156 
/*
 * xor_pII_mmx_5 - compute p1 ^= p2 ^ p3 ^ p4 ^ p5 over @bytes bytes.
 * @bytes: buffer length; loop count is bytes >> 7, so @bytes is assumed
 *         to be a non-zero multiple of 128.
 * @p1:    destination buffer.
 * @p2..@p5: source buffers, read-only from the caller's point of view.
 *
 * Same structure as the 2..4-source variants, with xor passes XO1..XO4.
 * p4 and p5 are passed as *input* operands (%4/%5) yet are advanced by
 * the asm; the surrounding empty asm statements (see comments below)
 * keep that safe for GCC.  i386-only; caller must hold
 * kernel_fpu_begin().
 */
157 static void
158 xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
159 	      const unsigned long * __restrict p2,
160 	      const unsigned long * __restrict p3,
161 	      const unsigned long * __restrict p4,
162 	      const unsigned long * __restrict p5)
163 {
164 	unsigned long lines = bytes >> 7;
165 
166 	/* Make sure GCC forgets anything it knows about p4 or p5,
167 	   such that it won't pass to the asm volatile below a
168 	   register that is shared with any other variable.  That's
169 	   because we modify p4 and p5 there, but we can't mark them
170 	   as read/write, otherwise we'd overflow the 10-asm-operands
171 	   limit of GCC < 3.1.  */
172 	asm("" : "+r" (p4), "+r" (p5));

	/* BLOCK(i): load quadwords i..i+3 of p1, xor in p2..p5, store. */
174 	asm volatile(
175 #undef BLOCK
176 #define BLOCK(i)				\
177 	LD(i, 0)				\
178 		LD(i + 1, 1)			\
179 			LD(i + 2, 2)		\
180 				LD(i + 3, 3)	\
181 	XO1(i, 0)				\
182 		XO1(i + 1, 1)			\
183 			XO1(i + 2, 2)		\
184 				XO1(i + 3, 3)	\
185 	XO2(i, 0)				\
186 		XO2(i + 1, 1)			\
187 			XO2(i + 2, 2)		\
188 				XO2(i + 3, 3)	\
189 	XO3(i, 0)				\
190 		XO3(i + 1, 1)			\
191 			XO3(i + 2, 2)		\
192 				XO3(i + 3, 3)	\
193 	XO4(i, 0)				\
194 	ST(i, 0)				\
195 		XO4(i + 1, 1)			\
196 		ST(i + 1, 1)			\
197 			XO4(i + 2, 2)		\
198 			ST(i + 2, 2)		\
199 				XO4(i + 3, 3)	\
200 				ST(i + 3, 3)
201 
202 	" .align 32			;\n"
203 	" 1:                            ;\n"
204 
205 	BLOCK(0)
206 	BLOCK(4)
207 	BLOCK(8)
208 	BLOCK(12)
209 
210 	"       addl $128, %1         ;\n"
211 	"       addl $128, %2         ;\n"
212 	"       addl $128, %3         ;\n"
213 	"       addl $128, %4         ;\n"
214 	"       addl $128, %5         ;\n"
215 	"       decl %0               ;\n"
216 	"       jnz 1b                ;\n"
217 	: "+r" (lines),
218 	  "+r" (p1), "+r" (p2), "+r" (p3)
219 	: "r" (p4), "r" (p5)
220 	: "memory");
221 
222 	/* p4 and p5 were modified, and now the variables are dead.
223 	   Clobber them just to be sure nobody does something stupid
224 	   like assuming they have some legal value.  */
225 	asm("" : "=r" (p4), "=r" (p5));
226 }
227 
/* The fragment helpers are only meant for the pII loops above. */
228 #undef LD
229 #undef XO1
230 #undef XO2
231 #undef XO3
232 #undef XO4
233 #undef ST
234 #undef BLOCK
235 
/*
 * xor_p5_mmx_2 - compute p1 ^= p2 over @bytes bytes using MMX.
 * @bytes: buffer length; loop count is bytes >> 6, so @bytes is assumed
 *         to be a non-zero multiple of 64.
 * @p1:    destination buffer.
 * @p2:    source buffer, read-only.
 *
 * Unlike the pII variants, each iteration handles a single 64-byte run
 * using all eight MMX registers %mm0-%mm7, with loads, xors and stores
 * interleaved by hand (NOTE(review): presumably scheduled for Pentium
 * (p5) pipe pairing, hence the name — not verified here).  i386-only;
 * caller must hold kernel_fpu_begin() (see xor_gen_p5_mmx()).
 */
236 static void
237 xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
238 	     const unsigned long * __restrict p2)
239 {
240 	unsigned long lines = bytes >> 6;
241 
242 	asm volatile(
243 	" .align 32	             ;\n"
244 	" 1:                         ;\n"
245 	"       movq   (%1), %%mm0   ;\n"
246 	"       movq  8(%1), %%mm1   ;\n"
247 	"       pxor   (%2), %%mm0   ;\n"
248 	"       movq 16(%1), %%mm2   ;\n"
249 	"       movq %%mm0,   (%1)   ;\n"
250 	"       pxor  8(%2), %%mm1   ;\n"
251 	"       movq 24(%1), %%mm3   ;\n"
252 	"       movq %%mm1,  8(%1)   ;\n"
253 	"       pxor 16(%2), %%mm2   ;\n"
254 	"       movq 32(%1), %%mm4   ;\n"
255 	"       movq %%mm2, 16(%1)   ;\n"
256 	"       pxor 24(%2), %%mm3   ;\n"
257 	"       movq 40(%1), %%mm5   ;\n"
258 	"       movq %%mm3, 24(%1)   ;\n"
259 	"       pxor 32(%2), %%mm4   ;\n"
260 	"       movq 48(%1), %%mm6   ;\n"
261 	"       movq %%mm4, 32(%1)   ;\n"
262 	"       pxor 40(%2), %%mm5   ;\n"
263 	"       movq 56(%1), %%mm7   ;\n"
264 	"       movq %%mm5, 40(%1)   ;\n"
265 	"       pxor 48(%2), %%mm6   ;\n"
266 	"       pxor 56(%2), %%mm7   ;\n"
267 	"       movq %%mm6, 48(%1)   ;\n"
268 	"       movq %%mm7, 56(%1)   ;\n"
269 
270 	"       addl $64, %1         ;\n"
271 	"       addl $64, %2         ;\n"
272 	"       decl %0              ;\n"
273 	"       jnz 1b               ;\n"
274 	: "+r" (lines),
275 	  "+r" (p1), "+r" (p2)
276 	:
277 	: "memory");
278 }
279 
/*
 * xor_p5_mmx_3 - compute p1 ^= p2 ^ p3 over @bytes bytes using MMX.
 * @bytes: buffer length; loop count is bytes >> 6, so @bytes is assumed
 *         to be a non-zero multiple of 64.
 * @p1:    destination buffer.
 * @p2, @p3: source buffers, read-only.
 *
 * 64 bytes per iteration in %mm0-%mm7, hand-interleaved like
 * xor_p5_mmx_2() with an extra pxor stream against p3.  i386-only;
 * caller must hold kernel_fpu_begin().
 */
280 static void
281 xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
282 	     const unsigned long * __restrict p2,
283 	     const unsigned long * __restrict p3)
284 {
285 	unsigned long lines = bytes >> 6;
286 
287 	asm volatile(
288 	" .align 32,0x90             ;\n"
289 	" 1:                         ;\n"
290 	"       movq   (%1), %%mm0   ;\n"
291 	"       movq  8(%1), %%mm1   ;\n"
292 	"       pxor   (%2), %%mm0   ;\n"
293 	"       movq 16(%1), %%mm2   ;\n"
294 	"       pxor  8(%2), %%mm1   ;\n"
295 	"       pxor   (%3), %%mm0   ;\n"
296 	"       pxor 16(%2), %%mm2   ;\n"
297 	"       movq %%mm0,   (%1)   ;\n"
298 	"       pxor  8(%3), %%mm1   ;\n"
299 	"       pxor 16(%3), %%mm2   ;\n"
300 	"       movq 24(%1), %%mm3   ;\n"
301 	"       movq %%mm1,  8(%1)   ;\n"
302 	"       movq 32(%1), %%mm4   ;\n"
303 	"       movq 40(%1), %%mm5   ;\n"
304 	"       pxor 24(%2), %%mm3   ;\n"
305 	"       movq %%mm2, 16(%1)   ;\n"
306 	"       pxor 32(%2), %%mm4   ;\n"
307 	"       pxor 24(%3), %%mm3   ;\n"
308 	"       pxor 40(%2), %%mm5   ;\n"
309 	"       movq %%mm3, 24(%1)   ;\n"
310 	"       pxor 32(%3), %%mm4   ;\n"
311 	"       pxor 40(%3), %%mm5   ;\n"
312 	"       movq 48(%1), %%mm6   ;\n"
313 	"       movq %%mm4, 32(%1)   ;\n"
314 	"       movq 56(%1), %%mm7   ;\n"
315 	"       pxor 48(%2), %%mm6   ;\n"
316 	"       movq %%mm5, 40(%1)   ;\n"
317 	"       pxor 56(%2), %%mm7   ;\n"
318 	"       pxor 48(%3), %%mm6   ;\n"
319 	"       pxor 56(%3), %%mm7   ;\n"
320 	"       movq %%mm6, 48(%1)   ;\n"
321 	"       movq %%mm7, 56(%1)   ;\n"
322 
323 	"       addl $64, %1         ;\n"
324 	"       addl $64, %2         ;\n"
325 	"       addl $64, %3         ;\n"
326 	"       decl %0              ;\n"
327 	"       jnz 1b               ;\n"
328 	: "+r" (lines),
329 	  "+r" (p1), "+r" (p2), "+r" (p3)
330 	:
331 	: "memory" );
332 }
333 
/*
 * xor_p5_mmx_4 - compute p1 ^= p2 ^ p3 ^ p4 over @bytes bytes using MMX.
 * @bytes: buffer length; loop count is bytes >> 6, so @bytes is assumed
 *         to be a non-zero multiple of 64.
 * @p1:    destination buffer.
 * @p2..@p4: source buffers, read-only.
 *
 * 64 bytes per iteration in %mm0-%mm7 with three hand-interleaved pxor
 * streams (p2, p3, p4).  i386-only; caller must hold
 * kernel_fpu_begin().
 */
334 static void
335 xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
336 	     const unsigned long * __restrict p2,
337 	     const unsigned long * __restrict p3,
338 	     const unsigned long * __restrict p4)
339 {
340 	unsigned long lines = bytes >> 6;
341 
342 	asm volatile(
343 	" .align 32,0x90             ;\n"
344 	" 1:                         ;\n"
345 	"       movq   (%1), %%mm0   ;\n"
346 	"       movq  8(%1), %%mm1   ;\n"
347 	"       pxor   (%2), %%mm0   ;\n"
348 	"       movq 16(%1), %%mm2   ;\n"
349 	"       pxor  8(%2), %%mm1   ;\n"
350 	"       pxor   (%3), %%mm0   ;\n"
351 	"       pxor 16(%2), %%mm2   ;\n"
352 	"       pxor  8(%3), %%mm1   ;\n"
353 	"       pxor   (%4), %%mm0   ;\n"
354 	"       movq 24(%1), %%mm3   ;\n"
355 	"       pxor 16(%3), %%mm2   ;\n"
356 	"       pxor  8(%4), %%mm1   ;\n"
357 	"       movq %%mm0,   (%1)   ;\n"
358 	"       movq 32(%1), %%mm4   ;\n"
359 	"       pxor 24(%2), %%mm3   ;\n"
360 	"       pxor 16(%4), %%mm2   ;\n"
361 	"       movq %%mm1,  8(%1)   ;\n"
362 	"       movq 40(%1), %%mm5   ;\n"
363 	"       pxor 32(%2), %%mm4   ;\n"
364 	"       pxor 24(%3), %%mm3   ;\n"
365 	"       movq %%mm2, 16(%1)   ;\n"
366 	"       pxor 40(%2), %%mm5   ;\n"
367 	"       pxor 32(%3), %%mm4   ;\n"
368 	"       pxor 24(%4), %%mm3   ;\n"
369 	"       movq %%mm3, 24(%1)   ;\n"
370 	"       movq 56(%1), %%mm7   ;\n"
371 	"       movq 48(%1), %%mm6   ;\n"
372 	"       pxor 40(%3), %%mm5   ;\n"
373 	"       pxor 32(%4), %%mm4   ;\n"
374 	"       pxor 48(%2), %%mm6   ;\n"
375 	"       movq %%mm4, 32(%1)   ;\n"
376 	"       pxor 56(%2), %%mm7   ;\n"
377 	"       pxor 40(%4), %%mm5   ;\n"
378 	"       pxor 48(%3), %%mm6   ;\n"
379 	"       pxor 56(%3), %%mm7   ;\n"
380 	"       movq %%mm5, 40(%1)   ;\n"
381 	"       pxor 48(%4), %%mm6   ;\n"
382 	"       pxor 56(%4), %%mm7   ;\n"
383 	"       movq %%mm6, 48(%1)   ;\n"
384 	"       movq %%mm7, 56(%1)   ;\n"
385 
386 	"       addl $64, %1         ;\n"
387 	"       addl $64, %2         ;\n"
388 	"       addl $64, %3         ;\n"
389 	"       addl $64, %4         ;\n"
390 	"       decl %0              ;\n"
391 	"       jnz 1b               ;\n"
392 	: "+r" (lines),
393 	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
394 	:
395 	: "memory");
396 }
397 
/*
 * xor_p5_mmx_5 - compute p1 ^= p2 ^ p3 ^ p4 ^ p5 over @bytes bytes.
 * @bytes: buffer length; loop count is bytes >> 6, so @bytes is assumed
 *         to be a non-zero multiple of 64.
 * @p1:    destination buffer.
 * @p2..@p5: source buffers, read-only from the caller's point of view.
 *
 * 64 bytes per iteration in %mm0-%mm7 with four hand-interleaved pxor
 * streams.  As in xor_pII_mmx_5(), p4 and p5 are input operands that
 * the asm nevertheless advances; the empty asm statements before and
 * after (see comments) make that safe for GCC.  i386-only; caller must
 * hold kernel_fpu_begin().
 */
398 static void
399 xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
400 	     const unsigned long * __restrict p2,
401 	     const unsigned long * __restrict p3,
402 	     const unsigned long * __restrict p4,
403 	     const unsigned long * __restrict p5)
404 {
405 	unsigned long lines = bytes >> 6;
406 
407 	/* Make sure GCC forgets anything it knows about p4 or p5,
408 	   such that it won't pass to the asm volatile below a
409 	   register that is shared with any other variable.  That's
410 	   because we modify p4 and p5 there, but we can't mark them
411 	   as read/write, otherwise we'd overflow the 10-asm-operands
412 	   limit of GCC < 3.1.  */
413 	asm("" : "+r" (p4), "+r" (p5));
414 
415 	asm volatile(
416 	" .align 32,0x90             ;\n"
417 	" 1:                         ;\n"
418 	"       movq   (%1), %%mm0   ;\n"
419 	"       movq  8(%1), %%mm1   ;\n"
420 	"       pxor   (%2), %%mm0   ;\n"
421 	"       pxor  8(%2), %%mm1   ;\n"
422 	"       movq 16(%1), %%mm2   ;\n"
423 	"       pxor   (%3), %%mm0   ;\n"
424 	"       pxor  8(%3), %%mm1   ;\n"
425 	"       pxor 16(%2), %%mm2   ;\n"
426 	"       pxor   (%4), %%mm0   ;\n"
427 	"       pxor  8(%4), %%mm1   ;\n"
428 	"       pxor 16(%3), %%mm2   ;\n"
429 	"       movq 24(%1), %%mm3   ;\n"
430 	"       pxor   (%5), %%mm0   ;\n"
431 	"       pxor  8(%5), %%mm1   ;\n"
432 	"       movq %%mm0,   (%1)   ;\n"
433 	"       pxor 16(%4), %%mm2   ;\n"
434 	"       pxor 24(%2), %%mm3   ;\n"
435 	"       movq %%mm1,  8(%1)   ;\n"
436 	"       pxor 16(%5), %%mm2   ;\n"
437 	"       pxor 24(%3), %%mm3   ;\n"
438 	"       movq 32(%1), %%mm4   ;\n"
439 	"       movq %%mm2, 16(%1)   ;\n"
440 	"       pxor 24(%4), %%mm3   ;\n"
441 	"       pxor 32(%2), %%mm4   ;\n"
442 	"       movq 40(%1), %%mm5   ;\n"
443 	"       pxor 24(%5), %%mm3   ;\n"
444 	"       pxor 32(%3), %%mm4   ;\n"
445 	"       pxor 40(%2), %%mm5   ;\n"
446 	"       movq %%mm3, 24(%1)   ;\n"
447 	"       pxor 32(%4), %%mm4   ;\n"
448 	"       pxor 40(%3), %%mm5   ;\n"
449 	"       movq 48(%1), %%mm6   ;\n"
450 	"       movq 56(%1), %%mm7   ;\n"
451 	"       pxor 32(%5), %%mm4   ;\n"
452 	"       pxor 40(%4), %%mm5   ;\n"
453 	"       pxor 48(%2), %%mm6   ;\n"
454 	"       pxor 56(%2), %%mm7   ;\n"
455 	"       movq %%mm4, 32(%1)   ;\n"
456 	"       pxor 48(%3), %%mm6   ;\n"
457 	"       pxor 56(%3), %%mm7   ;\n"
458 	"       pxor 40(%5), %%mm5   ;\n"
459 	"       pxor 48(%4), %%mm6   ;\n"
460 	"       pxor 56(%4), %%mm7   ;\n"
461 	"       movq %%mm5, 40(%1)   ;\n"
462 	"       pxor 48(%5), %%mm6   ;\n"
463 	"       pxor 56(%5), %%mm7   ;\n"
464 	"       movq %%mm6, 48(%1)   ;\n"
465 	"       movq %%mm7, 56(%1)   ;\n"
466 
467 	"       addl $64, %1         ;\n"
468 	"       addl $64, %2         ;\n"
469 	"       addl $64, %3         ;\n"
470 	"       addl $64, %4         ;\n"
471 	"       addl $64, %5         ;\n"
472 	"       decl %0              ;\n"
473 	"       jnz 1b               ;\n"
474 	: "+r" (lines),
475 	  "+r" (p1), "+r" (p2), "+r" (p3)
476 	: "r" (p4), "r" (p5)
477 	: "memory");
478 
479 	/* p4 and p5 were modified, and now the variables are dead.
480 	   Clobber them just to be sure nobody does something stupid
481 	   like assuming they have some legal value.  */
482 	asm("" : "=r" (p4), "=r" (p5));
483 }
484 
/*
 * DO_XOR_BLOCKS() comes from "xor_impl.h" (not visible here); judging by
 * the call in xor_gen_pII_mmx() below, it generates
 * xor_gen_pII_mmx_inner(), dispatching to the 2..5-source routines
 * above by source count — confirm against xor_impl.h.
 */
485 DO_XOR_BLOCKS(pII_mmx_inner, xor_pII_mmx_2, xor_pII_mmx_3, xor_pII_mmx_4,
486 		xor_pII_mmx_5);
487 
/*
 * xor_gen_pII_mmx - XOR @src_cnt source buffers in @srcs into @dest,
 * @bytes bytes each, via the generated pII dispatcher.  Brackets the
 * work in kernel_fpu_begin()/kernel_fpu_end(), which is required
 * because the worker routines use the MMX register file.
 */
488 static void xor_gen_pII_mmx(void *dest, void **srcs, unsigned int src_cnt,
489 		unsigned int bytes)
490 {
491 	kernel_fpu_begin();
492 	xor_gen_pII_mmx_inner(dest, srcs, src_cnt, bytes);
493 	kernel_fpu_end();
494 }
495 
/*
 * Externally visible descriptor for this implementation; .name
 * identifies it and .xor_gen performs the XOR with FPU state saved.
 */
496 struct xor_block_template xor_block_pII_mmx = {
497 	.name		= "pII_mmx",
498 	.xor_gen	= xor_gen_pII_mmx,
499 };
500 
/*
 * Generates xor_gen_p5_mmx_inner() (used by xor_gen_p5_mmx() below),
 * dispatching to the 2..5-source p5 routines above by source count —
 * confirm against DO_XOR_BLOCKS() in xor_impl.h.
 */
501 DO_XOR_BLOCKS(p5_mmx_inner, xor_p5_mmx_2, xor_p5_mmx_3, xor_p5_mmx_4,
502 		xor_p5_mmx_5);
503 
/*
 * xor_gen_p5_mmx - XOR @src_cnt source buffers in @srcs into @dest,
 * @bytes bytes each, via the generated p5 dispatcher.  Brackets the
 * work in kernel_fpu_begin()/kernel_fpu_end(), which is required
 * because the worker routines use the MMX register file.
 */
504 static void xor_gen_p5_mmx(void *dest, void **srcs, unsigned int src_cnt,
505 		unsigned int bytes)
506 {
507 	kernel_fpu_begin();
508 	xor_gen_p5_mmx_inner(dest, srcs, src_cnt, bytes);
509 	kernel_fpu_end();
510 }
511 
/*
 * Externally visible descriptor for this implementation; .name
 * identifies it and .xor_gen performs the XOR with FPU state saved.
 */
512 struct xor_block_template xor_block_p5_mmx = {
513 	.name		= "p5_mmx",
514 	.xor_gen	= xor_gen_p5_mmx,
515 };
516