#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
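
/*
 * Each routine below XORs two to five source blocks into the first block
 * (p1 ^= p2 [^ p3 ...]), one 64- or 128-byte chunk per loop iteration,
 * with the MMX/SSE register state protected by kernel_fpu_begin() and
 * kernel_fpu_end().  As an illustrative sketch only (not part of the
 * original file), every variant computes the same result as this portable
 * C loop, just unrolled and scheduled for the target CPU:
 *
 *	static void xor_ref_2(unsigned long bytes,
 *			      unsigned long *p1, unsigned long *p2)
 *	{
 *		unsigned long i;
 *
 *		for (i = 0; i < bytes / sizeof(*p1); i++)
 *			p1[i] ^= p2[i];
 *	}
 */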

#define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
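
/*
 * LD/ST/XOn each expand to one MMX instruction operating on the x-th
 * quadword (offset 8*x bytes) of the current chunk, in register %mmN
 * selected by y.  Operand %1 is the destination p1 (loads and stores),
 * while XO1..XO4 pxor in data from p2..p5 (%2..%5) of the asm statements
 * below.
 */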

#include <asm/i387.h>

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
	ST(i, 0)				\
		XO1(i+1, 1)			\
		ST(i+1, 1)			\
			XO1(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO1(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
	ST(i, 0)				\
		XO2(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO2(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO2(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
	ST(i, 0)				\
		XO3(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO3(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO3(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
		XO3(i + 1, 1)			\
			XO3(i + 2, 2)		\
				XO3(i + 3, 3)	\
	XO4(i, 0)				\
	ST(i, 0)				\
		XO4(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO4(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO4(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       addl $128, %5         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
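
/*
 * The xor_p5_mmx_* variants below do the same XOR work, but written out
 * longhand for original Pentium (p5) class CPUs: 64 bytes per loop
 * iteration, with loads, pxors and stores interleaved by hand instead of
 * being generated from a BLOCK() macro.  (That the interleaving targets
 * the Pentium's dual pipelines is an assumption based on the "p5" name;
 * the code itself is simply an 8 x 8-byte unrolled loop.)
 */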

static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32	             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor   (%5), %%mm0   ;\n"
	"       pxor  8(%5), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%5), %%mm2   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%5), %%mm3   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 32(%5), %%mm4   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       pxor 40(%5), %%mm5   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%5), %%mm6   ;\n"
	"       pxor 56(%5), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       addl $64, %5         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};
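
/*
 * Each xor_block_template above bundles the 2..5 source-buffer variants
 * of one implementation so the generic XOR code can benchmark whole
 * templates against each other and pick a winner; see XOR_TRY_TEMPLATES
 * and XOR_SELECT_TEMPLATE at the end of this file.
 */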

/*
 * Cache avoiding checksumming functions utilizing KNI (SSE) instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */
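
/*
 * The SSE routines below work on 256 bytes per loop iteration, using
 * movaps/xorps on 16-byte chunks and prefetchnta (see PF_OFFS) to pull
 * in the next 256-byte chunk of each buffer as non-temporal data, which
 * is what makes them "cache avoiding".
 */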

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n"
#define LD(x, y)	"       movaps   "OFFS(x)"(%1), %%xmm"#y"	;\n"
#define ST(x, y)	"       movaps %%xmm"#y",   "OFFS(x)"(%1)	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%2)		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%3)		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%4)		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%5)		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%6)		;\n"
#define XO1(x, y)	"       xorps   "OFFS(x)"(%2), %%xmm"#y"	;\n"
#define XO2(x, y)	"       xorps   "OFFS(x)"(%3), %%xmm"#y"	;\n"
#define XO3(x, y)	"       xorps   "OFFS(x)"(%4), %%xmm"#y"	;\n"
#define XO4(x, y)	"       xorps   "OFFS(x)"(%5), %%xmm"#y"	;\n"
#define XO5(x, y)	"       xorps   "OFFS(x)"(%6), %%xmm"#y"	;\n"

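/*
 * Note: movaps and xorps with a memory operand fault unless the address
 * is 16-byte aligned, so every buffer handed to the xor_sse_* routines
 * must be 16-byte aligned (page-aligned buffers, as the RAID code uses,
 * trivially satisfy this).
 */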

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i,0)					\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i,0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i,0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i,0)					\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r"(p2), "+r"(p3)
	:
	: "memory" );

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i,0)					\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i,0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i,0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i,0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i,0)					\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory" );

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i,0)					\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i,0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i,0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i,0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i,0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i,0)					\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addl $256, %1           ;\n"
	"       addl $256, %2           ;\n"
	"       addl $256, %3           ;\n"
	"       addl $256, %4           ;\n"
	"       addl $256, %5           ;\n"
	"       decl %0                 ;\n"
	"       jnz 1b                  ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	xor_speed(&xor_block_8regs);			\
	xor_speed(&xor_block_8regs_p);			\
	xor_speed(&xor_block_32regs);			\
	xor_speed(&xor_block_32regs_p);			\
	AVX_XOR_SPEED;					\
	if (cpu_has_xmm)				\
		xor_speed(&xor_block_pIII_sse);		\
	if (cpu_has_mmx) {				\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	}						\
} while (0)
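
/*
 * Rough usage sketch (the caller lives outside this file): the generic
 * XOR benchmarking code expands XOR_TRY_TEMPLATES once at boot to run
 * xor_speed() on every candidate template, then passes the fastest one
 * through XOR_SELECT_TEMPLATE(), which here overrides the benchmark and
 * picks the SSE (or AVX) template whenever the CPU supports it; see the
 * comment below.
 */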

/* We force the use of the SSE xor block because it can write around the
   L2 cache.  We may also be able to load into the L1 cache only, depending
   on how the CPU deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST)			\
	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

#endif /* _ASM_X86_XOR_32_H */