1af1a8899SThomas Gleixner /* SPDX-License-Identifier: GPL-2.0-or-later */
249502766SLevin, Alexander (Sasha Levin) #ifndef _ASM_X86_XOR_H
3e8f6e3f8SJan Beulich #define _ASM_X86_XOR_H
4e8f6e3f8SJan Beulich
5e8f6e3f8SJan Beulich /*
6e8f6e3f8SJan Beulich * Optimized RAID-5 checksumming functions for SSE.
7e8f6e3f8SJan Beulich */
8e8f6e3f8SJan Beulich
9e8f6e3f8SJan Beulich /*
10e8f6e3f8SJan Beulich * Cache avoiding checksumming functions utilizing KNI instructions
11e8f6e3f8SJan Beulich * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
12e8f6e3f8SJan Beulich */
13e8f6e3f8SJan Beulich
14e8f6e3f8SJan Beulich /*
15e8f6e3f8SJan Beulich * Based on
16e8f6e3f8SJan Beulich * High-speed RAID5 checksumming functions utilizing SSE instructions.
17e8f6e3f8SJan Beulich * Copyright (C) 1998 Ingo Molnar.
18e8f6e3f8SJan Beulich */
19e8f6e3f8SJan Beulich
20e8f6e3f8SJan Beulich /*
21e8f6e3f8SJan Beulich * x86-64 changes / gcc fixes from Andi Kleen.
22e8f6e3f8SJan Beulich * Copyright 2002 Andi Kleen, SuSE Labs.
23e8f6e3f8SJan Beulich *
24e8f6e3f8SJan Beulich * This hasn't been optimized for the hammer yet, but there are likely
25e8f6e3f8SJan Beulich * no advantages to be gotten from x86-64 here anyways.
26e8f6e3f8SJan Beulich */
27e8f6e3f8SJan Beulich
28df6b35f4SIngo Molnar #include <asm/fpu/api.h>
29e8f6e3f8SJan Beulich
30e8f6e3f8SJan Beulich #ifdef CONFIG_X86_32
31e8f6e3f8SJan Beulich /* reduce register pressure */
32e8f6e3f8SJan Beulich # define XOR_CONSTANT_CONSTRAINT "i"
33f8561296SVegard Nossum #else
34e8f6e3f8SJan Beulich # define XOR_CONSTANT_CONSTRAINT "re"
35e8f6e3f8SJan Beulich #endif
36e8f6e3f8SJan Beulich
/*
 * Asm-fragment building macros.  Each xor_sse_*() loop processes one
 * 256-byte "line" per iteration as sixteen 16-byte chunks; the chunk
 * index x below selects one of those chunks.
 *
 * OFFS(x)    - byte offset of chunk x within the current line
 * PF_OFFS(x) - same offset but one full line (256 bytes) ahead, used by
 *              the prefetch macros to stay ahead of the loads
 * PF0..PF4   - prefetchnta (non-temporal prefetch, minimal cache
 *              pollution) from p1..p5 respectively
 * LD/ST      - movaps load/store of xmm register y from/to p1
 *              (movaps requires the buffers to be 16-byte aligned)
 * XO1..XO4   - xorps chunk x of p2..p5 into xmm register y
 */
37e8f6e3f8SJan Beulich #define OFFS(x) "16*("#x")"
38e8f6e3f8SJan Beulich #define PF_OFFS(x) "256+16*("#x")"
39e8f6e3f8SJan Beulich #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
40e8f6e3f8SJan Beulich #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
41e8f6e3f8SJan Beulich #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
42e8f6e3f8SJan Beulich #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
43e8f6e3f8SJan Beulich #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
44e8f6e3f8SJan Beulich #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
45e8f6e3f8SJan Beulich #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
46e8f6e3f8SJan Beulich #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
47e8f6e3f8SJan Beulich #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
48e8f6e3f8SJan Beulich #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
49e8f6e3f8SJan Beulich #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
/* NOP discards the prefetch slot of a BLK64 group (used for the stores). */
50f317820cSJan Beulich #define NOP(x)
51f317820cSJan Beulich
/*
 * BLK64 - one 64-byte group: a single prefetch pf(i) followed by op
 * applied to four consecutive 16-byte chunks in xmm0..xmm3.  This is the
 * building block of the *_pf64 ("prefetch64-sse") variants, which issue
 * one prefetch per 64 bytes.
 */
52f317820cSJan Beulich #define BLK64(pf, op, i) \
53f317820cSJan Beulich pf(i) \
54f317820cSJan Beulich op(i, 0) \
55f317820cSJan Beulich op(i + 1, 1) \
56f317820cSJan Beulich op(i + 2, 2) \
57f317820cSJan Beulich op(i + 3, 3)
58e8f6e3f8SJan Beulich
/*
 * xor_sse_2 - p1 ^= p2 using SSE, 256 bytes per loop iteration.
 *
 * @bytes: total length; processed as bytes >> 8 full 256-byte lines,
 *         so any tail smaller than 256 bytes is not touched.
 * @p1:    destination (and first source); movaps requires 16-byte
 *         alignment of both buffers.
 * @p2:    second source.
 *
 * The loop loads four xmm registers from p1, xors the matching chunks
 * of p2 into them, and stores them back, repeating for all 16 chunks of
 * the line (BLOCK(0/4/8/12)).  prefetchnta keeps the next line of both
 * buffers inbound without polluting the caches.  XMM state is
 * clobbered, hence the kernel_fpu_begin()/kernel_fpu_end() bracket.
 */
59e8f6e3f8SJan Beulich static void
xor_sse_2(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2)60*297565aaSArd Biesheuvel xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
61*297565aaSArd Biesheuvel const unsigned long * __restrict p2)
62e8f6e3f8SJan Beulich {
63e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8;
64e8f6e3f8SJan Beulich
65e8f6e3f8SJan Beulich kernel_fpu_begin();
66e8f6e3f8SJan Beulich
67e8f6e3f8SJan Beulich asm volatile(
68e8f6e3f8SJan Beulich #undef BLOCK
69e8f6e3f8SJan Beulich #define BLOCK(i) \
70e8f6e3f8SJan Beulich LD(i, 0) \
71e8f6e3f8SJan Beulich LD(i + 1, 1) \
72e8f6e3f8SJan Beulich PF1(i) \
73e8f6e3f8SJan Beulich PF1(i + 2) \
74e8f6e3f8SJan Beulich LD(i + 2, 2) \
75e8f6e3f8SJan Beulich LD(i + 3, 3) \
76e8f6e3f8SJan Beulich PF0(i + 4) \
77e8f6e3f8SJan Beulich PF0(i + 6) \
78e8f6e3f8SJan Beulich XO1(i, 0) \
79e8f6e3f8SJan Beulich XO1(i + 1, 1) \
80e8f6e3f8SJan Beulich XO1(i + 2, 2) \
81e8f6e3f8SJan Beulich XO1(i + 3, 3) \
82e8f6e3f8SJan Beulich ST(i, 0) \
83e8f6e3f8SJan Beulich ST(i + 1, 1) \
84e8f6e3f8SJan Beulich ST(i + 2, 2) \
85e8f6e3f8SJan Beulich ST(i + 3, 3) \
86e8f6e3f8SJan Beulich
87e8f6e3f8SJan Beulich
88e8f6e3f8SJan Beulich PF0(0)
89e8f6e3f8SJan Beulich PF0(2)
90e8f6e3f8SJan Beulich
91e8f6e3f8SJan Beulich " .align 32 ;\n"
92e8f6e3f8SJan Beulich " 1: ;\n"
93e8f6e3f8SJan Beulich
94e8f6e3f8SJan Beulich BLOCK(0)
95e8f6e3f8SJan Beulich BLOCK(4)
96e8f6e3f8SJan Beulich BLOCK(8)
97e8f6e3f8SJan Beulich BLOCK(12)
98e8f6e3f8SJan Beulich
99e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n"
100e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n"
101e8f6e3f8SJan Beulich " dec %[cnt] ;\n"
102e8f6e3f8SJan Beulich " jnz 1b ;\n"
103e8f6e3f8SJan Beulich : [cnt] "+r" (lines),
104e8f6e3f8SJan Beulich [p1] "+r" (p1), [p2] "+r" (p2)
105e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
106e8f6e3f8SJan Beulich : "memory");
107e8f6e3f8SJan Beulich
108e8f6e3f8SJan Beulich kernel_fpu_end();
109e8f6e3f8SJan Beulich }
110e8f6e3f8SJan Beulich
/*
 * xor_sse_2_pf64 - p1 ^= p2, same contract as xor_sse_2() (256-byte
 * lines, 16-byte aligned buffers, XMM clobbered inside
 * kernel_fpu_begin()/end()), but with a different prefetch schedule:
 * each BLK64 group issues exactly one prefetchnta per 64 bytes, giving
 * the "prefetch64-sse" template its name.
 */
111e8f6e3f8SJan Beulich static void
xor_sse_2_pf64(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2)112*297565aaSArd Biesheuvel xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
113*297565aaSArd Biesheuvel const unsigned long * __restrict p2)
114f317820cSJan Beulich {
115f317820cSJan Beulich unsigned long lines = bytes >> 8;
116f317820cSJan Beulich
117f317820cSJan Beulich kernel_fpu_begin();
118f317820cSJan Beulich
119f317820cSJan Beulich asm volatile(
120f317820cSJan Beulich #undef BLOCK
121f317820cSJan Beulich #define BLOCK(i) \
122f317820cSJan Beulich BLK64(PF0, LD, i) \
123f317820cSJan Beulich BLK64(PF1, XO1, i) \
124f317820cSJan Beulich BLK64(NOP, ST, i) \
125f317820cSJan Beulich
126f317820cSJan Beulich " .align 32 ;\n"
127f317820cSJan Beulich " 1: ;\n"
128f317820cSJan Beulich
129f317820cSJan Beulich BLOCK(0)
130f317820cSJan Beulich BLOCK(4)
131f317820cSJan Beulich BLOCK(8)
132f317820cSJan Beulich BLOCK(12)
133f317820cSJan Beulich
134f317820cSJan Beulich " add %[inc], %[p1] ;\n"
135f317820cSJan Beulich " add %[inc], %[p2] ;\n"
136f317820cSJan Beulich " dec %[cnt] ;\n"
137f317820cSJan Beulich " jnz 1b ;\n"
138f317820cSJan Beulich : [cnt] "+r" (lines),
139f317820cSJan Beulich [p1] "+r" (p1), [p2] "+r" (p2)
140f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
141f317820cSJan Beulich : "memory");
142f317820cSJan Beulich
143f317820cSJan Beulich kernel_fpu_end();
144f317820cSJan Beulich }
145f317820cSJan Beulich
146f317820cSJan Beulich static void
xor_sse_3(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3)147*297565aaSArd Biesheuvel xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
148*297565aaSArd Biesheuvel const unsigned long * __restrict p2,
149*297565aaSArd Biesheuvel const unsigned long * __restrict p3)
150e8f6e3f8SJan Beulich {
151e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8;
152e8f6e3f8SJan Beulich
153e8f6e3f8SJan Beulich kernel_fpu_begin();
154e8f6e3f8SJan Beulich
155e8f6e3f8SJan Beulich asm volatile(
156e8f6e3f8SJan Beulich #undef BLOCK
157e8f6e3f8SJan Beulich #define BLOCK(i) \
158e8f6e3f8SJan Beulich PF1(i) \
159e8f6e3f8SJan Beulich PF1(i + 2) \
160e8f6e3f8SJan Beulich LD(i, 0) \
161e8f6e3f8SJan Beulich LD(i + 1, 1) \
162e8f6e3f8SJan Beulich LD(i + 2, 2) \
163e8f6e3f8SJan Beulich LD(i + 3, 3) \
164e8f6e3f8SJan Beulich PF2(i) \
165e8f6e3f8SJan Beulich PF2(i + 2) \
166e8f6e3f8SJan Beulich PF0(i + 4) \
167e8f6e3f8SJan Beulich PF0(i + 6) \
168e8f6e3f8SJan Beulich XO1(i, 0) \
169e8f6e3f8SJan Beulich XO1(i + 1, 1) \
170e8f6e3f8SJan Beulich XO1(i + 2, 2) \
171e8f6e3f8SJan Beulich XO1(i + 3, 3) \
172e8f6e3f8SJan Beulich XO2(i, 0) \
173e8f6e3f8SJan Beulich XO2(i + 1, 1) \
174e8f6e3f8SJan Beulich XO2(i + 2, 2) \
175e8f6e3f8SJan Beulich XO2(i + 3, 3) \
176e8f6e3f8SJan Beulich ST(i, 0) \
177e8f6e3f8SJan Beulich ST(i + 1, 1) \
178e8f6e3f8SJan Beulich ST(i + 2, 2) \
179e8f6e3f8SJan Beulich ST(i + 3, 3) \
180e8f6e3f8SJan Beulich
181e8f6e3f8SJan Beulich
182e8f6e3f8SJan Beulich PF0(0)
183e8f6e3f8SJan Beulich PF0(2)
184e8f6e3f8SJan Beulich
185e8f6e3f8SJan Beulich " .align 32 ;\n"
186e8f6e3f8SJan Beulich " 1: ;\n"
187e8f6e3f8SJan Beulich
188e8f6e3f8SJan Beulich BLOCK(0)
189e8f6e3f8SJan Beulich BLOCK(4)
190e8f6e3f8SJan Beulich BLOCK(8)
191e8f6e3f8SJan Beulich BLOCK(12)
192e8f6e3f8SJan Beulich
193e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n"
194e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n"
195e8f6e3f8SJan Beulich " add %[inc], %[p3] ;\n"
196e8f6e3f8SJan Beulich " dec %[cnt] ;\n"
197e8f6e3f8SJan Beulich " jnz 1b ;\n"
198e8f6e3f8SJan Beulich : [cnt] "+r" (lines),
199e8f6e3f8SJan Beulich [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
200e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
201e8f6e3f8SJan Beulich : "memory");
202e8f6e3f8SJan Beulich
203e8f6e3f8SJan Beulich kernel_fpu_end();
204e8f6e3f8SJan Beulich }
205e8f6e3f8SJan Beulich
/*
 * xor_sse_3_pf64 - p1 ^= p2 ^ p3, same contract as xor_sse_3(), but
 * built from BLK64 groups: one prefetchnta per 64-byte group per stream
 * (the "prefetch64-sse" schedule) instead of xor_sse_3()'s interleaved
 * layout.
 */
206e8f6e3f8SJan Beulich static void
xor_sse_3_pf64(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3)207*297565aaSArd Biesheuvel xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
208*297565aaSArd Biesheuvel const unsigned long * __restrict p2,
209*297565aaSArd Biesheuvel const unsigned long * __restrict p3)
210f317820cSJan Beulich {
211f317820cSJan Beulich unsigned long lines = bytes >> 8;
212f317820cSJan Beulich
213f317820cSJan Beulich kernel_fpu_begin();
214f317820cSJan Beulich
215f317820cSJan Beulich asm volatile(
216f317820cSJan Beulich #undef BLOCK
217f317820cSJan Beulich #define BLOCK(i) \
218f317820cSJan Beulich BLK64(PF0, LD, i) \
219f317820cSJan Beulich BLK64(PF1, XO1, i) \
220f317820cSJan Beulich BLK64(PF2, XO2, i) \
221f317820cSJan Beulich BLK64(NOP, ST, i) \
222f317820cSJan Beulich
223f317820cSJan Beulich " .align 32 ;\n"
224f317820cSJan Beulich " 1: ;\n"
225f317820cSJan Beulich
226f317820cSJan Beulich BLOCK(0)
227f317820cSJan Beulich BLOCK(4)
228f317820cSJan Beulich BLOCK(8)
229f317820cSJan Beulich BLOCK(12)
230f317820cSJan Beulich
231f317820cSJan Beulich " add %[inc], %[p1] ;\n"
232f317820cSJan Beulich " add %[inc], %[p2] ;\n"
233f317820cSJan Beulich " add %[inc], %[p3] ;\n"
234f317820cSJan Beulich " dec %[cnt] ;\n"
235f317820cSJan Beulich " jnz 1b ;\n"
236f317820cSJan Beulich : [cnt] "+r" (lines),
237f317820cSJan Beulich [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
238f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
239f317820cSJan Beulich : "memory");
240f317820cSJan Beulich
241f317820cSJan Beulich kernel_fpu_end();
242f317820cSJan Beulich }
243f317820cSJan Beulich
/*
 * xor_sse_4 - p1 ^= p2 ^ p3 ^ p4 using SSE, 256 bytes per loop pass.
 *
 * Same contract as xor_sse_2()/xor_sse_3(): @bytes is processed as
 * bytes >> 8 full 256-byte lines (no sub-line tail handling), buffers
 * must be 16-byte aligned for movaps, and XMM state is clobbered inside
 * kernel_fpu_begin()/kernel_fpu_end().  Each BLOCK accumulates four
 * chunks of p1 ^ p2 ^ p3 ^ p4 in xmm0..xmm3 before storing, with
 * prefetchnta interleaved for all four streams.
 */
244f317820cSJan Beulich static void
xor_sse_4(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4)245*297565aaSArd Biesheuvel xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
246*297565aaSArd Biesheuvel const unsigned long * __restrict p2,
247*297565aaSArd Biesheuvel const unsigned long * __restrict p3,
248*297565aaSArd Biesheuvel const unsigned long * __restrict p4)
249e8f6e3f8SJan Beulich {
250e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8;
251e8f6e3f8SJan Beulich
252e8f6e3f8SJan Beulich kernel_fpu_begin();
253e8f6e3f8SJan Beulich
254e8f6e3f8SJan Beulich asm volatile(
255e8f6e3f8SJan Beulich #undef BLOCK
256e8f6e3f8SJan Beulich #define BLOCK(i) \
257e8f6e3f8SJan Beulich PF1(i) \
258e8f6e3f8SJan Beulich PF1(i + 2) \
259e8f6e3f8SJan Beulich LD(i, 0) \
260e8f6e3f8SJan Beulich LD(i + 1, 1) \
261e8f6e3f8SJan Beulich LD(i + 2, 2) \
262e8f6e3f8SJan Beulich LD(i + 3, 3) \
263e8f6e3f8SJan Beulich PF2(i) \
264e8f6e3f8SJan Beulich PF2(i + 2) \
265e8f6e3f8SJan Beulich XO1(i, 0) \
266e8f6e3f8SJan Beulich XO1(i + 1, 1) \
267e8f6e3f8SJan Beulich XO1(i + 2, 2) \
268e8f6e3f8SJan Beulich XO1(i + 3, 3) \
269e8f6e3f8SJan Beulich PF3(i) \
270e8f6e3f8SJan Beulich PF3(i + 2) \
271e8f6e3f8SJan Beulich PF0(i + 4) \
272e8f6e3f8SJan Beulich PF0(i + 6) \
273e8f6e3f8SJan Beulich XO2(i, 0) \
274e8f6e3f8SJan Beulich XO2(i + 1, 1) \
275e8f6e3f8SJan Beulich XO2(i + 2, 2) \
276e8f6e3f8SJan Beulich XO2(i + 3, 3) \
277e8f6e3f8SJan Beulich XO3(i, 0) \
278e8f6e3f8SJan Beulich XO3(i + 1, 1) \
279e8f6e3f8SJan Beulich XO3(i + 2, 2) \
280e8f6e3f8SJan Beulich XO3(i + 3, 3) \
281e8f6e3f8SJan Beulich ST(i, 0) \
282e8f6e3f8SJan Beulich ST(i + 1, 1) \
283e8f6e3f8SJan Beulich ST(i + 2, 2) \
284e8f6e3f8SJan Beulich ST(i + 3, 3) \
285e8f6e3f8SJan Beulich
286e8f6e3f8SJan Beulich
287e8f6e3f8SJan Beulich PF0(0)
288e8f6e3f8SJan Beulich PF0(2)
289e8f6e3f8SJan Beulich
290e8f6e3f8SJan Beulich " .align 32 ;\n"
291e8f6e3f8SJan Beulich " 1: ;\n"
292e8f6e3f8SJan Beulich
293e8f6e3f8SJan Beulich BLOCK(0)
294e8f6e3f8SJan Beulich BLOCK(4)
295e8f6e3f8SJan Beulich BLOCK(8)
296e8f6e3f8SJan Beulich BLOCK(12)
297e8f6e3f8SJan Beulich
298e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n"
299e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n"
300e8f6e3f8SJan Beulich " add %[inc], %[p3] ;\n"
301e8f6e3f8SJan Beulich " add %[inc], %[p4] ;\n"
302e8f6e3f8SJan Beulich " dec %[cnt] ;\n"
303e8f6e3f8SJan Beulich " jnz 1b ;\n"
304e8f6e3f8SJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1),
305e8f6e3f8SJan Beulich [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
306e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
307e8f6e3f8SJan Beulich : "memory");
308e8f6e3f8SJan Beulich
309e8f6e3f8SJan Beulich kernel_fpu_end();
310e8f6e3f8SJan Beulich }
311e8f6e3f8SJan Beulich
/*
 * xor_sse_4_pf64 - p1 ^= p2 ^ p3 ^ p4, same contract as xor_sse_4(),
 * using the BLK64 ("prefetch64-sse") schedule: one prefetchnta per
 * 64-byte group per stream, with the store group issuing no prefetch
 * (NOP).
 */
312e8f6e3f8SJan Beulich static void
xor_sse_4_pf64(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4)313*297565aaSArd Biesheuvel xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
314*297565aaSArd Biesheuvel const unsigned long * __restrict p2,
315*297565aaSArd Biesheuvel const unsigned long * __restrict p3,
316*297565aaSArd Biesheuvel const unsigned long * __restrict p4)
317f317820cSJan Beulich {
318f317820cSJan Beulich unsigned long lines = bytes >> 8;
319f317820cSJan Beulich
320f317820cSJan Beulich kernel_fpu_begin();
321f317820cSJan Beulich
322f317820cSJan Beulich asm volatile(
323f317820cSJan Beulich #undef BLOCK
324f317820cSJan Beulich #define BLOCK(i) \
325f317820cSJan Beulich BLK64(PF0, LD, i) \
326f317820cSJan Beulich BLK64(PF1, XO1, i) \
327f317820cSJan Beulich BLK64(PF2, XO2, i) \
328f317820cSJan Beulich BLK64(PF3, XO3, i) \
329f317820cSJan Beulich BLK64(NOP, ST, i) \
330f317820cSJan Beulich
331f317820cSJan Beulich " .align 32 ;\n"
332f317820cSJan Beulich " 1: ;\n"
333f317820cSJan Beulich
334f317820cSJan Beulich BLOCK(0)
335f317820cSJan Beulich BLOCK(4)
336f317820cSJan Beulich BLOCK(8)
337f317820cSJan Beulich BLOCK(12)
338f317820cSJan Beulich
339f317820cSJan Beulich " add %[inc], %[p1] ;\n"
340f317820cSJan Beulich " add %[inc], %[p2] ;\n"
341f317820cSJan Beulich " add %[inc], %[p3] ;\n"
342f317820cSJan Beulich " add %[inc], %[p4] ;\n"
343f317820cSJan Beulich " dec %[cnt] ;\n"
344f317820cSJan Beulich " jnz 1b ;\n"
345f317820cSJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1),
346f317820cSJan Beulich [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
347f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
348f317820cSJan Beulich : "memory");
349f317820cSJan Beulich
350f317820cSJan Beulich kernel_fpu_end();
351f317820cSJan Beulich }
352f317820cSJan Beulich
/*
 * xor_sse_5 - p1 ^= p2 ^ p3 ^ p4 ^ p5 using SSE, 256 bytes per pass.
 *
 * Same contract as the smaller variants: @bytes is processed as
 * bytes >> 8 full 256-byte lines (no tail handling), buffers must be
 * 16-byte aligned for movaps, and XMM state is clobbered inside
 * kernel_fpu_begin()/kernel_fpu_end().  Each BLOCK folds the matching
 * chunks of all five buffers into xmm0..xmm3 before storing back to p1,
 * with prefetchnta interleaved for every stream.
 */
353f317820cSJan Beulich static void
xor_sse_5(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4,const unsigned long * __restrict p5)354*297565aaSArd Biesheuvel xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
355*297565aaSArd Biesheuvel const unsigned long * __restrict p2,
356*297565aaSArd Biesheuvel const unsigned long * __restrict p3,
357*297565aaSArd Biesheuvel const unsigned long * __restrict p4,
358*297565aaSArd Biesheuvel const unsigned long * __restrict p5)
359e8f6e3f8SJan Beulich {
360e8f6e3f8SJan Beulich unsigned long lines = bytes >> 8;
361e8f6e3f8SJan Beulich
362e8f6e3f8SJan Beulich kernel_fpu_begin();
363e8f6e3f8SJan Beulich
364e8f6e3f8SJan Beulich asm volatile(
365e8f6e3f8SJan Beulich #undef BLOCK
366e8f6e3f8SJan Beulich #define BLOCK(i) \
367e8f6e3f8SJan Beulich PF1(i) \
368e8f6e3f8SJan Beulich PF1(i + 2) \
369e8f6e3f8SJan Beulich LD(i, 0) \
370e8f6e3f8SJan Beulich LD(i + 1, 1) \
371e8f6e3f8SJan Beulich LD(i + 2, 2) \
372e8f6e3f8SJan Beulich LD(i + 3, 3) \
373e8f6e3f8SJan Beulich PF2(i) \
374e8f6e3f8SJan Beulich PF2(i + 2) \
375e8f6e3f8SJan Beulich XO1(i, 0) \
376e8f6e3f8SJan Beulich XO1(i + 1, 1) \
377e8f6e3f8SJan Beulich XO1(i + 2, 2) \
378e8f6e3f8SJan Beulich XO1(i + 3, 3) \
379e8f6e3f8SJan Beulich PF3(i) \
380e8f6e3f8SJan Beulich PF3(i + 2) \
381e8f6e3f8SJan Beulich XO2(i, 0) \
382e8f6e3f8SJan Beulich XO2(i + 1, 1) \
383e8f6e3f8SJan Beulich XO2(i + 2, 2) \
384e8f6e3f8SJan Beulich XO2(i + 3, 3) \
385e8f6e3f8SJan Beulich PF4(i) \
386e8f6e3f8SJan Beulich PF4(i + 2) \
387e8f6e3f8SJan Beulich PF0(i + 4) \
388e8f6e3f8SJan Beulich PF0(i + 6) \
389e8f6e3f8SJan Beulich XO3(i, 0) \
390e8f6e3f8SJan Beulich XO3(i + 1, 1) \
391e8f6e3f8SJan Beulich XO3(i + 2, 2) \
392e8f6e3f8SJan Beulich XO3(i + 3, 3) \
393e8f6e3f8SJan Beulich XO4(i, 0) \
394e8f6e3f8SJan Beulich XO4(i + 1, 1) \
395e8f6e3f8SJan Beulich XO4(i + 2, 2) \
396e8f6e3f8SJan Beulich XO4(i + 3, 3) \
397e8f6e3f8SJan Beulich ST(i, 0) \
398e8f6e3f8SJan Beulich ST(i + 1, 1) \
399e8f6e3f8SJan Beulich ST(i + 2, 2) \
400e8f6e3f8SJan Beulich ST(i + 3, 3) \
401e8f6e3f8SJan Beulich
402e8f6e3f8SJan Beulich
403e8f6e3f8SJan Beulich PF0(0)
404e8f6e3f8SJan Beulich PF0(2)
405e8f6e3f8SJan Beulich
406e8f6e3f8SJan Beulich " .align 32 ;\n"
407e8f6e3f8SJan Beulich " 1: ;\n"
408e8f6e3f8SJan Beulich
409e8f6e3f8SJan Beulich BLOCK(0)
410e8f6e3f8SJan Beulich BLOCK(4)
411e8f6e3f8SJan Beulich BLOCK(8)
412e8f6e3f8SJan Beulich BLOCK(12)
413e8f6e3f8SJan Beulich
414e8f6e3f8SJan Beulich " add %[inc], %[p1] ;\n"
415e8f6e3f8SJan Beulich " add %[inc], %[p2] ;\n"
416e8f6e3f8SJan Beulich " add %[inc], %[p3] ;\n"
417e8f6e3f8SJan Beulich " add %[inc], %[p4] ;\n"
418e8f6e3f8SJan Beulich " add %[inc], %[p5] ;\n"
419e8f6e3f8SJan Beulich " dec %[cnt] ;\n"
420e8f6e3f8SJan Beulich " jnz 1b ;\n"
421e8f6e3f8SJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
422e8f6e3f8SJan Beulich [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
423e8f6e3f8SJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
424e8f6e3f8SJan Beulich : "memory");
425e8f6e3f8SJan Beulich
426e8f6e3f8SJan Beulich kernel_fpu_end();
427e8f6e3f8SJan Beulich }
428e8f6e3f8SJan Beulich
/*
 * xor_sse_5_pf64 - p1 ^= p2 ^ p3 ^ p4 ^ p5, same contract as
 * xor_sse_5(), using the BLK64 ("prefetch64-sse") schedule: one
 * prefetchnta per 64-byte group per stream, with no prefetch on the
 * store group (NOP).
 */
429f317820cSJan Beulich static void
xor_sse_5_pf64(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4,const unsigned long * __restrict p5)430*297565aaSArd Biesheuvel xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
431*297565aaSArd Biesheuvel const unsigned long * __restrict p2,
432*297565aaSArd Biesheuvel const unsigned long * __restrict p3,
433*297565aaSArd Biesheuvel const unsigned long * __restrict p4,
434*297565aaSArd Biesheuvel const unsigned long * __restrict p5)
435f317820cSJan Beulich {
436f317820cSJan Beulich unsigned long lines = bytes >> 8;
437f317820cSJan Beulich
438f317820cSJan Beulich kernel_fpu_begin();
439f317820cSJan Beulich
440f317820cSJan Beulich asm volatile(
441f317820cSJan Beulich #undef BLOCK
442f317820cSJan Beulich #define BLOCK(i) \
443f317820cSJan Beulich BLK64(PF0, LD, i) \
444f317820cSJan Beulich BLK64(PF1, XO1, i) \
445f317820cSJan Beulich BLK64(PF2, XO2, i) \
446f317820cSJan Beulich BLK64(PF3, XO3, i) \
447f317820cSJan Beulich BLK64(PF4, XO4, i) \
448f317820cSJan Beulich BLK64(NOP, ST, i) \
449f317820cSJan Beulich
450f317820cSJan Beulich " .align 32 ;\n"
451f317820cSJan Beulich " 1: ;\n"
452f317820cSJan Beulich
453f317820cSJan Beulich BLOCK(0)
454f317820cSJan Beulich BLOCK(4)
455f317820cSJan Beulich BLOCK(8)
456f317820cSJan Beulich BLOCK(12)
457f317820cSJan Beulich
458f317820cSJan Beulich " add %[inc], %[p1] ;\n"
459f317820cSJan Beulich " add %[inc], %[p2] ;\n"
460f317820cSJan Beulich " add %[inc], %[p3] ;\n"
461f317820cSJan Beulich " add %[inc], %[p4] ;\n"
462f317820cSJan Beulich " add %[inc], %[p5] ;\n"
463f317820cSJan Beulich " dec %[cnt] ;\n"
464f317820cSJan Beulich " jnz 1b ;\n"
465f317820cSJan Beulich : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
466f317820cSJan Beulich [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
467f317820cSJan Beulich : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
468f317820cSJan Beulich : "memory");
469f317820cSJan Beulich
470f317820cSJan Beulich kernel_fpu_end();
471f317820cSJan Beulich }
472f317820cSJan Beulich
/*
 * Template wiring the one-prefetch-per-64-bytes variants into the xor
 * block benchmarking/selection machinery under the name
 * "prefetch64-sse".
 */
473f317820cSJan Beulich static struct xor_block_template xor_block_sse_pf64 = {
474f317820cSJan Beulich 	.name = "prefetch64-sse",
475f317820cSJan Beulich 	.do_2 = xor_sse_2_pf64,
476f317820cSJan Beulich 	.do_3 = xor_sse_3_pf64,
477f317820cSJan Beulich 	.do_4 = xor_sse_4_pf64,
478f317820cSJan Beulich 	.do_5 = xor_sse_5_pf64,
479f317820cSJan Beulich };
480f317820cSJan Beulich
481e8f6e3f8SJan Beulich #undef LD
482e8f6e3f8SJan Beulich #undef XO1
483e8f6e3f8SJan Beulich #undef XO2
484e8f6e3f8SJan Beulich #undef XO3
485e8f6e3f8SJan Beulich #undef XO4
486e8f6e3f8SJan Beulich #undef ST
487f317820cSJan Beulich #undef NOP
488f317820cSJan Beulich #undef BLK64
489e8f6e3f8SJan Beulich #undef BLOCK
490e8f6e3f8SJan Beulich
491e8f6e3f8SJan Beulich #undef XOR_CONSTANT_CONSTRAINT
492e8f6e3f8SJan Beulich
493bb898558SAl Viro #ifdef CONFIG_X86_32
494a1ce3928SDavid Howells # include <asm/xor_32.h>
495bb898558SAl Viro #else
496a1ce3928SDavid Howells # include <asm/xor_64.h>
497bb898558SAl Viro #endif
498e8f6e3f8SJan Beulich
/*
 * Final template selection hook.  AVX_SELECT is presumably provided by
 * the asm/xor_32.h / asm/xor_64.h include above and prefers an AVX
 * implementation over FASTEST when available -- NOTE(review): confirm
 * against those headers, which are outside this file.
 */
499f317820cSJan Beulich #define XOR_SELECT_TEMPLATE(FASTEST) \
500f317820cSJan Beulich 	AVX_SELECT(FASTEST)
501f317820cSJan Beulich
502e8f6e3f8SJan Beulich #endif /* _ASM_X86_XOR_H */
503