/* xref: /linux/lib/raid/xor/x86/xor-sse.c (revision 440d6635b20037bc9ad46b20817d7b61cef0fc1b) */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Optimized XOR parity functions for SSE.
 *
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 *
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 *
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 */
15 #include <asm/fpu/api.h>
16 #include "xor_impl.h"
17 #include "xor_arch.h"
18 
19 #ifdef CONFIG_X86_32
20 /* reduce register pressure */
21 # define XOR_CONSTANT_CONSTRAINT "i"
22 #else
23 # define XOR_CONSTANT_CONSTRAINT "re"
24 #endif
25 
26 #define OFFS(x)		"16*("#x")"
27 #define PF_OFFS(x)	"256+16*("#x")"
28 #define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
29 #define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
30 #define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
31 #define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
32 #define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
33 #define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
34 #define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
35 #define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
36 #define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
37 #define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
38 #define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
39 #define NOP(x)
40 
41 #define BLK64(pf, op, i)				\
42 		pf(i)					\
43 		op(i, 0)				\
44 			op(i + 1, 1)			\
45 				op(i + 2, 2)		\
46 					op(i + 3, 3)
47 
48 static void
49 xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
50 	  const unsigned long * __restrict p2)
51 {
52 	unsigned long lines = bytes >> 8;
53 
54 	asm volatile(
55 #undef BLOCK
56 #define BLOCK(i)					\
57 		LD(i, 0)				\
58 			LD(i + 1, 1)			\
59 		PF1(i)					\
60 				PF1(i + 2)		\
61 				LD(i + 2, 2)		\
62 					LD(i + 3, 3)	\
63 		PF0(i + 4)				\
64 				PF0(i + 6)		\
65 		XO1(i, 0)				\
66 			XO1(i + 1, 1)			\
67 				XO1(i + 2, 2)		\
68 					XO1(i + 3, 3)	\
69 		ST(i, 0)				\
70 			ST(i + 1, 1)			\
71 				ST(i + 2, 2)		\
72 					ST(i + 3, 3)	\
73 
74 
75 		PF0(0)
76 				PF0(2)
77 
78 	" .align 32			;\n"
79 	" 1:                            ;\n"
80 
81 		BLOCK(0)
82 		BLOCK(4)
83 		BLOCK(8)
84 		BLOCK(12)
85 
86 	"       add %[inc], %[p1]       ;\n"
87 	"       add %[inc], %[p2]       ;\n"
88 	"       dec %[cnt]              ;\n"
89 	"       jnz 1b                  ;\n"
90 	: [cnt] "+r" (lines),
91 	  [p1] "+r" (p1), [p2] "+r" (p2)
92 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
93 	: "memory");
94 }
95 
96 static void
97 xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
98 	       const unsigned long * __restrict p2)
99 {
100 	unsigned long lines = bytes >> 8;
101 
102 	asm volatile(
103 #undef BLOCK
104 #define BLOCK(i)			\
105 		BLK64(PF0, LD, i)	\
106 		BLK64(PF1, XO1, i)	\
107 		BLK64(NOP, ST, i)	\
108 
109 	" .align 32			;\n"
110 	" 1:                            ;\n"
111 
112 		BLOCK(0)
113 		BLOCK(4)
114 		BLOCK(8)
115 		BLOCK(12)
116 
117 	"       add %[inc], %[p1]       ;\n"
118 	"       add %[inc], %[p2]       ;\n"
119 	"       dec %[cnt]              ;\n"
120 	"       jnz 1b                  ;\n"
121 	: [cnt] "+r" (lines),
122 	  [p1] "+r" (p1), [p2] "+r" (p2)
123 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
124 	: "memory");
125 }
126 
127 static void
128 xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
129 	  const unsigned long * __restrict p2,
130 	  const unsigned long * __restrict p3)
131 {
132 	unsigned long lines = bytes >> 8;
133 
134 	asm volatile(
135 #undef BLOCK
136 #define BLOCK(i) \
137 		PF1(i)					\
138 				PF1(i + 2)		\
139 		LD(i, 0)				\
140 			LD(i + 1, 1)			\
141 				LD(i + 2, 2)		\
142 					LD(i + 3, 3)	\
143 		PF2(i)					\
144 				PF2(i + 2)		\
145 		PF0(i + 4)				\
146 				PF0(i + 6)		\
147 		XO1(i, 0)				\
148 			XO1(i + 1, 1)			\
149 				XO1(i + 2, 2)		\
150 					XO1(i + 3, 3)	\
151 		XO2(i, 0)				\
152 			XO2(i + 1, 1)			\
153 				XO2(i + 2, 2)		\
154 					XO2(i + 3, 3)	\
155 		ST(i, 0)				\
156 			ST(i + 1, 1)			\
157 				ST(i + 2, 2)		\
158 					ST(i + 3, 3)	\
159 
160 
161 		PF0(0)
162 				PF0(2)
163 
164 	" .align 32			;\n"
165 	" 1:                            ;\n"
166 
167 		BLOCK(0)
168 		BLOCK(4)
169 		BLOCK(8)
170 		BLOCK(12)
171 
172 	"       add %[inc], %[p1]       ;\n"
173 	"       add %[inc], %[p2]       ;\n"
174 	"       add %[inc], %[p3]       ;\n"
175 	"       dec %[cnt]              ;\n"
176 	"       jnz 1b                  ;\n"
177 	: [cnt] "+r" (lines),
178 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
179 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
180 	: "memory");
181 }
182 
183 static void
184 xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
185 	       const unsigned long * __restrict p2,
186 	       const unsigned long * __restrict p3)
187 {
188 	unsigned long lines = bytes >> 8;
189 
190 	asm volatile(
191 #undef BLOCK
192 #define BLOCK(i)			\
193 		BLK64(PF0, LD, i)	\
194 		BLK64(PF1, XO1, i)	\
195 		BLK64(PF2, XO2, i)	\
196 		BLK64(NOP, ST, i)	\
197 
198 	" .align 32			;\n"
199 	" 1:                            ;\n"
200 
201 		BLOCK(0)
202 		BLOCK(4)
203 		BLOCK(8)
204 		BLOCK(12)
205 
206 	"       add %[inc], %[p1]       ;\n"
207 	"       add %[inc], %[p2]       ;\n"
208 	"       add %[inc], %[p3]       ;\n"
209 	"       dec %[cnt]              ;\n"
210 	"       jnz 1b                  ;\n"
211 	: [cnt] "+r" (lines),
212 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
213 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
214 	: "memory");
215 }
216 
217 static void
218 xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
219 	  const unsigned long * __restrict p2,
220 	  const unsigned long * __restrict p3,
221 	  const unsigned long * __restrict p4)
222 {
223 	unsigned long lines = bytes >> 8;
224 
225 	asm volatile(
226 #undef BLOCK
227 #define BLOCK(i) \
228 		PF1(i)					\
229 				PF1(i + 2)		\
230 		LD(i, 0)				\
231 			LD(i + 1, 1)			\
232 				LD(i + 2, 2)		\
233 					LD(i + 3, 3)	\
234 		PF2(i)					\
235 				PF2(i + 2)		\
236 		XO1(i, 0)				\
237 			XO1(i + 1, 1)			\
238 				XO1(i + 2, 2)		\
239 					XO1(i + 3, 3)	\
240 		PF3(i)					\
241 				PF3(i + 2)		\
242 		PF0(i + 4)				\
243 				PF0(i + 6)		\
244 		XO2(i, 0)				\
245 			XO2(i + 1, 1)			\
246 				XO2(i + 2, 2)		\
247 					XO2(i + 3, 3)	\
248 		XO3(i, 0)				\
249 			XO3(i + 1, 1)			\
250 				XO3(i + 2, 2)		\
251 					XO3(i + 3, 3)	\
252 		ST(i, 0)				\
253 			ST(i + 1, 1)			\
254 				ST(i + 2, 2)		\
255 					ST(i + 3, 3)	\
256 
257 
258 		PF0(0)
259 				PF0(2)
260 
261 	" .align 32			;\n"
262 	" 1:                            ;\n"
263 
264 		BLOCK(0)
265 		BLOCK(4)
266 		BLOCK(8)
267 		BLOCK(12)
268 
269 	"       add %[inc], %[p1]       ;\n"
270 	"       add %[inc], %[p2]       ;\n"
271 	"       add %[inc], %[p3]       ;\n"
272 	"       add %[inc], %[p4]       ;\n"
273 	"       dec %[cnt]              ;\n"
274 	"       jnz 1b                  ;\n"
275 	: [cnt] "+r" (lines), [p1] "+r" (p1),
276 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
277 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
278 	: "memory");
279 }
280 
281 static void
282 xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
283 	       const unsigned long * __restrict p2,
284 	       const unsigned long * __restrict p3,
285 	       const unsigned long * __restrict p4)
286 {
287 	unsigned long lines = bytes >> 8;
288 
289 	asm volatile(
290 #undef BLOCK
291 #define BLOCK(i)			\
292 		BLK64(PF0, LD, i)	\
293 		BLK64(PF1, XO1, i)	\
294 		BLK64(PF2, XO2, i)	\
295 		BLK64(PF3, XO3, i)	\
296 		BLK64(NOP, ST, i)	\
297 
298 	" .align 32			;\n"
299 	" 1:                            ;\n"
300 
301 		BLOCK(0)
302 		BLOCK(4)
303 		BLOCK(8)
304 		BLOCK(12)
305 
306 	"       add %[inc], %[p1]       ;\n"
307 	"       add %[inc], %[p2]       ;\n"
308 	"       add %[inc], %[p3]       ;\n"
309 	"       add %[inc], %[p4]       ;\n"
310 	"       dec %[cnt]              ;\n"
311 	"       jnz 1b                  ;\n"
312 	: [cnt] "+r" (lines), [p1] "+r" (p1),
313 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
314 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
315 	: "memory");
316 }
317 
318 static void
319 xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
320 	  const unsigned long * __restrict p2,
321 	  const unsigned long * __restrict p3,
322 	  const unsigned long * __restrict p4,
323 	  const unsigned long * __restrict p5)
324 {
325 	unsigned long lines = bytes >> 8;
326 
327 	asm volatile(
328 #undef BLOCK
329 #define BLOCK(i) \
330 		PF1(i)					\
331 				PF1(i + 2)		\
332 		LD(i, 0)				\
333 			LD(i + 1, 1)			\
334 				LD(i + 2, 2)		\
335 					LD(i + 3, 3)	\
336 		PF2(i)					\
337 				PF2(i + 2)		\
338 		XO1(i, 0)				\
339 			XO1(i + 1, 1)			\
340 				XO1(i + 2, 2)		\
341 					XO1(i + 3, 3)	\
342 		PF3(i)					\
343 				PF3(i + 2)		\
344 		XO2(i, 0)				\
345 			XO2(i + 1, 1)			\
346 				XO2(i + 2, 2)		\
347 					XO2(i + 3, 3)	\
348 		PF4(i)					\
349 				PF4(i + 2)		\
350 		PF0(i + 4)				\
351 				PF0(i + 6)		\
352 		XO3(i, 0)				\
353 			XO3(i + 1, 1)			\
354 				XO3(i + 2, 2)		\
355 					XO3(i + 3, 3)	\
356 		XO4(i, 0)				\
357 			XO4(i + 1, 1)			\
358 				XO4(i + 2, 2)		\
359 					XO4(i + 3, 3)	\
360 		ST(i, 0)				\
361 			ST(i + 1, 1)			\
362 				ST(i + 2, 2)		\
363 					ST(i + 3, 3)	\
364 
365 
366 		PF0(0)
367 				PF0(2)
368 
369 	" .align 32			;\n"
370 	" 1:                            ;\n"
371 
372 		BLOCK(0)
373 		BLOCK(4)
374 		BLOCK(8)
375 		BLOCK(12)
376 
377 	"       add %[inc], %[p1]       ;\n"
378 	"       add %[inc], %[p2]       ;\n"
379 	"       add %[inc], %[p3]       ;\n"
380 	"       add %[inc], %[p4]       ;\n"
381 	"       add %[inc], %[p5]       ;\n"
382 	"       dec %[cnt]              ;\n"
383 	"       jnz 1b                  ;\n"
384 	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
385 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
386 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
387 	: "memory");
388 }
389 
390 static void
391 xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
392 	       const unsigned long * __restrict p2,
393 	       const unsigned long * __restrict p3,
394 	       const unsigned long * __restrict p4,
395 	       const unsigned long * __restrict p5)
396 {
397 	unsigned long lines = bytes >> 8;
398 
399 	asm volatile(
400 #undef BLOCK
401 #define BLOCK(i)			\
402 		BLK64(PF0, LD, i)	\
403 		BLK64(PF1, XO1, i)	\
404 		BLK64(PF2, XO2, i)	\
405 		BLK64(PF3, XO3, i)	\
406 		BLK64(PF4, XO4, i)	\
407 		BLK64(NOP, ST, i)	\
408 
409 	" .align 32			;\n"
410 	" 1:                            ;\n"
411 
412 		BLOCK(0)
413 		BLOCK(4)
414 		BLOCK(8)
415 		BLOCK(12)
416 
417 	"       add %[inc], %[p1]       ;\n"
418 	"       add %[inc], %[p2]       ;\n"
419 	"       add %[inc], %[p3]       ;\n"
420 	"       add %[inc], %[p4]       ;\n"
421 	"       add %[inc], %[p5]       ;\n"
422 	"       dec %[cnt]              ;\n"
423 	"       jnz 1b                  ;\n"
424 	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
425 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
426 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
427 	: "memory");
428 }
429 
430 DO_XOR_BLOCKS(sse_inner, xor_sse_2, xor_sse_3, xor_sse_4, xor_sse_5);
431 
432 static void xor_gen_sse(void *dest, void **srcs, unsigned int src_cnt,
433 			unsigned int bytes)
434 {
435 	kernel_fpu_begin();
436 	xor_gen_sse_inner(dest, srcs, src_cnt, bytes);
437 	kernel_fpu_end();
438 }
439 
440 struct xor_block_template xor_block_sse = {
441 	.name		= "sse",
442 	.xor_gen	= xor_gen_sse,
443 };
444 
445 DO_XOR_BLOCKS(sse_pf64_inner, xor_sse_2_pf64, xor_sse_3_pf64, xor_sse_4_pf64,
446 		xor_sse_5_pf64);
447 
448 static void xor_gen_sse_pf64(void *dest, void **srcs, unsigned int src_cnt,
449 			unsigned int bytes)
450 {
451 	kernel_fpu_begin();
452 	xor_gen_sse_pf64_inner(dest, srcs, src_cnt, bytes);
453 	kernel_fpu_end();
454 }
455 
456 struct xor_block_template xor_block_sse_pf64 = {
457 	.name		= "prefetch64-sse",
458 	.xor_gen	= xor_gen_sse_pf64,
459 };
460