xref: /linux/arch/loongarch/lib/xor_template.c (revision 03c11eb3b16dc0058589751dfd91f254be2be613)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 *
 * Template for XOR operations, instantiated in xor_simd.c.
 *
 * Expected preprocessor definitions:
 *
 * - LINE_WIDTH
 * - XOR_FUNC_NAME(nr)
 * - LD_INOUT_LINE(buf)
 * - LD_AND_XOR_LINE(buf)
 * - ST_LINE(buf)
 */
15*75ded18aSWANG Xuerui 
16*75ded18aSWANG Xuerui void XOR_FUNC_NAME(2)(unsigned long bytes,
17*75ded18aSWANG Xuerui 		      unsigned long * __restrict v1,
18*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v2)
19*75ded18aSWANG Xuerui {
20*75ded18aSWANG Xuerui 	unsigned long lines = bytes / LINE_WIDTH;
21*75ded18aSWANG Xuerui 
22*75ded18aSWANG Xuerui 	do {
23*75ded18aSWANG Xuerui 		__asm__ __volatile__ (
24*75ded18aSWANG Xuerui 			LD_INOUT_LINE(v1)
25*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v2)
26*75ded18aSWANG Xuerui 			ST_LINE(v1)
27*75ded18aSWANG Xuerui 		: : [v1] "r"(v1), [v2] "r"(v2) : "memory"
28*75ded18aSWANG Xuerui 		);
29*75ded18aSWANG Xuerui 
30*75ded18aSWANG Xuerui 		v1 += LINE_WIDTH / sizeof(unsigned long);
31*75ded18aSWANG Xuerui 		v2 += LINE_WIDTH / sizeof(unsigned long);
32*75ded18aSWANG Xuerui 	} while (--lines > 0);
33*75ded18aSWANG Xuerui }
34*75ded18aSWANG Xuerui 
35*75ded18aSWANG Xuerui void XOR_FUNC_NAME(3)(unsigned long bytes,
36*75ded18aSWANG Xuerui 		      unsigned long * __restrict v1,
37*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v2,
38*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v3)
39*75ded18aSWANG Xuerui {
40*75ded18aSWANG Xuerui 	unsigned long lines = bytes / LINE_WIDTH;
41*75ded18aSWANG Xuerui 
42*75ded18aSWANG Xuerui 	do {
43*75ded18aSWANG Xuerui 		__asm__ __volatile__ (
44*75ded18aSWANG Xuerui 			LD_INOUT_LINE(v1)
45*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v2)
46*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v3)
47*75ded18aSWANG Xuerui 			ST_LINE(v1)
48*75ded18aSWANG Xuerui 		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3) : "memory"
49*75ded18aSWANG Xuerui 		);
50*75ded18aSWANG Xuerui 
51*75ded18aSWANG Xuerui 		v1 += LINE_WIDTH / sizeof(unsigned long);
52*75ded18aSWANG Xuerui 		v2 += LINE_WIDTH / sizeof(unsigned long);
53*75ded18aSWANG Xuerui 		v3 += LINE_WIDTH / sizeof(unsigned long);
54*75ded18aSWANG Xuerui 	} while (--lines > 0);
55*75ded18aSWANG Xuerui }
56*75ded18aSWANG Xuerui 
57*75ded18aSWANG Xuerui void XOR_FUNC_NAME(4)(unsigned long bytes,
58*75ded18aSWANG Xuerui 		      unsigned long * __restrict v1,
59*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v2,
60*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v3,
61*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v4)
62*75ded18aSWANG Xuerui {
63*75ded18aSWANG Xuerui 	unsigned long lines = bytes / LINE_WIDTH;
64*75ded18aSWANG Xuerui 
65*75ded18aSWANG Xuerui 	do {
66*75ded18aSWANG Xuerui 		__asm__ __volatile__ (
67*75ded18aSWANG Xuerui 			LD_INOUT_LINE(v1)
68*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v2)
69*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v3)
70*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v4)
71*75ded18aSWANG Xuerui 			ST_LINE(v1)
72*75ded18aSWANG Xuerui 		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4)
73*75ded18aSWANG Xuerui 		: "memory"
74*75ded18aSWANG Xuerui 		);
75*75ded18aSWANG Xuerui 
76*75ded18aSWANG Xuerui 		v1 += LINE_WIDTH / sizeof(unsigned long);
77*75ded18aSWANG Xuerui 		v2 += LINE_WIDTH / sizeof(unsigned long);
78*75ded18aSWANG Xuerui 		v3 += LINE_WIDTH / sizeof(unsigned long);
79*75ded18aSWANG Xuerui 		v4 += LINE_WIDTH / sizeof(unsigned long);
80*75ded18aSWANG Xuerui 	} while (--lines > 0);
81*75ded18aSWANG Xuerui }
82*75ded18aSWANG Xuerui 
83*75ded18aSWANG Xuerui void XOR_FUNC_NAME(5)(unsigned long bytes,
84*75ded18aSWANG Xuerui 		      unsigned long * __restrict v1,
85*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v2,
86*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v3,
87*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v4,
88*75ded18aSWANG Xuerui 		      const unsigned long * __restrict v5)
89*75ded18aSWANG Xuerui {
90*75ded18aSWANG Xuerui 	unsigned long lines = bytes / LINE_WIDTH;
91*75ded18aSWANG Xuerui 
92*75ded18aSWANG Xuerui 	do {
93*75ded18aSWANG Xuerui 		__asm__ __volatile__ (
94*75ded18aSWANG Xuerui 			LD_INOUT_LINE(v1)
95*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v2)
96*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v3)
97*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v4)
98*75ded18aSWANG Xuerui 			LD_AND_XOR_LINE(v5)
99*75ded18aSWANG Xuerui 			ST_LINE(v1)
100*75ded18aSWANG Xuerui 		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4),
101*75ded18aSWANG Xuerui 		    [v5] "r"(v5) : "memory"
102*75ded18aSWANG Xuerui 		);
103*75ded18aSWANG Xuerui 
104*75ded18aSWANG Xuerui 		v1 += LINE_WIDTH / sizeof(unsigned long);
105*75ded18aSWANG Xuerui 		v2 += LINE_WIDTH / sizeof(unsigned long);
106*75ded18aSWANG Xuerui 		v3 += LINE_WIDTH / sizeof(unsigned long);
107*75ded18aSWANG Xuerui 		v4 += LINE_WIDTH / sizeof(unsigned long);
108*75ded18aSWANG Xuerui 		v5 += LINE_WIDTH / sizeof(unsigned long);
109*75ded18aSWANG Xuerui 	} while (--lines > 0);
110*75ded18aSWANG Xuerui }
111