xref: /linux/lib/raid/xor/arm64/xor-eor3.c (revision 9611c0ce215a66770ccbe5c126bf57ba8c31bcad)
1*4d3c5cbfSArd Biesheuvel // SPDX-License-Identifier: GPL-2.0-only
2*4d3c5cbfSArd Biesheuvel 
3*4d3c5cbfSArd Biesheuvel #include <linux/cache.h>
4*4d3c5cbfSArd Biesheuvel #include <asm/neon-intrinsics.h>
5*4d3c5cbfSArd Biesheuvel #include "xor_impl.h"
6*4d3c5cbfSArd Biesheuvel #include "xor_arch.h"
7*4d3c5cbfSArd Biesheuvel #include "xor-neon.h"
8*4d3c5cbfSArd Biesheuvel 
/*
 * Two-buffer routine. Only declared here; the definition is not part of
 * this file. It is passed to __DO_XOR_BLOCKS() at the end of the file
 * alongside the 3-, 4- and 5-buffer routines defined below.
 */
void __xor_eor3_2(unsigned long bytes, unsigned long * __restrict p1,
		  const unsigned long * __restrict p2);
11*4d3c5cbfSArd Biesheuvel 
12*4d3c5cbfSArd Biesheuvel static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
13*4d3c5cbfSArd Biesheuvel {
14*4d3c5cbfSArd Biesheuvel 	uint64x2_t res;
15*4d3c5cbfSArd Biesheuvel 
16*4d3c5cbfSArd Biesheuvel 	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
17*4d3c5cbfSArd Biesheuvel 	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
18*4d3c5cbfSArd Biesheuvel 	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
19*4d3c5cbfSArd Biesheuvel 	return res;
20*4d3c5cbfSArd Biesheuvel }
21*4d3c5cbfSArd Biesheuvel 
22*4d3c5cbfSArd Biesheuvel static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
23*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p2,
24*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p3)
25*4d3c5cbfSArd Biesheuvel {
26*4d3c5cbfSArd Biesheuvel 	uint64_t *dp1 = (uint64_t *)p1;
27*4d3c5cbfSArd Biesheuvel 	uint64_t *dp2 = (uint64_t *)p2;
28*4d3c5cbfSArd Biesheuvel 	uint64_t *dp3 = (uint64_t *)p3;
29*4d3c5cbfSArd Biesheuvel 
30*4d3c5cbfSArd Biesheuvel 	register uint64x2_t v0, v1, v2, v3;
31*4d3c5cbfSArd Biesheuvel 	long lines = bytes / (sizeof(uint64x2_t) * 4);
32*4d3c5cbfSArd Biesheuvel 
33*4d3c5cbfSArd Biesheuvel 	do {
34*4d3c5cbfSArd Biesheuvel 		/* p1 ^= p2 ^ p3 */
35*4d3c5cbfSArd Biesheuvel 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
36*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 0));
37*4d3c5cbfSArd Biesheuvel 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
38*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 2));
39*4d3c5cbfSArd Biesheuvel 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
40*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 4));
41*4d3c5cbfSArd Biesheuvel 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
42*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 6));
43*4d3c5cbfSArd Biesheuvel 
44*4d3c5cbfSArd Biesheuvel 		/* store */
45*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 0, v0);
46*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 2, v1);
47*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 4, v2);
48*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 6, v3);
49*4d3c5cbfSArd Biesheuvel 
50*4d3c5cbfSArd Biesheuvel 		dp1 += 8;
51*4d3c5cbfSArd Biesheuvel 		dp2 += 8;
52*4d3c5cbfSArd Biesheuvel 		dp3 += 8;
53*4d3c5cbfSArd Biesheuvel 	} while (--lines > 0);
54*4d3c5cbfSArd Biesheuvel }
55*4d3c5cbfSArd Biesheuvel 
56*4d3c5cbfSArd Biesheuvel static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
57*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p2,
58*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p3,
59*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p4)
60*4d3c5cbfSArd Biesheuvel {
61*4d3c5cbfSArd Biesheuvel 	uint64_t *dp1 = (uint64_t *)p1;
62*4d3c5cbfSArd Biesheuvel 	uint64_t *dp2 = (uint64_t *)p2;
63*4d3c5cbfSArd Biesheuvel 	uint64_t *dp3 = (uint64_t *)p3;
64*4d3c5cbfSArd Biesheuvel 	uint64_t *dp4 = (uint64_t *)p4;
65*4d3c5cbfSArd Biesheuvel 
66*4d3c5cbfSArd Biesheuvel 	register uint64x2_t v0, v1, v2, v3;
67*4d3c5cbfSArd Biesheuvel 	long lines = bytes / (sizeof(uint64x2_t) * 4);
68*4d3c5cbfSArd Biesheuvel 
69*4d3c5cbfSArd Biesheuvel 	do {
70*4d3c5cbfSArd Biesheuvel 		/* p1 ^= p2 ^ p3 */
71*4d3c5cbfSArd Biesheuvel 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
72*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 0));
73*4d3c5cbfSArd Biesheuvel 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
74*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 2));
75*4d3c5cbfSArd Biesheuvel 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
76*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 4));
77*4d3c5cbfSArd Biesheuvel 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
78*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 6));
79*4d3c5cbfSArd Biesheuvel 
80*4d3c5cbfSArd Biesheuvel 		/* p1 ^= p4 */
81*4d3c5cbfSArd Biesheuvel 		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
82*4d3c5cbfSArd Biesheuvel 		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
83*4d3c5cbfSArd Biesheuvel 		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
84*4d3c5cbfSArd Biesheuvel 		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
85*4d3c5cbfSArd Biesheuvel 
86*4d3c5cbfSArd Biesheuvel 		/* store */
87*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 0, v0);
88*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 2, v1);
89*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 4, v2);
90*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 6, v3);
91*4d3c5cbfSArd Biesheuvel 
92*4d3c5cbfSArd Biesheuvel 		dp1 += 8;
93*4d3c5cbfSArd Biesheuvel 		dp2 += 8;
94*4d3c5cbfSArd Biesheuvel 		dp3 += 8;
95*4d3c5cbfSArd Biesheuvel 		dp4 += 8;
96*4d3c5cbfSArd Biesheuvel 	} while (--lines > 0);
97*4d3c5cbfSArd Biesheuvel }
98*4d3c5cbfSArd Biesheuvel 
99*4d3c5cbfSArd Biesheuvel static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
100*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p2,
101*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p3,
102*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p4,
103*4d3c5cbfSArd Biesheuvel 		const unsigned long * __restrict p5)
104*4d3c5cbfSArd Biesheuvel {
105*4d3c5cbfSArd Biesheuvel 	uint64_t *dp1 = (uint64_t *)p1;
106*4d3c5cbfSArd Biesheuvel 	uint64_t *dp2 = (uint64_t *)p2;
107*4d3c5cbfSArd Biesheuvel 	uint64_t *dp3 = (uint64_t *)p3;
108*4d3c5cbfSArd Biesheuvel 	uint64_t *dp4 = (uint64_t *)p4;
109*4d3c5cbfSArd Biesheuvel 	uint64_t *dp5 = (uint64_t *)p5;
110*4d3c5cbfSArd Biesheuvel 
111*4d3c5cbfSArd Biesheuvel 	register uint64x2_t v0, v1, v2, v3;
112*4d3c5cbfSArd Biesheuvel 	long lines = bytes / (sizeof(uint64x2_t) * 4);
113*4d3c5cbfSArd Biesheuvel 
114*4d3c5cbfSArd Biesheuvel 	do {
115*4d3c5cbfSArd Biesheuvel 		/* p1 ^= p2 ^ p3 */
116*4d3c5cbfSArd Biesheuvel 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
117*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 0));
118*4d3c5cbfSArd Biesheuvel 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
119*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 2));
120*4d3c5cbfSArd Biesheuvel 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
121*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 4));
122*4d3c5cbfSArd Biesheuvel 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
123*4d3c5cbfSArd Biesheuvel 			  vld1q_u64(dp3 + 6));
124*4d3c5cbfSArd Biesheuvel 
125*4d3c5cbfSArd Biesheuvel 		/* p1 ^= p4 ^ p5 */
126*4d3c5cbfSArd Biesheuvel 		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
127*4d3c5cbfSArd Biesheuvel 		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
128*4d3c5cbfSArd Biesheuvel 		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
129*4d3c5cbfSArd Biesheuvel 		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
130*4d3c5cbfSArd Biesheuvel 
131*4d3c5cbfSArd Biesheuvel 		/* store */
132*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 0, v0);
133*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 2, v1);
134*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 4, v2);
135*4d3c5cbfSArd Biesheuvel 		vst1q_u64(dp1 + 6, v3);
136*4d3c5cbfSArd Biesheuvel 
137*4d3c5cbfSArd Biesheuvel 		dp1 += 8;
138*4d3c5cbfSArd Biesheuvel 		dp2 += 8;
139*4d3c5cbfSArd Biesheuvel 		dp3 += 8;
140*4d3c5cbfSArd Biesheuvel 		dp4 += 8;
141*4d3c5cbfSArd Biesheuvel 		dp5 += 8;
142*4d3c5cbfSArd Biesheuvel 	} while (--lines > 0);
143*4d3c5cbfSArd Biesheuvel }
144*4d3c5cbfSArd Biesheuvel 
/*
 * Hand the 2- to 5-buffer routines to __DO_XOR_BLOCKS() under the name
 * eor3_inner. The macro is defined outside this file.
 * NOTE(review): presumably it generates the block-dispatch entry point for
 * this implementation; confirm against the macro definition.
 */
__DO_XOR_BLOCKS(eor3_inner,
		__xor_eor3_2,
		__xor_eor3_3,
		__xor_eor3_4,
		__xor_eor3_5);
147