/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

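/*
 * XOR the contents of p2 into p1 in place, 'bytes' bytes in total. Each
 * loop iteration handles one 64-byte line: four 128-bit NEON vectors,
 * i.e. sizeof(uint64x2_t) * 4 bytes. The do/while body always runs at
 * least once, so 'bytes' is assumed to be a non-zero multiple of 64,
 * which the in-kernel xor_blocks() users are expected to guarantee.
 */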
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

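/*
 * As xor_arm64_neon_2(), but a third source buffer (p3) is XORed into
 * p1 as well before the result is stored back.
 */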
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

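/*
 * As above, with a fourth source buffer (p4) folded in.
 */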
void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

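/*
 * As above, with a fifth source buffer (p5) folded in.
 */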
void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

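/*
 * Template handed to the generic xor_blocks() machinery (see struct
 * xor_block_template in <linux/raid/xor.h>), which benchmarks the
 * registered templates and picks the fastest implementation. Note that
 * NEON registers may only be used in the kernel inside a
 * kernel_neon_begin()/kernel_neon_end() section; callers of this
 * template are expected to provide that protection (on arm64 the
 * wrappers in <asm/xor.h> do so).
 */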
struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");