xref: /linux/arch/sparc/lib/csum_copy.S (revision 4d5e3b06e1fc1428be14cd4ebe3b37c1bb34f95d)
1/* SPDX-License-Identifier: GPL-2.0 */
2/* csum_copy.S: Checksum+copy code for sparc64
3 *
4 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
5 */
6
7#include <asm/export.h>
8
/* Scratch global register used by the code below: it holds the
 * "source started on an odd address" flag in the equal-alignment path
 * and the running sum in the path at label 95.  %g7 is selected when
 * __KERNEL__ is defined, %g5 otherwise.
 */
9#ifdef __KERNEL__
10#define GLOBAL_SPARE	%g7
11#else
12#define GLOBAL_SPARE	%g5
13#endif
14
/* EX_LD and EX_ST wrap each ldub/lduh/lduw and stb/sth/stw in this
 * file (the prefetches are not wrapped); EX_RETVAL is not referenced
 * by the code visible here.  Unless already defined, all three expand
 * to their argument unchanged.
 * NOTE(review): presumably a file that includes this one pre-defines
 * them to attach fault handling to the memory accesses -- confirm.
 */
15#ifndef EX_LD
16#define EX_LD(x)	x
17#endif
18
19#ifndef EX_ST
20#define EX_ST(x)	x
21#endif
22
23#ifndef EX_RETVAL
24#define EX_RETVAL(x)	x
25#endif
26
/* LOAD(type,addr,dest) emits "type [addr], dest" and
 * STORE(type,src,addr) emits "type src, [addr]".  Both can be
 * replaced by defining them before this point.
 */
27#ifndef LOAD
28#define LOAD(type,addr,dest)	type [addr], dest
29#endif
30
31#ifndef STORE
32#define STORE(type,src,addr)	type src, [addr]
33#endif
34
/* Symbol name given to the entry point; defaults to
 * csum_partial_copy_nocheck when not supplied from outside.
 */
35#ifndef FUNC_NAME
36#define FUNC_NAME	csum_partial_copy_nocheck
37#endif
38
/* Declare to the assembler that %g2 and %g3 are used as scratch
 * registers by this file.
 */
39	.register	%g2, #scratch
40	.register	%g3, #scratch
41
42	.text
43
/* Alignment fix-up, placed ahead of the entry point and reached only
 * through the "bne,pn %icc, 90b" in FUNC_NAME, i.e. when src is not
 * 4-byte aligned (src and dst are known to share their low two
 * address bits at that point).
 *
 * On entry: %o0=src, %o1=dst, %o2=len (non-zero), %o4=0 (running sum),
 * and icc holds the result of "andcc %o0, 0x1" from the delay slot of
 * the branch that got us here, so Z is set when src is even.
 *
 * Copies a leading byte and/or halfword so that src becomes 4-byte
 * aligned, adding what it copied into %o4, then continues at label 80.
 * If fewer than two bytes remain when a halfword would be needed it
 * goes to the tail code at label 60 instead.
 */
4490:
45	/* We checked for zero length already, so there must be
46	 * at least one byte.
47	 */
	/* src even: skip the single-byte copy. */
48	be,pt		%icc, 1f
49	 nop
	/* src odd: copy one byte.  %o4 was zero, so the sum is now just
	 * this byte (in the low 8 bits).
	 */
50	EX_LD(LOAD(ldub, %o0 + 0x00, %o4))
51	add		%o0, 1, %o0
52	sub		%o2, 1, %o2
53	EX_ST(STORE(stb, %o4, %o1 + 0x00))
54	add		%o1, 1, %o1
	/* Bit 1 of src clear: already 4-byte aligned, go to 80.  The cmp
	 * in the delay slot runs either way and feeds the blu below.
	 */
551:	andcc		%o0, 0x2, %g0
56	be,pn		%icc, 80f
57	 cmp		%o2, 2
	/* Fewer than 2 bytes left: let the tail code at 60 finish. */
58	blu,pn		%icc, 60f
59	 nop
	/* Copy one halfword; it is added into the sum in the delay slot
	 * of the branch to 80.
	 */
60	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
61	add		%o0, 2, %o0
62	sub		%o2, 2, %o2
63	EX_ST(STORE(sth, %o5, %o1 + 0x00))
64	add		%o1, 2, %o1
65	ba,pt		%xcc, 80f
66	 add		%o5, %o4, %o4
67
/* FUNC_NAME(src, dst, len): copy len bytes from src to dst while
 * summing the data, and return a 32-bit value in %o0.
 *
 * In:   %o0 = src, %o1 = dst, %o2 = len
 * Out:  %o0 = low 32 bits of (0xffffffff + folded 16-bit sum), with the
 *             carry of that addition added back in; 0xffffffff when
 *             len is zero
 * Uses: %o3 (seed/result), %o4 (running sum), %o5, %g1, %g2, %g3 and
 *       GLOBAL_SPARE as scratch
 *
 * This entry sequence only sets up and dispatches:
 *   - (src ^ dst) & 3 != 0  -> label 95 (byte-store path)
 *   - len == 0              -> label 70 (return)
 *   - src & 3 != 0          -> label 90 (leading byte/halfword fix-up)
 *   - otherwise falls through to label 80 (aligned bulk copy)
 * An instruction indented by one extra space sits in the delay slot of
 * the branch above it.
 */
68	.globl		FUNC_NAME
69	.type		FUNC_NAME,#function
70	EXPORT_SYMBOL(FUNC_NAME)
71FUNC_NAME:		/* %o0=src, %o1=dst, %o2=len */
72	LOAD(prefetch, %o0 + 0x000, #n_reads)
	/* %g1 = bits in which src and dst differ; %o3 = -1 is the value
	 * the computed sum is finally added to; %o4 = running sum = 0.
	 */
73	xor		%o0, %o1, %g1
74	mov		-1, %o3
75	clr		%o4
	/* src and dst disagree in their low two bits: take the path at 95. */
76	andcc		%g1, 0x3, %g0
77	bne,pn		%icc, 95f
78	 LOAD(prefetch, %o0 + 0x040, #n_reads)
79
	/* Nothing to copy: return via 70.  The andcc in the delay slot
	 * sets icc for the bne below when this branch is not taken.
	 */
80	brz,pn		%o2, 70f
81	 andcc		%o0, 0x3, %g0
82
83	/* We "remember" whether the lowest bit in the address
84	 * was set in GLOBAL_SPARE.  Because if it is, we have to swap
85	 * upper and lower 8 bit fields of the sum we calculate.
86	*/
	/* The delay-slot andcc also sets icc, which the first branch at
	 * label 90 tests.  When the bne is not taken src is 4-byte aligned,
	 * so GLOBAL_SPARE ends up 0.
	 */
87	bne,pn		%icc, 90b
88	 andcc		%o0, 0x1, GLOBAL_SPARE
89
/* Bulk copy for 4-byte aligned src (dst has the same low two address
 * bits).  Reached by fall-through from the entry code or from the
 * fix-up code at label 90.
 *
 * %g3 = len & ~0x3f  (bytes handled by the 64-byte loop)
 * %o2 = len & 0x3f   (bytes left for the code after the loop)
 * If %g3 is zero the loop is skipped (branch to 2f).
 */
9080:
91	LOAD(prefetch, %o0 + 0x080, #n_reads)
92	andncc		%o2, 0x3f, %g3
93
94	LOAD(prefetch, %o0 + 0x0c0, #n_reads)
95	sub		%o2, %g3, %o2
96	brz,pn		%g3, 2f
97	 LOAD(prefetch, %o0 + 0x100, #n_reads)
98
99	/* So that we don't need to use the non-pairing
100	 * add-with-carry instructions we accumulate 32-bit
101	 * values into a 64-bit register.  At the end of the
102	 * loop we fold it down to 32-bits and so on.
103	 */
	/* Jump over the alignment padding into the loop; the prefetch on
	 * the next line is in the delay slot of this ba.
	 */
104	ba,pt		%xcc, 1f
105	LOAD(prefetch, %o0 + 0x140, #n_reads)
106
	/* One iteration moves 64 bytes as sixteen 32-bit words.  Each word
	 * is loaded with lduw, added into the 64-bit sum %o4 and stored to
	 * the same offset from dst; %o5, %g1 and %g2 are used in rotation
	 * so that loads, adds and stores of different words interleave.
	 */
107	.align		32
1081:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
109	EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
110	EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
111	add		%o4, %o5, %o4
112	EX_ST(STORE(stw, %o5, %o1 + 0x00))
113	EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
114	add		%o4, %g1, %o4
115	EX_ST(STORE(stw, %g1, %o1 + 0x04))
116	EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
117	add		%o4, %g2, %o4
118	EX_ST(STORE(stw, %g2, %o1 + 0x08))
119	EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
120	add		%o4, %o5, %o4
121	EX_ST(STORE(stw, %o5, %o1 + 0x0c))
122	EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
123	add		%o4, %g1, %o4
124	EX_ST(STORE(stw, %g1, %o1 + 0x10))
125	EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
126	add		%o4, %g2, %o4
127	EX_ST(STORE(stw, %g2, %o1 + 0x14))
128	EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
129	add		%o4, %o5, %o4
130	EX_ST(STORE(stw, %o5, %o1 + 0x18))
131	EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
132	add		%o4, %g1, %o4
133	EX_ST(STORE(stw, %g1, %o1 + 0x1c))
134	EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
135	add		%o4, %g2, %o4
136	EX_ST(STORE(stw, %g2, %o1 + 0x20))
137	EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
138	add		%o4, %o5, %o4
139	EX_ST(STORE(stw, %o5, %o1 + 0x24))
140	EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
141	add		%o4, %g1, %o4
142	EX_ST(STORE(stw, %g1, %o1 + 0x28))
143	EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
144	add		%o4, %g2, %o4
145	EX_ST(STORE(stw, %g2, %o1 + 0x2c))
146	EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
147	add		%o4, %o5, %o4
148	EX_ST(STORE(stw, %o5, %o1 + 0x30))
149	EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
150	add		%o4, %g1, %o4
151	EX_ST(STORE(stw, %g1, %o1 + 0x34))
	/* All sixteen loads are issued; prefetch further ahead, then
	 * finish the last two words while stepping the counter and src.
	 */
152	LOAD(prefetch, %o0 + 0x180, #n_reads)
153	add		%o4, %g2, %o4
154	EX_ST(STORE(stw, %g2, %o1 + 0x38))
155	subcc		%g3, 0x40, %g3
156	add		%o0, 0x40, %o0
157	add		%o4, %o5, %o4
158	EX_ST(STORE(stw, %o5, %o1 + 0x3c))
	/* Loop while %g3 != 0 (icc from the subcc above); dst is advanced
	 * in the delay slot.
	 */
159	bne,pt		%icc, 1b
160	 add		%o1, 0x40, %o1
161
/* After the 64-byte loop: copy and sum the remaining whole words, then
 * reduce the 64-bit running sum in %o4 to 16 bits.
 *
 * %g3 = %o2 & 0x3c (bytes in whole words still to copy).  The sub in
 * the delay slot runs whether or not the brz is taken and leaves
 * %o2 = 0..3 trailing bytes for the code at label 60.
 */
1622:	and		%o2, 0x3c, %g3
163	brz,pn		%g3, 2f
164	 sub		%o2, %g3, %o2
	/* One 32-bit word per iteration; dst is advanced in the delay
	 * slot of the loop branch.
	 */
1651:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
166	subcc		%g3, 0x4, %g3
167	add		%o0, 0x4, %o0
168	add		%o4, %o5, %o4
169	EX_ST(STORE(stw, %o5, %o1 + 0x00))
170	bne,pt		%icc, 1b
171	 add		%o1, 0x4, %o1
172
1732:
174	/* fold 64-->32 */
	/* Add the upper 32 bits to the lower 32 bits (srl by 0 keeps only
	 * the low word); done twice so a carry out of the first addition
	 * is folded back in as well.
	 */
175	srlx		%o4, 32, %o5
176	srl		%o4, 0, %o4
177	add		%o4, %o5, %o4
178	srlx		%o4, 32, %o5
179	srl		%o4, 0, %o4
180	add		%o4, %o5, %o4
181
182	/* fold 32-->16 */
	/* %g1 = 0xffff0000, so "andn %o4, %g1" yields the low 16 bits.
	 * Add high and low halfwords, twice for the same carry reason.
	 */
183	sethi		%hi(0xffff0000), %g1
184	srl		%o4, 16, %o5
185	andn		%o4, %g1, %g2
186	add		%o5, %g2, %o4
187	srl		%o4, 16, %o5
188	andn		%o4, %g1, %g2
189	add		%o5, %g2, %o4
190
/* Tail of the equal-alignment path: copy the last 0..3 bytes, finish
 * the sum and return.  Reached by fall-through after the folds above,
 * or from the fix-up code at label 90 when fewer than two bytes were
 * left.  %o2 = bytes remaining, %o4 = sum so far, GLOBAL_SPARE is
 * non-zero when the original src address was odd, %o3 = -1.
 */
19160:
192	/* %o4 has the 16-bit sum we have calculated so-far.  */
	/* Copy and add one halfword if at least two bytes remain. */
193	cmp		%o2, 2
194	blu,pt		%icc, 1f
195	 nop
196	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
197	sub		%o2, 2, %o2
198	add		%o0, 2, %o0
199	add		%o4, %o5, %o4
200	EX_ST(STORE(sth, %o5, %o1 + 0x00))
201	add		%o1, 0x2, %o1
	/* Copy a final single byte if one is left.  It is shifted into
	 * bits 15..8 before being added, i.e. it is summed as the high
	 * byte of a halfword whose low byte is zero.
	 */
2021:	brz,pt		%o2, 1f
203	 nop
204	EX_LD(LOAD(ldub, %o0 + 0x00, %o5))
205	sub		%o2, 1, %o2
206	add		%o0, 1, %o0
207	EX_ST(STORE(stb, %o5, %o1 + 0x00))
208	sllx		%o5, 8, %o5
209	add		%o1, 1, %o1
210	add		%o4, %o5, %o4
2111:
212	/* fold 32-->16 */
	/* Same two-step high+low halfword fold as before, absorbing
	 * whatever the tail additions carried above bit 15.
	 */
213	sethi		%hi(0xffff0000), %g1
214	srl		%o4, 16, %o5
215	andn		%o4, %g1, %g2
216	add		%o5, %g2, %o4
217	srl		%o4, 16, %o5
218	andn		%o4, %g1, %g2
219	add		%o5, %g2, %o4
220
	/* Skip the byte swap when the original src address was even. */
2211:	brz,pt		GLOBAL_SPARE, 1f
222	 nop
223
224	/* We started with an odd byte, byte-swap the result.  */
225	srl		%o4, 8, %o5
226	and		%o4, 0xff, %g1
227	sll		%g1, 8, %g1
228	or		%o5, %g1, %o4
229
	/* Add the sum to %o3 (set to -1 at entry) and add the icc carry of
	 * that addition back into the result.
	 */
2301:	addcc		%o3, %o4, %o3
231	addc		%g0, %o3, %o3
232
	/* Common return, also the direct target for len == 0.  The srl by
	 * 0 in the delay slot places the low 32 bits of %o3 in %o0.
	 */
23370:
234	retl
235	 srl		%o3, 0, %o0
236
/* Path taken when src and dst differ in their low two address bits.
 * Loads use the widest access that src's alignment allows (byte, then
 * halfword, then words); every store is done one byte at a time, most
 * significant byte first.
 *
 * GLOBAL_SPARE = running sum (starts at 0)
 * %o5          = src & 1 at entry; non-zero requests the byte swap of
 *                the final 16-bit sum
 * %g1          = count of halfwords, later of words, still to process
 * %o2          = byte count; the word loop does not decrement it, only
 *                its bits 1 and 0 are examined afterwards
 *
 * Branches written with ",a" annul their delay slot: the indented
 * instruction after them executes only when the branch is taken.
 */
23795:	mov		0, GLOBAL_SPARE
	/* len <= 0: nothing to copy, finish at 4 with a sum of 0.  The
	 * delay-slot andcc records the odd-start flag in %o5 and sets icc.
	 */
238	brlez,pn	%o2, 4f
239	 andcc		%o0, 1, %o5
	/* src even: go on with %g1 = len / 2. */
240	be,a,pt		%icc, 1f
241	 srl		%o2, 1, %g1
	/* src odd: copy one byte, which becomes the initial sum, then
	 * %g1 = (len - 1) / 2.
	 */
242	sub		%o2, 1, %o2
243	EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE))
244	add		%o0, 1, %o0
245	EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
246	srl		%o2, 1, %g1
247	add		%o1, 1, %o1
	/* No halfwords at all: only a possible last byte remains (label 3,
	 * icc from len & 1).
	 */
2481:	brz,a,pn	%g1, 3f
249	 andcc		%o2, 1, %g0
	/* Bit 1 of src clear: src is word aligned, %g1 = word count. */
250	andcc		%o0, 2, %g0
251	be,a,pt		%icc, 1f
252	 srl		%g1, 1, %g1
	/* Otherwise copy one halfword (stored as two bytes), add it to the
	 * sum, and turn the remaining halfword count into a word count.
	 */
253	EX_LD(LOAD(lduh, %o0, %o4))
254	sub		%o2, 2, %o2
255	srl		%o4, 8, %g2
256	sub		%g1, 1, %g1
257	EX_ST(STORE(stb, %g2, %o1))
258	add		%o4, GLOBAL_SPARE, GLOBAL_SPARE
259	EX_ST(STORE(stb, %o4, %o1 + 1))
260	add		%o0, 2, %o0
261	srl		%g1, 1, %g1
262	add		%o1, 2, %o1
	/* No whole words: go test for a trailing halfword at 2 (icc from
	 * len & 2).
	 */
2631:	brz,a,pn	%g1, 2f
264	 andcc		%o2, 2, %g0
265	EX_LD(LOAD(lduw, %o0, %o4))
	/* Word loop: store the four bytes of %o4 from bits 31..24 down to
	 * bits 7..0, and add the word into the sum with addcc/addc so the
	 * 32-bit carry is added back in.  The load for the next iteration
	 * sits in the annulled delay slot of the loop branch.
	 */
2665:	srl		%o4, 24, %g2
267	srl		%o4, 16, %g3
268	EX_ST(STORE(stb, %g2, %o1))
269	srl		%o4, 8, %g2
270	EX_ST(STORE(stb, %g3, %o1 + 1))
271	add		%o0, 4, %o0
272	EX_ST(STORE(stb, %g2, %o1 + 2))
273	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
274	EX_ST(STORE(stb, %o4, %o1 + 3))
275	addc		GLOBAL_SPARE, %g0, GLOBAL_SPARE
276	add		%o1, 4, %o1
277	subcc		%g1, 1, %g1
278	bne,a,pt	%icc, 5b
279	 EX_LD(LOAD(lduw, %o0, %o4))
	/* Reduce the sum: GLOBAL_SPARE = (bits 15..0) + (bits 31..16).
	 * The andcc sets icc for the trailing-halfword test at 2.
	 */
280	sll		GLOBAL_SPARE, 16, %g2
281	srl		GLOBAL_SPARE, 16, GLOBAL_SPARE
282	srl		%g2, 16, %g2
283	andcc		%o2, 2, %g0
284	add		%g2, GLOBAL_SPARE, GLOBAL_SPARE
	/* Bit 1 of the byte count clear: no trailing halfword. */
2852:	be,a,pt		%icc, 3f
286	 andcc		%o2, 1, %g0
	/* Copy the trailing halfword as two bytes; the andcc sets icc for
	 * the trailing-byte test at 3.
	 */
287	EX_LD(LOAD(lduh, %o0, %o4))
288	andcc		%o2, 1, %g0
289	srl		%o4, 8, %g2
290	add		%o0, 2, %o0
291	EX_ST(STORE(stb, %g2, %o1))
292	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
293	EX_ST(STORE(stb, %o4, %o1 + 1))
294	add		%o1, 2, %o1
	/* No trailing byte: go fold with %o4 = sum << 16. */
2953:	be,a,pt		%icc, 1f
296	 sll		GLOBAL_SPARE, 16, %o4
	/* Copy the last byte and add it shifted into bits 15..8. */
297	EX_LD(LOAD(ldub, %o0, %g2))
298	sll		%g2, 8, %o4
299	EX_ST(STORE(stb, %g2, %o1))
300	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
301	sll		GLOBAL_SPARE, 16, %o4
	/* Fold to 16 bits: adding (sum << 16) to sum leaves low16 + high16
	 * in bits 31..16 with the overflow in the icc carry; take those
	 * bits and add the carry back.
	 */
3021:	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
303	srl		GLOBAL_SPARE, 16, %o4
304	addc		%g0, %o4, GLOBAL_SPARE
	/* Even start: done.  The srl in the delay slot runs either way and
	 * is the first step of the byte swap used for an odd start.
	 */
305	brz,pt		%o5, 4f
306	 srl		GLOBAL_SPARE, 8, %o4
307	and		GLOBAL_SPARE, 0xff, %g2
308	and		%o4, 0xff, %o4
309	sll		%g2, 8, %g2
310	or		%g2, %o4, GLOBAL_SPARE
	/* Add the sum to %o3 (set to -1 at entry), add the carry back, and
	 * return the low 32 bits in %o0.
	 */
3114:	addcc		%o3, GLOBAL_SPARE, %o3
312	addc		%g0, %o3, %o0
313	retl
314	 srl		%o0, 0, %o0
315	.size		FUNC_NAME, .-FUNC_NAME
316