/* csum_copy.S: Checksum+copy code for sparc64
 *
 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
 */

#include <asm/export.h>

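/* The definitions below are override hooks: EX_LD/EX_ST default to
 * bare loads and stores, and FUNC_NAME to csum_partial_copy_nocheck.
 * A file that includes this one can predefine them (the user-copy
 * variants presumably do, wrapping each access with exception-table
 * annotations and renaming the function).
 */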
#ifdef __KERNEL__
#define GLOBAL_SPARE	%g7
#else
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	csum_partial_copy_nocheck
#endif

	.register	%g2, #scratch
	.register	%g3, #scratch

	.text

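	/* Reached from FUNC_NAME below when src and dst share their
	 * low-order alignment but the source is not yet 4-byte
	 * aligned: peel off a leading byte and/or halfword, summing
	 * it into %o4 as we copy.
	 */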
90:
	/* We checked for zero length already, so there must be
	 * at least one byte.
	 */
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldub, %o0 + 0x00, %o4))
	add		%o0, 1, %o0
	sub		%o2, 1, %o2
	EX_ST(STORE(stb, %o4, %o1 + 0x00))
	add		%o1, 1, %o1
1:	andcc		%o0, 0x2, %g0
	be,pn		%icc, 80f
	 cmp		%o2, 2
	blu,pn		%icc, 60f
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	add		%o0, 2, %o0
	sub		%o2, 2, %o2
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 2, %o1
	ba,pt		%xcc, 80f
	 add		%o5, %o4, %o4

	.globl		FUNC_NAME
	EXPORT_SYMBOL(FUNC_NAME)
FUNC_NAME:		/* %o0=src, %o1=dst, %o2=len, %o3=sum */
	LOAD(prefetch, %o0 + 0x000, #n_reads)
	xor		%o0, %o1, %g1
	clr		%o4
	andcc		%g1, 0x3, %g0
	bne,pn		%icc, 95f
	 LOAD(prefetch, %o0 + 0x040, #n_reads)

	brz,pn		%o2, 70f
	 andcc		%o0, 0x3, %g0

	/* We "remember" in GLOBAL_SPARE whether the lowest bit of the
	 * address was set.  If it was, we have to swap the upper and
	 * lower 8-bit fields of the sum we calculate.
	 */
	bne,pn		%icc, 90b
	 andcc		%o0, 0x1, GLOBAL_SPARE

80:
	LOAD(prefetch, %o0 + 0x080, #n_reads)
	andncc		%o2, 0x3f, %g3

	LOAD(prefetch, %o0 + 0x0c0, #n_reads)
	sub		%o2, %g3, %o2
	brz,pn		%g3, 2f
	 LOAD(prefetch, %o0 + 0x100, #n_reads)

	/* So that we don't need to use the non-pairing
	 * add-with-carry instructions, we accumulate 32-bit
	 * values into a 64-bit register.  At the end of the
	 * loop we fold it down to 32 bits and so on.
	 */
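	/* Roughly, in C (an illustrative sketch, not kernel code):
	 *
	 *	u64 sum = 0;
	 *	while (nwords--)
	 *		sum += *src++;	// u32 loads; carries collect
	 *				// in the high 32 bits
	 */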
	ba,pt		%xcc, 1f
	 LOAD(prefetch, %o0 + 0x140, #n_reads)

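	/* Main loop: copy and checksum 64 bytes per iteration, one
	 * 32-bit word at a time; each load is issued ahead of the
	 * matching add+store pair to hide load latency.
	 */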
	.align		32
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
	EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x04))
	EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x08))
	EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x0c))
	EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x10))
	EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x14))
	EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x18))
	EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x1c))
	EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x20))
	EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x24))
	EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x28))
	EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x2c))
	EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x30))
	EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x34))
	LOAD(prefetch, %o0 + 0x180, #n_reads)
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x38))
	subcc		%g3, 0x40, %g3
	add		%o0, 0x40, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x3c))
	bne,pt		%icc, 1b
	 add		%o1, 0x40, %o1

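	/* Copy and sum the remaining whole 32-bit words (len & 0x3c). */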
2:	and		%o2, 0x3c, %g3
	brz,pn		%g3, 2f
	 sub		%o2, %g3, %o2
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	subcc		%g3, 0x4, %g3
	add		%o0, 0x4, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	bne,pt		%icc, 1b
	 add		%o1, 0x4, %o1

2:
	/* fold 64-->32 */
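	/* Equivalent to sum = (sum >> 32) + (u32)sum, applied twice;
	 * the second pass folds in any carry produced by the first.
	 */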
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4

	/* fold 32-->16 */
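	/* Likewise sum = (sum >> 16) + (sum & 0xffff), applied twice. */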
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

60:
	/* %o4 has the 16-bit sum we have calculated so far.  */
	cmp		%o2, 2
	blu,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	sub		%o2, 2, %o2
	add		%o0, 2, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 0x2, %o1
1:	brz,pt		%o2, 1f
	 nop
	EX_LD(LOAD(ldub, %o0 + 0x00, %o5))
	sub		%o2, 1, %o2
	add		%o0, 1, %o0
	EX_ST(STORE(stb, %o5, %o1 + 0x00))
	sllx		%o5, 8, %o5
	add		%o1, 1, %o1
	add		%o4, %o5, %o4
1:
	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

1:	brz,pt		GLOBAL_SPARE, 1f
	 nop

	/* We started with an odd byte; byte-swap the result.  */
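	/* A 16-bit byte swap: sum = ((sum & 0xff) << 8) | (sum >> 8). */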
	srl		%o4, 8, %o5
	and		%o4, 0xff, %g1
	sll		%g1, 8, %g1
	or		%o5, %g1, %o4

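	/* Fold the caller-supplied seed (%o3) into the result with an
	 * end-around carry, as a ones'-complement sum requires.
	 */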
1:	addcc		%o3, %o4, %o3
	addc		%g0, %o3, %o3

70:
	retl
	 srl		%o3, 0, %o0

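	/* src and dst differ in their low two address bits, so 32-bit
	 * accesses cannot be aligned for both; copy and sum via the
	 * byte/halfword path below.  %o5 remembers whether the source
	 * began on an odd byte, in which case the final sum must be
	 * byte-swapped.
	 */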
95:	mov		0, GLOBAL_SPARE
	brlez,pn	%o2, 4f
	 andcc		%o0, 1, %o5
	be,a,pt		%icc, 1f
	 srl		%o2, 1, %g1
	sub		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE))
	add		%o0, 1, %o0
	EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
	srl		%o2, 1, %g1
	add		%o1, 1, %o1
1:	brz,a,pn	%g1, 3f
	 andcc		%o2, 1, %g0
	andcc		%o0, 2, %g0
	be,a,pt		%icc, 1f
	 srl		%g1, 1, %g1
	EX_LD(LOAD(lduh, %o0, %o4))
	sub		%o2, 2, %o2
	srl		%o4, 8, %g2
	sub		%g1, 1, %g1
	EX_ST(STORE(stb, %g2, %o1))
	add		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o0, 2, %o0
	srl		%g1, 1, %g1
	add		%o1, 2, %o1
1:	brz,a,pn	%g1, 2f
	 andcc		%o2, 2, %g0
	EX_LD(LOAD(lduw, %o0, %o4))
5:	srl		%o4, 24, %g2
	srl		%o4, 16, %g3
	EX_ST(STORE(stb, %g2, %o1))
	srl		%o4, 8, %g2
	EX_ST(STORE(stb, %g3, %o1 + 1))
	add		%o0, 4, %o0
	EX_ST(STORE(stb, %g2, %o1 + 2))
	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 3))
	addc		GLOBAL_SPARE, %g0, GLOBAL_SPARE
	add		%o1, 4, %o1
	subcc		%g1, 1, %g1
	bne,a,pt	%icc, 5b
	 EX_LD(LOAD(lduw, %o0, %o4))
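	/* Fold the low 32 bits of the accumulator:
	 * sum = (sum & 0xffff) + (sum >> 16).
	 */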
	sll		GLOBAL_SPARE, 16, %g2
	srl		GLOBAL_SPARE, 16, GLOBAL_SPARE
	srl		%g2, 16, %g2
	andcc		%o2, 2, %g0
	add		%g2, GLOBAL_SPARE, GLOBAL_SPARE
2:	be,a,pt		%icc, 3f
	 andcc		%o2, 1, %g0
	EX_LD(LOAD(lduh, %o0, %o4))
	andcc		%o2, 1, %g0
	srl		%o4, 8, %g2
	add		%o0, 2, %o0
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o1, 2, %o1
3:	be,a,pt		%icc, 1f
	 sll		GLOBAL_SPARE, 16, %o4
	EX_LD(LOAD(ldub, %o0, %g2))
	sll		%g2, 8, %o4
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 16, %o4
1:	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	srl		GLOBAL_SPARE, 16, %o4
	addc		%g0, %o4, GLOBAL_SPARE
	brz,pt		%o5, 4f
	 srl		GLOBAL_SPARE, 8, %o4
	and		GLOBAL_SPARE, 0xff, %g2
	and		%o4, 0xff, %o4
	sll		%g2, 8, %g2
	or		%g2, %o4, GLOBAL_SPARE
4:	addcc		%o3, GLOBAL_SPARE, %o3
	addc		%g0, %o3, %o0
	retl
	 srl		%o0, 0, %o0
	.size		FUNC_NAME, .-FUNC_NAME