xref: /linux/arch/xtensa/lib/checksum.S (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		IP/TCP/UDP checksumming routines
7 *
8 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
9 *                  Optimized by Joe Taylor
10 *
11 *		This program is free software; you can redistribute it and/or
12 *		modify it under the terms of the GNU General Public License
13 *		as published by the Free Software Foundation; either version
14 *		2 of the License, or (at your option) any later version.
15 */
16
17#include <asm/errno.h>
18#include <linux/linkage.h>
19#define _ASMLANGUAGE
20#include <xtensa/config/core.h>
21
22/*
23 * computes a partial checksum, e.g. for TCP/UDP fragments
24 */
25
26/*
27 * unsigned int csum_partial(const unsigned char *buf, int len,
28 *                           unsigned int sum);
29 *    a2 = buf
30 *    a3 = len
31 *    a4 = sum
32 *
 * This function is fastest for 4-byte aligned buffers.  2-byte aligned and
 * odd buffer addresses are also handled, only more slowly.
34 */
35
/*
 * ONES_ADD(acc, addend): acc += addend in ones-complement arithmetic.
 *
 * The addition itself is an ordinary twos-complement add.  If it wrapped
 * around 32 bits, the unsigned result is smaller than the addend; in that
 * case the lost carry is added back into bit 0 (end-around carry).
 * Expands to a numeric local label 99, so it may be used repeatedly.
 */
#define ONES_ADD(acc, addend)		  \
	add	acc, acc, addend	; \
	bgeu	acc, addend, 99f	; \
	addi	acc, acc, 1		; \
99:					;
42
.text

/*
 * csum_partial: add the bytes buf[0 .. len-1] into the 32-bit
 * ones-complement running sum and return the new sum.
 *
 * Register usage:
 *    a2    = buf on entry, advanced as data is consumed; result on return
 *    a3    = len (reduced by 2 if a leading halfword is peeled off)
 *    a4    = running sum
 *    a5    = loop count, or loop end address when XCHAL_HAVE_LOOPS is 0
 *    a6-a8 = scratch for loaded data
 *
 * When XCHAL_HAVE_LOOPS is 0 the loops run until a2 reaches an end
 * address.  That comparison uses bltu (unsigned): addresses are unsigned
 * quantities, and a signed blt would end a loop early if the range being
 * walked straddled address 0x80000000.
 */
ENTRY(csum_partial)
	  /*
	   * Experiments with Ethernet and SLIP connections show that buf
	   * is aligned on either a 2-byte or 4-byte boundary.
	   */
	entry	sp, 32
	extui	a5, a2, 0, 2	/* a5 = low two address bits of buf */
	bnez	a5, 8f		/* branch if buf is not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop1	/* unsigned address compare */
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks (len bits 2..4) */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop2	/* unsigned address compare */
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4		/* return the accumulated sum */
	retw

	/* uncommon case, buf is not 4-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if buf is at an odd address */

	/* buf is 2-byte aligned: consume one halfword to reach 4-byte alignment */
	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 *
	 * Each 32-bit word is assembled from a byte, an aligned halfword
	 * (buf + 1 is 2-byte aligned) and another byte.
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* first byte of the word */
	l16ui	a7, a2, 1	/* middle two bytes */
	l8ui	a8, a2, 3	/* last byte of the word */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24	/* big-endian: first byte -> bits 24..31 */
#else
	slli	a8, a8, 24	/* little-endian: last byte -> bits 24..31 */
#endif
	slli	a7, a7, 8	/* middle halfword -> bits 8..23 */
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop3	/* unsigned address compare */
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8	/* big-endian: first byte is the high byte */
#else
	slli	a7, a7, 8	/* little-endian: second byte is the high byte */
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */
173
174
175
/*
 * Copy while checksumming, otherwise like csum_partial.
 *
 * SRC() and DST() wrap one memory-access instruction each.  The wrapped
 * instruction is tagged with the local label 9999 and an entry
 * (address of the instruction, address of the handler) is emitted into
 * the __ex_table section:
 *    SRC() pairs the instruction with label 6001 (source access)
 *    DST() pairs the instruction with label 6002 (destination access)
 * so that each access type gets its own exception handler.
 */

#define SRC(insn...)			\
	9999: insn;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(insn...)			\
	9999: insn;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous
194
/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)

    Copies len bytes from src to dst and adds them into the ones-complement
    running sum, which is returned in a2.

	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp (loop count, or end address of the source chunk
	      when XCHAL_HAVE_LOOPS is 0)
	a11 = original len for exception handling
	a12 = original dst for exception handling

    Every load is wrapped in SRC() and every store in DST(), which emit
    exception-table entries pointing at the fixup labels 6001 and 6002.

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.

    When XCHAL_HAVE_LOOPS is 0 the loops run until a2 reaches an end
    address.  That comparison uses bltu (unsigned): a signed blt would
    end a loop early if the range straddled address 0x80000000.
 */

ENTRY(csum_partial_copy_generic)
	entry	sp, 32
	mov	a12, a3		/* keep original dst for the fixup code */
	mov	a11, a4		/* keep original len for the fixup code */
	or	a10, a2, a3	/* low bits set if either address is unaligned */

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop5	/* unsigned address compare */
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop6	/* unsigned address compare */
#endif
3:
	/*
	Control comes to here in two cases: (1) It may fall through
	to here from the 4-byte alignment case to process, at most,
	one 2-byte chunk.  (2) It branches to here from above if
	either src or dst is 2-byte aligned, and we process all bytes
	here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop7	/* unsigned address compare */
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5		/* return the accumulated sum */
	retw

5:
	/* Control branch to here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop8	/* unsigned address compare */
#endif
6:
	j	4b		/* process the possible trailing odd byte */
370
371
/*
 * Exception handlers for csum_partial_copy_generic.
 *
 * 6001 is the exception-table target for instructions wrapped in SRC(),
 * 6002 for instructions wrapped in DST().  Both store -EFAULT through the
 * matching error pointer and return 0 in a2.  6001 additionally zeroes
 * the whole destination buffer first.
 */
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	/*
	 * Clear the complete destination - computing the rest
	 * is too much work.
	 */
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	/*
	 * Unsigned compare: a12/a11 are addresses, and a signed blt would
	 * stop early if the buffer straddled address 0x80000000.
	 */
	bltu	a12, a11, .Leloop
#endif
2:
	retw			/* a2 is still 0 here */

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0		/* return value */
	retw

.previous
410
411