xref: /freebsd/sys/arm/arm/in_cksum_arm.S (revision 685dc743dc3b5645e34836464128e1c0558b404b)
1/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/
2
3/*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 *
37 */
38
39/*
40 * Hand-optimised in_cksum() and do_cksum() implementations for ARM/armv5e
41 */
42
43#include "opt_inet.h"
44
45#include <machine/asm.h>
46#include "assym.inc"
47	.syntax	unified
48/*
49 * int in_cksum(struct mbuf *m, int len)
50 *
51 * Entry:
52 *	r0	m
53 *	r1	len
54 *
55 * NOTE: Assumes 'm' is *never* NULL.
56 */
57/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
/*
 * Walks the mbuf chain starting at r0, sums at most r1 bytes of data and
 * returns the folded, complemented 16-bit checksum in r0.
 *
 * Register use:
 *	r8	running 32-bit sum, end-around carry already folded in
 *	r9	bytes of 'len' still to be summed
 *	r10	bytes consumed from earlier mbufs (offset within the stream)
 *	r11	r10 ^ data pointer; bit 0 set means this mbuf's data starts
 *		on the opposite byte parity to its offset within the stream
 *	ip	next mbuf in the chain
 *
 * The only loop exit is ip == NULL, so the whole chain is always walked;
 * once r9 reaches zero each further mbuf with a non-negative length
 * contributes a zero-length chunk and the call to L_cksumdata is skipped.
 *
 * M_LEN, M_DATA and M_NEXT are not defined in this file; presumably they
 * are struct mbuf field offsets supplied by assym.inc.
 */
58ENTRY(in_cksum)
59	stmfd	sp!, {r4-r11,lr}	/* r4-r7: clobbered by L_cksumdata; r8-r11: loop state */
60	mov	r8, #0x00		/* r8 = running sum */
61	mov	r9, r1			/* r9 = bytes left to sum */
62	mov	r10, #0x00		/* r10 = bytes summed so far */
63	mov	ip, r0			/* ip = current mbuf */
64
65.Lin_cksum_loop:
66	ldr	r1, [ip, #(M_LEN)]	/* r1 = length of this mbuf's data */
67	ldr	r0, [ip, #(M_DATA)]	/* r0 = pointer to this mbuf's data */
68	ldr	ip, [ip, #(M_NEXT)]	/* advance now; ip is not touched by L_cksumdata */
69.Lin_cksum_entry4:			/* not referenced anywhere in the visible source */
70	cmp	r9, r1
71	movlt	r1, r9			/* r1 = min(remaining, mbuf length), signed compare */
72	sub	r9, r9, r1
73	eor	r11, r10, r0		/* parity check uses the offset *before* this chunk */
74	add	r10, r10, r1
75	adds	r2, r1, #0x00		/* r2 = chunk length; Z set (and r2 == 0) if empty */
76	blne	_ASM_LABEL(L_cksumdata)	/* non-empty chunk: r2 = its 32-bit sum */
77	tst	r11, #0x01
78	movne	r2, r2, ror #8		/* parity mismatch: swap the byte lanes of the partial sum */
79	adds	r8, r8, r2
80	adc	r8, r8, #0x00		/* end-around carry */
81	cmp	ip, #0x00
82	bne	.Lin_cksum_loop
83
84	mov	r1, #0xff
85	orr	r1, r1, #0xff00		/* r1 = 0xffff */
86	and	r0, r8, r1		/* low halfword ... */
87	add	r0, r0, r8, lsr #16	/* ... plus high halfword */
88	add	r0, r0, r0, lsr #16	/* add back the carry out of bit 15 */
89	and	r0, r0, r1
90	eor	r0, r0, r1		/* one's complement within 16 bits */
91	ldmfd	sp!, {r4-r11,pc}
92END(in_cksum)
93
/*
 * do_cksum: sum a single flat buffer.
 *
 * Entry:
 *	r0	Pointer to buffer
 *	r1	Buffer length in bytes
 *
 * Returns:
 *	r0	The unfolded 32-bit sum that L_cksumdata leaves in r2, or 0
 *		for a zero-length buffer
 *
 * L_cksumdata always loads at least one byte, so a zero length must be
 * filtered out before calling it.  The same adds/blne guard is used by
 * in_cksum: when the length is zero the call is skipped and r2 already
 * holds the correct result (0).
 */
94ENTRY(do_cksum)
95	stmfd	sp!, {r4-r7, lr}	/* L_cksumdata clobbers r4-r7 */
	adds	r2, r1, #0x00		/* r2 = len; Z set when there is nothing to sum */
96	blne	_ASM_LABEL(L_cksumdata)	/* skipped for len == 0, leaving r2 == 0 */
97	mov	r0, r2			/* hand the sum back in the return register */
98	ldmfd	sp!, {r4-r7, pc}
99END(do_cksum)
100
101/*
102 * The main in*_cksum() workhorse...
103 *
104 * Entry parameters:
105 *	r0	Pointer to buffer
106 *	r1	Buffer length
107 *	lr	Return address
108 *
109 * Returns:
110 *	r2	Accumulated 32-bit sum
111 *
112 * Clobbers:
113 *	r0-r7
114 */
115/* LINTSTUB: Ignore */
/*
 * Sums the r1 bytes at r0 into r2 with 32-bit adds, folding each carry
 * back into the sum.  The result is NOT reduced to 16 bits.
 *
 * The byte-wise head and tail code puts a byte from an even address into
 * the low byte of a halfword and a byte from an odd address into the high
 * byte.  That matches what the word loads produce on a little-endian
 * target (assumption: little-endian only -- no big-endian variant is
 * present here).  A caller whose buffer sits at the other parity within
 * its stream has to swap the lanes itself, as in_cksum does with ror #8.
 *
 * NOTE: the endgame always loads at least one byte, so this routine does
 * not handle r1 == 0; in_cksum skips the call in that case.
 */
116ASENTRY_NP(L_cksumdata)
117	pld	[r0]			/* Pre-fetch the start of the buffer */
118	mov	r2, #0			/* r2 = running sum */
119
120	/* We first have to word-align the buffer.  */
121	ands	r7, r0, #0x03		/* r7 = offset within the word; Z if aligned */
122	beq	.Lcksumdata_wordaligned
123	rsb	r7, r7, #0x04		/* r7 = bytes needed to reach a word boundary (1-3) */
124	cmp	r1, r7			/* Enough bytes left to make it? */
125	blt	.Lcksumdata_endgame	/* No: the whole buffer is handled byte-wise */
126	cmp	r7, #0x02		/* These flags stay live down to the orrne below */
127	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
128	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
129	movlt	r5, #0x00
130	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
131	movle	r6, #0x00
132
133	/* Combine the three bytes depending on endianness and alignment */
134	orreq	r2, r4, r5, lsl #8	/* r7 == 2: head started on an even address */
135	orreq	r2, r2, r6, lsl #16
136	orrne	r2, r5, r4, lsl #8	/* r7 == 1 or 3: head started on an odd address */
137	orrne	r2, r2, r6, lsl #24
138	subs	r1, r1, r7		/* Update length */
139	RETeq				/* All done? */
140
141	/* Buffer is now word aligned */
142.Lcksumdata_wordaligned:
143	cmp	r1, #0x04		/* Less than 4 bytes left? */
144	blt	.Lcksumdata_endgame	/* Yup */
145
146	/* Now quad-align, if necessary */
147	ands	r7, r0, #0x04		/* r7 = 0 if already 8-byte aligned ... */
148	ldrne	r7, [r0], #0x04		/* ... else r7 = the odd word, summed later */
149	subne	r1, r1, #0x04
150	subs	r1, r1, #0x40		/* Bias r1 by -64; negative if no full 64-byte block */
151	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */
152
153	/*
154	 * Buffer is now quad aligned. Sum 64 bytes at a time.
155	 * Note: First ldrd is hoisted above the loop, together with
156	 * setting r6 to zero to avoid stalling for results in the
157	 * loop. (r7 is live, from above).
158	 */
159	ldrd	r4, [r0], #0x08
160	mov	r6, #0x00
161.Lcksumdata_bigloop:
162	pld	[r0, #0x18]
163	adds	r2, r2, r6		/* r6/r7: pre-loop values, or last pair of the previous pass */
164	adcs	r2, r2, r7
165	ldrd	r6, [r0], #0x08
166	adcs	r2, r2, r4
167	adcs	r2, r2, r5
168	ldrd	r4, [r0], #0x08
169	adcs	r2, r2, r6
170	adcs	r2, r2, r7
171	ldrd	r6, [r0], #0x08
172	adcs	r2, r2, r4
173	adcs	r2, r2, r5
174	ldrd	r4, [r0], #0x08
175	adcs	r2, r2, r6
176	adcs	r2, r2, r7
177	pld	[r0, #0x18]
178	ldrd	r6, [r0], #0x08
179	adcs	r2, r2, r4
180	adcs	r2, r2, r5
181	ldrd	r4, [r0], #0x08
182	adcs	r2, r2, r6
183	adcs	r2, r2, r7
184	ldrd	r6, [r0], #0x08
185	adcs	r2, r2, r4
186	adcs	r2, r2, r5
187	adc	r2, r2, #0x00		/* Fold the carry; the last r6/r7 pair is still unsummed */
188	subs	r1, r1, #0x40		/* Another full 64 bytes available? */
189	ldrdge	r4, [r0], #0x08		/* Yes: preload the first pair of the next pass */
190	bge	.Lcksumdata_bigloop
191
192	adds	r2, r2, r6		/* r6/r7 still need summing */
193.Lcksumdata_bigloop_end:
194	adcs	r2, r2, r7		/* C from the adds above, or clear when branched here */
195	adc	r2, r2, #0x00
196
197	adds	r1, r1, #0x40		/* Undo the -64 bias; Z set if nothing is left */
198	RETeq
199	cmp	r1, #0x20
200
201	ldrdge	r4, [r0], #0x08		/* Avoid stalling pld and result */
202	blt	.Lcksumdata_less_than_32
203	pld	[r0, #0x18]
204	ldrd	r6, [r0], #0x08
205	adds	r2, r2, r4
206	adcs	r2, r2, r5
207	ldrd	r4, [r0], #0x08
208	adcs	r2, r2, r6
209	adcs	r2, r2, r7
210	ldrd	r6, [r0], #0x08
211	adcs	r2, r2, r4
212	adcs	r2, r2, r5
213	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
214	adcs	r2, r2, r7
215	adc	r2, r2, #0x00
216	subs	r1, r1, #0x20
217	RETeq
218
219.Lcksumdata_less_than_32:
220	/* There are less than 32 bytes left */
221	and	r3, r1, #0x18		/* r3 = bytes held in whole 8-byte pairs (0, 8, 16 or 24) */
222	rsb	r4, r3, #0x18		/* r4 = 8 bytes for every pair that is NOT present */
223	sub	r1, r1, r3		/* r1 = bytes left once the pairs are done */
224	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
225	addne	pc, pc, r4		/* r4 is now 12 per missing pair = one 3-instruction group below; assumes 4-byte instructions and pc reading as this instruction + 8 -- TODO confirm ARM state only */
226	nop				/* Presumably padding so the pc value read above is the first ldmia */
227
228/*
229 * Note: We use ldm here, even on armv5e, since the combined issue/result
230 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
231 */
232	/* At least 24 bytes remaining... */
233	ldmia	r0!, {r4, r5}
234	adcs	r2, r2, r4
235	adcs	r2, r2, r5
236
237	/* At least 16 bytes remaining... */
238	ldmia	r0!, {r4, r5}
239	adcs	r2, r2, r4
240	adcs	r2, r2, r5
241
242	/* At least 8 bytes remaining... */
243	ldmia	r0!, {r4, r5}
244	adcs	r2, r2, r4
245	adcs	r2, r2, r5
246
247	/* Less than 8 bytes remaining... */
248	adc	r2, r2, #0x00
249	subs	r1, r1, #0x04		/* Bias by -4; negative if no whole word is left */
250	blt	.Lcksumdata_lessthan4
251
252	ldr	r4, [r0], #0x04
253	sub	r1, r1, #0x04		/* With the bias above and its undo below this nets r1 -= 4 */
254	adds	r2, r2, r4
255	adc	r2, r2, #0x00
256
257	/* Deal with < 4 bytes remaining */
258.Lcksumdata_lessthan4:
259	adds	r1, r1, #0x04		/* Undo the -4 bias; Z set if nothing is left */
260	RETeq
261
262	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
263.Lcksumdata_endgame:
264	ldrb	r3, [r0]		/* Fetch first byte */
265	cmp	r1, #0x02
266	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
267	movlt	r4, #0x00
268	ldrbgt	r5, [r0, #0x02]
269	movle	r5, #0x00
270	/* Combine the three bytes depending on endianness and alignment */
271	tst	r0, #0x01
272	orreq	r3, r3, r4, lsl #8	/* Even address: 1st byte goes in the low lane */
273	orreq	r3, r3, r5, lsl #16
274	orrne	r3, r4, r3, lsl #8	/* Odd address: 1st byte goes in the high lane */
275	orrne	r3, r3, r5, lsl #24
276	adds	r2, r2, r3
277	adc	r2, r2, #0x00
278	RET
279END(L_cksumdata)
280
281