xref: /titanic_52/usr/src/uts/sun4/ml/ip_ocsum.s (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1*7c478bd9Sstevel@tonic-gate/*
2*7c478bd9Sstevel@tonic-gate * CDDL HEADER START
3*7c478bd9Sstevel@tonic-gate *
4*7c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5*7c478bd9Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only
6*7c478bd9Sstevel@tonic-gate * (the "License").  You may not use this file except in compliance
7*7c478bd9Sstevel@tonic-gate * with the License.
8*7c478bd9Sstevel@tonic-gate *
9*7c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*7c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
11*7c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
12*7c478bd9Sstevel@tonic-gate * and limitations under the License.
13*7c478bd9Sstevel@tonic-gate *
14*7c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
15*7c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*7c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
17*7c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
18*7c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
19*7c478bd9Sstevel@tonic-gate *
20*7c478bd9Sstevel@tonic-gate * CDDL HEADER END
21*7c478bd9Sstevel@tonic-gate */
22*7c478bd9Sstevel@tonic-gate/*
23*7c478bd9Sstevel@tonic-gate * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*7c478bd9Sstevel@tonic-gate * Use is subject to license terms.
25*7c478bd9Sstevel@tonic-gate */
26*7c478bd9Sstevel@tonic-gate
27*7c478bd9Sstevel@tonic-gate#pragma ident	"%Z%%M%	%I%	%E% SMI"
28*7c478bd9Sstevel@tonic-gate
29*7c478bd9Sstevel@tonic-gate#include <sys/param.h>
30*7c478bd9Sstevel@tonic-gate#include <sys/errno.h>
31*7c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
32*7c478bd9Sstevel@tonic-gate#include <sys/vtrace.h>
33*7c478bd9Sstevel@tonic-gate#include <sys/machthread.h>
34*7c478bd9Sstevel@tonic-gate#include <sys/machparam.h>
35*7c478bd9Sstevel@tonic-gate
36*7c478bd9Sstevel@tonic-gate#if defined(lint)
37*7c478bd9Sstevel@tonic-gate#include <sys/types.h>
38*7c478bd9Sstevel@tonic-gate#else	/* lint */
39*7c478bd9Sstevel@tonic-gate#include "assym.h"
40*7c478bd9Sstevel@tonic-gate#endif	/* lint */
41*7c478bd9Sstevel@tonic-gate
42*7c478bd9Sstevel@tonic-gate/*
43*7c478bd9Sstevel@tonic-gate * Prefetch considerations
44*7c478bd9Sstevel@tonic-gate *
45*7c478bd9Sstevel@tonic-gate * We prefetch one cacheline ahead.  This may not be enough on Serengeti
46*7c478bd9Sstevel@tonic-gate * systems - see default_copyout() etc which prefetch 5 lines ahead.
47*7c478bd9Sstevel@tonic-gate * On the other hand, we expect most of the source buffers to be
48*7c478bd9Sstevel@tonic-gate * recently used enough to be cached.
49*7c478bd9Sstevel@tonic-gate *
50*7c478bd9Sstevel@tonic-gate * On US-I the prefetches are inoperative.  On US-II they preload the E$;
51*7c478bd9Sstevel@tonic-gate * the mainloop unrolling and load-buffer should cover loads from E$.
52*7c478bd9Sstevel@tonic-gate * The stores appear to be the slow point on US-II.
53*7c478bd9Sstevel@tonic-gate *
54*7c478bd9Sstevel@tonic-gate * On US-IIICu the prefetch preloads the L2$ too, but there is no load
55*7c478bd9Sstevel@tonic-gate * buffer so the loads will stall for D$ miss, L2$ hit.  The hardware
56*7c478bd9Sstevel@tonic-gate * auto-prefetch is not activated by integer loads.  No solution
57*7c478bd9Sstevel@tonic-gate * in sight for this, barring odd games with FP read, write, integer read.
58*7c478bd9Sstevel@tonic-gate *
59*7c478bd9Sstevel@tonic-gate * US-IV+ (Panther) appears similar to US-IIICu, except that a strong
60*7c478bd9Sstevel@tonic-gate * variant of prefetch is available which can take TLB traps.  We don't
61*7c478bd9Sstevel@tonic-gate * use this.  The h/w prefetch stride can be set to 64, 128 or 192,
62*7c478bd9Sstevel@tonic-gate * and they only reach to the L2$ (we don't use these either).
63*7c478bd9Sstevel@tonic-gate * L2$ load-to-use latency is 15 cycles (best).
64*7c478bd9Sstevel@tonic-gate */
65*7c478bd9Sstevel@tonic-gate
66*7c478bd9Sstevel@tonic-gate
67*7c478bd9Sstevel@tonic-gate/*
68*7c478bd9Sstevel@tonic-gate * ip_ocsum(address, halfword_count, sum)
69*7c478bd9Sstevel@tonic-gate * Do a 16 bit one's complement sum of a given number of (16-bit)
70*7c478bd9Sstevel@tonic-gate * halfwords. The halfword pointer must not be odd.
71*7c478bd9Sstevel@tonic-gate *	%o0 address; %o1 count; %o2 sum accumulator; %o3-%o5 temps
72*7c478bd9Sstevel@tonic-gate *	%g1, %g4, %g5 masks; %g2 temp, and %g2/%g3 used in main loop
73*7c478bd9Sstevel@tonic-gate *
74*7c478bd9Sstevel@tonic-gate * (from @(#)ocsum.s 1.3 89/02/24 SMI)
75*7c478bd9Sstevel@tonic-gate *
76*7c478bd9Sstevel@tonic-gate */
77*7c478bd9Sstevel@tonic-gate
78*7c478bd9Sstevel@tonic-gate#if defined(lint)
79*7c478bd9Sstevel@tonic-gate
/*
 * lint-only C stand-in for the assembly routine of the same name.  It is
 * seen by the compiler only when lint is defined, ignores its arguments
 * and always returns 0.
 */
80*7c478bd9Sstevel@tonic-gate/* ARGSUSED */
81*7c478bd9Sstevel@tonic-gateunsigned int
82*7c478bd9Sstevel@tonic-gateip_ocsum(u_short *address, int halfword_count, unsigned int sum)
83*7c478bd9Sstevel@tonic-gate{ return (0); }
84*7c478bd9Sstevel@tonic-gate
85*7c478bd9Sstevel@tonic-gate#else	/* lint */
86*7c478bd9Sstevel@tonic-gate
/*
 * ip_ocsum: common entry point and short-block path.
 *
 * In:	%o0 = buffer address, %o1 = count of 16-bit halfwords,
 *	%o2 = incoming sum.
 * Out:	%o0 = the accumulator folded down by three add-the-halves steps
 *	(hi32 + lo32, then hi17 + lo16, then hi2 + lo16).
 *
 * The routine first builds the masks %g5 (all ones), %g1 (low 32 ones)
 * and %g4 (low 16 ones), then dispatches on the count:
 *   - 32 or more halfwords: branch to ip_ocsum_long, passing the address
 *     rounded down to a dword in %o5.
 *   - fewer than 4 halfwords: .tiny adds them one at a time with lduh.
 *   - otherwise: a masked leading dword, whole dwords in .short_dw, and a
 *     masked trailing dword in .short_hw for the last 1-3 halfwords.
 *
 * The masked leading and trailing accesses are full 8-byte ldx loads, so
 * they may read bytes of the containing aligned dword that lie outside
 * the buffer.  Those bytes are masked off before being summed.
 *
 * This path takes no register window and returns with retl.  It uses
 * %o3, %o4, %o5, %g1, %g2, %g4 and %g5 as scratch.
 */
87*7c478bd9Sstevel@tonic-gate	ENTRY(ip_ocsum)
88*7c478bd9Sstevel@tonic-gate
89*7c478bd9Sstevel@tonic-gate/*
90*7c478bd9Sstevel@tonic-gate * On ttcp transmits, called once per ocsum_copyin but with a small
91*7c478bd9Sstevel@tonic-gate * block ( >99.9% ).  Could be the tx hdrs?  How many acks/seg are we rxing?
92*7c478bd9Sstevel@tonic-gate * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
93*7c478bd9Sstevel@tonic-gate * and tx acks?
94*7c478bd9Sstevel@tonic-gate *
95*7c478bd9Sstevel@tonic-gate * To do: telnet and nfs traffic
96*7c478bd9Sstevel@tonic-gate *
97*7c478bd9Sstevel@tonic-gate * On an NCA'd webserver about 10% of the calls are >64 bytes
98*7c478bd9Sstevel@tonic-gate *	about 10% of those start on a 64byte boundary
99*7c478bd9Sstevel@tonic-gate *	about 30% are >5*64 bytes.
100*7c478bd9Sstevel@tonic-gate * The NCA numbers & proportions don't change with h/w cksum on.
101*7c478bd9Sstevel@tonic-gate *
102*7c478bd9Sstevel@tonic-gate * Tx hdrs are likely to be already in cache.
103*7c478bd9Sstevel@tonic-gate * Rx hdrs depends if already inspected.
104*7c478bd9Sstevel@tonic-gate */
105*7c478bd9Sstevel@tonic-gate
106*7c478bd9Sstevel@tonic-gate	!
107*7c478bd9Sstevel@tonic-gate	! Entry point for checksum-only.
108*7c478bd9Sstevel@tonic-gate	! %o0 contains buffer address
109*7c478bd9Sstevel@tonic-gate	! %o1 contains count of 16bit words
110*7c478bd9Sstevel@tonic-gate	! %o2 contains sum
111*7c478bd9Sstevel@tonic-gate	!
112*7c478bd9Sstevel@tonic-gate	! %o3 temporary
113*7c478bd9Sstevel@tonic-gate	! %o4 temporary
114*7c478bd9Sstevel@tonic-gate	! %g1 32bit mask
115*7c478bd9Sstevel@tonic-gate	! %g4 16bit mask
116*7c478bd9Sstevel@tonic-gate	! %g5 64bit mask (all 1s)
117*7c478bd9Sstevel@tonic-gate	!
118*7c478bd9Sstevel@tonic-gate	not	%g0, %g5	! all 1's
119*7c478bd9Sstevel@tonic-gate	prefetch [%o0], #n_reads	! first hword, dword, cacheline
120*7c478bd9Sstevel@tonic-gate
121*7c478bd9Sstevel@tonic-gate	clruw	%g5, %g1	! 32 1's at low end
122*7c478bd9Sstevel@tonic-gate	srl	%g5, 16, %g4	! 16 1's at low end
123*7c478bd9Sstevel@tonic-gate
/*
 * Dispatch on the halfword count: 32 or more goes to ip_ocsum_long,
 * fewer than 4 goes to .tiny, anything else falls into the short path.
 * The two delay slots prepare %o5: first the dword-aligned base address,
 * then that address advanced by 8.
 */
124*7c478bd9Sstevel@tonic-gate	cmp	%o1, 32		! at least a cacheline (64 bytes)?
125*7c478bd9Sstevel@tonic-gate	bge,pn %icc, ip_ocsum_long	! yes, do the whole works
126*7c478bd9Sstevel@tonic-gate	andn	%o0, 7, %o5	! delay: base src addr
127*7c478bd9Sstevel@tonic-gate
128*7c478bd9Sstevel@tonic-gate
129*7c478bd9Sstevel@tonic-gate	cmp	%o1, 4		! < 4 halfwords?
130*7c478bd9Sstevel@tonic-gate	bl,pn	%icc, .tiny	! < 4 halfwords, just do them
131*7c478bd9Sstevel@tonic-gate	inc	8, %o5		! delay: next addr (no matter for .tiny)
132*7c478bd9Sstevel@tonic-gate
133*7c478bd9Sstevel@tonic-gate	/* leading dword with 1-4 hwords: 9 clocks */
134*7c478bd9Sstevel@tonic-gate	/* Assumes ok to read the entire dword with the leading hwords */
135*7c478bd9Sstevel@tonic-gate
136*7c478bd9Sstevel@tonic-gate	ldx	[%o5-8], %o3	! NB base addr
137*7c478bd9Sstevel@tonic-gate	sub	%o5, %o0, %g2	! byte count: 2/4/6/8
138*7c478bd9Sstevel@tonic-gate	mov	%o5, %o0
139*7c478bd9Sstevel@tonic-gate
140*7c478bd9Sstevel@tonic-gate	sll	%g2, 2, %g2	! 8/16/24/32 for mask
141*7c478bd9Sstevel@tonic-gate
/*
 * Build the leading mask with two shifts of %g2 bits each.  %g2 is at
 * most 32, so when all 8 bytes are wanted the two shifts together move
 * the ones out completely and the mask is zero.
 */
142*7c478bd9Sstevel@tonic-gate	sllx	%g5, %g2, %o5
143*7c478bd9Sstevel@tonic-gate
144*7c478bd9Sstevel@tonic-gate	sllx	%o5, %g2, %o5	! mask: 16/32/48/64 0's at low end
145*7c478bd9Sstevel@tonic-gate
146*7c478bd9Sstevel@tonic-gate	srl	%g2, 3, %g2	! hw count
147*7c478bd9Sstevel@tonic-gate	andn	%o3, %o5, %o3	! select hw's from src
148*7c478bd9Sstevel@tonic-gate
149*7c478bd9Sstevel@tonic-gate	srlx	%o3, 32, %o4	! hi32
150*7c478bd9Sstevel@tonic-gate	b	9f
151*7c478bd9Sstevel@tonic-gate	sub	%o1, %g2, %o1	! delay: decr count, 1-4 halfwords
152*7c478bd9Sstevel@tonic-gate
/*
 * Whole-dword loop.  The leading-dword code above enters at label 9 with
 * %o3 and %o4 already set, skipping this loop's ldx, pointer bump and
 * count update.  andncc tests count & ~3, so the loop repeats while 4 or
 * more halfwords remain.
 */
153*7c478bd9Sstevel@tonic-gate.short_dw:			! max 7 iters of 4 clocks; 1 mispred of 4
154*7c478bd9Sstevel@tonic-gate	ldx	[%o0], %o3	! tmp64 = *src++ (groups with the branch)
155*7c478bd9Sstevel@tonic-gate
156*7c478bd9Sstevel@tonic-gate	inc	8, %o0		! (D-cache load-use delay)
157*7c478bd9Sstevel@tonic-gate	dec	4, %o1		! decrement count, 4 halfwords
158*7c478bd9Sstevel@tonic-gate
159*7c478bd9Sstevel@tonic-gate	srlx	%o3, 32, %o4	! hi32
160*7c478bd9Sstevel@tonic-gate9:	and	%o3, %g1, %o3	! lo32
161*7c478bd9Sstevel@tonic-gate
162*7c478bd9Sstevel@tonic-gate	add	%o4, %o2, %o2	! accumulator
163*7c478bd9Sstevel@tonic-gate	andncc	%o1, 3, %g0	! more than 3 hwords left?
164*7c478bd9Sstevel@tonic-gate
165*7c478bd9Sstevel@tonic-gate	bnz,pt %icc, .short_dw
166*7c478bd9Sstevel@tonic-gate	add	%o3, %o2, %o2	! accumulator
167*7c478bd9Sstevel@tonic-gate
/*
 * 0-3 halfwords remain.  For 1-3, load the whole dword at %o0 and keep
 * only its high-order 16/32/48 bits (on a big-endian load these are the
 * halfwords at the lowest addresses).  The delay slot of bz,a is annulled
 * when the branch is not taken, so that srlx runs only on the way to
 * .short_fold.
 */
168*7c478bd9Sstevel@tonic-gate.short_hw:			! trailing dw: 0-3 hwords
169*7c478bd9Sstevel@tonic-gate	tst	%o1		! 0 seems fairly common...
170*7c478bd9Sstevel@tonic-gate	bz,a	.short_fold
171*7c478bd9Sstevel@tonic-gate	srlx	%o2, 32, %o4	! delay: hi32
172*7c478bd9Sstevel@tonic-gate				! mispredict 4 + 7 clocks for 1-3
173*7c478bd9Sstevel@tonic-gate	ldx	[%o0], %o3
174*7c478bd9Sstevel@tonic-gate	sll	%o1, 4, %o1	! bitcount: 16/32/48
175*7c478bd9Sstevel@tonic-gate
176*7c478bd9Sstevel@tonic-gate	srlx	%g5, %o1, %o5	! mask: 16/32/48  0's at high end
177*7c478bd9Sstevel@tonic-gate
178*7c478bd9Sstevel@tonic-gate	andn	%o3, %o5, %o3	! select hw's from src
179*7c478bd9Sstevel@tonic-gate
180*7c478bd9Sstevel@tonic-gate	srlx	%o3, 32, %o4	! hi32
181*7c478bd9Sstevel@tonic-gate	and	%o3, %g1, %o3	! lo32
182*7c478bd9Sstevel@tonic-gate
183*7c478bd9Sstevel@tonic-gate	add	%o4, %o2, %o2	! accumulator
184*7c478bd9Sstevel@tonic-gate
185*7c478bd9Sstevel@tonic-gate	add	%o3, %o2, %o2	! accumulator
186*7c478bd9Sstevel@tonic-gate
187*7c478bd9Sstevel@tonic-gate	! at this point the 64-bit accumulator
188*7c478bd9Sstevel@tonic-gate	! has the result that needs to be returned in 16-bits
189*7c478bd9Sstevel@tonic-gate	srlx	%o2, 32, %o4	! hi32
190*7c478bd9Sstevel@tonic-gate.short_fold:
191*7c478bd9Sstevel@tonic-gate	and	%o2, %g1, %o2	! lo32
192*7c478bd9Sstevel@tonic-gate
193*7c478bd9Sstevel@tonic-gate	add	%o4, %o2, %o2	! 33b
194*7c478bd9Sstevel@tonic-gate
195*7c478bd9Sstevel@tonic-gate	srlx	%o2, 16, %o3	! hi17
196*7c478bd9Sstevel@tonic-gate	and	%o2, %g4, %o2	! lo16
197*7c478bd9Sstevel@tonic-gate
198*7c478bd9Sstevel@tonic-gate	add	%o3, %o2, %o2	! 18b
199*7c478bd9Sstevel@tonic-gate
200*7c478bd9Sstevel@tonic-gate	srlx	%o2, 16, %o3	! hi2
201*7c478bd9Sstevel@tonic-gate	and	%o2, %g4, %o2	! lo16
202*7c478bd9Sstevel@tonic-gate
203*7c478bd9Sstevel@tonic-gate	retl			! return
204*7c478bd9Sstevel@tonic-gate	add	%o3, %o2, %o0	! 16b result in %o0
205*7c478bd9Sstevel@tonic-gate
206*7c478bd9Sstevel@tonic-gate.tiny:				! almost never: less than 4 halfwords total.
207*7c478bd9Sstevel@tonic-gate	tst	%o1
208*7c478bd9Sstevel@tonic-gate	bz,a	.short_fold
209*7c478bd9Sstevel@tonic-gate
210*7c478bd9Sstevel@tonic-gate	srlx	%o2, 32, %o4	! delay: hi32
211*7c478bd9Sstevel@tonic-gate
212*7c478bd9Sstevel@tonic-gate	lduh	[%o0], %o3	! tmp16 = *src++
213*7c478bd9Sstevel@tonic-gate1:
214*7c478bd9Sstevel@tonic-gate	inc	2, %o0
215*7c478bd9Sstevel@tonic-gate				! stall for D-cache
216*7c478bd9Sstevel@tonic-gate
217*7c478bd9Sstevel@tonic-gate	add	%o3, %o2, %o2	! accumulator
218*7c478bd9Sstevel@tonic-gate
219*7c478bd9Sstevel@tonic-gate	deccc	%o1		! decrement count
220*7c478bd9Sstevel@tonic-gate	bnz,a,pt %icc, 1b
221*7c478bd9Sstevel@tonic-gate	lduh	[%o0], %o3	! tmp16 = *src++
222*7c478bd9Sstevel@tonic-gate
223*7c478bd9Sstevel@tonic-gate	! at this point the 64-bit accumulator
224*7c478bd9Sstevel@tonic-gate	! has the result that needs to be returned in 16-bits
225*7c478bd9Sstevel@tonic-gate	b	.short_fold
226*7c478bd9Sstevel@tonic-gate	srlx	%o2, 32, %o4	! hi32
227*7c478bd9Sstevel@tonic-gate
228*7c478bd9Sstevel@tonic-gate	SET_SIZE(ip_ocsum)	! 64-bit version
229*7c478bd9Sstevel@tonic-gate
230*7c478bd9Sstevel@tonic-gate
/*
 * ip_ocsum_long: large-block path (32 or more halfwords).
 *
 * In this file it is reached by the bge in ip_ocsum, not by a call.  The
 * save below turns that routine's %o0, %o1, %o2 and %o5 into %i0, %i1,
 * %i2 and %i5.  The code reads %i5 (dword-aligned base address) and the
 * masks %g1, %g4 and %g5 without setting them, so it depends on ip_ocsum
 * having prepared them.
 *
 * Phases:
 *   1. If the address is not 64-byte aligned: one masked leading dword,
 *      then whole dwords in .dw until it is.
 *   2. .mainsection: 64 bytes per iteration, with the loads for the next
 *      64 bytes overlapped with the additions for the current ones.
 *      .looptidy completes the additions for the final 64 bytes.
 *   3. .postamble: remaining whole dwords, then single halfwords loaded
 *      with lduh (no masked trailing dword on this path).
 *   4. .fold: fold the accumulator and leave the result in %i0, which
 *      the restore makes the caller's %o0.
 *
 * The prefetch that follows each 64-byte advance addresses the next
 * line, which may lie beyond the end of the buffer.
 *
 * NOTE(review): the register list below mentions %g6, but no instruction
 * in this routine references %g6.
 */
231*7c478bd9Sstevel@tonic-gate	ENTRY(ip_ocsum_long)	! 64-bit, large blocks
232*7c478bd9Sstevel@tonic-gate	save	%sp, -SA(MINFRAME), %sp	! get another window
233*7c478bd9Sstevel@tonic-gate	!
234*7c478bd9Sstevel@tonic-gate	! %i0 contains buffer address
235*7c478bd9Sstevel@tonic-gate	! %i1 contains count of 16bit words
236*7c478bd9Sstevel@tonic-gate	! %i2 contains sum
237*7c478bd9Sstevel@tonic-gate	! %i4 contains the mainloop count
238*7c478bd9Sstevel@tonic-gate	! %i5 comes in with the buffer address rounded down to the first dword
239*7c478bd9Sstevel@tonic-gate	!
240*7c478bd9Sstevel@tonic-gate	! %g1 32bit mask
241*7c478bd9Sstevel@tonic-gate	! %g4 16bit mask
242*7c478bd9Sstevel@tonic-gate	! %g5 64bit mask (all 1s)
243*7c478bd9Sstevel@tonic-gate	! %g6 fetch-ahead offset for Ecache
244*7c478bd9Sstevel@tonic-gate	!
245*7c478bd9Sstevel@tonic-gate	! %l0-7,%o0-5,%g2-3 mainloop temporaries
246*7c478bd9Sstevel@tonic-gate	!
247*7c478bd9Sstevel@tonic-gate	!
248*7c478bd9Sstevel@tonic-gate				! 1 clock overhead
249*7c478bd9Sstevel@tonic-gate	btst	63, %i0		! src 64-byte aligned?
250*7c478bd9Sstevel@tonic-gate	bz,a,pt	%icc, .mainsection	! aligned blocks are fairly common
251*7c478bd9Sstevel@tonic-gate	andncc	%i1, 31, %i4	! at least 64 bytes for main loop?
252*7c478bd9Sstevel@tonic-gate
253*7c478bd9Sstevel@tonic-gate
254*7c478bd9Sstevel@tonic-gate	! Leading dword, with 1-4 hwords: 9 clocks
255*7c478bd9Sstevel@tonic-gate	! Assumes ok to read the entire dword with the leading bytes
256*7c478bd9Sstevel@tonic-gate	ldx	[%i5], %l0	! NB base addr
257*7c478bd9Sstevel@tonic-gate	inc	8, %i5		! next addr
258*7c478bd9Sstevel@tonic-gate
259*7c478bd9Sstevel@tonic-gate	sub	%i5, %i0, %l2	! byte count: 2/4/6/8
260*7c478bd9Sstevel@tonic-gate	mov	%i5, %i0
261*7c478bd9Sstevel@tonic-gate
262*7c478bd9Sstevel@tonic-gate	sll	%l2, 2, %l2	! 8/16/24/32 for mask
263*7c478bd9Sstevel@tonic-gate
264*7c478bd9Sstevel@tonic-gate	sllx	%g5, %l2, %l4
265*7c478bd9Sstevel@tonic-gate
266*7c478bd9Sstevel@tonic-gate	sllx	%l4, %l2, %l4	! mask: 16, 32, 48, 64 0's at lsb
267*7c478bd9Sstevel@tonic-gate
268*7c478bd9Sstevel@tonic-gate	srl	%l2, 3, %l2	! 1/2/3/4 for count
269*7c478bd9Sstevel@tonic-gate	andn	%l0, %l4, %l0	! select hw's from src
270*7c478bd9Sstevel@tonic-gate
271*7c478bd9Sstevel@tonic-gate	srlx	%l0, 32, %o0	! hi32
272*7c478bd9Sstevel@tonic-gate	b	9f
273*7c478bd9Sstevel@tonic-gate	sub	%i1, %l2, %i1	! decr count, 1-4 halfwords
274*7c478bd9Sstevel@tonic-gate
/*
 * The leading-dword code above joins this loop at label 9 with %l0 and
 * %o0 already set.
 *
 * NOTE(review): when the original address is 2 to 6 bytes past a 64-byte
 * boundary the leading dword leaves %i0 at boundary + 8, and this loop
 * then appears to run 7 times (3 + 28 = 31 halfwords in total), not the
 * 0-6 iterations and 1-32 halfwords quoted in the comments -- confirm.
 */
275*7c478bd9Sstevel@tonic-gate	! Do dwords until source is 64-byte aligned, 0-6 iterations
276*7c478bd9Sstevel@tonic-gate	! 4 clocks per + 4 for 1 mispred = 16 clocks avg
277*7c478bd9Sstevel@tonic-gate.dw:	ldx	[%i0], %l0	! tmp64 = *src++ (groups with the branch below)
278*7c478bd9Sstevel@tonic-gate
279*7c478bd9Sstevel@tonic-gate	inc	8, %i0		! (Dcache load-use delay)
280*7c478bd9Sstevel@tonic-gate	dec	4, %i1		! decrement count, 4 halfwords
281*7c478bd9Sstevel@tonic-gate
282*7c478bd9Sstevel@tonic-gate	srlx	%l0, 32, %o0	! hi32
283*7c478bd9Sstevel@tonic-gate9:	and	%l0, %g1, %l0	! lo32
284*7c478bd9Sstevel@tonic-gate
285*7c478bd9Sstevel@tonic-gate	add	%o0, %i2, %i2	! accumulator
286*7c478bd9Sstevel@tonic-gate	btst	63, %i0		! src 64-byte aligned?
287*7c478bd9Sstevel@tonic-gate
288*7c478bd9Sstevel@tonic-gate	bnz,pt	%icc, .dw
289*7c478bd9Sstevel@tonic-gate	add	%l0, %i2, %i2	! accumulator
290*7c478bd9Sstevel@tonic-gate
291*7c478bd9Sstevel@tonic-gate
292*7c478bd9Sstevel@tonic-gate	! At this point source address is 64 byte aligned
293*7c478bd9Sstevel@tonic-gate	! and we've dealt with 1-32 halfwords.
294*7c478bd9Sstevel@tonic-gate	andncc	%i1, 31, %i4	! at least 64 bytes for main loop?
295*7c478bd9Sstevel@tonic-gate.mainsection:				! total 18n + 21 clocks
296*7c478bd9Sstevel@tonic-gate	bz,pn	%icc, .postamble
297*7c478bd9Sstevel@tonic-gate	and	%i1, 31, %i1	! count for postamble
298*7c478bd9Sstevel@tonic-gate
/*
 * Here %i4 holds the halfwords the main loop will consume (a multiple of
 * 32) and %i1 the halfwords left for .postamble.  Both ways of reaching
 * .mainsection compute %i4 with andncc: the aligned entry in the delay
 * slot of its bz,a and the unaligned path just before the label.
 */
299*7c478bd9Sstevel@tonic-gate	! preload for main loop - 9 clocks assuming D$ hits at 1 per
300*7c478bd9Sstevel@tonic-gate	ldx	[%i0+0], %l0
301*7c478bd9Sstevel@tonic-gate	ldx	[%i0+8], %l1
302*7c478bd9Sstevel@tonic-gate	ldx	[%i0+16], %l2	! %l0 could be used here if Dcache hit
303*7c478bd9Sstevel@tonic-gate	ldx	[%i0+24], %l3	!  but US-II prefetch only loads Ecache
304*7c478bd9Sstevel@tonic-gate	ldx	[%i0+32], %l4	!  check on US-III: could mix preloads & splits?
305*7c478bd9Sstevel@tonic-gate	ldx	[%i0+40], %l5
306*7c478bd9Sstevel@tonic-gate	ldx	[%i0+48], %l6
307*7c478bd9Sstevel@tonic-gate	ldx	[%i0+56], %l7
308*7c478bd9Sstevel@tonic-gate	inc	64, %i0
309*7c478bd9Sstevel@tonic-gate	prefetch [%i0], #n_reads
310*7c478bd9Sstevel@tonic-gate
/*
 * Each dword in %l0-%l7 is split into its high word (%o0-%o5, %g2, %g3)
 * and its low word (left in %l0-%l7).  The sixteen 32-bit values are
 * then added pairwise while the next 64 bytes are loaded into %l0-%l7,
 * and the total is added to %i2.  When %i4 reaches zero no further loads
 * are issued and .looptidy finishes the additions for the last 64 bytes.
 */
311*7c478bd9Sstevel@tonic-gate	! main loop. Read 64 bytes at a time - 18 clocks per iteration
312*7c478bd9Sstevel@tonic-gate5:	!					plus 4 for the exit mispredict
313*7c478bd9Sstevel@tonic-gate	srlx	%l0, 32, %o0		! hi32 to %o0
314*7c478bd9Sstevel@tonic-gate	and	%l0, %g1, %l0		! lo32 to %l0
315*7c478bd9Sstevel@tonic-gate
316*7c478bd9Sstevel@tonic-gate	srlx	%l1, 32, %o1		! hi32 to %o1
317*7c478bd9Sstevel@tonic-gate	and	%l1, %g1, %l1		! lo32 to %l1
318*7c478bd9Sstevel@tonic-gate
319*7c478bd9Sstevel@tonic-gate	srlx	%l2, 32, %o2		! hi32 to %o2
320*7c478bd9Sstevel@tonic-gate	and	%l2, %g1, %l2		! lo32 to %l2
321*7c478bd9Sstevel@tonic-gate
322*7c478bd9Sstevel@tonic-gate	srlx	%l3, 32, %o3		! hi32 to %o3
323*7c478bd9Sstevel@tonic-gate	and	%l3, %g1, %l3		! lo32 to %l3
324*7c478bd9Sstevel@tonic-gate
325*7c478bd9Sstevel@tonic-gate	srlx	%l4, 32, %o4		! hi32 to %o4
326*7c478bd9Sstevel@tonic-gate	and	%l4, %g1, %l4		! lo32 to %l4
327*7c478bd9Sstevel@tonic-gate
328*7c478bd9Sstevel@tonic-gate	srlx	%l5, 32, %o5		! hi32 to %o5
329*7c478bd9Sstevel@tonic-gate	and	%l5, %g1, %l5		! lo32 to %l5
330*7c478bd9Sstevel@tonic-gate
331*7c478bd9Sstevel@tonic-gate	srlx	%l6, 32, %g2		! hi32 to %g2
332*7c478bd9Sstevel@tonic-gate	and	%l6, %g1, %l6		! lo32 to %l6
333*7c478bd9Sstevel@tonic-gate
334*7c478bd9Sstevel@tonic-gate	srlx	%l7, 32, %g3		! hi32 to %g3
335*7c478bd9Sstevel@tonic-gate	and	%l7, %g1, %l7		! lo32 to %l7
336*7c478bd9Sstevel@tonic-gate				! splits gave 16 off 32b vals
337*7c478bd9Sstevel@tonic-gate	deccc	32, %i4		! mv early,avoid mispredicts? nohelp US-II.
338*7c478bd9Sstevel@tonic-gate	bz,pn	%icc, .looptidy	! count now zero?
339*7c478bd9Sstevel@tonic-gate	add	%l0, %o0, %o0	! delay
340*7c478bd9Sstevel@tonic-gate
341*7c478bd9Sstevel@tonic-gate	ldx	[%i0+0], %l0
342*7c478bd9Sstevel@tonic-gate	add	%l1, %o1, %o1	! adds and loads
343*7c478bd9Sstevel@tonic-gate	add	%l2, %o2, %o2
344*7c478bd9Sstevel@tonic-gate
345*7c478bd9Sstevel@tonic-gate	ldx	[%i0+8], %l1
346*7c478bd9Sstevel@tonic-gate	add	%l3, %o3, %o3
347*7c478bd9Sstevel@tonic-gate	add	%l4, %o4, %o4
348*7c478bd9Sstevel@tonic-gate
349*7c478bd9Sstevel@tonic-gate	ldx	[%i0+16], %l2
350*7c478bd9Sstevel@tonic-gate	add	%l5, %o5, %o5
351*7c478bd9Sstevel@tonic-gate	add	%l6, %g2, %g2
352*7c478bd9Sstevel@tonic-gate
353*7c478bd9Sstevel@tonic-gate	ldx	[%i0+24], %l3
354*7c478bd9Sstevel@tonic-gate	add	%l7, %g3, %g3		! now 8 off 33b vals
355*7c478bd9Sstevel@tonic-gate	add	%o0, %o1, %o0
356*7c478bd9Sstevel@tonic-gate
357*7c478bd9Sstevel@tonic-gate	ldx	[%i0+32], %l4
358*7c478bd9Sstevel@tonic-gate	add	%o2, %o3, %o1
359*7c478bd9Sstevel@tonic-gate	add	%o4, %o5, %o2
360*7c478bd9Sstevel@tonic-gate
361*7c478bd9Sstevel@tonic-gate	ldx	[%i0+40], %l5
362*7c478bd9Sstevel@tonic-gate	add	%g2, %g3, %o3		! now 4 off 34b vals
363*7c478bd9Sstevel@tonic-gate	add	%o0, %o1, %o0
364*7c478bd9Sstevel@tonic-gate
365*7c478bd9Sstevel@tonic-gate	ldx	[%i0+48], %l6
366*7c478bd9Sstevel@tonic-gate	add	%o2, %o3, %o1		! 2 off 35b
367*7c478bd9Sstevel@tonic-gate
368*7c478bd9Sstevel@tonic-gate	ldx	[%i0+56], %l7
369*7c478bd9Sstevel@tonic-gate	add	%o0, %o1, %o0		! 36b
370*7c478bd9Sstevel@tonic-gate	inc	64, %i0		! increment source address
371*7c478bd9Sstevel@tonic-gate
372*7c478bd9Sstevel@tonic-gate	add	%o0, %i2, %i2	! accumulator
373*7c478bd9Sstevel@tonic-gate	ba	5b
374*7c478bd9Sstevel@tonic-gate	prefetch [%i0], #n_reads	! next cacheline
375*7c478bd9Sstevel@tonic-gate				! end of main loop
376*7c478bd9Sstevel@tonic-gate.looptidy:	! compute remaining partial sum - 8 clocks
377*7c478bd9Sstevel@tonic-gate	add	%l1, %o1, %o1
378*7c478bd9Sstevel@tonic-gate	add	%l2, %o2, %o2
379*7c478bd9Sstevel@tonic-gate
380*7c478bd9Sstevel@tonic-gate	add	%l3, %o3, %o3
381*7c478bd9Sstevel@tonic-gate	add	%l4, %o4, %o4
382*7c478bd9Sstevel@tonic-gate
383*7c478bd9Sstevel@tonic-gate	add	%l5, %o5, %o5
384*7c478bd9Sstevel@tonic-gate	add	%l6, %g2, %g2
385*7c478bd9Sstevel@tonic-gate
386*7c478bd9Sstevel@tonic-gate	add	%l7, %g3, %g3		! 8 x 33b
387*7c478bd9Sstevel@tonic-gate	add	%o0, %o1, %o0
388*7c478bd9Sstevel@tonic-gate
389*7c478bd9Sstevel@tonic-gate	add	%o2, %o3, %o1
390*7c478bd9Sstevel@tonic-gate	add	%o4, %o5, %o2
391*7c478bd9Sstevel@tonic-gate
392*7c478bd9Sstevel@tonic-gate	add	%g2, %g3, %o3		! 4 x 34b
393*7c478bd9Sstevel@tonic-gate	add	%o0, %o1, %o0
394*7c478bd9Sstevel@tonic-gate
395*7c478bd9Sstevel@tonic-gate	add	%o2, %o3, %o1		! 2 x 35b
396*7c478bd9Sstevel@tonic-gate	add	%o0, %i2, %i2	! accumulator
397*7c478bd9Sstevel@tonic-gate
398*7c478bd9Sstevel@tonic-gate	add	%o1, %i2, %i2	! accumulator
399*7c478bd9Sstevel@tonic-gate
400*7c478bd9Sstevel@tonic-gate
401*7c478bd9Sstevel@tonic-gate.postamble:
402*7c478bd9Sstevel@tonic-gate	! postamble hword count is in %i1 (can be zero)
403*7c478bd9Sstevel@tonic-gate	! while at least 1 dword, do dwords.   Max 7 iterations.
404*7c478bd9Sstevel@tonic-gate	andncc	%i1, 3, %g0	! more than 3 hwords?
405*7c478bd9Sstevel@tonic-gate.dotail_dw:
406*7c478bd9Sstevel@tonic-gate	bz,a,pn	%icc, .dotail_hw
407*7c478bd9Sstevel@tonic-gate	tst	%i1		! delay: any at all left?
408*7c478bd9Sstevel@tonic-gate8:
409*7c478bd9Sstevel@tonic-gate	ldx	[%i0], %l0	! tmp64 = *src++
410*7c478bd9Sstevel@tonic-gate	inc	8, %i0
411*7c478bd9Sstevel@tonic-gate	dec	4, %i1		! decrement count, 4 halfwords
412*7c478bd9Sstevel@tonic-gate
413*7c478bd9Sstevel@tonic-gate				! stall for D-cache
414*7c478bd9Sstevel@tonic-gate
415*7c478bd9Sstevel@tonic-gate	srlx	%l0, 32, %o0	! hi32
416*7c478bd9Sstevel@tonic-gate	and	%l0, %g1, %l0	! lo32
417*7c478bd9Sstevel@tonic-gate
418*7c478bd9Sstevel@tonic-gate	add	%o0, %i2, %i2	! accumulator
419*7c478bd9Sstevel@tonic-gate
420*7c478bd9Sstevel@tonic-gate	andncc	%i1, 3, %g0	! more than 3 hwords?
421*7c478bd9Sstevel@tonic-gate	bnz,pt	%icc, 8b
422*7c478bd9Sstevel@tonic-gate	add	%l0, %i2, %i2	! accumulator
423*7c478bd9Sstevel@tonic-gate
424*7c478bd9Sstevel@tonic-gate	! while at least 1 hword, do hwords.   Max 3 iterations.
425*7c478bd9Sstevel@tonic-gate	tst	%i1
426*7c478bd9Sstevel@tonic-gate.dotail_hw:
427*7c478bd9Sstevel@tonic-gate	bz,a	.fold
428*7c478bd9Sstevel@tonic-gate	srlx	%i2, 32, %o0	! delay: hi32
429*7c478bd9Sstevel@tonic-gate	lduh	[%i0], %l0	! tmp16 = *src++
430*7c478bd9Sstevel@tonic-gate1:
431*7c478bd9Sstevel@tonic-gate	inc	2, %i0
432*7c478bd9Sstevel@tonic-gate				! stall for D-cache
433*7c478bd9Sstevel@tonic-gate
434*7c478bd9Sstevel@tonic-gate	add	%l0, %i2, %i2	! accumulator
435*7c478bd9Sstevel@tonic-gate
436*7c478bd9Sstevel@tonic-gate	deccc	%i1		! decrement count
437*7c478bd9Sstevel@tonic-gate	bnz,a,pt %icc, 1b
438*7c478bd9Sstevel@tonic-gate	lduh	[%i0], %l0	! tmp16 = *src++
439*7c478bd9Sstevel@tonic-gate
/*
 * NOTE(review): the final add of the fold below is not folded again.
 * Working the three steps through for an accumulator of
 * 0x00010000ffffffff gives 0x10000, so the value returned is not
 * strictly limited to 16 bits -- confirm whether callers rely on that.
 */
440*7c478bd9Sstevel@tonic-gate	! at this point the 64-bit accumulator
441*7c478bd9Sstevel@tonic-gate	! has the result that needs to be returned in 16-bits
442*7c478bd9Sstevel@tonic-gate	srlx	%i2, 32, %o0	! hi32
443*7c478bd9Sstevel@tonic-gate.fold:
444*7c478bd9Sstevel@tonic-gate	and	%i2, %g1, %o1	! lo32
445*7c478bd9Sstevel@tonic-gate
446*7c478bd9Sstevel@tonic-gate	add	%o0, %o1, %o0	! 33b
447*7c478bd9Sstevel@tonic-gate
448*7c478bd9Sstevel@tonic-gate	srlx	%o0, 16, %o1	! hi17
449*7c478bd9Sstevel@tonic-gate	and	%o0, %g4, %o0	! lo16
450*7c478bd9Sstevel@tonic-gate
451*7c478bd9Sstevel@tonic-gate	add	%o1, %o0, %o0	! 18b
452*7c478bd9Sstevel@tonic-gate
453*7c478bd9Sstevel@tonic-gate	srlx	%o0, 16, %o1	! hi2
454*7c478bd9Sstevel@tonic-gate	and	%o0, %g4, %o0	! lo16
455*7c478bd9Sstevel@tonic-gate
456*7c478bd9Sstevel@tonic-gate	add	%o1, %o0, %i0	! 16b result in %i0
457*7c478bd9Sstevel@tonic-gate
458*7c478bd9Sstevel@tonic-gate	ret			! return
459*7c478bd9Sstevel@tonic-gate	restore
460*7c478bd9Sstevel@tonic-gate
461*7c478bd9Sstevel@tonic-gate
462*7c478bd9Sstevel@tonic-gate	SET_SIZE(ip_ocsum_long)	! 64-bit version
463*7c478bd9Sstevel@tonic-gate
464*7c478bd9Sstevel@tonic-gate#endif 	/* lint */
465