xref: /titanic_50/usr/src/uts/sun4/ml/ip_ocsum.s (revision db2bae3047e71d795bde12e3baa621f4b6cc8930)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/param.h>
30#include <sys/errno.h>
31#include <sys/asm_linkage.h>
32#include <sys/vtrace.h>
33#include <sys/machthread.h>
34#include <sys/machparam.h>
35
36#if defined(lint)
37#include <sys/types.h>
38#else	/* lint */
39#include "assym.h"
40#endif	/* lint */
41
42/*
43 * Prefetch considerations
44 *
45 * We prefetch one cacheline ahead.  This may not be enough on Serengeti
46 * systems - see default_copyout() etc which prefetch 5 lines ahead.
47 * On the other hand, we expect most of the source buffers to be
48 * recently used enough to be cached.
49 *
50 * On US-I the prefetches are inoperative.  On US-II they preload the E$;
51 * the mainloop unrolling and load-buffer should cover loads from E$.
52 * The stores appear to be the slow point on US-II.
53 *
54 * On US-IIICu the prefetch preloads the L2$ too, but there is no load
55 * buffer so the loads will stall for D$ miss, L2$ hit.  The hardware
56 * auto-prefetch is not activated by integer loads.  No solution
57 * in sight for this, barring odd games with FP read, write, integer read.
58 *
59 * US-IV (Panther) appears similar to US-IIICu, except that a strong
60 * variant of prefetch is available which can take TLB traps.  We don't
61 * use this.  The h/w prefetch stride can be set to 64, 128 or 192,
62 * and they only reach to the L2$ (we don't use these either).
63 * L2$ load-to-use latency is 15 cycles (best).
64 */
65
66
67/*
68 * ip_ocsum(address, halfword_count, sum)
69 * Do a 16 bit one's complement sum of a given number of (16-bit)
70 * halfwords. The halfword pointer must not be odd.
71 *	%o0 address; %o1 count; %o2 sum accumulator; %o3, %o4, %o5 temps
72 *	%g1, %g4, %g5 masks; %g2 temp; %g2 and %g3 used in main loop
73 *
74 * (from @(#)ocsum.s 1.3 89/02/24 SMI)
75 *
76 */
77
78#if defined(lint)
79
/*
 * Lint-only stand-in for the assembly routine: it gives lint a C
 * definition with the routine's signature.  It always returns 0 and is
 * only seen by the compiler when lint is defined.
 */
80/* ARGSUSED */
81unsigned int
82ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
83{ return (0); }
84
85#else	/* lint */
86
/*
 * ip_ocsum: entry point, and the complete path for buffers of fewer
 * than 32 halfwords.  Buffers of 32 halfwords or more branch to
 * ip_ocsum_long; the delay slot of that branch leaves the buffer
 * address rounded down to a dword boundary in %o5.
 *
 * Data is loaded 64 bits at a time and the two 32-bit halves are added
 * into the 64-bit accumulator %o2; .short_fold reduces the accumulator
 * and leaves the return value in %o0.  This path is a leaf: it returns
 * with retl and never issues a save.
 */
87	ENTRY(ip_ocsum)
88
89/*
90 * On ttcp transmits, called once per ocsum_copyin but with a small
91 * block ( >99.9% ).  Could be the tx hdrs?  How many acks/seg are we rxing?
92 * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
93 * and tx acks?
94 *
95 * To do: telnet and nfs traffic
96 *
97 * On an NCA'd webserver about 10% of the calls are >64 bytes
98 *	about 10% of those start on a 64byte boundary
99 *	about 30% are >5*64 bytes.
100 * The NCA numbers & proportions don't change with h/w cksum on.
101 *
102 * Tx hdrs are likely to be already in cache.
103 * Rx hdrs depends if already inspected.
104 */
105
106	!
107	! Entry point for checksum-only.
108	! %o0 contains buffer address
109	! %o1 contains count of 16bit words
110	! %o2 contains sum
111	!
112	! %o3 temporary
113	! %o4 temporary
114	! %g1 32bit mask
115	! %g4 16bit mask
116	! %g5 64bit mask (all 1s)
117	!
118	not	%g0, %g5	! all 1's
119	prefetch [%o0], #n_reads	! first hword, dword, cacheline
120
121	clruw	%g5, %g1	! 32 1's at low end
122	srl	%g5, 16, %g4	! 16 1's at low end
123
124	cmp	%o1, 32		! at least a cacheline (64 bytes)?
125	bge,pn %icc, ip_ocsum_long	! yes, do the whole works
126	andn	%o0, 7, %o5	! delay: base src addr
127
128
129	cmp	%o1, 4		! < 4 halfwords?
130	bl,pn	%icc, .tiny	! < 4 halfwords, just do them
131	inc	8, %o5		! delay: next addr (no matter for .tiny)
132
133	/* leading dword with 1-4 hwords: 9 clocks */
134	/* Assumes ok to read the entire dword with the leading hwords */
/*
 * %o5 is now the dword-aligned base + 8.  The whole dword at the
 * aligned base is loaded and only its low-order 16/32/48/64 bits are
 * kept.  The mask is built with two sllx by half the bit count each,
 * since one sllx cannot shift by 64 (only the low 6 bits of the count
 * are used).  %o0 is advanced to the next dword and the count reduced
 * by the 1-4 halfwords consumed; control then joins the .short_dw loop
 * at local label 9 with hi32 already in %o4.
 */
135
136	ldx	[%o5-8], %o3	! NB base addr
137	sub	%o5, %o0, %g2	! byte count: 2/4/6/8
138	mov	%o5, %o0
139
140	sll	%g2, 2, %g2	! 8/16/24/32 for mask
141
142	sllx	%g5, %g2, %o5
143
144	sllx	%o5, %g2, %o5	! mask: 16/32/48/64 0's at low end
145
146	srl	%g2, 3, %g2	! hw count
147	andn	%o3, %o5, %o3	! select hw's from src
148
149	srlx	%o3, 32, %o4	! hi32
150	b	9f
151	sub	%o1, %g2, %o1	! delay: decr count, 1-4 halfwords
152
/*
 * Whole-dword loop: %o0 is dword aligned here.  Each pass adds the
 * high and low 32 bits of one dword to %o2 and repeats while the count
 * still has a bit set above the low two (at least 4 halfwords left).
 */
153.short_dw:			! max 7 iters of 4 clocks; 1 mispred of 4
154	ldx	[%o0], %o3	! tmp64 = *src++ (groups with the branch)
155
156	inc	8, %o0		! (D-cache load-use delay)
157	dec	4, %o1		! decrement count, 4 halfwords
158
159	srlx	%o3, 32, %o4	! hi32
1609:	and	%o3, %g1, %o3	! lo32
161
162	add	%o4, %o2, %o2	! accumulator
163	andncc	%o1, 3, %g0	! more than 3 hwords left?
164
165	bnz,pt %icc, .short_dw
166	add	%o3, %o2, %o2	! accumulator
167
/*
 * Trailing 0-3 halfwords.  With none left, branch straight to the fold
 * (the annulled delay slot supplies hi32 in %o4).  Otherwise the whole
 * dword at %o0 is loaded and only its high-order 16/32/48 bits are
 * kept, using a mask made by shifting all-ones right by the bit count.
 */
168.short_hw:			! trailing dw: 0-3 hwords
169	tst	%o1		! 0 seems fairly common...
170	bz,a	.short_fold
171	srlx	%o2, 32, %o4	! delay: hi32
172				! mispredict 4 + 7 clocks for 1-3
173	ldx	[%o0], %o3
174	sll	%o1, 4, %o1	! bitcount: 16/32/48
175
176	srlx	%g5, %o1, %o5	! mask: 16/32/48  0's at high end
177
178	andn	%o3, %o5, %o3	! select hw's from src
179
180	srlx	%o3, 32, %o4	! hi32
181	and	%o3, %g1, %o3	! lo32
182
183	add	%o4, %o2, %o2	! accumulator
184
185	add	%o3, %o2, %o2	! accumulator
186
/*
 * Fold: hi32 + lo32 of the accumulator, then two rounds of
 * (value >> 16) + (value & 0xffff).  Every path into .short_fold
 * arrives with hi32 of %o2 already in %o4: the srlx just above the
 * label on fall-through, or the delay slot of the branch that jumps
 * here.
 */
187	! at this point the 64-bit accumulator
188	! has the result that needs to be returned in 16-bits
189	srlx	%o2, 32, %o4	! hi32
190.short_fold:
191	and	%o2, %g1, %o2	! lo32
192
193	add	%o4, %o2, %o2	! 33b
194
195	srlx	%o2, 16, %o3	! hi17
196	and	%o2, %g4, %o2	! lo16
197
198	add	%o3, %o2, %o2	! 18b
199
200	srlx	%o2, 16, %o3	! hi2
201	and	%o2, %g4, %o2	! lo16
202
203	retl			! return
204	add	%o3, %o2, %o0	! 16b result in %o0
205
/*
 * Fewer than 4 halfwords in total: add them one at a time with lduh.
 * The annulled delay-slot load fetches the next halfword only when the
 * loop branch is taken, so nothing is read past the last halfword.
 * NOTE(review): bl is a signed compare, so a negative count also lands
 * here, and the loop only exits when the count reaches zero --
 * presumably callers never pass a negative count; confirm.
 */
206.tiny:				! almost never: less than 4 halfwords total.
207	tst	%o1
208	bz,a	.short_fold
209
210	srlx	%o2, 32, %o4	! delay: hi32
211
212	lduh	[%o0], %o3	! tmp16 = *src++
2131:
214	inc	2, %o0
215				! stall for D-cache
216
217	add	%o3, %o2, %o2	! accumulator
218
219	deccc	%o1		! decrement count
220	bnz,a,pt %icc, 1b
221	lduh	[%o0], %o3	! tmp16 = *src++
222
223	! at this point the 64-bit accumulator
224	! has the result that needs to be returned in 16-bits
225	b	.short_fold
226	srlx	%o2, 32, %o4	! hi32
227
228	SET_SIZE(ip_ocsum)	! 64-bit version
229
230
/*
 * ip_ocsum_long: continuation of ip_ocsum for buffers of at least 32
 * halfwords.  It depends on state prepared by ip_ocsum before the
 * branch here: the masks in %g1, %g4 and %g5, and the dword-aligned
 * buffer address in %o5 (visible as %i5 after the save).
 * NOTE(review): nothing in this routine initialises those registers, so
 * it does not look safe to call as a standalone entry point -- confirm
 * that ip_ocsum is the only caller.
 * NOTE(review): the register list below mentions %g6, but no
 * instruction in this routine references %g6.
 *
 * Phases: a leading partial dword plus whole dwords up to a 64-byte
 * boundary, the unrolled 64-byte main loop, trailing dwords and
 * halfwords, then the fold to the value returned in %i0.
 */
231	ENTRY(ip_ocsum_long)	! 64-bit, large blocks
232	save	%sp, -SA(MINFRAME), %sp	! get another window
233	!
234	! %i0 contains buffer address
235	! %i1 contains count of 16bit words
236	! %i2 contains sum
237	! %i4 contains the mainloop count
238	! %i5 comes in with the buffer address rounded down to the first dword
239	!
240	! %g1 32bit mask
241	! %g4 16bit mask
242	! %g5 64bit mask (all 1s)
243	! %g6 fetch-ahead offset for Ecache
244	!
245	! %l0-7,%o0-5,%g2-3 mainloop temporaries
246	!
247	!
248				! 1 clock overhead
249	btst	63, %i0		! src 64-byte aligned?
250	bz,a,pt	%icc, .mainsection	! aligned blocks are fairly common
251	andncc	%i1, 31, %i4	! at least 64 bytes for main loop?
252
253
/*
 * Unaligned start: load the whole dword at the aligned base %i5 and
 * keep only its low-order 16/32/48/64 bits.  The mask is built with two
 * sllx by half the bit count each, since one sllx cannot shift by 64.
 * Control then joins the .dw loop at local label 9 with hi32 in %o0.
 */
254	! Leading dword, with 1-4 hwords: 9 clocks
255	! Assumes ok to read the entire dword with the leading bytes
256	ldx	[%i5], %l0	! NB base addr
257	inc	8, %i5		! next addr
258
259	sub	%i5, %i0, %l2	! byte count: 2/4/6/8
260	mov	%i5, %i0
261
262	sll	%l2, 2, %l2	! 8/16/24/32 for mask
263
264	sllx	%g5, %l2, %l4
265
266	sllx	%l4, %l2, %l4	! mask: 16, 32, 48, 64 0's at lsb
267
268	srl	%l2, 3, %l2	! 1/2/3/4 for count
269	andn	%l0, %l4, %l0	! select hw's from src
270
271	srlx	%l0, 32, %o0	! hi32
272	b	9f
273	sub	%i1, %l2, %i1	! decr count, 1-4 halfwords
274
/*
 * NOTE(review): when the buffer starts 2-6 bytes into a 64-byte line,
 * the leading dword is followed by 7 passes through .dw, so this loop
 * appears to run 0-7 times rather than 0-6; likewise the halfwords
 * consumed before the main section number 1-31, not 1-32.  Confirm and
 * correct the two comments below.
 */
275	! Do dwords until source is 64-byte aligned, 0-6 iterations
276	! 4 clocks per + 4 for 1 mispred = 16 clocks avg
277.dw:	ldx	[%i0], %l0	! tmp64 = *src++ (groups with the branch below)
278
279	inc	8, %i0		! (Dcache load-use delay)
280	dec	4, %i1		! decrement count, 4 halfwords
281
282	srlx	%l0, 32, %o0	! hi32
2839:	and	%l0, %g1, %l0	! lo32
284
285	add	%o0, %i2, %i2	! accumulator
286	btst	63, %i0		! src 64-byte aligned?
287
288	bnz,pt	%icc, .dw
289	add	%l0, %i2, %i2	! accumulator
290
291
/*
 * .mainsection is always entered with condition codes from
 * "andncc %i1, 31, %i4": either the instruction directly above the
 * label, or the copy in the delay slot of the aligned-case branch at
 * the top of the routine.  %i4 receives the count rounded down to a
 * multiple of 32 halfwords; zero means there is no full 64-byte block.
 */
292	! At this point source address is 64 byte aligned
293	! and we've dealt with 1-32 halfwords.
294	andncc	%i1, 31, %i4	! at least 64 bytes for main loop?
295.mainsection:				! total 18n + 21 clocks
296	bz,pn	%icc, .postamble
297	and	%i1, 31, %i1	! count for postamble
298
/*
 * Main loop.  %l0-%l7 always hold the 64 bytes loaded on the previous
 * pass (or by the preload below).  Each register is split into hi32 and
 * lo32, the loads for the next 64 bytes are interleaved with the adds,
 * and the sixteen 32-bit values are reduced pairwise before being added
 * to the accumulator %i2.  %i4 counts the halfwords left for this loop
 * in steps of 32; when it reaches zero the loop exits to .looptidy
 * without issuing any further loads.  After each set of loads the
 * source pointer is advanced and the following cacheline is prefetched.
 */
299	! preload for main loop - 9 clocks assuming D$ hits at 1 per
300	ldx	[%i0+0], %l0
301	ldx	[%i0+8], %l1
302	ldx	[%i0+16], %l2	! %l0 could be used here if Dcache hit
303	ldx	[%i0+24], %l3	!  but US-II prefetch only loads Ecache
304	ldx	[%i0+32], %l4	!  check on US-III: could mix preloads & splits?
305	ldx	[%i0+40], %l5
306	ldx	[%i0+48], %l6
307	ldx	[%i0+56], %l7
308	inc	64, %i0
309	prefetch [%i0], #n_reads
310
311	! main loop. Read 64 bytes at a time - 18 clocks per iteration
3125:	!					plus 4 for the exit mispredict
313	srlx	%l0, 32, %o0		! hi32 to %o0
314	and	%l0, %g1, %l0		! lo32 to %l0
315
316	srlx	%l1, 32, %o1		! hi32 to %o1
317	and	%l1, %g1, %l1		! lo32 to %l1
318
319	srlx	%l2, 32, %o2		! hi32 to %o2
320	and	%l2, %g1, %l2		! lo32 to %l2
321
322	srlx	%l3, 32, %o3		! hi32 to %o3
323	and	%l3, %g1, %l3		! lo32 to %l3
324
325	srlx	%l4, 32, %o4		! hi32 to %o4
326	and	%l4, %g1, %l4		! lo32 to %l4
327
328	srlx	%l5, 32, %o5		! hi32 to %o5
329	and	%l5, %g1, %l5		! lo32 to %l5
330
331	srlx	%l6, 32, %g2		! hi32 to %g2
332	and	%l6, %g1, %l6		! lo32 to %l6
333
334	srlx	%l7, 32, %g3		! hi32 to %g3
335	and	%l7, %g1, %l7		! lo32 to %l7
336				! splits gave 16 off 32b vals
337	deccc	32, %i4		! mv early,avoid mispredicts? nohelp US-II.
338	bz,pn	%icc, .looptidy	! count now zero?
339	add	%l0, %o0, %o0	! delay
340
341	ldx	[%i0+0], %l0
342	add	%l1, %o1, %o1	! adds and loads
343	add	%l2, %o2, %o2
344
345	ldx	[%i0+8], %l1
346	add	%l3, %o3, %o3
347	add	%l4, %o4, %o4
348
349	ldx	[%i0+16], %l2
350	add	%l5, %o5, %o5
351	add	%l6, %g2, %g2
352
353	ldx	[%i0+24], %l3
354	add	%l7, %g3, %g3		! now 8 off 33b vals
355	add	%o0, %o1, %o0
356
357	ldx	[%i0+32], %l4
358	add	%o2, %o3, %o1
359	add	%o4, %o5, %o2
360
361	ldx	[%i0+40], %l5
362	add	%g2, %g3, %o3		! now 4 off 34b vals
363	add	%o0, %o1, %o0
364
365	ldx	[%i0+48], %l6
366	add	%o2, %o3, %o1		! 2 off 35b
367
368	ldx	[%i0+56], %l7
369	add	%o0, %o1, %o0		! 36b
370	inc	64, %i0		! increment source address
371
372	add	%o0, %i2, %i2	! accumulator
373	ba	5b
374	prefetch [%i0], #n_reads	! next cacheline
375				! end of main loop
/*
 * Loop exit: the delay slot of the exiting branch has already added
 * %l0 into %o0, so the remaining pairs start at %l1.  The same pairwise
 * reduction as in the loop is done, without loads, and the last two
 * partial sums are added to the accumulator separately.
 */
376.looptidy:	! compute remaining partial sum - 8 clocks
377	add	%l1, %o1, %o1
378	add	%l2, %o2, %o2
379
380	add	%l3, %o3, %o3
381	add	%l4, %o4, %o4
382
383	add	%l5, %o5, %o5
384	add	%l6, %g2, %g2
385
386	add	%l7, %g3, %g3		! 8 x 33b
387	add	%o0, %o1, %o0
388
389	add	%o2, %o3, %o1
390	add	%o4, %o5, %o2
391
392	add	%g2, %g3, %o3		! 4 x 34b
393	add	%o0, %o1, %o0
394
395	add	%o2, %o3, %o1		! 2 x 35b
396	add	%o0, %i2, %i2	! accumulator
397
398	add	%o1, %i2, %i2	! accumulator
399
400
/*
 * Tail.  The branch at .dotail_hw consumes the result of "tst %i1",
 * which is supplied either by the instruction just above that label
 * (after the dword loop) or by the annulled delay slot of the branch at
 * .dotail_dw (when fewer than 4 halfwords remain).  The halfword loop
 * uses an annulled delay-slot lduh, so the next halfword is only loaded
 * when the loop branch is taken.
 */
401.postamble:
402	! postamble hword count is in %i1 (can be zero)
403	! while at least 1 dword, do dwords.   Max 7 iterations.
404	andncc	%i1, 3, %g0	! more than 3 hwords?
405.dotail_dw:
406	bz,a,pn	%icc, .dotail_hw
407	tst	%i1		! delay: any at all left?
4088:
409	ldx	[%i0], %l0	! tmp64 = *src++
410	inc	8, %i0
411	dec	4, %i1		! decrement count, 4 halfwords
412
413				! stall for D-cache
414
415	srlx	%l0, 32, %o0	! hi32
416	and	%l0, %g1, %l0	! lo32
417
418	add	%o0, %i2, %i2	! accumulator
419
420	andncc	%i1, 3, %g0	! more than 3 hwords?
421	bnz,pt	%icc, 8b
422	add	%l0, %i2, %i2	! accumulator
423
424	! while at least 1 hword, do hwords.   Max 3 iterations.
425	tst	%i1
426.dotail_hw:
427	bz,a	.fold
428	srlx	%i2, 32, %o0	! delay: hi32
429	lduh	[%i0], %l0	! tmp16 = *src++
4301:
431	inc	2, %i0
432				! stall for D-cache
433
434	add	%l0, %i2, %i2	! accumulator
435
436	deccc	%i1		! decrement count
437	bnz,a,pt %icc, 1b
438	lduh	[%i0], %l0	! tmp16 = *src++
439
/*
 * Fold: hi32 + lo32 of the accumulator, then two rounds of
 * (value >> 16) + (value & 0xffff).  .fold is entered with hi32 of %i2
 * already in %o0.  The result is written to %i0, which the restore in
 * the delay slot of ret turns into the caller's %o0.
 * NOTE(review): the final add yields 0x10000 instead of a 16-bit value
 * for some accumulator values of 2^48 or more.  Reaching that needs at
 * least 2^17 halfwords of data, or a sum argument with bits set above
 * bit 31 -- confirm callers stay below this or re-fold the result.
 */
440	! at this point the 64-bit accumulator
441	! has the result that needs to be returned in 16-bits
442	srlx	%i2, 32, %o0	! hi32
443.fold:
444	and	%i2, %g1, %o1	! lo32
445
446	add	%o0, %o1, %o0	! 33b
447
448	srlx	%o0, 16, %o1	! hi17
449	and	%o0, %g4, %o0	! lo16
450
451	add	%o1, %o0, %o0	! 18b
452
453	srlx	%o0, 16, %o1	! hi2
454	and	%o0, %g4, %o0	! lo16
455
456	add	%o1, %o0, %i0	! 16b result in %i0
457
458	ret			! return
459	restore
460
461
462	SET_SIZE(ip_ocsum_long)	! 64-bit version
463
464#endif 	/* lint */
465