/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/machparam.h>

#if defined(lint)
#include <sys/types.h>
#else	/* lint */
#include "assym.h"
#endif	/* lint */

/*
 * Prefetch considerations
 *
 * We prefetch one cacheline ahead. This may not be enough on Serengeti
 * systems - see default_copyout() etc which prefetch 5 lines ahead.
 * On the other hand, we expect most of the source buffers to be
 * recently used enough to be cached.
 *
 * On US-I the prefetches are inoperative. On US-II they preload the E$;
 * the mainloop unrolling and load-buffer should cover loads from E$.
 * The stores appear to be the slow point on US-II.
 *
 * On US-IIICu the prefetch preloads the L2$ too, but there is no load
 * buffer so the loads will stall for D$ miss, L2$ hit. The hardware
 * auto-prefetch is not activated by integer loads. No solution
 * in sight for this, barring odd games with FP read, write, integer read.
 *
 * US-IV (Panther) appears similar to US-IIICu, except that a strong
 * variant of prefetch is available which can take TLB traps. We don't
 * use this. The h/w prefetch stride can be set to 64, 128 or 192,
 * and they only reach to the L2$ (we don't use these either).
 * L2$ load-to-use latency is 15 cycles (best).
 */


/*
 * ip_ocsum(address, halfword_count, sum)
 * Do a 16 bit one's complement sum of a given number of (16-bit)
 * halfwords. The halfword pointer must not be odd.
 *	%o0 address; %o1 count; %o2 sum accumulator; %o4 temp
 *	%g2 and %g3 used in main loop
 *
 * Strategy, as implemented below:
 *   - The data is read as 64-bit dwords.  Each dword is split into its
 *     high and low 32-bit halves and both halves are added to a 64-bit
 *     accumulator.
 *   - A partial leading or trailing dword is still read whole; the
 *     halfwords that are not part of the buffer are masked off.
 *   - At the end the 64-bit accumulator is folded 64 -> 33 -> 18 -> 16
 *     bits and returned.
 *   - Counts of 32 halfwords (64 bytes) or more are handed to
 *     ip_ocsum_long, which runs an unrolled 64-bytes-per-iteration loop.
 *   - Counts below 4 halfwords are summed one halfword at a time (.tiny).
 *
 * (from @(#)ocsum.s 1.3 89/02/24 SMI)
 *
 */

#if defined(lint)

/* ARGSUSED */
unsigned int
ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
{ return (0); }

#else	/* lint */

	ENTRY(ip_ocsum)

/*
 * On ttcp transmits, called once per ocsum_copyin but with a small
 * block ( >99.9% ). Could be the tx hdrs? How many acks/seg are we rxing?
 * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
 * and tx acks?
 *
 * To do: telnet and nfs traffic
 *
 * On an NCA'd webserver about 10% of the calls are >64 bytes
 *	about 10% of those start on a 64byte boundary
 *	about 30% are >5*64 bytes.
 * The NCA numbers & proportions don't change with h/w cksum on.
 *
 * Tx hdrs are likely to be already in cache.
 * Rx hdrs depends if already inspected.
 */

	!
	! Entry point for checksum-only.
	! %o0 contains buffer address
	! %o1 contains count of 16bit words
	! %o2 contains sum
	!
	! %o3 temporary
	! %o4 temporary
	! %g1 32bit mask
	! %g4 16bit mask
	! %g5 64bit mask (all 1s)
	!
	! The three masks and %o5 are set up here and are also consumed by
	! ip_ocsum_long, which is reached by the branch below.
	!
	not	%g0, %g5		! all 1's
	prefetch [%o0], #n_reads	! first hword, dword, cacheline

	clruw	%g5, %g1		! 32 1's at low end
	srl	%g5, 16, %g4		! 16 1's at low end

	cmp	%o1, 32			! at least a cacheline (64 bytes)?
	bge,pn	%icc, ip_ocsum_long	! yes, do the whole works
	andn	%o0, 7, %o5		! delay: base src addr


	cmp	%o1, 4			! < 4 halfwords?
	bl,pn	%icc, .tiny		! < 4 halfwords, just do them
	inc	8, %o5			! delay: next addr (no matter for .tiny)

	/* leading dword with 1-4 hwords: 9 clocks */
	/* Assumes ok to read the entire dword with the leading hwords */

	ldx	[%o5-8], %o3		! NB base addr
	sub	%o5, %o0, %g2		! byte count: 2/4/6/8
	mov	%o5, %o0

	sll	%g2, 2, %g2		! 8/16/24/32 for mask

	! The mask is built with two shifts of half the width each, so
	! that the full-dword case (64 bits in total) can be expressed.
	sllx	%g5, %g2, %o5

	sllx	%o5, %g2, %o5		! mask: 16/32/48/64 0's at low end

	srl	%g2, 3, %g2		! hw count
	andn	%o3, %o5, %o3		! select hw's from src

	srlx	%o3, 32, %o4		! hi32
	b	9f
	sub	%o1, %g2, %o1		! delay: decr count, 1-4 halfwords

.short_dw:			! max 7 iters of 4 clocks; 1 mispred of 4
	ldx	[%o0], %o3		! tmp64 = *src++ (groups with the branch)

	inc	8, %o0			! (D-cache load-use delay)
	dec	4, %o1			! decrement count, 4 halfwords

	srlx	%o3, 32, %o4		! hi32
9:	and	%o3, %g1, %o3		! lo32

	add	%o4, %o2, %o2		! accumulator
	andncc	%o1, 3, %g0		! more than 3 hwords left?

	bnz,pt	%icc, .short_dw
	add	%o3, %o2, %o2		! accumulator

.short_hw:			! trailing dw: 0-3 hwords
	tst	%o1			! 0 seems fairly common...
	bz,a	.short_fold
	srlx	%o2, 32, %o4		! delay: hi32
					! mispredict 4 + 7 clocks for 1-3
	ldx	[%o0], %o3
	sll	%o1, 4, %o1		! bitcount: 16/32/48

	srlx	%g5, %o1, %o5		! mask: 16/32/48 0's at high end

	andn	%o3, %o5, %o3		! select hw's from src

	srlx	%o3, 32, %o4		! hi32
	and	%o3, %g1, %o3		! lo32

	add	%o4, %o2, %o2		! accumulator

	add	%o3, %o2, %o2		! accumulator

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	srlx	%o2, 32, %o4		! hi32
.short_fold:
	and	%o2, %g1, %o2		! lo32

	add	%o4, %o2, %o2		! 33b

	srlx	%o2, 16, %o3		! hi17
	and	%o2, %g4, %o2		! lo16

	add	%o3, %o2, %o2		! 18b

	srlx	%o2, 16, %o3		! hi2
	and	%o2, %g4, %o2		! lo16

	retl				! return
	add	%o3, %o2, %o0		! 16b result in %o0

.tiny:				! almost never: less than 4 halfwords total.
	tst	%o1
	bz,a	.short_fold

	srlx	%o2, 32, %o4		! delay: hi32

	lduh	[%o0], %o3		! tmp16 = *src++
1:
	inc	2, %o0
					! stall for D-cache

	add	%o3, %o2, %o2		! accumulator

	deccc	%o1			! decrement count
	bnz,a,pt %icc, 1b
	lduh	[%o0], %o3		! tmp16 = *src++

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	b	.short_fold
	srlx	%o2, 32, %o4		! hi32

	SET_SIZE(ip_ocsum)		! 64-bit version


	!
	! ip_ocsum_long: continuation of ip_ocsum for blocks of at least
	! 32 halfwords (64 bytes).
	!
	! NOTE: although this is declared with ENTRY, the code below relies
	! on state established by ip_ocsum before it branches here: the
	! masks in %g1, %g4 and %g5, and the dword-aligned source address
	! in %o5 (seen as %i5 after the save).  It is therefore only valid
	! when reached through ip_ocsum.
	!
	ENTRY(ip_ocsum_long)		! 64-bit, large blocks
	save	%sp, -SA(MINFRAME), %sp	! get another window
	!
	! %i0 contains buffer address
	! %i1 contains count of 16bit words
	! %i2 contains sum
	! %i4 contains the mainloop count
	! %i5 comes in with the buffer address rounded down to the first dword
	!
	! %g1 32bit mask
	! %g4 16bit mask
	! %g5 64bit mask (all 1s)
	! %g6 fetch-ahead offset for Ecache (not referenced by the code below)
	!
	! %l0-7,%o0-5,%g2-3 mainloop temporaries
	!
	!
					! 1 clock overhead
	btst	63, %i0			! src 64-byte aligned?
	bz,a,pt	%icc, .mainsection	! aligned blocks are fairly common
	andncc	%i1, 31, %i4		! at least 64 bytes for main loop?


	! Leading dword, with 1-4 hwords: 9 clocks
	! Assumes ok to read the entire dword with the leading bytes
	ldx	[%i5], %l0		! NB base addr
	inc	8, %i5			! next addr

	sub	%i5, %i0, %l2		! byte count: 2/4/6/8
	mov	%i5, %i0

	sll	%l2, 2, %l2		! 8/16/24/32 for mask

	sllx	%g5, %l2, %l4

	sllx	%l4, %l2, %l4		! mask: 16, 32, 48, 64 0's at lsb

	srl	%l2, 3, %l2		! 1/2/3/4 for count
	andn	%l0, %l4, %l0		! select hw's from src

	srlx	%l0, 32, %o0		! hi32
	b	9f
	sub	%i1, %l2, %i1		! decr count, 1-4 halfwords

	! Do dwords until source is 64-byte aligned, 0-6 iterations
	! 4 clocks per + 4 for 1 mispred = 16 clocks avg
.dw:	ldx	[%i0], %l0		! tmp64 = *src++ (groups with the branch below)

	inc	8, %i0			! (Dcache load-use delay)
	dec	4, %i1			! decrement count, 4 halfwords

	srlx	%l0, 32, %o0		! hi32
9:	and	%l0, %g1, %l0		! lo32

	add	%o0, %i2, %i2		! accumulator
	btst	63, %i0			! src 64-byte aligned?

	bnz,pt	%icc, .dw
	add	%l0, %i2, %i2		! accumulator


	! At this point source address is 64 byte aligned
	! and we've dealt with 1-32 halfwords.
	andncc	%i1, 31, %i4		! at least 64 bytes for main loop?
.mainsection:			! total 18n + 21 clocks
	bz,pn	%icc, .postamble
	and	%i1, 31, %i1		! count for postamble

	! preload for main loop - 9 clocks assuming D$ hits at 1 per
	ldx	[%i0+0], %l0
	ldx	[%i0+8], %l1
	ldx	[%i0+16], %l2		! %l0 could be used here if Dcache hit
	ldx	[%i0+24], %l3		! but US-II prefetch only loads Ecache
	ldx	[%i0+32], %l4		! check on US-III: could mix preloads & splits?
	ldx	[%i0+40], %l5
	ldx	[%i0+48], %l6
	ldx	[%i0+56], %l7
	inc	64, %i0
	prefetch [%i0], #n_reads

	! main loop. Read 64 bytes at a time - 18 clocks per iteration
5:				! plus 4 for the exit mispredict
	srlx	%l0, 32, %o0		! hi32 to %o0
	and	%l0, %g1, %l0		! lo32 to %l0

	srlx	%l1, 32, %o1		! hi32 to %o1
	and	%l1, %g1, %l1		! lo32 to %l1

	srlx	%l2, 32, %o2		! hi32 to %o2
	and	%l2, %g1, %l2		! lo32 to %l2

	srlx	%l3, 32, %o3		! hi32 to %o3
	and	%l3, %g1, %l3		! lo32 to %l3

	srlx	%l4, 32, %o4		! hi32 to %o4
	and	%l4, %g1, %l4		! lo32 to %l4

	srlx	%l5, 32, %o5		! hi32 to %o5
	and	%l5, %g1, %l5		! lo32 to %l5

	srlx	%l6, 32, %g2		! hi32 to %g2
	and	%l6, %g1, %l6		! lo32 to %l6

	srlx	%l7, 32, %g3		! hi32 to %g3
	and	%l7, %g1, %l7		! lo32 to %l7
					! splits gave 16 off 32b vals
	deccc	32, %i4			! mv early,avoid mispredicts? nohelp US-II.
	bz,pn	%icc, .looptidy		! count now zero?
	add	%l0, %o0, %o0		! delay

	! Each reload of %lN below is placed after the add that consumed
	! the previous contents of %lN.
	ldx	[%i0+0], %l0
	add	%l1, %o1, %o1		! adds and loads
	add	%l2, %o2, %o2

	ldx	[%i0+8], %l1
	add	%l3, %o3, %o3
	add	%l4, %o4, %o4

	ldx	[%i0+16], %l2
	add	%l5, %o5, %o5
	add	%l6, %g2, %g2

	ldx	[%i0+24], %l3
	add	%l7, %g3, %g3		! now 8 off 33b vals
	add	%o0, %o1, %o0

	ldx	[%i0+32], %l4
	add	%o2, %o3, %o1
	add	%o4, %o5, %o2

	ldx	[%i0+40], %l5
	add	%g2, %g3, %o3		! now 4 off 34b vals
	add	%o0, %o1, %o0

	ldx	[%i0+48], %l6
	add	%o2, %o3, %o1		! 2 off 35b

	ldx	[%i0+56], %l7
	add	%o0, %o1, %o0		! 36b
	inc	64, %i0			! increment source address

	add	%o0, %i2, %i2		! accumulator
	ba	5b
	prefetch [%i0], #n_reads	! next cacheline
					! end of main loop
.looptidy:			! compute remaining partial sum - 8 clocks
	add	%l1, %o1, %o1
	add	%l2, %o2, %o2

	add	%l3, %o3, %o3
	add	%l4, %o4, %o4

	add	%l5, %o5, %o5
	add	%l6, %g2, %g2

	add	%l7, %g3, %g3		! 8 x 33b
	add	%o0, %o1, %o0

	add	%o2, %o3, %o1
	add	%o4, %o5, %o2

	add	%g2, %g3, %o3		! 4 x 34b
	add	%o0, %o1, %o0

	add	%o2, %o3, %o1		! 2 x 35b
	add	%o0, %i2, %i2		! accumulator

	add	%o1, %i2, %i2		! accumulator


.postamble:
	! postamble hword count is in %i1 (can be zero)
	! while at least 1 dword, do dwords. Max 7 iterations.
	andncc	%i1, 3, %g0		! more than 3 hwords?
.dotail_dw:
	bz,a,pn	%icc, .dotail_hw
	tst	%i1			! delay: any at all left?
8:
	ldx	[%i0], %l0		! tmp64 = *src++
	inc	8, %i0
	dec	4, %i1			! decrement count, 4 halfwords

					! stall for D-cache

	srlx	%l0, 32, %o0		! hi32
	and	%l0, %g1, %l0		! lo32

	add	%o0, %i2, %i2		! accumulator

	andncc	%i1, 3, %g0		! more than 3 hwords?
	bnz,pt	%icc, 8b
	add	%l0, %i2, %i2		! accumulator

	! while at least 1 hword, do hwords. Max 3 iterations.
	tst	%i1
.dotail_hw:
	bz,a	.fold
	srlx	%i2, 32, %o0		! delay: hi32
	lduh	[%i0], %l0		! tmp16 = *src++
1:
	inc	2, %i0
					! stall for D-cache

	add	%l0, %i2, %i2		! accumulator

	deccc	%i1			! decrement count
	bnz,a,pt %icc, 1b
	lduh	[%i0], %l0		! tmp16 = *src++

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	srlx	%i2, 32, %o0		! hi32
.fold:
	and	%i2, %g1, %o1		! lo32

	add	%o0, %o1, %o0		! 33b

	srlx	%o0, 16, %o1		! hi17
	and	%o0, %g4, %o0		! lo16

	add	%o1, %o0, %o0		! 18b

	srlx	%o0, 16, %o1		! hi2
	and	%o0, %g4, %o0		! lo16

	add	%o1, %o0, %i0		! 16b result in %i0

	ret				! return
	restore


	SET_SIZE(ip_ocsum_long)		! 64-bit version

#endif	/* lint */