1*7c478bd9Sstevel@tonic-gate /* 2*7c478bd9Sstevel@tonic-gate * CDDL HEADER START 3*7c478bd9Sstevel@tonic-gate * 4*7c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*7c478bd9Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*7c478bd9Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*7c478bd9Sstevel@tonic-gate * with the License. 8*7c478bd9Sstevel@tonic-gate * 9*7c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*7c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*7c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 12*7c478bd9Sstevel@tonic-gate * and limitations under the License. 13*7c478bd9Sstevel@tonic-gate * 14*7c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*7c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*7c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*7c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*7c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*7c478bd9Sstevel@tonic-gate * 20*7c478bd9Sstevel@tonic-gate * CDDL HEADER END 21*7c478bd9Sstevel@tonic-gate */ 22*7c478bd9Sstevel@tonic-gate /* 23*7c478bd9Sstevel@tonic-gate * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24*7c478bd9Sstevel@tonic-gate * Use is subject to license terms. 25*7c478bd9Sstevel@tonic-gate * 26*7c478bd9Sstevel@tonic-gate * tcp.c, Code implementing the TCP protocol. 
27*7c478bd9Sstevel@tonic-gate */ 28*7c478bd9Sstevel@tonic-gate 29*7c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 30*7c478bd9Sstevel@tonic-gate 31*7c478bd9Sstevel@tonic-gate #include <sys/types.h> 32*7c478bd9Sstevel@tonic-gate #include <socket_impl.h> 33*7c478bd9Sstevel@tonic-gate #include <socket_inet.h> 34*7c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 35*7c478bd9Sstevel@tonic-gate #include <sys/promif.h> 36*7c478bd9Sstevel@tonic-gate #include <sys/socket.h> 37*7c478bd9Sstevel@tonic-gate #include <netinet/in_systm.h> 38*7c478bd9Sstevel@tonic-gate #include <netinet/in.h> 39*7c478bd9Sstevel@tonic-gate #include <netinet/ip.h> 40*7c478bd9Sstevel@tonic-gate #include <netinet/tcp.h> 41*7c478bd9Sstevel@tonic-gate #include <net/if_types.h> 42*7c478bd9Sstevel@tonic-gate #include <sys/salib.h> 43*7c478bd9Sstevel@tonic-gate 44*7c478bd9Sstevel@tonic-gate #include "ipv4.h" 45*7c478bd9Sstevel@tonic-gate #include "ipv4_impl.h" 46*7c478bd9Sstevel@tonic-gate #include "mac.h" 47*7c478bd9Sstevel@tonic-gate #include "mac_impl.h" 48*7c478bd9Sstevel@tonic-gate #include "v4_sum_impl.h" 49*7c478bd9Sstevel@tonic-gate #include <sys/bootdebug.h> 50*7c478bd9Sstevel@tonic-gate #include "tcp_inet.h" 51*7c478bd9Sstevel@tonic-gate #include "tcp_sack.h" 52*7c478bd9Sstevel@tonic-gate #include <inet/common.h> 53*7c478bd9Sstevel@tonic-gate #include <inet/mib2.h> 54*7c478bd9Sstevel@tonic-gate 55*7c478bd9Sstevel@tonic-gate /* 56*7c478bd9Sstevel@tonic-gate * We need to redefine BUMP_MIB/UPDATE_MIB to not have DTrace probes. 
57*7c478bd9Sstevel@tonic-gate */ 58*7c478bd9Sstevel@tonic-gate #undef BUMP_MIB 59*7c478bd9Sstevel@tonic-gate #define BUMP_MIB(x) (x)++ 60*7c478bd9Sstevel@tonic-gate 61*7c478bd9Sstevel@tonic-gate #undef UPDATE_MIB 62*7c478bd9Sstevel@tonic-gate #define UPDATE_MIB(x, y) x += y 63*7c478bd9Sstevel@tonic-gate 64*7c478bd9Sstevel@tonic-gate /* 65*7c478bd9Sstevel@tonic-gate * MIB-2 stuff for SNMP 66*7c478bd9Sstevel@tonic-gate */ 67*7c478bd9Sstevel@tonic-gate mib2_tcp_t tcp_mib; /* SNMP fixed size info */ 68*7c478bd9Sstevel@tonic-gate 69*7c478bd9Sstevel@tonic-gate /* The TCP mib does not include the following errors. */ 70*7c478bd9Sstevel@tonic-gate static uint_t tcp_cksum_errors; 71*7c478bd9Sstevel@tonic-gate static uint_t tcp_drops; 72*7c478bd9Sstevel@tonic-gate 73*7c478bd9Sstevel@tonic-gate /* Macros for timestamp comparisons */ 74*7c478bd9Sstevel@tonic-gate #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 75*7c478bd9Sstevel@tonic-gate #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 76*7c478bd9Sstevel@tonic-gate 77*7c478bd9Sstevel@tonic-gate /* 78*7c478bd9Sstevel@tonic-gate * Parameters for TCP Initial Send Sequence number (ISS) generation. 79*7c478bd9Sstevel@tonic-gate * The ISS is calculated by adding three components: a time component 80*7c478bd9Sstevel@tonic-gate * which grows by 1 every 4096 nanoseconds (versus every 4 microseconds 81*7c478bd9Sstevel@tonic-gate * suggested by RFC 793, page 27); 82*7c478bd9Sstevel@tonic-gate * a per-connection component which grows by 125000 for every new connection; 83*7c478bd9Sstevel@tonic-gate * and an "extra" component that grows by a random amount centered 84*7c478bd9Sstevel@tonic-gate * approximately on 64000. This causes the the ISS generator to cycle every 85*7c478bd9Sstevel@tonic-gate * 4.89 hours if no TCP connections are made, and faster if connections are 86*7c478bd9Sstevel@tonic-gate * made. 
87*7c478bd9Sstevel@tonic-gate */ 88*7c478bd9Sstevel@tonic-gate #define ISS_INCR 250000 89*7c478bd9Sstevel@tonic-gate #define ISS_NSEC_SHT 0 90*7c478bd9Sstevel@tonic-gate 91*7c478bd9Sstevel@tonic-gate static uint32_t tcp_iss_incr_extra; /* Incremented for each connection */ 92*7c478bd9Sstevel@tonic-gate 93*7c478bd9Sstevel@tonic-gate #define TCP_XMIT_LOWATER 4096 94*7c478bd9Sstevel@tonic-gate #define TCP_XMIT_HIWATER 49152 95*7c478bd9Sstevel@tonic-gate #define TCP_RECV_LOWATER 2048 96*7c478bd9Sstevel@tonic-gate #define TCP_RECV_HIWATER 49152 97*7c478bd9Sstevel@tonic-gate 98*7c478bd9Sstevel@tonic-gate /* 99*7c478bd9Sstevel@tonic-gate * PAWS needs a timer for 24 days. This is the number of ms in 24 days 100*7c478bd9Sstevel@tonic-gate */ 101*7c478bd9Sstevel@tonic-gate #define PAWS_TIMEOUT ((uint32_t)(24*24*60*60*1000)) 102*7c478bd9Sstevel@tonic-gate 103*7c478bd9Sstevel@tonic-gate /* 104*7c478bd9Sstevel@tonic-gate * TCP options struct returned from tcp_parse_options. 105*7c478bd9Sstevel@tonic-gate */ 106*7c478bd9Sstevel@tonic-gate typedef struct tcp_opt_s { 107*7c478bd9Sstevel@tonic-gate uint32_t tcp_opt_mss; 108*7c478bd9Sstevel@tonic-gate uint32_t tcp_opt_wscale; 109*7c478bd9Sstevel@tonic-gate uint32_t tcp_opt_ts_val; 110*7c478bd9Sstevel@tonic-gate uint32_t tcp_opt_ts_ecr; 111*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 112*7c478bd9Sstevel@tonic-gate } tcp_opt_t; 113*7c478bd9Sstevel@tonic-gate 114*7c478bd9Sstevel@tonic-gate /* 115*7c478bd9Sstevel@tonic-gate * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 116*7c478bd9Sstevel@tonic-gate */ 117*7c478bd9Sstevel@tonic-gate 118*7c478bd9Sstevel@tonic-gate #ifdef _BIG_ENDIAN 119*7c478bd9Sstevel@tonic-gate #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 120*7c478bd9Sstevel@tonic-gate (TCPOPT_TSTAMP << 8) | 10) 121*7c478bd9Sstevel@tonic-gate #else 122*7c478bd9Sstevel@tonic-gate #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 123*7c478bd9Sstevel@tonic-gate (TCPOPT_NOP 
<< 8) | TCPOPT_NOP) 124*7c478bd9Sstevel@tonic-gate #endif 125*7c478bd9Sstevel@tonic-gate 126*7c478bd9Sstevel@tonic-gate /* 127*7c478bd9Sstevel@tonic-gate * Flags returned from tcp_parse_options. 128*7c478bd9Sstevel@tonic-gate */ 129*7c478bd9Sstevel@tonic-gate #define TCP_OPT_MSS_PRESENT 1 130*7c478bd9Sstevel@tonic-gate #define TCP_OPT_WSCALE_PRESENT 2 131*7c478bd9Sstevel@tonic-gate #define TCP_OPT_TSTAMP_PRESENT 4 132*7c478bd9Sstevel@tonic-gate #define TCP_OPT_SACK_OK_PRESENT 8 133*7c478bd9Sstevel@tonic-gate #define TCP_OPT_SACK_PRESENT 16 134*7c478bd9Sstevel@tonic-gate 135*7c478bd9Sstevel@tonic-gate /* TCP option length */ 136*7c478bd9Sstevel@tonic-gate #define TCPOPT_NOP_LEN 1 137*7c478bd9Sstevel@tonic-gate #define TCPOPT_MAXSEG_LEN 4 138*7c478bd9Sstevel@tonic-gate #define TCPOPT_WS_LEN 3 139*7c478bd9Sstevel@tonic-gate #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) 140*7c478bd9Sstevel@tonic-gate #define TCPOPT_TSTAMP_LEN 10 141*7c478bd9Sstevel@tonic-gate #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) 142*7c478bd9Sstevel@tonic-gate #define TCPOPT_SACK_OK_LEN 2 143*7c478bd9Sstevel@tonic-gate #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) 144*7c478bd9Sstevel@tonic-gate #define TCPOPT_REAL_SACK_LEN 4 145*7c478bd9Sstevel@tonic-gate #define TCPOPT_MAX_SACK_LEN 36 146*7c478bd9Sstevel@tonic-gate #define TCPOPT_HEADER_LEN 2 147*7c478bd9Sstevel@tonic-gate 148*7c478bd9Sstevel@tonic-gate /* TCP cwnd burst factor. 
*/ 149*7c478bd9Sstevel@tonic-gate #define TCP_CWND_INFINITE 65535 150*7c478bd9Sstevel@tonic-gate #define TCP_CWND_SS 3 151*7c478bd9Sstevel@tonic-gate #define TCP_CWND_NORMAL 5 152*7c478bd9Sstevel@tonic-gate 153*7c478bd9Sstevel@tonic-gate /* Named Dispatch Parameter Management Structure */ 154*7c478bd9Sstevel@tonic-gate typedef struct tcpparam_s { 155*7c478bd9Sstevel@tonic-gate uint32_t tcp_param_min; 156*7c478bd9Sstevel@tonic-gate uint32_t tcp_param_max; 157*7c478bd9Sstevel@tonic-gate uint32_t tcp_param_val; 158*7c478bd9Sstevel@tonic-gate char *tcp_param_name; 159*7c478bd9Sstevel@tonic-gate } tcpparam_t; 160*7c478bd9Sstevel@tonic-gate 161*7c478bd9Sstevel@tonic-gate /* Max size IP datagram is 64k - 1 */ 162*7c478bd9Sstevel@tonic-gate #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (struct ip) + \ 163*7c478bd9Sstevel@tonic-gate sizeof (tcph_t))) 164*7c478bd9Sstevel@tonic-gate 165*7c478bd9Sstevel@tonic-gate /* Max of the above */ 166*7c478bd9Sstevel@tonic-gate #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 167*7c478bd9Sstevel@tonic-gate 168*7c478bd9Sstevel@tonic-gate /* Largest TCP port number */ 169*7c478bd9Sstevel@tonic-gate #define TCP_MAX_PORT (64 * 1024 - 1) 170*7c478bd9Sstevel@tonic-gate 171*7c478bd9Sstevel@tonic-gate /* Round up the value to the nearest mss. */ 172*7c478bd9Sstevel@tonic-gate #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) 173*7c478bd9Sstevel@tonic-gate 174*7c478bd9Sstevel@tonic-gate #define MS 1L 175*7c478bd9Sstevel@tonic-gate #define SECONDS (1000 * MS) 176*7c478bd9Sstevel@tonic-gate #define MINUTES (60 * SECONDS) 177*7c478bd9Sstevel@tonic-gate #define HOURS (60 * MINUTES) 178*7c478bd9Sstevel@tonic-gate #define DAYS (24 * HOURS) 179*7c478bd9Sstevel@tonic-gate 180*7c478bd9Sstevel@tonic-gate /* All NDD params in the core TCP became static variables. 
*/ 181*7c478bd9Sstevel@tonic-gate static int tcp_time_wait_interval = 1 * MINUTES; 182*7c478bd9Sstevel@tonic-gate static int tcp_conn_req_max_q = 128; 183*7c478bd9Sstevel@tonic-gate static int tcp_conn_req_max_q0 = 1024; 184*7c478bd9Sstevel@tonic-gate static int tcp_conn_req_min = 1; 185*7c478bd9Sstevel@tonic-gate static int tcp_conn_grace_period = 0 * SECONDS; 186*7c478bd9Sstevel@tonic-gate static int tcp_cwnd_max_ = 1024 * 1024; 187*7c478bd9Sstevel@tonic-gate static int tcp_smallest_nonpriv_port = 1024; 188*7c478bd9Sstevel@tonic-gate static int tcp_ip_abort_cinterval = 3 * MINUTES; 189*7c478bd9Sstevel@tonic-gate static int tcp_ip_abort_linterval = 3 * MINUTES; 190*7c478bd9Sstevel@tonic-gate static int tcp_ip_abort_interval = 8 * MINUTES; 191*7c478bd9Sstevel@tonic-gate static int tcp_ip_notify_cinterval = 10 * SECONDS; 192*7c478bd9Sstevel@tonic-gate static int tcp_ip_notify_interval = 10 * SECONDS; 193*7c478bd9Sstevel@tonic-gate static int tcp_ipv4_ttl = 64; 194*7c478bd9Sstevel@tonic-gate static int tcp_mss_def_ipv4 = 536; 195*7c478bd9Sstevel@tonic-gate static int tcp_mss_max_ipv4 = TCP_MSS_MAX_IPV4; 196*7c478bd9Sstevel@tonic-gate static int tcp_mss_min = 108; 197*7c478bd9Sstevel@tonic-gate static int tcp_naglim_def = (4*1024)-1; 198*7c478bd9Sstevel@tonic-gate static int tcp_rexmit_interval_initial = 3 * SECONDS; 199*7c478bd9Sstevel@tonic-gate static int tcp_rexmit_interval_max = 60 * SECONDS; 200*7c478bd9Sstevel@tonic-gate static int tcp_rexmit_interval_min = 400 * MS; 201*7c478bd9Sstevel@tonic-gate static int tcp_dupack_fast_retransmit = 3; 202*7c478bd9Sstevel@tonic-gate static int tcp_smallest_anon_port = 32 * 1024; 203*7c478bd9Sstevel@tonic-gate static int tcp_largest_anon_port = TCP_MAX_PORT; 204*7c478bd9Sstevel@tonic-gate static int tcp_xmit_lowat = TCP_XMIT_LOWATER; 205*7c478bd9Sstevel@tonic-gate static int tcp_recv_hiwat_minmss = 4; 206*7c478bd9Sstevel@tonic-gate static int tcp_fin_wait_2_flush_interval = 1 * MINUTES; 207*7c478bd9Sstevel@tonic-gate static 
int tcp_max_buf = 1024 * 1024; 208*7c478bd9Sstevel@tonic-gate static int tcp_wscale_always = 1; 209*7c478bd9Sstevel@tonic-gate static int tcp_tstamp_always = 1; 210*7c478bd9Sstevel@tonic-gate static int tcp_tstamp_if_wscale = 1; 211*7c478bd9Sstevel@tonic-gate static int tcp_rexmit_interval_extra = 0; 212*7c478bd9Sstevel@tonic-gate static int tcp_slow_start_after_idle = 2; 213*7c478bd9Sstevel@tonic-gate static int tcp_slow_start_initial = 2; 214*7c478bd9Sstevel@tonic-gate static int tcp_sack_permitted = 2; 215*7c478bd9Sstevel@tonic-gate static int tcp_ecn_permitted = 2; 216*7c478bd9Sstevel@tonic-gate 217*7c478bd9Sstevel@tonic-gate /* Extra room to fit in headers. */ 218*7c478bd9Sstevel@tonic-gate static uint_t tcp_wroff_xtra; 219*7c478bd9Sstevel@tonic-gate 220*7c478bd9Sstevel@tonic-gate /* Hint for next port to try. */ 221*7c478bd9Sstevel@tonic-gate static in_port_t tcp_next_port_to_try = 32*1024; 222*7c478bd9Sstevel@tonic-gate 223*7c478bd9Sstevel@tonic-gate /* 224*7c478bd9Sstevel@tonic-gate * Figure out the value of window scale opton. Note that the rwnd is 225*7c478bd9Sstevel@tonic-gate * ASSUMED to be rounded up to the nearest MSS before the calculation. 226*7c478bd9Sstevel@tonic-gate * We cannot find the scale value and then do a round up of tcp_rwnd 227*7c478bd9Sstevel@tonic-gate * because the scale value may not be correct after that. 
228*7c478bd9Sstevel@tonic-gate */ 229*7c478bd9Sstevel@tonic-gate #define SET_WS_VALUE(tcp) \ 230*7c478bd9Sstevel@tonic-gate { \ 231*7c478bd9Sstevel@tonic-gate int i; \ 232*7c478bd9Sstevel@tonic-gate uint32_t rwnd = (tcp)->tcp_rwnd; \ 233*7c478bd9Sstevel@tonic-gate for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; \ 234*7c478bd9Sstevel@tonic-gate i++, rwnd >>= 1) \ 235*7c478bd9Sstevel@tonic-gate ; \ 236*7c478bd9Sstevel@tonic-gate (tcp)->tcp_rcv_ws = i; \ 237*7c478bd9Sstevel@tonic-gate } 238*7c478bd9Sstevel@tonic-gate 239*7c478bd9Sstevel@tonic-gate /* 240*7c478bd9Sstevel@tonic-gate * Set ECN capable transport (ECT) code point in IP header. 241*7c478bd9Sstevel@tonic-gate * 242*7c478bd9Sstevel@tonic-gate * Note that there are 2 ECT code points '01' and '10', which are called 243*7c478bd9Sstevel@tonic-gate * ECT(1) and ECT(0) respectively. Here we follow the original ECT code 244*7c478bd9Sstevel@tonic-gate * point ECT(0) for TCP as described in RFC 2481. 245*7c478bd9Sstevel@tonic-gate */ 246*7c478bd9Sstevel@tonic-gate #define SET_ECT(tcp, iph) \ 247*7c478bd9Sstevel@tonic-gate if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 248*7c478bd9Sstevel@tonic-gate /* We need to clear the code point first. */ \ 249*7c478bd9Sstevel@tonic-gate ((struct ip *)(iph))->ip_tos &= 0xFC; \ 250*7c478bd9Sstevel@tonic-gate ((struct ip *)(iph))->ip_tos |= IPH_ECN_ECT0; \ 251*7c478bd9Sstevel@tonic-gate } 252*7c478bd9Sstevel@tonic-gate 253*7c478bd9Sstevel@tonic-gate /* 254*7c478bd9Sstevel@tonic-gate * The format argument to pass to tcp_display(). 255*7c478bd9Sstevel@tonic-gate * DISP_PORT_ONLY means that the returned string has only port info. 256*7c478bd9Sstevel@tonic-gate * DISP_ADDR_AND_PORT means that the returned string also contains the 257*7c478bd9Sstevel@tonic-gate * remote and local IP address. 
258*7c478bd9Sstevel@tonic-gate */ 259*7c478bd9Sstevel@tonic-gate #define DISP_PORT_ONLY 1 260*7c478bd9Sstevel@tonic-gate #define DISP_ADDR_AND_PORT 2 261*7c478bd9Sstevel@tonic-gate 262*7c478bd9Sstevel@tonic-gate /* 263*7c478bd9Sstevel@tonic-gate * TCP reassembly macros. We hide starting and ending sequence numbers in 264*7c478bd9Sstevel@tonic-gate * b_next and b_prev of messages on the reassembly queue. The messages are 265*7c478bd9Sstevel@tonic-gate * chained using b_cont. These macros are used in tcp_reass() so we don't 266*7c478bd9Sstevel@tonic-gate * have to see the ugly casts and assignments. 267*7c478bd9Sstevel@tonic-gate */ 268*7c478bd9Sstevel@tonic-gate #define TCP_REASS_SEQ(mp) ((uint32_t)((mp)->b_next)) 269*7c478bd9Sstevel@tonic-gate #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = (mblk_t *)(u)) 270*7c478bd9Sstevel@tonic-gate #define TCP_REASS_END(mp) ((uint32_t)((mp)->b_prev)) 271*7c478bd9Sstevel@tonic-gate #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = (mblk_t *)(u)) 272*7c478bd9Sstevel@tonic-gate 273*7c478bd9Sstevel@tonic-gate #define TCP_TIMER_RESTART(tcp, intvl) \ 274*7c478bd9Sstevel@tonic-gate (tcp)->tcp_rto_timeout = prom_gettime() + intvl; \ 275*7c478bd9Sstevel@tonic-gate (tcp)->tcp_timer_running = B_TRUE; 276*7c478bd9Sstevel@tonic-gate 277*7c478bd9Sstevel@tonic-gate static int tcp_accept_comm(tcp_t *, tcp_t *, mblk_t *, uint_t); 278*7c478bd9Sstevel@tonic-gate static mblk_t *tcp_ack_mp(tcp_t *); 279*7c478bd9Sstevel@tonic-gate static in_port_t tcp_bindi(in_port_t, in_addr_t *, boolean_t, boolean_t); 280*7c478bd9Sstevel@tonic-gate static uint16_t tcp_cksum(uint16_t *, uint32_t); 281*7c478bd9Sstevel@tonic-gate static void tcp_clean_death(int, tcp_t *, int err); 282*7c478bd9Sstevel@tonic-gate static tcp_t *tcp_conn_request(tcp_t *, mblk_t *mp, uint_t, uint_t); 283*7c478bd9Sstevel@tonic-gate static char *tcp_display(tcp_t *, char *, char); 284*7c478bd9Sstevel@tonic-gate static int tcp_drain_input(tcp_t *, int, int); 285*7c478bd9Sstevel@tonic-gate 
static void tcp_drain_needed(int, tcp_t *); 286*7c478bd9Sstevel@tonic-gate static boolean_t tcp_drop_q0(tcp_t *); 287*7c478bd9Sstevel@tonic-gate static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *); 288*7c478bd9Sstevel@tonic-gate static int tcp_header_len(struct inetgram *); 289*7c478bd9Sstevel@tonic-gate static in_port_t tcp_report_ports(uint16_t *, enum Ports); 290*7c478bd9Sstevel@tonic-gate static int tcp_input(int); 291*7c478bd9Sstevel@tonic-gate static void tcp_iss_init(tcp_t *); 292*7c478bd9Sstevel@tonic-gate static tcp_t *tcp_lookup_ipv4(struct ip *, tcpha_t *, int, int *); 293*7c478bd9Sstevel@tonic-gate static tcp_t *tcp_lookup_listener_ipv4(in_addr_t, in_port_t, int *); 294*7c478bd9Sstevel@tonic-gate static int tcp_conn_check(tcp_t *); 295*7c478bd9Sstevel@tonic-gate static int tcp_close(int); 296*7c478bd9Sstevel@tonic-gate static void tcp_close_detached(tcp_t *); 297*7c478bd9Sstevel@tonic-gate static void tcp_eager_cleanup(tcp_t *, boolean_t, int); 298*7c478bd9Sstevel@tonic-gate static void tcp_eager_unlink(tcp_t *); 299*7c478bd9Sstevel@tonic-gate static void tcp_free(tcp_t *); 300*7c478bd9Sstevel@tonic-gate static int tcp_header_init_ipv4(tcp_t *); 301*7c478bd9Sstevel@tonic-gate static void tcp_mss_set(tcp_t *, uint32_t); 302*7c478bd9Sstevel@tonic-gate static int tcp_parse_options(tcph_t *, tcp_opt_t *); 303*7c478bd9Sstevel@tonic-gate static boolean_t tcp_paws_check(tcp_t *, tcph_t *, tcp_opt_t *); 304*7c478bd9Sstevel@tonic-gate static void tcp_process_options(tcp_t *, tcph_t *); 305*7c478bd9Sstevel@tonic-gate static int tcp_random(void); 306*7c478bd9Sstevel@tonic-gate static void tcp_random_init(void); 307*7c478bd9Sstevel@tonic-gate static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); 308*7c478bd9Sstevel@tonic-gate static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); 309*7c478bd9Sstevel@tonic-gate static void tcp_rcv_drain(int sock_id, tcp_t *); 310*7c478bd9Sstevel@tonic-gate static void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t); 
311*7c478bd9Sstevel@tonic-gate static void tcp_rput_data(tcp_t *, mblk_t *, int); 312*7c478bd9Sstevel@tonic-gate static int tcp_rwnd_set(tcp_t *, uint32_t); 313*7c478bd9Sstevel@tonic-gate static int32_t tcp_sack_rxmit(tcp_t *, int); 314*7c478bd9Sstevel@tonic-gate static void tcp_set_cksum(mblk_t *); 315*7c478bd9Sstevel@tonic-gate static void tcp_set_rto(tcp_t *, int32_t); 316*7c478bd9Sstevel@tonic-gate static void tcp_ss_rexmit(tcp_t *, int); 317*7c478bd9Sstevel@tonic-gate static int tcp_state_wait(int, tcp_t *, int); 318*7c478bd9Sstevel@tonic-gate static void tcp_timer(tcp_t *, int); 319*7c478bd9Sstevel@tonic-gate static void tcp_time_wait_append(tcp_t *); 320*7c478bd9Sstevel@tonic-gate static void tcp_time_wait_collector(void); 321*7c478bd9Sstevel@tonic-gate static void tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t, 322*7c478bd9Sstevel@tonic-gate uint32_t, int, tcph_t *, int sock_id); 323*7c478bd9Sstevel@tonic-gate static void tcp_time_wait_remove(tcp_t *); 324*7c478bd9Sstevel@tonic-gate static in_port_t tcp_update_next_port(in_port_t); 325*7c478bd9Sstevel@tonic-gate static int tcp_verify_cksum(mblk_t *); 326*7c478bd9Sstevel@tonic-gate static void tcp_wput_data(tcp_t *, mblk_t *, int); 327*7c478bd9Sstevel@tonic-gate static void tcp_xmit_ctl(char *, tcp_t *, mblk_t *, uint32_t, uint32_t, 328*7c478bd9Sstevel@tonic-gate int, uint_t, int); 329*7c478bd9Sstevel@tonic-gate static void tcp_xmit_early_reset(char *, int, mblk_t *, uint32_t, uint32_t, 330*7c478bd9Sstevel@tonic-gate int, uint_t); 331*7c478bd9Sstevel@tonic-gate static int tcp_xmit_end(tcp_t *, int); 332*7c478bd9Sstevel@tonic-gate static void tcp_xmit_listeners_reset(int, mblk_t *, uint_t); 333*7c478bd9Sstevel@tonic-gate static mblk_t *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *, 334*7c478bd9Sstevel@tonic-gate mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t); 335*7c478bd9Sstevel@tonic-gate static int tcp_init_values(tcp_t *, struct inetboot_socket *); 336*7c478bd9Sstevel@tonic-gate 
337*7c478bd9Sstevel@tonic-gate #if DEBUG > 1 338*7c478bd9Sstevel@tonic-gate #define TCP_DUMP_PACKET(str, mp) \ 339*7c478bd9Sstevel@tonic-gate { \ 340*7c478bd9Sstevel@tonic-gate int len = (mp)->b_wptr - (mp)->b_rptr; \ 341*7c478bd9Sstevel@tonic-gate \ 342*7c478bd9Sstevel@tonic-gate printf("%s: dump TCP(%d): \n", (str), len); \ 343*7c478bd9Sstevel@tonic-gate hexdump((char *)(mp)->b_rptr, len); \ 344*7c478bd9Sstevel@tonic-gate } 345*7c478bd9Sstevel@tonic-gate #else 346*7c478bd9Sstevel@tonic-gate #define TCP_DUMP_PACKET(str, mp) 347*7c478bd9Sstevel@tonic-gate #endif 348*7c478bd9Sstevel@tonic-gate 349*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 350*7c478bd9Sstevel@tonic-gate #define DEBUG_1(str, arg) printf(str, (arg)) 351*7c478bd9Sstevel@tonic-gate #define DEBUG_2(str, arg1, arg2) printf(str, (arg1), (arg2)) 352*7c478bd9Sstevel@tonic-gate #define DEBUG_3(str, arg1, arg2, arg3) printf(str, (arg1), (arg2), (arg3)) 353*7c478bd9Sstevel@tonic-gate #else 354*7c478bd9Sstevel@tonic-gate #define DEBUG_1(str, arg) 355*7c478bd9Sstevel@tonic-gate #define DEBUG_2(str, arg1, arg2) 356*7c478bd9Sstevel@tonic-gate #define DEBUG_3(str, arg1, arg2, arg3) 357*7c478bd9Sstevel@tonic-gate #endif 358*7c478bd9Sstevel@tonic-gate 359*7c478bd9Sstevel@tonic-gate /* Whether it is the first time TCP is used. */ 360*7c478bd9Sstevel@tonic-gate static boolean_t tcp_initialized = B_FALSE; 361*7c478bd9Sstevel@tonic-gate 362*7c478bd9Sstevel@tonic-gate /* TCP time wait list. */ 363*7c478bd9Sstevel@tonic-gate static tcp_t *tcp_time_wait_head; 364*7c478bd9Sstevel@tonic-gate static tcp_t *tcp_time_wait_tail; 365*7c478bd9Sstevel@tonic-gate static uint32_t tcp_cum_timewait; 366*7c478bd9Sstevel@tonic-gate /* When the tcp_time_wait_collector is run. 
*/ 367*7c478bd9Sstevel@tonic-gate static uint32_t tcp_time_wait_runtime; 368*7c478bd9Sstevel@tonic-gate 369*7c478bd9Sstevel@tonic-gate #define TCP_RUN_TIME_WAIT_COLLECTOR() \ 370*7c478bd9Sstevel@tonic-gate if (prom_gettime() > tcp_time_wait_runtime) \ 371*7c478bd9Sstevel@tonic-gate tcp_time_wait_collector(); 372*7c478bd9Sstevel@tonic-gate 373*7c478bd9Sstevel@tonic-gate /* 374*7c478bd9Sstevel@tonic-gate * Accept will return with an error if there is no connection coming in 375*7c478bd9Sstevel@tonic-gate * after this (in ms). 376*7c478bd9Sstevel@tonic-gate */ 377*7c478bd9Sstevel@tonic-gate static int tcp_accept_timeout = 60000; 378*7c478bd9Sstevel@tonic-gate 379*7c478bd9Sstevel@tonic-gate /* 380*7c478bd9Sstevel@tonic-gate * Initialize the TCP-specific parts of a socket. 381*7c478bd9Sstevel@tonic-gate */ 382*7c478bd9Sstevel@tonic-gate void 383*7c478bd9Sstevel@tonic-gate tcp_socket_init(struct inetboot_socket *isp) 384*7c478bd9Sstevel@tonic-gate { 385*7c478bd9Sstevel@tonic-gate /* Do some initializations. */ 386*7c478bd9Sstevel@tonic-gate if (!tcp_initialized) { 387*7c478bd9Sstevel@tonic-gate tcp_random_init(); 388*7c478bd9Sstevel@tonic-gate /* Extra head room for the MAC layer address. */ 389*7c478bd9Sstevel@tonic-gate if ((tcp_wroff_xtra = mac_get_hdr_len()) & 0x3) { 390*7c478bd9Sstevel@tonic-gate tcp_wroff_xtra = (tcp_wroff_xtra & ~0x3) + 0x4; 391*7c478bd9Sstevel@tonic-gate } 392*7c478bd9Sstevel@tonic-gate /* Schedule the first time wait cleanup time */ 393*7c478bd9Sstevel@tonic-gate tcp_time_wait_runtime = prom_gettime() + tcp_time_wait_interval; 394*7c478bd9Sstevel@tonic-gate tcp_initialized = B_TRUE; 395*7c478bd9Sstevel@tonic-gate } 396*7c478bd9Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 397*7c478bd9Sstevel@tonic-gate 398*7c478bd9Sstevel@tonic-gate isp->proto = IPPROTO_TCP; 399*7c478bd9Sstevel@tonic-gate isp->input[TRANSPORT_LVL] = tcp_input; 400*7c478bd9Sstevel@tonic-gate /* Socket layer should call tcp_send() directly. 
*/ 401*7c478bd9Sstevel@tonic-gate isp->output[TRANSPORT_LVL] = NULL; 402*7c478bd9Sstevel@tonic-gate isp->close[TRANSPORT_LVL] = tcp_close; 403*7c478bd9Sstevel@tonic-gate isp->headerlen[TRANSPORT_LVL] = tcp_header_len; 404*7c478bd9Sstevel@tonic-gate isp->ports = tcp_report_ports; 405*7c478bd9Sstevel@tonic-gate if ((isp->pcb = bkmem_alloc(sizeof (tcp_t))) == NULL) { 406*7c478bd9Sstevel@tonic-gate errno = ENOBUFS; 407*7c478bd9Sstevel@tonic-gate return; 408*7c478bd9Sstevel@tonic-gate } 409*7c478bd9Sstevel@tonic-gate if ((errno = tcp_init_values((tcp_t *)isp->pcb, isp)) != 0) { 410*7c478bd9Sstevel@tonic-gate bkmem_free(isp->pcb, sizeof (tcp_t)); 411*7c478bd9Sstevel@tonic-gate return; 412*7c478bd9Sstevel@tonic-gate } 413*7c478bd9Sstevel@tonic-gate /* 414*7c478bd9Sstevel@tonic-gate * This is set last because this field is used to determine if 415*7c478bd9Sstevel@tonic-gate * a socket is in use or not. 416*7c478bd9Sstevel@tonic-gate */ 417*7c478bd9Sstevel@tonic-gate isp->type = INETBOOT_STREAM; 418*7c478bd9Sstevel@tonic-gate } 419*7c478bd9Sstevel@tonic-gate 420*7c478bd9Sstevel@tonic-gate /* 421*7c478bd9Sstevel@tonic-gate * Return the size of a TCP header including TCP option. 
422*7c478bd9Sstevel@tonic-gate */ 423*7c478bd9Sstevel@tonic-gate static int 424*7c478bd9Sstevel@tonic-gate tcp_header_len(struct inetgram *igm) 425*7c478bd9Sstevel@tonic-gate { 426*7c478bd9Sstevel@tonic-gate mblk_t *pkt; 427*7c478bd9Sstevel@tonic-gate int ipvers; 428*7c478bd9Sstevel@tonic-gate 429*7c478bd9Sstevel@tonic-gate /* Just returns the standard TCP header without option */ 430*7c478bd9Sstevel@tonic-gate if (igm == NULL) 431*7c478bd9Sstevel@tonic-gate return (sizeof (tcph_t)); 432*7c478bd9Sstevel@tonic-gate 433*7c478bd9Sstevel@tonic-gate if ((pkt = igm->igm_mp) == NULL) 434*7c478bd9Sstevel@tonic-gate return (0); 435*7c478bd9Sstevel@tonic-gate 436*7c478bd9Sstevel@tonic-gate ipvers = ((struct ip *)pkt->b_rptr)->ip_v; 437*7c478bd9Sstevel@tonic-gate if (ipvers == IPV4_VERSION) { 438*7c478bd9Sstevel@tonic-gate return (TCP_HDR_LENGTH((tcph_t *)(pkt + IPH_HDR_LENGTH(pkt)))); 439*7c478bd9Sstevel@tonic-gate } else { 440*7c478bd9Sstevel@tonic-gate dprintf("tcp_header_len: non-IPv4 packet.\n"); 441*7c478bd9Sstevel@tonic-gate return (0); 442*7c478bd9Sstevel@tonic-gate } 443*7c478bd9Sstevel@tonic-gate } 444*7c478bd9Sstevel@tonic-gate 445*7c478bd9Sstevel@tonic-gate /* 446*7c478bd9Sstevel@tonic-gate * Return the requested port number in network order. 447*7c478bd9Sstevel@tonic-gate */ 448*7c478bd9Sstevel@tonic-gate static in_port_t 449*7c478bd9Sstevel@tonic-gate tcp_report_ports(uint16_t *tcphp, enum Ports request) 450*7c478bd9Sstevel@tonic-gate { 451*7c478bd9Sstevel@tonic-gate if (request == SOURCE) 452*7c478bd9Sstevel@tonic-gate return (*(uint16_t *)(((tcph_t *)tcphp)->th_lport)); 453*7c478bd9Sstevel@tonic-gate return (*(uint16_t *)(((tcph_t *)tcphp)->th_fport)); 454*7c478bd9Sstevel@tonic-gate } 455*7c478bd9Sstevel@tonic-gate 456*7c478bd9Sstevel@tonic-gate /* 457*7c478bd9Sstevel@tonic-gate * Because inetboot is not interrupt driven, TCP can only poll. 
This 458*7c478bd9Sstevel@tonic-gate * means that there can be packets stuck in the NIC buffer waiting to 459*7c478bd9Sstevel@tonic-gate * be processed. Thus we need to drain them before, for example, sending 460*7c478bd9Sstevel@tonic-gate * anything because an ACK may actually be stuck there. 461*7c478bd9Sstevel@tonic-gate * 462*7c478bd9Sstevel@tonic-gate * The timeout arguments determine how long we should wait for draining. 463*7c478bd9Sstevel@tonic-gate */ 464*7c478bd9Sstevel@tonic-gate static int 465*7c478bd9Sstevel@tonic-gate tcp_drain_input(tcp_t *tcp, int sock_id, int timeout) 466*7c478bd9Sstevel@tonic-gate { 467*7c478bd9Sstevel@tonic-gate struct inetgram *in_gram; 468*7c478bd9Sstevel@tonic-gate struct inetgram *old_in_gram; 469*7c478bd9Sstevel@tonic-gate int old_timeout; 470*7c478bd9Sstevel@tonic-gate mblk_t *mp; 471*7c478bd9Sstevel@tonic-gate int i; 472*7c478bd9Sstevel@tonic-gate 473*7c478bd9Sstevel@tonic-gate dprintf("tcp_drain_input(%d): %s\n", sock_id, 474*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 475*7c478bd9Sstevel@tonic-gate 476*7c478bd9Sstevel@tonic-gate /* 477*7c478bd9Sstevel@tonic-gate * Since the driver uses the in_timeout value in the socket 478*7c478bd9Sstevel@tonic-gate * structure to determine the timeout value, we need to save 479*7c478bd9Sstevel@tonic-gate * the original one so that we can restore that after draining. 480*7c478bd9Sstevel@tonic-gate */ 481*7c478bd9Sstevel@tonic-gate old_timeout = sockets[sock_id].in_timeout; 482*7c478bd9Sstevel@tonic-gate sockets[sock_id].in_timeout = timeout; 483*7c478bd9Sstevel@tonic-gate 484*7c478bd9Sstevel@tonic-gate /* 485*7c478bd9Sstevel@tonic-gate * We do this because the input queue may have some user 486*7c478bd9Sstevel@tonic-gate * data already. 
487*7c478bd9Sstevel@tonic-gate */ 488*7c478bd9Sstevel@tonic-gate old_in_gram = sockets[sock_id].inq; 489*7c478bd9Sstevel@tonic-gate sockets[sock_id].inq = NULL; 490*7c478bd9Sstevel@tonic-gate 491*7c478bd9Sstevel@tonic-gate /* Go out and check the wire */ 492*7c478bd9Sstevel@tonic-gate for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) { 493*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].input[i] != NULL) { 494*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].input[i](sock_id) < 0) { 495*7c478bd9Sstevel@tonic-gate sockets[sock_id].in_timeout = old_timeout; 496*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].inq != NULL) 497*7c478bd9Sstevel@tonic-gate nuke_grams(&sockets[sock_id].inq); 498*7c478bd9Sstevel@tonic-gate sockets[sock_id].inq = old_in_gram; 499*7c478bd9Sstevel@tonic-gate return (-1); 500*7c478bd9Sstevel@tonic-gate } 501*7c478bd9Sstevel@tonic-gate } 502*7c478bd9Sstevel@tonic-gate } 503*7c478bd9Sstevel@tonic-gate #if DEBUG 504*7c478bd9Sstevel@tonic-gate printf("tcp_drain_input: done with checking packets\n"); 505*7c478bd9Sstevel@tonic-gate #endif 506*7c478bd9Sstevel@tonic-gate while ((in_gram = sockets[sock_id].inq) != NULL) { 507*7c478bd9Sstevel@tonic-gate /* Remove unknown inetgrams from the head of inq. 
*/ 508*7c478bd9Sstevel@tonic-gate if (in_gram->igm_level != TRANSPORT_LVL) { 509*7c478bd9Sstevel@tonic-gate #if DEBUG 510*7c478bd9Sstevel@tonic-gate printf("tcp_drain_input: unexpected packet " 511*7c478bd9Sstevel@tonic-gate "level %d frame found\n", in_gram->igm_level); 512*7c478bd9Sstevel@tonic-gate #endif 513*7c478bd9Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 514*7c478bd9Sstevel@tonic-gate continue; 515*7c478bd9Sstevel@tonic-gate } 516*7c478bd9Sstevel@tonic-gate mp = in_gram->igm_mp; 517*7c478bd9Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 518*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 519*7c478bd9Sstevel@tonic-gate tcp_rput_data(tcp, mp, sock_id); 520*7c478bd9Sstevel@tonic-gate sockets[sock_id].in_timeout = old_timeout; 521*7c478bd9Sstevel@tonic-gate 522*7c478bd9Sstevel@tonic-gate /* 523*7c478bd9Sstevel@tonic-gate * The other side may have closed this connection or 524*7c478bd9Sstevel@tonic-gate * RST us. But we need to continue to process other 525*7c478bd9Sstevel@tonic-gate * packets in the socket's queue because they may be 526*7c478bd9Sstevel@tonic-gate * belong to another TCP connections. 
527*7c478bd9Sstevel@tonic-gate */ 528*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 529*7c478bd9Sstevel@tonic-gate tcp = NULL; 530*7c478bd9Sstevel@tonic-gate } 531*7c478bd9Sstevel@tonic-gate 532*7c478bd9Sstevel@tonic-gate if (tcp == NULL || sockets[sock_id].pcb == NULL) { 533*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].so_error != 0) 534*7c478bd9Sstevel@tonic-gate return (-1); 535*7c478bd9Sstevel@tonic-gate else 536*7c478bd9Sstevel@tonic-gate return (0); 537*7c478bd9Sstevel@tonic-gate } 538*7c478bd9Sstevel@tonic-gate #if DEBUG 539*7c478bd9Sstevel@tonic-gate printf("tcp_drain_input: done with processing packets\n"); 540*7c478bd9Sstevel@tonic-gate #endif 541*7c478bd9Sstevel@tonic-gate sockets[sock_id].in_timeout = old_timeout; 542*7c478bd9Sstevel@tonic-gate sockets[sock_id].inq = old_in_gram; 543*7c478bd9Sstevel@tonic-gate 544*7c478bd9Sstevel@tonic-gate /* 545*7c478bd9Sstevel@tonic-gate * Data may have been received so indicate it is available 546*7c478bd9Sstevel@tonic-gate */ 547*7c478bd9Sstevel@tonic-gate tcp_drain_needed(sock_id, tcp); 548*7c478bd9Sstevel@tonic-gate return (0); 549*7c478bd9Sstevel@tonic-gate } 550*7c478bd9Sstevel@tonic-gate 551*7c478bd9Sstevel@tonic-gate /* 552*7c478bd9Sstevel@tonic-gate * The receive entry point for upper layer to call to get data. Note 553*7c478bd9Sstevel@tonic-gate * that this follows the current architecture that lower layer receive 554*7c478bd9Sstevel@tonic-gate * routines have been called already. Thus if the inq of socket is 555*7c478bd9Sstevel@tonic-gate * not NULL, the packets must be for us. 
556*7c478bd9Sstevel@tonic-gate */ 557*7c478bd9Sstevel@tonic-gate static int 558*7c478bd9Sstevel@tonic-gate tcp_input(int sock_id) 559*7c478bd9Sstevel@tonic-gate { 560*7c478bd9Sstevel@tonic-gate struct inetgram *in_gram; 561*7c478bd9Sstevel@tonic-gate mblk_t *mp; 562*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 563*7c478bd9Sstevel@tonic-gate 564*7c478bd9Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 565*7c478bd9Sstevel@tonic-gate 566*7c478bd9Sstevel@tonic-gate if ((tcp = sockets[sock_id].pcb) == NULL) 567*7c478bd9Sstevel@tonic-gate return (-1); 568*7c478bd9Sstevel@tonic-gate 569*7c478bd9Sstevel@tonic-gate while ((in_gram = sockets[sock_id].inq) != NULL) { 570*7c478bd9Sstevel@tonic-gate /* Remove unknown inetgrams from the head of inq. */ 571*7c478bd9Sstevel@tonic-gate if (in_gram->igm_level != TRANSPORT_LVL) { 572*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 573*7c478bd9Sstevel@tonic-gate printf("tcp_input: unexpected packet " 574*7c478bd9Sstevel@tonic-gate "level %d frame found\n", in_gram->igm_level); 575*7c478bd9Sstevel@tonic-gate #endif 576*7c478bd9Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 577*7c478bd9Sstevel@tonic-gate continue; 578*7c478bd9Sstevel@tonic-gate } 579*7c478bd9Sstevel@tonic-gate mp = in_gram->igm_mp; 580*7c478bd9Sstevel@tonic-gate del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 581*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 582*7c478bd9Sstevel@tonic-gate tcp_rput_data(tcp, mp, sock_id); 583*7c478bd9Sstevel@tonic-gate /* The TCP may be gone because it gets a RST. */ 584*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 585*7c478bd9Sstevel@tonic-gate return (-1); 586*7c478bd9Sstevel@tonic-gate } 587*7c478bd9Sstevel@tonic-gate 588*7c478bd9Sstevel@tonic-gate /* Flush the receive list. 
*/ 589*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) { 590*7c478bd9Sstevel@tonic-gate tcp_rcv_drain(sock_id, tcp); 591*7c478bd9Sstevel@tonic-gate } else { 592*7c478bd9Sstevel@tonic-gate /* The other side has closed the connection, report this up. */ 593*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_CLOSE_WAIT) { 594*7c478bd9Sstevel@tonic-gate sockets[sock_id].so_state |= SS_CANTRCVMORE; 595*7c478bd9Sstevel@tonic-gate return (0); 596*7c478bd9Sstevel@tonic-gate } 597*7c478bd9Sstevel@tonic-gate } 598*7c478bd9Sstevel@tonic-gate return (0); 599*7c478bd9Sstevel@tonic-gate } 600*7c478bd9Sstevel@tonic-gate 601*7c478bd9Sstevel@tonic-gate /* 602*7c478bd9Sstevel@tonic-gate * The send entry point for upper layer to call to send data. In order 603*7c478bd9Sstevel@tonic-gate * to minimize changes to the core TCP code, we need to put the 604*7c478bd9Sstevel@tonic-gate * data into mblks. 605*7c478bd9Sstevel@tonic-gate */ 606*7c478bd9Sstevel@tonic-gate int 607*7c478bd9Sstevel@tonic-gate tcp_send(int sock_id, tcp_t *tcp, const void *msg, int len) 608*7c478bd9Sstevel@tonic-gate { 609*7c478bd9Sstevel@tonic-gate mblk_t *mp; 610*7c478bd9Sstevel@tonic-gate mblk_t *head = NULL; 611*7c478bd9Sstevel@tonic-gate mblk_t *tail; 612*7c478bd9Sstevel@tonic-gate int mss = tcp->tcp_mss; 613*7c478bd9Sstevel@tonic-gate int cnt = 0; 614*7c478bd9Sstevel@tonic-gate int win_size; 615*7c478bd9Sstevel@tonic-gate char *buf = (char *)msg; 616*7c478bd9Sstevel@tonic-gate 617*7c478bd9Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 618*7c478bd9Sstevel@tonic-gate 619*7c478bd9Sstevel@tonic-gate /* We don't want to append 0 size mblk. 
*/ 620*7c478bd9Sstevel@tonic-gate if (len == 0) 621*7c478bd9Sstevel@tonic-gate return (0); 622*7c478bd9Sstevel@tonic-gate while (len > 0) { 623*7c478bd9Sstevel@tonic-gate if (len < mss) { 624*7c478bd9Sstevel@tonic-gate mss = len; 625*7c478bd9Sstevel@tonic-gate } 626*7c478bd9Sstevel@tonic-gate /* 627*7c478bd9Sstevel@tonic-gate * If we cannot allocate more buffer, stop here and 628*7c478bd9Sstevel@tonic-gate * the number of bytes buffered will be returned. 629*7c478bd9Sstevel@tonic-gate * 630*7c478bd9Sstevel@tonic-gate * Note that we follow the core TCP optimization that 631*7c478bd9Sstevel@tonic-gate * each mblk contains only MSS bytes data. 632*7c478bd9Sstevel@tonic-gate */ 633*7c478bd9Sstevel@tonic-gate if ((mp = allocb(mss + tcp->tcp_ip_hdr_len + 634*7c478bd9Sstevel@tonic-gate TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0)) == NULL) { 635*7c478bd9Sstevel@tonic-gate break; 636*7c478bd9Sstevel@tonic-gate } 637*7c478bd9Sstevel@tonic-gate mp->b_rptr += tcp->tcp_hdr_len + tcp_wroff_xtra; 638*7c478bd9Sstevel@tonic-gate bcopy(buf, mp->b_rptr, mss); 639*7c478bd9Sstevel@tonic-gate mp->b_wptr = mp->b_rptr + mss; 640*7c478bd9Sstevel@tonic-gate buf += mss; 641*7c478bd9Sstevel@tonic-gate cnt += mss; 642*7c478bd9Sstevel@tonic-gate len -= mss; 643*7c478bd9Sstevel@tonic-gate 644*7c478bd9Sstevel@tonic-gate if (head == NULL) { 645*7c478bd9Sstevel@tonic-gate head = mp; 646*7c478bd9Sstevel@tonic-gate tail = mp; 647*7c478bd9Sstevel@tonic-gate } else { 648*7c478bd9Sstevel@tonic-gate tail->b_cont = mp; 649*7c478bd9Sstevel@tonic-gate tail = mp; 650*7c478bd9Sstevel@tonic-gate } 651*7c478bd9Sstevel@tonic-gate } 652*7c478bd9Sstevel@tonic-gate 653*7c478bd9Sstevel@tonic-gate /* 654*7c478bd9Sstevel@tonic-gate * Since inetboot is not interrupt driven, there may be 655*7c478bd9Sstevel@tonic-gate * some ACKs in the MAC's buffer. Drain them first, 656*7c478bd9Sstevel@tonic-gate * otherwise, we may not be able to send. 
657*7c478bd9Sstevel@tonic-gate * 658*7c478bd9Sstevel@tonic-gate * We expect an ACK in two cases: 659*7c478bd9Sstevel@tonic-gate * 660*7c478bd9Sstevel@tonic-gate * 1) We have un-ACK'ed data. 661*7c478bd9Sstevel@tonic-gate * 662*7c478bd9Sstevel@tonic-gate * 2) All ACK's have been received and the sender's window has been 663*7c478bd9Sstevel@tonic-gate * closed. We need an ACK back to open the window so that we can 664*7c478bd9Sstevel@tonic-gate * send. In this case, call tcp_drain_input() if the window size is 665*7c478bd9Sstevel@tonic-gate * less than 2 * MSS. 666*7c478bd9Sstevel@tonic-gate */ 667*7c478bd9Sstevel@tonic-gate 668*7c478bd9Sstevel@tonic-gate /* window size = MIN(swnd, cwnd) - unacked bytes */ 669*7c478bd9Sstevel@tonic-gate win_size = (tcp->tcp_swnd > tcp->tcp_cwnd) ? tcp->tcp_cwnd : 670*7c478bd9Sstevel@tonic-gate tcp->tcp_swnd; 671*7c478bd9Sstevel@tonic-gate win_size -= tcp->tcp_snxt; 672*7c478bd9Sstevel@tonic-gate win_size += tcp->tcp_suna; 673*7c478bd9Sstevel@tonic-gate if (win_size < (2 * tcp->tcp_mss)) 674*7c478bd9Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) 675*7c478bd9Sstevel@tonic-gate return (-1); 676*7c478bd9Sstevel@tonic-gate 677*7c478bd9Sstevel@tonic-gate tcp_wput_data(tcp, head, sock_id); 678*7c478bd9Sstevel@tonic-gate return (cnt); 679*7c478bd9Sstevel@tonic-gate } 680*7c478bd9Sstevel@tonic-gate 681*7c478bd9Sstevel@tonic-gate /* Free up all TCP related stuff */ 682*7c478bd9Sstevel@tonic-gate static void 683*7c478bd9Sstevel@tonic-gate tcp_free(tcp_t *tcp) 684*7c478bd9Sstevel@tonic-gate { 685*7c478bd9Sstevel@tonic-gate if (tcp->tcp_iphc != NULL) { 686*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)tcp->tcp_iphc, tcp->tcp_iphc_len); 687*7c478bd9Sstevel@tonic-gate tcp->tcp_iphc = NULL; 688*7c478bd9Sstevel@tonic-gate } 689*7c478bd9Sstevel@tonic-gate if (tcp->tcp_xmit_head != NULL) { 690*7c478bd9Sstevel@tonic-gate freemsg(tcp->tcp_xmit_head); 691*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_head = NULL; 692*7c478bd9Sstevel@tonic-gate } 
693*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) { 694*7c478bd9Sstevel@tonic-gate freemsg(tcp->tcp_rcv_list); 695*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_list = NULL; 696*7c478bd9Sstevel@tonic-gate } 697*7c478bd9Sstevel@tonic-gate if (tcp->tcp_reass_head != NULL) { 698*7c478bd9Sstevel@tonic-gate freemsg(tcp->tcp_reass_head); 699*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_head = NULL; 700*7c478bd9Sstevel@tonic-gate } 701*7c478bd9Sstevel@tonic-gate if (tcp->tcp_sack_info != NULL) { 702*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)tcp->tcp_sack_info, 703*7c478bd9Sstevel@tonic-gate sizeof (tcp_sack_info_t)); 704*7c478bd9Sstevel@tonic-gate tcp->tcp_sack_info = NULL; 705*7c478bd9Sstevel@tonic-gate } 706*7c478bd9Sstevel@tonic-gate } 707*7c478bd9Sstevel@tonic-gate 708*7c478bd9Sstevel@tonic-gate static void 709*7c478bd9Sstevel@tonic-gate tcp_close_detached(tcp_t *tcp) 710*7c478bd9Sstevel@tonic-gate { 711*7c478bd9Sstevel@tonic-gate if (tcp->tcp_listener != NULL) 712*7c478bd9Sstevel@tonic-gate tcp_eager_unlink(tcp); 713*7c478bd9Sstevel@tonic-gate tcp_free(tcp); 714*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 715*7c478bd9Sstevel@tonic-gate } 716*7c478bd9Sstevel@tonic-gate 717*7c478bd9Sstevel@tonic-gate /* 718*7c478bd9Sstevel@tonic-gate * If we are an eager connection hanging off a listener that hasn't 719*7c478bd9Sstevel@tonic-gate * formally accepted the connection yet, get off his list and blow off 720*7c478bd9Sstevel@tonic-gate * any data that we have accumulated. 
 */
static void
tcp_eager_unlink(tcp_t *tcp)
{
	tcp_t	*listener = tcp->tcp_listener;

	assert(listener != NULL);
	if (tcp->tcp_eager_next_q0 != NULL) {
		assert(tcp->tcp_eager_prev_q0 != NULL);

		/* Remove the eager tcp from q0 (doubly linked). */
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		listener->tcp_conn_req_cnt_q0--;
	} else {
		/* q is singly linked; walk it to find ourselves. */
		tcp_t	**tcpp = &listener->tcp_eager_next_q;
		tcp_t	*prev = NULL;

		for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
			if (tcpp[0] == tcp) {
				if (listener->tcp_eager_last_q == tcp) {
					/*
					 * If we are unlinking the last
					 * element on the list, adjust
					 * tail pointer.  Set tail pointer
					 * to nil when list is empty.
					 */
					assert(tcp->tcp_eager_next_q == NULL);
					if (listener->tcp_eager_last_q ==
					    listener->tcp_eager_next_q) {
						listener->tcp_eager_last_q =
						    NULL;
					} else {
						/*
						 * We won't get here if there
						 * is only one eager in the
						 * list.
						 */
						assert(prev != NULL);
						listener->tcp_eager_last_q =
						    prev;
					}
				}
				/* Splice ourselves out of q. */
				tcpp[0] = tcp->tcp_eager_next_q;
				tcp->tcp_eager_next_q = NULL;
				tcp->tcp_eager_last_q = NULL;
				listener->tcp_conn_req_cnt_q--;
				break;
			}
			prev = tcpp[0];
		}
	}
	tcp->tcp_listener = NULL;
}

/*
 * Reset any eager connection hanging off this listener
 * and then reclaim its resources.
781*7c478bd9Sstevel@tonic-gate */ 782*7c478bd9Sstevel@tonic-gate static void 783*7c478bd9Sstevel@tonic-gate tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only, int sock_id) 784*7c478bd9Sstevel@tonic-gate { 785*7c478bd9Sstevel@tonic-gate tcp_t *eager; 786*7c478bd9Sstevel@tonic-gate 787*7c478bd9Sstevel@tonic-gate if (!q0_only) { 788*7c478bd9Sstevel@tonic-gate /* First cleanup q */ 789*7c478bd9Sstevel@tonic-gate while ((eager = listener->tcp_eager_next_q) != NULL) { 790*7c478bd9Sstevel@tonic-gate assert(listener->tcp_eager_last_q != NULL); 791*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 792*7c478bd9Sstevel@tonic-gate eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, 793*7c478bd9Sstevel@tonic-gate sock_id); 794*7c478bd9Sstevel@tonic-gate tcp_close_detached(eager); 795*7c478bd9Sstevel@tonic-gate } 796*7c478bd9Sstevel@tonic-gate assert(listener->tcp_eager_last_q == NULL); 797*7c478bd9Sstevel@tonic-gate } 798*7c478bd9Sstevel@tonic-gate /* Then cleanup q0 */ 799*7c478bd9Sstevel@tonic-gate while ((eager = listener->tcp_eager_next_q0) != listener) { 800*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 801*7c478bd9Sstevel@tonic-gate eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, sock_id); 802*7c478bd9Sstevel@tonic-gate tcp_close_detached(eager); 803*7c478bd9Sstevel@tonic-gate } 804*7c478bd9Sstevel@tonic-gate } 805*7c478bd9Sstevel@tonic-gate 806*7c478bd9Sstevel@tonic-gate /* 807*7c478bd9Sstevel@tonic-gate * To handle the shutdown request. 
Called from shutdown() 808*7c478bd9Sstevel@tonic-gate */ 809*7c478bd9Sstevel@tonic-gate int 810*7c478bd9Sstevel@tonic-gate tcp_shutdown(int sock_id) 811*7c478bd9Sstevel@tonic-gate { 812*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 813*7c478bd9Sstevel@tonic-gate 814*7c478bd9Sstevel@tonic-gate DEBUG_1("tcp_shutdown: sock_id %x\n", sock_id); 815*7c478bd9Sstevel@tonic-gate 816*7c478bd9Sstevel@tonic-gate if ((tcp = sockets[sock_id].pcb) == NULL) { 817*7c478bd9Sstevel@tonic-gate return (-1); 818*7c478bd9Sstevel@tonic-gate } 819*7c478bd9Sstevel@tonic-gate 820*7c478bd9Sstevel@tonic-gate /* 821*7c478bd9Sstevel@tonic-gate * Since inetboot is not interrupt driven, there may be 822*7c478bd9Sstevel@tonic-gate * some ACKs in the MAC's buffer. Drain them first, 823*7c478bd9Sstevel@tonic-gate * otherwise, we may not be able to send. 824*7c478bd9Sstevel@tonic-gate */ 825*7c478bd9Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) { 826*7c478bd9Sstevel@tonic-gate /* 827*7c478bd9Sstevel@tonic-gate * If we return now without freeing TCP, there will be 828*7c478bd9Sstevel@tonic-gate * a memory leak. 
829*7c478bd9Sstevel@tonic-gate */ 830*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb != NULL) 831*7c478bd9Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, 0); 832*7c478bd9Sstevel@tonic-gate return (-1); 833*7c478bd9Sstevel@tonic-gate } 834*7c478bd9Sstevel@tonic-gate 835*7c478bd9Sstevel@tonic-gate DEBUG_1("tcp_shutdown: tcp_state %x\n", tcp->tcp_state); 836*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 837*7c478bd9Sstevel@tonic-gate 838*7c478bd9Sstevel@tonic-gate case TCPS_SYN_RCVD: 839*7c478bd9Sstevel@tonic-gate /* 840*7c478bd9Sstevel@tonic-gate * Shutdown during the connect 3-way handshake 841*7c478bd9Sstevel@tonic-gate */ 842*7c478bd9Sstevel@tonic-gate case TCPS_ESTABLISHED: 843*7c478bd9Sstevel@tonic-gate /* 844*7c478bd9Sstevel@tonic-gate * Transmit the FIN 845*7c478bd9Sstevel@tonic-gate * wait for the FIN to be ACKed, 846*7c478bd9Sstevel@tonic-gate * then remain in FIN_WAIT_2 847*7c478bd9Sstevel@tonic-gate */ 848*7c478bd9Sstevel@tonic-gate dprintf("tcp_shutdown: sending fin\n"); 849*7c478bd9Sstevel@tonic-gate if (tcp_xmit_end(tcp, sock_id) == 0 && 850*7c478bd9Sstevel@tonic-gate tcp_state_wait(sock_id, tcp, TCPS_FIN_WAIT_2) < 0) { 851*7c478bd9Sstevel@tonic-gate /* During the wait, TCP may be gone... 
*/ 852*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 853*7c478bd9Sstevel@tonic-gate return (-1); 854*7c478bd9Sstevel@tonic-gate } 855*7c478bd9Sstevel@tonic-gate dprintf("tcp_shutdown: done\n"); 856*7c478bd9Sstevel@tonic-gate break; 857*7c478bd9Sstevel@tonic-gate 858*7c478bd9Sstevel@tonic-gate default: 859*7c478bd9Sstevel@tonic-gate break; 860*7c478bd9Sstevel@tonic-gate 861*7c478bd9Sstevel@tonic-gate } 862*7c478bd9Sstevel@tonic-gate return (0); 863*7c478bd9Sstevel@tonic-gate } 864*7c478bd9Sstevel@tonic-gate 865*7c478bd9Sstevel@tonic-gate /* To handle closing of the socket */ 866*7c478bd9Sstevel@tonic-gate static int 867*7c478bd9Sstevel@tonic-gate tcp_close(int sock_id) 868*7c478bd9Sstevel@tonic-gate { 869*7c478bd9Sstevel@tonic-gate char *msg; 870*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 871*7c478bd9Sstevel@tonic-gate int error = 0; 872*7c478bd9Sstevel@tonic-gate 873*7c478bd9Sstevel@tonic-gate if ((tcp = sockets[sock_id].pcb) == NULL) { 874*7c478bd9Sstevel@tonic-gate return (-1); 875*7c478bd9Sstevel@tonic-gate } 876*7c478bd9Sstevel@tonic-gate 877*7c478bd9Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 878*7c478bd9Sstevel@tonic-gate 879*7c478bd9Sstevel@tonic-gate /* 880*7c478bd9Sstevel@tonic-gate * Since inetboot is not interrupt driven, there may be 881*7c478bd9Sstevel@tonic-gate * some ACKs in the MAC's buffer. Drain them first, 882*7c478bd9Sstevel@tonic-gate * otherwise, we may not be able to send. 883*7c478bd9Sstevel@tonic-gate */ 884*7c478bd9Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) { 885*7c478bd9Sstevel@tonic-gate /* 886*7c478bd9Sstevel@tonic-gate * If we return now without freeing TCP, there will be 887*7c478bd9Sstevel@tonic-gate * a memory leak. 
888*7c478bd9Sstevel@tonic-gate */ 889*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb != NULL) 890*7c478bd9Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, 0); 891*7c478bd9Sstevel@tonic-gate return (-1); 892*7c478bd9Sstevel@tonic-gate } 893*7c478bd9Sstevel@tonic-gate 894*7c478bd9Sstevel@tonic-gate if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 895*7c478bd9Sstevel@tonic-gate /* Cleanup for listener */ 896*7c478bd9Sstevel@tonic-gate tcp_eager_cleanup(tcp, 0, sock_id); 897*7c478bd9Sstevel@tonic-gate } 898*7c478bd9Sstevel@tonic-gate 899*7c478bd9Sstevel@tonic-gate msg = NULL; 900*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 901*7c478bd9Sstevel@tonic-gate case TCPS_CLOSED: 902*7c478bd9Sstevel@tonic-gate case TCPS_IDLE: 903*7c478bd9Sstevel@tonic-gate case TCPS_BOUND: 904*7c478bd9Sstevel@tonic-gate case TCPS_LISTEN: 905*7c478bd9Sstevel@tonic-gate break; 906*7c478bd9Sstevel@tonic-gate case TCPS_SYN_SENT: 907*7c478bd9Sstevel@tonic-gate msg = "tcp_close, during connect"; 908*7c478bd9Sstevel@tonic-gate break; 909*7c478bd9Sstevel@tonic-gate case TCPS_SYN_RCVD: 910*7c478bd9Sstevel@tonic-gate /* 911*7c478bd9Sstevel@tonic-gate * Close during the connect 3-way handshake 912*7c478bd9Sstevel@tonic-gate * but here there may or may not be pending data 913*7c478bd9Sstevel@tonic-gate * already on queue. Process almost same as in 914*7c478bd9Sstevel@tonic-gate * the ESTABLISHED state. 915*7c478bd9Sstevel@tonic-gate */ 916*7c478bd9Sstevel@tonic-gate /* FALLTHRU */ 917*7c478bd9Sstevel@tonic-gate default: 918*7c478bd9Sstevel@tonic-gate /* 919*7c478bd9Sstevel@tonic-gate * If SO_LINGER has set a zero linger time, abort the 920*7c478bd9Sstevel@tonic-gate * connection with a reset. 
921*7c478bd9Sstevel@tonic-gate */ 922*7c478bd9Sstevel@tonic-gate if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { 923*7c478bd9Sstevel@tonic-gate msg = "tcp_close, zero lingertime"; 924*7c478bd9Sstevel@tonic-gate break; 925*7c478bd9Sstevel@tonic-gate } 926*7c478bd9Sstevel@tonic-gate 927*7c478bd9Sstevel@tonic-gate /* 928*7c478bd9Sstevel@tonic-gate * Abort connection if there is unread data queued. 929*7c478bd9Sstevel@tonic-gate */ 930*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL || 931*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_head != NULL) { 932*7c478bd9Sstevel@tonic-gate msg = "tcp_close, unread data"; 933*7c478bd9Sstevel@tonic-gate break; 934*7c478bd9Sstevel@tonic-gate } 935*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state <= TCPS_LISTEN) 936*7c478bd9Sstevel@tonic-gate break; 937*7c478bd9Sstevel@tonic-gate 938*7c478bd9Sstevel@tonic-gate /* 939*7c478bd9Sstevel@tonic-gate * Transmit the FIN before detaching the tcp_t. 940*7c478bd9Sstevel@tonic-gate * After tcp_detach returns this queue/perimeter 941*7c478bd9Sstevel@tonic-gate * no longer owns the tcp_t thus others can modify it. 942*7c478bd9Sstevel@tonic-gate * The TCP could be closed in tcp_state_wait called by 943*7c478bd9Sstevel@tonic-gate * tcp_wput_data called by tcp_xmit_end. 944*7c478bd9Sstevel@tonic-gate */ 945*7c478bd9Sstevel@tonic-gate (void) tcp_xmit_end(tcp, sock_id); 946*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 947*7c478bd9Sstevel@tonic-gate return (0); 948*7c478bd9Sstevel@tonic-gate 949*7c478bd9Sstevel@tonic-gate /* 950*7c478bd9Sstevel@tonic-gate * If lingering on close then wait until the fin is acked, 951*7c478bd9Sstevel@tonic-gate * the SO_LINGER time passes, or a reset is sent/received. 
952*7c478bd9Sstevel@tonic-gate */ 953*7c478bd9Sstevel@tonic-gate if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 954*7c478bd9Sstevel@tonic-gate !(tcp->tcp_fin_acked) && 955*7c478bd9Sstevel@tonic-gate tcp->tcp_state >= TCPS_ESTABLISHED) { 956*7c478bd9Sstevel@tonic-gate uint32_t stoptime; /* in ms */ 957*7c478bd9Sstevel@tonic-gate 958*7c478bd9Sstevel@tonic-gate tcp->tcp_client_errno = 0; 959*7c478bd9Sstevel@tonic-gate stoptime = prom_gettime() + 960*7c478bd9Sstevel@tonic-gate (tcp->tcp_lingertime * 1000); 961*7c478bd9Sstevel@tonic-gate while (!(tcp->tcp_fin_acked) && 962*7c478bd9Sstevel@tonic-gate tcp->tcp_state >= TCPS_ESTABLISHED && 963*7c478bd9Sstevel@tonic-gate tcp->tcp_client_errno == 0 && 964*7c478bd9Sstevel@tonic-gate ((int32_t)(stoptime - prom_gettime()) > 0)) { 965*7c478bd9Sstevel@tonic-gate if (tcp_drain_input(tcp, sock_id, 5) < 0) { 966*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb != NULL) { 967*7c478bd9Sstevel@tonic-gate tcp_clean_death(sock_id, 968*7c478bd9Sstevel@tonic-gate tcp, 0); 969*7c478bd9Sstevel@tonic-gate } 970*7c478bd9Sstevel@tonic-gate return (-1); 971*7c478bd9Sstevel@tonic-gate } 972*7c478bd9Sstevel@tonic-gate } 973*7c478bd9Sstevel@tonic-gate tcp->tcp_client_errno = 0; 974*7c478bd9Sstevel@tonic-gate } 975*7c478bd9Sstevel@tonic-gate if (tcp_state_wait(sock_id, tcp, TCPS_TIME_WAIT) < 0) { 976*7c478bd9Sstevel@tonic-gate /* During the wait, TCP may be gone... */ 977*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 978*7c478bd9Sstevel@tonic-gate return (0); 979*7c478bd9Sstevel@tonic-gate msg = "tcp_close, couldn't detach"; 980*7c478bd9Sstevel@tonic-gate } else { 981*7c478bd9Sstevel@tonic-gate return (0); 982*7c478bd9Sstevel@tonic-gate } 983*7c478bd9Sstevel@tonic-gate break; 984*7c478bd9Sstevel@tonic-gate } 985*7c478bd9Sstevel@tonic-gate 986*7c478bd9Sstevel@tonic-gate /* Something went wrong... 
Send a RST and report the error */ 987*7c478bd9Sstevel@tonic-gate if (msg != NULL) { 988*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_ESTABLISHED || 989*7c478bd9Sstevel@tonic-gate tcp->tcp_state == TCPS_CLOSE_WAIT) 990*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpEstabResets); 991*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_SYN_SENT || 992*7c478bd9Sstevel@tonic-gate tcp->tcp_state == TCPS_SYN_RCVD) 993*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpAttemptFails); 994*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl(msg, tcp, NULL, tcp->tcp_snxt, 0, TH_RST, 0, 995*7c478bd9Sstevel@tonic-gate sock_id); 996*7c478bd9Sstevel@tonic-gate } 997*7c478bd9Sstevel@tonic-gate 998*7c478bd9Sstevel@tonic-gate tcp_free(tcp); 999*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 1000*7c478bd9Sstevel@tonic-gate sockets[sock_id].pcb = NULL; 1001*7c478bd9Sstevel@tonic-gate return (error); 1002*7c478bd9Sstevel@tonic-gate } 1003*7c478bd9Sstevel@tonic-gate 1004*7c478bd9Sstevel@tonic-gate /* To make an endpoint a listener. */ 1005*7c478bd9Sstevel@tonic-gate int 1006*7c478bd9Sstevel@tonic-gate tcp_listen(int sock_id, int backlog) 1007*7c478bd9Sstevel@tonic-gate { 1008*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 1009*7c478bd9Sstevel@tonic-gate 1010*7c478bd9Sstevel@tonic-gate if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) { 1011*7c478bd9Sstevel@tonic-gate errno = EINVAL; 1012*7c478bd9Sstevel@tonic-gate return (-1); 1013*7c478bd9Sstevel@tonic-gate } 1014*7c478bd9Sstevel@tonic-gate /* We allow calling listen() multiple times to change the backlog. */ 1015*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state > TCPS_LISTEN || tcp->tcp_state < TCPS_BOUND) { 1016*7c478bd9Sstevel@tonic-gate errno = EOPNOTSUPP; 1017*7c478bd9Sstevel@tonic-gate return (-1); 1018*7c478bd9Sstevel@tonic-gate } 1019*7c478bd9Sstevel@tonic-gate /* The following initialization should only be done once. 
*/ 1020*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state != TCPS_LISTEN) { 1021*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 1022*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q = NULL; 1023*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_LISTEN; 1024*7c478bd9Sstevel@tonic-gate tcp->tcp_second_ctimer_threshold = tcp_ip_abort_linterval; 1025*7c478bd9Sstevel@tonic-gate } 1026*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_conn_req_max = backlog) > tcp_conn_req_max_q) { 1027*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_max = tcp_conn_req_max_q; 1028*7c478bd9Sstevel@tonic-gate } 1029*7c478bd9Sstevel@tonic-gate if (tcp->tcp_conn_req_max < tcp_conn_req_min) { 1030*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_max = tcp_conn_req_min; 1031*7c478bd9Sstevel@tonic-gate } 1032*7c478bd9Sstevel@tonic-gate return (0); 1033*7c478bd9Sstevel@tonic-gate } 1034*7c478bd9Sstevel@tonic-gate 1035*7c478bd9Sstevel@tonic-gate /* To accept connections. */ 1036*7c478bd9Sstevel@tonic-gate int 1037*7c478bd9Sstevel@tonic-gate tcp_accept(int sock_id, struct sockaddr *addr, socklen_t *addr_len) 1038*7c478bd9Sstevel@tonic-gate { 1039*7c478bd9Sstevel@tonic-gate tcp_t *listener; 1040*7c478bd9Sstevel@tonic-gate tcp_t *eager; 1041*7c478bd9Sstevel@tonic-gate int sd, new_sock_id; 1042*7c478bd9Sstevel@tonic-gate struct sockaddr_in *new_addr = (struct sockaddr_in *)addr; 1043*7c478bd9Sstevel@tonic-gate int timeout; 1044*7c478bd9Sstevel@tonic-gate 1045*7c478bd9Sstevel@tonic-gate /* Sanity check. 
*/ 1046*7c478bd9Sstevel@tonic-gate if ((listener = (tcp_t *)(sockets[sock_id].pcb)) == NULL || 1047*7c478bd9Sstevel@tonic-gate new_addr == NULL || addr_len == NULL || 1048*7c478bd9Sstevel@tonic-gate *addr_len < sizeof (struct sockaddr_in) || 1049*7c478bd9Sstevel@tonic-gate listener->tcp_state != TCPS_LISTEN) { 1050*7c478bd9Sstevel@tonic-gate errno = EINVAL; 1051*7c478bd9Sstevel@tonic-gate return (-1); 1052*7c478bd9Sstevel@tonic-gate } 1053*7c478bd9Sstevel@tonic-gate 1054*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].in_timeout > tcp_accept_timeout) 1055*7c478bd9Sstevel@tonic-gate timeout = prom_gettime() + sockets[sock_id].in_timeout; 1056*7c478bd9Sstevel@tonic-gate else 1057*7c478bd9Sstevel@tonic-gate timeout = prom_gettime() + tcp_accept_timeout; 1058*7c478bd9Sstevel@tonic-gate while (listener->tcp_eager_next_q == NULL && 1059*7c478bd9Sstevel@tonic-gate timeout > prom_gettime()) { 1060*7c478bd9Sstevel@tonic-gate #if DEBUG 1061*7c478bd9Sstevel@tonic-gate printf("tcp_accept: Waiting in tcp_accept()\n"); 1062*7c478bd9Sstevel@tonic-gate #endif 1063*7c478bd9Sstevel@tonic-gate if (tcp_drain_input(listener, sock_id, 5) < 0) { 1064*7c478bd9Sstevel@tonic-gate return (-1); 1065*7c478bd9Sstevel@tonic-gate } 1066*7c478bd9Sstevel@tonic-gate } 1067*7c478bd9Sstevel@tonic-gate /* If there is an eager, don't timeout... */ 1068*7c478bd9Sstevel@tonic-gate if (timeout <= prom_gettime() && listener->tcp_eager_next_q == NULL) { 1069*7c478bd9Sstevel@tonic-gate #if DEBUG 1070*7c478bd9Sstevel@tonic-gate printf("tcp_accept: timeout\n"); 1071*7c478bd9Sstevel@tonic-gate #endif 1072*7c478bd9Sstevel@tonic-gate errno = ETIMEDOUT; 1073*7c478bd9Sstevel@tonic-gate return (-1); 1074*7c478bd9Sstevel@tonic-gate } 1075*7c478bd9Sstevel@tonic-gate #if DEBUG 1076*7c478bd9Sstevel@tonic-gate printf("tcp_accept: got a connection\n"); 1077*7c478bd9Sstevel@tonic-gate #endif 1078*7c478bd9Sstevel@tonic-gate 1079*7c478bd9Sstevel@tonic-gate /* Now create the socket for this new TCP. 
*/ 1080*7c478bd9Sstevel@tonic-gate if ((sd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 1081*7c478bd9Sstevel@tonic-gate return (-1); 1082*7c478bd9Sstevel@tonic-gate } 1083*7c478bd9Sstevel@tonic-gate if ((new_sock_id = so_check_fd(sd, &errno)) == -1) 1084*7c478bd9Sstevel@tonic-gate /* This should not happen! */ 1085*7c478bd9Sstevel@tonic-gate prom_panic("so_check_fd() fails in tcp_accept()"); 1086*7c478bd9Sstevel@tonic-gate /* Free the TCP PCB in the original socket. */ 1087*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)(sockets[new_sock_id].pcb), sizeof (tcp_t)); 1088*7c478bd9Sstevel@tonic-gate /* Dequeue the eager and attach it to the socket. */ 1089*7c478bd9Sstevel@tonic-gate eager = listener->tcp_eager_next_q; 1090*7c478bd9Sstevel@tonic-gate listener->tcp_eager_next_q = eager->tcp_eager_next_q; 1091*7c478bd9Sstevel@tonic-gate if (listener->tcp_eager_last_q == eager) 1092*7c478bd9Sstevel@tonic-gate listener->tcp_eager_last_q = NULL; 1093*7c478bd9Sstevel@tonic-gate eager->tcp_eager_next_q = NULL; 1094*7c478bd9Sstevel@tonic-gate sockets[new_sock_id].pcb = eager; 1095*7c478bd9Sstevel@tonic-gate listener->tcp_conn_req_cnt_q--; 1096*7c478bd9Sstevel@tonic-gate 1097*7c478bd9Sstevel@tonic-gate /* Copy in the address info. */ 1098*7c478bd9Sstevel@tonic-gate bcopy(&eager->tcp_remote, &new_addr->sin_addr.s_addr, 1099*7c478bd9Sstevel@tonic-gate sizeof (in_addr_t)); 1100*7c478bd9Sstevel@tonic-gate bcopy(&eager->tcp_fport, &new_addr->sin_port, sizeof (in_port_t)); 1101*7c478bd9Sstevel@tonic-gate new_addr->sin_family = AF_INET; 1102*7c478bd9Sstevel@tonic-gate 1103*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 1104*7c478bd9Sstevel@tonic-gate printf("tcp_accept(), new sock_id: %d\n", sd); 1105*7c478bd9Sstevel@tonic-gate #endif 1106*7c478bd9Sstevel@tonic-gate return (sd); 1107*7c478bd9Sstevel@tonic-gate } 1108*7c478bd9Sstevel@tonic-gate 1109*7c478bd9Sstevel@tonic-gate /* Update the next anonymous port to use. 
*/ 1110*7c478bd9Sstevel@tonic-gate static in_port_t 1111*7c478bd9Sstevel@tonic-gate tcp_update_next_port(in_port_t port) 1112*7c478bd9Sstevel@tonic-gate { 1113*7c478bd9Sstevel@tonic-gate /* Don't allow the port to fall out of the anonymous port range. */ 1114*7c478bd9Sstevel@tonic-gate if (port < tcp_smallest_anon_port || port > tcp_largest_anon_port) 1115*7c478bd9Sstevel@tonic-gate port = (in_port_t)tcp_smallest_anon_port; 1116*7c478bd9Sstevel@tonic-gate 1117*7c478bd9Sstevel@tonic-gate if (port < tcp_smallest_nonpriv_port) 1118*7c478bd9Sstevel@tonic-gate port = (in_port_t)tcp_smallest_nonpriv_port; 1119*7c478bd9Sstevel@tonic-gate return (port); 1120*7c478bd9Sstevel@tonic-gate } 1121*7c478bd9Sstevel@tonic-gate 1122*7c478bd9Sstevel@tonic-gate /* To check whether a bind to a port is allowed. */ 1123*7c478bd9Sstevel@tonic-gate static in_port_t 1124*7c478bd9Sstevel@tonic-gate tcp_bindi(in_port_t port, in_addr_t *addr, boolean_t reuseaddr, 1125*7c478bd9Sstevel@tonic-gate boolean_t bind_to_req_port_only) 1126*7c478bd9Sstevel@tonic-gate { 1127*7c478bd9Sstevel@tonic-gate int i, count; 1128*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 1129*7c478bd9Sstevel@tonic-gate 1130*7c478bd9Sstevel@tonic-gate count = tcp_largest_anon_port - tcp_smallest_anon_port; 1131*7c478bd9Sstevel@tonic-gate try_again: 1132*7c478bd9Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1133*7c478bd9Sstevel@tonic-gate if (sockets[i].type != INETBOOT_STREAM || 1134*7c478bd9Sstevel@tonic-gate ((tcp = (tcp_t *)sockets[i].pcb) == NULL) || 1135*7c478bd9Sstevel@tonic-gate ntohs(tcp->tcp_lport) != port) { 1136*7c478bd9Sstevel@tonic-gate continue; 1137*7c478bd9Sstevel@tonic-gate } 1138*7c478bd9Sstevel@tonic-gate /* 1139*7c478bd9Sstevel@tonic-gate * Both TCPs have the same port. If SO_REUSEDADDR is 1140*7c478bd9Sstevel@tonic-gate * set and the bound TCP has a state greater than 1141*7c478bd9Sstevel@tonic-gate * TCPS_LISTEN, it is fine. 
1142*7c478bd9Sstevel@tonic-gate */ 1143*7c478bd9Sstevel@tonic-gate if (reuseaddr && tcp->tcp_state > TCPS_LISTEN) { 1144*7c478bd9Sstevel@tonic-gate continue; 1145*7c478bd9Sstevel@tonic-gate } 1146*7c478bd9Sstevel@tonic-gate if (tcp->tcp_bound_source != INADDR_ANY && 1147*7c478bd9Sstevel@tonic-gate *addr != INADDR_ANY && 1148*7c478bd9Sstevel@tonic-gate tcp->tcp_bound_source != *addr) { 1149*7c478bd9Sstevel@tonic-gate continue; 1150*7c478bd9Sstevel@tonic-gate } 1151*7c478bd9Sstevel@tonic-gate if (bind_to_req_port_only) { 1152*7c478bd9Sstevel@tonic-gate return (0); 1153*7c478bd9Sstevel@tonic-gate } 1154*7c478bd9Sstevel@tonic-gate if (--count > 0) { 1155*7c478bd9Sstevel@tonic-gate port = tcp_update_next_port(++port); 1156*7c478bd9Sstevel@tonic-gate goto try_again; 1157*7c478bd9Sstevel@tonic-gate } else { 1158*7c478bd9Sstevel@tonic-gate return (0); 1159*7c478bd9Sstevel@tonic-gate } 1160*7c478bd9Sstevel@tonic-gate } 1161*7c478bd9Sstevel@tonic-gate return (port); 1162*7c478bd9Sstevel@tonic-gate } 1163*7c478bd9Sstevel@tonic-gate 1164*7c478bd9Sstevel@tonic-gate /* To handle the bind request. */ 1165*7c478bd9Sstevel@tonic-gate int 1166*7c478bd9Sstevel@tonic-gate tcp_bind(int sock_id) 1167*7c478bd9Sstevel@tonic-gate { 1168*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 1169*7c478bd9Sstevel@tonic-gate in_port_t requested_port, allocated_port; 1170*7c478bd9Sstevel@tonic-gate boolean_t bind_to_req_port_only; 1171*7c478bd9Sstevel@tonic-gate boolean_t reuseaddr; 1172*7c478bd9Sstevel@tonic-gate 1173*7c478bd9Sstevel@tonic-gate if ((tcp = (tcp_t *)sockets[sock_id].pcb) == NULL) { 1174*7c478bd9Sstevel@tonic-gate errno = EINVAL; 1175*7c478bd9Sstevel@tonic-gate return (-1); 1176*7c478bd9Sstevel@tonic-gate } 1177*7c478bd9Sstevel@tonic-gate 1178*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state >= TCPS_BOUND) { 1179*7c478bd9Sstevel@tonic-gate /* We don't allow multiple bind(). 
*/ 1180*7c478bd9Sstevel@tonic-gate errno = EPROTO; 1181*7c478bd9Sstevel@tonic-gate return (-1); 1182*7c478bd9Sstevel@tonic-gate } 1183*7c478bd9Sstevel@tonic-gate 1184*7c478bd9Sstevel@tonic-gate requested_port = ntohs(sockets[sock_id].bind.sin_port); 1185*7c478bd9Sstevel@tonic-gate 1186*7c478bd9Sstevel@tonic-gate /* The bound source can be INADDR_ANY. */ 1187*7c478bd9Sstevel@tonic-gate tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1188*7c478bd9Sstevel@tonic-gate 1189*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1190*7c478bd9Sstevel@tonic-gate 1191*7c478bd9Sstevel@tonic-gate /* Verify the port is available. */ 1192*7c478bd9Sstevel@tonic-gate if (requested_port == 0) 1193*7c478bd9Sstevel@tonic-gate bind_to_req_port_only = B_FALSE; 1194*7c478bd9Sstevel@tonic-gate else /* T_BIND_REQ and requested_port != 0 */ 1195*7c478bd9Sstevel@tonic-gate bind_to_req_port_only = B_TRUE; 1196*7c478bd9Sstevel@tonic-gate 1197*7c478bd9Sstevel@tonic-gate if (requested_port == 0) { 1198*7c478bd9Sstevel@tonic-gate requested_port = tcp_update_next_port(++tcp_next_port_to_try); 1199*7c478bd9Sstevel@tonic-gate } 1200*7c478bd9Sstevel@tonic-gate reuseaddr = sockets[sock_id].so_opt & SO_REUSEADDR; 1201*7c478bd9Sstevel@tonic-gate allocated_port = tcp_bindi(requested_port, &(tcp->tcp_bound_source), 1202*7c478bd9Sstevel@tonic-gate reuseaddr, bind_to_req_port_only); 1203*7c478bd9Sstevel@tonic-gate 1204*7c478bd9Sstevel@tonic-gate if (allocated_port == 0) { 1205*7c478bd9Sstevel@tonic-gate errno = EADDRINUSE; 1206*7c478bd9Sstevel@tonic-gate return (-1); 1207*7c478bd9Sstevel@tonic-gate } 1208*7c478bd9Sstevel@tonic-gate tcp->tcp_lport = htons(allocated_port); 1209*7c478bd9Sstevel@tonic-gate *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 1210*7c478bd9Sstevel@tonic-gate sockets[sock_id].bind.sin_port = tcp->tcp_lport; 1211*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_BOUND; 1212*7c478bd9Sstevel@tonic-gate return (0); 
1213*7c478bd9Sstevel@tonic-gate } 1214*7c478bd9Sstevel@tonic-gate 1215*7c478bd9Sstevel@tonic-gate /* 1216*7c478bd9Sstevel@tonic-gate * Check for duplicate TCP connections. 1217*7c478bd9Sstevel@tonic-gate */ 1218*7c478bd9Sstevel@tonic-gate static int 1219*7c478bd9Sstevel@tonic-gate tcp_conn_check(tcp_t *tcp) 1220*7c478bd9Sstevel@tonic-gate { 1221*7c478bd9Sstevel@tonic-gate int i; 1222*7c478bd9Sstevel@tonic-gate tcp_t *tmp_tcp; 1223*7c478bd9Sstevel@tonic-gate 1224*7c478bd9Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1225*7c478bd9Sstevel@tonic-gate if (sockets[i].type != INETBOOT_STREAM) 1226*7c478bd9Sstevel@tonic-gate continue; 1227*7c478bd9Sstevel@tonic-gate /* Socket may not be closed but the TCP can be gone. */ 1228*7c478bd9Sstevel@tonic-gate if ((tmp_tcp = (tcp_t *)sockets[i].pcb) == NULL) 1229*7c478bd9Sstevel@tonic-gate continue; 1230*7c478bd9Sstevel@tonic-gate /* We only care about TCP in states later than SYN_SENT. */ 1231*7c478bd9Sstevel@tonic-gate if (tmp_tcp->tcp_state < TCPS_SYN_SENT) 1232*7c478bd9Sstevel@tonic-gate continue; 1233*7c478bd9Sstevel@tonic-gate if (tmp_tcp->tcp_lport != tcp->tcp_lport || 1234*7c478bd9Sstevel@tonic-gate tmp_tcp->tcp_fport != tcp->tcp_fport || 1235*7c478bd9Sstevel@tonic-gate tmp_tcp->tcp_bound_source != tcp->tcp_bound_source || 1236*7c478bd9Sstevel@tonic-gate tmp_tcp->tcp_remote != tcp->tcp_remote) { 1237*7c478bd9Sstevel@tonic-gate continue; 1238*7c478bd9Sstevel@tonic-gate } else { 1239*7c478bd9Sstevel@tonic-gate return (-1); 1240*7c478bd9Sstevel@tonic-gate } 1241*7c478bd9Sstevel@tonic-gate } 1242*7c478bd9Sstevel@tonic-gate return (0); 1243*7c478bd9Sstevel@tonic-gate } 1244*7c478bd9Sstevel@tonic-gate 1245*7c478bd9Sstevel@tonic-gate /* To handle a connect request. 
*/ 1246*7c478bd9Sstevel@tonic-gate int 1247*7c478bd9Sstevel@tonic-gate tcp_connect(int sock_id) 1248*7c478bd9Sstevel@tonic-gate { 1249*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 1250*7c478bd9Sstevel@tonic-gate in_addr_t dstaddr; 1251*7c478bd9Sstevel@tonic-gate in_port_t dstport; 1252*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 1253*7c478bd9Sstevel@tonic-gate int mss; 1254*7c478bd9Sstevel@tonic-gate mblk_t *syn_mp; 1255*7c478bd9Sstevel@tonic-gate 1256*7c478bd9Sstevel@tonic-gate if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) { 1257*7c478bd9Sstevel@tonic-gate errno = EINVAL; 1258*7c478bd9Sstevel@tonic-gate return (-1); 1259*7c478bd9Sstevel@tonic-gate } 1260*7c478bd9Sstevel@tonic-gate 1261*7c478bd9Sstevel@tonic-gate TCP_RUN_TIME_WAIT_COLLECTOR(); 1262*7c478bd9Sstevel@tonic-gate 1263*7c478bd9Sstevel@tonic-gate dstaddr = sockets[sock_id].remote.sin_addr.s_addr; 1264*7c478bd9Sstevel@tonic-gate dstport = sockets[sock_id].remote.sin_port; 1265*7c478bd9Sstevel@tonic-gate 1266*7c478bd9Sstevel@tonic-gate /* 1267*7c478bd9Sstevel@tonic-gate * Check for attempt to connect to INADDR_ANY or non-unicast addrress. 1268*7c478bd9Sstevel@tonic-gate * We don't have enough info to check for broadcast addr, except 1269*7c478bd9Sstevel@tonic-gate * for the all 1 broadcast. 1270*7c478bd9Sstevel@tonic-gate */ 1271*7c478bd9Sstevel@tonic-gate if (dstaddr == INADDR_ANY || IN_CLASSD(ntohl(dstaddr)) || 1272*7c478bd9Sstevel@tonic-gate dstaddr == INADDR_BROADCAST) { 1273*7c478bd9Sstevel@tonic-gate /* 1274*7c478bd9Sstevel@tonic-gate * SunOS 4.x and 4.3 BSD allow an application 1275*7c478bd9Sstevel@tonic-gate * to connect a TCP socket to INADDR_ANY. 1276*7c478bd9Sstevel@tonic-gate * When they do this, the kernel picks the 1277*7c478bd9Sstevel@tonic-gate * address of one interface and uses it 1278*7c478bd9Sstevel@tonic-gate * instead. The kernel usually ends up 1279*7c478bd9Sstevel@tonic-gate * picking the address of the loopback 1280*7c478bd9Sstevel@tonic-gate * interface. 
This is an undocumented feature. 1281*7c478bd9Sstevel@tonic-gate * However, we provide the same thing here 1282*7c478bd9Sstevel@tonic-gate * in order to have source and binary 1283*7c478bd9Sstevel@tonic-gate * compatibility with SunOS 4.x. 1284*7c478bd9Sstevel@tonic-gate * Update the T_CONN_REQ (sin/sin6) since it is used to 1285*7c478bd9Sstevel@tonic-gate * generate the T_CONN_CON. 1286*7c478bd9Sstevel@tonic-gate * 1287*7c478bd9Sstevel@tonic-gate * Fail this for inetboot TCP. 1288*7c478bd9Sstevel@tonic-gate */ 1289*7c478bd9Sstevel@tonic-gate errno = EINVAL; 1290*7c478bd9Sstevel@tonic-gate return (-1); 1291*7c478bd9Sstevel@tonic-gate } 1292*7c478bd9Sstevel@tonic-gate 1293*7c478bd9Sstevel@tonic-gate /* It is not bound to any address yet... */ 1294*7c478bd9Sstevel@tonic-gate if (tcp->tcp_bound_source == INADDR_ANY) { 1295*7c478bd9Sstevel@tonic-gate ipv4_getipaddr(&(sockets[sock_id].bind.sin_addr)); 1296*7c478bd9Sstevel@tonic-gate /* We don't have an address! */ 1297*7c478bd9Sstevel@tonic-gate if (ntohl(sockets[sock_id].bind.sin_addr.s_addr) == 1298*7c478bd9Sstevel@tonic-gate INADDR_ANY) { 1299*7c478bd9Sstevel@tonic-gate errno = EPROTO; 1300*7c478bd9Sstevel@tonic-gate return (-1); 1301*7c478bd9Sstevel@tonic-gate } 1302*7c478bd9Sstevel@tonic-gate tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1303*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1304*7c478bd9Sstevel@tonic-gate } 1305*7c478bd9Sstevel@tonic-gate 1306*7c478bd9Sstevel@tonic-gate /* 1307*7c478bd9Sstevel@tonic-gate * Don't let an endpoint connect to itself. 
1308*7c478bd9Sstevel@tonic-gate */ 1309*7c478bd9Sstevel@tonic-gate if (dstaddr == tcp->tcp_ipha->ip_src.s_addr && 1310*7c478bd9Sstevel@tonic-gate dstport == tcp->tcp_lport) { 1311*7c478bd9Sstevel@tonic-gate errno = EINVAL; 1312*7c478bd9Sstevel@tonic-gate return (-1); 1313*7c478bd9Sstevel@tonic-gate } 1314*7c478bd9Sstevel@tonic-gate 1315*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_dst.s_addr = dstaddr; 1316*7c478bd9Sstevel@tonic-gate tcp->tcp_remote = dstaddr; 1317*7c478bd9Sstevel@tonic-gate tcph = tcp->tcp_tcph; 1318*7c478bd9Sstevel@tonic-gate *(uint16_t *)tcph->th_fport = dstport; 1319*7c478bd9Sstevel@tonic-gate tcp->tcp_fport = dstport; 1320*7c478bd9Sstevel@tonic-gate 1321*7c478bd9Sstevel@tonic-gate /* 1322*7c478bd9Sstevel@tonic-gate * Don't allow this connection to completely duplicate 1323*7c478bd9Sstevel@tonic-gate * an existing connection. 1324*7c478bd9Sstevel@tonic-gate */ 1325*7c478bd9Sstevel@tonic-gate if (tcp_conn_check(tcp) < 0) { 1326*7c478bd9Sstevel@tonic-gate errno = EADDRINUSE; 1327*7c478bd9Sstevel@tonic-gate return (-1); 1328*7c478bd9Sstevel@tonic-gate } 1329*7c478bd9Sstevel@tonic-gate 1330*7c478bd9Sstevel@tonic-gate /* 1331*7c478bd9Sstevel@tonic-gate * Just make sure our rwnd is at 1332*7c478bd9Sstevel@tonic-gate * least tcp_recv_hiwat_mss * MSS 1333*7c478bd9Sstevel@tonic-gate * large, and round up to the nearest 1334*7c478bd9Sstevel@tonic-gate * MSS. 1335*7c478bd9Sstevel@tonic-gate * 1336*7c478bd9Sstevel@tonic-gate * We do the round up here because 1337*7c478bd9Sstevel@tonic-gate * we need to get the interface 1338*7c478bd9Sstevel@tonic-gate * MTU first before we can do the 1339*7c478bd9Sstevel@tonic-gate * round up. 
1340*7c478bd9Sstevel@tonic-gate */ 1341*7c478bd9Sstevel@tonic-gate mss = tcp->tcp_mss - tcp->tcp_hdr_len; 1342*7c478bd9Sstevel@tonic-gate tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 1343*7c478bd9Sstevel@tonic-gate tcp_recv_hiwat_minmss * mss); 1344*7c478bd9Sstevel@tonic-gate tcp->tcp_rwnd_max = tcp->tcp_rwnd; 1345*7c478bd9Sstevel@tonic-gate SET_WS_VALUE(tcp); 1346*7c478bd9Sstevel@tonic-gate U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 1347*7c478bd9Sstevel@tonic-gate tcp->tcp_tcph->th_win); 1348*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 1349*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws_ok = B_TRUE; 1350*7c478bd9Sstevel@tonic-gate 1351*7c478bd9Sstevel@tonic-gate /* 1352*7c478bd9Sstevel@tonic-gate * Set tcp_snd_ts_ok to true 1353*7c478bd9Sstevel@tonic-gate * so that tcp_xmit_mp will 1354*7c478bd9Sstevel@tonic-gate * include the timestamp 1355*7c478bd9Sstevel@tonic-gate * option in the SYN segment. 1356*7c478bd9Sstevel@tonic-gate */ 1357*7c478bd9Sstevel@tonic-gate if (tcp_tstamp_always || 1358*7c478bd9Sstevel@tonic-gate (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 1359*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ts_ok = B_TRUE; 1360*7c478bd9Sstevel@tonic-gate } 1361*7c478bd9Sstevel@tonic-gate 1362*7c478bd9Sstevel@tonic-gate if (tcp_sack_permitted == 2 || 1363*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok) { 1364*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_sack_info == NULL); 1365*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 1366*7c478bd9Sstevel@tonic-gate sizeof (tcp_sack_info_t))) == NULL) { 1367*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_FALSE; 1368*7c478bd9Sstevel@tonic-gate } else { 1369*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_TRUE; 1370*7c478bd9Sstevel@tonic-gate } 1371*7c478bd9Sstevel@tonic-gate } 1372*7c478bd9Sstevel@tonic-gate /* 1373*7c478bd9Sstevel@tonic-gate * Should we use ECN? 
Note that the current 1374*7c478bd9Sstevel@tonic-gate * default value (SunOS 5.9) of tcp_ecn_permitted 1375*7c478bd9Sstevel@tonic-gate * is 2. The reason for doing this is that there 1376*7c478bd9Sstevel@tonic-gate * are equipments out there that will drop ECN 1377*7c478bd9Sstevel@tonic-gate * enabled IP packets. Setting it to 1 avoids 1378*7c478bd9Sstevel@tonic-gate * compatibility problems. 1379*7c478bd9Sstevel@tonic-gate */ 1380*7c478bd9Sstevel@tonic-gate if (tcp_ecn_permitted == 2) 1381*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_ok = B_TRUE; 1382*7c478bd9Sstevel@tonic-gate 1383*7c478bd9Sstevel@tonic-gate tcp_iss_init(tcp); 1384*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 1385*7c478bd9Sstevel@tonic-gate tcp->tcp_active_open = B_TRUE; 1386*7c478bd9Sstevel@tonic-gate 1387*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_SYN_SENT; 1388*7c478bd9Sstevel@tonic-gate syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, tcp->tcp_iss, B_FALSE, 1389*7c478bd9Sstevel@tonic-gate NULL, B_FALSE); 1390*7c478bd9Sstevel@tonic-gate if (syn_mp != NULL) { 1391*7c478bd9Sstevel@tonic-gate int ret; 1392*7c478bd9Sstevel@tonic-gate 1393*7c478bd9Sstevel@tonic-gate /* Dump the packet when debugging. */ 1394*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_connect", syn_mp); 1395*7c478bd9Sstevel@tonic-gate /* Send out the SYN packet. */ 1396*7c478bd9Sstevel@tonic-gate ret = ipv4_tcp_output(sock_id, syn_mp); 1397*7c478bd9Sstevel@tonic-gate freeb(syn_mp); 1398*7c478bd9Sstevel@tonic-gate if (ret < 0) { 1399*7c478bd9Sstevel@tonic-gate return (-1); 1400*7c478bd9Sstevel@tonic-gate } 1401*7c478bd9Sstevel@tonic-gate /* tcp_state_wait() will finish the 3 way handshake. 
*/ 1402*7c478bd9Sstevel@tonic-gate return (tcp_state_wait(sock_id, tcp, TCPS_ESTABLISHED)); 1403*7c478bd9Sstevel@tonic-gate } else { 1404*7c478bd9Sstevel@tonic-gate errno = ENOBUFS; 1405*7c478bd9Sstevel@tonic-gate return (-1); 1406*7c478bd9Sstevel@tonic-gate } 1407*7c478bd9Sstevel@tonic-gate } 1408*7c478bd9Sstevel@tonic-gate 1409*7c478bd9Sstevel@tonic-gate /* 1410*7c478bd9Sstevel@tonic-gate * Common accept code. Called by tcp_conn_request. 1411*7c478bd9Sstevel@tonic-gate * cr_pkt is the SYN packet. 1412*7c478bd9Sstevel@tonic-gate */ 1413*7c478bd9Sstevel@tonic-gate static int 1414*7c478bd9Sstevel@tonic-gate tcp_accept_comm(tcp_t *listener, tcp_t *acceptor, mblk_t *cr_pkt, 1415*7c478bd9Sstevel@tonic-gate uint_t ip_hdr_len) 1416*7c478bd9Sstevel@tonic-gate { 1417*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 1418*7c478bd9Sstevel@tonic-gate 1419*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 1420*7c478bd9Sstevel@tonic-gate printf("tcp_accept_comm #######################\n"); 1421*7c478bd9Sstevel@tonic-gate #endif 1422*7c478bd9Sstevel@tonic-gate 1423*7c478bd9Sstevel@tonic-gate /* 1424*7c478bd9Sstevel@tonic-gate * When we get here, we know that the acceptor header template 1425*7c478bd9Sstevel@tonic-gate * has already been initialized. 1426*7c478bd9Sstevel@tonic-gate * However, it may not match the listener if the listener 1427*7c478bd9Sstevel@tonic-gate * includes options... 1428*7c478bd9Sstevel@tonic-gate * It may also not match the listener if the listener is v6 and 1429*7c478bd9Sstevel@tonic-gate * and the acceptor is v4 1430*7c478bd9Sstevel@tonic-gate */ 1431*7c478bd9Sstevel@tonic-gate acceptor->tcp_lport = listener->tcp_lport; 1432*7c478bd9Sstevel@tonic-gate 1433*7c478bd9Sstevel@tonic-gate if (listener->tcp_ipversion == acceptor->tcp_ipversion) { 1434*7c478bd9Sstevel@tonic-gate if (acceptor->tcp_iphc_len != listener->tcp_iphc_len) { 1435*7c478bd9Sstevel@tonic-gate /* 1436*7c478bd9Sstevel@tonic-gate * Listener had options of some sort; acceptor inherits. 
1437*7c478bd9Sstevel@tonic-gate * Free up the acceptor template and allocate one 1438*7c478bd9Sstevel@tonic-gate * of the right size. 1439*7c478bd9Sstevel@tonic-gate */ 1440*7c478bd9Sstevel@tonic-gate bkmem_free(acceptor->tcp_iphc, acceptor->tcp_iphc_len); 1441*7c478bd9Sstevel@tonic-gate acceptor->tcp_iphc = bkmem_zalloc( 1442*7c478bd9Sstevel@tonic-gate listener->tcp_iphc_len); 1443*7c478bd9Sstevel@tonic-gate if (acceptor->tcp_iphc == NULL) { 1444*7c478bd9Sstevel@tonic-gate acceptor->tcp_iphc_len = 0; 1445*7c478bd9Sstevel@tonic-gate return (ENOMEM); 1446*7c478bd9Sstevel@tonic-gate } 1447*7c478bd9Sstevel@tonic-gate acceptor->tcp_iphc_len = listener->tcp_iphc_len; 1448*7c478bd9Sstevel@tonic-gate } 1449*7c478bd9Sstevel@tonic-gate acceptor->tcp_hdr_len = listener->tcp_hdr_len; 1450*7c478bd9Sstevel@tonic-gate acceptor->tcp_ip_hdr_len = listener->tcp_ip_hdr_len; 1451*7c478bd9Sstevel@tonic-gate acceptor->tcp_tcp_hdr_len = listener->tcp_tcp_hdr_len; 1452*7c478bd9Sstevel@tonic-gate 1453*7c478bd9Sstevel@tonic-gate /* 1454*7c478bd9Sstevel@tonic-gate * Copy the IP+TCP header template from listener to acceptor 1455*7c478bd9Sstevel@tonic-gate */ 1456*7c478bd9Sstevel@tonic-gate bcopy(listener->tcp_iphc, acceptor->tcp_iphc, 1457*7c478bd9Sstevel@tonic-gate listener->tcp_hdr_len); 1458*7c478bd9Sstevel@tonic-gate acceptor->tcp_ipha = (struct ip *)acceptor->tcp_iphc; 1459*7c478bd9Sstevel@tonic-gate acceptor->tcp_tcph = (tcph_t *)(acceptor->tcp_iphc + 1460*7c478bd9Sstevel@tonic-gate acceptor->tcp_ip_hdr_len); 1461*7c478bd9Sstevel@tonic-gate } else { 1462*7c478bd9Sstevel@tonic-gate prom_panic("tcp_accept_comm: version not equal"); 1463*7c478bd9Sstevel@tonic-gate } 1464*7c478bd9Sstevel@tonic-gate 1465*7c478bd9Sstevel@tonic-gate /* Copy our new dest and fport from the connection request packet */ 1466*7c478bd9Sstevel@tonic-gate if (acceptor->tcp_ipversion == IPV4_VERSION) { 1467*7c478bd9Sstevel@tonic-gate struct ip *ipha; 1468*7c478bd9Sstevel@tonic-gate 1469*7c478bd9Sstevel@tonic-gate 
ipha = (struct ip *)cr_pkt->b_rptr; 1470*7c478bd9Sstevel@tonic-gate acceptor->tcp_ipha->ip_dst = ipha->ip_src; 1471*7c478bd9Sstevel@tonic-gate acceptor->tcp_remote = ipha->ip_src.s_addr; 1472*7c478bd9Sstevel@tonic-gate acceptor->tcp_ipha->ip_src = ipha->ip_dst; 1473*7c478bd9Sstevel@tonic-gate acceptor->tcp_bound_source = ipha->ip_dst.s_addr; 1474*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)&cr_pkt->b_rptr[ip_hdr_len]; 1475*7c478bd9Sstevel@tonic-gate } else { 1476*7c478bd9Sstevel@tonic-gate prom_panic("tcp_accept_comm: not IPv4"); 1477*7c478bd9Sstevel@tonic-gate } 1478*7c478bd9Sstevel@tonic-gate bcopy(tcph->th_lport, acceptor->tcp_tcph->th_fport, sizeof (in_port_t)); 1479*7c478bd9Sstevel@tonic-gate bcopy(acceptor->tcp_tcph->th_fport, &acceptor->tcp_fport, 1480*7c478bd9Sstevel@tonic-gate sizeof (in_port_t)); 1481*7c478bd9Sstevel@tonic-gate /* 1482*7c478bd9Sstevel@tonic-gate * For an all-port proxy listener, the local port is determined by 1483*7c478bd9Sstevel@tonic-gate * the port number field in the SYN packet. 
1484*7c478bd9Sstevel@tonic-gate */ 1485*7c478bd9Sstevel@tonic-gate if (listener->tcp_lport == 0) { 1486*7c478bd9Sstevel@tonic-gate acceptor->tcp_lport = *(in_port_t *)tcph->th_fport; 1487*7c478bd9Sstevel@tonic-gate bcopy(tcph->th_fport, acceptor->tcp_tcph->th_lport, 1488*7c478bd9Sstevel@tonic-gate sizeof (in_port_t)); 1489*7c478bd9Sstevel@tonic-gate } 1490*7c478bd9Sstevel@tonic-gate /* Inherit various TCP parameters from the listener */ 1491*7c478bd9Sstevel@tonic-gate acceptor->tcp_naglim = listener->tcp_naglim; 1492*7c478bd9Sstevel@tonic-gate acceptor->tcp_first_timer_threshold = 1493*7c478bd9Sstevel@tonic-gate listener->tcp_first_timer_threshold; 1494*7c478bd9Sstevel@tonic-gate acceptor->tcp_second_timer_threshold = 1495*7c478bd9Sstevel@tonic-gate listener->tcp_second_timer_threshold; 1496*7c478bd9Sstevel@tonic-gate 1497*7c478bd9Sstevel@tonic-gate acceptor->tcp_first_ctimer_threshold = 1498*7c478bd9Sstevel@tonic-gate listener->tcp_first_ctimer_threshold; 1499*7c478bd9Sstevel@tonic-gate acceptor->tcp_second_ctimer_threshold = 1500*7c478bd9Sstevel@tonic-gate listener->tcp_second_ctimer_threshold; 1501*7c478bd9Sstevel@tonic-gate 1502*7c478bd9Sstevel@tonic-gate acceptor->tcp_xmit_hiwater = listener->tcp_xmit_hiwater; 1503*7c478bd9Sstevel@tonic-gate 1504*7c478bd9Sstevel@tonic-gate acceptor->tcp_state = TCPS_LISTEN; 1505*7c478bd9Sstevel@tonic-gate tcp_iss_init(acceptor); 1506*7c478bd9Sstevel@tonic-gate 1507*7c478bd9Sstevel@tonic-gate /* Process all TCP options. */ 1508*7c478bd9Sstevel@tonic-gate tcp_process_options(acceptor, tcph); 1509*7c478bd9Sstevel@tonic-gate 1510*7c478bd9Sstevel@tonic-gate /* Is the other end ECN capable? 
*/ 1511*7c478bd9Sstevel@tonic-gate if (tcp_ecn_permitted >= 1 && 1512*7c478bd9Sstevel@tonic-gate (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 1513*7c478bd9Sstevel@tonic-gate acceptor->tcp_ecn_ok = B_TRUE; 1514*7c478bd9Sstevel@tonic-gate } 1515*7c478bd9Sstevel@tonic-gate 1516*7c478bd9Sstevel@tonic-gate /* 1517*7c478bd9Sstevel@tonic-gate * listener->tcp_rq->q_hiwat should be the default window size or a 1518*7c478bd9Sstevel@tonic-gate * window size changed via SO_RCVBUF option. First round up the 1519*7c478bd9Sstevel@tonic-gate * acceptor's tcp_rwnd to the nearest MSS. Then find out the window 1520*7c478bd9Sstevel@tonic-gate * scale option value if needed. Call tcp_rwnd_set() to finish the 1521*7c478bd9Sstevel@tonic-gate * setting. 1522*7c478bd9Sstevel@tonic-gate * 1523*7c478bd9Sstevel@tonic-gate * Note if there is a rpipe metric associated with the remote host, 1524*7c478bd9Sstevel@tonic-gate * we should not inherit receive window size from listener. 1525*7c478bd9Sstevel@tonic-gate */ 1526*7c478bd9Sstevel@tonic-gate acceptor->tcp_rwnd = MSS_ROUNDUP( 1527*7c478bd9Sstevel@tonic-gate (acceptor->tcp_rwnd == 0 ? listener->tcp_rwnd_max : 1528*7c478bd9Sstevel@tonic-gate acceptor->tcp_rwnd), acceptor->tcp_mss); 1529*7c478bd9Sstevel@tonic-gate if (acceptor->tcp_snd_ws_ok) 1530*7c478bd9Sstevel@tonic-gate SET_WS_VALUE(acceptor); 1531*7c478bd9Sstevel@tonic-gate /* 1532*7c478bd9Sstevel@tonic-gate * Note that this is the only place tcp_rwnd_set() is called for 1533*7c478bd9Sstevel@tonic-gate * accepting a connection. We need to call it here instead of 1534*7c478bd9Sstevel@tonic-gate * after the 3-way handshake because we need to tell the other 1535*7c478bd9Sstevel@tonic-gate * side our rwnd in the SYN-ACK segment. 
1536*7c478bd9Sstevel@tonic-gate */ 1537*7c478bd9Sstevel@tonic-gate (void) tcp_rwnd_set(acceptor, acceptor->tcp_rwnd); 1538*7c478bd9Sstevel@tonic-gate 1539*7c478bd9Sstevel@tonic-gate return (0); 1540*7c478bd9Sstevel@tonic-gate } 1541*7c478bd9Sstevel@tonic-gate 1542*7c478bd9Sstevel@tonic-gate /* 1543*7c478bd9Sstevel@tonic-gate * Defense for the SYN attack - 1544*7c478bd9Sstevel@tonic-gate * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest 1545*7c478bd9Sstevel@tonic-gate * one that doesn't have the dontdrop bit set. 1546*7c478bd9Sstevel@tonic-gate * 2. Don't drop a SYN request before its first timeout. This gives every 1547*7c478bd9Sstevel@tonic-gate * request at least til the first timeout to complete its 3-way handshake. 1548*7c478bd9Sstevel@tonic-gate * 3. The current threshold is - # of timeout > q0len/4 => SYN alert on 1549*7c478bd9Sstevel@tonic-gate * # of timeout drops back to <= q0len/32 => SYN alert off 1550*7c478bd9Sstevel@tonic-gate */ 1551*7c478bd9Sstevel@tonic-gate static boolean_t 1552*7c478bd9Sstevel@tonic-gate tcp_drop_q0(tcp_t *tcp) 1553*7c478bd9Sstevel@tonic-gate { 1554*7c478bd9Sstevel@tonic-gate tcp_t *eager; 1555*7c478bd9Sstevel@tonic-gate 1556*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 1557*7c478bd9Sstevel@tonic-gate /* 1558*7c478bd9Sstevel@tonic-gate * New one is added after next_q0 so prev_q0 points to the oldest 1559*7c478bd9Sstevel@tonic-gate * Also do not drop any established connections that are deferred on 1560*7c478bd9Sstevel@tonic-gate * q0 due to q being full 1561*7c478bd9Sstevel@tonic-gate */ 1562*7c478bd9Sstevel@tonic-gate 1563*7c478bd9Sstevel@tonic-gate eager = tcp->tcp_eager_prev_q0; 1564*7c478bd9Sstevel@tonic-gate while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) { 1565*7c478bd9Sstevel@tonic-gate /* XXX should move the eager to the head */ 1566*7c478bd9Sstevel@tonic-gate eager = eager->tcp_eager_prev_q0; 1567*7c478bd9Sstevel@tonic-gate if (eager == tcp) { 
1568*7c478bd9Sstevel@tonic-gate eager = tcp->tcp_eager_prev_q0; 1569*7c478bd9Sstevel@tonic-gate break; 1570*7c478bd9Sstevel@tonic-gate } 1571*7c478bd9Sstevel@tonic-gate } 1572*7c478bd9Sstevel@tonic-gate dprintf("tcp_drop_q0: listen half-open queue (max=%d) overflow" 1573*7c478bd9Sstevel@tonic-gate " (%d pending) on %s, drop one", tcp_conn_req_max_q0, 1574*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_cnt_q0, 1575*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1576*7c478bd9Sstevel@tonic-gate 1577*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpHalfOpenDrop); 1578*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1579*7c478bd9Sstevel@tonic-gate return (B_TRUE); 1580*7c478bd9Sstevel@tonic-gate } 1581*7c478bd9Sstevel@tonic-gate 1582*7c478bd9Sstevel@tonic-gate /* ARGSUSED */ 1583*7c478bd9Sstevel@tonic-gate static tcp_t * 1584*7c478bd9Sstevel@tonic-gate tcp_conn_request(tcp_t *tcp, mblk_t *mp, uint_t sock_id, uint_t ip_hdr_len) 1585*7c478bd9Sstevel@tonic-gate { 1586*7c478bd9Sstevel@tonic-gate tcp_t *eager; 1587*7c478bd9Sstevel@tonic-gate struct ip *ipha; 1588*7c478bd9Sstevel@tonic-gate int err; 1589*7c478bd9Sstevel@tonic-gate 1590*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 1591*7c478bd9Sstevel@tonic-gate printf("tcp_conn_request ###################\n"); 1592*7c478bd9Sstevel@tonic-gate #endif 1593*7c478bd9Sstevel@tonic-gate 1594*7c478bd9Sstevel@tonic-gate if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { 1595*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpListenDrop); 1596*7c478bd9Sstevel@tonic-gate dprintf("tcp_conn_request: listen backlog (max=%d) " 1597*7c478bd9Sstevel@tonic-gate "overflow (%d pending) on %s", 1598*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, 1599*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1600*7c478bd9Sstevel@tonic-gate return (NULL); 1601*7c478bd9Sstevel@tonic-gate } 1602*7c478bd9Sstevel@tonic-gate 1603*7c478bd9Sstevel@tonic-gate 
assert(OK_32PTR(mp->b_rptr)); 1604*7c478bd9Sstevel@tonic-gate 1605*7c478bd9Sstevel@tonic-gate if (tcp->tcp_conn_req_cnt_q0 >= 1606*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_max + tcp_conn_req_max_q0) { 1607*7c478bd9Sstevel@tonic-gate /* 1608*7c478bd9Sstevel@tonic-gate * Q0 is full. Drop a pending half-open req from the queue 1609*7c478bd9Sstevel@tonic-gate * to make room for the new SYN req. Also mark the time we 1610*7c478bd9Sstevel@tonic-gate * drop a SYN. 1611*7c478bd9Sstevel@tonic-gate */ 1612*7c478bd9Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 1613*7c478bd9Sstevel@tonic-gate if (!tcp_drop_q0(tcp)) { 1614*7c478bd9Sstevel@tonic-gate freemsg(mp); 1615*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpListenDropQ0); 1616*7c478bd9Sstevel@tonic-gate dprintf("tcp_conn_request: listen half-open queue " 1617*7c478bd9Sstevel@tonic-gate "(max=%d) full (%d pending) on %s", 1618*7c478bd9Sstevel@tonic-gate tcp_conn_req_max_q0, 1619*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_cnt_q0, 1620*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1621*7c478bd9Sstevel@tonic-gate return (NULL); 1622*7c478bd9Sstevel@tonic-gate } 1623*7c478bd9Sstevel@tonic-gate } 1624*7c478bd9Sstevel@tonic-gate 1625*7c478bd9Sstevel@tonic-gate ipha = (struct ip *)mp->b_rptr; 1626*7c478bd9Sstevel@tonic-gate if (IN_CLASSD(ntohl(ipha->ip_src.s_addr)) || 1627*7c478bd9Sstevel@tonic-gate ipha->ip_src.s_addr == INADDR_BROADCAST || 1628*7c478bd9Sstevel@tonic-gate ipha->ip_src.s_addr == INADDR_ANY || 1629*7c478bd9Sstevel@tonic-gate ipha->ip_dst.s_addr == INADDR_BROADCAST) { 1630*7c478bd9Sstevel@tonic-gate freemsg(mp); 1631*7c478bd9Sstevel@tonic-gate return (NULL); 1632*7c478bd9Sstevel@tonic-gate } 1633*7c478bd9Sstevel@tonic-gate /* 1634*7c478bd9Sstevel@tonic-gate * We allow the connection to proceed 1635*7c478bd9Sstevel@tonic-gate * by generating a detached tcp state vector and put it in 1636*7c478bd9Sstevel@tonic-gate * the eager queue. 
When an accept happens, it will be 1637*7c478bd9Sstevel@tonic-gate * dequeued sequentially. 1638*7c478bd9Sstevel@tonic-gate */ 1639*7c478bd9Sstevel@tonic-gate if ((eager = (tcp_t *)bkmem_alloc(sizeof (tcp_t))) == NULL) { 1640*7c478bd9Sstevel@tonic-gate freemsg(mp); 1641*7c478bd9Sstevel@tonic-gate errno = ENOBUFS; 1642*7c478bd9Sstevel@tonic-gate return (NULL); 1643*7c478bd9Sstevel@tonic-gate } 1644*7c478bd9Sstevel@tonic-gate if ((errno = tcp_init_values(eager, NULL)) != 0) { 1645*7c478bd9Sstevel@tonic-gate freemsg(mp); 1646*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1647*7c478bd9Sstevel@tonic-gate return (NULL); 1648*7c478bd9Sstevel@tonic-gate } 1649*7c478bd9Sstevel@tonic-gate 1650*7c478bd9Sstevel@tonic-gate /* 1651*7c478bd9Sstevel@tonic-gate * Eager connection inherits address form from its listener, 1652*7c478bd9Sstevel@tonic-gate * but its packet form comes from the version of the received 1653*7c478bd9Sstevel@tonic-gate * SYN segment. 1654*7c478bd9Sstevel@tonic-gate */ 1655*7c478bd9Sstevel@tonic-gate eager->tcp_family = tcp->tcp_family; 1656*7c478bd9Sstevel@tonic-gate 1657*7c478bd9Sstevel@tonic-gate err = tcp_accept_comm(tcp, eager, mp, ip_hdr_len); 1658*7c478bd9Sstevel@tonic-gate if (err) { 1659*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1660*7c478bd9Sstevel@tonic-gate return (NULL); 1661*7c478bd9Sstevel@tonic-gate } 1662*7c478bd9Sstevel@tonic-gate 1663*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1664*7c478bd9Sstevel@tonic-gate eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1665*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0 = eager; 1666*7c478bd9Sstevel@tonic-gate eager->tcp_eager_prev_q0 = tcp; 1667*7c478bd9Sstevel@tonic-gate 1668*7c478bd9Sstevel@tonic-gate /* Set tcp_listener before adding it to tcp_conn_fanout */ 1669*7c478bd9Sstevel@tonic-gate eager->tcp_listener = tcp; 1670*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_cnt_q0++; 1671*7c478bd9Sstevel@tonic-gate 
	return (eager);
}

/*
 * To get around the non-interrupt problem of inetboot.
 * Keep on processing packets until a certain state is reached or the
 * TCP is destroyed because of getting a RST packet.
 *
 * Returns 0 when the desired state is reached (or the peer closed with
 * no recorded error), -1 on lower-level input failure or a recorded
 * socket error.
 */
static int
tcp_state_wait(int sock_id, tcp_t *tcp, int state)
{
	int			i;
	struct inetgram		*in_gram;
	mblk_t			*mp;
	int			timeout;
	boolean_t		changed = B_FALSE;

	/*
	 * We need to make sure that the MAC does not wait longer
	 * than RTO for any packet so that TCP can do retransmission.
	 * But if the MAC timeout is less than tcp_rto, we are fine
	 * and do not need to change it.
	 */
	timeout = sockets[sock_id].in_timeout;
	if (timeout > tcp->tcp_rto) {
		sockets[sock_id].in_timeout = tcp->tcp_rto;
		changed = B_TRUE;
	}
retry:
	if (sockets[sock_id].inq == NULL) {
		/* Go out and check the wire */
		for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
			if (sockets[sock_id].input[i] != NULL) {
				if (sockets[sock_id].input[i](sock_id) < 0) {
					/* Restore caller's MAC timeout. */
					if (changed) {
						sockets[sock_id].in_timeout =
						    timeout;
					}
					return (-1);
				}
			}
		}
	}

	/* Consume queued inetgrams, feeding TRANSPORT frames to TCP. */
	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Desired state reached; stop consuming input. */
		if (tcp != NULL && tcp->tcp_state == state)
			break;

		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#ifdef DEBUG
			printf("tcp_state_wait for state %d: unexpected "
			    "packet level %d frame found\n", state,
			    in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		/* Unlink the gram but keep its mblk; free only the wrapper. */
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);

		/*
		 * The other side may have closed this connection or
		 * RST us. But we need to continue to process other
		 * packets in the socket's queue because they may be
		 * belong to another TCP connections.
		 */
		if (sockets[sock_id].pcb == NULL) {
			tcp = NULL;
		}
	}

	/* If the other side has closed the connection, just return. */
	if (tcp == NULL || sockets[sock_id].pcb == NULL) {
#ifdef DEBUG
		printf("tcp_state_wait other side dead: state %d "
		    "error %d\n", state, sockets[sock_id].so_error);
#endif
		if (sockets[sock_id].so_error != 0)
			return (-1);
		else
			return (0);
	}
	/*
	 * TCPS_ALL_ACKED is not a valid TCP state, it is just used as an
	 * indicator to tcp_state_wait to mean that it is being called
	 * to wait till we have received acks for all the new segments sent.
	 */
	if ((state == TCPS_ALL_ACKED) && (tcp->tcp_suna == tcp->tcp_snxt)) {
		goto done;
	}
	if (tcp->tcp_state != state) {
		/* RTO deadline passed; run the TCP timer, then poll again. */
		if (prom_gettime() > tcp->tcp_rto_timeout)
			tcp_timer(tcp, sock_id);
		goto retry;
	}
done:
	if (changed)
		sockets[sock_id].in_timeout = timeout;

	tcp_drain_needed(sock_id, tcp);
	return (0);
}

/* Verify the checksum of a segment.
*/ 1779*7c478bd9Sstevel@tonic-gate static int 1780*7c478bd9Sstevel@tonic-gate tcp_verify_cksum(mblk_t *mp) 1781*7c478bd9Sstevel@tonic-gate { 1782*7c478bd9Sstevel@tonic-gate struct ip *iph; 1783*7c478bd9Sstevel@tonic-gate tcpha_t *tcph; 1784*7c478bd9Sstevel@tonic-gate int len; 1785*7c478bd9Sstevel@tonic-gate uint16_t old_sum; 1786*7c478bd9Sstevel@tonic-gate 1787*7c478bd9Sstevel@tonic-gate iph = (struct ip *)mp->b_rptr; 1788*7c478bd9Sstevel@tonic-gate tcph = (tcpha_t *)(iph + 1); 1789*7c478bd9Sstevel@tonic-gate len = ntohs(iph->ip_len); 1790*7c478bd9Sstevel@tonic-gate 1791*7c478bd9Sstevel@tonic-gate /* 1792*7c478bd9Sstevel@tonic-gate * Calculate the TCP checksum. Need to include the psuedo header, 1793*7c478bd9Sstevel@tonic-gate * which is similar to the real IP header starting at the TTL field. 1794*7c478bd9Sstevel@tonic-gate */ 1795*7c478bd9Sstevel@tonic-gate iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 1796*7c478bd9Sstevel@tonic-gate old_sum = tcph->tha_sum; 1797*7c478bd9Sstevel@tonic-gate tcph->tha_sum = 0; 1798*7c478bd9Sstevel@tonic-gate iph->ip_ttl = 0; 1799*7c478bd9Sstevel@tonic-gate if (old_sum == tcp_cksum((uint16_t *)&(iph->ip_ttl), 1800*7c478bd9Sstevel@tonic-gate len - IP_SIMPLE_HDR_LENGTH + 12)) { 1801*7c478bd9Sstevel@tonic-gate return (0); 1802*7c478bd9Sstevel@tonic-gate } else { 1803*7c478bd9Sstevel@tonic-gate tcp_cksum_errors++; 1804*7c478bd9Sstevel@tonic-gate return (-1); 1805*7c478bd9Sstevel@tonic-gate } 1806*7c478bd9Sstevel@tonic-gate } 1807*7c478bd9Sstevel@tonic-gate 1808*7c478bd9Sstevel@tonic-gate /* To find a TCP connection matching the incoming segment. 
*/ 1809*7c478bd9Sstevel@tonic-gate static tcp_t * 1810*7c478bd9Sstevel@tonic-gate tcp_lookup_ipv4(struct ip *iph, tcpha_t *tcph, int min_state, int *sock_id) 1811*7c478bd9Sstevel@tonic-gate { 1812*7c478bd9Sstevel@tonic-gate int i; 1813*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 1814*7c478bd9Sstevel@tonic-gate 1815*7c478bd9Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1816*7c478bd9Sstevel@tonic-gate if (sockets[i].type == INETBOOT_STREAM && 1817*7c478bd9Sstevel@tonic-gate (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1818*7c478bd9Sstevel@tonic-gate if (tcph->tha_lport == tcp->tcp_fport && 1819*7c478bd9Sstevel@tonic-gate tcph->tha_fport == tcp->tcp_lport && 1820*7c478bd9Sstevel@tonic-gate iph->ip_src.s_addr == tcp->tcp_remote && 1821*7c478bd9Sstevel@tonic-gate iph->ip_dst.s_addr == tcp->tcp_bound_source && 1822*7c478bd9Sstevel@tonic-gate tcp->tcp_state >= min_state) { 1823*7c478bd9Sstevel@tonic-gate *sock_id = i; 1824*7c478bd9Sstevel@tonic-gate return (tcp); 1825*7c478bd9Sstevel@tonic-gate } 1826*7c478bd9Sstevel@tonic-gate } 1827*7c478bd9Sstevel@tonic-gate } 1828*7c478bd9Sstevel@tonic-gate /* Find it in the time wait list. 
*/ 1829*7c478bd9Sstevel@tonic-gate for (tcp = tcp_time_wait_head; tcp != NULL; 1830*7c478bd9Sstevel@tonic-gate tcp = tcp->tcp_time_wait_next) { 1831*7c478bd9Sstevel@tonic-gate if (tcph->tha_lport == tcp->tcp_fport && 1832*7c478bd9Sstevel@tonic-gate tcph->tha_fport == tcp->tcp_lport && 1833*7c478bd9Sstevel@tonic-gate iph->ip_src.s_addr == tcp->tcp_remote && 1834*7c478bd9Sstevel@tonic-gate iph->ip_dst.s_addr == tcp->tcp_bound_source && 1835*7c478bd9Sstevel@tonic-gate tcp->tcp_state >= min_state) { 1836*7c478bd9Sstevel@tonic-gate *sock_id = -1; 1837*7c478bd9Sstevel@tonic-gate return (tcp); 1838*7c478bd9Sstevel@tonic-gate } 1839*7c478bd9Sstevel@tonic-gate } 1840*7c478bd9Sstevel@tonic-gate return (NULL); 1841*7c478bd9Sstevel@tonic-gate } 1842*7c478bd9Sstevel@tonic-gate 1843*7c478bd9Sstevel@tonic-gate /* To find a TCP listening connection matching the incoming segment. */ 1844*7c478bd9Sstevel@tonic-gate static tcp_t * 1845*7c478bd9Sstevel@tonic-gate tcp_lookup_listener_ipv4(in_addr_t addr, in_port_t port, int *sock_id) 1846*7c478bd9Sstevel@tonic-gate { 1847*7c478bd9Sstevel@tonic-gate int i; 1848*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 1849*7c478bd9Sstevel@tonic-gate 1850*7c478bd9Sstevel@tonic-gate for (i = 0; i < MAXSOCKET; i++) { 1851*7c478bd9Sstevel@tonic-gate if (sockets[i].type == INETBOOT_STREAM && 1852*7c478bd9Sstevel@tonic-gate (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1853*7c478bd9Sstevel@tonic-gate if (tcp->tcp_lport == port && 1854*7c478bd9Sstevel@tonic-gate (tcp->tcp_bound_source == addr || 1855*7c478bd9Sstevel@tonic-gate tcp->tcp_bound_source == INADDR_ANY)) { 1856*7c478bd9Sstevel@tonic-gate *sock_id = i; 1857*7c478bd9Sstevel@tonic-gate return (tcp); 1858*7c478bd9Sstevel@tonic-gate } 1859*7c478bd9Sstevel@tonic-gate } 1860*7c478bd9Sstevel@tonic-gate } 1861*7c478bd9Sstevel@tonic-gate 1862*7c478bd9Sstevel@tonic-gate return (NULL); 1863*7c478bd9Sstevel@tonic-gate } 1864*7c478bd9Sstevel@tonic-gate 1865*7c478bd9Sstevel@tonic-gate /* To find a TCP eager matching the 
incoming segment. */ 1866*7c478bd9Sstevel@tonic-gate static tcp_t * 1867*7c478bd9Sstevel@tonic-gate tcp_lookup_eager_ipv4(tcp_t *listener, struct ip *iph, tcpha_t *tcph) 1868*7c478bd9Sstevel@tonic-gate { 1869*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 1870*7c478bd9Sstevel@tonic-gate 1871*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 1872*7c478bd9Sstevel@tonic-gate printf("tcp_lookup_eager_ipv4 ###############\n"); 1873*7c478bd9Sstevel@tonic-gate #endif 1874*7c478bd9Sstevel@tonic-gate for (tcp = listener->tcp_eager_next_q; tcp != NULL; 1875*7c478bd9Sstevel@tonic-gate tcp = tcp->tcp_eager_next_q) { 1876*7c478bd9Sstevel@tonic-gate if (tcph->tha_lport == tcp->tcp_fport && 1877*7c478bd9Sstevel@tonic-gate tcph->tha_fport == tcp->tcp_lport && 1878*7c478bd9Sstevel@tonic-gate iph->ip_src.s_addr == tcp->tcp_remote && 1879*7c478bd9Sstevel@tonic-gate iph->ip_dst.s_addr == tcp->tcp_bound_source) { 1880*7c478bd9Sstevel@tonic-gate return (tcp); 1881*7c478bd9Sstevel@tonic-gate } 1882*7c478bd9Sstevel@tonic-gate } 1883*7c478bd9Sstevel@tonic-gate 1884*7c478bd9Sstevel@tonic-gate for (tcp = listener->tcp_eager_next_q0; tcp != listener; 1885*7c478bd9Sstevel@tonic-gate tcp = tcp->tcp_eager_next_q0) { 1886*7c478bd9Sstevel@tonic-gate if (tcph->tha_lport == tcp->tcp_fport && 1887*7c478bd9Sstevel@tonic-gate tcph->tha_fport == tcp->tcp_lport && 1888*7c478bd9Sstevel@tonic-gate iph->ip_src.s_addr == tcp->tcp_remote && 1889*7c478bd9Sstevel@tonic-gate iph->ip_dst.s_addr == tcp->tcp_bound_source) { 1890*7c478bd9Sstevel@tonic-gate return (tcp); 1891*7c478bd9Sstevel@tonic-gate } 1892*7c478bd9Sstevel@tonic-gate } 1893*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 1894*7c478bd9Sstevel@tonic-gate printf("No eager found\n"); 1895*7c478bd9Sstevel@tonic-gate #endif 1896*7c478bd9Sstevel@tonic-gate return (NULL); 1897*7c478bd9Sstevel@tonic-gate } 1898*7c478bd9Sstevel@tonic-gate 1899*7c478bd9Sstevel@tonic-gate /* To destroy a TCP control block. 
 */
static void
tcp_clean_death(int sock_id, tcp_t *tcp, int err)
{
	/* Release the connection's internal resources. */
	tcp_free(tcp);
	/*
	 * NOTE(review): tcp_free() is assumed to leave the tcp_t itself
	 * (including tcp_state and the time-wait links) intact, since both
	 * are read below and the struct is only released at the end —
	 * confirm against tcp_free()'s definition.
	 */
	if (tcp->tcp_state == TCPS_TIME_WAIT)
		tcp_time_wait_remove(tcp);

	/* Detach the PCB from its socket and record the error, if any. */
	if (sock_id >= 0) {
		sockets[sock_id].pcb = NULL;
		if (err != 0)
			sockets[sock_id].so_error = err;
	}
	bkmem_free((caddr_t)tcp, sizeof (tcp_t));
}

/*
 * tcp_rwnd_set() is called to adjust the receive window to a desired value.
 * We do not allow the receive window to shrink. After setting rwnd,
 * set the flow control hiwat of the stream.
 *
 * This function is called in 2 cases:
 *
 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a
 *    connection (passive open) and in tcp_rput_data() for active connect.
 *    This is called after tcp_mss_set() when the desired MSS value is known.
 *    This makes sure that our window size is a mutiple of the other side's
 *    MSS.
 * 2) Handling SO_RCVBUF option.
 *
 * It is ASSUMED that the requested size is a multiple of the current MSS.
 *
 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
 * user requests so.
 */
static int
tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
{
	uint32_t	mss = tcp->tcp_mss;
	uint32_t	old_max_rwnd;
	uint32_t	max_transmittable_rwnd;

	/* Baseline for the no-shrink rule: previous max, or current rwnd. */
	if (tcp->tcp_rwnd_max != 0)
		old_max_rwnd = tcp->tcp_rwnd_max;
	else
		old_max_rwnd = tcp->tcp_rwnd;

	/*
	 * Insist on a receive window that is at least
	 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
	 * funny TCP interactions of Nagle algorithm, SWS avoidance
	 * and delayed acknowledgement.
	 */
	rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss);

	/*
	 * If window size info has already been exchanged, TCP should not
	 * shrink the window. Shrinking window is doable if done carefully.
	 * We may add that support later. But so far there is not a real
	 * need to do that.
	 */
	if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) {
		/* MSS may have changed, do a round up again. */
		rwnd = MSS_ROUNDUP(old_max_rwnd, mss);
	}

	/*
	 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
	 * can be applied even before the window scale option is decided.
	 */
	max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws;
	if (rwnd > max_transmittable_rwnd) {
		/* Round down to a multiple of mss, if possible. */
		rwnd = max_transmittable_rwnd -
		    (max_transmittable_rwnd % mss);
		if (rwnd < mss)
			rwnd = max_transmittable_rwnd;
		/*
		 * If we're over the limit we may have to back down tcp_rwnd.
		 * The increment below won't work for us. So we set all three
		 * here and the increment below will have no effect.
		 */
		tcp->tcp_rwnd = old_max_rwnd = rwnd;
	}

	/*
	 * Increment the current rwnd by the amount the maximum grew (we
	 * can not overwrite it since we might be in the middle of a
	 * connection.)
	 */
	tcp->tcp_rwnd += rwnd - old_max_rwnd;
	/* Advertise the (scaled-down) window in the template header. */
	U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win);
	if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max)
		tcp->tcp_cwnd_max = rwnd;
	tcp->tcp_rwnd_max = rwnd;

	return (rwnd);
}

/*
 * Extract option values from a tcp header. We put any found values into the
 * tcpopt struct and return a bitmask saying which options were found.
 */
static int
tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
{
	uchar_t		*endp;
	int		len;		/* bytes remaining from up to endp */
	uint32_t	mss;
	uchar_t		*up = (uchar_t *)tcph;
	int		found = 0;	/* TCP_OPT_*_PRESENT bitmask */
	int32_t		sack_len;
	tcp_seq		sack_begin, sack_end;
	tcp_t		*tcp;

	endp = up + TCP_HDR_LENGTH(tcph);
	up += TCP_MIN_HEADER_LENGTH;
	/*
	 * Walk the option bytes.  Each well-formed case advances `up' and
	 * `continue's; a malformed option `break's out of the switch and
	 * then out of the while loop, abandoning the rest of the options.
	 */
	while (up < endp) {
		len = endp - up;
		switch (*up) {
		case TCPOPT_EOL:
			break;

		case TCPOPT_NOP:
			up++;
			continue;

		case TCPOPT_MAXSEG:
			if (len < TCPOPT_MAXSEG_LEN ||
			    up[1] != TCPOPT_MAXSEG_LEN)
				break;

			mss = BE16_TO_U16(up+2);
			/* Caller must handle tcp_mss_min and tcp_mss_max_* */
			tcpopt->tcp_opt_mss = mss;
			found |= TCP_OPT_MSS_PRESENT;

			up += TCPOPT_MAXSEG_LEN;
			continue;

		case TCPOPT_WSCALE:
			if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
				break;

			/* Clamp the shift to the maximum RFC 1323 allows. */
			if (up[2] > TCP_MAX_WINSHIFT)
				tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
			else
				tcpopt->tcp_opt_wscale = up[2];
			found |= TCP_OPT_WSCALE_PRESENT;

			up += TCPOPT_WS_LEN;
			continue;

		case TCPOPT_SACK_PERMITTED:
			if (len < TCPOPT_SACK_OK_LEN ||
			    up[1] != TCPOPT_SACK_OK_LEN)
				break;
			found |= TCP_OPT_SACK_OK_PRESENT;
			up += TCPOPT_SACK_OK_LEN;
			continue;

		case TCPOPT_SACK:
			if (len <= 2 || up[1] <= 2 || len < up[1])
				break;

			/* If TCP is not interested in SACK blks... */
			if ((tcp = tcpopt->tcp) == NULL) {
				up += up[1];
				continue;
			}
			sack_len = up[1] - TCPOPT_HEADER_LEN;
			up += TCPOPT_HEADER_LEN;

			/*
			 * If the list is empty, allocate one and assume
			 * nothing is sack'ed.
			 */
			assert(tcp->tcp_sack_info != NULL);
			if (tcp->tcp_notsack_list == NULL) {
				tcp_notsack_update(&(tcp->tcp_notsack_list),
				    tcp->tcp_suna, tcp->tcp_snxt,
				    &(tcp->tcp_num_notsack_blk),
				    &(tcp->tcp_cnt_notsack_list));

				/*
				 * Make sure tcp_notsack_list is not NULL.
				 * This happens when kmem_alloc(KM_NOSLEEP)
				 * returns NULL.
				 */
				if (tcp->tcp_notsack_list == NULL) {
					up += sack_len;
					continue;
				}
				tcp->tcp_fack = tcp->tcp_suna;
			}

			/* Consume 8-byte (begin, end) SACK blocks. */
			while (sack_len > 0) {
				if (up + 8 > endp) {
					up = endp;
					break;
				}
				sack_begin = BE32_TO_U32(up);
				up += 4;
				sack_end = BE32_TO_U32(up);
				up += 4;
				sack_len -= 8;
				/*
				 * Bounds checking. Make sure the SACK
				 * info is within tcp_suna and tcp_snxt.
				 * If this SACK blk is out of bound, ignore
				 * it but continue to parse the following
				 * blks.
				 */
				if (SEQ_LEQ(sack_end, sack_begin) ||
				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
					continue;
				}
				tcp_notsack_insert(&(tcp->tcp_notsack_list),
				    sack_begin, sack_end,
				    &(tcp->tcp_num_notsack_blk),
				    &(tcp->tcp_cnt_notsack_list));
				/* Track the highest sequence sack'ed. */
				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
					tcp->tcp_fack = sack_end;
				}
			}
			found |= TCP_OPT_SACK_PRESENT;
			continue;

		case TCPOPT_TSTAMP:
			if (len < TCPOPT_TSTAMP_LEN ||
			    up[1] != TCPOPT_TSTAMP_LEN)
				break;

			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);

			found |= TCP_OPT_TSTAMP_PRESENT;

			up += TCPOPT_TSTAMP_LEN;
			continue;

		default:
			/* Skip unknown options by their declared length. */
			if (len <= 1 || len < (int)up[1] || up[1] == 0)
				break;
			up += up[1];
			continue;
		}
		break;
	}
	return (found);
}
2150*7c478bd9Sstevel@tonic-gate 2151*7c478bd9Sstevel@tonic-gate /* 2152*7c478bd9Sstevel@tonic-gate * Set the mss associated with a particular tcp based on its current value, 2153*7c478bd9Sstevel@tonic-gate * and a new one passed in. Observe minimums and maximums, and reset 2154*7c478bd9Sstevel@tonic-gate * other state variables that we want to view as multiples of mss. 2155*7c478bd9Sstevel@tonic-gate * 2156*7c478bd9Sstevel@tonic-gate * This function is called in various places mainly because 2157*7c478bd9Sstevel@tonic-gate * 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the 2158*7c478bd9Sstevel@tonic-gate * other side's SYN/SYN-ACK packet arrives. 2159*7c478bd9Sstevel@tonic-gate * 2) PMTUd may get us a new MSS. 2160*7c478bd9Sstevel@tonic-gate * 3) If the other side stops sending us timestamp option, we need to 2161*7c478bd9Sstevel@tonic-gate * increase the MSS size to use the extra bytes available. 2162*7c478bd9Sstevel@tonic-gate */ 2163*7c478bd9Sstevel@tonic-gate static void 2164*7c478bd9Sstevel@tonic-gate tcp_mss_set(tcp_t *tcp, uint32_t mss) 2165*7c478bd9Sstevel@tonic-gate { 2166*7c478bd9Sstevel@tonic-gate uint32_t mss_max; 2167*7c478bd9Sstevel@tonic-gate 2168*7c478bd9Sstevel@tonic-gate mss_max = tcp_mss_max_ipv4; 2169*7c478bd9Sstevel@tonic-gate 2170*7c478bd9Sstevel@tonic-gate if (mss < tcp_mss_min) 2171*7c478bd9Sstevel@tonic-gate mss = tcp_mss_min; 2172*7c478bd9Sstevel@tonic-gate if (mss > mss_max) 2173*7c478bd9Sstevel@tonic-gate mss = mss_max; 2174*7c478bd9Sstevel@tonic-gate /* 2175*7c478bd9Sstevel@tonic-gate * Unless naglim has been set by our client to 2176*7c478bd9Sstevel@tonic-gate * a non-mss value, force naglim to track mss. 2177*7c478bd9Sstevel@tonic-gate * This can help to aggregate small writes. 
2178*7c478bd9Sstevel@tonic-gate */ 2179*7c478bd9Sstevel@tonic-gate if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 2180*7c478bd9Sstevel@tonic-gate tcp->tcp_naglim = mss; 2181*7c478bd9Sstevel@tonic-gate /* 2182*7c478bd9Sstevel@tonic-gate * TCP should be able to buffer at least 4 MSS data for obvious 2183*7c478bd9Sstevel@tonic-gate * performance reason. 2184*7c478bd9Sstevel@tonic-gate */ 2185*7c478bd9Sstevel@tonic-gate if ((mss << 2) > tcp->tcp_xmit_hiwater) 2186*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_hiwater = mss << 2; 2187*7c478bd9Sstevel@tonic-gate tcp->tcp_mss = mss; 2188*7c478bd9Sstevel@tonic-gate /* 2189*7c478bd9Sstevel@tonic-gate * Initialize cwnd according to draft-floyd-incr-init-win-01.txt. 2190*7c478bd9Sstevel@tonic-gate * Previously, we use tcp_slow_start_initial to control the size 2191*7c478bd9Sstevel@tonic-gate * of the initial cwnd. Now, when tcp_slow_start_initial * mss 2192*7c478bd9Sstevel@tonic-gate * is smaller than the cwnd calculated from the formula suggested in 2193*7c478bd9Sstevel@tonic-gate * the draft, we use tcp_slow_start_initial * mss as the cwnd. 2194*7c478bd9Sstevel@tonic-gate * Otherwise, use the cwnd from the draft's formula. The default 2195*7c478bd9Sstevel@tonic-gate * of tcp_slow_start_initial is 2. 2196*7c478bd9Sstevel@tonic-gate */ 2197*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss, 2198*7c478bd9Sstevel@tonic-gate MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 2199*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_cnt = 0; 2200*7c478bd9Sstevel@tonic-gate } 2201*7c478bd9Sstevel@tonic-gate 2202*7c478bd9Sstevel@tonic-gate /* 2203*7c478bd9Sstevel@tonic-gate * Process all TCP option in SYN segment. 2204*7c478bd9Sstevel@tonic-gate * 2205*7c478bd9Sstevel@tonic-gate * This function sets up the correct tcp_mss value according to the 2206*7c478bd9Sstevel@tonic-gate * MSS option value and our header size. 
It also sets up the window scale 2207*7c478bd9Sstevel@tonic-gate * and timestamp values, and initialize SACK info blocks. But it does not 2208*7c478bd9Sstevel@tonic-gate * change receive window size after setting the tcp_mss value. The caller 2209*7c478bd9Sstevel@tonic-gate * should do the appropriate change. 2210*7c478bd9Sstevel@tonic-gate */ 2211*7c478bd9Sstevel@tonic-gate void 2212*7c478bd9Sstevel@tonic-gate tcp_process_options(tcp_t *tcp, tcph_t *tcph) 2213*7c478bd9Sstevel@tonic-gate { 2214*7c478bd9Sstevel@tonic-gate int options; 2215*7c478bd9Sstevel@tonic-gate tcp_opt_t tcpopt; 2216*7c478bd9Sstevel@tonic-gate uint32_t mss_max; 2217*7c478bd9Sstevel@tonic-gate char *tmp_tcph; 2218*7c478bd9Sstevel@tonic-gate 2219*7c478bd9Sstevel@tonic-gate tcpopt.tcp = NULL; 2220*7c478bd9Sstevel@tonic-gate options = tcp_parse_options(tcph, &tcpopt); 2221*7c478bd9Sstevel@tonic-gate 2222*7c478bd9Sstevel@tonic-gate /* 2223*7c478bd9Sstevel@tonic-gate * Process MSS option. Note that MSS option value does not account 2224*7c478bd9Sstevel@tonic-gate * for IP or TCP options. This means that it is equal to MTU - minimum 2225*7c478bd9Sstevel@tonic-gate * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 2226*7c478bd9Sstevel@tonic-gate * IPv6. 
2227*7c478bd9Sstevel@tonic-gate */ 2228*7c478bd9Sstevel@tonic-gate if (!(options & TCP_OPT_MSS_PRESENT)) { 2229*7c478bd9Sstevel@tonic-gate tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; 2230*7c478bd9Sstevel@tonic-gate } else { 2231*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) 2232*7c478bd9Sstevel@tonic-gate mss_max = tcp_mss_max_ipv4; 2233*7c478bd9Sstevel@tonic-gate if (tcpopt.tcp_opt_mss < tcp_mss_min) 2234*7c478bd9Sstevel@tonic-gate tcpopt.tcp_opt_mss = tcp_mss_min; 2235*7c478bd9Sstevel@tonic-gate else if (tcpopt.tcp_opt_mss > mss_max) 2236*7c478bd9Sstevel@tonic-gate tcpopt.tcp_opt_mss = mss_max; 2237*7c478bd9Sstevel@tonic-gate } 2238*7c478bd9Sstevel@tonic-gate 2239*7c478bd9Sstevel@tonic-gate /* Process Window Scale option. */ 2240*7c478bd9Sstevel@tonic-gate if (options & TCP_OPT_WSCALE_PRESENT) { 2241*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 2242*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws_ok = B_TRUE; 2243*7c478bd9Sstevel@tonic-gate } else { 2244*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws = B_FALSE; 2245*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws_ok = B_FALSE; 2246*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_ws = B_FALSE; 2247*7c478bd9Sstevel@tonic-gate } 2248*7c478bd9Sstevel@tonic-gate 2249*7c478bd9Sstevel@tonic-gate /* Process Timestamp option. 
*/ 2250*7c478bd9Sstevel@tonic-gate if ((options & TCP_OPT_TSTAMP_PRESENT) && 2251*7c478bd9Sstevel@tonic-gate (tcp->tcp_snd_ts_ok || !tcp->tcp_active_open)) { 2252*7c478bd9Sstevel@tonic-gate tmp_tcph = (char *)tcp->tcp_tcph; 2253*7c478bd9Sstevel@tonic-gate 2254*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ts_ok = B_TRUE; 2255*7c478bd9Sstevel@tonic-gate tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 2256*7c478bd9Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 2257*7c478bd9Sstevel@tonic-gate assert(OK_32PTR(tmp_tcph)); 2258*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 2259*7c478bd9Sstevel@tonic-gate 2260*7c478bd9Sstevel@tonic-gate /* Fill in our template header with basic timestamp option. */ 2261*7c478bd9Sstevel@tonic-gate tmp_tcph += tcp->tcp_tcp_hdr_len; 2262*7c478bd9Sstevel@tonic-gate tmp_tcph[0] = TCPOPT_NOP; 2263*7c478bd9Sstevel@tonic-gate tmp_tcph[1] = TCPOPT_NOP; 2264*7c478bd9Sstevel@tonic-gate tmp_tcph[2] = TCPOPT_TSTAMP; 2265*7c478bd9Sstevel@tonic-gate tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 2266*7c478bd9Sstevel@tonic-gate tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; 2267*7c478bd9Sstevel@tonic-gate tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; 2268*7c478bd9Sstevel@tonic-gate tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); 2269*7c478bd9Sstevel@tonic-gate } else { 2270*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ts_ok = B_FALSE; 2271*7c478bd9Sstevel@tonic-gate } 2272*7c478bd9Sstevel@tonic-gate 2273*7c478bd9Sstevel@tonic-gate /* 2274*7c478bd9Sstevel@tonic-gate * Process SACK options. If SACK is enabled for this connection, 2275*7c478bd9Sstevel@tonic-gate * then allocate the SACK info structure. 2276*7c478bd9Sstevel@tonic-gate */ 2277*7c478bd9Sstevel@tonic-gate if ((options & TCP_OPT_SACK_OK_PRESENT) && 2278*7c478bd9Sstevel@tonic-gate (tcp->tcp_snd_sack_ok || 2279*7c478bd9Sstevel@tonic-gate (tcp_sack_permitted != 0 && !tcp->tcp_active_open))) { 2280*7c478bd9Sstevel@tonic-gate /* This should be true only in the passive case. 
*/ 2281*7c478bd9Sstevel@tonic-gate if (tcp->tcp_sack_info == NULL) { 2282*7c478bd9Sstevel@tonic-gate tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 2283*7c478bd9Sstevel@tonic-gate sizeof (tcp_sack_info_t)); 2284*7c478bd9Sstevel@tonic-gate } 2285*7c478bd9Sstevel@tonic-gate if (tcp->tcp_sack_info == NULL) { 2286*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_FALSE; 2287*7c478bd9Sstevel@tonic-gate } else { 2288*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_TRUE; 2289*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 2290*7c478bd9Sstevel@tonic-gate tcp->tcp_max_sack_blk = 3; 2291*7c478bd9Sstevel@tonic-gate } else { 2292*7c478bd9Sstevel@tonic-gate tcp->tcp_max_sack_blk = 4; 2293*7c478bd9Sstevel@tonic-gate } 2294*7c478bd9Sstevel@tonic-gate } 2295*7c478bd9Sstevel@tonic-gate } else { 2296*7c478bd9Sstevel@tonic-gate /* 2297*7c478bd9Sstevel@tonic-gate * Resetting tcp_snd_sack_ok to B_FALSE so that 2298*7c478bd9Sstevel@tonic-gate * no SACK info will be used for this 2299*7c478bd9Sstevel@tonic-gate * connection. This assumes that SACK usage 2300*7c478bd9Sstevel@tonic-gate * permission is negotiated. This may need 2301*7c478bd9Sstevel@tonic-gate * to be changed once this is clarified. 2302*7c478bd9Sstevel@tonic-gate */ 2303*7c478bd9Sstevel@tonic-gate if (tcp->tcp_sack_info != NULL) { 2304*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)tcp->tcp_sack_info, 2305*7c478bd9Sstevel@tonic-gate sizeof (tcp_sack_info_t)); 2306*7c478bd9Sstevel@tonic-gate tcp->tcp_sack_info = NULL; 2307*7c478bd9Sstevel@tonic-gate } 2308*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_FALSE; 2309*7c478bd9Sstevel@tonic-gate } 2310*7c478bd9Sstevel@tonic-gate 2311*7c478bd9Sstevel@tonic-gate /* 2312*7c478bd9Sstevel@tonic-gate * Now we know the exact TCP/IP header length, subtract 2313*7c478bd9Sstevel@tonic-gate * that from tcp_mss to get our side's MSS. 
2314*7c478bd9Sstevel@tonic-gate */ 2315*7c478bd9Sstevel@tonic-gate tcp->tcp_mss -= tcp->tcp_hdr_len; 2316*7c478bd9Sstevel@tonic-gate /* 2317*7c478bd9Sstevel@tonic-gate * Here we assume that the other side's header size will be equal to 2318*7c478bd9Sstevel@tonic-gate * our header size. We calculate the real MSS accordingly. Need to 2319*7c478bd9Sstevel@tonic-gate * take into additional stuffs IPsec puts in. 2320*7c478bd9Sstevel@tonic-gate * 2321*7c478bd9Sstevel@tonic-gate * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 2322*7c478bd9Sstevel@tonic-gate */ 2323*7c478bd9Sstevel@tonic-gate tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len - 2324*7c478bd9Sstevel@tonic-gate (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH); 2325*7c478bd9Sstevel@tonic-gate 2326*7c478bd9Sstevel@tonic-gate /* 2327*7c478bd9Sstevel@tonic-gate * Set MSS to the smaller one of both ends of the connection. 2328*7c478bd9Sstevel@tonic-gate * We should not have called tcp_mss_set() before, but our 2329*7c478bd9Sstevel@tonic-gate * side of the MSS should have been set to a proper value 2330*7c478bd9Sstevel@tonic-gate * by tcp_adapt_ire(). tcp_mss_set() will also set up the 2331*7c478bd9Sstevel@tonic-gate * STREAM head parameters properly. 2332*7c478bd9Sstevel@tonic-gate * 2333*7c478bd9Sstevel@tonic-gate * If we have a larger-than-16-bit window but the other side 2334*7c478bd9Sstevel@tonic-gate * didn't want to do window scale, tcp_rwnd_set() will take 2335*7c478bd9Sstevel@tonic-gate * care of that. 2336*7c478bd9Sstevel@tonic-gate */ 2337*7c478bd9Sstevel@tonic-gate tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 2338*7c478bd9Sstevel@tonic-gate } 2339*7c478bd9Sstevel@tonic-gate 2340*7c478bd9Sstevel@tonic-gate /* 2341*7c478bd9Sstevel@tonic-gate * This function does PAWS protection check. Returns B_TRUE if the 2342*7c478bd9Sstevel@tonic-gate * segment passes the PAWS test, else returns B_FALSE. 
2343*7c478bd9Sstevel@tonic-gate */ 2344*7c478bd9Sstevel@tonic-gate boolean_t 2345*7c478bd9Sstevel@tonic-gate tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) 2346*7c478bd9Sstevel@tonic-gate { 2347*7c478bd9Sstevel@tonic-gate uint8_t flags; 2348*7c478bd9Sstevel@tonic-gate int options; 2349*7c478bd9Sstevel@tonic-gate uint8_t *up; 2350*7c478bd9Sstevel@tonic-gate 2351*7c478bd9Sstevel@tonic-gate flags = (unsigned int)tcph->th_flags[0] & 0xFF; 2352*7c478bd9Sstevel@tonic-gate /* 2353*7c478bd9Sstevel@tonic-gate * If timestamp option is aligned nicely, get values inline, 2354*7c478bd9Sstevel@tonic-gate * otherwise call general routine to parse. Only do that 2355*7c478bd9Sstevel@tonic-gate * if timestamp is the only option. 2356*7c478bd9Sstevel@tonic-gate */ 2357*7c478bd9Sstevel@tonic-gate if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + 2358*7c478bd9Sstevel@tonic-gate TCPOPT_REAL_TS_LEN && 2359*7c478bd9Sstevel@tonic-gate OK_32PTR((up = ((uint8_t *)tcph) + 2360*7c478bd9Sstevel@tonic-gate TCP_MIN_HEADER_LENGTH)) && 2361*7c478bd9Sstevel@tonic-gate *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 2362*7c478bd9Sstevel@tonic-gate tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 2363*7c478bd9Sstevel@tonic-gate tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 2364*7c478bd9Sstevel@tonic-gate 2365*7c478bd9Sstevel@tonic-gate options = TCP_OPT_TSTAMP_PRESENT; 2366*7c478bd9Sstevel@tonic-gate } else { 2367*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 2368*7c478bd9Sstevel@tonic-gate tcpoptp->tcp = tcp; 2369*7c478bd9Sstevel@tonic-gate } else { 2370*7c478bd9Sstevel@tonic-gate tcpoptp->tcp = NULL; 2371*7c478bd9Sstevel@tonic-gate } 2372*7c478bd9Sstevel@tonic-gate options = tcp_parse_options(tcph, tcpoptp); 2373*7c478bd9Sstevel@tonic-gate } 2374*7c478bd9Sstevel@tonic-gate 2375*7c478bd9Sstevel@tonic-gate if (options & TCP_OPT_TSTAMP_PRESENT) { 2376*7c478bd9Sstevel@tonic-gate /* 2377*7c478bd9Sstevel@tonic-gate * Do PAWS per RFC 1323 section 4.2. 
Accept RST 2378*7c478bd9Sstevel@tonic-gate * regardless of the timestamp, page 18 RFC 1323.bis. 2379*7c478bd9Sstevel@tonic-gate */ 2380*7c478bd9Sstevel@tonic-gate if ((flags & TH_RST) == 0 && 2381*7c478bd9Sstevel@tonic-gate TSTMP_LT(tcpoptp->tcp_opt_ts_val, 2382*7c478bd9Sstevel@tonic-gate tcp->tcp_ts_recent)) { 2383*7c478bd9Sstevel@tonic-gate if (TSTMP_LT(prom_gettime(), 2384*7c478bd9Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { 2385*7c478bd9Sstevel@tonic-gate /* This segment is not acceptable. */ 2386*7c478bd9Sstevel@tonic-gate return (B_FALSE); 2387*7c478bd9Sstevel@tonic-gate } else { 2388*7c478bd9Sstevel@tonic-gate /* 2389*7c478bd9Sstevel@tonic-gate * Connection has been idle for 2390*7c478bd9Sstevel@tonic-gate * too long. Reset the timestamp 2391*7c478bd9Sstevel@tonic-gate * and assume the segment is valid. 2392*7c478bd9Sstevel@tonic-gate */ 2393*7c478bd9Sstevel@tonic-gate tcp->tcp_ts_recent = 2394*7c478bd9Sstevel@tonic-gate tcpoptp->tcp_opt_ts_val; 2395*7c478bd9Sstevel@tonic-gate } 2396*7c478bd9Sstevel@tonic-gate } 2397*7c478bd9Sstevel@tonic-gate } else { 2398*7c478bd9Sstevel@tonic-gate /* 2399*7c478bd9Sstevel@tonic-gate * If we don't get a timestamp on every packet, we 2400*7c478bd9Sstevel@tonic-gate * figure we can't really trust 'em, so we stop sending 2401*7c478bd9Sstevel@tonic-gate * and parsing them. 
2402*7c478bd9Sstevel@tonic-gate */ 2403*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_ts_ok = B_FALSE; 2404*7c478bd9Sstevel@tonic-gate 2405*7c478bd9Sstevel@tonic-gate tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2406*7c478bd9Sstevel@tonic-gate tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2407*7c478bd9Sstevel@tonic-gate tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 2408*7c478bd9Sstevel@tonic-gate tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 2409*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 2410*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 2411*7c478bd9Sstevel@tonic-gate tcp->tcp_max_sack_blk = 4; 2412*7c478bd9Sstevel@tonic-gate } 2413*7c478bd9Sstevel@tonic-gate } 2414*7c478bd9Sstevel@tonic-gate return (B_TRUE); 2415*7c478bd9Sstevel@tonic-gate } 2416*7c478bd9Sstevel@tonic-gate 2417*7c478bd9Sstevel@tonic-gate /* 2418*7c478bd9Sstevel@tonic-gate * tcp_get_seg_mp() is called to get the pointer to a segment in the 2419*7c478bd9Sstevel@tonic-gate * send queue which starts at the given seq. no. 2420*7c478bd9Sstevel@tonic-gate * 2421*7c478bd9Sstevel@tonic-gate * Parameters: 2422*7c478bd9Sstevel@tonic-gate * tcp_t *tcp: the tcp instance pointer. 2423*7c478bd9Sstevel@tonic-gate * uint32_t seq: the starting seq. no of the requested segment. 2424*7c478bd9Sstevel@tonic-gate * int32_t *off: after the execution, *off will be the offset to 2425*7c478bd9Sstevel@tonic-gate * the returned mblk which points to the requested seq no. 2426*7c478bd9Sstevel@tonic-gate * 2427*7c478bd9Sstevel@tonic-gate * Return: 2428*7c478bd9Sstevel@tonic-gate * A mblk_t pointer pointing to the requested segment in send queue. 
2429*7c478bd9Sstevel@tonic-gate */ 2430*7c478bd9Sstevel@tonic-gate static mblk_t * 2431*7c478bd9Sstevel@tonic-gate tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 2432*7c478bd9Sstevel@tonic-gate { 2433*7c478bd9Sstevel@tonic-gate int32_t cnt; 2434*7c478bd9Sstevel@tonic-gate mblk_t *mp; 2435*7c478bd9Sstevel@tonic-gate 2436*7c478bd9Sstevel@tonic-gate /* Defensive coding. Make sure we don't send incorrect data. */ 2437*7c478bd9Sstevel@tonic-gate if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt) || 2438*7c478bd9Sstevel@tonic-gate off == NULL) { 2439*7c478bd9Sstevel@tonic-gate return (NULL); 2440*7c478bd9Sstevel@tonic-gate } 2441*7c478bd9Sstevel@tonic-gate cnt = seq - tcp->tcp_suna; 2442*7c478bd9Sstevel@tonic-gate mp = tcp->tcp_xmit_head; 2443*7c478bd9Sstevel@tonic-gate while (cnt > 0 && mp) { 2444*7c478bd9Sstevel@tonic-gate cnt -= mp->b_wptr - mp->b_rptr; 2445*7c478bd9Sstevel@tonic-gate if (cnt < 0) { 2446*7c478bd9Sstevel@tonic-gate cnt += mp->b_wptr - mp->b_rptr; 2447*7c478bd9Sstevel@tonic-gate break; 2448*7c478bd9Sstevel@tonic-gate } 2449*7c478bd9Sstevel@tonic-gate mp = mp->b_cont; 2450*7c478bd9Sstevel@tonic-gate } 2451*7c478bd9Sstevel@tonic-gate assert(mp != NULL); 2452*7c478bd9Sstevel@tonic-gate *off = cnt; 2453*7c478bd9Sstevel@tonic-gate return (mp); 2454*7c478bd9Sstevel@tonic-gate } 2455*7c478bd9Sstevel@tonic-gate 2456*7c478bd9Sstevel@tonic-gate /* 2457*7c478bd9Sstevel@tonic-gate * This function handles all retransmissions if SACK is enabled for this 2458*7c478bd9Sstevel@tonic-gate * connection. First it calculates how many segments can be retransmitted 2459*7c478bd9Sstevel@tonic-gate * based on tcp_pipe. Then it goes thru the notsack list to find eligible 2460*7c478bd9Sstevel@tonic-gate * segments. A segment is eligible if sack_cnt for that segment is greater 2461*7c478bd9Sstevel@tonic-gate * than or equal tcp_dupack_fast_retransmit. 
After it has retransmitted 2462*7c478bd9Sstevel@tonic-gate * all eligible segments, it checks to see if TCP can send some new segments 2463*7c478bd9Sstevel@tonic-gate * (fast recovery). If it can, it returns 1. Otherwise it returns 0. 2464*7c478bd9Sstevel@tonic-gate * 2465*7c478bd9Sstevel@tonic-gate * Parameters: 2466*7c478bd9Sstevel@tonic-gate * tcp_t *tcp: the tcp structure of the connection. 2467*7c478bd9Sstevel@tonic-gate * 2468*7c478bd9Sstevel@tonic-gate * Return: 2469*7c478bd9Sstevel@tonic-gate * 1 if the pipe is not full (new data can be sent), 0 otherwise 2470*7c478bd9Sstevel@tonic-gate */ 2471*7c478bd9Sstevel@tonic-gate static int32_t 2472*7c478bd9Sstevel@tonic-gate tcp_sack_rxmit(tcp_t *tcp, int sock_id) 2473*7c478bd9Sstevel@tonic-gate { 2474*7c478bd9Sstevel@tonic-gate notsack_blk_t *notsack_blk; 2475*7c478bd9Sstevel@tonic-gate int32_t usable_swnd; 2476*7c478bd9Sstevel@tonic-gate int32_t mss; 2477*7c478bd9Sstevel@tonic-gate uint32_t seg_len; 2478*7c478bd9Sstevel@tonic-gate mblk_t *xmit_mp; 2479*7c478bd9Sstevel@tonic-gate 2480*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 2481*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_notsack_list != NULL); 2482*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_rexmit == B_FALSE); 2483*7c478bd9Sstevel@tonic-gate 2484*7c478bd9Sstevel@tonic-gate /* Defensive coding in case there is a bug... */ 2485*7c478bd9Sstevel@tonic-gate if (tcp->tcp_notsack_list == NULL) { 2486*7c478bd9Sstevel@tonic-gate return (0); 2487*7c478bd9Sstevel@tonic-gate } 2488*7c478bd9Sstevel@tonic-gate notsack_blk = tcp->tcp_notsack_list; 2489*7c478bd9Sstevel@tonic-gate mss = tcp->tcp_mss; 2490*7c478bd9Sstevel@tonic-gate 2491*7c478bd9Sstevel@tonic-gate /* 2492*7c478bd9Sstevel@tonic-gate * Limit the num of outstanding data in the network to be 2493*7c478bd9Sstevel@tonic-gate * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 
2494*7c478bd9Sstevel@tonic-gate */ 2495*7c478bd9Sstevel@tonic-gate usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 2496*7c478bd9Sstevel@tonic-gate 2497*7c478bd9Sstevel@tonic-gate /* At least retransmit 1 MSS of data. */ 2498*7c478bd9Sstevel@tonic-gate if (usable_swnd <= 0) { 2499*7c478bd9Sstevel@tonic-gate usable_swnd = mss; 2500*7c478bd9Sstevel@tonic-gate } 2501*7c478bd9Sstevel@tonic-gate 2502*7c478bd9Sstevel@tonic-gate /* Make sure no new RTT samples will be taken. */ 2503*7c478bd9Sstevel@tonic-gate tcp->tcp_csuna = tcp->tcp_snxt; 2504*7c478bd9Sstevel@tonic-gate 2505*7c478bd9Sstevel@tonic-gate notsack_blk = tcp->tcp_notsack_list; 2506*7c478bd9Sstevel@tonic-gate while (usable_swnd > 0) { 2507*7c478bd9Sstevel@tonic-gate mblk_t *snxt_mp, *tmp_mp; 2508*7c478bd9Sstevel@tonic-gate tcp_seq begin = tcp->tcp_sack_snxt; 2509*7c478bd9Sstevel@tonic-gate tcp_seq end; 2510*7c478bd9Sstevel@tonic-gate int32_t off; 2511*7c478bd9Sstevel@tonic-gate 2512*7c478bd9Sstevel@tonic-gate for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 2513*7c478bd9Sstevel@tonic-gate if (SEQ_GT(notsack_blk->end, begin) && 2514*7c478bd9Sstevel@tonic-gate (notsack_blk->sack_cnt >= 2515*7c478bd9Sstevel@tonic-gate tcp_dupack_fast_retransmit)) { 2516*7c478bd9Sstevel@tonic-gate end = notsack_blk->end; 2517*7c478bd9Sstevel@tonic-gate if (SEQ_LT(begin, notsack_blk->begin)) { 2518*7c478bd9Sstevel@tonic-gate begin = notsack_blk->begin; 2519*7c478bd9Sstevel@tonic-gate } 2520*7c478bd9Sstevel@tonic-gate break; 2521*7c478bd9Sstevel@tonic-gate } 2522*7c478bd9Sstevel@tonic-gate } 2523*7c478bd9Sstevel@tonic-gate /* 2524*7c478bd9Sstevel@tonic-gate * All holes are filled. Manipulate tcp_cwnd to send more 2525*7c478bd9Sstevel@tonic-gate * if we can. Note that after the SACK recovery, tcp_cwnd is 2526*7c478bd9Sstevel@tonic-gate * set to tcp_cwnd_ssthresh. 
2527*7c478bd9Sstevel@tonic-gate */ 2528*7c478bd9Sstevel@tonic-gate if (notsack_blk == NULL) { 2529*7c478bd9Sstevel@tonic-gate usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 2530*7c478bd9Sstevel@tonic-gate if (usable_swnd <= 0) { 2531*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 2532*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_cwnd > 0); 2533*7c478bd9Sstevel@tonic-gate return (0); 2534*7c478bd9Sstevel@tonic-gate } else { 2535*7c478bd9Sstevel@tonic-gate usable_swnd = usable_swnd / mss; 2536*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 2537*7c478bd9Sstevel@tonic-gate MAX(usable_swnd * mss, mss); 2538*7c478bd9Sstevel@tonic-gate return (1); 2539*7c478bd9Sstevel@tonic-gate } 2540*7c478bd9Sstevel@tonic-gate } 2541*7c478bd9Sstevel@tonic-gate 2542*7c478bd9Sstevel@tonic-gate /* 2543*7c478bd9Sstevel@tonic-gate * Note that we may send more than usable_swnd allows here 2544*7c478bd9Sstevel@tonic-gate * because of round off, but no more than 1 MSS of data. 2545*7c478bd9Sstevel@tonic-gate */ 2546*7c478bd9Sstevel@tonic-gate seg_len = end - begin; 2547*7c478bd9Sstevel@tonic-gate if (seg_len > mss) 2548*7c478bd9Sstevel@tonic-gate seg_len = mss; 2549*7c478bd9Sstevel@tonic-gate snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 2550*7c478bd9Sstevel@tonic-gate assert(snxt_mp != NULL); 2551*7c478bd9Sstevel@tonic-gate /* This should not happen. Defensive coding again... 
*/ 2552*7c478bd9Sstevel@tonic-gate if (snxt_mp == NULL) { 2553*7c478bd9Sstevel@tonic-gate return (0); 2554*7c478bd9Sstevel@tonic-gate } 2555*7c478bd9Sstevel@tonic-gate 2556*7c478bd9Sstevel@tonic-gate xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 2557*7c478bd9Sstevel@tonic-gate &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 2558*7c478bd9Sstevel@tonic-gate 2559*7c478bd9Sstevel@tonic-gate if (xmit_mp == NULL) 2560*7c478bd9Sstevel@tonic-gate return (0); 2561*7c478bd9Sstevel@tonic-gate 2562*7c478bd9Sstevel@tonic-gate usable_swnd -= seg_len; 2563*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe += seg_len; 2564*7c478bd9Sstevel@tonic-gate tcp->tcp_sack_snxt = begin + seg_len; 2565*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_sack_rxmit", xmit_mp); 2566*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, xmit_mp); 2567*7c478bd9Sstevel@tonic-gate freeb(xmit_mp); 2568*7c478bd9Sstevel@tonic-gate 2569*7c478bd9Sstevel@tonic-gate /* 2570*7c478bd9Sstevel@tonic-gate * Update the send timestamp to avoid false retransmission. 2571*7c478bd9Sstevel@tonic-gate */ 2572*7c478bd9Sstevel@tonic-gate snxt_mp->b_prev = (mblk_t *)prom_gettime(); 2573*7c478bd9Sstevel@tonic-gate 2574*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpRetransSegs); 2575*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpRetransBytes, seg_len); 2576*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutSackRetransSegs); 2577*7c478bd9Sstevel@tonic-gate /* 2578*7c478bd9Sstevel@tonic-gate * Update tcp_rexmit_max to extend this SACK recovery phase. 2579*7c478bd9Sstevel@tonic-gate * This happens when new data sent during fast recovery is 2580*7c478bd9Sstevel@tonic-gate * also lost. If TCP retransmits those new data, it needs 2581*7c478bd9Sstevel@tonic-gate * to extend SACK recover phase to avoid starting another 2582*7c478bd9Sstevel@tonic-gate * fast retransmit/recovery unnecessarily. 
2583*7c478bd9Sstevel@tonic-gate */ 2584*7c478bd9Sstevel@tonic-gate if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 2585*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 2586*7c478bd9Sstevel@tonic-gate } 2587*7c478bd9Sstevel@tonic-gate } 2588*7c478bd9Sstevel@tonic-gate return (0); 2589*7c478bd9Sstevel@tonic-gate } 2590*7c478bd9Sstevel@tonic-gate 2591*7c478bd9Sstevel@tonic-gate static void 2592*7c478bd9Sstevel@tonic-gate tcp_rput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 2593*7c478bd9Sstevel@tonic-gate { 2594*7c478bd9Sstevel@tonic-gate uchar_t *rptr; 2595*7c478bd9Sstevel@tonic-gate struct ip *iph; 2596*7c478bd9Sstevel@tonic-gate tcp_t *tcp1; 2597*7c478bd9Sstevel@tonic-gate tcpha_t *tcph; 2598*7c478bd9Sstevel@tonic-gate uint32_t seg_ack; 2599*7c478bd9Sstevel@tonic-gate int seg_len; 2600*7c478bd9Sstevel@tonic-gate uint_t ip_hdr_len; 2601*7c478bd9Sstevel@tonic-gate uint32_t seg_seq; 2602*7c478bd9Sstevel@tonic-gate mblk_t *mp1; 2603*7c478bd9Sstevel@tonic-gate uint_t flags; 2604*7c478bd9Sstevel@tonic-gate uint32_t new_swnd = 0; 2605*7c478bd9Sstevel@tonic-gate int mss; 2606*7c478bd9Sstevel@tonic-gate boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 2607*7c478bd9Sstevel@tonic-gate int32_t gap; 2608*7c478bd9Sstevel@tonic-gate int32_t rgap; 2609*7c478bd9Sstevel@tonic-gate tcp_opt_t tcpopt; 2610*7c478bd9Sstevel@tonic-gate int32_t bytes_acked; 2611*7c478bd9Sstevel@tonic-gate int npkt; 2612*7c478bd9Sstevel@tonic-gate uint32_t cwnd; 2613*7c478bd9Sstevel@tonic-gate uint32_t add; 2614*7c478bd9Sstevel@tonic-gate 2615*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2616*7c478bd9Sstevel@tonic-gate printf("tcp_rput_data sock %d mp %x mp_datap %x #################\n", 2617*7c478bd9Sstevel@tonic-gate sock_id, mp, mp->b_datap); 2618*7c478bd9Sstevel@tonic-gate #endif 2619*7c478bd9Sstevel@tonic-gate 2620*7c478bd9Sstevel@tonic-gate /* Dump the packet when debugging. 
*/ 2621*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_rput_data", mp); 2622*7c478bd9Sstevel@tonic-gate 2623*7c478bd9Sstevel@tonic-gate assert(OK_32PTR(mp->b_rptr)); 2624*7c478bd9Sstevel@tonic-gate 2625*7c478bd9Sstevel@tonic-gate rptr = mp->b_rptr; 2626*7c478bd9Sstevel@tonic-gate iph = (struct ip *)rptr; 2627*7c478bd9Sstevel@tonic-gate ip_hdr_len = IPH_HDR_LENGTH(rptr); 2628*7c478bd9Sstevel@tonic-gate if (ip_hdr_len != IP_SIMPLE_HDR_LENGTH) { 2629*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2630*7c478bd9Sstevel@tonic-gate printf("Not simple IP header\n"); 2631*7c478bd9Sstevel@tonic-gate #endif 2632*7c478bd9Sstevel@tonic-gate /* We cannot handle IP option yet... */ 2633*7c478bd9Sstevel@tonic-gate tcp_drops++; 2634*7c478bd9Sstevel@tonic-gate freeb(mp); 2635*7c478bd9Sstevel@tonic-gate return; 2636*7c478bd9Sstevel@tonic-gate } 2637*7c478bd9Sstevel@tonic-gate /* The TCP header must be aligned. */ 2638*7c478bd9Sstevel@tonic-gate tcph = (tcpha_t *)&rptr[ip_hdr_len]; 2639*7c478bd9Sstevel@tonic-gate seg_seq = ntohl(tcph->tha_seq); 2640*7c478bd9Sstevel@tonic-gate seg_ack = ntohl(tcph->tha_ack); 2641*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 2642*7c478bd9Sstevel@tonic-gate seg_len = (int)(mp->b_wptr - rptr) - 2643*7c478bd9Sstevel@tonic-gate (ip_hdr_len + TCP_HDR_LENGTH(((tcph_t *)tcph))); 2644*7c478bd9Sstevel@tonic-gate /* In inetboot, b_cont should always be NULL. */ 2645*7c478bd9Sstevel@tonic-gate assert(mp->b_cont == NULL); 2646*7c478bd9Sstevel@tonic-gate 2647*7c478bd9Sstevel@tonic-gate /* Verify the checksum. 
*/ 2648*7c478bd9Sstevel@tonic-gate if (tcp_verify_cksum(mp) < 0) { 2649*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2650*7c478bd9Sstevel@tonic-gate printf("tcp_rput_data: wrong cksum\n"); 2651*7c478bd9Sstevel@tonic-gate #endif 2652*7c478bd9Sstevel@tonic-gate freemsg(mp); 2653*7c478bd9Sstevel@tonic-gate return; 2654*7c478bd9Sstevel@tonic-gate } 2655*7c478bd9Sstevel@tonic-gate 2656*7c478bd9Sstevel@tonic-gate /* 2657*7c478bd9Sstevel@tonic-gate * This segment is not for us, try to find its 2658*7c478bd9Sstevel@tonic-gate * intended receiver. 2659*7c478bd9Sstevel@tonic-gate */ 2660*7c478bd9Sstevel@tonic-gate if (tcp == NULL || 2661*7c478bd9Sstevel@tonic-gate tcph->tha_lport != tcp->tcp_fport || 2662*7c478bd9Sstevel@tonic-gate tcph->tha_fport != tcp->tcp_lport || 2663*7c478bd9Sstevel@tonic-gate iph->ip_src.s_addr != tcp->tcp_remote || 2664*7c478bd9Sstevel@tonic-gate iph->ip_dst.s_addr != tcp->tcp_bound_source) { 2665*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2666*7c478bd9Sstevel@tonic-gate printf("tcp_rput_data: not for us, state %d\n", 2667*7c478bd9Sstevel@tonic-gate tcp->tcp_state); 2668*7c478bd9Sstevel@tonic-gate #endif 2669*7c478bd9Sstevel@tonic-gate /* 2670*7c478bd9Sstevel@tonic-gate * First try to find a established connection. If none 2671*7c478bd9Sstevel@tonic-gate * is found, look for a listener. 2672*7c478bd9Sstevel@tonic-gate * 2673*7c478bd9Sstevel@tonic-gate * If a listener is found, we need to check to see if the 2674*7c478bd9Sstevel@tonic-gate * incoming segment is for one of its eagers. If it is, 2675*7c478bd9Sstevel@tonic-gate * give it to the eager. If not, listener should take care 2676*7c478bd9Sstevel@tonic-gate * of it. 
2677*7c478bd9Sstevel@tonic-gate */ 2678*7c478bd9Sstevel@tonic-gate if ((tcp1 = tcp_lookup_ipv4(iph, tcph, TCPS_SYN_SENT, 2679*7c478bd9Sstevel@tonic-gate &sock_id)) != NULL || 2680*7c478bd9Sstevel@tonic-gate (tcp1 = tcp_lookup_listener_ipv4(iph->ip_dst.s_addr, 2681*7c478bd9Sstevel@tonic-gate tcph->tha_fport, &sock_id)) != NULL) { 2682*7c478bd9Sstevel@tonic-gate if (tcp1->tcp_state == TCPS_LISTEN) { 2683*7c478bd9Sstevel@tonic-gate if ((tcp = tcp_lookup_eager_ipv4(tcp1, 2684*7c478bd9Sstevel@tonic-gate iph, tcph)) == NULL) { 2685*7c478bd9Sstevel@tonic-gate /* No eager... sent to listener */ 2686*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2687*7c478bd9Sstevel@tonic-gate printf("found the listener: %s\n", 2688*7c478bd9Sstevel@tonic-gate tcp_display(tcp1, NULL, 2689*7c478bd9Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 2690*7c478bd9Sstevel@tonic-gate #endif 2691*7c478bd9Sstevel@tonic-gate tcp = tcp1; 2692*7c478bd9Sstevel@tonic-gate } 2693*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2694*7c478bd9Sstevel@tonic-gate else { 2695*7c478bd9Sstevel@tonic-gate printf("found the eager: %s\n", 2696*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, 2697*7c478bd9Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 2698*7c478bd9Sstevel@tonic-gate } 2699*7c478bd9Sstevel@tonic-gate #endif 2700*7c478bd9Sstevel@tonic-gate } else { 2701*7c478bd9Sstevel@tonic-gate /* Non listener found... */ 2702*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2703*7c478bd9Sstevel@tonic-gate printf("found the connection: %s\n", 2704*7c478bd9Sstevel@tonic-gate tcp_display(tcp1, NULL, 2705*7c478bd9Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 2706*7c478bd9Sstevel@tonic-gate #endif 2707*7c478bd9Sstevel@tonic-gate tcp = tcp1; 2708*7c478bd9Sstevel@tonic-gate } 2709*7c478bd9Sstevel@tonic-gate } else { 2710*7c478bd9Sstevel@tonic-gate /* 2711*7c478bd9Sstevel@tonic-gate * No connection for this segment... 2712*7c478bd9Sstevel@tonic-gate * Send a RST to the other side. 
2713*7c478bd9Sstevel@tonic-gate */ 2714*7c478bd9Sstevel@tonic-gate tcp_xmit_listeners_reset(sock_id, mp, ip_hdr_len); 2715*7c478bd9Sstevel@tonic-gate return; 2716*7c478bd9Sstevel@tonic-gate } 2717*7c478bd9Sstevel@tonic-gate } 2718*7c478bd9Sstevel@tonic-gate 2719*7c478bd9Sstevel@tonic-gate flags = tcph->tha_flags & 0xFF; 2720*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInSegs); 2721*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_TIME_WAIT) { 2722*7c478bd9Sstevel@tonic-gate tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 2723*7c478bd9Sstevel@tonic-gate seg_len, (tcph_t *)tcph, sock_id); 2724*7c478bd9Sstevel@tonic-gate return; 2725*7c478bd9Sstevel@tonic-gate } 2726*7c478bd9Sstevel@tonic-gate /* 2727*7c478bd9Sstevel@tonic-gate * From this point we can assume that the tcp is not compressed, 2728*7c478bd9Sstevel@tonic-gate * since we would have branched off to tcp_time_wait_processing() 2729*7c478bd9Sstevel@tonic-gate * in such a case. 2730*7c478bd9Sstevel@tonic-gate */ 2731*7c478bd9Sstevel@tonic-gate assert(tcp != NULL && tcp->tcp_state != TCPS_TIME_WAIT); 2732*7c478bd9Sstevel@tonic-gate 2733*7c478bd9Sstevel@tonic-gate /* 2734*7c478bd9Sstevel@tonic-gate * After this point, we know we have the correct TCP, so update 2735*7c478bd9Sstevel@tonic-gate * the receive time. 2736*7c478bd9Sstevel@tonic-gate */ 2737*7c478bd9Sstevel@tonic-gate tcp->tcp_last_recv_time = prom_gettime(); 2738*7c478bd9Sstevel@tonic-gate 2739*7c478bd9Sstevel@tonic-gate /* In inetboot, we do not handle urgent pointer... 
*/ 2740*7c478bd9Sstevel@tonic-gate if (flags & TH_URG) { 2741*7c478bd9Sstevel@tonic-gate freemsg(mp); 2742*7c478bd9Sstevel@tonic-gate DEBUG_1("tcp_rput_data(%d): received segment with urgent " 2743*7c478bd9Sstevel@tonic-gate "pointer\n", sock_id); 2744*7c478bd9Sstevel@tonic-gate tcp_drops++; 2745*7c478bd9Sstevel@tonic-gate return; 2746*7c478bd9Sstevel@tonic-gate } 2747*7c478bd9Sstevel@tonic-gate 2748*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 2749*7c478bd9Sstevel@tonic-gate case TCPS_LISTEN: 2750*7c478bd9Sstevel@tonic-gate if ((flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { 2751*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 2752*7c478bd9Sstevel@tonic-gate freemsg(mp); 2753*7c478bd9Sstevel@tonic-gate return; 2754*7c478bd9Sstevel@tonic-gate } 2755*7c478bd9Sstevel@tonic-gate if (flags & TH_ACK) { 2756*7c478bd9Sstevel@tonic-gate tcp_xmit_early_reset("TCPS_LISTEN-TH_ACK", 2757*7c478bd9Sstevel@tonic-gate sock_id, mp, seg_ack, 0, TH_RST, 2758*7c478bd9Sstevel@tonic-gate ip_hdr_len); 2759*7c478bd9Sstevel@tonic-gate return; 2760*7c478bd9Sstevel@tonic-gate } 2761*7c478bd9Sstevel@tonic-gate if (!(flags & TH_SYN)) { 2762*7c478bd9Sstevel@tonic-gate freemsg(mp); 2763*7c478bd9Sstevel@tonic-gate return; 2764*7c478bd9Sstevel@tonic-gate } 2765*7c478bd9Sstevel@tonic-gate printf("tcp_rput_data: %d\n", __LINE__); 2766*7c478bd9Sstevel@tonic-gate prom_panic("inetboot"); 2767*7c478bd9Sstevel@tonic-gate } 2768*7c478bd9Sstevel@tonic-gate if (tcp->tcp_conn_req_max > 0) { 2769*7c478bd9Sstevel@tonic-gate tcp = tcp_conn_request(tcp, mp, sock_id, ip_hdr_len); 2770*7c478bd9Sstevel@tonic-gate if (tcp == NULL) { 2771*7c478bd9Sstevel@tonic-gate freemsg(mp); 2772*7c478bd9Sstevel@tonic-gate return; 2773*7c478bd9Sstevel@tonic-gate } 2774*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2775*7c478bd9Sstevel@tonic-gate printf("tcp_rput_data: new tcp created\n"); 2776*7c478bd9Sstevel@tonic-gate #endif 2777*7c478bd9Sstevel@tonic-gate } 2778*7c478bd9Sstevel@tonic-gate tcp->tcp_irs = seg_seq; 
2779*7c478bd9Sstevel@tonic-gate tcp->tcp_rack = seg_seq; 2780*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt = seg_seq + 1; 2781*7c478bd9Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2782*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpPassiveOpens); 2783*7c478bd9Sstevel@tonic-gate goto syn_rcvd; 2784*7c478bd9Sstevel@tonic-gate case TCPS_SYN_SENT: 2785*7c478bd9Sstevel@tonic-gate if (flags & TH_ACK) { 2786*7c478bd9Sstevel@tonic-gate /* 2787*7c478bd9Sstevel@tonic-gate * Note that our stack cannot send data before a 2788*7c478bd9Sstevel@tonic-gate * connection is established, therefore the 2789*7c478bd9Sstevel@tonic-gate * following check is valid. Otherwise, it has 2790*7c478bd9Sstevel@tonic-gate * to be changed. 2791*7c478bd9Sstevel@tonic-gate */ 2792*7c478bd9Sstevel@tonic-gate if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 2793*7c478bd9Sstevel@tonic-gate SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2794*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 2795*7c478bd9Sstevel@tonic-gate freemsg(mp); 2796*7c478bd9Sstevel@tonic-gate return; 2797*7c478bd9Sstevel@tonic-gate } 2798*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 2799*7c478bd9Sstevel@tonic-gate tcp, mp, seg_ack, 0, TH_RST, 2800*7c478bd9Sstevel@tonic-gate ip_hdr_len, sock_id); 2801*7c478bd9Sstevel@tonic-gate return; 2802*7c478bd9Sstevel@tonic-gate } 2803*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_suna + 1 == seg_ack); 2804*7c478bd9Sstevel@tonic-gate } 2805*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 2806*7c478bd9Sstevel@tonic-gate freemsg(mp); 2807*7c478bd9Sstevel@tonic-gate if (flags & TH_ACK) { 2808*7c478bd9Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, ECONNREFUSED); 2809*7c478bd9Sstevel@tonic-gate } 2810*7c478bd9Sstevel@tonic-gate return; 2811*7c478bd9Sstevel@tonic-gate } 2812*7c478bd9Sstevel@tonic-gate if (!(flags & TH_SYN)) { 2813*7c478bd9Sstevel@tonic-gate freemsg(mp); 2814*7c478bd9Sstevel@tonic-gate return; 2815*7c478bd9Sstevel@tonic-gate } 2816*7c478bd9Sstevel@tonic-gate 
2817*7c478bd9Sstevel@tonic-gate /* Process all TCP options. */ 2818*7c478bd9Sstevel@tonic-gate tcp_process_options(tcp, (tcph_t *)tcph); 2819*7c478bd9Sstevel@tonic-gate /* 2820*7c478bd9Sstevel@tonic-gate * The following changes our rwnd to be a multiple of the 2821*7c478bd9Sstevel@tonic-gate * MIN(peer MSS, our MSS) for performance reason. 2822*7c478bd9Sstevel@tonic-gate */ 2823*7c478bd9Sstevel@tonic-gate (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rwnd, 2824*7c478bd9Sstevel@tonic-gate tcp->tcp_mss)); 2825*7c478bd9Sstevel@tonic-gate 2826*7c478bd9Sstevel@tonic-gate /* Is the other end ECN capable? */ 2827*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 2828*7c478bd9Sstevel@tonic-gate if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 2829*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_ok = B_FALSE; 2830*7c478bd9Sstevel@tonic-gate } 2831*7c478bd9Sstevel@tonic-gate } 2832*7c478bd9Sstevel@tonic-gate /* 2833*7c478bd9Sstevel@tonic-gate * Clear ECN flags because it may interfere with later 2834*7c478bd9Sstevel@tonic-gate * processing. 2835*7c478bd9Sstevel@tonic-gate */ 2836*7c478bd9Sstevel@tonic-gate flags &= ~(TH_ECE|TH_CWR); 2837*7c478bd9Sstevel@tonic-gate 2838*7c478bd9Sstevel@tonic-gate tcp->tcp_irs = seg_seq; 2839*7c478bd9Sstevel@tonic-gate tcp->tcp_rack = seg_seq; 2840*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt = seg_seq + 1; 2841*7c478bd9Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2842*7c478bd9Sstevel@tonic-gate 2843*7c478bd9Sstevel@tonic-gate if (flags & TH_ACK) { 2844*7c478bd9Sstevel@tonic-gate /* One for the SYN */ 2845*7c478bd9Sstevel@tonic-gate tcp->tcp_suna = tcp->tcp_iss + 1; 2846*7c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 2847*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_ESTABLISHED; 2848*7c478bd9Sstevel@tonic-gate 2849*7c478bd9Sstevel@tonic-gate /* 2850*7c478bd9Sstevel@tonic-gate * If SYN was retransmitted, need to reset all 2851*7c478bd9Sstevel@tonic-gate * retransmission info. 
This is because this 2852*7c478bd9Sstevel@tonic-gate * segment will be treated as a dup ACK. 2853*7c478bd9Sstevel@tonic-gate */ 2854*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rexmit) { 2855*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit = B_FALSE; 2856*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2857*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_snxt; 2858*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 2859*7c478bd9Sstevel@tonic-gate 2860*7c478bd9Sstevel@tonic-gate /* 2861*7c478bd9Sstevel@tonic-gate * Set tcp_cwnd back to 1 MSS, per 2862*7c478bd9Sstevel@tonic-gate * recommendation from 2863*7c478bd9Sstevel@tonic-gate * draft-floyd-incr-init-win-01.txt, 2864*7c478bd9Sstevel@tonic-gate * Increasing TCP's Initial Window. 2865*7c478bd9Sstevel@tonic-gate */ 2866*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_mss; 2867*7c478bd9Sstevel@tonic-gate } 2868*7c478bd9Sstevel@tonic-gate 2869*7c478bd9Sstevel@tonic-gate tcp->tcp_swl1 = seg_seq; 2870*7c478bd9Sstevel@tonic-gate tcp->tcp_swl2 = seg_ack; 2871*7c478bd9Sstevel@tonic-gate 2872*7c478bd9Sstevel@tonic-gate new_swnd = BE16_TO_U16(((tcph_t *)tcph)->th_win); 2873*7c478bd9Sstevel@tonic-gate tcp->tcp_swnd = new_swnd; 2874*7c478bd9Sstevel@tonic-gate if (new_swnd > tcp->tcp_max_swnd) 2875*7c478bd9Sstevel@tonic-gate tcp->tcp_max_swnd = new_swnd; 2876*7c478bd9Sstevel@tonic-gate 2877*7c478bd9Sstevel@tonic-gate /* 2878*7c478bd9Sstevel@tonic-gate * Always send the three-way handshake ack immediately 2879*7c478bd9Sstevel@tonic-gate * in order to make the connection complete as soon as 2880*7c478bd9Sstevel@tonic-gate * possible on the accepting host. 2881*7c478bd9Sstevel@tonic-gate */ 2882*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 2883*7c478bd9Sstevel@tonic-gate /* 2884*7c478bd9Sstevel@tonic-gate * Check to see if there is data to be sent. If 2885*7c478bd9Sstevel@tonic-gate * yes, set the transmit flag. 
Then check to see 2886*7c478bd9Sstevel@tonic-gate * if received data processing needs to be done. 2887*7c478bd9Sstevel@tonic-gate * If not, go straight to xmit_check. This short 2888*7c478bd9Sstevel@tonic-gate * cut is OK as we don't support T/TCP. 2889*7c478bd9Sstevel@tonic-gate */ 2890*7c478bd9Sstevel@tonic-gate if (tcp->tcp_unsent) 2891*7c478bd9Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 2892*7c478bd9Sstevel@tonic-gate 2893*7c478bd9Sstevel@tonic-gate if (seg_len == 0) { 2894*7c478bd9Sstevel@tonic-gate freemsg(mp); 2895*7c478bd9Sstevel@tonic-gate goto xmit_check; 2896*7c478bd9Sstevel@tonic-gate } 2897*7c478bd9Sstevel@tonic-gate 2898*7c478bd9Sstevel@tonic-gate flags &= ~TH_SYN; 2899*7c478bd9Sstevel@tonic-gate seg_seq++; 2900*7c478bd9Sstevel@tonic-gate break; 2901*7c478bd9Sstevel@tonic-gate } 2902*7c478bd9Sstevel@tonic-gate syn_rcvd: 2903*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_SYN_RCVD; 2904*7c478bd9Sstevel@tonic-gate mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 2905*7c478bd9Sstevel@tonic-gate NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 2906*7c478bd9Sstevel@tonic-gate if (mp1 != NULL) { 2907*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_rput_data replying SYN", mp1); 2908*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp1); 2909*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2910*7c478bd9Sstevel@tonic-gate freeb(mp1); 2911*7c478bd9Sstevel@tonic-gate /* 2912*7c478bd9Sstevel@tonic-gate * Let's wait till our SYN has been ACKED since we 2913*7c478bd9Sstevel@tonic-gate * don't have a timer. 
2914*7c478bd9Sstevel@tonic-gate */ 2915*7c478bd9Sstevel@tonic-gate if (tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED) < 0) { 2916*7c478bd9Sstevel@tonic-gate freemsg(mp); 2917*7c478bd9Sstevel@tonic-gate return; 2918*7c478bd9Sstevel@tonic-gate } 2919*7c478bd9Sstevel@tonic-gate } 2920*7c478bd9Sstevel@tonic-gate freemsg(mp); 2921*7c478bd9Sstevel@tonic-gate return; 2922*7c478bd9Sstevel@tonic-gate default: 2923*7c478bd9Sstevel@tonic-gate break; 2924*7c478bd9Sstevel@tonic-gate } 2925*7c478bd9Sstevel@tonic-gate mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH((tcph_t *)tcph); 2926*7c478bd9Sstevel@tonic-gate new_swnd = ntohs(tcph->tha_win) << 2927*7c478bd9Sstevel@tonic-gate ((flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); 2928*7c478bd9Sstevel@tonic-gate mss = tcp->tcp_mss; 2929*7c478bd9Sstevel@tonic-gate 2930*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 2931*7c478bd9Sstevel@tonic-gate if (!tcp_paws_check(tcp, (tcph_t *)tcph, &tcpopt)) { 2932*7c478bd9Sstevel@tonic-gate /* 2933*7c478bd9Sstevel@tonic-gate * This segment is not acceptable. 2934*7c478bd9Sstevel@tonic-gate * Drop it and send back an ACK. 2935*7c478bd9Sstevel@tonic-gate */ 2936*7c478bd9Sstevel@tonic-gate freemsg(mp); 2937*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 2938*7c478bd9Sstevel@tonic-gate goto ack_check; 2939*7c478bd9Sstevel@tonic-gate } 2940*7c478bd9Sstevel@tonic-gate } else if (tcp->tcp_snd_sack_ok) { 2941*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 2942*7c478bd9Sstevel@tonic-gate tcpopt.tcp = tcp; 2943*7c478bd9Sstevel@tonic-gate /* 2944*7c478bd9Sstevel@tonic-gate * SACK info in already updated in tcp_parse_options. Ignore 2945*7c478bd9Sstevel@tonic-gate * all other TCP options... 
2946*7c478bd9Sstevel@tonic-gate */ 2947*7c478bd9Sstevel@tonic-gate (void) tcp_parse_options((tcph_t *)tcph, &tcpopt); 2948*7c478bd9Sstevel@tonic-gate } 2949*7c478bd9Sstevel@tonic-gate try_again:; 2950*7c478bd9Sstevel@tonic-gate gap = seg_seq - tcp->tcp_rnxt; 2951*7c478bd9Sstevel@tonic-gate rgap = tcp->tcp_rwnd - (gap + seg_len); 2952*7c478bd9Sstevel@tonic-gate /* 2953*7c478bd9Sstevel@tonic-gate * gap is the amount of sequence space between what we expect to see 2954*7c478bd9Sstevel@tonic-gate * and what we got for seg_seq. A positive value for gap means 2955*7c478bd9Sstevel@tonic-gate * something got lost. A negative value means we got some old stuff. 2956*7c478bd9Sstevel@tonic-gate */ 2957*7c478bd9Sstevel@tonic-gate if (gap < 0) { 2958*7c478bd9Sstevel@tonic-gate /* Old stuff present. Is the SYN in there? */ 2959*7c478bd9Sstevel@tonic-gate if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 2960*7c478bd9Sstevel@tonic-gate (seg_len != 0)) { 2961*7c478bd9Sstevel@tonic-gate flags &= ~TH_SYN; 2962*7c478bd9Sstevel@tonic-gate seg_seq++; 2963*7c478bd9Sstevel@tonic-gate /* Recompute the gaps after noting the SYN. */ 2964*7c478bd9Sstevel@tonic-gate goto try_again; 2965*7c478bd9Sstevel@tonic-gate } 2966*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataDupSegs); 2967*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 2968*7c478bd9Sstevel@tonic-gate (seg_len > -gap ? -gap : seg_len)); 2969*7c478bd9Sstevel@tonic-gate /* Remove the old stuff from seg_len. */ 2970*7c478bd9Sstevel@tonic-gate seg_len += gap; 2971*7c478bd9Sstevel@tonic-gate /* 2972*7c478bd9Sstevel@tonic-gate * Anything left? 2973*7c478bd9Sstevel@tonic-gate * Make sure to check for unack'd FIN when rest of data 2974*7c478bd9Sstevel@tonic-gate * has been previously ack'd. 
2975*7c478bd9Sstevel@tonic-gate */ 2976*7c478bd9Sstevel@tonic-gate if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 2977*7c478bd9Sstevel@tonic-gate /* 2978*7c478bd9Sstevel@tonic-gate * Resets are only valid if they lie within our offered 2979*7c478bd9Sstevel@tonic-gate * window. If the RST bit is set, we just ignore this 2980*7c478bd9Sstevel@tonic-gate * segment. 2981*7c478bd9Sstevel@tonic-gate */ 2982*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 2983*7c478bd9Sstevel@tonic-gate freemsg(mp); 2984*7c478bd9Sstevel@tonic-gate return; 2985*7c478bd9Sstevel@tonic-gate } 2986*7c478bd9Sstevel@tonic-gate 2987*7c478bd9Sstevel@tonic-gate /* 2988*7c478bd9Sstevel@tonic-gate * This segment is "unacceptable". None of its 2989*7c478bd9Sstevel@tonic-gate * sequence space lies within our advertized window. 2990*7c478bd9Sstevel@tonic-gate * 2991*7c478bd9Sstevel@tonic-gate * Adjust seg_len to the original value for tracing. 2992*7c478bd9Sstevel@tonic-gate */ 2993*7c478bd9Sstevel@tonic-gate seg_len -= gap; 2994*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 2995*7c478bd9Sstevel@tonic-gate printf("tcp_rput: unacceptable, gap %d, rgap " 2996*7c478bd9Sstevel@tonic-gate "%d, flags 0x%x, seg_seq %u, seg_ack %u, " 2997*7c478bd9Sstevel@tonic-gate "seg_len %d, rnxt %u, snxt %u, %s", 2998*7c478bd9Sstevel@tonic-gate gap, rgap, flags, seg_seq, seg_ack, 2999*7c478bd9Sstevel@tonic-gate seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 3000*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 3001*7c478bd9Sstevel@tonic-gate #endif 3002*7c478bd9Sstevel@tonic-gate 3003*7c478bd9Sstevel@tonic-gate /* 3004*7c478bd9Sstevel@tonic-gate * Arrange to send an ACK in response to the 3005*7c478bd9Sstevel@tonic-gate * unacceptable segment per RFC 793 page 69. 
There 3006*7c478bd9Sstevel@tonic-gate * is only one small difference between ours and the 3007*7c478bd9Sstevel@tonic-gate * acceptability test in the RFC - we accept ACK-only 3008*7c478bd9Sstevel@tonic-gate * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 3009*7c478bd9Sstevel@tonic-gate * will be generated. 3010*7c478bd9Sstevel@tonic-gate * 3011*7c478bd9Sstevel@tonic-gate * Note that we have to ACK an ACK-only packet at least 3012*7c478bd9Sstevel@tonic-gate * for stacks that send 0-length keep-alives with 3013*7c478bd9Sstevel@tonic-gate * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 3014*7c478bd9Sstevel@tonic-gate * section 4.2.3.6. As long as we don't ever generate 3015*7c478bd9Sstevel@tonic-gate * an unacceptable packet in response to an incoming 3016*7c478bd9Sstevel@tonic-gate * packet that is unacceptable, it should not cause 3017*7c478bd9Sstevel@tonic-gate * "ACK wars". 3018*7c478bd9Sstevel@tonic-gate */ 3019*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 3020*7c478bd9Sstevel@tonic-gate 3021*7c478bd9Sstevel@tonic-gate /* 3022*7c478bd9Sstevel@tonic-gate * Continue processing this segment in order to use the 3023*7c478bd9Sstevel@tonic-gate * ACK information it contains, but skip all other 3024*7c478bd9Sstevel@tonic-gate * sequence-number processing. Processing the ACK 3025*7c478bd9Sstevel@tonic-gate * information is necessary in order to 3026*7c478bd9Sstevel@tonic-gate * re-synchronize connections that may have lost 3027*7c478bd9Sstevel@tonic-gate * synchronization. 3028*7c478bd9Sstevel@tonic-gate * 3029*7c478bd9Sstevel@tonic-gate * We clear seg_len and flag fields related to 3030*7c478bd9Sstevel@tonic-gate * sequence number processing as they are not 3031*7c478bd9Sstevel@tonic-gate * to be trusted for an unacceptable segment. 
3032*7c478bd9Sstevel@tonic-gate */ 3033*7c478bd9Sstevel@tonic-gate seg_len = 0; 3034*7c478bd9Sstevel@tonic-gate flags &= ~(TH_SYN | TH_FIN | TH_URG); 3035*7c478bd9Sstevel@tonic-gate goto process_ack; 3036*7c478bd9Sstevel@tonic-gate } 3037*7c478bd9Sstevel@tonic-gate 3038*7c478bd9Sstevel@tonic-gate /* Fix seg_seq, and chew the gap off the front. */ 3039*7c478bd9Sstevel@tonic-gate seg_seq = tcp->tcp_rnxt; 3040*7c478bd9Sstevel@tonic-gate do { 3041*7c478bd9Sstevel@tonic-gate mblk_t *mp2; 3042*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3043*7c478bd9Sstevel@tonic-gate (uintptr_t)UINT_MAX); 3044*7c478bd9Sstevel@tonic-gate gap += (uint_t)(mp->b_wptr - mp->b_rptr); 3045*7c478bd9Sstevel@tonic-gate if (gap > 0) { 3046*7c478bd9Sstevel@tonic-gate mp->b_rptr = mp->b_wptr - gap; 3047*7c478bd9Sstevel@tonic-gate break; 3048*7c478bd9Sstevel@tonic-gate } 3049*7c478bd9Sstevel@tonic-gate mp2 = mp; 3050*7c478bd9Sstevel@tonic-gate mp = mp->b_cont; 3051*7c478bd9Sstevel@tonic-gate freeb(mp2); 3052*7c478bd9Sstevel@tonic-gate } while (gap < 0); 3053*7c478bd9Sstevel@tonic-gate } 3054*7c478bd9Sstevel@tonic-gate /* 3055*7c478bd9Sstevel@tonic-gate * rgap is the amount of stuff received out of window. A negative 3056*7c478bd9Sstevel@tonic-gate * value is the amount out of window. 
3057*7c478bd9Sstevel@tonic-gate */ 3058*7c478bd9Sstevel@tonic-gate if (rgap < 0) { 3059*7c478bd9Sstevel@tonic-gate mblk_t *mp2; 3060*7c478bd9Sstevel@tonic-gate 3061*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rwnd == 0) 3062*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInWinProbe); 3063*7c478bd9Sstevel@tonic-gate else { 3064*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 3065*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 3066*7c478bd9Sstevel@tonic-gate } 3067*7c478bd9Sstevel@tonic-gate 3068*7c478bd9Sstevel@tonic-gate /* 3069*7c478bd9Sstevel@tonic-gate * seg_len does not include the FIN, so if more than 3070*7c478bd9Sstevel@tonic-gate * just the FIN is out of window, we act like we don't 3071*7c478bd9Sstevel@tonic-gate * see it. (If just the FIN is out of window, rgap 3072*7c478bd9Sstevel@tonic-gate * will be zero and we will go ahead and acknowledge 3073*7c478bd9Sstevel@tonic-gate * the FIN.) 3074*7c478bd9Sstevel@tonic-gate */ 3075*7c478bd9Sstevel@tonic-gate flags &= ~TH_FIN; 3076*7c478bd9Sstevel@tonic-gate 3077*7c478bd9Sstevel@tonic-gate /* Fix seg_len and make sure there is something left. */ 3078*7c478bd9Sstevel@tonic-gate seg_len += rgap; 3079*7c478bd9Sstevel@tonic-gate if (seg_len <= 0) { 3080*7c478bd9Sstevel@tonic-gate /* 3081*7c478bd9Sstevel@tonic-gate * Resets are only valid if they lie within our offered 3082*7c478bd9Sstevel@tonic-gate * window. If the RST bit is set, we just ignore this 3083*7c478bd9Sstevel@tonic-gate * segment. 3084*7c478bd9Sstevel@tonic-gate */ 3085*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 3086*7c478bd9Sstevel@tonic-gate freemsg(mp); 3087*7c478bd9Sstevel@tonic-gate return; 3088*7c478bd9Sstevel@tonic-gate } 3089*7c478bd9Sstevel@tonic-gate 3090*7c478bd9Sstevel@tonic-gate /* Per RFC 793, we need to send back an ACK. 
*/ 3091*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 3092*7c478bd9Sstevel@tonic-gate 3093*7c478bd9Sstevel@tonic-gate /* 3094*7c478bd9Sstevel@tonic-gate * If this is a zero window probe, continue to 3095*7c478bd9Sstevel@tonic-gate * process the ACK part. But we need to set seg_len 3096*7c478bd9Sstevel@tonic-gate * to 0 to avoid data processing. Otherwise just 3097*7c478bd9Sstevel@tonic-gate * drop the segment and send back an ACK. 3098*7c478bd9Sstevel@tonic-gate */ 3099*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 3100*7c478bd9Sstevel@tonic-gate flags &= ~(TH_SYN | TH_URG); 3101*7c478bd9Sstevel@tonic-gate seg_len = 0; 3102*7c478bd9Sstevel@tonic-gate /* Let's see if we can update our rwnd */ 3103*7c478bd9Sstevel@tonic-gate tcp_rcv_drain(sock_id, tcp); 3104*7c478bd9Sstevel@tonic-gate goto process_ack; 3105*7c478bd9Sstevel@tonic-gate } else { 3106*7c478bd9Sstevel@tonic-gate freemsg(mp); 3107*7c478bd9Sstevel@tonic-gate goto ack_check; 3108*7c478bd9Sstevel@tonic-gate } 3109*7c478bd9Sstevel@tonic-gate } 3110*7c478bd9Sstevel@tonic-gate /* Pitch out of window stuff off the end. 
*/ 3111*7c478bd9Sstevel@tonic-gate rgap = seg_len; 3112*7c478bd9Sstevel@tonic-gate mp2 = mp; 3113*7c478bd9Sstevel@tonic-gate do { 3114*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp2->b_wptr - 3115*7c478bd9Sstevel@tonic-gate mp2->b_rptr) <= (uintptr_t)INT_MAX); 3116*7c478bd9Sstevel@tonic-gate rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 3117*7c478bd9Sstevel@tonic-gate if (rgap < 0) { 3118*7c478bd9Sstevel@tonic-gate mp2->b_wptr += rgap; 3119*7c478bd9Sstevel@tonic-gate if ((mp1 = mp2->b_cont) != NULL) { 3120*7c478bd9Sstevel@tonic-gate mp2->b_cont = NULL; 3121*7c478bd9Sstevel@tonic-gate freemsg(mp1); 3122*7c478bd9Sstevel@tonic-gate } 3123*7c478bd9Sstevel@tonic-gate break; 3124*7c478bd9Sstevel@tonic-gate } 3125*7c478bd9Sstevel@tonic-gate } while ((mp2 = mp2->b_cont) != NULL); 3126*7c478bd9Sstevel@tonic-gate } 3127*7c478bd9Sstevel@tonic-gate ok:; 3128*7c478bd9Sstevel@tonic-gate /* 3129*7c478bd9Sstevel@tonic-gate * TCP should check ECN info for segments inside the window only. 3130*7c478bd9Sstevel@tonic-gate * Therefore the check should be done here. 3131*7c478bd9Sstevel@tonic-gate */ 3132*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 3133*7c478bd9Sstevel@tonic-gate uchar_t tos = ((struct ip *)rptr)->ip_tos; 3134*7c478bd9Sstevel@tonic-gate 3135*7c478bd9Sstevel@tonic-gate if (flags & TH_CWR) { 3136*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_echo_on = B_FALSE; 3137*7c478bd9Sstevel@tonic-gate } 3138*7c478bd9Sstevel@tonic-gate /* 3139*7c478bd9Sstevel@tonic-gate * Note that both ECN_CE and CWR can be set in the 3140*7c478bd9Sstevel@tonic-gate * same segment. In this case, we once again turn 3141*7c478bd9Sstevel@tonic-gate * on ECN_ECHO. 
3142*7c478bd9Sstevel@tonic-gate */ 3143*7c478bd9Sstevel@tonic-gate if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 3144*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_echo_on = B_TRUE; 3145*7c478bd9Sstevel@tonic-gate } 3146*7c478bd9Sstevel@tonic-gate } 3147*7c478bd9Sstevel@tonic-gate 3148*7c478bd9Sstevel@tonic-gate /* 3149*7c478bd9Sstevel@tonic-gate * Check whether we can update tcp_ts_recent. This test is 3150*7c478bd9Sstevel@tonic-gate * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 3151*7c478bd9Sstevel@tonic-gate * Extensions for High Performance: An Update", Internet Draft. 3152*7c478bd9Sstevel@tonic-gate */ 3153*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok && 3154*7c478bd9Sstevel@tonic-gate TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 3155*7c478bd9Sstevel@tonic-gate SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 3156*7c478bd9Sstevel@tonic-gate tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 3157*7c478bd9Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 3158*7c478bd9Sstevel@tonic-gate } 3159*7c478bd9Sstevel@tonic-gate 3160*7c478bd9Sstevel@tonic-gate if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 3161*7c478bd9Sstevel@tonic-gate /* 3162*7c478bd9Sstevel@tonic-gate * FIN in an out of order segment. We record this in 3163*7c478bd9Sstevel@tonic-gate * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 3164*7c478bd9Sstevel@tonic-gate * Clear the FIN so that any check on FIN flag will fail. 3165*7c478bd9Sstevel@tonic-gate * Remember that FIN also counts in the sequence number 3166*7c478bd9Sstevel@tonic-gate * space. So we need to ack out of order FIN only segments. 
3167*7c478bd9Sstevel@tonic-gate */ 3168*7c478bd9Sstevel@tonic-gate if (flags & TH_FIN) { 3169*7c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 3170*7c478bd9Sstevel@tonic-gate tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 3171*7c478bd9Sstevel@tonic-gate flags &= ~TH_FIN; 3172*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 3173*7c478bd9Sstevel@tonic-gate } 3174*7c478bd9Sstevel@tonic-gate if (seg_len > 0) { 3175*7c478bd9Sstevel@tonic-gate /* Fill in the SACK blk list. */ 3176*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 3177*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 3178*7c478bd9Sstevel@tonic-gate tcp_sack_insert(tcp->tcp_sack_list, 3179*7c478bd9Sstevel@tonic-gate seg_seq, seg_seq + seg_len, 3180*7c478bd9Sstevel@tonic-gate &(tcp->tcp_num_sack_blk)); 3181*7c478bd9Sstevel@tonic-gate } 3182*7c478bd9Sstevel@tonic-gate 3183*7c478bd9Sstevel@tonic-gate /* 3184*7c478bd9Sstevel@tonic-gate * Attempt reassembly and see if we have something 3185*7c478bd9Sstevel@tonic-gate * ready to go. 3186*7c478bd9Sstevel@tonic-gate */ 3187*7c478bd9Sstevel@tonic-gate mp = tcp_reass(tcp, mp, seg_seq); 3188*7c478bd9Sstevel@tonic-gate /* Always ack out of order packets */ 3189*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED | TH_PUSH; 3190*7c478bd9Sstevel@tonic-gate if (mp != NULL) { 3191*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - 3192*7c478bd9Sstevel@tonic-gate mp->b_rptr) <= (uintptr_t)INT_MAX); 3193*7c478bd9Sstevel@tonic-gate seg_len = mp->b_cont ? msgdsize(mp) : 3194*7c478bd9Sstevel@tonic-gate (int)(mp->b_wptr - mp->b_rptr); 3195*7c478bd9Sstevel@tonic-gate seg_seq = tcp->tcp_rnxt; 3196*7c478bd9Sstevel@tonic-gate /* 3197*7c478bd9Sstevel@tonic-gate * A gap is filled and the seq num and len 3198*7c478bd9Sstevel@tonic-gate * of the gap match that of a previously 3199*7c478bd9Sstevel@tonic-gate * received FIN, put the FIN flag back in. 
3200*7c478bd9Sstevel@tonic-gate */ 3201*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3202*7c478bd9Sstevel@tonic-gate seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3203*7c478bd9Sstevel@tonic-gate flags |= TH_FIN; 3204*7c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits &= 3205*7c478bd9Sstevel@tonic-gate ~TCP_OFO_FIN_VALID; 3206*7c478bd9Sstevel@tonic-gate } 3207*7c478bd9Sstevel@tonic-gate } else { 3208*7c478bd9Sstevel@tonic-gate /* 3209*7c478bd9Sstevel@tonic-gate * Keep going even with NULL mp. 3210*7c478bd9Sstevel@tonic-gate * There may be a useful ACK or something else 3211*7c478bd9Sstevel@tonic-gate * we don't want to miss. 3212*7c478bd9Sstevel@tonic-gate * 3213*7c478bd9Sstevel@tonic-gate * But TCP should not perform fast retransmit 3214*7c478bd9Sstevel@tonic-gate * because of the ack number. TCP uses 3215*7c478bd9Sstevel@tonic-gate * seg_len == 0 to determine if it is a pure 3216*7c478bd9Sstevel@tonic-gate * ACK. And this is not a pure ACK. 3217*7c478bd9Sstevel@tonic-gate */ 3218*7c478bd9Sstevel@tonic-gate seg_len = 0; 3219*7c478bd9Sstevel@tonic-gate ofo_seg = B_TRUE; 3220*7c478bd9Sstevel@tonic-gate } 3221*7c478bd9Sstevel@tonic-gate } 3222*7c478bd9Sstevel@tonic-gate } else if (seg_len > 0) { 3223*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 3224*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 3225*7c478bd9Sstevel@tonic-gate /* 3226*7c478bd9Sstevel@tonic-gate * If an out of order FIN was received before, and the seq 3227*7c478bd9Sstevel@tonic-gate * num and len of the new segment match that of the FIN, 3228*7c478bd9Sstevel@tonic-gate * put the FIN flag back in. 
3229*7c478bd9Sstevel@tonic-gate */ 3230*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3231*7c478bd9Sstevel@tonic-gate seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3232*7c478bd9Sstevel@tonic-gate flags |= TH_FIN; 3233*7c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3234*7c478bd9Sstevel@tonic-gate } 3235*7c478bd9Sstevel@tonic-gate } 3236*7c478bd9Sstevel@tonic-gate if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3237*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 3238*7c478bd9Sstevel@tonic-gate freemsg(mp); 3239*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 3240*7c478bd9Sstevel@tonic-gate case TCPS_SYN_RCVD: 3241*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ECONNREFUSED); 3242*7c478bd9Sstevel@tonic-gate break; 3243*7c478bd9Sstevel@tonic-gate case TCPS_ESTABLISHED: 3244*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_1: 3245*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_2: 3246*7c478bd9Sstevel@tonic-gate case TCPS_CLOSE_WAIT: 3247*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3248*7c478bd9Sstevel@tonic-gate break; 3249*7c478bd9Sstevel@tonic-gate case TCPS_CLOSING: 3250*7c478bd9Sstevel@tonic-gate case TCPS_LAST_ACK: 3251*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, 0); 3252*7c478bd9Sstevel@tonic-gate break; 3253*7c478bd9Sstevel@tonic-gate default: 3254*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_state != TCPS_TIME_WAIT); 3255*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ENXIO); 3256*7c478bd9Sstevel@tonic-gate break; 3257*7c478bd9Sstevel@tonic-gate } 3258*7c478bd9Sstevel@tonic-gate return; 3259*7c478bd9Sstevel@tonic-gate } 3260*7c478bd9Sstevel@tonic-gate if (flags & TH_SYN) { 3261*7c478bd9Sstevel@tonic-gate /* 3262*7c478bd9Sstevel@tonic-gate * See RFC 793, Page 71 3263*7c478bd9Sstevel@tonic-gate * 3264*7c478bd9Sstevel@tonic-gate * The seq number must be in the window as it should 3265*7c478bd9Sstevel@tonic-gate * 
be "fixed" above. If it is outside window, it should 3266*7c478bd9Sstevel@tonic-gate * be already rejected. Note that we allow seg_seq to be 3267*7c478bd9Sstevel@tonic-gate * rnxt + rwnd because we want to accept 0 window probe. 3268*7c478bd9Sstevel@tonic-gate */ 3269*7c478bd9Sstevel@tonic-gate assert(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3270*7c478bd9Sstevel@tonic-gate SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3271*7c478bd9Sstevel@tonic-gate freemsg(mp); 3272*7c478bd9Sstevel@tonic-gate /* 3273*7c478bd9Sstevel@tonic-gate * If the ACK flag is not set, just use our snxt as the 3274*7c478bd9Sstevel@tonic-gate * seq number of the RST segment. 3275*7c478bd9Sstevel@tonic-gate */ 3276*7c478bd9Sstevel@tonic-gate if (!(flags & TH_ACK)) { 3277*7c478bd9Sstevel@tonic-gate seg_ack = tcp->tcp_snxt; 3278*7c478bd9Sstevel@tonic-gate } 3279*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, 3280*7c478bd9Sstevel@tonic-gate seg_seq + 1, TH_RST|TH_ACK, 0, sock_id); 3281*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_state != TCPS_TIME_WAIT); 3282*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3283*7c478bd9Sstevel@tonic-gate return; 3284*7c478bd9Sstevel@tonic-gate } 3285*7c478bd9Sstevel@tonic-gate 3286*7c478bd9Sstevel@tonic-gate process_ack: 3287*7c478bd9Sstevel@tonic-gate if (!(flags & TH_ACK)) { 3288*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 3289*7c478bd9Sstevel@tonic-gate printf("No ack in segment, dropped it, seq:%x\n", seg_seq); 3290*7c478bd9Sstevel@tonic-gate #endif 3291*7c478bd9Sstevel@tonic-gate freemsg(mp); 3292*7c478bd9Sstevel@tonic-gate goto xmit_check; 3293*7c478bd9Sstevel@tonic-gate } 3294*7c478bd9Sstevel@tonic-gate } 3295*7c478bd9Sstevel@tonic-gate bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3296*7c478bd9Sstevel@tonic-gate 3297*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_SYN_RCVD) { 3298*7c478bd9Sstevel@tonic-gate tcp_t *listener = tcp->tcp_listener; 3299*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 
3300*7c478bd9Sstevel@tonic-gate printf("Done with eager 3-way handshake\n"); 3301*7c478bd9Sstevel@tonic-gate #endif 3302*7c478bd9Sstevel@tonic-gate /* 3303*7c478bd9Sstevel@tonic-gate * NOTE: RFC 793 pg. 72 says this should be 'bytes_acked < 0' 3304*7c478bd9Sstevel@tonic-gate * but that would mean we have an ack that ignored our SYN. 3305*7c478bd9Sstevel@tonic-gate */ 3306*7c478bd9Sstevel@tonic-gate if (bytes_acked < 1 || SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3307*7c478bd9Sstevel@tonic-gate freemsg(mp); 3308*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 3309*7c478bd9Sstevel@tonic-gate tcp, NULL, seg_ack, 0, TH_RST, 0, sock_id); 3310*7c478bd9Sstevel@tonic-gate return; 3311*7c478bd9Sstevel@tonic-gate } 3312*7c478bd9Sstevel@tonic-gate 3313*7c478bd9Sstevel@tonic-gate /* 3314*7c478bd9Sstevel@tonic-gate * if the conn_req_q is full defer processing 3315*7c478bd9Sstevel@tonic-gate * until space is availabe after accept() 3316*7c478bd9Sstevel@tonic-gate * processing 3317*7c478bd9Sstevel@tonic-gate */ 3318*7c478bd9Sstevel@tonic-gate if (listener->tcp_conn_req_cnt_q < 3319*7c478bd9Sstevel@tonic-gate listener->tcp_conn_req_max) { 3320*7c478bd9Sstevel@tonic-gate tcp_t *tail; 3321*7c478bd9Sstevel@tonic-gate 3322*7c478bd9Sstevel@tonic-gate listener->tcp_conn_req_cnt_q0--; 3323*7c478bd9Sstevel@tonic-gate listener->tcp_conn_req_cnt_q++; 3324*7c478bd9Sstevel@tonic-gate 3325*7c478bd9Sstevel@tonic-gate /* Move from SYN_RCVD to ESTABLISHED list */ 3326*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3327*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_prev_q0; 3328*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3329*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0; 3330*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_prev_q0 = NULL; 3331*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0 = NULL; 3332*7c478bd9Sstevel@tonic-gate 3333*7c478bd9Sstevel@tonic-gate /* 3334*7c478bd9Sstevel@tonic-gate * Insert at end of the queue because sockfs 
3335*7c478bd9Sstevel@tonic-gate * sends down T_CONN_RES in chronological 3336*7c478bd9Sstevel@tonic-gate * order. Leaving the older conn indications 3337*7c478bd9Sstevel@tonic-gate * at front of the queue helps reducing search 3338*7c478bd9Sstevel@tonic-gate * time. 3339*7c478bd9Sstevel@tonic-gate */ 3340*7c478bd9Sstevel@tonic-gate tail = listener->tcp_eager_last_q; 3341*7c478bd9Sstevel@tonic-gate if (tail != NULL) { 3342*7c478bd9Sstevel@tonic-gate tail->tcp_eager_next_q = tcp; 3343*7c478bd9Sstevel@tonic-gate } else { 3344*7c478bd9Sstevel@tonic-gate listener->tcp_eager_next_q = tcp; 3345*7c478bd9Sstevel@tonic-gate } 3346*7c478bd9Sstevel@tonic-gate listener->tcp_eager_last_q = tcp; 3347*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q = NULL; 3348*7c478bd9Sstevel@tonic-gate } else { 3349*7c478bd9Sstevel@tonic-gate /* 3350*7c478bd9Sstevel@tonic-gate * Defer connection on q0 and set deferred 3351*7c478bd9Sstevel@tonic-gate * connection bit true 3352*7c478bd9Sstevel@tonic-gate */ 3353*7c478bd9Sstevel@tonic-gate tcp->tcp_conn_def_q0 = B_TRUE; 3354*7c478bd9Sstevel@tonic-gate 3355*7c478bd9Sstevel@tonic-gate /* take tcp out of q0 ... */ 3356*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3357*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0; 3358*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3359*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_prev_q0; 3360*7c478bd9Sstevel@tonic-gate 3361*7c478bd9Sstevel@tonic-gate /* ... 
and place it at the end of q0 */ 3362*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 3363*7c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0 = listener; 3364*7c478bd9Sstevel@tonic-gate listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 3365*7c478bd9Sstevel@tonic-gate listener->tcp_eager_prev_q0 = tcp; 3366*7c478bd9Sstevel@tonic-gate } 3367*7c478bd9Sstevel@tonic-gate 3368*7c478bd9Sstevel@tonic-gate tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 3369*7c478bd9Sstevel@tonic-gate bytes_acked--; 3370*7c478bd9Sstevel@tonic-gate 3371*7c478bd9Sstevel@tonic-gate /* 3372*7c478bd9Sstevel@tonic-gate * If SYN was retransmitted, need to reset all 3373*7c478bd9Sstevel@tonic-gate * retransmission info as this segment will be 3374*7c478bd9Sstevel@tonic-gate * treated as a dup ACK. 3375*7c478bd9Sstevel@tonic-gate */ 3376*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rexmit) { 3377*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit = B_FALSE; 3378*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3379*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_snxt; 3380*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3381*7c478bd9Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 3382*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = mss; 3383*7c478bd9Sstevel@tonic-gate } 3384*7c478bd9Sstevel@tonic-gate 3385*7c478bd9Sstevel@tonic-gate /* 3386*7c478bd9Sstevel@tonic-gate * We set the send window to zero here. 3387*7c478bd9Sstevel@tonic-gate * This is needed if there is data to be 3388*7c478bd9Sstevel@tonic-gate * processed already on the queue. 3389*7c478bd9Sstevel@tonic-gate * Later (at swnd_update label), the 3390*7c478bd9Sstevel@tonic-gate * "new_swnd > tcp_swnd" condition is satisfied 3391*7c478bd9Sstevel@tonic-gate * the XMIT_NEEDED flag is set in the current 3392*7c478bd9Sstevel@tonic-gate * (SYN_RCVD) state. 
This ensures tcp_wput_data() is 3393*7c478bd9Sstevel@tonic-gate * called if there is already data on queue in 3394*7c478bd9Sstevel@tonic-gate * this state. 3395*7c478bd9Sstevel@tonic-gate */ 3396*7c478bd9Sstevel@tonic-gate tcp->tcp_swnd = 0; 3397*7c478bd9Sstevel@tonic-gate 3398*7c478bd9Sstevel@tonic-gate if (new_swnd > tcp->tcp_max_swnd) 3399*7c478bd9Sstevel@tonic-gate tcp->tcp_max_swnd = new_swnd; 3400*7c478bd9Sstevel@tonic-gate tcp->tcp_swl1 = seg_seq; 3401*7c478bd9Sstevel@tonic-gate tcp->tcp_swl2 = seg_ack; 3402*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_ESTABLISHED; 3403*7c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 3404*7c478bd9Sstevel@tonic-gate } 3405*7c478bd9Sstevel@tonic-gate /* This code follows 4.4BSD-Lite2 mostly. */ 3406*7c478bd9Sstevel@tonic-gate if (bytes_acked < 0) 3407*7c478bd9Sstevel@tonic-gate goto est; 3408*7c478bd9Sstevel@tonic-gate 3409*7c478bd9Sstevel@tonic-gate /* 3410*7c478bd9Sstevel@tonic-gate * If TCP is ECN capable and the congestion experience bit is 3411*7c478bd9Sstevel@tonic-gate * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 3412*7c478bd9Sstevel@tonic-gate * done once per window (or more loosely, per RTT). 3413*7c478bd9Sstevel@tonic-gate */ 3414*7c478bd9Sstevel@tonic-gate if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 3415*7c478bd9Sstevel@tonic-gate tcp->tcp_cwr = B_FALSE; 3416*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 3417*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_cwr) { 3418*7c478bd9Sstevel@tonic-gate npkt = (MIN(tcp->tcp_cwnd, tcp->tcp_swnd) >> 1) / mss; 3419*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 3420*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = npkt * mss; 3421*7c478bd9Sstevel@tonic-gate /* 3422*7c478bd9Sstevel@tonic-gate * If the cwnd is 0, use the timer to clock out 3423*7c478bd9Sstevel@tonic-gate * new segments. This is required by the ECN spec. 
3424*7c478bd9Sstevel@tonic-gate */ 3425*7c478bd9Sstevel@tonic-gate if (npkt == 0) { 3426*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3427*7c478bd9Sstevel@tonic-gate /* 3428*7c478bd9Sstevel@tonic-gate * This makes sure that when the ACK comes 3429*7c478bd9Sstevel@tonic-gate * back, we will increase tcp_cwnd by 1 MSS. 3430*7c478bd9Sstevel@tonic-gate */ 3431*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_cnt = 0; 3432*7c478bd9Sstevel@tonic-gate } 3433*7c478bd9Sstevel@tonic-gate tcp->tcp_cwr = B_TRUE; 3434*7c478bd9Sstevel@tonic-gate /* 3435*7c478bd9Sstevel@tonic-gate * This marks the end of the current window of in 3436*7c478bd9Sstevel@tonic-gate * flight data. That is why we don't use 3437*7c478bd9Sstevel@tonic-gate * tcp_suna + tcp_swnd. Only data in flight can 3438*7c478bd9Sstevel@tonic-gate * provide ECN info. 3439*7c478bd9Sstevel@tonic-gate */ 3440*7c478bd9Sstevel@tonic-gate tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3441*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_FALSE; 3442*7c478bd9Sstevel@tonic-gate } 3443*7c478bd9Sstevel@tonic-gate } 3444*7c478bd9Sstevel@tonic-gate 3445*7c478bd9Sstevel@tonic-gate mp1 = tcp->tcp_xmit_head; 3446*7c478bd9Sstevel@tonic-gate if (bytes_acked == 0) { 3447*7c478bd9Sstevel@tonic-gate if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 3448*7c478bd9Sstevel@tonic-gate int dupack_cnt; 3449*7c478bd9Sstevel@tonic-gate 3450*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDupAck); 3451*7c478bd9Sstevel@tonic-gate /* 3452*7c478bd9Sstevel@tonic-gate * Fast retransmit. When we have seen exactly three 3453*7c478bd9Sstevel@tonic-gate * identical ACKs while we have unacked data 3454*7c478bd9Sstevel@tonic-gate * outstanding we take it as a hint that our peer 3455*7c478bd9Sstevel@tonic-gate * dropped something. 3456*7c478bd9Sstevel@tonic-gate * 3457*7c478bd9Sstevel@tonic-gate * If TCP is retransmitting, don't do fast retransmit. 
3458*7c478bd9Sstevel@tonic-gate */ 3459*7c478bd9Sstevel@tonic-gate if (mp1 != NULL && tcp->tcp_suna != tcp->tcp_snxt && 3460*7c478bd9Sstevel@tonic-gate ! tcp->tcp_rexmit) { 3461*7c478bd9Sstevel@tonic-gate /* Do Limited Transmit */ 3462*7c478bd9Sstevel@tonic-gate if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 3463*7c478bd9Sstevel@tonic-gate tcp_dupack_fast_retransmit) { 3464*7c478bd9Sstevel@tonic-gate /* 3465*7c478bd9Sstevel@tonic-gate * RFC 3042 3466*7c478bd9Sstevel@tonic-gate * 3467*7c478bd9Sstevel@tonic-gate * What we need to do is temporarily 3468*7c478bd9Sstevel@tonic-gate * increase tcp_cwnd so that new 3469*7c478bd9Sstevel@tonic-gate * data can be sent if it is allowed 3470*7c478bd9Sstevel@tonic-gate * by the receive window (tcp_rwnd). 3471*7c478bd9Sstevel@tonic-gate * tcp_wput_data() will take care of 3472*7c478bd9Sstevel@tonic-gate * the rest. 3473*7c478bd9Sstevel@tonic-gate * 3474*7c478bd9Sstevel@tonic-gate * If the connection is SACK capable, 3475*7c478bd9Sstevel@tonic-gate * only do limited xmit when there 3476*7c478bd9Sstevel@tonic-gate * is SACK info. 3477*7c478bd9Sstevel@tonic-gate * 3478*7c478bd9Sstevel@tonic-gate * Note how tcp_cwnd is incremented. 3479*7c478bd9Sstevel@tonic-gate * The first dup ACK will increase 3480*7c478bd9Sstevel@tonic-gate * it by 1 MSS. The second dup ACK 3481*7c478bd9Sstevel@tonic-gate * will increase it by 2 MSS. This 3482*7c478bd9Sstevel@tonic-gate * means that only 1 new segment will 3483*7c478bd9Sstevel@tonic-gate * be sent for each dup ACK. 
3484*7c478bd9Sstevel@tonic-gate */ 3485*7c478bd9Sstevel@tonic-gate if (tcp->tcp_unsent > 0 && 3486*7c478bd9Sstevel@tonic-gate (!tcp->tcp_snd_sack_ok || 3487*7c478bd9Sstevel@tonic-gate (tcp->tcp_snd_sack_ok && 3488*7c478bd9Sstevel@tonic-gate tcp->tcp_notsack_list != NULL))) { 3489*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd += mss << 3490*7c478bd9Sstevel@tonic-gate (tcp->tcp_dupack_cnt - 1); 3491*7c478bd9Sstevel@tonic-gate flags |= TH_LIMIT_XMIT; 3492*7c478bd9Sstevel@tonic-gate } 3493*7c478bd9Sstevel@tonic-gate } else if (dupack_cnt == 3494*7c478bd9Sstevel@tonic-gate tcp_dupack_fast_retransmit) { 3495*7c478bd9Sstevel@tonic-gate 3496*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3497*7c478bd9Sstevel@tonic-gate /* 3498*7c478bd9Sstevel@tonic-gate * If we have reduced tcp_ssthresh 3499*7c478bd9Sstevel@tonic-gate * because of ECN, do not reduce it again 3500*7c478bd9Sstevel@tonic-gate * unless it is already one window of data 3501*7c478bd9Sstevel@tonic-gate * away. After one window of data, tcp_cwr 3502*7c478bd9Sstevel@tonic-gate * should then be cleared. Note that 3503*7c478bd9Sstevel@tonic-gate * for non ECN capable connection, tcp_cwr 3504*7c478bd9Sstevel@tonic-gate * should always be false. 3505*7c478bd9Sstevel@tonic-gate * 3506*7c478bd9Sstevel@tonic-gate * Adjust cwnd since the duplicate 3507*7c478bd9Sstevel@tonic-gate * ack indicates that a packet was 3508*7c478bd9Sstevel@tonic-gate * dropped (due to congestion.) 
3509*7c478bd9Sstevel@tonic-gate */ 3510*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_cwr) { 3511*7c478bd9Sstevel@tonic-gate npkt = (MIN(tcp->tcp_cwnd, 3512*7c478bd9Sstevel@tonic-gate tcp->tcp_swnd) >> 1) / mss; 3513*7c478bd9Sstevel@tonic-gate if (npkt < 2) 3514*7c478bd9Sstevel@tonic-gate npkt = 2; 3515*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh = npkt * mss; 3516*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = (npkt + 3517*7c478bd9Sstevel@tonic-gate tcp->tcp_dupack_cnt) * mss; 3518*7c478bd9Sstevel@tonic-gate } 3519*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 3520*7c478bd9Sstevel@tonic-gate tcp->tcp_cwr = B_TRUE; 3521*7c478bd9Sstevel@tonic-gate tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3522*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_FALSE; 3523*7c478bd9Sstevel@tonic-gate } 3524*7c478bd9Sstevel@tonic-gate 3525*7c478bd9Sstevel@tonic-gate /* 3526*7c478bd9Sstevel@tonic-gate * We do Hoe's algorithm. Refer to her 3527*7c478bd9Sstevel@tonic-gate * paper "Improving the Start-up Behavior 3528*7c478bd9Sstevel@tonic-gate * of a Congestion Control Scheme for TCP," 3529*7c478bd9Sstevel@tonic-gate * appeared in SIGCOMM'96. 3530*7c478bd9Sstevel@tonic-gate * 3531*7c478bd9Sstevel@tonic-gate * Save highest seq no we have sent so far. 3532*7c478bd9Sstevel@tonic-gate * Be careful about the invisible FIN byte. 3533*7c478bd9Sstevel@tonic-gate */ 3534*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 3535*7c478bd9Sstevel@tonic-gate (tcp->tcp_unsent == 0)) { 3536*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_fss; 3537*7c478bd9Sstevel@tonic-gate } else { 3538*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_snxt; 3539*7c478bd9Sstevel@tonic-gate } 3540*7c478bd9Sstevel@tonic-gate 3541*7c478bd9Sstevel@tonic-gate /* 3542*7c478bd9Sstevel@tonic-gate * Do not allow bursty traffic during. 3543*7c478bd9Sstevel@tonic-gate * fast recovery. 
Refer to Fall and Floyd's 3544*7c478bd9Sstevel@tonic-gate * paper "Simulation-based Comparisons of 3545*7c478bd9Sstevel@tonic-gate * Tahoe, Reno and SACK TCP" (in CCR ??) 3546*7c478bd9Sstevel@tonic-gate * This is a best current practise. 3547*7c478bd9Sstevel@tonic-gate */ 3548*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_SS; 3549*7c478bd9Sstevel@tonic-gate 3550*7c478bd9Sstevel@tonic-gate /* 3551*7c478bd9Sstevel@tonic-gate * For SACK: 3552*7c478bd9Sstevel@tonic-gate * Calculate tcp_pipe, which is the 3553*7c478bd9Sstevel@tonic-gate * estimated number of bytes in 3554*7c478bd9Sstevel@tonic-gate * network. 3555*7c478bd9Sstevel@tonic-gate * 3556*7c478bd9Sstevel@tonic-gate * tcp_fack is the highest sack'ed seq num 3557*7c478bd9Sstevel@tonic-gate * TCP has received. 3558*7c478bd9Sstevel@tonic-gate * 3559*7c478bd9Sstevel@tonic-gate * tcp_pipe is explained in the above quoted 3560*7c478bd9Sstevel@tonic-gate * Fall and Floyd's paper. tcp_fack is 3561*7c478bd9Sstevel@tonic-gate * explained in Mathis and Mahdavi's 3562*7c478bd9Sstevel@tonic-gate * "Forward Acknowledgment: Refining TCP 3563*7c478bd9Sstevel@tonic-gate * Congestion Control" in SIGCOMM '96. 3564*7c478bd9Sstevel@tonic-gate */ 3565*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 3566*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_sack_info != NULL); 3567*7c478bd9Sstevel@tonic-gate if (tcp->tcp_notsack_list != NULL) { 3568*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe = tcp->tcp_snxt - 3569*7c478bd9Sstevel@tonic-gate tcp->tcp_fack; 3570*7c478bd9Sstevel@tonic-gate tcp->tcp_sack_snxt = seg_ack; 3571*7c478bd9Sstevel@tonic-gate flags |= TH_NEED_SACK_REXMIT; 3572*7c478bd9Sstevel@tonic-gate } else { 3573*7c478bd9Sstevel@tonic-gate /* 3574*7c478bd9Sstevel@tonic-gate * Always initialize tcp_pipe 3575*7c478bd9Sstevel@tonic-gate * even though we don't have 3576*7c478bd9Sstevel@tonic-gate * any SACK info. 
If later 3577*7c478bd9Sstevel@tonic-gate * we get SACK info and 3578*7c478bd9Sstevel@tonic-gate * tcp_pipe is not initialized, 3579*7c478bd9Sstevel@tonic-gate * funny things will happen. 3580*7c478bd9Sstevel@tonic-gate */ 3581*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe = 3582*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh; 3583*7c478bd9Sstevel@tonic-gate } 3584*7c478bd9Sstevel@tonic-gate } else { 3585*7c478bd9Sstevel@tonic-gate flags |= TH_REXMIT_NEEDED; 3586*7c478bd9Sstevel@tonic-gate } /* tcp_snd_sack_ok */ 3587*7c478bd9Sstevel@tonic-gate 3588*7c478bd9Sstevel@tonic-gate } else { 3589*7c478bd9Sstevel@tonic-gate /* 3590*7c478bd9Sstevel@tonic-gate * Here we perform congestion 3591*7c478bd9Sstevel@tonic-gate * avoidance, but NOT slow start. 3592*7c478bd9Sstevel@tonic-gate * This is known as the Fast 3593*7c478bd9Sstevel@tonic-gate * Recovery Algorithm. 3594*7c478bd9Sstevel@tonic-gate */ 3595*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && 3596*7c478bd9Sstevel@tonic-gate tcp->tcp_notsack_list != NULL) { 3597*7c478bd9Sstevel@tonic-gate flags |= TH_NEED_SACK_REXMIT; 3598*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe -= mss; 3599*7c478bd9Sstevel@tonic-gate if (tcp->tcp_pipe < 0) 3600*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe = 0; 3601*7c478bd9Sstevel@tonic-gate } else { 3602*7c478bd9Sstevel@tonic-gate /* 3603*7c478bd9Sstevel@tonic-gate * We know that one more packet has 3604*7c478bd9Sstevel@tonic-gate * left the pipe thus we can update 3605*7c478bd9Sstevel@tonic-gate * cwnd. 
3606*7c478bd9Sstevel@tonic-gate */ 3607*7c478bd9Sstevel@tonic-gate cwnd = tcp->tcp_cwnd + mss; 3608*7c478bd9Sstevel@tonic-gate if (cwnd > tcp->tcp_cwnd_max) 3609*7c478bd9Sstevel@tonic-gate cwnd = tcp->tcp_cwnd_max; 3610*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = cwnd; 3611*7c478bd9Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3612*7c478bd9Sstevel@tonic-gate } 3613*7c478bd9Sstevel@tonic-gate } 3614*7c478bd9Sstevel@tonic-gate } 3615*7c478bd9Sstevel@tonic-gate } else if (tcp->tcp_zero_win_probe) { 3616*7c478bd9Sstevel@tonic-gate /* 3617*7c478bd9Sstevel@tonic-gate * If the window has opened, need to arrange 3618*7c478bd9Sstevel@tonic-gate * to send additional data. 3619*7c478bd9Sstevel@tonic-gate */ 3620*7c478bd9Sstevel@tonic-gate if (new_swnd != 0) { 3621*7c478bd9Sstevel@tonic-gate /* tcp_suna != tcp_snxt */ 3622*7c478bd9Sstevel@tonic-gate /* Packet contains a window update */ 3623*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInWinUpdate); 3624*7c478bd9Sstevel@tonic-gate tcp->tcp_zero_win_probe = 0; 3625*7c478bd9Sstevel@tonic-gate tcp->tcp_timer_backoff = 0; 3626*7c478bd9Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 3627*7c478bd9Sstevel@tonic-gate 3628*7c478bd9Sstevel@tonic-gate /* 3629*7c478bd9Sstevel@tonic-gate * Transmit starting with tcp_suna since 3630*7c478bd9Sstevel@tonic-gate * the one byte probe is not ack'ed. 3631*7c478bd9Sstevel@tonic-gate * If TCP has sent more than one identical 3632*7c478bd9Sstevel@tonic-gate * probe, tcp_rexmit will be set. That means 3633*7c478bd9Sstevel@tonic-gate * tcp_ss_rexmit() will send out the one 3634*7c478bd9Sstevel@tonic-gate * byte along with new data. Otherwise, 3635*7c478bd9Sstevel@tonic-gate * fake the retransmission. 
3636*7c478bd9Sstevel@tonic-gate */ 3637*7c478bd9Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3638*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_rexmit) { 3639*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit = B_TRUE; 3640*7c478bd9Sstevel@tonic-gate tcp->tcp_dupack_cnt = 0; 3641*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_suna; 3642*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 3643*7c478bd9Sstevel@tonic-gate } 3644*7c478bd9Sstevel@tonic-gate } 3645*7c478bd9Sstevel@tonic-gate } 3646*7c478bd9Sstevel@tonic-gate goto swnd_update; 3647*7c478bd9Sstevel@tonic-gate } 3648*7c478bd9Sstevel@tonic-gate 3649*7c478bd9Sstevel@tonic-gate /* 3650*7c478bd9Sstevel@tonic-gate * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 3651*7c478bd9Sstevel@tonic-gate * If the ACK value acks something that we have not yet sent, it might 3652*7c478bd9Sstevel@tonic-gate * be an old duplicate segment. Send an ACK to re-synchronize the 3653*7c478bd9Sstevel@tonic-gate * other side. 3654*7c478bd9Sstevel@tonic-gate * Note: reset in response to unacceptable ACK in SYN_RECEIVE 3655*7c478bd9Sstevel@tonic-gate * state is handled above, so we can always just drop the segment and 3656*7c478bd9Sstevel@tonic-gate * send an ACK here. 3657*7c478bd9Sstevel@tonic-gate * 3658*7c478bd9Sstevel@tonic-gate * Should we send ACKs in response to ACK only segments? 3659*7c478bd9Sstevel@tonic-gate */ 3660*7c478bd9Sstevel@tonic-gate if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3661*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInAckUnsent); 3662*7c478bd9Sstevel@tonic-gate /* drop the received segment */ 3663*7c478bd9Sstevel@tonic-gate freemsg(mp); 3664*7c478bd9Sstevel@tonic-gate 3665*7c478bd9Sstevel@tonic-gate /* Send back an ACK. 
*/ 3666*7c478bd9Sstevel@tonic-gate mp = tcp_ack_mp(tcp); 3667*7c478bd9Sstevel@tonic-gate 3668*7c478bd9Sstevel@tonic-gate if (mp == NULL) { 3669*7c478bd9Sstevel@tonic-gate return; 3670*7c478bd9Sstevel@tonic-gate } 3671*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutAck); 3672*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 3673*7c478bd9Sstevel@tonic-gate freeb(mp); 3674*7c478bd9Sstevel@tonic-gate return; 3675*7c478bd9Sstevel@tonic-gate } 3676*7c478bd9Sstevel@tonic-gate 3677*7c478bd9Sstevel@tonic-gate /* 3678*7c478bd9Sstevel@tonic-gate * TCP gets a new ACK, update the notsack'ed list to delete those 3679*7c478bd9Sstevel@tonic-gate * blocks that are covered by this ACK. 3680*7c478bd9Sstevel@tonic-gate */ 3681*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 3682*7c478bd9Sstevel@tonic-gate tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 3683*7c478bd9Sstevel@tonic-gate &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 3684*7c478bd9Sstevel@tonic-gate } 3685*7c478bd9Sstevel@tonic-gate 3686*7c478bd9Sstevel@tonic-gate /* 3687*7c478bd9Sstevel@tonic-gate * If we got an ACK after fast retransmit, check to see 3688*7c478bd9Sstevel@tonic-gate * if it is a partial ACK. If it is not and the congestion 3689*7c478bd9Sstevel@tonic-gate * window was inflated to account for the other side's 3690*7c478bd9Sstevel@tonic-gate * cached packets, retract it. If it is, do Hoe's algorithm. 3691*7c478bd9Sstevel@tonic-gate */ 3692*7c478bd9Sstevel@tonic-gate if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { 3693*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_rexmit == B_FALSE); 3694*7c478bd9Sstevel@tonic-gate if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 3695*7c478bd9Sstevel@tonic-gate tcp->tcp_dupack_cnt = 0; 3696*7c478bd9Sstevel@tonic-gate /* 3697*7c478bd9Sstevel@tonic-gate * Restore the orig tcp_cwnd_ssthresh after 3698*7c478bd9Sstevel@tonic-gate * fast retransmit phase. 
3699*7c478bd9Sstevel@tonic-gate */ 3700*7c478bd9Sstevel@tonic-gate if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 3701*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 3702*7c478bd9Sstevel@tonic-gate } 3703*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_max = seg_ack; 3704*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_cnt = 0; 3705*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3706*7c478bd9Sstevel@tonic-gate 3707*7c478bd9Sstevel@tonic-gate /* 3708*7c478bd9Sstevel@tonic-gate * Remove all notsack info to avoid confusion with 3709*7c478bd9Sstevel@tonic-gate * the next fast retransmit/recovery phase. 3710*7c478bd9Sstevel@tonic-gate */ 3711*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && 3712*7c478bd9Sstevel@tonic-gate tcp->tcp_notsack_list != NULL) { 3713*7c478bd9Sstevel@tonic-gate TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 3714*7c478bd9Sstevel@tonic-gate } 3715*7c478bd9Sstevel@tonic-gate } else { 3716*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && 3717*7c478bd9Sstevel@tonic-gate tcp->tcp_notsack_list != NULL) { 3718*7c478bd9Sstevel@tonic-gate flags |= TH_NEED_SACK_REXMIT; 3719*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe -= mss; 3720*7c478bd9Sstevel@tonic-gate if (tcp->tcp_pipe < 0) 3721*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe = 0; 3722*7c478bd9Sstevel@tonic-gate } else { 3723*7c478bd9Sstevel@tonic-gate /* 3724*7c478bd9Sstevel@tonic-gate * Hoe's algorithm: 3725*7c478bd9Sstevel@tonic-gate * 3726*7c478bd9Sstevel@tonic-gate * Retransmit the unack'ed segment and 3727*7c478bd9Sstevel@tonic-gate * restart fast recovery. Note that we 3728*7c478bd9Sstevel@tonic-gate * need to scale back tcp_cwnd to the 3729*7c478bd9Sstevel@tonic-gate * original value when we started fast 3730*7c478bd9Sstevel@tonic-gate * recovery. This is to prevent overly 3731*7c478bd9Sstevel@tonic-gate * aggressive behaviour in sending new 3732*7c478bd9Sstevel@tonic-gate * segments. 
3733*7c478bd9Sstevel@tonic-gate */ 3734*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 3735*7c478bd9Sstevel@tonic-gate tcp_dupack_fast_retransmit * mss; 3736*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 3737*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3738*7c478bd9Sstevel@tonic-gate flags |= TH_REXMIT_NEEDED; 3739*7c478bd9Sstevel@tonic-gate } 3740*7c478bd9Sstevel@tonic-gate } 3741*7c478bd9Sstevel@tonic-gate } else { 3742*7c478bd9Sstevel@tonic-gate tcp->tcp_dupack_cnt = 0; 3743*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rexmit) { 3744*7c478bd9Sstevel@tonic-gate /* 3745*7c478bd9Sstevel@tonic-gate * TCP is retransmitting. If the ACK ack's all 3746*7c478bd9Sstevel@tonic-gate * outstanding data, update tcp_rexmit_max and 3747*7c478bd9Sstevel@tonic-gate * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 3748*7c478bd9Sstevel@tonic-gate * to the correct value. 3749*7c478bd9Sstevel@tonic-gate * 3750*7c478bd9Sstevel@tonic-gate * Note that SEQ_LEQ() is used. This is to avoid 3751*7c478bd9Sstevel@tonic-gate * unnecessary fast retransmit caused by dup ACKs 3752*7c478bd9Sstevel@tonic-gate * received when TCP does slow start retransmission 3753*7c478bd9Sstevel@tonic-gate * after a time out. During this phase, TCP may 3754*7c478bd9Sstevel@tonic-gate * send out segments which are already received. 3755*7c478bd9Sstevel@tonic-gate * This causes dup ACKs to be sent back. 
3756*7c478bd9Sstevel@tonic-gate */ 3757*7c478bd9Sstevel@tonic-gate if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 3758*7c478bd9Sstevel@tonic-gate if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 3759*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = seg_ack; 3760*7c478bd9Sstevel@tonic-gate } 3761*7c478bd9Sstevel@tonic-gate if (seg_ack != tcp->tcp_rexmit_max) { 3762*7c478bd9Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3763*7c478bd9Sstevel@tonic-gate } 3764*7c478bd9Sstevel@tonic-gate } else { 3765*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit = B_FALSE; 3766*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3767*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3768*7c478bd9Sstevel@tonic-gate } 3769*7c478bd9Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 3770*7c478bd9Sstevel@tonic-gate } 3771*7c478bd9Sstevel@tonic-gate } 3772*7c478bd9Sstevel@tonic-gate 3773*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInAckSegs); 3774*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInAckBytes, bytes_acked); 3775*7c478bd9Sstevel@tonic-gate tcp->tcp_suna = seg_ack; 3776*7c478bd9Sstevel@tonic-gate if (tcp->tcp_zero_win_probe != 0) { 3777*7c478bd9Sstevel@tonic-gate tcp->tcp_zero_win_probe = 0; 3778*7c478bd9Sstevel@tonic-gate tcp->tcp_timer_backoff = 0; 3779*7c478bd9Sstevel@tonic-gate } 3780*7c478bd9Sstevel@tonic-gate 3781*7c478bd9Sstevel@tonic-gate /* 3782*7c478bd9Sstevel@tonic-gate * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 3783*7c478bd9Sstevel@tonic-gate * Note that it cannot be the SYN being ack'ed. The code flow 3784*7c478bd9Sstevel@tonic-gate * will not reach here. 3785*7c478bd9Sstevel@tonic-gate */ 3786*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) { 3787*7c478bd9Sstevel@tonic-gate goto fin_acked; 3788*7c478bd9Sstevel@tonic-gate } 3789*7c478bd9Sstevel@tonic-gate 3790*7c478bd9Sstevel@tonic-gate /* 3791*7c478bd9Sstevel@tonic-gate * Update the congestion window. 
3792*7c478bd9Sstevel@tonic-gate * 3793*7c478bd9Sstevel@tonic-gate * If TCP is not ECN capable or TCP is ECN capable but the 3794*7c478bd9Sstevel@tonic-gate * congestion experience bit is not set, increase the tcp_cwnd as 3795*7c478bd9Sstevel@tonic-gate * usual. 3796*7c478bd9Sstevel@tonic-gate */ 3797*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 3798*7c478bd9Sstevel@tonic-gate cwnd = tcp->tcp_cwnd; 3799*7c478bd9Sstevel@tonic-gate add = mss; 3800*7c478bd9Sstevel@tonic-gate 3801*7c478bd9Sstevel@tonic-gate if (cwnd >= tcp->tcp_cwnd_ssthresh) { 3802*7c478bd9Sstevel@tonic-gate /* 3803*7c478bd9Sstevel@tonic-gate * This is to prevent an increase of less than 1 MSS of 3804*7c478bd9Sstevel@tonic-gate * tcp_cwnd. With partial increase, tcp_wput_data() 3805*7c478bd9Sstevel@tonic-gate * may send out tinygrams in order to preserve mblk 3806*7c478bd9Sstevel@tonic-gate * boundaries. 3807*7c478bd9Sstevel@tonic-gate * 3808*7c478bd9Sstevel@tonic-gate * By initializing tcp_cwnd_cnt to new tcp_cwnd and 3809*7c478bd9Sstevel@tonic-gate * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 3810*7c478bd9Sstevel@tonic-gate * increased by 1 MSS for every RTTs. 3811*7c478bd9Sstevel@tonic-gate */ 3812*7c478bd9Sstevel@tonic-gate if (tcp->tcp_cwnd_cnt <= 0) { 3813*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_cnt = cwnd + add; 3814*7c478bd9Sstevel@tonic-gate } else { 3815*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_cnt -= add; 3816*7c478bd9Sstevel@tonic-gate add = 0; 3817*7c478bd9Sstevel@tonic-gate } 3818*7c478bd9Sstevel@tonic-gate } 3819*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 3820*7c478bd9Sstevel@tonic-gate } 3821*7c478bd9Sstevel@tonic-gate 3822*7c478bd9Sstevel@tonic-gate /* Can we update the RTT estimates? */ 3823*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 3824*7c478bd9Sstevel@tonic-gate /* Ignore zero timestamp echo-reply. 
*/ 3825*7c478bd9Sstevel@tonic-gate if (tcpopt.tcp_opt_ts_ecr != 0) { 3826*7c478bd9Sstevel@tonic-gate tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3827*7c478bd9Sstevel@tonic-gate tcpopt.tcp_opt_ts_ecr)); 3828*7c478bd9Sstevel@tonic-gate } 3829*7c478bd9Sstevel@tonic-gate 3830*7c478bd9Sstevel@tonic-gate /* If needed, restart the timer. */ 3831*7c478bd9Sstevel@tonic-gate if (tcp->tcp_set_timer == 1) { 3832*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3833*7c478bd9Sstevel@tonic-gate tcp->tcp_set_timer = 0; 3834*7c478bd9Sstevel@tonic-gate } 3835*7c478bd9Sstevel@tonic-gate /* 3836*7c478bd9Sstevel@tonic-gate * Update tcp_csuna in case the other side stops sending 3837*7c478bd9Sstevel@tonic-gate * us timestamps. 3838*7c478bd9Sstevel@tonic-gate */ 3839*7c478bd9Sstevel@tonic-gate tcp->tcp_csuna = tcp->tcp_snxt; 3840*7c478bd9Sstevel@tonic-gate } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 3841*7c478bd9Sstevel@tonic-gate /* 3842*7c478bd9Sstevel@tonic-gate * An ACK sequence we haven't seen before, so get the RTT 3843*7c478bd9Sstevel@tonic-gate * and update the RTO. 3844*7c478bd9Sstevel@tonic-gate */ 3845*7c478bd9Sstevel@tonic-gate tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3846*7c478bd9Sstevel@tonic-gate (uint32_t)mp1->b_prev)); 3847*7c478bd9Sstevel@tonic-gate 3848*7c478bd9Sstevel@tonic-gate /* Remember the last sequence to be ACKed */ 3849*7c478bd9Sstevel@tonic-gate tcp->tcp_csuna = seg_ack; 3850*7c478bd9Sstevel@tonic-gate if (tcp->tcp_set_timer == 1) { 3851*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3852*7c478bd9Sstevel@tonic-gate tcp->tcp_set_timer = 0; 3853*7c478bd9Sstevel@tonic-gate } 3854*7c478bd9Sstevel@tonic-gate } else { 3855*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpRttNoUpdate); 3856*7c478bd9Sstevel@tonic-gate } 3857*7c478bd9Sstevel@tonic-gate 3858*7c478bd9Sstevel@tonic-gate /* Eat acknowledged bytes off the xmit queue. 
*/ 3859*7c478bd9Sstevel@tonic-gate for (;;) { 3860*7c478bd9Sstevel@tonic-gate mblk_t *mp2; 3861*7c478bd9Sstevel@tonic-gate uchar_t *wptr; 3862*7c478bd9Sstevel@tonic-gate 3863*7c478bd9Sstevel@tonic-gate wptr = mp1->b_wptr; 3864*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 3865*7c478bd9Sstevel@tonic-gate bytes_acked -= (int)(wptr - mp1->b_rptr); 3866*7c478bd9Sstevel@tonic-gate if (bytes_acked < 0) { 3867*7c478bd9Sstevel@tonic-gate mp1->b_rptr = wptr + bytes_acked; 3868*7c478bd9Sstevel@tonic-gate break; 3869*7c478bd9Sstevel@tonic-gate } 3870*7c478bd9Sstevel@tonic-gate mp1->b_prev = NULL; 3871*7c478bd9Sstevel@tonic-gate mp2 = mp1; 3872*7c478bd9Sstevel@tonic-gate mp1 = mp1->b_cont; 3873*7c478bd9Sstevel@tonic-gate freeb(mp2); 3874*7c478bd9Sstevel@tonic-gate if (bytes_acked == 0) { 3875*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) { 3876*7c478bd9Sstevel@tonic-gate /* Everything is ack'ed, clear the tail. */ 3877*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail = NULL; 3878*7c478bd9Sstevel@tonic-gate goto pre_swnd_update; 3879*7c478bd9Sstevel@tonic-gate } 3880*7c478bd9Sstevel@tonic-gate if (mp2 != tcp->tcp_xmit_tail) 3881*7c478bd9Sstevel@tonic-gate break; 3882*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail = mp1; 3883*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - 3884*7c478bd9Sstevel@tonic-gate mp1->b_rptr) <= (uintptr_t)INT_MAX); 3885*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 3886*7c478bd9Sstevel@tonic-gate mp1->b_rptr); 3887*7c478bd9Sstevel@tonic-gate break; 3888*7c478bd9Sstevel@tonic-gate } 3889*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) { 3890*7c478bd9Sstevel@tonic-gate /* 3891*7c478bd9Sstevel@tonic-gate * More was acked but there is nothing more 3892*7c478bd9Sstevel@tonic-gate * outstanding. This means that the FIN was 3893*7c478bd9Sstevel@tonic-gate * just acked or that we're talking to a clown. 
3894*7c478bd9Sstevel@tonic-gate */ 3895*7c478bd9Sstevel@tonic-gate fin_acked: 3896*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_fin_sent); 3897*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail = NULL; 3898*7c478bd9Sstevel@tonic-gate if (tcp->tcp_fin_sent) { 3899*7c478bd9Sstevel@tonic-gate tcp->tcp_fin_acked = B_TRUE; 3900*7c478bd9Sstevel@tonic-gate } else { 3901*7c478bd9Sstevel@tonic-gate /* 3902*7c478bd9Sstevel@tonic-gate * We should never get here because 3903*7c478bd9Sstevel@tonic-gate * we have already checked that the 3904*7c478bd9Sstevel@tonic-gate * number of bytes ack'ed should be 3905*7c478bd9Sstevel@tonic-gate * smaller than or equal to what we 3906*7c478bd9Sstevel@tonic-gate * have sent so far (it is the 3907*7c478bd9Sstevel@tonic-gate * acceptability check of the ACK). 3908*7c478bd9Sstevel@tonic-gate * We can only get here if the send 3909*7c478bd9Sstevel@tonic-gate * queue is corrupted. 3910*7c478bd9Sstevel@tonic-gate * 3911*7c478bd9Sstevel@tonic-gate * Terminate the connection and 3912*7c478bd9Sstevel@tonic-gate * panic the system. It is better 3913*7c478bd9Sstevel@tonic-gate * for us to panic instead of 3914*7c478bd9Sstevel@tonic-gate * continuing to avoid other disaster. 3915*7c478bd9Sstevel@tonic-gate */ 3916*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 3917*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt, TH_RST|TH_ACK, 0, sock_id); 3918*7c478bd9Sstevel@tonic-gate printf("Memory corruption " 3919*7c478bd9Sstevel@tonic-gate "detected for connection %s.\n", 3920*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, 3921*7c478bd9Sstevel@tonic-gate DISP_ADDR_AND_PORT)); 3922*7c478bd9Sstevel@tonic-gate /* We should never get here... 
*/ 3923*7c478bd9Sstevel@tonic-gate prom_panic("tcp_rput_data"); 3924*7c478bd9Sstevel@tonic-gate return; 3925*7c478bd9Sstevel@tonic-gate } 3926*7c478bd9Sstevel@tonic-gate goto pre_swnd_update; 3927*7c478bd9Sstevel@tonic-gate } 3928*7c478bd9Sstevel@tonic-gate assert(mp2 != tcp->tcp_xmit_tail); 3929*7c478bd9Sstevel@tonic-gate } 3930*7c478bd9Sstevel@tonic-gate if (tcp->tcp_unsent) { 3931*7c478bd9Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3932*7c478bd9Sstevel@tonic-gate } 3933*7c478bd9Sstevel@tonic-gate pre_swnd_update: 3934*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_head = mp1; 3935*7c478bd9Sstevel@tonic-gate swnd_update: 3936*7c478bd9Sstevel@tonic-gate /* 3937*7c478bd9Sstevel@tonic-gate * The following check is different from most other implementations. 3938*7c478bd9Sstevel@tonic-gate * For bi-directional transfer, when segments are dropped, the 3939*7c478bd9Sstevel@tonic-gate * "normal" check will not accept a window update in those 3940*7c478bd9Sstevel@tonic-gate * retransmitted segments. Failing to do that, TCP may send out 3941*7c478bd9Sstevel@tonic-gate * segments which are outside receiver's window. As TCP accepts 3942*7c478bd9Sstevel@tonic-gate * the ack in those retransmitted segments, if the window update in 3943*7c478bd9Sstevel@tonic-gate * the same segment is not accepted, TCP will incorrectly calculate 3944*7c478bd9Sstevel@tonic-gate * that it can send more segments. This can create a deadlock 3945*7c478bd9Sstevel@tonic-gate * with the receiver if its window becomes zero. 3946*7c478bd9Sstevel@tonic-gate */ 3947*7c478bd9Sstevel@tonic-gate if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 3948*7c478bd9Sstevel@tonic-gate SEQ_LT(tcp->tcp_swl1, seg_seq) || 3949*7c478bd9Sstevel@tonic-gate (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 3950*7c478bd9Sstevel@tonic-gate /* 3951*7c478bd9Sstevel@tonic-gate * The criteria for update is: 3952*7c478bd9Sstevel@tonic-gate * 3953*7c478bd9Sstevel@tonic-gate * 1. the segment acknowledges some data. 
Or 3954*7c478bd9Sstevel@tonic-gate * 2. the segment is new, i.e. it has a higher seq num. Or 3955*7c478bd9Sstevel@tonic-gate * 3. the segment is not old and the advertised window is 3956*7c478bd9Sstevel@tonic-gate * larger than the previous advertised window. 3957*7c478bd9Sstevel@tonic-gate */ 3958*7c478bd9Sstevel@tonic-gate if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 3959*7c478bd9Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 3960*7c478bd9Sstevel@tonic-gate tcp->tcp_swnd = new_swnd; 3961*7c478bd9Sstevel@tonic-gate if (new_swnd > tcp->tcp_max_swnd) 3962*7c478bd9Sstevel@tonic-gate tcp->tcp_max_swnd = new_swnd; 3963*7c478bd9Sstevel@tonic-gate tcp->tcp_swl1 = seg_seq; 3964*7c478bd9Sstevel@tonic-gate tcp->tcp_swl2 = seg_ack; 3965*7c478bd9Sstevel@tonic-gate } 3966*7c478bd9Sstevel@tonic-gate est: 3967*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state > TCPS_ESTABLISHED) { 3968*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 3969*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_1: 3970*7c478bd9Sstevel@tonic-gate if (tcp->tcp_fin_acked) { 3971*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_FIN_WAIT_2; 3972*7c478bd9Sstevel@tonic-gate /* 3973*7c478bd9Sstevel@tonic-gate * We implement the non-standard BSD/SunOS 3974*7c478bd9Sstevel@tonic-gate * FIN_WAIT_2 flushing algorithm. 3975*7c478bd9Sstevel@tonic-gate * If there is no user attached to this 3976*7c478bd9Sstevel@tonic-gate * TCP endpoint, then this TCP struct 3977*7c478bd9Sstevel@tonic-gate * could hang around forever in FIN_WAIT_2 3978*7c478bd9Sstevel@tonic-gate * state if the peer forgets to send us 3979*7c478bd9Sstevel@tonic-gate * a FIN. To prevent this, we wait only 3980*7c478bd9Sstevel@tonic-gate * 2*MSL (a convenient time value) for 3981*7c478bd9Sstevel@tonic-gate * the FIN to arrive. If it doesn't show up, 3982*7c478bd9Sstevel@tonic-gate * we flush the TCP endpoint. 
This algorithm, 3983*7c478bd9Sstevel@tonic-gate * though a violation of RFC-793, has worked 3984*7c478bd9Sstevel@tonic-gate * for over 10 years in BSD systems. 3985*7c478bd9Sstevel@tonic-gate * Note: SunOS 4.x waits 675 seconds before 3986*7c478bd9Sstevel@tonic-gate * flushing the FIN_WAIT_2 connection. 3987*7c478bd9Sstevel@tonic-gate */ 3988*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, 3989*7c478bd9Sstevel@tonic-gate tcp_fin_wait_2_flush_interval); 3990*7c478bd9Sstevel@tonic-gate } 3991*7c478bd9Sstevel@tonic-gate break; 3992*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_2: 3993*7c478bd9Sstevel@tonic-gate break; /* Shutdown hook? */ 3994*7c478bd9Sstevel@tonic-gate case TCPS_LAST_ACK: 3995*7c478bd9Sstevel@tonic-gate freemsg(mp); 3996*7c478bd9Sstevel@tonic-gate if (tcp->tcp_fin_acked) { 3997*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, 0); 3998*7c478bd9Sstevel@tonic-gate return; 3999*7c478bd9Sstevel@tonic-gate } 4000*7c478bd9Sstevel@tonic-gate goto xmit_check; 4001*7c478bd9Sstevel@tonic-gate case TCPS_CLOSING: 4002*7c478bd9Sstevel@tonic-gate if (tcp->tcp_fin_acked) { 4003*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_TIME_WAIT; 4004*7c478bd9Sstevel@tonic-gate tcp_time_wait_append(tcp); 4005*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 4006*7c478bd9Sstevel@tonic-gate } 4007*7c478bd9Sstevel@tonic-gate /*FALLTHRU*/ 4008*7c478bd9Sstevel@tonic-gate case TCPS_CLOSE_WAIT: 4009*7c478bd9Sstevel@tonic-gate freemsg(mp); 4010*7c478bd9Sstevel@tonic-gate goto xmit_check; 4011*7c478bd9Sstevel@tonic-gate default: 4012*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_state != TCPS_TIME_WAIT); 4013*7c478bd9Sstevel@tonic-gate break; 4014*7c478bd9Sstevel@tonic-gate } 4015*7c478bd9Sstevel@tonic-gate } 4016*7c478bd9Sstevel@tonic-gate if (flags & TH_FIN) { 4017*7c478bd9Sstevel@tonic-gate /* Make sure we ack the fin */ 4018*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 4019*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_fin_rcvd) { 
4020*7c478bd9Sstevel@tonic-gate tcp->tcp_fin_rcvd = B_TRUE; 4021*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt++; 4022*7c478bd9Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 4023*7c478bd9Sstevel@tonic-gate 4024*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 4025*7c478bd9Sstevel@tonic-gate case TCPS_SYN_RCVD: 4026*7c478bd9Sstevel@tonic-gate case TCPS_ESTABLISHED: 4027*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_CLOSE_WAIT; 4028*7c478bd9Sstevel@tonic-gate /* Keepalive? */ 4029*7c478bd9Sstevel@tonic-gate break; 4030*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_1: 4031*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_fin_acked) { 4032*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_CLOSING; 4033*7c478bd9Sstevel@tonic-gate break; 4034*7c478bd9Sstevel@tonic-gate } 4035*7c478bd9Sstevel@tonic-gate /* FALLTHRU */ 4036*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_2: 4037*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_TIME_WAIT; 4038*7c478bd9Sstevel@tonic-gate tcp_time_wait_append(tcp); 4039*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 4040*7c478bd9Sstevel@tonic-gate if (seg_len) { 4041*7c478bd9Sstevel@tonic-gate /* 4042*7c478bd9Sstevel@tonic-gate * implies data piggybacked on FIN. 4043*7c478bd9Sstevel@tonic-gate * break to handle data. 
4044*7c478bd9Sstevel@tonic-gate */ 4045*7c478bd9Sstevel@tonic-gate break; 4046*7c478bd9Sstevel@tonic-gate } 4047*7c478bd9Sstevel@tonic-gate freemsg(mp); 4048*7c478bd9Sstevel@tonic-gate goto ack_check; 4049*7c478bd9Sstevel@tonic-gate } 4050*7c478bd9Sstevel@tonic-gate } 4051*7c478bd9Sstevel@tonic-gate } 4052*7c478bd9Sstevel@tonic-gate if (mp == NULL) 4053*7c478bd9Sstevel@tonic-gate goto xmit_check; 4054*7c478bd9Sstevel@tonic-gate if (seg_len == 0) { 4055*7c478bd9Sstevel@tonic-gate freemsg(mp); 4056*7c478bd9Sstevel@tonic-gate goto xmit_check; 4057*7c478bd9Sstevel@tonic-gate } 4058*7c478bd9Sstevel@tonic-gate if (mp->b_rptr == mp->b_wptr) { 4059*7c478bd9Sstevel@tonic-gate /* 4060*7c478bd9Sstevel@tonic-gate * The header has been consumed, so we remove the 4061*7c478bd9Sstevel@tonic-gate * zero-length mblk here. 4062*7c478bd9Sstevel@tonic-gate */ 4063*7c478bd9Sstevel@tonic-gate mp1 = mp; 4064*7c478bd9Sstevel@tonic-gate mp = mp->b_cont; 4065*7c478bd9Sstevel@tonic-gate freeb(mp1); 4066*7c478bd9Sstevel@tonic-gate } 4067*7c478bd9Sstevel@tonic-gate /* 4068*7c478bd9Sstevel@tonic-gate * ACK every other segments, unless the input queue is empty 4069*7c478bd9Sstevel@tonic-gate * as we don't have a timer available. 
4070*7c478bd9Sstevel@tonic-gate */ 4071*7c478bd9Sstevel@tonic-gate if (++tcp->tcp_rack_cnt == 2 || sockets[sock_id].inq == NULL) { 4072*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 4073*7c478bd9Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 4074*7c478bd9Sstevel@tonic-gate } 4075*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt += seg_len; 4076*7c478bd9Sstevel@tonic-gate U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 4077*7c478bd9Sstevel@tonic-gate 4078*7c478bd9Sstevel@tonic-gate /* Update SACK list */ 4079*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4080*7c478bd9Sstevel@tonic-gate tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 4081*7c478bd9Sstevel@tonic-gate &(tcp->tcp_num_sack_blk)); 4082*7c478bd9Sstevel@tonic-gate } 4083*7c478bd9Sstevel@tonic-gate 4084*7c478bd9Sstevel@tonic-gate if (tcp->tcp_listener) { 4085*7c478bd9Sstevel@tonic-gate /* 4086*7c478bd9Sstevel@tonic-gate * Side queue inbound data until the accept happens. 4087*7c478bd9Sstevel@tonic-gate * tcp_accept/tcp_rput drains this when the accept happens. 4088*7c478bd9Sstevel@tonic-gate */ 4089*7c478bd9Sstevel@tonic-gate tcp_rcv_enqueue(tcp, mp, seg_len); 4090*7c478bd9Sstevel@tonic-gate } else { 4091*7c478bd9Sstevel@tonic-gate /* Just queue the data until the app calls read. */ 4092*7c478bd9Sstevel@tonic-gate tcp_rcv_enqueue(tcp, mp, seg_len); 4093*7c478bd9Sstevel@tonic-gate /* 4094*7c478bd9Sstevel@tonic-gate * Make sure the timer is running if we have data waiting 4095*7c478bd9Sstevel@tonic-gate * for a push bit. This provides resiliency against 4096*7c478bd9Sstevel@tonic-gate * implementations that do not correctly generate push bits. 4097*7c478bd9Sstevel@tonic-gate */ 4098*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) 4099*7c478bd9Sstevel@tonic-gate flags |= TH_TIMER_NEEDED; 4100*7c478bd9Sstevel@tonic-gate } 4101*7c478bd9Sstevel@tonic-gate 4102*7c478bd9Sstevel@tonic-gate xmit_check: 4103*7c478bd9Sstevel@tonic-gate /* Is there anything left to do? 
*/ 4104*7c478bd9Sstevel@tonic-gate if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 4105*7c478bd9Sstevel@tonic-gate TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_TIMER_NEEDED)) == 0) 4106*7c478bd9Sstevel@tonic-gate return; 4107*7c478bd9Sstevel@tonic-gate 4108*7c478bd9Sstevel@tonic-gate /* Any transmit work to do and a non-zero window? */ 4109*7c478bd9Sstevel@tonic-gate if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 4110*7c478bd9Sstevel@tonic-gate TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 4111*7c478bd9Sstevel@tonic-gate if (flags & TH_REXMIT_NEEDED) { 4112*7c478bd9Sstevel@tonic-gate uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 4113*7c478bd9Sstevel@tonic-gate 4114*7c478bd9Sstevel@tonic-gate if (snd_size > mss) 4115*7c478bd9Sstevel@tonic-gate snd_size = mss; 4116*7c478bd9Sstevel@tonic-gate if (snd_size > tcp->tcp_swnd) 4117*7c478bd9Sstevel@tonic-gate snd_size = tcp->tcp_swnd; 4118*7c478bd9Sstevel@tonic-gate mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 4119*7c478bd9Sstevel@tonic-gate NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 4120*7c478bd9Sstevel@tonic-gate B_TRUE); 4121*7c478bd9Sstevel@tonic-gate 4122*7c478bd9Sstevel@tonic-gate if (mp1 != NULL) { 4123*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_head->b_prev = 4124*7c478bd9Sstevel@tonic-gate (mblk_t *)prom_gettime(); 4125*7c478bd9Sstevel@tonic-gate tcp->tcp_csuna = tcp->tcp_snxt; 4126*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpRetransSegs); 4127*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpRetransBytes, snd_size); 4128*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp1); 4129*7c478bd9Sstevel@tonic-gate freeb(mp1); 4130*7c478bd9Sstevel@tonic-gate } 4131*7c478bd9Sstevel@tonic-gate } 4132*7c478bd9Sstevel@tonic-gate if (flags & TH_NEED_SACK_REXMIT) { 4133*7c478bd9Sstevel@tonic-gate if (tcp_sack_rxmit(tcp, sock_id) != 0) { 4134*7c478bd9Sstevel@tonic-gate flags |= TH_XMIT_NEEDED; 4135*7c478bd9Sstevel@tonic-gate } 4136*7c478bd9Sstevel@tonic-gate } 
4137*7c478bd9Sstevel@tonic-gate /* 4138*7c478bd9Sstevel@tonic-gate * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 4139*7c478bd9Sstevel@tonic-gate * out new segment. Note that tcp_rexmit should not be 4140*7c478bd9Sstevel@tonic-gate * set, otherwise TH_LIMIT_XMIT should not be set. 4141*7c478bd9Sstevel@tonic-gate */ 4142*7c478bd9Sstevel@tonic-gate if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 4143*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_rexmit) { 4144*7c478bd9Sstevel@tonic-gate tcp_wput_data(tcp, NULL, sock_id); 4145*7c478bd9Sstevel@tonic-gate } else { 4146*7c478bd9Sstevel@tonic-gate tcp_ss_rexmit(tcp, sock_id); 4147*7c478bd9Sstevel@tonic-gate } 4148*7c478bd9Sstevel@tonic-gate /* 4149*7c478bd9Sstevel@tonic-gate * The TCP could be closed in tcp_state_wait via 4150*7c478bd9Sstevel@tonic-gate * tcp_wput_data (tcp_ss_rexmit could call 4151*7c478bd9Sstevel@tonic-gate * tcp_wput_data as well). 4152*7c478bd9Sstevel@tonic-gate */ 4153*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].pcb == NULL) 4154*7c478bd9Sstevel@tonic-gate return; 4155*7c478bd9Sstevel@tonic-gate } 4156*7c478bd9Sstevel@tonic-gate /* 4157*7c478bd9Sstevel@tonic-gate * Adjust tcp_cwnd back to normal value after sending 4158*7c478bd9Sstevel@tonic-gate * new data segments. 4159*7c478bd9Sstevel@tonic-gate */ 4160*7c478bd9Sstevel@tonic-gate if (flags & TH_LIMIT_XMIT) { 4161*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 4162*7c478bd9Sstevel@tonic-gate } 4163*7c478bd9Sstevel@tonic-gate 4164*7c478bd9Sstevel@tonic-gate /* Anything more to do? */ 4165*7c478bd9Sstevel@tonic-gate if ((flags & (TH_ACK_NEEDED|TH_TIMER_NEEDED)) == 0) 4166*7c478bd9Sstevel@tonic-gate return; 4167*7c478bd9Sstevel@tonic-gate } 4168*7c478bd9Sstevel@tonic-gate ack_check: 4169*7c478bd9Sstevel@tonic-gate if (flags & TH_ACK_NEEDED) { 4170*7c478bd9Sstevel@tonic-gate /* 4171*7c478bd9Sstevel@tonic-gate * Time to send an ack for some reason. 
		 */
		if ((mp1 = tcp_ack_mp(tcp)) != NULL) {
			TCP_DUMP_PACKET("tcp_rput_data: ack mp", mp1);
			(void) ipv4_tcp_output(sock_id, mp1);
			BUMP_MIB(tcp_mib.tcpOutAck);
			freeb(mp1);
		}
	}
}

/*
 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
 * retransmission after a timeout.
 *
 * To limit the number of duplicate segments, we limit the number of segment
 * to be sent in one time to tcp_snd_burst, the burst variable.
 *
 * tcp	   - the endpoint whose [tcp_rexmit_nxt, tcp_rexmit_max) range is
 *	     being retransmitted during the special slow-start phase.
 * sock_id - socket index, passed through to ipv4_tcp_output() and
 *	     tcp_wput_data().
 */
static void
tcp_ss_rexmit(tcp_t *tcp, int sock_id)
{
	uint32_t	snxt;
	uint32_t	smax;
	int32_t		win;
	int32_t		mss;
	int32_t		off;		/* byte offset into snxt_mp; presumably set by tcp_get_seg_mp() — confirm */
	int32_t		burst = tcp->tcp_snd_burst;
	mblk_t		*snxt_mp;

	/*
	 * Note that tcp_rexmit can be set even though TCP has retransmitted
	 * all unack'ed segments.
	 */
	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
		smax = tcp->tcp_rexmit_max;
		snxt = tcp->tcp_rexmit_nxt;
		if (SEQ_LT(snxt, tcp->tcp_suna)) {
			snxt = tcp->tcp_suna;
		}
		/*
		 * Usable window is the smaller of cwnd and swnd, minus the
		 * amount already in flight past tcp_suna.
		 */
		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
		win -= snxt - tcp->tcp_suna;
		mss = tcp->tcp_mss;
		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);

		/*
		 * Resend up to one mss-sized chunk per iteration until we
		 * run out of sequence space, window, burst allowance, or
		 * buffered data.
		 */
		while (SEQ_LT(snxt, smax) && (win > 0) &&
		    (burst > 0) && (snxt_mp != NULL)) {
			mblk_t	*xmit_mp;
			mblk_t	*old_snxt_mp = snxt_mp;
			uint32_t cnt = mss;

			if (win < cnt) {
				cnt = win;
			}
			/* Clamp to the end of the retransmit range. */
			if (SEQ_GT(snxt + cnt, smax)) {
				cnt = smax - snxt;
			}
			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);

			if (xmit_mp == NULL)
				return;

			(void) ipv4_tcp_output(sock_id, xmit_mp);
			freeb(xmit_mp);

			snxt += cnt;
			win -= cnt;
			/*
			 * Update the send timestamp to avoid false
			 * retransmission.  Note the b_prev field of the
			 * transmit-queue mblk is (ab)used to store the
			 * prom_gettime() timestamp of the last send.
			 */
			old_snxt_mp->b_prev = (mblk_t *)prom_gettime();
			BUMP_MIB(tcp_mib.tcpRetransSegs);
			UPDATE_MIB(tcp_mib.tcpRetransBytes, cnt);

			tcp->tcp_rexmit_nxt = snxt;
			burst--;
		}
		/*
		 * If we have transmitted all we have at the time
		 * we started the retransmission, we can leave
		 * the rest of the job to tcp_wput_data().  But we
		 * need to check the send window first.  If the
		 * win is not 0, go on with tcp_wput_data().
		 */
		if (SEQ_LT(snxt, smax) || win == 0) {
			return;
		}
	}
	/* Only call tcp_wput_data() if there is data to be sent. */
	if (tcp->tcp_unsent) {
		tcp_wput_data(tcp, NULL, sock_id);
	}
}

/*
 * tcp_timer is the timer service routine. It handles all timer events for
 * a tcp instance except keepalives. It figures out from the state of the
 * tcp instance what kind of action needs to be done at the time it is called.
 */
static void
tcp_timer(tcp_t *tcp, int sock_id)
{
	mblk_t		*mp;
	uint32_t	first_threshold;
	uint32_t	second_threshold;
	uint32_t	ms;		/* new backed-off timeout, in ms */
	uint32_t	mss;		/* bytes to retransmit this round */

	first_threshold = tcp->tcp_first_timer_threshold;
	second_threshold = tcp->tcp_second_timer_threshold;
	/*
	 * First, decide from the connection state whether anything needs
	 * retransmitting at all, and pick the appropriate thresholds
	 * (connection-establishment states use the *_ctimer thresholds).
	 */
	switch (tcp->tcp_state) {
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		return;
	case TCPS_SYN_RCVD:
	case TCPS_SYN_SENT:
		first_threshold = tcp->tcp_first_ctimer_threshold;
		second_threshold = tcp->tcp_second_ctimer_threshold;
		break;
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_CLOSE_WAIT:
	case TCPS_LAST_ACK:
		/* If we have data to rexmit */
		if (tcp->tcp_suna != tcp->tcp_snxt) {
			int32_t time_to_wait;

			BUMP_MIB(tcp_mib.tcpTimRetrans);
			if (tcp->tcp_xmit_head == NULL)
				break;
			/*
			 * b_prev of the head mblk holds the timestamp of
			 * its last transmission (stored via prom_gettime()).
			 */
			time_to_wait = (int32_t)(prom_gettime() -
			    (uint32_t)tcp->tcp_xmit_head->b_prev);
			time_to_wait = tcp->tcp_rto - time_to_wait;
			if (time_to_wait > 0) {
				/*
				 * Timer fired too early, so restart it.
				 */
				TCP_TIMER_RESTART(tcp, time_to_wait);
				return;
			}
			/*
			 * When we probe zero windows, we force the swnd open.
			 * If our peer acks with a closed window swnd will be
			 * set to zero by tcp_rput(). As long as we are
			 * receiving acks tcp_rput will
			 * reset 'tcp_ms_we_have_waited' so as not to trip the
			 * first and second interval actions.  NOTE: the timer
			 * interval is allowed to continue its exponential
			 * backoff.
			 */
			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
				DEBUG_1("tcp_timer (%d): zero win", sock_id);
				break;
			} else {
				/*
				 * After retransmission, we need to do
				 * slow start.  Set the ssthresh to one
				 * half of current effective window and
				 * cwnd to one MSS.  Also reset
				 * tcp_cwnd_cnt.
				 *
				 * Note that if tcp_ssthresh is reduced because
				 * of ECN, do not reduce it again unless it is
				 * already one window of data away (tcp_cwr
				 * should then be cleared) or this is a
				 * timeout for a retransmitted segment.
				 */
				uint32_t npkt;

				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
					npkt = (MIN((tcp->tcp_timer_backoff ?
					    tcp->tcp_cwnd_ssthresh :
					    tcp->tcp_cwnd),
					    tcp->tcp_swnd) >> 1) /
					    tcp->tcp_mss;
					/* ssthresh is never below 2 MSS */
					if (npkt < 2)
						npkt = 2;
					tcp->tcp_cwnd_ssthresh = npkt *
					    tcp->tcp_mss;
				}
				tcp->tcp_cwnd = tcp->tcp_mss;
				tcp->tcp_cwnd_cnt = 0;
				if (tcp->tcp_ecn_ok) {
					tcp->tcp_cwr = B_TRUE;
					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
					tcp->tcp_ecn_cwr_sent = B_FALSE;
				}
			}
			break;
		}
		/*
		 * We have something to send yet we cannot send.  The
		 * reason can be:
		 *
		 * 1. Zero send window: we need to do zero window probe.
		 * 2. Zero cwnd: because of ECN, we need to "clock out"
		 * segments.
		 * 3. SWS avoidance: receiver may have shrunk window,
		 * reset our knowledge.
		 *
		 * Note that condition 2 can happen with either 1 or
		 * 3.  But 1 and 3 are exclusive.
		 */
		if (tcp->tcp_unsent != 0) {
			if (tcp->tcp_cwnd == 0) {
				/*
				 * Set tcp_cwnd to 1 MSS so that a
				 * new segment can be sent out.  We
				 * are "clocking out" new data when
				 * the network is really congested.
				 */
				assert(tcp->tcp_ecn_ok);
				tcp->tcp_cwnd = tcp->tcp_mss;
			}
			if (tcp->tcp_swnd == 0) {
				/* Extend window for zero window probe */
				tcp->tcp_swnd++;
				tcp->tcp_zero_win_probe = B_TRUE;
				BUMP_MIB(tcp_mib.tcpOutWinProbe);
			} else {
				/*
				 * Handle timeout from sender SWS avoidance.
				 * Reset our knowledge of the max send window
				 * since the receiver might have reduced its
				 * receive buffer.  Avoid setting tcp_max_swnd
				 * to one since that will essentially disable
				 * the SWS checks.
				 *
				 * Note that since we don't have a SWS
				 * state variable, if the timeout is set
				 * for ECN but not for SWS, this
				 * code will also be executed.  This is
				 * fine as tcp_max_swnd is updated
				 * constantly and it will not affect
				 * anything.
				 */
				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
			}
			tcp_wput_data(tcp, NULL, sock_id);
			return;
		}
		/* Is there a FIN that needs to be retransmitted? */
		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
		    !tcp->tcp_fin_acked)
			break;
		/* Nothing to do, return without restarting timer. */
		return;
	case TCPS_FIN_WAIT_2:
		/*
		 * User closed the TCP endpoint and peer ACK'ed our FIN.
		 * We waited some time for the peer's FIN, but it hasn't
		 * arrived.  We flush the connection now to avoid
		 * case where the peer has rebooted.
		 */
		/* FALLTHRU */
	case TCPS_TIME_WAIT:
		(void) tcp_clean_death(sock_id, tcp, 0);
		return;
	default:
		DEBUG_3("tcp_timer (%d): strange state (%d) %s", sock_id,
		    tcp->tcp_state, tcp_display(tcp, NULL,
		    DISP_PORT_ONLY));
		return;
	}
	/*
	 * Second, check how long we have already been waiting and either
	 * give up on the connection (second threshold), reinitialize our
	 * RTT estimate (first threshold), or simply keep backing off.
	 */
	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
		/*
		 * For zero window probe, we need to send indefinitely,
		 * unless we have not heard from the other side for some
		 * time...
		 */
		if ((tcp->tcp_zero_win_probe == 0) ||
		    ((prom_gettime() - tcp->tcp_last_recv_time) >
		    second_threshold)) {
			BUMP_MIB(tcp_mib.tcpTimRetransDrop);
			/*
			 * If TCP is in SYN_RCVD state, send back a
			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
			 * should be zero in TCPS_SYN_RCVD state.
			 */
			if (tcp->tcp_state == TCPS_SYN_RCVD) {
				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
				    "in SYN_RCVD",
				    tcp, NULL, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_RST | TH_ACK, 0, sock_id);
			}
			(void) tcp_clean_death(sock_id, tcp,
			    tcp->tcp_client_errno ?
			    tcp->tcp_client_errno : ETIMEDOUT);
			return;
		} else {
			/*
			 * Set tcp_ms_we_have_waited to second_threshold
			 * so that in next timeout, we will do the above
			 * check (lbolt - tcp_last_recv_time).  This is
			 * also to avoid overflow.
			 *
			 * We don't need to decrement tcp_timer_backoff
			 * to avoid overflow because it will be decremented
			 * later if new timeout value is greater than
			 * tcp_rexmit_interval_max.  In the case when
			 * tcp_rexmit_interval_max is greater than
			 * second_threshold, it means that we will wait
			 * longer than second_threshold to send the next
			 * window probe.
			 */
			tcp->tcp_ms_we_have_waited = second_threshold;
		}
	} else if (ms > first_threshold && tcp->tcp_rtt_sa != 0) {
		/*
		 * We have been retransmitting for too long...  The RTT
		 * we calculated is probably incorrect.  Reinitialize it.
		 * Need to compensate for 0 tcp_rtt_sa.  Reset
		 * tcp_rtt_update so that we won't accidentally cache a
		 * bad value.  But only do this if this is not a zero
		 * window probe.
		 */
		if (tcp->tcp_zero_win_probe == 0) {
			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
			    (tcp->tcp_rtt_sa >> 5);
			tcp->tcp_rtt_sa = 0;
			tcp->tcp_rtt_update = 0;
		}
	}
	/*
	 * Compute the new RTO with exponential backoff.  The base value
	 * is srtt + 4*rttvar (sa is stored scaled by 8, sd by 4 — the
	 * shifts below presumably reflect that scaling; confirm against
	 * tcp_rtt update code elsewhere in this file).
	 */
	tcp->tcp_timer_backoff++;
	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
	    tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
	    tcp_rexmit_interval_min) {
		/*
		 * This means the original RTO is tcp_rexmit_interval_min.
		 * So we will use tcp_rexmit_interval_min as the RTO value
		 * and do the backoff.
		 */
		ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff;
	} else {
		ms <<= tcp->tcp_timer_backoff;
	}
	if (ms > tcp_rexmit_interval_max) {
		ms = tcp_rexmit_interval_max;
		/*
		 * ms is at max, decrement tcp_timer_backoff to avoid
		 * overflow.
		 */
		tcp->tcp_timer_backoff--;
	}
	tcp->tcp_ms_we_have_waited += ms;
	if (tcp->tcp_zero_win_probe == 0) {
		tcp->tcp_rto = ms;
	}
	TCP_TIMER_RESTART(tcp, ms);
	/*
	 * This is after a timeout and tcp_rto is backed off.  Set
	 * tcp_set_timer to 1 so that next time RTO is updated, we will
	 * restart the timer with a correct value.
	 */
	tcp->tcp_set_timer = 1;
	/*
	 * Finally, retransmit one segment: at most one MSS, clamped to
	 * the send window when the window is non-zero.
	 */
	mss = tcp->tcp_snxt - tcp->tcp_suna;
	if (mss > tcp->tcp_mss)
		mss = tcp->tcp_mss;
	if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
		mss = tcp->tcp_swnd;

	if ((mp = tcp->tcp_xmit_head) != NULL)
		mp->b_prev = (mblk_t *)prom_gettime();
	mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
	    B_TRUE);
	if (mp == NULL)
		return;
	tcp->tcp_csuna = tcp->tcp_snxt;
	BUMP_MIB(tcp_mib.tcpRetransSegs);
	UPDATE_MIB(tcp_mib.tcpRetransBytes, mss);
	/* Dump the packet when debugging. */
	TCP_DUMP_PACKET("tcp_timer", mp);

	(void) ipv4_tcp_output(sock_id, mp);
	freeb(mp);

	/*
	 * When slow start after retransmission begins, start with
	 * this seq no.  tcp_rexmit_max marks the end of special slow
	 * start phase.  tcp_snd_burst controls how many segments
	 * can be sent because of an ack.
	 */
	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
	tcp->tcp_snd_burst = TCP_CWND_SS;
	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
	    (tcp->tcp_unsent == 0)) {
		tcp->tcp_rexmit_max = tcp->tcp_fss;
	} else {
		tcp->tcp_rexmit_max = tcp->tcp_snxt;
	}
	tcp->tcp_rexmit = B_TRUE;
	tcp->tcp_dupack_cnt = 0;

	/*
	 * Remove all rexmit SACK blk to start from fresh.
	 */
	if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
		TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
		tcp->tcp_num_notsack_blk = 0;
		tcp->tcp_cnt_notsack_list = 0;
	}
}

/*
 * The TCP normal data output path.
4579*7c478bd9Sstevel@tonic-gate * NOTE: the logic of the fast path is duplicated from this function. 4580*7c478bd9Sstevel@tonic-gate */ 4581*7c478bd9Sstevel@tonic-gate static void 4582*7c478bd9Sstevel@tonic-gate tcp_wput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 4583*7c478bd9Sstevel@tonic-gate { 4584*7c478bd9Sstevel@tonic-gate int len; 4585*7c478bd9Sstevel@tonic-gate mblk_t *local_time; 4586*7c478bd9Sstevel@tonic-gate mblk_t *mp1; 4587*7c478bd9Sstevel@tonic-gate uchar_t *rptr; 4588*7c478bd9Sstevel@tonic-gate uint32_t snxt; 4589*7c478bd9Sstevel@tonic-gate int tail_unsent; 4590*7c478bd9Sstevel@tonic-gate int tcpstate; 4591*7c478bd9Sstevel@tonic-gate int usable = 0; 4592*7c478bd9Sstevel@tonic-gate mblk_t *xmit_tail; 4593*7c478bd9Sstevel@tonic-gate int32_t num_burst_seg; 4594*7c478bd9Sstevel@tonic-gate int32_t mss; 4595*7c478bd9Sstevel@tonic-gate int32_t num_sack_blk = 0; 4596*7c478bd9Sstevel@tonic-gate int32_t tcp_hdr_len; 4597*7c478bd9Sstevel@tonic-gate ipaddr_t *dst; 4598*7c478bd9Sstevel@tonic-gate ipaddr_t *src; 4599*7c478bd9Sstevel@tonic-gate 4600*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 4601*7c478bd9Sstevel@tonic-gate printf("tcp_wput_data(%d) ##############################\n", sock_id); 4602*7c478bd9Sstevel@tonic-gate #endif 4603*7c478bd9Sstevel@tonic-gate tcpstate = tcp->tcp_state; 4604*7c478bd9Sstevel@tonic-gate if (mp == NULL) { 4605*7c478bd9Sstevel@tonic-gate /* Really tacky... but we need this for detached closes. */ 4606*7c478bd9Sstevel@tonic-gate len = tcp->tcp_unsent; 4607*7c478bd9Sstevel@tonic-gate goto data_null; 4608*7c478bd9Sstevel@tonic-gate } 4609*7c478bd9Sstevel@tonic-gate 4610*7c478bd9Sstevel@tonic-gate /* 4611*7c478bd9Sstevel@tonic-gate * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 4612*7c478bd9Sstevel@tonic-gate * or before a connection attempt has begun. 4613*7c478bd9Sstevel@tonic-gate * 4614*7c478bd9Sstevel@tonic-gate * The following should not happen in inetboot.... 
4615*7c478bd9Sstevel@tonic-gate */ 4616*7c478bd9Sstevel@tonic-gate if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 4617*7c478bd9Sstevel@tonic-gate (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4618*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4619*7c478bd9Sstevel@tonic-gate printf("tcp_wput_data: data after ordrel, %s\n", 4620*7c478bd9Sstevel@tonic-gate tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 4621*7c478bd9Sstevel@tonic-gate } 4622*7c478bd9Sstevel@tonic-gate freemsg(mp); 4623*7c478bd9Sstevel@tonic-gate return; 4624*7c478bd9Sstevel@tonic-gate } 4625*7c478bd9Sstevel@tonic-gate 4626*7c478bd9Sstevel@tonic-gate /* Strip empties */ 4627*7c478bd9Sstevel@tonic-gate for (;;) { 4628*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 4629*7c478bd9Sstevel@tonic-gate (uintptr_t)INT_MAX); 4630*7c478bd9Sstevel@tonic-gate len = (int)(mp->b_wptr - mp->b_rptr); 4631*7c478bd9Sstevel@tonic-gate if (len > 0) 4632*7c478bd9Sstevel@tonic-gate break; 4633*7c478bd9Sstevel@tonic-gate mp1 = mp; 4634*7c478bd9Sstevel@tonic-gate mp = mp->b_cont; 4635*7c478bd9Sstevel@tonic-gate freeb(mp1); 4636*7c478bd9Sstevel@tonic-gate if (mp == NULL) { 4637*7c478bd9Sstevel@tonic-gate return; 4638*7c478bd9Sstevel@tonic-gate } 4639*7c478bd9Sstevel@tonic-gate } 4640*7c478bd9Sstevel@tonic-gate 4641*7c478bd9Sstevel@tonic-gate /* If we are the first on the list ... 
*/ 4642*7c478bd9Sstevel@tonic-gate if (tcp->tcp_xmit_head == NULL) { 4643*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_head = mp; 4644*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail = mp; 4645*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail_unsent = len; 4646*7c478bd9Sstevel@tonic-gate } else { 4647*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_last->b_cont = mp; 4648*7c478bd9Sstevel@tonic-gate len += tcp->tcp_unsent; 4649*7c478bd9Sstevel@tonic-gate } 4650*7c478bd9Sstevel@tonic-gate 4651*7c478bd9Sstevel@tonic-gate /* Tack on however many more positive length mblks we have */ 4652*7c478bd9Sstevel@tonic-gate if ((mp1 = mp->b_cont) != NULL) { 4653*7c478bd9Sstevel@tonic-gate do { 4654*7c478bd9Sstevel@tonic-gate int tlen; 4655*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - 4656*7c478bd9Sstevel@tonic-gate mp1->b_rptr) <= (uintptr_t)INT_MAX); 4657*7c478bd9Sstevel@tonic-gate tlen = (int)(mp1->b_wptr - mp1->b_rptr); 4658*7c478bd9Sstevel@tonic-gate if (tlen <= 0) { 4659*7c478bd9Sstevel@tonic-gate mp->b_cont = mp1->b_cont; 4660*7c478bd9Sstevel@tonic-gate freeb(mp1); 4661*7c478bd9Sstevel@tonic-gate } else { 4662*7c478bd9Sstevel@tonic-gate len += tlen; 4663*7c478bd9Sstevel@tonic-gate mp = mp1; 4664*7c478bd9Sstevel@tonic-gate } 4665*7c478bd9Sstevel@tonic-gate } while ((mp1 = mp->b_cont) != NULL); 4666*7c478bd9Sstevel@tonic-gate } 4667*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_last = mp; 4668*7c478bd9Sstevel@tonic-gate tcp->tcp_unsent = len; 4669*7c478bd9Sstevel@tonic-gate 4670*7c478bd9Sstevel@tonic-gate data_null: 4671*7c478bd9Sstevel@tonic-gate snxt = tcp->tcp_snxt; 4672*7c478bd9Sstevel@tonic-gate xmit_tail = tcp->tcp_xmit_tail; 4673*7c478bd9Sstevel@tonic-gate tail_unsent = tcp->tcp_xmit_tail_unsent; 4674*7c478bd9Sstevel@tonic-gate 4675*7c478bd9Sstevel@tonic-gate /* 4676*7c478bd9Sstevel@tonic-gate * Note that tcp_mss has been adjusted to take into account the 4677*7c478bd9Sstevel@tonic-gate * timestamp option if applicable. 
Because SACK options do not 4678*7c478bd9Sstevel@tonic-gate * appear in every TCP segments and they are of variable lengths, 4679*7c478bd9Sstevel@tonic-gate * they cannot be included in tcp_mss. Thus we need to calculate 4680*7c478bd9Sstevel@tonic-gate * the actual segment length when we need to send a segment which 4681*7c478bd9Sstevel@tonic-gate * includes SACK options. 4682*7c478bd9Sstevel@tonic-gate */ 4683*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4684*7c478bd9Sstevel@tonic-gate int32_t opt_len; 4685*7c478bd9Sstevel@tonic-gate 4686*7c478bd9Sstevel@tonic-gate num_sack_blk = MIN(tcp->tcp_max_sack_blk, 4687*7c478bd9Sstevel@tonic-gate tcp->tcp_num_sack_blk); 4688*7c478bd9Sstevel@tonic-gate opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 4689*7c478bd9Sstevel@tonic-gate 2 + TCPOPT_HEADER_LEN; 4690*7c478bd9Sstevel@tonic-gate mss = tcp->tcp_mss - opt_len; 4691*7c478bd9Sstevel@tonic-gate tcp_hdr_len = tcp->tcp_hdr_len + opt_len; 4692*7c478bd9Sstevel@tonic-gate } else { 4693*7c478bd9Sstevel@tonic-gate mss = tcp->tcp_mss; 4694*7c478bd9Sstevel@tonic-gate tcp_hdr_len = tcp->tcp_hdr_len; 4695*7c478bd9Sstevel@tonic-gate } 4696*7c478bd9Sstevel@tonic-gate 4697*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_suna == snxt) && 4698*7c478bd9Sstevel@tonic-gate (prom_gettime() - tcp->tcp_last_recv_time) >= tcp->tcp_rto) { 4699*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd = MIN(tcp_slow_start_after_idle * mss, 4700*7c478bd9Sstevel@tonic-gate MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 4701*7c478bd9Sstevel@tonic-gate } 4702*7c478bd9Sstevel@tonic-gate if (tcpstate == TCPS_SYN_RCVD) { 4703*7c478bd9Sstevel@tonic-gate /* 4704*7c478bd9Sstevel@tonic-gate * The three-way connection establishment handshake is not 4705*7c478bd9Sstevel@tonic-gate * complete yet. We want to queue the data for transmission 4706*7c478bd9Sstevel@tonic-gate * after entering ESTABLISHED state (RFC793). 
Setting usable to 4707*7c478bd9Sstevel@tonic-gate * zero cause a jump to "done" label effectively leaving data 4708*7c478bd9Sstevel@tonic-gate * on the queue. 4709*7c478bd9Sstevel@tonic-gate */ 4710*7c478bd9Sstevel@tonic-gate 4711*7c478bd9Sstevel@tonic-gate usable = 0; 4712*7c478bd9Sstevel@tonic-gate } else { 4713*7c478bd9Sstevel@tonic-gate int usable_r = tcp->tcp_swnd; 4714*7c478bd9Sstevel@tonic-gate 4715*7c478bd9Sstevel@tonic-gate /* 4716*7c478bd9Sstevel@tonic-gate * In the special case when cwnd is zero, which can only 4717*7c478bd9Sstevel@tonic-gate * happen if the connection is ECN capable, return now. 4718*7c478bd9Sstevel@tonic-gate * New segments is sent using tcp_timer(). The timer 4719*7c478bd9Sstevel@tonic-gate * is set in tcp_rput_data(). 4720*7c478bd9Sstevel@tonic-gate */ 4721*7c478bd9Sstevel@tonic-gate if (tcp->tcp_cwnd == 0) { 4722*7c478bd9Sstevel@tonic-gate /* 4723*7c478bd9Sstevel@tonic-gate * Note that tcp_cwnd is 0 before 3-way handshake is 4724*7c478bd9Sstevel@tonic-gate * finished. 4725*7c478bd9Sstevel@tonic-gate */ 4726*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_ecn_ok || 4727*7c478bd9Sstevel@tonic-gate tcp->tcp_state < TCPS_ESTABLISHED); 4728*7c478bd9Sstevel@tonic-gate return; 4729*7c478bd9Sstevel@tonic-gate } 4730*7c478bd9Sstevel@tonic-gate 4731*7c478bd9Sstevel@tonic-gate /* usable = MIN(swnd, cwnd) - unacked_bytes */ 4732*7c478bd9Sstevel@tonic-gate if (usable_r > tcp->tcp_cwnd) 4733*7c478bd9Sstevel@tonic-gate usable_r = tcp->tcp_cwnd; 4734*7c478bd9Sstevel@tonic-gate 4735*7c478bd9Sstevel@tonic-gate /* NOTE: trouble if xmitting while SYN not acked? 
*/ 4736*7c478bd9Sstevel@tonic-gate usable_r -= snxt; 4737*7c478bd9Sstevel@tonic-gate usable_r += tcp->tcp_suna; 4738*7c478bd9Sstevel@tonic-gate 4739*7c478bd9Sstevel@tonic-gate /* usable = MIN(usable, unsent) */ 4740*7c478bd9Sstevel@tonic-gate if (usable_r > len) 4741*7c478bd9Sstevel@tonic-gate usable_r = len; 4742*7c478bd9Sstevel@tonic-gate 4743*7c478bd9Sstevel@tonic-gate /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 4744*7c478bd9Sstevel@tonic-gate if (usable_r != 0) 4745*7c478bd9Sstevel@tonic-gate usable = usable_r; 4746*7c478bd9Sstevel@tonic-gate } 4747*7c478bd9Sstevel@tonic-gate 4748*7c478bd9Sstevel@tonic-gate local_time = (mblk_t *)prom_gettime(); 4749*7c478bd9Sstevel@tonic-gate 4750*7c478bd9Sstevel@tonic-gate /* 4751*7c478bd9Sstevel@tonic-gate * "Our" Nagle Algorithm. This is not the same as in the old 4752*7c478bd9Sstevel@tonic-gate * BSD. This is more in line with the true intent of Nagle. 4753*7c478bd9Sstevel@tonic-gate * 4754*7c478bd9Sstevel@tonic-gate * The conditions are: 4755*7c478bd9Sstevel@tonic-gate * 1. The amount of unsent data (or amount of data which can be 4756*7c478bd9Sstevel@tonic-gate * sent, whichever is smaller) is less than Nagle limit. 4757*7c478bd9Sstevel@tonic-gate * 2. The last sent size is also less than Nagle limit. 4758*7c478bd9Sstevel@tonic-gate * 3. There is unack'ed data. 4759*7c478bd9Sstevel@tonic-gate * 4. Urgent pointer is not set. Send urgent data ignoring the 4760*7c478bd9Sstevel@tonic-gate * Nagle algorithm. This reduces the probability that urgent 4761*7c478bd9Sstevel@tonic-gate * bytes get "merged" together. 4762*7c478bd9Sstevel@tonic-gate * 5. The app has not closed the connection. This eliminates the 4763*7c478bd9Sstevel@tonic-gate * wait time of the receiving side waiting for the last piece of 4764*7c478bd9Sstevel@tonic-gate * (small) data. 4765*7c478bd9Sstevel@tonic-gate * 4766*7c478bd9Sstevel@tonic-gate * If all are satisified, exit without sending anything. 
Note 4767*7c478bd9Sstevel@tonic-gate * that Nagle limit can be smaller than 1 MSS. Nagle limit is 4768*7c478bd9Sstevel@tonic-gate * the smaller of 1 MSS and global tcp_naglim_def (default to be 4769*7c478bd9Sstevel@tonic-gate * 4095). 4770*7c478bd9Sstevel@tonic-gate */ 4771*7c478bd9Sstevel@tonic-gate if (usable < (int)tcp->tcp_naglim && 4772*7c478bd9Sstevel@tonic-gate tcp->tcp_naglim > tcp->tcp_last_sent_len && 4773*7c478bd9Sstevel@tonic-gate snxt != tcp->tcp_suna && 4774*7c478bd9Sstevel@tonic-gate !(tcp->tcp_valid_bits & TCP_URG_VALID)) 4775*7c478bd9Sstevel@tonic-gate goto done; 4776*7c478bd9Sstevel@tonic-gate 4777*7c478bd9Sstevel@tonic-gate num_burst_seg = tcp->tcp_snd_burst; 4778*7c478bd9Sstevel@tonic-gate for (;;) { 4779*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 4780*7c478bd9Sstevel@tonic-gate mblk_t *new_mp; 4781*7c478bd9Sstevel@tonic-gate 4782*7c478bd9Sstevel@tonic-gate if (num_burst_seg-- == 0) 4783*7c478bd9Sstevel@tonic-gate goto done; 4784*7c478bd9Sstevel@tonic-gate 4785*7c478bd9Sstevel@tonic-gate len = mss; 4786*7c478bd9Sstevel@tonic-gate if (len > usable) { 4787*7c478bd9Sstevel@tonic-gate len = usable; 4788*7c478bd9Sstevel@tonic-gate if (len <= 0) { 4789*7c478bd9Sstevel@tonic-gate /* Terminate the loop */ 4790*7c478bd9Sstevel@tonic-gate goto done; 4791*7c478bd9Sstevel@tonic-gate } 4792*7c478bd9Sstevel@tonic-gate /* 4793*7c478bd9Sstevel@tonic-gate * Sender silly-window avoidance. 4794*7c478bd9Sstevel@tonic-gate * Ignore this if we are going to send a 4795*7c478bd9Sstevel@tonic-gate * zero window probe out. 4796*7c478bd9Sstevel@tonic-gate * 4797*7c478bd9Sstevel@tonic-gate * TODO: force data into microscopic window ?? 
4798*7c478bd9Sstevel@tonic-gate * ==> (!pushed || (unsent > usable)) 4799*7c478bd9Sstevel@tonic-gate */ 4800*7c478bd9Sstevel@tonic-gate if (len < (tcp->tcp_max_swnd >> 1) && 4801*7c478bd9Sstevel@tonic-gate (tcp->tcp_unsent - (snxt - tcp->tcp_snxt)) > len && 4802*7c478bd9Sstevel@tonic-gate !((tcp->tcp_valid_bits & TCP_URG_VALID) && 4803*7c478bd9Sstevel@tonic-gate len == 1) && (! tcp->tcp_zero_win_probe)) { 4804*7c478bd9Sstevel@tonic-gate /* 4805*7c478bd9Sstevel@tonic-gate * If the retransmit timer is not running 4806*7c478bd9Sstevel@tonic-gate * we start it so that we will retransmit 4807*7c478bd9Sstevel@tonic-gate * in the case when the the receiver has 4808*7c478bd9Sstevel@tonic-gate * decremented the window. 4809*7c478bd9Sstevel@tonic-gate */ 4810*7c478bd9Sstevel@tonic-gate if (snxt == tcp->tcp_snxt && 4811*7c478bd9Sstevel@tonic-gate snxt == tcp->tcp_suna) { 4812*7c478bd9Sstevel@tonic-gate /* 4813*7c478bd9Sstevel@tonic-gate * We are not supposed to send 4814*7c478bd9Sstevel@tonic-gate * anything. So let's wait a little 4815*7c478bd9Sstevel@tonic-gate * bit longer before breaking SWS 4816*7c478bd9Sstevel@tonic-gate * avoidance. 4817*7c478bd9Sstevel@tonic-gate * 4818*7c478bd9Sstevel@tonic-gate * What should the value be? 
4819*7c478bd9Sstevel@tonic-gate * Suggestion: MAX(init rexmit time, 4820*7c478bd9Sstevel@tonic-gate * tcp->tcp_rto) 4821*7c478bd9Sstevel@tonic-gate */ 4822*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4823*7c478bd9Sstevel@tonic-gate } 4824*7c478bd9Sstevel@tonic-gate goto done; 4825*7c478bd9Sstevel@tonic-gate } 4826*7c478bd9Sstevel@tonic-gate } 4827*7c478bd9Sstevel@tonic-gate 4828*7c478bd9Sstevel@tonic-gate tcph = tcp->tcp_tcph; 4829*7c478bd9Sstevel@tonic-gate 4830*7c478bd9Sstevel@tonic-gate usable -= len; /* Approximate - can be adjusted later */ 4831*7c478bd9Sstevel@tonic-gate if (usable > 0) 4832*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] = TH_ACK; 4833*7c478bd9Sstevel@tonic-gate else 4834*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] = (TH_ACK | TH_PUSH); 4835*7c478bd9Sstevel@tonic-gate 4836*7c478bd9Sstevel@tonic-gate U32_TO_ABE32(snxt, tcph->th_seq); 4837*7c478bd9Sstevel@tonic-gate 4838*7c478bd9Sstevel@tonic-gate if (tcp->tcp_valid_bits) { 4839*7c478bd9Sstevel@tonic-gate uchar_t *prev_rptr = xmit_tail->b_rptr; 4840*7c478bd9Sstevel@tonic-gate uint32_t prev_snxt = tcp->tcp_snxt; 4841*7c478bd9Sstevel@tonic-gate 4842*7c478bd9Sstevel@tonic-gate if (tail_unsent == 0) { 4843*7c478bd9Sstevel@tonic-gate assert(xmit_tail->b_cont != NULL); 4844*7c478bd9Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 4845*7c478bd9Sstevel@tonic-gate prev_rptr = xmit_tail->b_rptr; 4846*7c478bd9Sstevel@tonic-gate tail_unsent = (int)(xmit_tail->b_wptr - 4847*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr); 4848*7c478bd9Sstevel@tonic-gate } else { 4849*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr = xmit_tail->b_wptr - 4850*7c478bd9Sstevel@tonic-gate tail_unsent; 4851*7c478bd9Sstevel@tonic-gate } 4852*7c478bd9Sstevel@tonic-gate mp = tcp_xmit_mp(tcp, xmit_tail, len, NULL, NULL, 4853*7c478bd9Sstevel@tonic-gate snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 4854*7c478bd9Sstevel@tonic-gate /* Restore tcp_snxt so we get amount sent right. 
*/ 4855*7c478bd9Sstevel@tonic-gate tcp->tcp_snxt = prev_snxt; 4856*7c478bd9Sstevel@tonic-gate if (prev_rptr == xmit_tail->b_rptr) 4857*7c478bd9Sstevel@tonic-gate xmit_tail->b_prev = local_time; 4858*7c478bd9Sstevel@tonic-gate else 4859*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr = prev_rptr; 4860*7c478bd9Sstevel@tonic-gate 4861*7c478bd9Sstevel@tonic-gate if (mp == NULL) 4862*7c478bd9Sstevel@tonic-gate break; 4863*7c478bd9Sstevel@tonic-gate 4864*7c478bd9Sstevel@tonic-gate mp1 = mp->b_cont; 4865*7c478bd9Sstevel@tonic-gate 4866*7c478bd9Sstevel@tonic-gate snxt += len; 4867*7c478bd9Sstevel@tonic-gate tcp->tcp_last_sent_len = (ushort_t)len; 4868*7c478bd9Sstevel@tonic-gate while (mp1->b_cont) { 4869*7c478bd9Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 4870*7c478bd9Sstevel@tonic-gate xmit_tail->b_prev = local_time; 4871*7c478bd9Sstevel@tonic-gate mp1 = mp1->b_cont; 4872*7c478bd9Sstevel@tonic-gate } 4873*7c478bd9Sstevel@tonic-gate tail_unsent = xmit_tail->b_wptr - mp1->b_wptr; 4874*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutDataSegs); 4875*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4876*7c478bd9Sstevel@tonic-gate /* Dump the packet when debugging. */ 4877*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_wput_data (valid bits)", mp); 4878*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 4879*7c478bd9Sstevel@tonic-gate freeb(mp); 4880*7c478bd9Sstevel@tonic-gate continue; 4881*7c478bd9Sstevel@tonic-gate } 4882*7c478bd9Sstevel@tonic-gate 4883*7c478bd9Sstevel@tonic-gate snxt += len; /* Adjust later if we don't send all of len */ 4884*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutDataSegs); 4885*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4886*7c478bd9Sstevel@tonic-gate 4887*7c478bd9Sstevel@tonic-gate if (tail_unsent) { 4888*7c478bd9Sstevel@tonic-gate /* Are the bytes above us in flight? 
*/ 4889*7c478bd9Sstevel@tonic-gate rptr = xmit_tail->b_wptr - tail_unsent; 4890*7c478bd9Sstevel@tonic-gate if (rptr != xmit_tail->b_rptr) { 4891*7c478bd9Sstevel@tonic-gate tail_unsent -= len; 4892*7c478bd9Sstevel@tonic-gate len += tcp_hdr_len; 4893*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_len = htons(len); 4894*7c478bd9Sstevel@tonic-gate mp = dupb(xmit_tail); 4895*7c478bd9Sstevel@tonic-gate if (!mp) 4896*7c478bd9Sstevel@tonic-gate break; 4897*7c478bd9Sstevel@tonic-gate mp->b_rptr = rptr; 4898*7c478bd9Sstevel@tonic-gate goto must_alloc; 4899*7c478bd9Sstevel@tonic-gate } 4900*7c478bd9Sstevel@tonic-gate } else { 4901*7c478bd9Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 4902*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(xmit_tail->b_wptr - 4903*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 4904*7c478bd9Sstevel@tonic-gate tail_unsent = (int)(xmit_tail->b_wptr - 4905*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr); 4906*7c478bd9Sstevel@tonic-gate } 4907*7c478bd9Sstevel@tonic-gate 4908*7c478bd9Sstevel@tonic-gate tail_unsent -= len; 4909*7c478bd9Sstevel@tonic-gate tcp->tcp_last_sent_len = (ushort_t)len; 4910*7c478bd9Sstevel@tonic-gate 4911*7c478bd9Sstevel@tonic-gate len += tcp_hdr_len; 4912*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) 4913*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_len = htons(len); 4914*7c478bd9Sstevel@tonic-gate 4915*7c478bd9Sstevel@tonic-gate xmit_tail->b_prev = local_time; 4916*7c478bd9Sstevel@tonic-gate 4917*7c478bd9Sstevel@tonic-gate mp = dupb(xmit_tail); 4918*7c478bd9Sstevel@tonic-gate if (mp == NULL) 4919*7c478bd9Sstevel@tonic-gate goto out_of_mem; 4920*7c478bd9Sstevel@tonic-gate 4921*7c478bd9Sstevel@tonic-gate len = tcp_hdr_len; 4922*7c478bd9Sstevel@tonic-gate /* 4923*7c478bd9Sstevel@tonic-gate * There are four reasons to allocate a new hdr mblk: 4924*7c478bd9Sstevel@tonic-gate * 1) The bytes above us are in use by another packet 4925*7c478bd9Sstevel@tonic-gate * 2) We don't have good 
alignment 4926*7c478bd9Sstevel@tonic-gate * 3) The mblk is being shared 4927*7c478bd9Sstevel@tonic-gate * 4) We don't have enough room for a header 4928*7c478bd9Sstevel@tonic-gate */ 4929*7c478bd9Sstevel@tonic-gate rptr = mp->b_rptr - len; 4930*7c478bd9Sstevel@tonic-gate if (!OK_32PTR(rptr) || 4931*7c478bd9Sstevel@tonic-gate rptr < mp->b_datap) { 4932*7c478bd9Sstevel@tonic-gate /* NOTE: we assume allocb returns an OK_32PTR */ 4933*7c478bd9Sstevel@tonic-gate 4934*7c478bd9Sstevel@tonic-gate must_alloc:; 4935*7c478bd9Sstevel@tonic-gate mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 4936*7c478bd9Sstevel@tonic-gate tcp_wroff_xtra, 0); 4937*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) { 4938*7c478bd9Sstevel@tonic-gate freemsg(mp); 4939*7c478bd9Sstevel@tonic-gate goto out_of_mem; 4940*7c478bd9Sstevel@tonic-gate } 4941*7c478bd9Sstevel@tonic-gate mp1->b_cont = mp; 4942*7c478bd9Sstevel@tonic-gate mp = mp1; 4943*7c478bd9Sstevel@tonic-gate /* Leave room for Link Level header */ 4944*7c478bd9Sstevel@tonic-gate len = tcp_hdr_len; 4945*7c478bd9Sstevel@tonic-gate rptr = &mp->b_rptr[tcp_wroff_xtra]; 4946*7c478bd9Sstevel@tonic-gate mp->b_wptr = &rptr[len]; 4947*7c478bd9Sstevel@tonic-gate } 4948*7c478bd9Sstevel@tonic-gate 4949*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 4950*7c478bd9Sstevel@tonic-gate U32_TO_BE32((uint32_t)local_time, 4951*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 4952*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tcp->tcp_ts_recent, 4953*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 4954*7c478bd9Sstevel@tonic-gate } else { 4955*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 4956*7c478bd9Sstevel@tonic-gate } 4957*7c478bd9Sstevel@tonic-gate 4958*7c478bd9Sstevel@tonic-gate mp->b_rptr = rptr; 4959*7c478bd9Sstevel@tonic-gate 4960*7c478bd9Sstevel@tonic-gate /* Copy the template header. 
*/ 4961*7c478bd9Sstevel@tonic-gate dst = (ipaddr_t *)rptr; 4962*7c478bd9Sstevel@tonic-gate src = (ipaddr_t *)tcp->tcp_iphc; 4963*7c478bd9Sstevel@tonic-gate dst[0] = src[0]; 4964*7c478bd9Sstevel@tonic-gate dst[1] = src[1]; 4965*7c478bd9Sstevel@tonic-gate dst[2] = src[2]; 4966*7c478bd9Sstevel@tonic-gate dst[3] = src[3]; 4967*7c478bd9Sstevel@tonic-gate dst[4] = src[4]; 4968*7c478bd9Sstevel@tonic-gate dst[5] = src[5]; 4969*7c478bd9Sstevel@tonic-gate dst[6] = src[6]; 4970*7c478bd9Sstevel@tonic-gate dst[7] = src[7]; 4971*7c478bd9Sstevel@tonic-gate dst[8] = src[8]; 4972*7c478bd9Sstevel@tonic-gate dst[9] = src[9]; 4973*7c478bd9Sstevel@tonic-gate len = tcp->tcp_hdr_len; 4974*7c478bd9Sstevel@tonic-gate if (len -= 40) { 4975*7c478bd9Sstevel@tonic-gate len >>= 2; 4976*7c478bd9Sstevel@tonic-gate dst += 10; 4977*7c478bd9Sstevel@tonic-gate src += 10; 4978*7c478bd9Sstevel@tonic-gate do { 4979*7c478bd9Sstevel@tonic-gate *dst++ = *src++; 4980*7c478bd9Sstevel@tonic-gate } while (--len); 4981*7c478bd9Sstevel@tonic-gate } 4982*7c478bd9Sstevel@tonic-gate 4983*7c478bd9Sstevel@tonic-gate /* 4984*7c478bd9Sstevel@tonic-gate * Set tcph to point to the header of the outgoing packet, 4985*7c478bd9Sstevel@tonic-gate * not to the template header. 4986*7c478bd9Sstevel@tonic-gate */ 4987*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 4988*7c478bd9Sstevel@tonic-gate 4989*7c478bd9Sstevel@tonic-gate /* 4990*7c478bd9Sstevel@tonic-gate * Set the ECN info in the TCP header if it is not a zero 4991*7c478bd9Sstevel@tonic-gate * window probe. Zero window probe is only sent in 4992*7c478bd9Sstevel@tonic-gate * tcp_wput_data() and tcp_timer(). 
4993*7c478bd9Sstevel@tonic-gate */ 4994*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 4995*7c478bd9Sstevel@tonic-gate SET_ECT(tcp, rptr); 4996*7c478bd9Sstevel@tonic-gate 4997*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_echo_on) 4998*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] |= TH_ECE; 4999*7c478bd9Sstevel@tonic-gate if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5000*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] |= TH_CWR; 5001*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_TRUE; 5002*7c478bd9Sstevel@tonic-gate } 5003*7c478bd9Sstevel@tonic-gate } 5004*7c478bd9Sstevel@tonic-gate 5005*7c478bd9Sstevel@tonic-gate /* Fill in SACK options */ 5006*7c478bd9Sstevel@tonic-gate if (num_sack_blk > 0) { 5007*7c478bd9Sstevel@tonic-gate uchar_t *wptr = rptr + tcp->tcp_hdr_len; 5008*7c478bd9Sstevel@tonic-gate sack_blk_t *tmp; 5009*7c478bd9Sstevel@tonic-gate int32_t i; 5010*7c478bd9Sstevel@tonic-gate 5011*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5012*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5013*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_SACK; 5014*7c478bd9Sstevel@tonic-gate wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5015*7c478bd9Sstevel@tonic-gate sizeof (sack_blk_t); 5016*7c478bd9Sstevel@tonic-gate wptr += TCPOPT_REAL_SACK_LEN; 5017*7c478bd9Sstevel@tonic-gate 5018*7c478bd9Sstevel@tonic-gate tmp = tcp->tcp_sack_list; 5019*7c478bd9Sstevel@tonic-gate for (i = 0; i < num_sack_blk; i++) { 5020*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tmp[i].begin, wptr); 5021*7c478bd9Sstevel@tonic-gate wptr += sizeof (tcp_seq); 5022*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tmp[i].end, wptr); 5023*7c478bd9Sstevel@tonic-gate wptr += sizeof (tcp_seq); 5024*7c478bd9Sstevel@tonic-gate } 5025*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5026*7c478bd9Sstevel@tonic-gate << 4); 5027*7c478bd9Sstevel@tonic-gate } 5028*7c478bd9Sstevel@tonic-gate 5029*7c478bd9Sstevel@tonic-gate if (tail_unsent) { 
5030*7c478bd9Sstevel@tonic-gate mp1 = mp->b_cont; 5031*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) 5032*7c478bd9Sstevel@tonic-gate mp1 = mp; 5033*7c478bd9Sstevel@tonic-gate /* 5034*7c478bd9Sstevel@tonic-gate * If we're a little short, tack on more mblks 5035*7c478bd9Sstevel@tonic-gate * as long as we don't need to split an mblk. 5036*7c478bd9Sstevel@tonic-gate */ 5037*7c478bd9Sstevel@tonic-gate while (tail_unsent < 0 && 5038*7c478bd9Sstevel@tonic-gate tail_unsent + (int)(xmit_tail->b_cont->b_wptr - 5039*7c478bd9Sstevel@tonic-gate xmit_tail->b_cont->b_rptr) <= 0) { 5040*7c478bd9Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 5041*7c478bd9Sstevel@tonic-gate /* Stash for rtt use later */ 5042*7c478bd9Sstevel@tonic-gate xmit_tail->b_prev = local_time; 5043*7c478bd9Sstevel@tonic-gate mp1->b_cont = dupb(xmit_tail); 5044*7c478bd9Sstevel@tonic-gate mp1 = mp1->b_cont; 5045*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(xmit_tail->b_wptr - 5046*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5047*7c478bd9Sstevel@tonic-gate tail_unsent += (int)(xmit_tail->b_wptr - 5048*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr); 5049*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) { 5050*7c478bd9Sstevel@tonic-gate freemsg(mp); 5051*7c478bd9Sstevel@tonic-gate goto out_of_mem; 5052*7c478bd9Sstevel@tonic-gate } 5053*7c478bd9Sstevel@tonic-gate } 5054*7c478bd9Sstevel@tonic-gate /* Trim back any surplus on the last mblk */ 5055*7c478bd9Sstevel@tonic-gate if (tail_unsent > 0) 5056*7c478bd9Sstevel@tonic-gate mp1->b_wptr -= tail_unsent; 5057*7c478bd9Sstevel@tonic-gate if (tail_unsent < 0) { 5058*7c478bd9Sstevel@tonic-gate uint32_t ip_len; 5059*7c478bd9Sstevel@tonic-gate 5060*7c478bd9Sstevel@tonic-gate /* 5061*7c478bd9Sstevel@tonic-gate * We did not send everything we could in 5062*7c478bd9Sstevel@tonic-gate * order to preserve mblk boundaries. 
5063*7c478bd9Sstevel@tonic-gate */ 5064*7c478bd9Sstevel@tonic-gate usable -= tail_unsent; 5065*7c478bd9Sstevel@tonic-gate snxt += tail_unsent; 5066*7c478bd9Sstevel@tonic-gate tcp->tcp_last_sent_len += tail_unsent; 5067*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpOutDataBytes, 5068*7c478bd9Sstevel@tonic-gate tail_unsent); 5069*7c478bd9Sstevel@tonic-gate /* Adjust the IP length field. */ 5070*7c478bd9Sstevel@tonic-gate ip_len = ntohs(((struct ip *)rptr)->ip_len) + 5071*7c478bd9Sstevel@tonic-gate tail_unsent; 5072*7c478bd9Sstevel@tonic-gate ((struct ip *)rptr)->ip_len = htons(ip_len); 5073*7c478bd9Sstevel@tonic-gate tail_unsent = 0; 5074*7c478bd9Sstevel@tonic-gate } 5075*7c478bd9Sstevel@tonic-gate } 5076*7c478bd9Sstevel@tonic-gate 5077*7c478bd9Sstevel@tonic-gate if (mp == NULL) 5078*7c478bd9Sstevel@tonic-gate goto out_of_mem; 5079*7c478bd9Sstevel@tonic-gate 5080*7c478bd9Sstevel@tonic-gate /* 5081*7c478bd9Sstevel@tonic-gate * Performance hit! We need to pullup the whole message 5082*7c478bd9Sstevel@tonic-gate * in order to do checksum and for the MAC output routine. 
5083*7c478bd9Sstevel@tonic-gate */ 5084*7c478bd9Sstevel@tonic-gate if (mp->b_cont != NULL) { 5085*7c478bd9Sstevel@tonic-gate int mp_size; 5086*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 5087*7c478bd9Sstevel@tonic-gate printf("Multiple mblk %d\n", msgdsize(mp)); 5088*7c478bd9Sstevel@tonic-gate #endif 5089*7c478bd9Sstevel@tonic-gate new_mp = allocb(msgdsize(mp) + tcp_wroff_xtra, 0); 5090*7c478bd9Sstevel@tonic-gate new_mp->b_rptr += tcp_wroff_xtra; 5091*7c478bd9Sstevel@tonic-gate new_mp->b_wptr = new_mp->b_rptr; 5092*7c478bd9Sstevel@tonic-gate while (mp != NULL) { 5093*7c478bd9Sstevel@tonic-gate mp_size = mp->b_wptr - mp->b_rptr; 5094*7c478bd9Sstevel@tonic-gate bcopy(mp->b_rptr, new_mp->b_wptr, mp_size); 5095*7c478bd9Sstevel@tonic-gate new_mp->b_wptr += mp_size; 5096*7c478bd9Sstevel@tonic-gate mp = mp->b_cont; 5097*7c478bd9Sstevel@tonic-gate } 5098*7c478bd9Sstevel@tonic-gate freemsg(mp); 5099*7c478bd9Sstevel@tonic-gate mp = new_mp; 5100*7c478bd9Sstevel@tonic-gate } 5101*7c478bd9Sstevel@tonic-gate tcp_set_cksum(mp); 5102*7c478bd9Sstevel@tonic-gate ((struct ip *)mp->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5103*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_wput_data", mp); 5104*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 5105*7c478bd9Sstevel@tonic-gate freemsg(mp); 5106*7c478bd9Sstevel@tonic-gate } 5107*7c478bd9Sstevel@tonic-gate out_of_mem:; 5108*7c478bd9Sstevel@tonic-gate /* Pretend that all we were trying to send really got sent */ 5109*7c478bd9Sstevel@tonic-gate if (tail_unsent < 0) { 5110*7c478bd9Sstevel@tonic-gate do { 5111*7c478bd9Sstevel@tonic-gate xmit_tail = xmit_tail->b_cont; 5112*7c478bd9Sstevel@tonic-gate xmit_tail->b_prev = local_time; 5113*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(xmit_tail->b_wptr - 5114*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5115*7c478bd9Sstevel@tonic-gate tail_unsent += (int)(xmit_tail->b_wptr - 5116*7c478bd9Sstevel@tonic-gate xmit_tail->b_rptr); 5117*7c478bd9Sstevel@tonic-gate } 
while (tail_unsent < 0); 5118*7c478bd9Sstevel@tonic-gate } 5119*7c478bd9Sstevel@tonic-gate done:; 5120*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail = xmit_tail; 5121*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_tail_unsent = tail_unsent; 5122*7c478bd9Sstevel@tonic-gate len = tcp->tcp_snxt - snxt; 5123*7c478bd9Sstevel@tonic-gate if (len) { 5124*7c478bd9Sstevel@tonic-gate /* 5125*7c478bd9Sstevel@tonic-gate * If new data was sent, need to update the notsack 5126*7c478bd9Sstevel@tonic-gate * list, which is, afterall, data blocks that have 5127*7c478bd9Sstevel@tonic-gate * not been sack'ed by the receiver. New data is 5128*7c478bd9Sstevel@tonic-gate * not sack'ed. 5129*7c478bd9Sstevel@tonic-gate */ 5130*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 5131*7c478bd9Sstevel@tonic-gate /* len is a negative value. */ 5132*7c478bd9Sstevel@tonic-gate tcp->tcp_pipe -= len; 5133*7c478bd9Sstevel@tonic-gate tcp_notsack_update(&(tcp->tcp_notsack_list), 5134*7c478bd9Sstevel@tonic-gate tcp->tcp_snxt, snxt, 5135*7c478bd9Sstevel@tonic-gate &(tcp->tcp_num_notsack_blk), 5136*7c478bd9Sstevel@tonic-gate &(tcp->tcp_cnt_notsack_list)); 5137*7c478bd9Sstevel@tonic-gate } 5138*7c478bd9Sstevel@tonic-gate tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 5139*7c478bd9Sstevel@tonic-gate tcp->tcp_rack = tcp->tcp_rnxt; 5140*7c478bd9Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 5141*7c478bd9Sstevel@tonic-gate if ((snxt + len) == tcp->tcp_suna) { 5142*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5143*7c478bd9Sstevel@tonic-gate } 5144*7c478bd9Sstevel@tonic-gate /* 5145*7c478bd9Sstevel@tonic-gate * Note that len is the amount we just sent but with a negative 5146*7c478bd9Sstevel@tonic-gate * sign. We update tcp_unsent here since we may come back to 5147*7c478bd9Sstevel@tonic-gate * tcp_wput_data from tcp_state_wait. 
5148*7c478bd9Sstevel@tonic-gate */ 5149*7c478bd9Sstevel@tonic-gate len += tcp->tcp_unsent; 5150*7c478bd9Sstevel@tonic-gate tcp->tcp_unsent = len; 5151*7c478bd9Sstevel@tonic-gate 5152*7c478bd9Sstevel@tonic-gate /* 5153*7c478bd9Sstevel@tonic-gate * Let's wait till all the segments have been acked, since we 5154*7c478bd9Sstevel@tonic-gate * don't have a timer. 5155*7c478bd9Sstevel@tonic-gate */ 5156*7c478bd9Sstevel@tonic-gate (void) tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED); 5157*7c478bd9Sstevel@tonic-gate return; 5158*7c478bd9Sstevel@tonic-gate } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 5159*7c478bd9Sstevel@tonic-gate /* 5160*7c478bd9Sstevel@tonic-gate * Didn't send anything. Make sure the timer is running 5161*7c478bd9Sstevel@tonic-gate * so that we will probe a zero window. 5162*7c478bd9Sstevel@tonic-gate */ 5163*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5164*7c478bd9Sstevel@tonic-gate } 5165*7c478bd9Sstevel@tonic-gate 5166*7c478bd9Sstevel@tonic-gate /* Note that len is the amount we just sent but with a negative sign */ 5167*7c478bd9Sstevel@tonic-gate len += tcp->tcp_unsent; 5168*7c478bd9Sstevel@tonic-gate tcp->tcp_unsent = len; 5169*7c478bd9Sstevel@tonic-gate 5170*7c478bd9Sstevel@tonic-gate } 5171*7c478bd9Sstevel@tonic-gate 5172*7c478bd9Sstevel@tonic-gate static void 5173*7c478bd9Sstevel@tonic-gate tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 5174*7c478bd9Sstevel@tonic-gate uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph, 5175*7c478bd9Sstevel@tonic-gate int sock_id) 5176*7c478bd9Sstevel@tonic-gate { 5177*7c478bd9Sstevel@tonic-gate int32_t bytes_acked; 5178*7c478bd9Sstevel@tonic-gate int32_t gap; 5179*7c478bd9Sstevel@tonic-gate int32_t rgap; 5180*7c478bd9Sstevel@tonic-gate tcp_opt_t tcpopt; 5181*7c478bd9Sstevel@tonic-gate uint_t flags; 5182*7c478bd9Sstevel@tonic-gate uint32_t new_swnd = 0; 5183*7c478bd9Sstevel@tonic-gate 5184*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 5185*7c478bd9Sstevel@tonic-gate 
printf("Time wait processing called ###############3\n"); 5186*7c478bd9Sstevel@tonic-gate #endif 5187*7c478bd9Sstevel@tonic-gate 5188*7c478bd9Sstevel@tonic-gate /* Just make sure we send the right sock_id to tcp_clean_death */ 5189*7c478bd9Sstevel@tonic-gate if ((sockets[sock_id].pcb == NULL) || (sockets[sock_id].pcb != tcp)) 5190*7c478bd9Sstevel@tonic-gate sock_id = -1; 5191*7c478bd9Sstevel@tonic-gate 5192*7c478bd9Sstevel@tonic-gate flags = (unsigned int)tcph->th_flags[0] & 0xFF; 5193*7c478bd9Sstevel@tonic-gate new_swnd = BE16_TO_U16(tcph->th_win) << 5194*7c478bd9Sstevel@tonic-gate ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 5195*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5196*7c478bd9Sstevel@tonic-gate if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 5197*7c478bd9Sstevel@tonic-gate freemsg(mp); 5198*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5199*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt, TH_ACK, 0, -1); 5200*7c478bd9Sstevel@tonic-gate return; 5201*7c478bd9Sstevel@tonic-gate } 5202*7c478bd9Sstevel@tonic-gate } 5203*7c478bd9Sstevel@tonic-gate gap = seg_seq - tcp->tcp_rnxt; 5204*7c478bd9Sstevel@tonic-gate rgap = tcp->tcp_rwnd - (gap + seg_len); 5205*7c478bd9Sstevel@tonic-gate if (gap < 0) { 5206*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataDupSegs); 5207*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 5208*7c478bd9Sstevel@tonic-gate (seg_len > -gap ? 
-gap : seg_len)); 5209*7c478bd9Sstevel@tonic-gate seg_len += gap; 5210*7c478bd9Sstevel@tonic-gate if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 5211*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 5212*7c478bd9Sstevel@tonic-gate freemsg(mp); 5213*7c478bd9Sstevel@tonic-gate return; 5214*7c478bd9Sstevel@tonic-gate } 5215*7c478bd9Sstevel@tonic-gate if ((flags & TH_FIN) && seg_len == -1) { 5216*7c478bd9Sstevel@tonic-gate /* 5217*7c478bd9Sstevel@tonic-gate * When TCP receives a duplicate FIN in 5218*7c478bd9Sstevel@tonic-gate * TIME_WAIT state, restart the 2 MSL timer. 5219*7c478bd9Sstevel@tonic-gate * See page 73 in RFC 793. Make sure this TCP 5220*7c478bd9Sstevel@tonic-gate * is already on the TIME_WAIT list. If not, 5221*7c478bd9Sstevel@tonic-gate * just restart the timer. 5222*7c478bd9Sstevel@tonic-gate */ 5223*7c478bd9Sstevel@tonic-gate tcp_time_wait_remove(tcp); 5224*7c478bd9Sstevel@tonic-gate tcp_time_wait_append(tcp); 5225*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 5226*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5227*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt, TH_ACK, 0, -1); 5228*7c478bd9Sstevel@tonic-gate freemsg(mp); 5229*7c478bd9Sstevel@tonic-gate return; 5230*7c478bd9Sstevel@tonic-gate } 5231*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5232*7c478bd9Sstevel@tonic-gate seg_len = 0; 5233*7c478bd9Sstevel@tonic-gate goto process_ack; 5234*7c478bd9Sstevel@tonic-gate } 5235*7c478bd9Sstevel@tonic-gate 5236*7c478bd9Sstevel@tonic-gate /* Fix seg_seq, and chew the gap off the front. 
*/ 5237*7c478bd9Sstevel@tonic-gate seg_seq = tcp->tcp_rnxt; 5238*7c478bd9Sstevel@tonic-gate } 5239*7c478bd9Sstevel@tonic-gate 5240*7c478bd9Sstevel@tonic-gate if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 5241*7c478bd9Sstevel@tonic-gate /* 5242*7c478bd9Sstevel@tonic-gate * Make sure that when we accept the connection, pick 5243*7c478bd9Sstevel@tonic-gate * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 5244*7c478bd9Sstevel@tonic-gate * old connection. 5245*7c478bd9Sstevel@tonic-gate * 5246*7c478bd9Sstevel@tonic-gate * The next ISS generated is equal to tcp_iss_incr_extra 5247*7c478bd9Sstevel@tonic-gate * + ISS_INCR/2 + other components depending on the 5248*7c478bd9Sstevel@tonic-gate * value of tcp_strong_iss. We pre-calculate the new 5249*7c478bd9Sstevel@tonic-gate * ISS here and compare with tcp_snxt to determine if 5250*7c478bd9Sstevel@tonic-gate * we need to make adjustment to tcp_iss_incr_extra. 5251*7c478bd9Sstevel@tonic-gate * 5252*7c478bd9Sstevel@tonic-gate * Note that since we are now in the global queue 5253*7c478bd9Sstevel@tonic-gate * perimeter and need to do a lateral_put() to the 5254*7c478bd9Sstevel@tonic-gate * listener queue, there can be other connection requests/ 5255*7c478bd9Sstevel@tonic-gate * attempts while the lateral_put() is going on. That 5256*7c478bd9Sstevel@tonic-gate * means what we calculate here may not be correct. This 5257*7c478bd9Sstevel@tonic-gate * is extremely difficult to solve unless TCP and IP 5258*7c478bd9Sstevel@tonic-gate * modules are merged and there is no perimeter, but just 5259*7c478bd9Sstevel@tonic-gate * locks. The above calculation is ugly and is a 5260*7c478bd9Sstevel@tonic-gate * waste of CPU cycles... 5261*7c478bd9Sstevel@tonic-gate */ 5262*7c478bd9Sstevel@tonic-gate uint32_t new_iss = tcp_iss_incr_extra; 5263*7c478bd9Sstevel@tonic-gate int32_t adj; 5264*7c478bd9Sstevel@tonic-gate 5265*7c478bd9Sstevel@tonic-gate /* Add time component and min random (i.e. 1). 
*/ 5266*7c478bd9Sstevel@tonic-gate new_iss += (prom_gettime() >> ISS_NSEC_SHT) + 1; 5267*7c478bd9Sstevel@tonic-gate if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 5268*7c478bd9Sstevel@tonic-gate /* 5269*7c478bd9Sstevel@tonic-gate * New ISS not guaranteed to be ISS_INCR/2 5270*7c478bd9Sstevel@tonic-gate * ahead of the current tcp_snxt, so add the 5271*7c478bd9Sstevel@tonic-gate * difference to tcp_iss_incr_extra. 5272*7c478bd9Sstevel@tonic-gate */ 5273*7c478bd9Sstevel@tonic-gate tcp_iss_incr_extra += adj; 5274*7c478bd9Sstevel@tonic-gate } 5275*7c478bd9Sstevel@tonic-gate tcp_clean_death(sock_id, tcp, 0); 5276*7c478bd9Sstevel@tonic-gate 5277*7c478bd9Sstevel@tonic-gate /* 5278*7c478bd9Sstevel@tonic-gate * This is a passive open. Right now we do not 5279*7c478bd9Sstevel@tonic-gate * do anything... 5280*7c478bd9Sstevel@tonic-gate */ 5281*7c478bd9Sstevel@tonic-gate freemsg(mp); 5282*7c478bd9Sstevel@tonic-gate return; 5283*7c478bd9Sstevel@tonic-gate } 5284*7c478bd9Sstevel@tonic-gate 5285*7c478bd9Sstevel@tonic-gate /* 5286*7c478bd9Sstevel@tonic-gate * rgap is the amount of stuff received out of window. A negative 5287*7c478bd9Sstevel@tonic-gate * value is the amount out of window. 5288*7c478bd9Sstevel@tonic-gate */ 5289*7c478bd9Sstevel@tonic-gate if (rgap < 0) { 5290*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 5291*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 5292*7c478bd9Sstevel@tonic-gate /* Fix seg_len and make sure there is something left. 
*/ 5293*7c478bd9Sstevel@tonic-gate seg_len += rgap; 5294*7c478bd9Sstevel@tonic-gate if (seg_len <= 0) { 5295*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 5296*7c478bd9Sstevel@tonic-gate freemsg(mp); 5297*7c478bd9Sstevel@tonic-gate return; 5298*7c478bd9Sstevel@tonic-gate } 5299*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5300*7c478bd9Sstevel@tonic-gate seg_len = 0; 5301*7c478bd9Sstevel@tonic-gate goto process_ack; 5302*7c478bd9Sstevel@tonic-gate } 5303*7c478bd9Sstevel@tonic-gate } 5304*7c478bd9Sstevel@tonic-gate /* 5305*7c478bd9Sstevel@tonic-gate * Check whether we can update tcp_ts_recent. This test is 5306*7c478bd9Sstevel@tonic-gate * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 5307*7c478bd9Sstevel@tonic-gate * Extensions for High Performance: An Update", Internet Draft. 5308*7c478bd9Sstevel@tonic-gate */ 5309*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok && 5310*7c478bd9Sstevel@tonic-gate TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 5311*7c478bd9Sstevel@tonic-gate SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 5312*7c478bd9Sstevel@tonic-gate tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 5313*7c478bd9Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = prom_gettime(); 5314*7c478bd9Sstevel@tonic-gate } 5315*7c478bd9Sstevel@tonic-gate 5316*7c478bd9Sstevel@tonic-gate if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 5317*7c478bd9Sstevel@tonic-gate /* Always ack out of order packets */ 5318*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5319*7c478bd9Sstevel@tonic-gate seg_len = 0; 5320*7c478bd9Sstevel@tonic-gate } else if (seg_len > 0) { 5321*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 5322*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 5323*7c478bd9Sstevel@tonic-gate } 5324*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 5325*7c478bd9Sstevel@tonic-gate freemsg(mp); 5326*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(sock_id, tcp, 0); 5327*7c478bd9Sstevel@tonic-gate return; 
5328*7c478bd9Sstevel@tonic-gate } 5329*7c478bd9Sstevel@tonic-gate if (flags & TH_SYN) { 5330*7c478bd9Sstevel@tonic-gate freemsg(mp); 5331*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, seg_seq + 1, 5332*7c478bd9Sstevel@tonic-gate TH_RST|TH_ACK, 0, -1); 5333*7c478bd9Sstevel@tonic-gate /* 5334*7c478bd9Sstevel@tonic-gate * Do not delete the TCP structure if it is in 5335*7c478bd9Sstevel@tonic-gate * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 5336*7c478bd9Sstevel@tonic-gate */ 5337*7c478bd9Sstevel@tonic-gate return; 5338*7c478bd9Sstevel@tonic-gate } 5339*7c478bd9Sstevel@tonic-gate process_ack: 5340*7c478bd9Sstevel@tonic-gate if (flags & TH_ACK) { 5341*7c478bd9Sstevel@tonic-gate bytes_acked = (int)(seg_ack - tcp->tcp_suna); 5342*7c478bd9Sstevel@tonic-gate if (bytes_acked <= 0) { 5343*7c478bd9Sstevel@tonic-gate if (bytes_acked == 0 && seg_len == 0 && 5344*7c478bd9Sstevel@tonic-gate new_swnd == tcp->tcp_swnd) 5345*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDupAck); 5346*7c478bd9Sstevel@tonic-gate } else { 5347*7c478bd9Sstevel@tonic-gate /* Acks something not sent */ 5348*7c478bd9Sstevel@tonic-gate flags |= TH_ACK_NEEDED; 5349*7c478bd9Sstevel@tonic-gate } 5350*7c478bd9Sstevel@tonic-gate } 5351*7c478bd9Sstevel@tonic-gate freemsg(mp); 5352*7c478bd9Sstevel@tonic-gate if (flags & TH_ACK_NEEDED) { 5353*7c478bd9Sstevel@tonic-gate /* 5354*7c478bd9Sstevel@tonic-gate * Time to send an ack for some reason. 
5355*7c478bd9Sstevel@tonic-gate */ 5356*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5357*7c478bd9Sstevel@tonic-gate tcp->tcp_rnxt, TH_ACK, 0, -1); 5358*7c478bd9Sstevel@tonic-gate } 5359*7c478bd9Sstevel@tonic-gate } 5360*7c478bd9Sstevel@tonic-gate 5361*7c478bd9Sstevel@tonic-gate static int 5362*7c478bd9Sstevel@tonic-gate tcp_init_values(tcp_t *tcp, struct inetboot_socket *isp) 5363*7c478bd9Sstevel@tonic-gate { 5364*7c478bd9Sstevel@tonic-gate int err; 5365*7c478bd9Sstevel@tonic-gate 5366*7c478bd9Sstevel@tonic-gate tcp->tcp_family = AF_INET; 5367*7c478bd9Sstevel@tonic-gate tcp->tcp_ipversion = IPV4_VERSION; 5368*7c478bd9Sstevel@tonic-gate 5369*7c478bd9Sstevel@tonic-gate /* 5370*7c478bd9Sstevel@tonic-gate * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 5371*7c478bd9Sstevel@tonic-gate * will be close to tcp_rexmit_interval_initial. By doing this, we 5372*7c478bd9Sstevel@tonic-gate * allow the algorithm to adjust slowly to large fluctuations of RTT 5373*7c478bd9Sstevel@tonic-gate * during first few transmissions of a connection as seen in slow 5374*7c478bd9Sstevel@tonic-gate * links. 
5375*7c478bd9Sstevel@tonic-gate */ 5376*7c478bd9Sstevel@tonic-gate tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 5377*7c478bd9Sstevel@tonic-gate tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 5378*7c478bd9Sstevel@tonic-gate tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 5379*7c478bd9Sstevel@tonic-gate tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 5380*7c478bd9Sstevel@tonic-gate tcp_conn_grace_period; 5381*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rto < tcp_rexmit_interval_min) 5382*7c478bd9Sstevel@tonic-gate tcp->tcp_rto = tcp_rexmit_interval_min; 5383*7c478bd9Sstevel@tonic-gate tcp->tcp_timer_backoff = 0; 5384*7c478bd9Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 5385*7c478bd9Sstevel@tonic-gate tcp->tcp_last_recv_time = prom_gettime(); 5386*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_max = tcp_cwnd_max_; 5387*7c478bd9Sstevel@tonic-gate tcp->tcp_snd_burst = TCP_CWND_INFINITE; 5388*7c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 5389*7c478bd9Sstevel@tonic-gate /* For Ethernet, the mtu returned is actually 1550... */ 5390*7c478bd9Sstevel@tonic-gate if (mac_get_type() == IFT_ETHER) { 5391*7c478bd9Sstevel@tonic-gate tcp->tcp_if_mtu = mac_get_mtu() - 50; 5392*7c478bd9Sstevel@tonic-gate } else { 5393*7c478bd9Sstevel@tonic-gate tcp->tcp_if_mtu = mac_get_mtu(); 5394*7c478bd9Sstevel@tonic-gate } 5395*7c478bd9Sstevel@tonic-gate tcp->tcp_mss = tcp->tcp_if_mtu; 5396*7c478bd9Sstevel@tonic-gate 5397*7c478bd9Sstevel@tonic-gate tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 5398*7c478bd9Sstevel@tonic-gate tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 5399*7c478bd9Sstevel@tonic-gate tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 5400*7c478bd9Sstevel@tonic-gate /* 5401*7c478bd9Sstevel@tonic-gate * Fix it to tcp_ip_abort_linterval later if it turns out to be a 5402*7c478bd9Sstevel@tonic-gate * passive open. 
5403*7c478bd9Sstevel@tonic-gate */ 5404*7c478bd9Sstevel@tonic-gate tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 5405*7c478bd9Sstevel@tonic-gate 5406*7c478bd9Sstevel@tonic-gate tcp->tcp_naglim = tcp_naglim_def; 5407*7c478bd9Sstevel@tonic-gate 5408*7c478bd9Sstevel@tonic-gate /* NOTE: ISS is now set in tcp_adapt_ire(). */ 5409*7c478bd9Sstevel@tonic-gate 5410*7c478bd9Sstevel@tonic-gate /* Initialize the header template */ 5411*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) { 5412*7c478bd9Sstevel@tonic-gate err = tcp_header_init_ipv4(tcp); 5413*7c478bd9Sstevel@tonic-gate } 5414*7c478bd9Sstevel@tonic-gate if (err) 5415*7c478bd9Sstevel@tonic-gate return (err); 5416*7c478bd9Sstevel@tonic-gate 5417*7c478bd9Sstevel@tonic-gate /* 5418*7c478bd9Sstevel@tonic-gate * Init the window scale to the max so tcp_rwnd_set() won't pare 5419*7c478bd9Sstevel@tonic-gate * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 5420*7c478bd9Sstevel@tonic-gate */ 5421*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 5422*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_lowater = tcp_xmit_lowat; 5423*7c478bd9Sstevel@tonic-gate if (isp != NULL) { 5424*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_hiwater = isp->so_sndbuf; 5425*7c478bd9Sstevel@tonic-gate tcp->tcp_rwnd = isp->so_rcvbuf; 5426*7c478bd9Sstevel@tonic-gate tcp->tcp_rwnd_max = isp->so_rcvbuf; 5427*7c478bd9Sstevel@tonic-gate } 5428*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_IDLE; 5429*7c478bd9Sstevel@tonic-gate return (0); 5430*7c478bd9Sstevel@tonic-gate } 5431*7c478bd9Sstevel@tonic-gate 5432*7c478bd9Sstevel@tonic-gate /* 5433*7c478bd9Sstevel@tonic-gate * Initialize the IPv4 header. Loses any record of any IP options. 
5434*7c478bd9Sstevel@tonic-gate */ 5435*7c478bd9Sstevel@tonic-gate static int 5436*7c478bd9Sstevel@tonic-gate tcp_header_init_ipv4(tcp_t *tcp) 5437*7c478bd9Sstevel@tonic-gate { 5438*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 5439*7c478bd9Sstevel@tonic-gate 5440*7c478bd9Sstevel@tonic-gate /* 5441*7c478bd9Sstevel@tonic-gate * This is a simple initialization. If there's 5442*7c478bd9Sstevel@tonic-gate * already a template, it should never be too small, 5443*7c478bd9Sstevel@tonic-gate * so reuse it. Otherwise, allocate space for the new one. 5444*7c478bd9Sstevel@tonic-gate */ 5445*7c478bd9Sstevel@tonic-gate if (tcp->tcp_iphc != NULL) { 5446*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5447*7c478bd9Sstevel@tonic-gate bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 5448*7c478bd9Sstevel@tonic-gate } else { 5449*7c478bd9Sstevel@tonic-gate tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 5450*7c478bd9Sstevel@tonic-gate tcp->tcp_iphc = bkmem_zalloc(tcp->tcp_iphc_len); 5451*7c478bd9Sstevel@tonic-gate if (tcp->tcp_iphc == NULL) { 5452*7c478bd9Sstevel@tonic-gate tcp->tcp_iphc_len = 0; 5453*7c478bd9Sstevel@tonic-gate return (ENOMEM); 5454*7c478bd9Sstevel@tonic-gate } 5455*7c478bd9Sstevel@tonic-gate } 5456*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha = (struct ip *)tcp->tcp_iphc; 5457*7c478bd9Sstevel@tonic-gate tcp->tcp_ipversion = IPV4_VERSION; 5458*7c478bd9Sstevel@tonic-gate 5459*7c478bd9Sstevel@tonic-gate /* 5460*7c478bd9Sstevel@tonic-gate * Note that it does not include TCP options yet. It will 5461*7c478bd9Sstevel@tonic-gate * after the connection is established. 
5462*7c478bd9Sstevel@tonic-gate */ 5463*7c478bd9Sstevel@tonic-gate tcp->tcp_hdr_len = sizeof (struct ip) + sizeof (tcph_t); 5464*7c478bd9Sstevel@tonic-gate tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 5465*7c478bd9Sstevel@tonic-gate tcp->tcp_ip_hdr_len = sizeof (struct ip); 5466*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_v = IP_VERSION; 5467*7c478bd9Sstevel@tonic-gate /* We don't support IP options... */ 5468*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_hl = IP_SIMPLE_HDR_LENGTH_IN_WORDS; 5469*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_p = IPPROTO_TCP; 5470*7c478bd9Sstevel@tonic-gate /* We are not supposed to do PMTU discovery... */ 5471*7c478bd9Sstevel@tonic-gate tcp->tcp_ipha->ip_sum = 0; 5472*7c478bd9Sstevel@tonic-gate 5473*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (struct ip)); 5474*7c478bd9Sstevel@tonic-gate tcp->tcp_tcph = tcph; 5475*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] = (5 << 4); 5476*7c478bd9Sstevel@tonic-gate return (0); 5477*7c478bd9Sstevel@tonic-gate } 5478*7c478bd9Sstevel@tonic-gate 5479*7c478bd9Sstevel@tonic-gate /* 5480*7c478bd9Sstevel@tonic-gate * Send out a control packet on the tcp connection specified. This routine 5481*7c478bd9Sstevel@tonic-gate * is typically called where we need a simple ACK or RST generated. 5482*7c478bd9Sstevel@tonic-gate * 5483*7c478bd9Sstevel@tonic-gate * This function is called with or without a mp. 
5484*7c478bd9Sstevel@tonic-gate */ 5485*7c478bd9Sstevel@tonic-gate static void 5486*7c478bd9Sstevel@tonic-gate tcp_xmit_ctl(char *str, tcp_t *tcp, mblk_t *mp, uint32_t seq, 5487*7c478bd9Sstevel@tonic-gate uint32_t ack, int ctl, uint_t ip_hdr_len, int sock_id) 5488*7c478bd9Sstevel@tonic-gate { 5489*7c478bd9Sstevel@tonic-gate uchar_t *rptr; 5490*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 5491*7c478bd9Sstevel@tonic-gate struct ip *iph = NULL; 5492*7c478bd9Sstevel@tonic-gate int tcp_hdr_len; 5493*7c478bd9Sstevel@tonic-gate int tcp_ip_hdr_len; 5494*7c478bd9Sstevel@tonic-gate 5495*7c478bd9Sstevel@tonic-gate tcp_hdr_len = tcp->tcp_hdr_len; 5496*7c478bd9Sstevel@tonic-gate tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 5497*7c478bd9Sstevel@tonic-gate 5498*7c478bd9Sstevel@tonic-gate if (mp) { 5499*7c478bd9Sstevel@tonic-gate assert(ip_hdr_len != 0); 5500*7c478bd9Sstevel@tonic-gate rptr = mp->b_rptr; 5501*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)(rptr + ip_hdr_len); 5502*7c478bd9Sstevel@tonic-gate /* Don't reply to a RST segment. */ 5503*7c478bd9Sstevel@tonic-gate if (tcph->th_flags[0] & TH_RST) { 5504*7c478bd9Sstevel@tonic-gate freeb(mp); 5505*7c478bd9Sstevel@tonic-gate return; 5506*7c478bd9Sstevel@tonic-gate } 5507*7c478bd9Sstevel@tonic-gate freemsg(mp); 5508*7c478bd9Sstevel@tonic-gate rptr = NULL; 5509*7c478bd9Sstevel@tonic-gate } else { 5510*7c478bd9Sstevel@tonic-gate assert(ip_hdr_len == 0); 5511*7c478bd9Sstevel@tonic-gate } 5512*7c478bd9Sstevel@tonic-gate /* If a text string is passed in with the request, print it out. 
*/ 5513*7c478bd9Sstevel@tonic-gate if (str != NULL) { 5514*7c478bd9Sstevel@tonic-gate dprintf("tcp_xmit_ctl(%d): '%s', seq 0x%x, ack 0x%x, " 5515*7c478bd9Sstevel@tonic-gate "ctl 0x%x\n", sock_id, str, seq, ack, ctl); 5516*7c478bd9Sstevel@tonic-gate } 5517*7c478bd9Sstevel@tonic-gate mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0); 5518*7c478bd9Sstevel@tonic-gate if (mp == NULL) { 5519*7c478bd9Sstevel@tonic-gate dprintf("tcp_xmit_ctl(%d): Cannot allocate memory\n", sock_id); 5520*7c478bd9Sstevel@tonic-gate return; 5521*7c478bd9Sstevel@tonic-gate } 5522*7c478bd9Sstevel@tonic-gate rptr = &mp->b_rptr[tcp_wroff_xtra]; 5523*7c478bd9Sstevel@tonic-gate mp->b_rptr = rptr; 5524*7c478bd9Sstevel@tonic-gate mp->b_wptr = &rptr[tcp_hdr_len]; 5525*7c478bd9Sstevel@tonic-gate bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 5526*7c478bd9Sstevel@tonic-gate 5527*7c478bd9Sstevel@tonic-gate iph = (struct ip *)rptr; 5528*7c478bd9Sstevel@tonic-gate iph->ip_len = htons(tcp_hdr_len); 5529*7c478bd9Sstevel@tonic-gate 5530*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 5531*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] = (uint8_t)ctl; 5532*7c478bd9Sstevel@tonic-gate if (ctl & TH_RST) { 5533*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutRsts); 5534*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutControl); 5535*7c478bd9Sstevel@tonic-gate /* 5536*7c478bd9Sstevel@tonic-gate * Don't send TSopt w/ TH_RST packets per RFC 1323. 
5537*7c478bd9Sstevel@tonic-gate */ 5538*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) { 5539*7c478bd9Sstevel@tonic-gate mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 5540*7c478bd9Sstevel@tonic-gate *(mp->b_wptr) = TCPOPT_EOL; 5541*7c478bd9Sstevel@tonic-gate iph->ip_len = htons(tcp_hdr_len - 5542*7c478bd9Sstevel@tonic-gate TCPOPT_REAL_TS_LEN); 5543*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] -= (3 << 4); 5544*7c478bd9Sstevel@tonic-gate } 5545*7c478bd9Sstevel@tonic-gate } 5546*7c478bd9Sstevel@tonic-gate if (ctl & TH_ACK) { 5547*7c478bd9Sstevel@tonic-gate uint32_t now = prom_gettime(); 5548*7c478bd9Sstevel@tonic-gate 5549*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5550*7c478bd9Sstevel@tonic-gate U32_TO_BE32(now, 5551*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5552*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tcp->tcp_ts_recent, 5553*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5554*7c478bd9Sstevel@tonic-gate } 5555*7c478bd9Sstevel@tonic-gate tcp->tcp_rack = ack; 5556*7c478bd9Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 5557*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutAck); 5558*7c478bd9Sstevel@tonic-gate } 5559*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutSegs); 5560*7c478bd9Sstevel@tonic-gate U32_TO_BE32(seq, tcph->th_seq); 5561*7c478bd9Sstevel@tonic-gate U32_TO_BE32(ack, tcph->th_ack); 5562*7c478bd9Sstevel@tonic-gate 5563*7c478bd9Sstevel@tonic-gate tcp_set_cksum(mp); 5564*7c478bd9Sstevel@tonic-gate iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5565*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_xmit_ctl", mp); 5566*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 5567*7c478bd9Sstevel@tonic-gate freeb(mp); 5568*7c478bd9Sstevel@tonic-gate } 5569*7c478bd9Sstevel@tonic-gate 5570*7c478bd9Sstevel@tonic-gate /* Generate an ACK-only (no data) segment for a TCP endpoint */ 5571*7c478bd9Sstevel@tonic-gate static mblk_t * 5572*7c478bd9Sstevel@tonic-gate 
tcp_ack_mp(tcp_t *tcp) 5573*7c478bd9Sstevel@tonic-gate { 5574*7c478bd9Sstevel@tonic-gate if (tcp->tcp_valid_bits) { 5575*7c478bd9Sstevel@tonic-gate /* 5576*7c478bd9Sstevel@tonic-gate * For the complex case where we have to send some 5577*7c478bd9Sstevel@tonic-gate * controls (FIN or SYN), let tcp_xmit_mp do it. 5578*7c478bd9Sstevel@tonic-gate * When sending an ACK-only segment (no data) 5579*7c478bd9Sstevel@tonic-gate * into a zero window, always set the seq number to 5580*7c478bd9Sstevel@tonic-gate * suna, since snxt will be extended past the window. 5581*7c478bd9Sstevel@tonic-gate * If we used snxt, the receiver might consider the ACK 5582*7c478bd9Sstevel@tonic-gate * unacceptable. 5583*7c478bd9Sstevel@tonic-gate */ 5584*7c478bd9Sstevel@tonic-gate return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 5585*7c478bd9Sstevel@tonic-gate (tcp->tcp_zero_win_probe) ? 5586*7c478bd9Sstevel@tonic-gate tcp->tcp_suna : 5587*7c478bd9Sstevel@tonic-gate tcp->tcp_snxt, B_FALSE, NULL, B_FALSE)); 5588*7c478bd9Sstevel@tonic-gate } else { 5589*7c478bd9Sstevel@tonic-gate /* Generate a simple ACK */ 5590*7c478bd9Sstevel@tonic-gate uchar_t *rptr; 5591*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 5592*7c478bd9Sstevel@tonic-gate mblk_t *mp1; 5593*7c478bd9Sstevel@tonic-gate int32_t tcp_hdr_len; 5594*7c478bd9Sstevel@tonic-gate int32_t num_sack_blk = 0; 5595*7c478bd9Sstevel@tonic-gate int32_t sack_opt_len; 5596*7c478bd9Sstevel@tonic-gate 5597*7c478bd9Sstevel@tonic-gate /* 5598*7c478bd9Sstevel@tonic-gate * Allocate space for TCP + IP headers 5599*7c478bd9Sstevel@tonic-gate * and link-level header 5600*7c478bd9Sstevel@tonic-gate */ 5601*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 5602*7c478bd9Sstevel@tonic-gate num_sack_blk = MIN(tcp->tcp_max_sack_blk, 5603*7c478bd9Sstevel@tonic-gate tcp->tcp_num_sack_blk); 5604*7c478bd9Sstevel@tonic-gate sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 5605*7c478bd9Sstevel@tonic-gate TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 
5606*7c478bd9Sstevel@tonic-gate tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 5607*7c478bd9Sstevel@tonic-gate } else { 5608*7c478bd9Sstevel@tonic-gate tcp_hdr_len = tcp->tcp_hdr_len; 5609*7c478bd9Sstevel@tonic-gate } 5610*7c478bd9Sstevel@tonic-gate mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, 0); 5611*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) 5612*7c478bd9Sstevel@tonic-gate return (NULL); 5613*7c478bd9Sstevel@tonic-gate 5614*7c478bd9Sstevel@tonic-gate /* copy in prototype TCP + IP header */ 5615*7c478bd9Sstevel@tonic-gate rptr = mp1->b_rptr + tcp_wroff_xtra; 5616*7c478bd9Sstevel@tonic-gate mp1->b_rptr = rptr; 5617*7c478bd9Sstevel@tonic-gate mp1->b_wptr = rptr + tcp_hdr_len; 5618*7c478bd9Sstevel@tonic-gate bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 5619*7c478bd9Sstevel@tonic-gate 5620*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 5621*7c478bd9Sstevel@tonic-gate 5622*7c478bd9Sstevel@tonic-gate /* 5623*7c478bd9Sstevel@tonic-gate * Set the TCP sequence number. 5624*7c478bd9Sstevel@tonic-gate * When sending an ACK-only segment (no data) 5625*7c478bd9Sstevel@tonic-gate * into a zero window, always set the seq number to 5626*7c478bd9Sstevel@tonic-gate * suna, since snxt will be extended past the window. 5627*7c478bd9Sstevel@tonic-gate * If we used snxt, the receiver might consider the ACK 5628*7c478bd9Sstevel@tonic-gate * unacceptable. 5629*7c478bd9Sstevel@tonic-gate */ 5630*7c478bd9Sstevel@tonic-gate U32_TO_ABE32((tcp->tcp_zero_win_probe) ? 5631*7c478bd9Sstevel@tonic-gate tcp->tcp_suna : tcp->tcp_snxt, tcph->th_seq); 5632*7c478bd9Sstevel@tonic-gate 5633*7c478bd9Sstevel@tonic-gate /* Set up the TCP flag field. 
*/ 5634*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] = (uchar_t)TH_ACK; 5635*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_echo_on) 5636*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] |= TH_ECE; 5637*7c478bd9Sstevel@tonic-gate 5638*7c478bd9Sstevel@tonic-gate tcp->tcp_rack = tcp->tcp_rnxt; 5639*7c478bd9Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 5640*7c478bd9Sstevel@tonic-gate 5641*7c478bd9Sstevel@tonic-gate /* fill in timestamp option if in use */ 5642*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5643*7c478bd9Sstevel@tonic-gate uint32_t llbolt = (uint32_t)prom_gettime(); 5644*7c478bd9Sstevel@tonic-gate 5645*7c478bd9Sstevel@tonic-gate U32_TO_BE32(llbolt, 5646*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5647*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tcp->tcp_ts_recent, 5648*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5649*7c478bd9Sstevel@tonic-gate } 5650*7c478bd9Sstevel@tonic-gate 5651*7c478bd9Sstevel@tonic-gate /* Fill in SACK options */ 5652*7c478bd9Sstevel@tonic-gate if (num_sack_blk > 0) { 5653*7c478bd9Sstevel@tonic-gate uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5654*7c478bd9Sstevel@tonic-gate sack_blk_t *tmp; 5655*7c478bd9Sstevel@tonic-gate int32_t i; 5656*7c478bd9Sstevel@tonic-gate 5657*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5658*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5659*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_SACK; 5660*7c478bd9Sstevel@tonic-gate wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5661*7c478bd9Sstevel@tonic-gate sizeof (sack_blk_t); 5662*7c478bd9Sstevel@tonic-gate wptr += TCPOPT_REAL_SACK_LEN; 5663*7c478bd9Sstevel@tonic-gate 5664*7c478bd9Sstevel@tonic-gate tmp = tcp->tcp_sack_list; 5665*7c478bd9Sstevel@tonic-gate for (i = 0; i < num_sack_blk; i++) { 5666*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tmp[i].begin, wptr); 5667*7c478bd9Sstevel@tonic-gate wptr += sizeof (tcp_seq); 5668*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tmp[i].end, wptr); 5669*7c478bd9Sstevel@tonic-gate wptr 
+= sizeof (tcp_seq); 5670*7c478bd9Sstevel@tonic-gate } 5671*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5672*7c478bd9Sstevel@tonic-gate << 4); 5673*7c478bd9Sstevel@tonic-gate } 5674*7c478bd9Sstevel@tonic-gate 5675*7c478bd9Sstevel@tonic-gate ((struct ip *)rptr)->ip_len = htons(tcp_hdr_len); 5676*7c478bd9Sstevel@tonic-gate tcp_set_cksum(mp1); 5677*7c478bd9Sstevel@tonic-gate ((struct ip *)rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5678*7c478bd9Sstevel@tonic-gate return (mp1); 5679*7c478bd9Sstevel@tonic-gate } 5680*7c478bd9Sstevel@tonic-gate } 5681*7c478bd9Sstevel@tonic-gate 5682*7c478bd9Sstevel@tonic-gate /* 5683*7c478bd9Sstevel@tonic-gate * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 5684*7c478bd9Sstevel@tonic-gate * ip and tcp header ready to pass down to IP. If the mp passed in is 5685*7c478bd9Sstevel@tonic-gate * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 5686*7c478bd9Sstevel@tonic-gate * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 5687*7c478bd9Sstevel@tonic-gate * otherwise it will dup partial mblks.) 5688*7c478bd9Sstevel@tonic-gate * Otherwise, an appropriate ACK packet will be generated. This 5689*7c478bd9Sstevel@tonic-gate * routine is not usually called to send new data for the first time. It 5690*7c478bd9Sstevel@tonic-gate * is mostly called out of the timer for retransmits, and to generate ACKs. 5691*7c478bd9Sstevel@tonic-gate * 5692*7c478bd9Sstevel@tonic-gate * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 5693*7c478bd9Sstevel@tonic-gate * be adjusted by *offset. And after dupb(), the offset and the ending mblk 5694*7c478bd9Sstevel@tonic-gate * of the original mblk chain will be returned in *offset and *end_mp. 
5695*7c478bd9Sstevel@tonic-gate */ 5696*7c478bd9Sstevel@tonic-gate static mblk_t * 5697*7c478bd9Sstevel@tonic-gate tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 5698*7c478bd9Sstevel@tonic-gate mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 5699*7c478bd9Sstevel@tonic-gate boolean_t rexmit) 5700*7c478bd9Sstevel@tonic-gate { 5701*7c478bd9Sstevel@tonic-gate int data_length; 5702*7c478bd9Sstevel@tonic-gate int32_t off = 0; 5703*7c478bd9Sstevel@tonic-gate uint_t flags; 5704*7c478bd9Sstevel@tonic-gate mblk_t *mp1; 5705*7c478bd9Sstevel@tonic-gate mblk_t *mp2; 5706*7c478bd9Sstevel@tonic-gate mblk_t *new_mp; 5707*7c478bd9Sstevel@tonic-gate uchar_t *rptr; 5708*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 5709*7c478bd9Sstevel@tonic-gate int32_t num_sack_blk = 0; 5710*7c478bd9Sstevel@tonic-gate int32_t sack_opt_len = 0; 5711*7c478bd9Sstevel@tonic-gate 5712*7c478bd9Sstevel@tonic-gate /* Allocate for our maximum TCP header + link-level */ 5713*7c478bd9Sstevel@tonic-gate mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 5714*7c478bd9Sstevel@tonic-gate tcp_wroff_xtra, 0); 5715*7c478bd9Sstevel@tonic-gate if (mp1 == NULL) 5716*7c478bd9Sstevel@tonic-gate return (NULL); 5717*7c478bd9Sstevel@tonic-gate data_length = 0; 5718*7c478bd9Sstevel@tonic-gate 5719*7c478bd9Sstevel@tonic-gate /* 5720*7c478bd9Sstevel@tonic-gate * Note that tcp_mss has been adjusted to take into account the 5721*7c478bd9Sstevel@tonic-gate * timestamp option if applicable. Because SACK options do not 5722*7c478bd9Sstevel@tonic-gate * appear in every TCP segments and they are of variable lengths, 5723*7c478bd9Sstevel@tonic-gate * they cannot be included in tcp_mss. Thus we need to calculate 5724*7c478bd9Sstevel@tonic-gate * the actual segment length when we need to send a segment which 5725*7c478bd9Sstevel@tonic-gate * includes SACK options. 
5726*7c478bd9Sstevel@tonic-gate */ 5727*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 5728*7c478bd9Sstevel@tonic-gate num_sack_blk = MIN(tcp->tcp_max_sack_blk, 5729*7c478bd9Sstevel@tonic-gate tcp->tcp_num_sack_blk); 5730*7c478bd9Sstevel@tonic-gate sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 5731*7c478bd9Sstevel@tonic-gate TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 5732*7c478bd9Sstevel@tonic-gate if (max_to_send + sack_opt_len > tcp->tcp_mss) 5733*7c478bd9Sstevel@tonic-gate max_to_send -= sack_opt_len; 5734*7c478bd9Sstevel@tonic-gate } 5735*7c478bd9Sstevel@tonic-gate 5736*7c478bd9Sstevel@tonic-gate if (offset != NULL) { 5737*7c478bd9Sstevel@tonic-gate off = *offset; 5738*7c478bd9Sstevel@tonic-gate /* We use offset as an indicator that end_mp is not NULL. */ 5739*7c478bd9Sstevel@tonic-gate *end_mp = NULL; 5740*7c478bd9Sstevel@tonic-gate } 5741*7c478bd9Sstevel@tonic-gate for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 5742*7c478bd9Sstevel@tonic-gate /* This could be faster with cooperation from downstream */ 5743*7c478bd9Sstevel@tonic-gate if (mp2 != mp1 && !sendall && 5744*7c478bd9Sstevel@tonic-gate data_length + (int)(mp->b_wptr - mp->b_rptr) > 5745*7c478bd9Sstevel@tonic-gate max_to_send) 5746*7c478bd9Sstevel@tonic-gate /* 5747*7c478bd9Sstevel@tonic-gate * Don't send the next mblk since the whole mblk 5748*7c478bd9Sstevel@tonic-gate * does not fit. 
5749*7c478bd9Sstevel@tonic-gate */ 5750*7c478bd9Sstevel@tonic-gate break; 5751*7c478bd9Sstevel@tonic-gate mp2->b_cont = dupb(mp); 5752*7c478bd9Sstevel@tonic-gate mp2 = mp2->b_cont; 5753*7c478bd9Sstevel@tonic-gate if (mp2 == NULL) { 5754*7c478bd9Sstevel@tonic-gate freemsg(mp1); 5755*7c478bd9Sstevel@tonic-gate return (NULL); 5756*7c478bd9Sstevel@tonic-gate } 5757*7c478bd9Sstevel@tonic-gate mp2->b_rptr += off; 5758*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 5759*7c478bd9Sstevel@tonic-gate (uintptr_t)INT_MAX); 5760*7c478bd9Sstevel@tonic-gate 5761*7c478bd9Sstevel@tonic-gate data_length += (int)(mp2->b_wptr - mp2->b_rptr); 5762*7c478bd9Sstevel@tonic-gate if (data_length > max_to_send) { 5763*7c478bd9Sstevel@tonic-gate mp2->b_wptr -= data_length - max_to_send; 5764*7c478bd9Sstevel@tonic-gate data_length = max_to_send; 5765*7c478bd9Sstevel@tonic-gate off = mp2->b_wptr - mp->b_rptr; 5766*7c478bd9Sstevel@tonic-gate break; 5767*7c478bd9Sstevel@tonic-gate } else { 5768*7c478bd9Sstevel@tonic-gate off = 0; 5769*7c478bd9Sstevel@tonic-gate } 5770*7c478bd9Sstevel@tonic-gate } 5771*7c478bd9Sstevel@tonic-gate if (offset != NULL) { 5772*7c478bd9Sstevel@tonic-gate *offset = off; 5773*7c478bd9Sstevel@tonic-gate *end_mp = mp; 5774*7c478bd9Sstevel@tonic-gate } 5775*7c478bd9Sstevel@tonic-gate if (seg_len != NULL) { 5776*7c478bd9Sstevel@tonic-gate *seg_len = data_length; 5777*7c478bd9Sstevel@tonic-gate } 5778*7c478bd9Sstevel@tonic-gate 5779*7c478bd9Sstevel@tonic-gate rptr = mp1->b_rptr + tcp_wroff_xtra; 5780*7c478bd9Sstevel@tonic-gate mp1->b_rptr = rptr; 5781*7c478bd9Sstevel@tonic-gate mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; 5782*7c478bd9Sstevel@tonic-gate bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 5783*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 5784*7c478bd9Sstevel@tonic-gate U32_TO_ABE32(seq, tcph->th_seq); 5785*7c478bd9Sstevel@tonic-gate 5786*7c478bd9Sstevel@tonic-gate /* 5787*7c478bd9Sstevel@tonic-gate * Use 
tcp_unsent to determine if the PUSH bit should be used assumes 5788*7c478bd9Sstevel@tonic-gate * that this function was called from tcp_wput_data. Thus, when called 5789*7c478bd9Sstevel@tonic-gate * to retransmit data the setting of the PUSH bit may appear some 5790*7c478bd9Sstevel@tonic-gate * what random in that it might get set when it should not. This 5791*7c478bd9Sstevel@tonic-gate * should not pose any performance issues. 5792*7c478bd9Sstevel@tonic-gate */ 5793*7c478bd9Sstevel@tonic-gate if (data_length != 0 && (tcp->tcp_unsent == 0 || 5794*7c478bd9Sstevel@tonic-gate tcp->tcp_unsent == data_length)) { 5795*7c478bd9Sstevel@tonic-gate flags = TH_ACK | TH_PUSH; 5796*7c478bd9Sstevel@tonic-gate } else { 5797*7c478bd9Sstevel@tonic-gate flags = TH_ACK; 5798*7c478bd9Sstevel@tonic-gate } 5799*7c478bd9Sstevel@tonic-gate 5800*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 5801*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_echo_on) 5802*7c478bd9Sstevel@tonic-gate flags |= TH_ECE; 5803*7c478bd9Sstevel@tonic-gate 5804*7c478bd9Sstevel@tonic-gate /* 5805*7c478bd9Sstevel@tonic-gate * Only set ECT bit and ECN_CWR if a segment contains new data. 5806*7c478bd9Sstevel@tonic-gate * There is no TCP flow control for non-data segments, and 5807*7c478bd9Sstevel@tonic-gate * only data segment is transmitted reliably. 
5808*7c478bd9Sstevel@tonic-gate */ 5809*7c478bd9Sstevel@tonic-gate if (data_length > 0 && !rexmit) { 5810*7c478bd9Sstevel@tonic-gate SET_ECT(tcp, rptr); 5811*7c478bd9Sstevel@tonic-gate if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5812*7c478bd9Sstevel@tonic-gate flags |= TH_CWR; 5813*7c478bd9Sstevel@tonic-gate tcp->tcp_ecn_cwr_sent = B_TRUE; 5814*7c478bd9Sstevel@tonic-gate } 5815*7c478bd9Sstevel@tonic-gate } 5816*7c478bd9Sstevel@tonic-gate } 5817*7c478bd9Sstevel@tonic-gate 5818*7c478bd9Sstevel@tonic-gate if (tcp->tcp_valid_bits) { 5819*7c478bd9Sstevel@tonic-gate uint32_t u1; 5820*7c478bd9Sstevel@tonic-gate 5821*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 5822*7c478bd9Sstevel@tonic-gate seq == tcp->tcp_iss) { 5823*7c478bd9Sstevel@tonic-gate uchar_t *wptr; 5824*7c478bd9Sstevel@tonic-gate 5825*7c478bd9Sstevel@tonic-gate /* 5826*7c478bd9Sstevel@tonic-gate * Tack on the MSS option. It is always needed 5827*7c478bd9Sstevel@tonic-gate * for both active and passive open. 5828*7c478bd9Sstevel@tonic-gate */ 5829*7c478bd9Sstevel@tonic-gate wptr = mp1->b_wptr; 5830*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_MAXSEG; 5831*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_MAXSEG_LEN; 5832*7c478bd9Sstevel@tonic-gate wptr += 2; 5833*7c478bd9Sstevel@tonic-gate /* 5834*7c478bd9Sstevel@tonic-gate * MSS option value should be interface MTU - MIN 5835*7c478bd9Sstevel@tonic-gate * TCP/IP header. 
5836*7c478bd9Sstevel@tonic-gate */ 5837*7c478bd9Sstevel@tonic-gate u1 = tcp->tcp_if_mtu - IP_SIMPLE_HDR_LENGTH - 5838*7c478bd9Sstevel@tonic-gate TCP_MIN_HEADER_LENGTH; 5839*7c478bd9Sstevel@tonic-gate U16_TO_BE16(u1, wptr); 5840*7c478bd9Sstevel@tonic-gate mp1->b_wptr = wptr + 2; 5841*7c478bd9Sstevel@tonic-gate /* Update the offset to cover the additional word */ 5842*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += (1 << 4); 5843*7c478bd9Sstevel@tonic-gate 5844*7c478bd9Sstevel@tonic-gate /* 5845*7c478bd9Sstevel@tonic-gate * Note that the following way of filling in 5846*7c478bd9Sstevel@tonic-gate * TCP options are not optimal. Some NOPs can 5847*7c478bd9Sstevel@tonic-gate * be saved. But there is no need at this time 5848*7c478bd9Sstevel@tonic-gate * to optimize it. When it is needed, we will 5849*7c478bd9Sstevel@tonic-gate * do it. 5850*7c478bd9Sstevel@tonic-gate */ 5851*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 5852*7c478bd9Sstevel@tonic-gate case TCPS_SYN_SENT: 5853*7c478bd9Sstevel@tonic-gate flags = TH_SYN; 5854*7c478bd9Sstevel@tonic-gate 5855*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ws_ok) { 5856*7c478bd9Sstevel@tonic-gate wptr = mp1->b_wptr; 5857*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5858*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_WSCALE; 5859*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_WS_LEN; 5860*7c478bd9Sstevel@tonic-gate wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5861*7c478bd9Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5862*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5863*7c478bd9Sstevel@tonic-gate (1 << 4); 5864*7c478bd9Sstevel@tonic-gate } 5865*7c478bd9Sstevel@tonic-gate 5866*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5867*7c478bd9Sstevel@tonic-gate uint32_t llbolt; 5868*7c478bd9Sstevel@tonic-gate 5869*7c478bd9Sstevel@tonic-gate llbolt = prom_gettime(); 5870*7c478bd9Sstevel@tonic-gate wptr = mp1->b_wptr; 5871*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5872*7c478bd9Sstevel@tonic-gate 
wptr[1] = TCPOPT_NOP; 5873*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_TSTAMP; 5874*7c478bd9Sstevel@tonic-gate wptr[3] = TCPOPT_TSTAMP_LEN; 5875*7c478bd9Sstevel@tonic-gate wptr += 4; 5876*7c478bd9Sstevel@tonic-gate U32_TO_BE32(llbolt, wptr); 5877*7c478bd9Sstevel@tonic-gate wptr += 4; 5878*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_ts_recent == 0); 5879*7c478bd9Sstevel@tonic-gate U32_TO_BE32(0L, wptr); 5880*7c478bd9Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_TS_LEN; 5881*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5882*7c478bd9Sstevel@tonic-gate (3 << 4); 5883*7c478bd9Sstevel@tonic-gate } 5884*7c478bd9Sstevel@tonic-gate 5885*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 5886*7c478bd9Sstevel@tonic-gate wptr = mp1->b_wptr; 5887*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5888*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5889*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_SACK_PERMITTED; 5890*7c478bd9Sstevel@tonic-gate wptr[3] = TCPOPT_SACK_OK_LEN; 5891*7c478bd9Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5892*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5893*7c478bd9Sstevel@tonic-gate (1 << 4); 5894*7c478bd9Sstevel@tonic-gate } 5895*7c478bd9Sstevel@tonic-gate 5896*7c478bd9Sstevel@tonic-gate /* 5897*7c478bd9Sstevel@tonic-gate * Set up all the bits to tell other side 5898*7c478bd9Sstevel@tonic-gate * we are ECN capable. 
5899*7c478bd9Sstevel@tonic-gate */ 5900*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 5901*7c478bd9Sstevel@tonic-gate flags |= (TH_ECE | TH_CWR); 5902*7c478bd9Sstevel@tonic-gate } 5903*7c478bd9Sstevel@tonic-gate break; 5904*7c478bd9Sstevel@tonic-gate case TCPS_SYN_RCVD: 5905*7c478bd9Sstevel@tonic-gate flags |= TH_SYN; 5906*7c478bd9Sstevel@tonic-gate 5907*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ws_ok) { 5908*7c478bd9Sstevel@tonic-gate wptr = mp1->b_wptr; 5909*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5910*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_WSCALE; 5911*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_WS_LEN; 5912*7c478bd9Sstevel@tonic-gate wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5913*7c478bd9Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5914*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += (1 << 4); 5915*7c478bd9Sstevel@tonic-gate } 5916*7c478bd9Sstevel@tonic-gate 5917*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_sack_ok) { 5918*7c478bd9Sstevel@tonic-gate wptr = mp1->b_wptr; 5919*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5920*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5921*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_SACK_PERMITTED; 5922*7c478bd9Sstevel@tonic-gate wptr[3] = TCPOPT_SACK_OK_LEN; 5923*7c478bd9Sstevel@tonic-gate mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5924*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += 5925*7c478bd9Sstevel@tonic-gate (1 << 4); 5926*7c478bd9Sstevel@tonic-gate } 5927*7c478bd9Sstevel@tonic-gate 5928*7c478bd9Sstevel@tonic-gate /* 5929*7c478bd9Sstevel@tonic-gate * If the other side is ECN capable, reply 5930*7c478bd9Sstevel@tonic-gate * that we are also ECN capable. 
5931*7c478bd9Sstevel@tonic-gate */ 5932*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ecn_ok) { 5933*7c478bd9Sstevel@tonic-gate flags |= TH_ECE; 5934*7c478bd9Sstevel@tonic-gate } 5935*7c478bd9Sstevel@tonic-gate break; 5936*7c478bd9Sstevel@tonic-gate default: 5937*7c478bd9Sstevel@tonic-gate break; 5938*7c478bd9Sstevel@tonic-gate } 5939*7c478bd9Sstevel@tonic-gate /* allocb() of adequate mblk assures space */ 5940*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - 5941*7c478bd9Sstevel@tonic-gate mp1->b_rptr) <= (uintptr_t)INT_MAX); 5942*7c478bd9Sstevel@tonic-gate if (flags & TH_SYN) 5943*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutControl); 5944*7c478bd9Sstevel@tonic-gate } 5945*7c478bd9Sstevel@tonic-gate if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 5946*7c478bd9Sstevel@tonic-gate (seq + data_length) == tcp->tcp_fss) { 5947*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_fin_acked) { 5948*7c478bd9Sstevel@tonic-gate flags |= TH_FIN; 5949*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutControl); 5950*7c478bd9Sstevel@tonic-gate } 5951*7c478bd9Sstevel@tonic-gate if (!tcp->tcp_fin_sent) { 5952*7c478bd9Sstevel@tonic-gate tcp->tcp_fin_sent = B_TRUE; 5953*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 5954*7c478bd9Sstevel@tonic-gate case TCPS_SYN_RCVD: 5955*7c478bd9Sstevel@tonic-gate case TCPS_ESTABLISHED: 5956*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_FIN_WAIT_1; 5957*7c478bd9Sstevel@tonic-gate break; 5958*7c478bd9Sstevel@tonic-gate case TCPS_CLOSE_WAIT: 5959*7c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_LAST_ACK; 5960*7c478bd9Sstevel@tonic-gate break; 5961*7c478bd9Sstevel@tonic-gate } 5962*7c478bd9Sstevel@tonic-gate if (tcp->tcp_suna == tcp->tcp_snxt) 5963*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5964*7c478bd9Sstevel@tonic-gate tcp->tcp_snxt = tcp->tcp_fss + 1; 5965*7c478bd9Sstevel@tonic-gate } 5966*7c478bd9Sstevel@tonic-gate } 5967*7c478bd9Sstevel@tonic-gate } 5968*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] = (uchar_t)flags; 
5969*7c478bd9Sstevel@tonic-gate tcp->tcp_rack = tcp->tcp_rnxt; 5970*7c478bd9Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 5971*7c478bd9Sstevel@tonic-gate 5972*7c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_ts_ok) { 5973*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state != TCPS_SYN_SENT) { 5974*7c478bd9Sstevel@tonic-gate uint32_t llbolt = prom_gettime(); 5975*7c478bd9Sstevel@tonic-gate 5976*7c478bd9Sstevel@tonic-gate U32_TO_BE32(llbolt, 5977*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5978*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tcp->tcp_ts_recent, 5979*7c478bd9Sstevel@tonic-gate (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5980*7c478bd9Sstevel@tonic-gate } 5981*7c478bd9Sstevel@tonic-gate } 5982*7c478bd9Sstevel@tonic-gate 5983*7c478bd9Sstevel@tonic-gate if (num_sack_blk > 0) { 5984*7c478bd9Sstevel@tonic-gate uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5985*7c478bd9Sstevel@tonic-gate sack_blk_t *tmp; 5986*7c478bd9Sstevel@tonic-gate int32_t i; 5987*7c478bd9Sstevel@tonic-gate 5988*7c478bd9Sstevel@tonic-gate wptr[0] = TCPOPT_NOP; 5989*7c478bd9Sstevel@tonic-gate wptr[1] = TCPOPT_NOP; 5990*7c478bd9Sstevel@tonic-gate wptr[2] = TCPOPT_SACK; 5991*7c478bd9Sstevel@tonic-gate wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5992*7c478bd9Sstevel@tonic-gate sizeof (sack_blk_t); 5993*7c478bd9Sstevel@tonic-gate wptr += TCPOPT_REAL_SACK_LEN; 5994*7c478bd9Sstevel@tonic-gate 5995*7c478bd9Sstevel@tonic-gate tmp = tcp->tcp_sack_list; 5996*7c478bd9Sstevel@tonic-gate for (i = 0; i < num_sack_blk; i++) { 5997*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tmp[i].begin, wptr); 5998*7c478bd9Sstevel@tonic-gate wptr += sizeof (tcp_seq); 5999*7c478bd9Sstevel@tonic-gate U32_TO_BE32(tmp[i].end, wptr); 6000*7c478bd9Sstevel@tonic-gate wptr += sizeof (tcp_seq); 6001*7c478bd9Sstevel@tonic-gate } 6002*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 6003*7c478bd9Sstevel@tonic-gate } 6004*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp1->b_wptr - rptr) <= 
(uintptr_t)INT_MAX); 6005*7c478bd9Sstevel@tonic-gate data_length += (int)(mp1->b_wptr - rptr); 6006*7c478bd9Sstevel@tonic-gate if (tcp->tcp_ipversion == IPV4_VERSION) 6007*7c478bd9Sstevel@tonic-gate ((struct ip *)rptr)->ip_len = htons(data_length); 6008*7c478bd9Sstevel@tonic-gate 6009*7c478bd9Sstevel@tonic-gate /* 6010*7c478bd9Sstevel@tonic-gate * Performance hit! We need to pullup the whole message 6011*7c478bd9Sstevel@tonic-gate * in order to do checksum and for the MAC output routine. 6012*7c478bd9Sstevel@tonic-gate */ 6013*7c478bd9Sstevel@tonic-gate if (mp1->b_cont != NULL) { 6014*7c478bd9Sstevel@tonic-gate int mp_size; 6015*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 6016*7c478bd9Sstevel@tonic-gate printf("Multiple mblk %d\n", msgdsize(mp1)); 6017*7c478bd9Sstevel@tonic-gate #endif 6018*7c478bd9Sstevel@tonic-gate new_mp = allocb(msgdsize(mp1) + tcp_wroff_xtra, 0); 6019*7c478bd9Sstevel@tonic-gate new_mp->b_rptr += tcp_wroff_xtra; 6020*7c478bd9Sstevel@tonic-gate new_mp->b_wptr = new_mp->b_rptr; 6021*7c478bd9Sstevel@tonic-gate while (mp1 != NULL) { 6022*7c478bd9Sstevel@tonic-gate mp_size = mp1->b_wptr - mp1->b_rptr; 6023*7c478bd9Sstevel@tonic-gate bcopy(mp1->b_rptr, new_mp->b_wptr, mp_size); 6024*7c478bd9Sstevel@tonic-gate new_mp->b_wptr += mp_size; 6025*7c478bd9Sstevel@tonic-gate mp1 = mp1->b_cont; 6026*7c478bd9Sstevel@tonic-gate } 6027*7c478bd9Sstevel@tonic-gate freemsg(mp1); 6028*7c478bd9Sstevel@tonic-gate mp1 = new_mp; 6029*7c478bd9Sstevel@tonic-gate } 6030*7c478bd9Sstevel@tonic-gate tcp_set_cksum(mp1); 6031*7c478bd9Sstevel@tonic-gate /* Fill in the TTL field as it is 0 in the header template. 
*/ 6032*7c478bd9Sstevel@tonic-gate ((struct ip *)mp1->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6033*7c478bd9Sstevel@tonic-gate 6034*7c478bd9Sstevel@tonic-gate return (mp1); 6035*7c478bd9Sstevel@tonic-gate } 6036*7c478bd9Sstevel@tonic-gate 6037*7c478bd9Sstevel@tonic-gate /* 6038*7c478bd9Sstevel@tonic-gate * Generate a "no listener here" reset in response to the 6039*7c478bd9Sstevel@tonic-gate * connection request contained within 'mp' 6040*7c478bd9Sstevel@tonic-gate */ 6041*7c478bd9Sstevel@tonic-gate static void 6042*7c478bd9Sstevel@tonic-gate tcp_xmit_listeners_reset(int sock_id, mblk_t *mp, uint_t ip_hdr_len) 6043*7c478bd9Sstevel@tonic-gate { 6044*7c478bd9Sstevel@tonic-gate uchar_t *rptr; 6045*7c478bd9Sstevel@tonic-gate uint32_t seg_len; 6046*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 6047*7c478bd9Sstevel@tonic-gate uint32_t seg_seq; 6048*7c478bd9Sstevel@tonic-gate uint32_t seg_ack; 6049*7c478bd9Sstevel@tonic-gate uint_t flags; 6050*7c478bd9Sstevel@tonic-gate 6051*7c478bd9Sstevel@tonic-gate rptr = mp->b_rptr; 6052*7c478bd9Sstevel@tonic-gate 6053*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)&rptr[ip_hdr_len]; 6054*7c478bd9Sstevel@tonic-gate seg_seq = BE32_TO_U32(tcph->th_seq); 6055*7c478bd9Sstevel@tonic-gate seg_ack = BE32_TO_U32(tcph->th_ack); 6056*7c478bd9Sstevel@tonic-gate flags = tcph->th_flags[0]; 6057*7c478bd9Sstevel@tonic-gate 6058*7c478bd9Sstevel@tonic-gate seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 6059*7c478bd9Sstevel@tonic-gate if (flags & TH_RST) { 6060*7c478bd9Sstevel@tonic-gate freeb(mp); 6061*7c478bd9Sstevel@tonic-gate } else if (flags & TH_ACK) { 6062*7c478bd9Sstevel@tonic-gate tcp_xmit_early_reset("no tcp, reset", 6063*7c478bd9Sstevel@tonic-gate sock_id, mp, seg_ack, 0, TH_RST, ip_hdr_len); 6064*7c478bd9Sstevel@tonic-gate } else { 6065*7c478bd9Sstevel@tonic-gate if (flags & TH_SYN) 6066*7c478bd9Sstevel@tonic-gate seg_len++; 6067*7c478bd9Sstevel@tonic-gate tcp_xmit_early_reset("no tcp, reset/ack", sock_id, 
6068*7c478bd9Sstevel@tonic-gate mp, 0, seg_seq + seg_len, 6069*7c478bd9Sstevel@tonic-gate TH_RST | TH_ACK, ip_hdr_len); 6070*7c478bd9Sstevel@tonic-gate } 6071*7c478bd9Sstevel@tonic-gate } 6072*7c478bd9Sstevel@tonic-gate 6073*7c478bd9Sstevel@tonic-gate /* Non overlapping byte exchanger */ 6074*7c478bd9Sstevel@tonic-gate static void 6075*7c478bd9Sstevel@tonic-gate tcp_xchg(uchar_t *a, uchar_t *b, int len) 6076*7c478bd9Sstevel@tonic-gate { 6077*7c478bd9Sstevel@tonic-gate uchar_t uch; 6078*7c478bd9Sstevel@tonic-gate 6079*7c478bd9Sstevel@tonic-gate while (len-- > 0) { 6080*7c478bd9Sstevel@tonic-gate uch = a[len]; 6081*7c478bd9Sstevel@tonic-gate a[len] = b[len]; 6082*7c478bd9Sstevel@tonic-gate b[len] = uch; 6083*7c478bd9Sstevel@tonic-gate } 6084*7c478bd9Sstevel@tonic-gate } 6085*7c478bd9Sstevel@tonic-gate 6086*7c478bd9Sstevel@tonic-gate /* 6087*7c478bd9Sstevel@tonic-gate * Generate a reset based on an inbound packet for which there is no active 6088*7c478bd9Sstevel@tonic-gate * tcp state that we can find. 
6089*7c478bd9Sstevel@tonic-gate */ 6090*7c478bd9Sstevel@tonic-gate static void 6091*7c478bd9Sstevel@tonic-gate tcp_xmit_early_reset(char *str, int sock_id, mblk_t *mp, uint32_t seq, 6092*7c478bd9Sstevel@tonic-gate uint32_t ack, int ctl, uint_t ip_hdr_len) 6093*7c478bd9Sstevel@tonic-gate { 6094*7c478bd9Sstevel@tonic-gate struct ip *iph = NULL; 6095*7c478bd9Sstevel@tonic-gate ushort_t len; 6096*7c478bd9Sstevel@tonic-gate tcph_t *tcph; 6097*7c478bd9Sstevel@tonic-gate int i; 6098*7c478bd9Sstevel@tonic-gate ipaddr_t addr; 6099*7c478bd9Sstevel@tonic-gate mblk_t *new_mp; 6100*7c478bd9Sstevel@tonic-gate 6101*7c478bd9Sstevel@tonic-gate if (str != NULL) { 6102*7c478bd9Sstevel@tonic-gate dprintf("tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 6103*7c478bd9Sstevel@tonic-gate "flags 0x%x\n", str, seq, ack, ctl); 6104*7c478bd9Sstevel@tonic-gate } 6105*7c478bd9Sstevel@tonic-gate 6106*7c478bd9Sstevel@tonic-gate /* 6107*7c478bd9Sstevel@tonic-gate * We skip reversing source route here. 6108*7c478bd9Sstevel@tonic-gate * (for now we replace all IP options with EOL) 6109*7c478bd9Sstevel@tonic-gate */ 6110*7c478bd9Sstevel@tonic-gate iph = (struct ip *)mp->b_rptr; 6111*7c478bd9Sstevel@tonic-gate for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 6112*7c478bd9Sstevel@tonic-gate mp->b_rptr[i] = IPOPT_EOL; 6113*7c478bd9Sstevel@tonic-gate /* 6114*7c478bd9Sstevel@tonic-gate * Make sure that src address is not a limited broadcast 6115*7c478bd9Sstevel@tonic-gate * address. Not all broadcast address checking for the 6116*7c478bd9Sstevel@tonic-gate * src address is possible, since we don't know the 6117*7c478bd9Sstevel@tonic-gate * netmask of the src addr. 6118*7c478bd9Sstevel@tonic-gate * No check for destination address is done, since 6119*7c478bd9Sstevel@tonic-gate * IP will not pass up a packet with a broadcast dest address 6120*7c478bd9Sstevel@tonic-gate * to TCP. 
6121*7c478bd9Sstevel@tonic-gate */ 6122*7c478bd9Sstevel@tonic-gate if (iph->ip_src.s_addr == INADDR_ANY || 6123*7c478bd9Sstevel@tonic-gate iph->ip_src.s_addr == INADDR_BROADCAST) { 6124*7c478bd9Sstevel@tonic-gate freemsg(mp); 6125*7c478bd9Sstevel@tonic-gate return; 6126*7c478bd9Sstevel@tonic-gate } 6127*7c478bd9Sstevel@tonic-gate 6128*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6129*7c478bd9Sstevel@tonic-gate if (tcph->th_flags[0] & TH_RST) { 6130*7c478bd9Sstevel@tonic-gate freemsg(mp); 6131*7c478bd9Sstevel@tonic-gate return; 6132*7c478bd9Sstevel@tonic-gate } 6133*7c478bd9Sstevel@tonic-gate /* 6134*7c478bd9Sstevel@tonic-gate * Now copy the original header to a new buffer. The reason 6135*7c478bd9Sstevel@tonic-gate * for doing this is that we need to put extra room before 6136*7c478bd9Sstevel@tonic-gate * the header for the MAC layer address. The original mblk 6137*7c478bd9Sstevel@tonic-gate * does not have this extra head room. 6138*7c478bd9Sstevel@tonic-gate */ 6139*7c478bd9Sstevel@tonic-gate len = ip_hdr_len + sizeof (tcph_t); 6140*7c478bd9Sstevel@tonic-gate if ((new_mp = allocb(len + tcp_wroff_xtra, 0)) == NULL) { 6141*7c478bd9Sstevel@tonic-gate freemsg(mp); 6142*7c478bd9Sstevel@tonic-gate return; 6143*7c478bd9Sstevel@tonic-gate } 6144*7c478bd9Sstevel@tonic-gate new_mp->b_rptr += tcp_wroff_xtra; 6145*7c478bd9Sstevel@tonic-gate bcopy(mp->b_rptr, new_mp->b_rptr, len); 6146*7c478bd9Sstevel@tonic-gate new_mp->b_wptr = new_mp->b_rptr + len; 6147*7c478bd9Sstevel@tonic-gate freemsg(mp); 6148*7c478bd9Sstevel@tonic-gate mp = new_mp; 6149*7c478bd9Sstevel@tonic-gate iph = (struct ip *)mp->b_rptr; 6150*7c478bd9Sstevel@tonic-gate tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6151*7c478bd9Sstevel@tonic-gate 6152*7c478bd9Sstevel@tonic-gate tcph->th_offset_and_rsrvd[0] = (5 << 4); 6153*7c478bd9Sstevel@tonic-gate tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 6154*7c478bd9Sstevel@tonic-gate U32_TO_BE32(ack, tcph->th_ack); 6155*7c478bd9Sstevel@tonic-gate 
U32_TO_BE32(seq, tcph->th_seq); 6156*7c478bd9Sstevel@tonic-gate U16_TO_BE16(0, tcph->th_win); 6157*7c478bd9Sstevel@tonic-gate bzero(tcph->th_sum, sizeof (int16_t)); 6158*7c478bd9Sstevel@tonic-gate tcph->th_flags[0] = (uint8_t)ctl; 6159*7c478bd9Sstevel@tonic-gate if (ctl & TH_RST) { 6160*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutRsts); 6161*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpOutControl); 6162*7c478bd9Sstevel@tonic-gate } 6163*7c478bd9Sstevel@tonic-gate 6164*7c478bd9Sstevel@tonic-gate iph->ip_len = htons(len); 6165*7c478bd9Sstevel@tonic-gate /* Swap addresses */ 6166*7c478bd9Sstevel@tonic-gate addr = iph->ip_src.s_addr; 6167*7c478bd9Sstevel@tonic-gate iph->ip_src = iph->ip_dst; 6168*7c478bd9Sstevel@tonic-gate iph->ip_dst.s_addr = addr; 6169*7c478bd9Sstevel@tonic-gate iph->ip_id = 0; 6170*7c478bd9Sstevel@tonic-gate iph->ip_ttl = 0; 6171*7c478bd9Sstevel@tonic-gate tcp_set_cksum(mp); 6172*7c478bd9Sstevel@tonic-gate iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6173*7c478bd9Sstevel@tonic-gate 6174*7c478bd9Sstevel@tonic-gate /* Dump the packet when debugging. */ 6175*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_xmit_early_reset", mp); 6176*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 6177*7c478bd9Sstevel@tonic-gate freemsg(mp); 6178*7c478bd9Sstevel@tonic-gate } 6179*7c478bd9Sstevel@tonic-gate 6180*7c478bd9Sstevel@tonic-gate static void 6181*7c478bd9Sstevel@tonic-gate tcp_set_cksum(mblk_t *mp) 6182*7c478bd9Sstevel@tonic-gate { 6183*7c478bd9Sstevel@tonic-gate struct ip *iph; 6184*7c478bd9Sstevel@tonic-gate tcpha_t *tcph; 6185*7c478bd9Sstevel@tonic-gate int len; 6186*7c478bd9Sstevel@tonic-gate 6187*7c478bd9Sstevel@tonic-gate iph = (struct ip *)mp->b_rptr; 6188*7c478bd9Sstevel@tonic-gate tcph = (tcpha_t *)(iph + 1); 6189*7c478bd9Sstevel@tonic-gate len = ntohs(iph->ip_len); 6190*7c478bd9Sstevel@tonic-gate /* 6191*7c478bd9Sstevel@tonic-gate * Calculate the TCP checksum. 
Need to include the psuedo header, 6192*7c478bd9Sstevel@tonic-gate * which is similar to the real IP header starting at the TTL field. 6193*7c478bd9Sstevel@tonic-gate */ 6194*7c478bd9Sstevel@tonic-gate iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 6195*7c478bd9Sstevel@tonic-gate tcph->tha_sum = 0; 6196*7c478bd9Sstevel@tonic-gate tcph->tha_sum = tcp_cksum((uint16_t *)&(iph->ip_ttl), 6197*7c478bd9Sstevel@tonic-gate len - IP_SIMPLE_HDR_LENGTH + 12); 6198*7c478bd9Sstevel@tonic-gate iph->ip_sum = 0; 6199*7c478bd9Sstevel@tonic-gate } 6200*7c478bd9Sstevel@tonic-gate 6201*7c478bd9Sstevel@tonic-gate static uint16_t 6202*7c478bd9Sstevel@tonic-gate tcp_cksum(uint16_t *buf, uint32_t len) 6203*7c478bd9Sstevel@tonic-gate { 6204*7c478bd9Sstevel@tonic-gate /* 6205*7c478bd9Sstevel@tonic-gate * Compute Internet Checksum for "count" bytes 6206*7c478bd9Sstevel@tonic-gate * beginning at location "addr". 6207*7c478bd9Sstevel@tonic-gate */ 6208*7c478bd9Sstevel@tonic-gate int32_t sum = 0; 6209*7c478bd9Sstevel@tonic-gate 6210*7c478bd9Sstevel@tonic-gate while (len > 1) { 6211*7c478bd9Sstevel@tonic-gate /* This is the inner loop */ 6212*7c478bd9Sstevel@tonic-gate sum += *buf++; 6213*7c478bd9Sstevel@tonic-gate len -= 2; 6214*7c478bd9Sstevel@tonic-gate } 6215*7c478bd9Sstevel@tonic-gate 6216*7c478bd9Sstevel@tonic-gate /* Add left-over byte, if any */ 6217*7c478bd9Sstevel@tonic-gate if (len > 0) 6218*7c478bd9Sstevel@tonic-gate sum += *(unsigned char *)buf * 256; 6219*7c478bd9Sstevel@tonic-gate 6220*7c478bd9Sstevel@tonic-gate /* Fold 32-bit sum to 16 bits */ 6221*7c478bd9Sstevel@tonic-gate while (sum >> 16) 6222*7c478bd9Sstevel@tonic-gate sum = (sum & 0xffff) + (sum >> 16); 6223*7c478bd9Sstevel@tonic-gate 6224*7c478bd9Sstevel@tonic-gate return ((uint16_t)~sum); 6225*7c478bd9Sstevel@tonic-gate } 6226*7c478bd9Sstevel@tonic-gate 6227*7c478bd9Sstevel@tonic-gate /* 6228*7c478bd9Sstevel@tonic-gate * Type three generator adapted from the random() function in 4.4 BSD: 6229*7c478bd9Sstevel@tonic-gate 
*/ 6230*7c478bd9Sstevel@tonic-gate 6231*7c478bd9Sstevel@tonic-gate /* 6232*7c478bd9Sstevel@tonic-gate * Copyright (c) 1983, 1993 6233*7c478bd9Sstevel@tonic-gate * The Regents of the University of California. All rights reserved. 6234*7c478bd9Sstevel@tonic-gate * 6235*7c478bd9Sstevel@tonic-gate * Redistribution and use in source and binary forms, with or without 6236*7c478bd9Sstevel@tonic-gate * modification, are permitted provided that the following conditions 6237*7c478bd9Sstevel@tonic-gate * are met: 6238*7c478bd9Sstevel@tonic-gate * 1. Redistributions of source code must retain the above copyright 6239*7c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer. 6240*7c478bd9Sstevel@tonic-gate * 2. Redistributions in binary form must reproduce the above copyright 6241*7c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer in the 6242*7c478bd9Sstevel@tonic-gate * documentation and/or other materials provided with the distribution. 6243*7c478bd9Sstevel@tonic-gate * 3. All advertising materials mentioning features or use of this software 6244*7c478bd9Sstevel@tonic-gate * must display the following acknowledgement: 6245*7c478bd9Sstevel@tonic-gate * This product includes software developed by the University of 6246*7c478bd9Sstevel@tonic-gate * California, Berkeley and its contributors. 6247*7c478bd9Sstevel@tonic-gate * 4. Neither the name of the University nor the names of its contributors 6248*7c478bd9Sstevel@tonic-gate * may be used to endorse or promote products derived from this software 6249*7c478bd9Sstevel@tonic-gate * without specific prior written permission. 
6250*7c478bd9Sstevel@tonic-gate * 6251*7c478bd9Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 6252*7c478bd9Sstevel@tonic-gate * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 6253*7c478bd9Sstevel@tonic-gate * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 6254*7c478bd9Sstevel@tonic-gate * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 6255*7c478bd9Sstevel@tonic-gate * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 6256*7c478bd9Sstevel@tonic-gate * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 6257*7c478bd9Sstevel@tonic-gate * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 6258*7c478bd9Sstevel@tonic-gate * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 6259*7c478bd9Sstevel@tonic-gate * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 6260*7c478bd9Sstevel@tonic-gate * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 6261*7c478bd9Sstevel@tonic-gate * SUCH DAMAGE. 
6262*7c478bd9Sstevel@tonic-gate */ 6263*7c478bd9Sstevel@tonic-gate 6264*7c478bd9Sstevel@tonic-gate /* Type 3 -- x**31 + x**3 + 1 */ 6265*7c478bd9Sstevel@tonic-gate #define DEG_3 31 6266*7c478bd9Sstevel@tonic-gate #define SEP_3 3 6267*7c478bd9Sstevel@tonic-gate 6268*7c478bd9Sstevel@tonic-gate 6269*7c478bd9Sstevel@tonic-gate /* Protected by tcp_random_lock */ 6270*7c478bd9Sstevel@tonic-gate static int tcp_randtbl[DEG_3 + 1]; 6271*7c478bd9Sstevel@tonic-gate 6272*7c478bd9Sstevel@tonic-gate static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 6273*7c478bd9Sstevel@tonic-gate static int *tcp_random_rptr = &tcp_randtbl[1]; 6274*7c478bd9Sstevel@tonic-gate 6275*7c478bd9Sstevel@tonic-gate static int *tcp_random_state = &tcp_randtbl[1]; 6276*7c478bd9Sstevel@tonic-gate static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 6277*7c478bd9Sstevel@tonic-gate 6278*7c478bd9Sstevel@tonic-gate static void 6279*7c478bd9Sstevel@tonic-gate tcp_random_init(void) 6280*7c478bd9Sstevel@tonic-gate { 6281*7c478bd9Sstevel@tonic-gate int i; 6282*7c478bd9Sstevel@tonic-gate uint32_t hrt; 6283*7c478bd9Sstevel@tonic-gate uint32_t wallclock; 6284*7c478bd9Sstevel@tonic-gate uint32_t result; 6285*7c478bd9Sstevel@tonic-gate 6286*7c478bd9Sstevel@tonic-gate /* 6287*7c478bd9Sstevel@tonic-gate * 6288*7c478bd9Sstevel@tonic-gate * XXX We don't have high resolution time in standalone... The 6289*7c478bd9Sstevel@tonic-gate * following is just some approximation on the comment below. 6290*7c478bd9Sstevel@tonic-gate * 6291*7c478bd9Sstevel@tonic-gate * Use high-res timer and current time for seed. Gethrtime() returns 6292*7c478bd9Sstevel@tonic-gate * a longlong, which may contain resolution down to nanoseconds. 6293*7c478bd9Sstevel@tonic-gate * The current time will either be a 32-bit or a 64-bit quantity. 6294*7c478bd9Sstevel@tonic-gate * XOR the two together in a 64-bit result variable. 
6295*7c478bd9Sstevel@tonic-gate * Convert the result to a 32-bit value by multiplying the high-order 6296*7c478bd9Sstevel@tonic-gate * 32-bits by the low-order 32-bits. 6297*7c478bd9Sstevel@tonic-gate * 6298*7c478bd9Sstevel@tonic-gate * XXX We don't have gethrtime() in prom and the wallclock.... 6299*7c478bd9Sstevel@tonic-gate */ 6300*7c478bd9Sstevel@tonic-gate 6301*7c478bd9Sstevel@tonic-gate hrt = prom_gettime(); 6302*7c478bd9Sstevel@tonic-gate wallclock = (uint32_t)time(NULL); 6303*7c478bd9Sstevel@tonic-gate result = wallclock ^ hrt; 6304*7c478bd9Sstevel@tonic-gate tcp_random_state[0] = result; 6305*7c478bd9Sstevel@tonic-gate 6306*7c478bd9Sstevel@tonic-gate for (i = 1; i < DEG_3; i++) 6307*7c478bd9Sstevel@tonic-gate tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 6308*7c478bd9Sstevel@tonic-gate + 12345; 6309*7c478bd9Sstevel@tonic-gate tcp_random_fptr = &tcp_random_state[SEP_3]; 6310*7c478bd9Sstevel@tonic-gate tcp_random_rptr = &tcp_random_state[0]; 6311*7c478bd9Sstevel@tonic-gate for (i = 0; i < 10 * DEG_3; i++) 6312*7c478bd9Sstevel@tonic-gate (void) tcp_random(); 6313*7c478bd9Sstevel@tonic-gate } 6314*7c478bd9Sstevel@tonic-gate 6315*7c478bd9Sstevel@tonic-gate /* 6316*7c478bd9Sstevel@tonic-gate * tcp_random: Return a random number in the range [1 - (128K + 1)]. 6317*7c478bd9Sstevel@tonic-gate * This range is selected to be approximately centered on TCP_ISS / 2, 6318*7c478bd9Sstevel@tonic-gate * and easy to compute. We get this value by generating a 32-bit random 6319*7c478bd9Sstevel@tonic-gate * number, selecting out the high-order 17 bits, and then adding one so 6320*7c478bd9Sstevel@tonic-gate * that we never return zero. 
6321*7c478bd9Sstevel@tonic-gate */ 6322*7c478bd9Sstevel@tonic-gate static int 6323*7c478bd9Sstevel@tonic-gate tcp_random(void) 6324*7c478bd9Sstevel@tonic-gate { 6325*7c478bd9Sstevel@tonic-gate int i; 6326*7c478bd9Sstevel@tonic-gate 6327*7c478bd9Sstevel@tonic-gate *tcp_random_fptr += *tcp_random_rptr; 6328*7c478bd9Sstevel@tonic-gate 6329*7c478bd9Sstevel@tonic-gate /* 6330*7c478bd9Sstevel@tonic-gate * The high-order bits are more random than the low-order bits, 6331*7c478bd9Sstevel@tonic-gate * so we select out the high-order 17 bits and add one so that 6332*7c478bd9Sstevel@tonic-gate * we never return zero. 6333*7c478bd9Sstevel@tonic-gate */ 6334*7c478bd9Sstevel@tonic-gate i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 6335*7c478bd9Sstevel@tonic-gate if (++tcp_random_fptr >= tcp_random_end_ptr) { 6336*7c478bd9Sstevel@tonic-gate tcp_random_fptr = tcp_random_state; 6337*7c478bd9Sstevel@tonic-gate ++tcp_random_rptr; 6338*7c478bd9Sstevel@tonic-gate } else if (++tcp_random_rptr >= tcp_random_end_ptr) 6339*7c478bd9Sstevel@tonic-gate tcp_random_rptr = tcp_random_state; 6340*7c478bd9Sstevel@tonic-gate 6341*7c478bd9Sstevel@tonic-gate return (i); 6342*7c478bd9Sstevel@tonic-gate } 6343*7c478bd9Sstevel@tonic-gate 6344*7c478bd9Sstevel@tonic-gate /* 6345*7c478bd9Sstevel@tonic-gate * Generate ISS, taking into account NDD changes may happen halfway through. 6346*7c478bd9Sstevel@tonic-gate * (If the iss is not zero, set it.) 
6347*7c478bd9Sstevel@tonic-gate */ 6348*7c478bd9Sstevel@tonic-gate static void 6349*7c478bd9Sstevel@tonic-gate tcp_iss_init(tcp_t *tcp) 6350*7c478bd9Sstevel@tonic-gate { 6351*7c478bd9Sstevel@tonic-gate tcp_iss_incr_extra += (ISS_INCR >> 1); 6352*7c478bd9Sstevel@tonic-gate tcp->tcp_iss = tcp_iss_incr_extra; 6353*7c478bd9Sstevel@tonic-gate tcp->tcp_iss += (prom_gettime() >> ISS_NSEC_SHT) + tcp_random(); 6354*7c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits = TCP_ISS_VALID; 6355*7c478bd9Sstevel@tonic-gate tcp->tcp_fss = tcp->tcp_iss - 1; 6356*7c478bd9Sstevel@tonic-gate tcp->tcp_suna = tcp->tcp_iss; 6357*7c478bd9Sstevel@tonic-gate tcp->tcp_snxt = tcp->tcp_iss + 1; 6358*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 6359*7c478bd9Sstevel@tonic-gate tcp->tcp_csuna = tcp->tcp_snxt; 6360*7c478bd9Sstevel@tonic-gate } 6361*7c478bd9Sstevel@tonic-gate 6362*7c478bd9Sstevel@tonic-gate /* 6363*7c478bd9Sstevel@tonic-gate * Diagnostic routine used to return a string associated with the tcp state. 6364*7c478bd9Sstevel@tonic-gate * Note that if the caller does not supply a buffer, it will use an internal 6365*7c478bd9Sstevel@tonic-gate * static string. This means that if multiple threads call this function at 6366*7c478bd9Sstevel@tonic-gate * the same time, output can be corrupted... Note also that this function 6367*7c478bd9Sstevel@tonic-gate * does not check the size of the supplied buffer. The caller has to make 6368*7c478bd9Sstevel@tonic-gate * sure that it is big enough. 
6369*7c478bd9Sstevel@tonic-gate */ 6370*7c478bd9Sstevel@tonic-gate static char * 6371*7c478bd9Sstevel@tonic-gate tcp_display(tcp_t *tcp, char *sup_buf, char format) 6372*7c478bd9Sstevel@tonic-gate { 6373*7c478bd9Sstevel@tonic-gate char buf1[30]; 6374*7c478bd9Sstevel@tonic-gate static char priv_buf[INET_ADDRSTRLEN * 2 + 80]; 6375*7c478bd9Sstevel@tonic-gate char *buf; 6376*7c478bd9Sstevel@tonic-gate char *cp; 6377*7c478bd9Sstevel@tonic-gate char local_addrbuf[INET_ADDRSTRLEN]; 6378*7c478bd9Sstevel@tonic-gate char remote_addrbuf[INET_ADDRSTRLEN]; 6379*7c478bd9Sstevel@tonic-gate struct in_addr addr; 6380*7c478bd9Sstevel@tonic-gate 6381*7c478bd9Sstevel@tonic-gate if (sup_buf != NULL) 6382*7c478bd9Sstevel@tonic-gate buf = sup_buf; 6383*7c478bd9Sstevel@tonic-gate else 6384*7c478bd9Sstevel@tonic-gate buf = priv_buf; 6385*7c478bd9Sstevel@tonic-gate 6386*7c478bd9Sstevel@tonic-gate if (tcp == NULL) 6387*7c478bd9Sstevel@tonic-gate return ("NULL_TCP"); 6388*7c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 6389*7c478bd9Sstevel@tonic-gate case TCPS_CLOSED: 6390*7c478bd9Sstevel@tonic-gate cp = "TCP_CLOSED"; 6391*7c478bd9Sstevel@tonic-gate break; 6392*7c478bd9Sstevel@tonic-gate case TCPS_IDLE: 6393*7c478bd9Sstevel@tonic-gate cp = "TCP_IDLE"; 6394*7c478bd9Sstevel@tonic-gate break; 6395*7c478bd9Sstevel@tonic-gate case TCPS_BOUND: 6396*7c478bd9Sstevel@tonic-gate cp = "TCP_BOUND"; 6397*7c478bd9Sstevel@tonic-gate break; 6398*7c478bd9Sstevel@tonic-gate case TCPS_LISTEN: 6399*7c478bd9Sstevel@tonic-gate cp = "TCP_LISTEN"; 6400*7c478bd9Sstevel@tonic-gate break; 6401*7c478bd9Sstevel@tonic-gate case TCPS_SYN_SENT: 6402*7c478bd9Sstevel@tonic-gate cp = "TCP_SYN_SENT"; 6403*7c478bd9Sstevel@tonic-gate break; 6404*7c478bd9Sstevel@tonic-gate case TCPS_SYN_RCVD: 6405*7c478bd9Sstevel@tonic-gate cp = "TCP_SYN_RCVD"; 6406*7c478bd9Sstevel@tonic-gate break; 6407*7c478bd9Sstevel@tonic-gate case TCPS_ESTABLISHED: 6408*7c478bd9Sstevel@tonic-gate cp = "TCP_ESTABLISHED"; 6409*7c478bd9Sstevel@tonic-gate 
break; 6410*7c478bd9Sstevel@tonic-gate case TCPS_CLOSE_WAIT: 6411*7c478bd9Sstevel@tonic-gate cp = "TCP_CLOSE_WAIT"; 6412*7c478bd9Sstevel@tonic-gate break; 6413*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_1: 6414*7c478bd9Sstevel@tonic-gate cp = "TCP_FIN_WAIT_1"; 6415*7c478bd9Sstevel@tonic-gate break; 6416*7c478bd9Sstevel@tonic-gate case TCPS_CLOSING: 6417*7c478bd9Sstevel@tonic-gate cp = "TCP_CLOSING"; 6418*7c478bd9Sstevel@tonic-gate break; 6419*7c478bd9Sstevel@tonic-gate case TCPS_LAST_ACK: 6420*7c478bd9Sstevel@tonic-gate cp = "TCP_LAST_ACK"; 6421*7c478bd9Sstevel@tonic-gate break; 6422*7c478bd9Sstevel@tonic-gate case TCPS_FIN_WAIT_2: 6423*7c478bd9Sstevel@tonic-gate cp = "TCP_FIN_WAIT_2"; 6424*7c478bd9Sstevel@tonic-gate break; 6425*7c478bd9Sstevel@tonic-gate case TCPS_TIME_WAIT: 6426*7c478bd9Sstevel@tonic-gate cp = "TCP_TIME_WAIT"; 6427*7c478bd9Sstevel@tonic-gate break; 6428*7c478bd9Sstevel@tonic-gate default: 6429*7c478bd9Sstevel@tonic-gate (void) sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 6430*7c478bd9Sstevel@tonic-gate cp = buf1; 6431*7c478bd9Sstevel@tonic-gate break; 6432*7c478bd9Sstevel@tonic-gate } 6433*7c478bd9Sstevel@tonic-gate switch (format) { 6434*7c478bd9Sstevel@tonic-gate case DISP_ADDR_AND_PORT: 6435*7c478bd9Sstevel@tonic-gate /* 6436*7c478bd9Sstevel@tonic-gate * Note that we use the remote address in the tcp_b 6437*7c478bd9Sstevel@tonic-gate * structure. This means that it will print out 6438*7c478bd9Sstevel@tonic-gate * the real destination address, not the next hop's 6439*7c478bd9Sstevel@tonic-gate * address if source routing is used. 
6440*7c478bd9Sstevel@tonic-gate */ 6441*7c478bd9Sstevel@tonic-gate addr.s_addr = tcp->tcp_bound_source; 6442*7c478bd9Sstevel@tonic-gate bcopy(inet_ntoa(addr), local_addrbuf, sizeof (local_addrbuf)); 6443*7c478bd9Sstevel@tonic-gate addr.s_addr = tcp->tcp_remote; 6444*7c478bd9Sstevel@tonic-gate bcopy(inet_ntoa(addr), remote_addrbuf, sizeof (remote_addrbuf)); 6445*7c478bd9Sstevel@tonic-gate (void) snprintf(buf, sizeof (priv_buf), "[%s.%u, %s.%u] %s", 6446*7c478bd9Sstevel@tonic-gate local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 6447*7c478bd9Sstevel@tonic-gate ntohs(tcp->tcp_fport), cp); 6448*7c478bd9Sstevel@tonic-gate break; 6449*7c478bd9Sstevel@tonic-gate case DISP_PORT_ONLY: 6450*7c478bd9Sstevel@tonic-gate default: 6451*7c478bd9Sstevel@tonic-gate (void) snprintf(buf, sizeof (priv_buf), "[%u, %u] %s", 6452*7c478bd9Sstevel@tonic-gate ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 6453*7c478bd9Sstevel@tonic-gate break; 6454*7c478bd9Sstevel@tonic-gate } 6455*7c478bd9Sstevel@tonic-gate 6456*7c478bd9Sstevel@tonic-gate return (buf); 6457*7c478bd9Sstevel@tonic-gate } 6458*7c478bd9Sstevel@tonic-gate 6459*7c478bd9Sstevel@tonic-gate /* 6460*7c478bd9Sstevel@tonic-gate * Add a new piece to the tcp reassembly queue. If the gap at the beginning 6461*7c478bd9Sstevel@tonic-gate * is filled, return as much as we can. The message passed in may be 6462*7c478bd9Sstevel@tonic-gate * multi-part, chained using b_cont. "start" is the starting sequence 6463*7c478bd9Sstevel@tonic-gate * number for this piece. 
6464*7c478bd9Sstevel@tonic-gate */ 6465*7c478bd9Sstevel@tonic-gate static mblk_t * 6466*7c478bd9Sstevel@tonic-gate tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 6467*7c478bd9Sstevel@tonic-gate { 6468*7c478bd9Sstevel@tonic-gate uint32_t end; 6469*7c478bd9Sstevel@tonic-gate mblk_t *mp1; 6470*7c478bd9Sstevel@tonic-gate mblk_t *mp2; 6471*7c478bd9Sstevel@tonic-gate mblk_t *next_mp; 6472*7c478bd9Sstevel@tonic-gate uint32_t u1; 6473*7c478bd9Sstevel@tonic-gate 6474*7c478bd9Sstevel@tonic-gate /* Walk through all the new pieces. */ 6475*7c478bd9Sstevel@tonic-gate do { 6476*7c478bd9Sstevel@tonic-gate assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 6477*7c478bd9Sstevel@tonic-gate (uintptr_t)INT_MAX); 6478*7c478bd9Sstevel@tonic-gate end = start + (int)(mp->b_wptr - mp->b_rptr); 6479*7c478bd9Sstevel@tonic-gate next_mp = mp->b_cont; 6480*7c478bd9Sstevel@tonic-gate if (start == end) { 6481*7c478bd9Sstevel@tonic-gate /* Empty. Blast it. */ 6482*7c478bd9Sstevel@tonic-gate freeb(mp); 6483*7c478bd9Sstevel@tonic-gate continue; 6484*7c478bd9Sstevel@tonic-gate } 6485*7c478bd9Sstevel@tonic-gate mp->b_cont = NULL; 6486*7c478bd9Sstevel@tonic-gate TCP_REASS_SET_SEQ(mp, start); 6487*7c478bd9Sstevel@tonic-gate TCP_REASS_SET_END(mp, end); 6488*7c478bd9Sstevel@tonic-gate mp1 = tcp->tcp_reass_tail; 6489*7c478bd9Sstevel@tonic-gate if (!mp1) { 6490*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_tail = mp; 6491*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_head = mp; 6492*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6493*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6494*7c478bd9Sstevel@tonic-gate continue; 6495*7c478bd9Sstevel@tonic-gate } 6496*7c478bd9Sstevel@tonic-gate /* New stuff completely beyond tail? */ 6497*7c478bd9Sstevel@tonic-gate if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 6498*7c478bd9Sstevel@tonic-gate /* Link it on end. 
*/ 6499*7c478bd9Sstevel@tonic-gate mp1->b_cont = mp; 6500*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_tail = mp; 6501*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6502*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6503*7c478bd9Sstevel@tonic-gate continue; 6504*7c478bd9Sstevel@tonic-gate } 6505*7c478bd9Sstevel@tonic-gate mp1 = tcp->tcp_reass_head; 6506*7c478bd9Sstevel@tonic-gate u1 = TCP_REASS_SEQ(mp1); 6507*7c478bd9Sstevel@tonic-gate /* New stuff at the front? */ 6508*7c478bd9Sstevel@tonic-gate if (SEQ_LT(start, u1)) { 6509*7c478bd9Sstevel@tonic-gate /* Yes... Check for overlap. */ 6510*7c478bd9Sstevel@tonic-gate mp->b_cont = mp1; 6511*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_head = mp; 6512*7c478bd9Sstevel@tonic-gate tcp_reass_elim_overlap(tcp, mp); 6513*7c478bd9Sstevel@tonic-gate continue; 6514*7c478bd9Sstevel@tonic-gate } 6515*7c478bd9Sstevel@tonic-gate /* 6516*7c478bd9Sstevel@tonic-gate * The new piece fits somewhere between the head and tail. 6517*7c478bd9Sstevel@tonic-gate * We find our slot, where mp1 precedes us and mp2 trails. 
6518*7c478bd9Sstevel@tonic-gate */ 6519*7c478bd9Sstevel@tonic-gate for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 6520*7c478bd9Sstevel@tonic-gate u1 = TCP_REASS_SEQ(mp2); 6521*7c478bd9Sstevel@tonic-gate if (SEQ_LEQ(start, u1)) 6522*7c478bd9Sstevel@tonic-gate break; 6523*7c478bd9Sstevel@tonic-gate } 6524*7c478bd9Sstevel@tonic-gate /* Link ourselves in */ 6525*7c478bd9Sstevel@tonic-gate mp->b_cont = mp2; 6526*7c478bd9Sstevel@tonic-gate mp1->b_cont = mp; 6527*7c478bd9Sstevel@tonic-gate 6528*7c478bd9Sstevel@tonic-gate /* Trim overlap with following mblk(s) first */ 6529*7c478bd9Sstevel@tonic-gate tcp_reass_elim_overlap(tcp, mp); 6530*7c478bd9Sstevel@tonic-gate 6531*7c478bd9Sstevel@tonic-gate /* Trim overlap with preceding mblk */ 6532*7c478bd9Sstevel@tonic-gate tcp_reass_elim_overlap(tcp, mp1); 6533*7c478bd9Sstevel@tonic-gate 6534*7c478bd9Sstevel@tonic-gate } while (start = end, mp = next_mp); 6535*7c478bd9Sstevel@tonic-gate mp1 = tcp->tcp_reass_head; 6536*7c478bd9Sstevel@tonic-gate /* Anything ready to go? 
*/ 6537*7c478bd9Sstevel@tonic-gate if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 6538*7c478bd9Sstevel@tonic-gate return (NULL); 6539*7c478bd9Sstevel@tonic-gate /* Eat what we can off the queue */ 6540*7c478bd9Sstevel@tonic-gate for (;;) { 6541*7c478bd9Sstevel@tonic-gate mp = mp1->b_cont; 6542*7c478bd9Sstevel@tonic-gate end = TCP_REASS_END(mp1); 6543*7c478bd9Sstevel@tonic-gate TCP_REASS_SET_SEQ(mp1, 0); 6544*7c478bd9Sstevel@tonic-gate TCP_REASS_SET_END(mp1, 0); 6545*7c478bd9Sstevel@tonic-gate if (!mp) { 6546*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_tail = NULL; 6547*7c478bd9Sstevel@tonic-gate break; 6548*7c478bd9Sstevel@tonic-gate } 6549*7c478bd9Sstevel@tonic-gate if (end != TCP_REASS_SEQ(mp)) { 6550*7c478bd9Sstevel@tonic-gate mp1->b_cont = NULL; 6551*7c478bd9Sstevel@tonic-gate break; 6552*7c478bd9Sstevel@tonic-gate } 6553*7c478bd9Sstevel@tonic-gate mp1 = mp; 6554*7c478bd9Sstevel@tonic-gate } 6555*7c478bd9Sstevel@tonic-gate mp1 = tcp->tcp_reass_head; 6556*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_head = mp; 6557*7c478bd9Sstevel@tonic-gate return (mp1); 6558*7c478bd9Sstevel@tonic-gate } 6559*7c478bd9Sstevel@tonic-gate 6560*7c478bd9Sstevel@tonic-gate /* Eliminate any overlap that mp may have over later mblks */ 6561*7c478bd9Sstevel@tonic-gate static void 6562*7c478bd9Sstevel@tonic-gate tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 6563*7c478bd9Sstevel@tonic-gate { 6564*7c478bd9Sstevel@tonic-gate uint32_t end; 6565*7c478bd9Sstevel@tonic-gate mblk_t *mp1; 6566*7c478bd9Sstevel@tonic-gate uint32_t u1; 6567*7c478bd9Sstevel@tonic-gate 6568*7c478bd9Sstevel@tonic-gate end = TCP_REASS_END(mp); 6569*7c478bd9Sstevel@tonic-gate while ((mp1 = mp->b_cont) != NULL) { 6570*7c478bd9Sstevel@tonic-gate u1 = TCP_REASS_SEQ(mp1); 6571*7c478bd9Sstevel@tonic-gate if (!SEQ_GT(end, u1)) 6572*7c478bd9Sstevel@tonic-gate break; 6573*7c478bd9Sstevel@tonic-gate if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 6574*7c478bd9Sstevel@tonic-gate mp->b_wptr -= end - u1; 6575*7c478bd9Sstevel@tonic-gate 
TCP_REASS_SET_END(mp, u1); 6576*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataPartDupSegs); 6577*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataPartDupBytes, end - u1); 6578*7c478bd9Sstevel@tonic-gate break; 6579*7c478bd9Sstevel@tonic-gate } 6580*7c478bd9Sstevel@tonic-gate mp->b_cont = mp1->b_cont; 6581*7c478bd9Sstevel@tonic-gate freeb(mp1); 6582*7c478bd9Sstevel@tonic-gate BUMP_MIB(tcp_mib.tcpInDataDupSegs); 6583*7c478bd9Sstevel@tonic-gate UPDATE_MIB(tcp_mib.tcpInDataDupBytes, end - u1); 6584*7c478bd9Sstevel@tonic-gate } 6585*7c478bd9Sstevel@tonic-gate if (!mp1) 6586*7c478bd9Sstevel@tonic-gate tcp->tcp_reass_tail = mp; 6587*7c478bd9Sstevel@tonic-gate } 6588*7c478bd9Sstevel@tonic-gate 6589*7c478bd9Sstevel@tonic-gate /* 6590*7c478bd9Sstevel@tonic-gate * Remove a connection from the list of detached TIME_WAIT connections. 6591*7c478bd9Sstevel@tonic-gate */ 6592*7c478bd9Sstevel@tonic-gate static void 6593*7c478bd9Sstevel@tonic-gate tcp_time_wait_remove(tcp_t *tcp) 6594*7c478bd9Sstevel@tonic-gate { 6595*7c478bd9Sstevel@tonic-gate if (tcp->tcp_time_wait_expire == 0) { 6596*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_time_wait_next == NULL); 6597*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_time_wait_prev == NULL); 6598*7c478bd9Sstevel@tonic-gate return; 6599*7c478bd9Sstevel@tonic-gate } 6600*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_state == TCPS_TIME_WAIT); 6601*7c478bd9Sstevel@tonic-gate if (tcp == tcp_time_wait_head) { 6602*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_time_wait_prev == NULL); 6603*7c478bd9Sstevel@tonic-gate tcp_time_wait_head = tcp->tcp_time_wait_next; 6604*7c478bd9Sstevel@tonic-gate if (tcp_time_wait_head != NULL) { 6605*7c478bd9Sstevel@tonic-gate tcp_time_wait_head->tcp_time_wait_prev = NULL; 6606*7c478bd9Sstevel@tonic-gate } else { 6607*7c478bd9Sstevel@tonic-gate tcp_time_wait_tail = NULL; 6608*7c478bd9Sstevel@tonic-gate } 6609*7c478bd9Sstevel@tonic-gate } else if (tcp == tcp_time_wait_tail) { 6610*7c478bd9Sstevel@tonic-gate assert(tcp 
!= tcp_time_wait_head); 6611*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_time_wait_next == NULL); 6612*7c478bd9Sstevel@tonic-gate tcp_time_wait_tail = tcp->tcp_time_wait_prev; 6613*7c478bd9Sstevel@tonic-gate assert(tcp_time_wait_tail != NULL); 6614*7c478bd9Sstevel@tonic-gate tcp_time_wait_tail->tcp_time_wait_next = NULL; 6615*7c478bd9Sstevel@tonic-gate } else { 6616*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 6617*7c478bd9Sstevel@tonic-gate assert(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 6618*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_prev->tcp_time_wait_next = 6619*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_next; 6620*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_next->tcp_time_wait_prev = 6621*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_prev; 6622*7c478bd9Sstevel@tonic-gate } 6623*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_next = NULL; 6624*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_prev = NULL; 6625*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_expire = 0; 6626*7c478bd9Sstevel@tonic-gate } 6627*7c478bd9Sstevel@tonic-gate 6628*7c478bd9Sstevel@tonic-gate /* 6629*7c478bd9Sstevel@tonic-gate * Add a connection to the list of detached TIME_WAIT connections 6630*7c478bd9Sstevel@tonic-gate * and set its time to expire ... 
6631*7c478bd9Sstevel@tonic-gate */ 6632*7c478bd9Sstevel@tonic-gate static void 6633*7c478bd9Sstevel@tonic-gate tcp_time_wait_append(tcp_t *tcp) 6634*7c478bd9Sstevel@tonic-gate { 6635*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_expire = prom_gettime() + tcp_time_wait_interval; 6636*7c478bd9Sstevel@tonic-gate if (tcp->tcp_time_wait_expire == 0) 6637*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_expire = 1; 6638*7c478bd9Sstevel@tonic-gate 6639*7c478bd9Sstevel@tonic-gate if (tcp_time_wait_head == NULL) { 6640*7c478bd9Sstevel@tonic-gate assert(tcp_time_wait_tail == NULL); 6641*7c478bd9Sstevel@tonic-gate tcp_time_wait_head = tcp; 6642*7c478bd9Sstevel@tonic-gate } else { 6643*7c478bd9Sstevel@tonic-gate assert(tcp_time_wait_tail != NULL); 6644*7c478bd9Sstevel@tonic-gate assert(tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT); 6645*7c478bd9Sstevel@tonic-gate tcp_time_wait_tail->tcp_time_wait_next = tcp; 6646*7c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_prev = tcp_time_wait_tail; 6647*7c478bd9Sstevel@tonic-gate } 6648*7c478bd9Sstevel@tonic-gate tcp_time_wait_tail = tcp; 6649*7c478bd9Sstevel@tonic-gate 6650*7c478bd9Sstevel@tonic-gate /* for ndd stats about compression */ 6651*7c478bd9Sstevel@tonic-gate tcp_cum_timewait++; 6652*7c478bd9Sstevel@tonic-gate } 6653*7c478bd9Sstevel@tonic-gate 6654*7c478bd9Sstevel@tonic-gate /* 6655*7c478bd9Sstevel@tonic-gate * Periodic qtimeout routine run on the default queue. 6656*7c478bd9Sstevel@tonic-gate * Performs 2 functions. 6657*7c478bd9Sstevel@tonic-gate * 1. Does TIME_WAIT compression on all recently added tcps. List 6658*7c478bd9Sstevel@tonic-gate * traversal is done backwards from the tail. 6659*7c478bd9Sstevel@tonic-gate * 2. Blows away all tcps whose TIME_WAIT has expired. List traversal 6660*7c478bd9Sstevel@tonic-gate * is done forwards from the head. 
6661*7c478bd9Sstevel@tonic-gate */ 6662*7c478bd9Sstevel@tonic-gate void 6663*7c478bd9Sstevel@tonic-gate tcp_time_wait_collector(void) 6664*7c478bd9Sstevel@tonic-gate { 6665*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 6666*7c478bd9Sstevel@tonic-gate uint32_t now; 6667*7c478bd9Sstevel@tonic-gate 6668*7c478bd9Sstevel@tonic-gate /* 6669*7c478bd9Sstevel@tonic-gate * In order to reap time waits reliably, we should use a 6670*7c478bd9Sstevel@tonic-gate * source of time that is not adjustable by the user 6671*7c478bd9Sstevel@tonic-gate */ 6672*7c478bd9Sstevel@tonic-gate now = prom_gettime(); 6673*7c478bd9Sstevel@tonic-gate while ((tcp = tcp_time_wait_head) != NULL) { 6674*7c478bd9Sstevel@tonic-gate /* 6675*7c478bd9Sstevel@tonic-gate * Compare times using modular arithmetic, since 6676*7c478bd9Sstevel@tonic-gate * lbolt can wrapover. 6677*7c478bd9Sstevel@tonic-gate */ 6678*7c478bd9Sstevel@tonic-gate if ((int32_t)(now - tcp->tcp_time_wait_expire) < 0) { 6679*7c478bd9Sstevel@tonic-gate break; 6680*7c478bd9Sstevel@tonic-gate } 6681*7c478bd9Sstevel@tonic-gate /* 6682*7c478bd9Sstevel@tonic-gate * Note that the err must be 0 as there is no socket 6683*7c478bd9Sstevel@tonic-gate * associated with this TCP... 6684*7c478bd9Sstevel@tonic-gate */ 6685*7c478bd9Sstevel@tonic-gate (void) tcp_clean_death(-1, tcp, 0); 6686*7c478bd9Sstevel@tonic-gate } 6687*7c478bd9Sstevel@tonic-gate /* Schedule next run time. 
*/ 6688*7c478bd9Sstevel@tonic-gate tcp_time_wait_runtime = prom_gettime() + 10000; 6689*7c478bd9Sstevel@tonic-gate } 6690*7c478bd9Sstevel@tonic-gate 6691*7c478bd9Sstevel@tonic-gate void 6692*7c478bd9Sstevel@tonic-gate tcp_time_wait_report(void) 6693*7c478bd9Sstevel@tonic-gate { 6694*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 6695*7c478bd9Sstevel@tonic-gate 6696*7c478bd9Sstevel@tonic-gate printf("Current time %u\n", prom_gettime()); 6697*7c478bd9Sstevel@tonic-gate for (tcp = tcp_time_wait_head; tcp != NULL; 6698*7c478bd9Sstevel@tonic-gate tcp = tcp->tcp_time_wait_next) { 6699*7c478bd9Sstevel@tonic-gate printf("%s expires at %u\n", tcp_display(tcp, NULL, 6700*7c478bd9Sstevel@tonic-gate DISP_ADDR_AND_PORT), tcp->tcp_time_wait_expire); 6701*7c478bd9Sstevel@tonic-gate } 6702*7c478bd9Sstevel@tonic-gate } 6703*7c478bd9Sstevel@tonic-gate 6704*7c478bd9Sstevel@tonic-gate /* 6705*7c478bd9Sstevel@tonic-gate * Send up all messages queued on tcp_rcv_list. 6706*7c478bd9Sstevel@tonic-gate * Have to set tcp_co_norm since we use putnext. 6707*7c478bd9Sstevel@tonic-gate */ 6708*7c478bd9Sstevel@tonic-gate static void 6709*7c478bd9Sstevel@tonic-gate tcp_rcv_drain(int sock_id, tcp_t *tcp) 6710*7c478bd9Sstevel@tonic-gate { 6711*7c478bd9Sstevel@tonic-gate mblk_t *mp; 6712*7c478bd9Sstevel@tonic-gate struct inetgram *in_gram; 6713*7c478bd9Sstevel@tonic-gate mblk_t *in_mp; 6714*7c478bd9Sstevel@tonic-gate int len; 6715*7c478bd9Sstevel@tonic-gate 6716*7c478bd9Sstevel@tonic-gate /* Don't drain if the app has not finished reading all the data. 
*/ 6717*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].so_rcvbuf <= 0) 6718*7c478bd9Sstevel@tonic-gate return; 6719*7c478bd9Sstevel@tonic-gate 6720*7c478bd9Sstevel@tonic-gate /* We might have come here just to updated the rwnd */ 6721*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list == NULL) 6722*7c478bd9Sstevel@tonic-gate goto win_update; 6723*7c478bd9Sstevel@tonic-gate 6724*7c478bd9Sstevel@tonic-gate if ((in_gram = (struct inetgram *)bkmem_zalloc( 6725*7c478bd9Sstevel@tonic-gate sizeof (struct inetgram))) == NULL) { 6726*7c478bd9Sstevel@tonic-gate return; 6727*7c478bd9Sstevel@tonic-gate } 6728*7c478bd9Sstevel@tonic-gate if ((in_mp = allocb(tcp->tcp_rcv_cnt, 0)) == NULL) { 6729*7c478bd9Sstevel@tonic-gate bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 6730*7c478bd9Sstevel@tonic-gate return; 6731*7c478bd9Sstevel@tonic-gate } 6732*7c478bd9Sstevel@tonic-gate in_gram->igm_level = APP_LVL; 6733*7c478bd9Sstevel@tonic-gate in_gram->igm_mp = in_mp; 6734*7c478bd9Sstevel@tonic-gate in_gram->igm_id = 0; 6735*7c478bd9Sstevel@tonic-gate 6736*7c478bd9Sstevel@tonic-gate while ((mp = tcp->tcp_rcv_list) != NULL) { 6737*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_list = mp->b_cont; 6738*7c478bd9Sstevel@tonic-gate len = mp->b_wptr - mp->b_rptr; 6739*7c478bd9Sstevel@tonic-gate bcopy(mp->b_rptr, in_mp->b_wptr, len); 6740*7c478bd9Sstevel@tonic-gate in_mp->b_wptr += len; 6741*7c478bd9Sstevel@tonic-gate freeb(mp); 6742*7c478bd9Sstevel@tonic-gate } 6743*7c478bd9Sstevel@tonic-gate 6744*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_last_tail = NULL; 6745*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_cnt = 0; 6746*7c478bd9Sstevel@tonic-gate add_grams(&sockets[sock_id].inq, in_gram); 6747*7c478bd9Sstevel@tonic-gate 6748*7c478bd9Sstevel@tonic-gate /* This means that so_rcvbuf can be less than 0. 
*/ 6749*7c478bd9Sstevel@tonic-gate sockets[sock_id].so_rcvbuf -= in_mp->b_wptr - in_mp->b_rptr; 6750*7c478bd9Sstevel@tonic-gate win_update: 6751*7c478bd9Sstevel@tonic-gate /* 6752*7c478bd9Sstevel@tonic-gate * Increase the receive window to max. But we need to do receiver 6753*7c478bd9Sstevel@tonic-gate * SWS avoidance. This means that we need to check the increase of 6754*7c478bd9Sstevel@tonic-gate * of receive window is at least 1 MSS. 6755*7c478bd9Sstevel@tonic-gate */ 6756*7c478bd9Sstevel@tonic-gate if (sockets[sock_id].so_rcvbuf > 0 && 6757*7c478bd9Sstevel@tonic-gate (tcp->tcp_rwnd_max - tcp->tcp_rwnd >= tcp->tcp_mss)) { 6758*7c478bd9Sstevel@tonic-gate tcp->tcp_rwnd = tcp->tcp_rwnd_max; 6759*7c478bd9Sstevel@tonic-gate U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 6760*7c478bd9Sstevel@tonic-gate tcp->tcp_tcph->th_win); 6761*7c478bd9Sstevel@tonic-gate } 6762*7c478bd9Sstevel@tonic-gate } 6763*7c478bd9Sstevel@tonic-gate 6764*7c478bd9Sstevel@tonic-gate /* 6765*7c478bd9Sstevel@tonic-gate * Wrapper for recvfrom to call 6766*7c478bd9Sstevel@tonic-gate */ 6767*7c478bd9Sstevel@tonic-gate void 6768*7c478bd9Sstevel@tonic-gate tcp_rcv_drain_sock(int sock_id) 6769*7c478bd9Sstevel@tonic-gate { 6770*7c478bd9Sstevel@tonic-gate tcp_t *tcp; 6771*7c478bd9Sstevel@tonic-gate if ((tcp = sockets[sock_id].pcb) == NULL) 6772*7c478bd9Sstevel@tonic-gate return; 6773*7c478bd9Sstevel@tonic-gate tcp_rcv_drain(sock_id, tcp); 6774*7c478bd9Sstevel@tonic-gate } 6775*7c478bd9Sstevel@tonic-gate 6776*7c478bd9Sstevel@tonic-gate /* 6777*7c478bd9Sstevel@tonic-gate * If the inq == NULL and the tcp_rcv_list != NULL, we have data that 6778*7c478bd9Sstevel@tonic-gate * recvfrom could read. Place a magic message in the inq to let recvfrom 6779*7c478bd9Sstevel@tonic-gate * know that it needs to call tcp_rcv_drain_sock to pullup the data. 
6780*7c478bd9Sstevel@tonic-gate */ 6781*7c478bd9Sstevel@tonic-gate static void 6782*7c478bd9Sstevel@tonic-gate tcp_drain_needed(int sock_id, tcp_t *tcp) 6783*7c478bd9Sstevel@tonic-gate { 6784*7c478bd9Sstevel@tonic-gate struct inetgram *in_gram; 6785*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 6786*7c478bd9Sstevel@tonic-gate printf("tcp_drain_needed: inq %x, tcp_rcv_list %x\n", 6787*7c478bd9Sstevel@tonic-gate sockets[sock_id].inq, tcp->tcp_rcv_list); 6788*7c478bd9Sstevel@tonic-gate #endif 6789*7c478bd9Sstevel@tonic-gate if ((sockets[sock_id].inq != NULL) || 6790*7c478bd9Sstevel@tonic-gate (tcp->tcp_rcv_list == NULL)) 6791*7c478bd9Sstevel@tonic-gate return; 6792*7c478bd9Sstevel@tonic-gate 6793*7c478bd9Sstevel@tonic-gate if ((in_gram = (struct inetgram *)bkmem_zalloc( 6794*7c478bd9Sstevel@tonic-gate sizeof (struct inetgram))) == NULL) 6795*7c478bd9Sstevel@tonic-gate return; 6796*7c478bd9Sstevel@tonic-gate 6797*7c478bd9Sstevel@tonic-gate in_gram->igm_level = APP_LVL; 6798*7c478bd9Sstevel@tonic-gate in_gram->igm_mp = NULL; 6799*7c478bd9Sstevel@tonic-gate in_gram->igm_id = TCP_CALLB_MAGIC_ID; 6800*7c478bd9Sstevel@tonic-gate 6801*7c478bd9Sstevel@tonic-gate add_grams(&sockets[sock_id].inq, in_gram); 6802*7c478bd9Sstevel@tonic-gate } 6803*7c478bd9Sstevel@tonic-gate 6804*7c478bd9Sstevel@tonic-gate /* 6805*7c478bd9Sstevel@tonic-gate * Queue data on tcp_rcv_list which is a b_next chain. 6806*7c478bd9Sstevel@tonic-gate * Each element of the chain is a b_cont chain. 6807*7c478bd9Sstevel@tonic-gate * 6808*7c478bd9Sstevel@tonic-gate * M_DATA messages are added to the current element. 6809*7c478bd9Sstevel@tonic-gate * Other messages are added as new (b_next) elements. 
6810*7c478bd9Sstevel@tonic-gate */ 6811*7c478bd9Sstevel@tonic-gate static void 6812*7c478bd9Sstevel@tonic-gate tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) 6813*7c478bd9Sstevel@tonic-gate { 6814*7c478bd9Sstevel@tonic-gate assert(seg_len == msgdsize(mp)); 6815*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list == NULL) { 6816*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_list = mp; 6817*7c478bd9Sstevel@tonic-gate } else { 6818*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_last_tail->b_cont = mp; 6819*7c478bd9Sstevel@tonic-gate } 6820*7c478bd9Sstevel@tonic-gate while (mp->b_cont) 6821*7c478bd9Sstevel@tonic-gate mp = mp->b_cont; 6822*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_last_tail = mp; 6823*7c478bd9Sstevel@tonic-gate tcp->tcp_rcv_cnt += seg_len; 6824*7c478bd9Sstevel@tonic-gate tcp->tcp_rwnd -= seg_len; 6825*7c478bd9Sstevel@tonic-gate #ifdef DEBUG 6826*7c478bd9Sstevel@tonic-gate printf("tcp_rcv_enqueue rwnd %d\n", tcp->tcp_rwnd); 6827*7c478bd9Sstevel@tonic-gate #endif 6828*7c478bd9Sstevel@tonic-gate U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 6829*7c478bd9Sstevel@tonic-gate } 6830*7c478bd9Sstevel@tonic-gate 6831*7c478bd9Sstevel@tonic-gate /* The minimum of smoothed mean deviation in RTO calculation. */ 6832*7c478bd9Sstevel@tonic-gate #define TCP_SD_MIN 400 6833*7c478bd9Sstevel@tonic-gate 6834*7c478bd9Sstevel@tonic-gate /* 6835*7c478bd9Sstevel@tonic-gate * Set RTO for this connection. The formula is from Jacobson and Karels' 6836*7c478bd9Sstevel@tonic-gate * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 6837*7c478bd9Sstevel@tonic-gate * are the same as those in Appendix A.2 of that paper. 6838*7c478bd9Sstevel@tonic-gate * 6839*7c478bd9Sstevel@tonic-gate * m = new measurement 6840*7c478bd9Sstevel@tonic-gate * sa = smoothed RTT average (8 * average estimates). 6841*7c478bd9Sstevel@tonic-gate * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 
 */
static void
tcp_set_rto(tcp_t *tcp, int32_t rtt)
{
	int32_t m = rtt;
	uint32_t sa = tcp->tcp_rtt_sa;
	uint32_t sv = tcp->tcp_rtt_sd;
	uint32_t rto;

	BUMP_MIB(tcp_mib.tcpRttUpdate);
	tcp->tcp_rtt_update++;

	/*
	 * sa != 0 means the estimators are already initialized; fold the
	 * new measurement in.  sa == 0 means this is the first sample (or
	 * a reinitialization), handled in the else branch below.
	 */
	if (sa != 0) {
		/*
		 * Update average estimator:
		 *	new rtt = 7/8 old rtt + 1/8 Error
		 */

		/* m is now Error in estimate. */
		m -= sa >> 3;
		if ((int32_t)(sa += m) <= 0) {
			/*
			 * Don't allow the smoothed average to be negative.
			 * We use 0 to denote reinitialization of the
			 * variables.
			 */
			sa = 1;
		}

		/*
		 * Update deviation estimator:
		 *	new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
		 */
		if (m < 0)
			m = -m;
		m -= sv >> 2;
		sv += m;
	} else {
		/*
		 * This follows BSD's implementation.  So the reinitialized
		 * RTO is 3 * m.  We cannot go less than 2 because if the
		 * link is bandwidth dominated, doubling the window size
		 * during slow start means doubling the RTT.  We want to be
		 * more conservative when we reinitialize our estimates.  3
		 * is just a convenient number.
		 */
		sa = m << 3;
		sv = m << 1;
	}
	if (sv < TCP_SD_MIN) {
		/*
		 * We do not know that if sa captures the delay ACK
		 * effect as in a long train of segments, a receiver
		 * does not delay its ACKs.  So set the minimum of sv
		 * to be TCP_SD_MIN, which is default to 400 ms, twice
		 * of BSD DATO.  That means the minimum of mean
		 * deviation is 100 ms.
		 *
		 */
		sv = TCP_SD_MIN;
	}
	tcp->tcp_rtt_sa = sa;
	tcp->tcp_rtt_sd = sv;
	/*
	 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
	 *
	 * Add tcp_rexmit_interval extra in case of extreme environment
	 * where the algorithm fails to work.  The default value of
	 * tcp_rexmit_interval_extra should be 0.
	 *
	 * As we use a finer grained clock than BSD and update
	 * RTO for every ACKs, add in another .25 of RTT to the
	 * deviation of RTO to accomodate burstiness of 1/4 of
	 * window size.
	 */
	rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5);

	/* Clamp the computed RTO into the configured [min, max] bounds. */
	if (rto > tcp_rexmit_interval_max) {
		tcp->tcp_rto = tcp_rexmit_interval_max;
	} else if (rto < tcp_rexmit_interval_min) {
		tcp->tcp_rto = tcp_rexmit_interval_min;
	} else {
		tcp->tcp_rto = rto;
	}

	/* Now, we can reset tcp_timer_backoff to use the new RTO... */
	tcp->tcp_timer_backoff = 0;
}

/*
 * Initiate closedown sequence on an active connection.
6934*7c478bd9Sstevel@tonic-gate * Return value zero for OK return, non-zero for error return. 6935*7c478bd9Sstevel@tonic-gate */ 6936*7c478bd9Sstevel@tonic-gate static int 6937*7c478bd9Sstevel@tonic-gate tcp_xmit_end(tcp_t *tcp, int sock_id) 6938*7c478bd9Sstevel@tonic-gate { 6939*7c478bd9Sstevel@tonic-gate mblk_t *mp; 6940*7c478bd9Sstevel@tonic-gate 6941*7c478bd9Sstevel@tonic-gate if (tcp->tcp_state < TCPS_SYN_RCVD || 6942*7c478bd9Sstevel@tonic-gate tcp->tcp_state > TCPS_CLOSE_WAIT) { 6943*7c478bd9Sstevel@tonic-gate /* 6944*7c478bd9Sstevel@tonic-gate * Invalid state, only states TCPS_SYN_RCVD, 6945*7c478bd9Sstevel@tonic-gate * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 6946*7c478bd9Sstevel@tonic-gate */ 6947*7c478bd9Sstevel@tonic-gate return (-1); 6948*7c478bd9Sstevel@tonic-gate } 6949*7c478bd9Sstevel@tonic-gate 6950*7c478bd9Sstevel@tonic-gate tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 6951*7c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits |= TCP_FSS_VALID; 6952*7c478bd9Sstevel@tonic-gate /* 6953*7c478bd9Sstevel@tonic-gate * If there is nothing more unsent, send the FIN now. 6954*7c478bd9Sstevel@tonic-gate * Otherwise, it will go out with the last segment. 6955*7c478bd9Sstevel@tonic-gate */ 6956*7c478bd9Sstevel@tonic-gate if (tcp->tcp_unsent == 0) { 6957*7c478bd9Sstevel@tonic-gate mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 6958*7c478bd9Sstevel@tonic-gate tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 6959*7c478bd9Sstevel@tonic-gate 6960*7c478bd9Sstevel@tonic-gate if (mp != NULL) { 6961*7c478bd9Sstevel@tonic-gate /* Dump the packet when debugging. */ 6962*7c478bd9Sstevel@tonic-gate TCP_DUMP_PACKET("tcp_xmit_end", mp); 6963*7c478bd9Sstevel@tonic-gate (void) ipv4_tcp_output(sock_id, mp); 6964*7c478bd9Sstevel@tonic-gate freeb(mp); 6965*7c478bd9Sstevel@tonic-gate } else { 6966*7c478bd9Sstevel@tonic-gate /* 6967*7c478bd9Sstevel@tonic-gate * Couldn't allocate msg. Pretend we got it out. 6968*7c478bd9Sstevel@tonic-gate * Wait for rexmit timeout. 
6969*7c478bd9Sstevel@tonic-gate */ 6970*7c478bd9Sstevel@tonic-gate tcp->tcp_snxt = tcp->tcp_fss + 1; 6971*7c478bd9Sstevel@tonic-gate TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 6972*7c478bd9Sstevel@tonic-gate } 6973*7c478bd9Sstevel@tonic-gate 6974*7c478bd9Sstevel@tonic-gate /* 6975*7c478bd9Sstevel@tonic-gate * If needed, update tcp_rexmit_snxt as tcp_snxt is 6976*7c478bd9Sstevel@tonic-gate * changed. 6977*7c478bd9Sstevel@tonic-gate */ 6978*7c478bd9Sstevel@tonic-gate if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 6979*7c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 6980*7c478bd9Sstevel@tonic-gate } 6981*7c478bd9Sstevel@tonic-gate } else { 6982*7c478bd9Sstevel@tonic-gate tcp_wput_data(tcp, NULL, B_FALSE); 6983*7c478bd9Sstevel@tonic-gate } 6984*7c478bd9Sstevel@tonic-gate 6985*7c478bd9Sstevel@tonic-gate return (0); 6986*7c478bd9Sstevel@tonic-gate } 6987*7c478bd9Sstevel@tonic-gate 6988*7c478bd9Sstevel@tonic-gate int 6989*7c478bd9Sstevel@tonic-gate tcp_opt_set(tcp_t *tcp, int level, int option, const void *optval, 6990*7c478bd9Sstevel@tonic-gate socklen_t optlen) 6991*7c478bd9Sstevel@tonic-gate { 6992*7c478bd9Sstevel@tonic-gate switch (level) { 6993*7c478bd9Sstevel@tonic-gate case SOL_SOCKET: { 6994*7c478bd9Sstevel@tonic-gate switch (option) { 6995*7c478bd9Sstevel@tonic-gate case SO_RCVBUF: 6996*7c478bd9Sstevel@tonic-gate if (optlen == sizeof (int)) { 6997*7c478bd9Sstevel@tonic-gate int val = *(int *)optval; 6998*7c478bd9Sstevel@tonic-gate 6999*7c478bd9Sstevel@tonic-gate if (val > tcp_max_buf) { 7000*7c478bd9Sstevel@tonic-gate errno = ENOBUFS; 7001*7c478bd9Sstevel@tonic-gate break; 7002*7c478bd9Sstevel@tonic-gate } 7003*7c478bd9Sstevel@tonic-gate /* Silently ignore zero */ 7004*7c478bd9Sstevel@tonic-gate if (val != 0) { 7005*7c478bd9Sstevel@tonic-gate val = MSS_ROUNDUP(val, tcp->tcp_mss); 7006*7c478bd9Sstevel@tonic-gate (void) tcp_rwnd_set(tcp, val); 7007*7c478bd9Sstevel@tonic-gate } 7008*7c478bd9Sstevel@tonic-gate } else { 
7009*7c478bd9Sstevel@tonic-gate errno = EINVAL; 7010*7c478bd9Sstevel@tonic-gate } 7011*7c478bd9Sstevel@tonic-gate break; 7012*7c478bd9Sstevel@tonic-gate case SO_SNDBUF: 7013*7c478bd9Sstevel@tonic-gate if (optlen == sizeof (int)) { 7014*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_hiwater = *(int *)optval; 7015*7c478bd9Sstevel@tonic-gate if (tcp->tcp_xmit_hiwater > tcp_max_buf) 7016*7c478bd9Sstevel@tonic-gate tcp->tcp_xmit_hiwater = tcp_max_buf; 7017*7c478bd9Sstevel@tonic-gate } else { 7018*7c478bd9Sstevel@tonic-gate errno = EINVAL; 7019*7c478bd9Sstevel@tonic-gate } 7020*7c478bd9Sstevel@tonic-gate break; 7021*7c478bd9Sstevel@tonic-gate case SO_LINGER: 7022*7c478bd9Sstevel@tonic-gate if (optlen == sizeof (struct linger)) { 7023*7c478bd9Sstevel@tonic-gate struct linger *lgr = (struct linger *)optval; 7024*7c478bd9Sstevel@tonic-gate 7025*7c478bd9Sstevel@tonic-gate if (lgr->l_onoff) { 7026*7c478bd9Sstevel@tonic-gate tcp->tcp_linger = 1; 7027*7c478bd9Sstevel@tonic-gate tcp->tcp_lingertime = lgr->l_linger; 7028*7c478bd9Sstevel@tonic-gate } else { 7029*7c478bd9Sstevel@tonic-gate tcp->tcp_linger = 0; 7030*7c478bd9Sstevel@tonic-gate tcp->tcp_lingertime = 0; 7031*7c478bd9Sstevel@tonic-gate } 7032*7c478bd9Sstevel@tonic-gate } else { 7033*7c478bd9Sstevel@tonic-gate errno = EINVAL; 7034*7c478bd9Sstevel@tonic-gate } 7035*7c478bd9Sstevel@tonic-gate break; 7036*7c478bd9Sstevel@tonic-gate default: 7037*7c478bd9Sstevel@tonic-gate errno = ENOPROTOOPT; 7038*7c478bd9Sstevel@tonic-gate break; 7039*7c478bd9Sstevel@tonic-gate } 7040*7c478bd9Sstevel@tonic-gate break; 7041*7c478bd9Sstevel@tonic-gate } /* case SOL_SOCKET */ 7042*7c478bd9Sstevel@tonic-gate case IPPROTO_TCP: { 7043*7c478bd9Sstevel@tonic-gate switch (option) { 7044*7c478bd9Sstevel@tonic-gate default: 7045*7c478bd9Sstevel@tonic-gate errno = ENOPROTOOPT; 7046*7c478bd9Sstevel@tonic-gate break; 7047*7c478bd9Sstevel@tonic-gate } 7048*7c478bd9Sstevel@tonic-gate break; 7049*7c478bd9Sstevel@tonic-gate } /* case IPPROTO_TCP */ 
7050*7c478bd9Sstevel@tonic-gate case IPPROTO_IP: { 7051*7c478bd9Sstevel@tonic-gate switch (option) { 7052*7c478bd9Sstevel@tonic-gate default: 7053*7c478bd9Sstevel@tonic-gate errno = ENOPROTOOPT; 7054*7c478bd9Sstevel@tonic-gate break; 7055*7c478bd9Sstevel@tonic-gate } 7056*7c478bd9Sstevel@tonic-gate break; 7057*7c478bd9Sstevel@tonic-gate } /* case IPPROTO_IP */ 7058*7c478bd9Sstevel@tonic-gate default: 7059*7c478bd9Sstevel@tonic-gate errno = ENOPROTOOPT; 7060*7c478bd9Sstevel@tonic-gate break; 7061*7c478bd9Sstevel@tonic-gate } /* switch (level) */ 7062*7c478bd9Sstevel@tonic-gate 7063*7c478bd9Sstevel@tonic-gate if (errno != 0) 7064*7c478bd9Sstevel@tonic-gate return (-1); 7065*7c478bd9Sstevel@tonic-gate else 7066*7c478bd9Sstevel@tonic-gate return (0); 7067*7c478bd9Sstevel@tonic-gate } 7068