/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014 by Delphix. All rights reserved.
 */

/* This file contains all TCP output processing functions.
*/ 28721fffe3SKacheong Poon 29721fffe3SKacheong Poon #include <sys/types.h> 30721fffe3SKacheong Poon #include <sys/stream.h> 31721fffe3SKacheong Poon #include <sys/strsun.h> 32721fffe3SKacheong Poon #include <sys/strsubr.h> 33721fffe3SKacheong Poon #include <sys/stropts.h> 34721fffe3SKacheong Poon #include <sys/strlog.h> 35721fffe3SKacheong Poon #define _SUN_TPI_VERSION 2 36721fffe3SKacheong Poon #include <sys/tihdr.h> 37721fffe3SKacheong Poon #include <sys/suntpi.h> 38721fffe3SKacheong Poon #include <sys/xti_inet.h> 39721fffe3SKacheong Poon #include <sys/timod.h> 40721fffe3SKacheong Poon #include <sys/pattr.h> 41721fffe3SKacheong Poon #include <sys/squeue_impl.h> 42721fffe3SKacheong Poon #include <sys/squeue.h> 43721fffe3SKacheong Poon #include <sys/sockio.h> 44721fffe3SKacheong Poon #include <sys/tsol/tnet.h> 45721fffe3SKacheong Poon 46721fffe3SKacheong Poon #include <inet/common.h> 47721fffe3SKacheong Poon #include <inet/ip.h> 48721fffe3SKacheong Poon #include <inet/tcp.h> 49721fffe3SKacheong Poon #include <inet/tcp_impl.h> 50721fffe3SKacheong Poon #include <inet/snmpcom.h> 51721fffe3SKacheong Poon #include <inet/proto_set.h> 52721fffe3SKacheong Poon #include <inet/ipsec_impl.h> 53721fffe3SKacheong Poon #include <inet/ip_ndp.h> 54721fffe3SKacheong Poon 55721fffe3SKacheong Poon static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *); 56721fffe3SKacheong Poon static void tcp_wput_cmdblk(queue_t *, mblk_t *); 57721fffe3SKacheong Poon static void tcp_wput_flush(tcp_t *, mblk_t *); 58721fffe3SKacheong Poon static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); 59721fffe3SKacheong Poon static int tcp_xmit_end(tcp_t *); 60721fffe3SKacheong Poon static int tcp_send(tcp_t *, const int, const int, const int, 61721fffe3SKacheong Poon const int, int *, uint_t *, int *, mblk_t **, mblk_t *); 62721fffe3SKacheong Poon static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t, 63721fffe3SKacheong Poon int, ip_recv_attr_t *, ip_stack_t *, conn_t *); 
64721fffe3SKacheong Poon static boolean_t tcp_send_rst_chk(tcp_stack_t *); 65721fffe3SKacheong Poon static void tcp_process_shrunk_swnd(tcp_t *, uint32_t); 66721fffe3SKacheong Poon static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int); 67721fffe3SKacheong Poon 68721fffe3SKacheong Poon /* 69721fffe3SKacheong Poon * Functions called directly via squeue having a prototype of edesc_t. 70721fffe3SKacheong Poon */ 71721fffe3SKacheong Poon static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *); 72721fffe3SKacheong Poon static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *); 73721fffe3SKacheong Poon static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *); 74721fffe3SKacheong Poon 75721fffe3SKacheong Poon /* 76721fffe3SKacheong Poon * This controls how tiny a write must be before we try to copy it 77721fffe3SKacheong Poon * into the mblk on the tail of the transmit queue. Not much 78721fffe3SKacheong Poon * speedup is observed for values larger than sixteen. Zero will 79721fffe3SKacheong Poon * disable the optimisation. 
80721fffe3SKacheong Poon */ 81721fffe3SKacheong Poon static int tcp_tx_pull_len = 16; 82721fffe3SKacheong Poon 83721fffe3SKacheong Poon void 84721fffe3SKacheong Poon tcp_wput(queue_t *q, mblk_t *mp) 85721fffe3SKacheong Poon { 86721fffe3SKacheong Poon conn_t *connp = Q_TO_CONN(q); 87721fffe3SKacheong Poon tcp_t *tcp; 88721fffe3SKacheong Poon void (*output_proc)(); 89721fffe3SKacheong Poon t_scalar_t type; 90721fffe3SKacheong Poon uchar_t *rptr; 91721fffe3SKacheong Poon struct iocblk *iocp; 92721fffe3SKacheong Poon size_t size; 93721fffe3SKacheong Poon 94721fffe3SKacheong Poon ASSERT(connp->conn_ref >= 2); 95721fffe3SKacheong Poon 96721fffe3SKacheong Poon switch (DB_TYPE(mp)) { 97721fffe3SKacheong Poon case M_DATA: 98721fffe3SKacheong Poon tcp = connp->conn_tcp; 99721fffe3SKacheong Poon ASSERT(tcp != NULL); 100721fffe3SKacheong Poon 101721fffe3SKacheong Poon size = msgdsize(mp); 102721fffe3SKacheong Poon 103721fffe3SKacheong Poon mutex_enter(&tcp->tcp_non_sq_lock); 104721fffe3SKacheong Poon tcp->tcp_squeue_bytes += size; 105721fffe3SKacheong Poon if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 106721fffe3SKacheong Poon tcp_setqfull(tcp); 107721fffe3SKacheong Poon } 108721fffe3SKacheong Poon mutex_exit(&tcp->tcp_non_sq_lock); 109721fffe3SKacheong Poon 110721fffe3SKacheong Poon CONN_INC_REF(connp); 111721fffe3SKacheong Poon SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, 112721fffe3SKacheong Poon NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 113721fffe3SKacheong Poon return; 114721fffe3SKacheong Poon 115721fffe3SKacheong Poon case M_CMD: 116721fffe3SKacheong Poon tcp_wput_cmdblk(q, mp); 117721fffe3SKacheong Poon return; 118721fffe3SKacheong Poon 119721fffe3SKacheong Poon case M_PROTO: 120721fffe3SKacheong Poon case M_PCPROTO: 121721fffe3SKacheong Poon /* 122721fffe3SKacheong Poon * if it is a snmp message, don't get behind the squeue 123721fffe3SKacheong Poon */ 124721fffe3SKacheong Poon tcp = connp->conn_tcp; 125721fffe3SKacheong Poon rptr = mp->b_rptr; 
126721fffe3SKacheong Poon if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 127721fffe3SKacheong Poon type = ((union T_primitives *)rptr)->type; 128721fffe3SKacheong Poon } else { 129721fffe3SKacheong Poon if (connp->conn_debug) { 130721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, 131721fffe3SKacheong Poon SL_ERROR|SL_TRACE, 132721fffe3SKacheong Poon "tcp_wput_proto, dropping one..."); 133721fffe3SKacheong Poon } 134721fffe3SKacheong Poon freemsg(mp); 135721fffe3SKacheong Poon return; 136721fffe3SKacheong Poon } 137721fffe3SKacheong Poon if (type == T_SVR4_OPTMGMT_REQ) { 138721fffe3SKacheong Poon /* 139721fffe3SKacheong Poon * All Solaris components should pass a db_credp 140721fffe3SKacheong Poon * for this TPI message, hence we ASSERT. 141721fffe3SKacheong Poon * But in case there is some other M_PROTO that looks 142721fffe3SKacheong Poon * like a TPI message sent by some other kernel 143721fffe3SKacheong Poon * component, we check and return an error. 144721fffe3SKacheong Poon */ 145721fffe3SKacheong Poon cred_t *cr = msg_getcred(mp, NULL); 146721fffe3SKacheong Poon 147721fffe3SKacheong Poon ASSERT(cr != NULL); 148721fffe3SKacheong Poon if (cr == NULL) { 149721fffe3SKacheong Poon tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 150721fffe3SKacheong Poon return; 151721fffe3SKacheong Poon } 152721fffe3SKacheong Poon if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get, 153721fffe3SKacheong Poon cr)) { 154721fffe3SKacheong Poon /* 155721fffe3SKacheong Poon * This was a SNMP request 156721fffe3SKacheong Poon */ 157721fffe3SKacheong Poon return; 158721fffe3SKacheong Poon } else { 159721fffe3SKacheong Poon output_proc = tcp_wput_proto; 160721fffe3SKacheong Poon } 161721fffe3SKacheong Poon } else { 162721fffe3SKacheong Poon output_proc = tcp_wput_proto; 163721fffe3SKacheong Poon } 164721fffe3SKacheong Poon break; 165721fffe3SKacheong Poon case M_IOCTL: 166721fffe3SKacheong Poon /* 167721fffe3SKacheong Poon * Most ioctls can be processed right away without going via 
168721fffe3SKacheong Poon * squeues - process them right here. Those that do require 169721fffe3SKacheong Poon * squeue (currently _SIOCSOCKFALLBACK) 170721fffe3SKacheong Poon * are processed by tcp_wput_ioctl(). 171721fffe3SKacheong Poon */ 172721fffe3SKacheong Poon iocp = (struct iocblk *)mp->b_rptr; 173721fffe3SKacheong Poon tcp = connp->conn_tcp; 174721fffe3SKacheong Poon 175721fffe3SKacheong Poon switch (iocp->ioc_cmd) { 176721fffe3SKacheong Poon case TCP_IOC_ABORT_CONN: 177721fffe3SKacheong Poon tcp_ioctl_abort_conn(q, mp); 178721fffe3SKacheong Poon return; 179721fffe3SKacheong Poon case TI_GETPEERNAME: 180721fffe3SKacheong Poon case TI_GETMYNAME: 181721fffe3SKacheong Poon mi_copyin(q, mp, NULL, 182721fffe3SKacheong Poon SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 183721fffe3SKacheong Poon return; 184721fffe3SKacheong Poon 185721fffe3SKacheong Poon default: 186721fffe3SKacheong Poon output_proc = tcp_wput_ioctl; 187721fffe3SKacheong Poon break; 188721fffe3SKacheong Poon } 189721fffe3SKacheong Poon break; 190721fffe3SKacheong Poon default: 191721fffe3SKacheong Poon output_proc = tcp_wput_nondata; 192721fffe3SKacheong Poon break; 193721fffe3SKacheong Poon } 194721fffe3SKacheong Poon 195721fffe3SKacheong Poon CONN_INC_REF(connp); 196721fffe3SKacheong Poon SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp, 197721fffe3SKacheong Poon NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); 198721fffe3SKacheong Poon } 199721fffe3SKacheong Poon 200721fffe3SKacheong Poon /* 201721fffe3SKacheong Poon * The TCP normal data output path. 202721fffe3SKacheong Poon * NOTE: the logic of the fast path is duplicated from this function. 
203721fffe3SKacheong Poon */ 204721fffe3SKacheong Poon void 205721fffe3SKacheong Poon tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) 206721fffe3SKacheong Poon { 207721fffe3SKacheong Poon int len; 208721fffe3SKacheong Poon mblk_t *local_time; 209721fffe3SKacheong Poon mblk_t *mp1; 210721fffe3SKacheong Poon uint32_t snxt; 211721fffe3SKacheong Poon int tail_unsent; 212721fffe3SKacheong Poon int tcpstate; 213721fffe3SKacheong Poon int usable = 0; 214721fffe3SKacheong Poon mblk_t *xmit_tail; 215721fffe3SKacheong Poon int32_t mss; 216721fffe3SKacheong Poon int32_t num_sack_blk = 0; 217721fffe3SKacheong Poon int32_t total_hdr_len; 218721fffe3SKacheong Poon int32_t tcp_hdr_len; 219721fffe3SKacheong Poon int rc; 220721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 221721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 222721fffe3SKacheong Poon clock_t now = LBOLT_FASTPATH; 223721fffe3SKacheong Poon 224721fffe3SKacheong Poon tcpstate = tcp->tcp_state; 225721fffe3SKacheong Poon if (mp == NULL) { 226721fffe3SKacheong Poon /* 227721fffe3SKacheong Poon * tcp_wput_data() with NULL mp should only be called when 228721fffe3SKacheong Poon * there is unsent data. 229721fffe3SKacheong Poon */ 230721fffe3SKacheong Poon ASSERT(tcp->tcp_unsent > 0); 231721fffe3SKacheong Poon /* Really tacky... but we need this for detached closes. */ 232721fffe3SKacheong Poon len = tcp->tcp_unsent; 233721fffe3SKacheong Poon goto data_null; 234721fffe3SKacheong Poon } 235721fffe3SKacheong Poon 236721fffe3SKacheong Poon ASSERT(mp->b_datap->db_type == M_DATA); 237721fffe3SKacheong Poon /* 238721fffe3SKacheong Poon * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 239721fffe3SKacheong Poon * or before a connection attempt has begun. 
240721fffe3SKacheong Poon */ 241721fffe3SKacheong Poon if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 242721fffe3SKacheong Poon (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 243721fffe3SKacheong Poon if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 244721fffe3SKacheong Poon #ifdef DEBUG 245721fffe3SKacheong Poon cmn_err(CE_WARN, 246721fffe3SKacheong Poon "tcp_wput_data: data after ordrel, %s", 247721fffe3SKacheong Poon tcp_display(tcp, NULL, 248721fffe3SKacheong Poon DISP_ADDR_AND_PORT)); 249721fffe3SKacheong Poon #else 250721fffe3SKacheong Poon if (connp->conn_debug) { 251721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, 252721fffe3SKacheong Poon SL_TRACE|SL_ERROR, 253721fffe3SKacheong Poon "tcp_wput_data: data after ordrel, %s\n", 254721fffe3SKacheong Poon tcp_display(tcp, NULL, 255721fffe3SKacheong Poon DISP_ADDR_AND_PORT)); 256721fffe3SKacheong Poon } 257721fffe3SKacheong Poon #endif /* DEBUG */ 258721fffe3SKacheong Poon } 259721fffe3SKacheong Poon if (tcp->tcp_snd_zcopy_aware && 260721fffe3SKacheong Poon (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 261721fffe3SKacheong Poon tcp_zcopy_notify(tcp); 262721fffe3SKacheong Poon freemsg(mp); 263721fffe3SKacheong Poon mutex_enter(&tcp->tcp_non_sq_lock); 264721fffe3SKacheong Poon if (tcp->tcp_flow_stopped && 265721fffe3SKacheong Poon TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 266721fffe3SKacheong Poon tcp_clrqfull(tcp); 267721fffe3SKacheong Poon } 268721fffe3SKacheong Poon mutex_exit(&tcp->tcp_non_sq_lock); 269721fffe3SKacheong Poon return; 270721fffe3SKacheong Poon } 271721fffe3SKacheong Poon 272721fffe3SKacheong Poon /* Strip empties */ 273721fffe3SKacheong Poon for (;;) { 274721fffe3SKacheong Poon ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 275721fffe3SKacheong Poon (uintptr_t)INT_MAX); 276721fffe3SKacheong Poon len = (int)(mp->b_wptr - mp->b_rptr); 277721fffe3SKacheong Poon if (len > 0) 278721fffe3SKacheong Poon break; 279721fffe3SKacheong Poon mp1 = mp; 280721fffe3SKacheong Poon mp 
= mp->b_cont; 281721fffe3SKacheong Poon freeb(mp1); 282721fffe3SKacheong Poon if (mp == NULL) { 283721fffe3SKacheong Poon return; 284721fffe3SKacheong Poon } 285721fffe3SKacheong Poon } 286721fffe3SKacheong Poon 287721fffe3SKacheong Poon /* If we are the first on the list ... */ 288721fffe3SKacheong Poon if (tcp->tcp_xmit_head == NULL) { 289721fffe3SKacheong Poon tcp->tcp_xmit_head = mp; 290721fffe3SKacheong Poon tcp->tcp_xmit_tail = mp; 291721fffe3SKacheong Poon tcp->tcp_xmit_tail_unsent = len; 292721fffe3SKacheong Poon } else { 293721fffe3SKacheong Poon /* If tiny tx and room in txq tail, pullup to save mblks. */ 294721fffe3SKacheong Poon struct datab *dp; 295721fffe3SKacheong Poon 296721fffe3SKacheong Poon mp1 = tcp->tcp_xmit_last; 297721fffe3SKacheong Poon if (len < tcp_tx_pull_len && 298721fffe3SKacheong Poon (dp = mp1->b_datap)->db_ref == 1 && 299721fffe3SKacheong Poon dp->db_lim - mp1->b_wptr >= len) { 300721fffe3SKacheong Poon ASSERT(len > 0); 301721fffe3SKacheong Poon ASSERT(!mp1->b_cont); 302721fffe3SKacheong Poon if (len == 1) { 303721fffe3SKacheong Poon *mp1->b_wptr++ = *mp->b_rptr; 304721fffe3SKacheong Poon } else { 305721fffe3SKacheong Poon bcopy(mp->b_rptr, mp1->b_wptr, len); 306721fffe3SKacheong Poon mp1->b_wptr += len; 307721fffe3SKacheong Poon } 308721fffe3SKacheong Poon if (mp1 == tcp->tcp_xmit_tail) 309721fffe3SKacheong Poon tcp->tcp_xmit_tail_unsent += len; 310721fffe3SKacheong Poon mp1->b_cont = mp->b_cont; 311721fffe3SKacheong Poon if (tcp->tcp_snd_zcopy_aware && 312721fffe3SKacheong Poon (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 313721fffe3SKacheong Poon mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 314721fffe3SKacheong Poon freeb(mp); 315721fffe3SKacheong Poon mp = mp1; 316721fffe3SKacheong Poon } else { 317721fffe3SKacheong Poon tcp->tcp_xmit_last->b_cont = mp; 318721fffe3SKacheong Poon } 319721fffe3SKacheong Poon len += tcp->tcp_unsent; 320721fffe3SKacheong Poon } 321721fffe3SKacheong Poon 322721fffe3SKacheong Poon /* Tack on 
however many more positive length mblks we have */ 323721fffe3SKacheong Poon if ((mp1 = mp->b_cont) != NULL) { 324721fffe3SKacheong Poon do { 325721fffe3SKacheong Poon int tlen; 326721fffe3SKacheong Poon ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 327721fffe3SKacheong Poon (uintptr_t)INT_MAX); 328721fffe3SKacheong Poon tlen = (int)(mp1->b_wptr - mp1->b_rptr); 329721fffe3SKacheong Poon if (tlen <= 0) { 330721fffe3SKacheong Poon mp->b_cont = mp1->b_cont; 331721fffe3SKacheong Poon freeb(mp1); 332721fffe3SKacheong Poon } else { 333721fffe3SKacheong Poon len += tlen; 334721fffe3SKacheong Poon mp = mp1; 335721fffe3SKacheong Poon } 336721fffe3SKacheong Poon } while ((mp1 = mp->b_cont) != NULL); 337721fffe3SKacheong Poon } 338721fffe3SKacheong Poon tcp->tcp_xmit_last = mp; 339721fffe3SKacheong Poon tcp->tcp_unsent = len; 340721fffe3SKacheong Poon 341721fffe3SKacheong Poon if (urgent) 342721fffe3SKacheong Poon usable = 1; 343721fffe3SKacheong Poon 344721fffe3SKacheong Poon data_null: 345721fffe3SKacheong Poon snxt = tcp->tcp_snxt; 346721fffe3SKacheong Poon xmit_tail = tcp->tcp_xmit_tail; 347721fffe3SKacheong Poon tail_unsent = tcp->tcp_xmit_tail_unsent; 348721fffe3SKacheong Poon 349721fffe3SKacheong Poon /* 350721fffe3SKacheong Poon * Note that tcp_mss has been adjusted to take into account the 351721fffe3SKacheong Poon * timestamp option if applicable. Because SACK options do not 352721fffe3SKacheong Poon * appear in every TCP segments and they are of variable lengths, 353721fffe3SKacheong Poon * they cannot be included in tcp_mss. Thus we need to calculate 354721fffe3SKacheong Poon * the actual segment length when we need to send a segment which 355721fffe3SKacheong Poon * includes SACK options. 
356721fffe3SKacheong Poon */ 357721fffe3SKacheong Poon if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 358721fffe3SKacheong Poon int32_t opt_len; 359721fffe3SKacheong Poon 360721fffe3SKacheong Poon num_sack_blk = MIN(tcp->tcp_max_sack_blk, 361721fffe3SKacheong Poon tcp->tcp_num_sack_blk); 362721fffe3SKacheong Poon opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 363721fffe3SKacheong Poon 2 + TCPOPT_HEADER_LEN; 364721fffe3SKacheong Poon mss = tcp->tcp_mss - opt_len; 365721fffe3SKacheong Poon total_hdr_len = connp->conn_ht_iphc_len + opt_len; 366721fffe3SKacheong Poon tcp_hdr_len = connp->conn_ht_ulp_len + opt_len; 367721fffe3SKacheong Poon } else { 368721fffe3SKacheong Poon mss = tcp->tcp_mss; 369721fffe3SKacheong Poon total_hdr_len = connp->conn_ht_iphc_len; 370721fffe3SKacheong Poon tcp_hdr_len = connp->conn_ht_ulp_len; 371721fffe3SKacheong Poon } 372721fffe3SKacheong Poon 373721fffe3SKacheong Poon if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 374721fffe3SKacheong Poon (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 375721fffe3SKacheong Poon TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); 376721fffe3SKacheong Poon } 377721fffe3SKacheong Poon if (tcpstate == TCPS_SYN_RCVD) { 378721fffe3SKacheong Poon /* 379721fffe3SKacheong Poon * The three-way connection establishment handshake is not 380721fffe3SKacheong Poon * complete yet. We want to queue the data for transmission 381721fffe3SKacheong Poon * after entering ESTABLISHED state (RFC793). A jump to 382721fffe3SKacheong Poon * "done" label effectively leaves data on the queue. 383721fffe3SKacheong Poon */ 384721fffe3SKacheong Poon goto done; 385721fffe3SKacheong Poon } else { 386721fffe3SKacheong Poon int usable_r; 387721fffe3SKacheong Poon 388721fffe3SKacheong Poon /* 389721fffe3SKacheong Poon * In the special case when cwnd is zero, which can only 390721fffe3SKacheong Poon * happen if the connection is ECN capable, return now. 
391721fffe3SKacheong Poon * New segments is sent using tcp_timer(). The timer 392721fffe3SKacheong Poon * is set in tcp_input_data(). 393721fffe3SKacheong Poon */ 394721fffe3SKacheong Poon if (tcp->tcp_cwnd == 0) { 395721fffe3SKacheong Poon /* 396721fffe3SKacheong Poon * Note that tcp_cwnd is 0 before 3-way handshake is 397721fffe3SKacheong Poon * finished. 398721fffe3SKacheong Poon */ 399721fffe3SKacheong Poon ASSERT(tcp->tcp_ecn_ok || 400721fffe3SKacheong Poon tcp->tcp_state < TCPS_ESTABLISHED); 401721fffe3SKacheong Poon return; 402721fffe3SKacheong Poon } 403721fffe3SKacheong Poon 404721fffe3SKacheong Poon /* NOTE: trouble if xmitting while SYN not acked? */ 405721fffe3SKacheong Poon usable_r = snxt - tcp->tcp_suna; 406721fffe3SKacheong Poon usable_r = tcp->tcp_swnd - usable_r; 407721fffe3SKacheong Poon 408721fffe3SKacheong Poon /* 409721fffe3SKacheong Poon * Check if the receiver has shrunk the window. If 410721fffe3SKacheong Poon * tcp_wput_data() with NULL mp is called, tcp_fin_sent 411721fffe3SKacheong Poon * cannot be set as there is unsent data, so FIN cannot 412721fffe3SKacheong Poon * be sent out. Otherwise, we need to take into account 413721fffe3SKacheong Poon * of FIN as it consumes an "invisible" sequence number. 414721fffe3SKacheong Poon */ 415721fffe3SKacheong Poon ASSERT(tcp->tcp_fin_sent == 0); 416721fffe3SKacheong Poon if (usable_r < 0) { 417721fffe3SKacheong Poon /* 418721fffe3SKacheong Poon * The receiver has shrunk the window and we have sent 419721fffe3SKacheong Poon * -usable_r date beyond the window, re-adjust. 420721fffe3SKacheong Poon * 421721fffe3SKacheong Poon * If TCP window scaling is enabled, there can be 422721fffe3SKacheong Poon * round down error as the advertised receive window 423721fffe3SKacheong Poon * is actually right shifted n bits. This means that 424721fffe3SKacheong Poon * the lower n bits info is wiped out. It will look 425721fffe3SKacheong Poon * like the window is shrunk. 
Do a check here to 426721fffe3SKacheong Poon * see if the shrunk amount is actually within the 427721fffe3SKacheong Poon * error in window calculation. If it is, just 428721fffe3SKacheong Poon * return. Note that this check is inside the 429721fffe3SKacheong Poon * shrunk window check. This makes sure that even 430721fffe3SKacheong Poon * though tcp_process_shrunk_swnd() is not called, 431721fffe3SKacheong Poon * we will stop further processing. 432721fffe3SKacheong Poon */ 433721fffe3SKacheong Poon if ((-usable_r >> tcp->tcp_snd_ws) > 0) { 434721fffe3SKacheong Poon tcp_process_shrunk_swnd(tcp, -usable_r); 435721fffe3SKacheong Poon } 436721fffe3SKacheong Poon return; 437721fffe3SKacheong Poon } 438721fffe3SKacheong Poon 439721fffe3SKacheong Poon /* usable = MIN(swnd, cwnd) - unacked_bytes */ 440721fffe3SKacheong Poon if (tcp->tcp_swnd > tcp->tcp_cwnd) 441721fffe3SKacheong Poon usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd; 442721fffe3SKacheong Poon 443721fffe3SKacheong Poon /* usable = MIN(usable, unsent) */ 444721fffe3SKacheong Poon if (usable_r > len) 445721fffe3SKacheong Poon usable_r = len; 446721fffe3SKacheong Poon 447721fffe3SKacheong Poon /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 448721fffe3SKacheong Poon if (usable_r > 0) { 449721fffe3SKacheong Poon usable = usable_r; 450721fffe3SKacheong Poon } else { 451721fffe3SKacheong Poon /* Bypass all other unnecessary processing. */ 452721fffe3SKacheong Poon goto done; 453721fffe3SKacheong Poon } 454721fffe3SKacheong Poon } 455721fffe3SKacheong Poon 456721fffe3SKacheong Poon local_time = (mblk_t *)now; 457721fffe3SKacheong Poon 458721fffe3SKacheong Poon /* 459721fffe3SKacheong Poon * "Our" Nagle Algorithm. This is not the same as in the old 460721fffe3SKacheong Poon * BSD. This is more in line with the true intent of Nagle. 461721fffe3SKacheong Poon * 462721fffe3SKacheong Poon * The conditions are: 463721fffe3SKacheong Poon * 1. 
The amount of unsent data (or amount of data which can be 464721fffe3SKacheong Poon * sent, whichever is smaller) is less than Nagle limit. 465721fffe3SKacheong Poon * 2. The last sent size is also less than Nagle limit. 466721fffe3SKacheong Poon * 3. There is unack'ed data. 467721fffe3SKacheong Poon * 4. Urgent pointer is not set. Send urgent data ignoring the 468721fffe3SKacheong Poon * Nagle algorithm. This reduces the probability that urgent 469721fffe3SKacheong Poon * bytes get "merged" together. 470721fffe3SKacheong Poon * 5. The app has not closed the connection. This eliminates the 471721fffe3SKacheong Poon * wait time of the receiving side waiting for the last piece of 472721fffe3SKacheong Poon * (small) data. 473721fffe3SKacheong Poon * 474721fffe3SKacheong Poon * If all are satisified, exit without sending anything. Note 475721fffe3SKacheong Poon * that Nagle limit can be smaller than 1 MSS. Nagle limit is 476721fffe3SKacheong Poon * the smaller of 1 MSS and global tcp_naglim_def (default to be 477721fffe3SKacheong Poon * 4095). 478721fffe3SKacheong Poon */ 479721fffe3SKacheong Poon if (usable < (int)tcp->tcp_naglim && 480721fffe3SKacheong Poon tcp->tcp_naglim > tcp->tcp_last_sent_len && 481721fffe3SKacheong Poon snxt != tcp->tcp_suna && 482721fffe3SKacheong Poon !(tcp->tcp_valid_bits & TCP_URG_VALID) && 483721fffe3SKacheong Poon !(tcp->tcp_valid_bits & TCP_FSS_VALID)) { 484721fffe3SKacheong Poon goto done; 485721fffe3SKacheong Poon } 486721fffe3SKacheong Poon 487721fffe3SKacheong Poon /* 488721fffe3SKacheong Poon * If tcp_zero_win_probe is not set and the tcp->tcp_cork option 489721fffe3SKacheong Poon * is set, then we have to force TCP not to send partial segment 490721fffe3SKacheong Poon * (smaller than MSS bytes). We are calculating the usable now 491721fffe3SKacheong Poon * based on full mss and will save the rest of remaining data for 492721fffe3SKacheong Poon * later. 
When tcp_zero_win_probe is set, TCP needs to send out 493721fffe3SKacheong Poon * something to do zero window probe. 494721fffe3SKacheong Poon */ 495721fffe3SKacheong Poon if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) { 496721fffe3SKacheong Poon if (usable < mss) 497721fffe3SKacheong Poon goto done; 498721fffe3SKacheong Poon usable = (usable / mss) * mss; 499721fffe3SKacheong Poon } 500721fffe3SKacheong Poon 501721fffe3SKacheong Poon /* Update the latest receive window size in TCP header. */ 502721fffe3SKacheong Poon tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 503721fffe3SKacheong Poon 504721fffe3SKacheong Poon /* Send the packet. */ 505721fffe3SKacheong Poon rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, 506721fffe3SKacheong Poon num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 507721fffe3SKacheong Poon local_time); 508721fffe3SKacheong Poon 509721fffe3SKacheong Poon /* Pretend that all we were trying to send really got sent */ 510721fffe3SKacheong Poon if (rc < 0 && tail_unsent < 0) { 511721fffe3SKacheong Poon do { 512721fffe3SKacheong Poon xmit_tail = xmit_tail->b_cont; 513721fffe3SKacheong Poon xmit_tail->b_prev = local_time; 514721fffe3SKacheong Poon ASSERT((uintptr_t)(xmit_tail->b_wptr - 515721fffe3SKacheong Poon xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 516721fffe3SKacheong Poon tail_unsent += (int)(xmit_tail->b_wptr - 517721fffe3SKacheong Poon xmit_tail->b_rptr); 518721fffe3SKacheong Poon } while (tail_unsent < 0); 519721fffe3SKacheong Poon } 520721fffe3SKacheong Poon done:; 521721fffe3SKacheong Poon tcp->tcp_xmit_tail = xmit_tail; 522721fffe3SKacheong Poon tcp->tcp_xmit_tail_unsent = tail_unsent; 523721fffe3SKacheong Poon len = tcp->tcp_snxt - snxt; 524721fffe3SKacheong Poon if (len) { 525721fffe3SKacheong Poon /* 526721fffe3SKacheong Poon * If new data was sent, need to update the notsack 527721fffe3SKacheong Poon * list, which is, afterall, data blocks that have 528721fffe3SKacheong Poon * not been sack'ed by the 
receiver. New data is 529721fffe3SKacheong Poon * not sack'ed. 530721fffe3SKacheong Poon */ 531721fffe3SKacheong Poon if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 532721fffe3SKacheong Poon /* len is a negative value. */ 533721fffe3SKacheong Poon tcp->tcp_pipe -= len; 534721fffe3SKacheong Poon tcp_notsack_update(&(tcp->tcp_notsack_list), 535721fffe3SKacheong Poon tcp->tcp_snxt, snxt, 536721fffe3SKacheong Poon &(tcp->tcp_num_notsack_blk), 537721fffe3SKacheong Poon &(tcp->tcp_cnt_notsack_list)); 538721fffe3SKacheong Poon } 539721fffe3SKacheong Poon tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 540721fffe3SKacheong Poon tcp->tcp_rack = tcp->tcp_rnxt; 541721fffe3SKacheong Poon tcp->tcp_rack_cnt = 0; 542721fffe3SKacheong Poon if ((snxt + len) == tcp->tcp_suna) { 543721fffe3SKacheong Poon TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 544721fffe3SKacheong Poon } 545721fffe3SKacheong Poon } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 546721fffe3SKacheong Poon /* 547721fffe3SKacheong Poon * Didn't send anything. Make sure the timer is running 548721fffe3SKacheong Poon * so that we will probe a zero window. 
549721fffe3SKacheong Poon */ 550721fffe3SKacheong Poon TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 551721fffe3SKacheong Poon } 552721fffe3SKacheong Poon /* Note that len is the amount we just sent but with a negative sign */ 553721fffe3SKacheong Poon tcp->tcp_unsent += len; 554721fffe3SKacheong Poon mutex_enter(&tcp->tcp_non_sq_lock); 555721fffe3SKacheong Poon if (tcp->tcp_flow_stopped) { 556721fffe3SKacheong Poon if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 557721fffe3SKacheong Poon tcp_clrqfull(tcp); 558721fffe3SKacheong Poon } 559721fffe3SKacheong Poon } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) { 560721fffe3SKacheong Poon if (!(tcp->tcp_detached)) 561721fffe3SKacheong Poon tcp_setqfull(tcp); 562721fffe3SKacheong Poon } 563721fffe3SKacheong Poon mutex_exit(&tcp->tcp_non_sq_lock); 564721fffe3SKacheong Poon } 565721fffe3SKacheong Poon 566721fffe3SKacheong Poon /* 567721fffe3SKacheong Poon * Initial STREAMS write side put() procedure for sockets. It tries to 568721fffe3SKacheong Poon * handle the T_CAPABILITY_REQ which sockfs sends down while setting 569721fffe3SKacheong Poon * up the socket without using the squeue. Non T_CAPABILITY_REQ messages 570721fffe3SKacheong Poon * are handled by tcp_wput() as usual. 571721fffe3SKacheong Poon * 572721fffe3SKacheong Poon * All further messages will also be handled by tcp_wput() because we cannot 573721fffe3SKacheong Poon * be sure that the above short cut is safe later. 
574721fffe3SKacheong Poon */ 575721fffe3SKacheong Poon void 576721fffe3SKacheong Poon tcp_wput_sock(queue_t *wq, mblk_t *mp) 577721fffe3SKacheong Poon { 578721fffe3SKacheong Poon conn_t *connp = Q_TO_CONN(wq); 579721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 580721fffe3SKacheong Poon struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; 581721fffe3SKacheong Poon 582721fffe3SKacheong Poon ASSERT(wq->q_qinfo == &tcp_sock_winit); 583721fffe3SKacheong Poon wq->q_qinfo = &tcp_winit; 584721fffe3SKacheong Poon 585721fffe3SKacheong Poon ASSERT(IPCL_IS_TCP(connp)); 586721fffe3SKacheong Poon ASSERT(TCP_IS_SOCKET(tcp)); 587721fffe3SKacheong Poon 588721fffe3SKacheong Poon if (DB_TYPE(mp) == M_PCPROTO && 589721fffe3SKacheong Poon MBLKL(mp) == sizeof (struct T_capability_req) && 590721fffe3SKacheong Poon car->PRIM_type == T_CAPABILITY_REQ) { 591721fffe3SKacheong Poon tcp_capability_req(tcp, mp); 592721fffe3SKacheong Poon return; 593721fffe3SKacheong Poon } 594721fffe3SKacheong Poon 595721fffe3SKacheong Poon tcp_wput(wq, mp); 596721fffe3SKacheong Poon } 597721fffe3SKacheong Poon 598721fffe3SKacheong Poon /* ARGSUSED */ 599721fffe3SKacheong Poon void 600721fffe3SKacheong Poon tcp_wput_fallback(queue_t *wq, mblk_t *mp) 601721fffe3SKacheong Poon { 602721fffe3SKacheong Poon #ifdef DEBUG 603721fffe3SKacheong Poon cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n"); 604721fffe3SKacheong Poon #endif 605721fffe3SKacheong Poon freemsg(mp); 606721fffe3SKacheong Poon } 607721fffe3SKacheong Poon 608721fffe3SKacheong Poon /* 609721fffe3SKacheong Poon * Call by tcp_wput() to handle misc non M_DATA messages. 
610721fffe3SKacheong Poon */ 611721fffe3SKacheong Poon /* ARGSUSED */ 612721fffe3SKacheong Poon static void 613721fffe3SKacheong Poon tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 614721fffe3SKacheong Poon { 615721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 616721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 617721fffe3SKacheong Poon 618721fffe3SKacheong Poon ASSERT(DB_TYPE(mp) != M_IOCTL); 619721fffe3SKacheong Poon /* 620721fffe3SKacheong Poon * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. 621721fffe3SKacheong Poon * Once the close starts, streamhead and sockfs will not let any data 622721fffe3SKacheong Poon * packets come down (close ensures that there are no threads using the 623721fffe3SKacheong Poon * queue and no new threads will come down) but since qprocsoff() 624721fffe3SKacheong Poon * hasn't happened yet, a M_FLUSH or some non data message might 625721fffe3SKacheong Poon * get reflected back (in response to our own FLUSHRW) and get 626721fffe3SKacheong Poon * processed after tcp_close() is done. The conn would still be valid 627721fffe3SKacheong Poon * because a ref would have added but we need to check the state 628721fffe3SKacheong Poon * before actually processing the packet. 
629721fffe3SKacheong Poon */ 630721fffe3SKacheong Poon if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { 631721fffe3SKacheong Poon freemsg(mp); 632721fffe3SKacheong Poon return; 633721fffe3SKacheong Poon } 634721fffe3SKacheong Poon 635721fffe3SKacheong Poon switch (DB_TYPE(mp)) { 636721fffe3SKacheong Poon case M_IOCDATA: 637721fffe3SKacheong Poon tcp_wput_iocdata(tcp, mp); 638721fffe3SKacheong Poon break; 639721fffe3SKacheong Poon case M_FLUSH: 640721fffe3SKacheong Poon tcp_wput_flush(tcp, mp); 641721fffe3SKacheong Poon break; 642721fffe3SKacheong Poon default: 643721fffe3SKacheong Poon ip_wput_nondata(connp->conn_wq, mp); 644721fffe3SKacheong Poon break; 645721fffe3SKacheong Poon } 646721fffe3SKacheong Poon } 647721fffe3SKacheong Poon 648721fffe3SKacheong Poon /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ 649721fffe3SKacheong Poon static void 650721fffe3SKacheong Poon tcp_wput_flush(tcp_t *tcp, mblk_t *mp) 651721fffe3SKacheong Poon { 652721fffe3SKacheong Poon uchar_t fval = *mp->b_rptr; 653721fffe3SKacheong Poon mblk_t *tail; 654721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 655721fffe3SKacheong Poon queue_t *q = connp->conn_wq; 656721fffe3SKacheong Poon 657721fffe3SKacheong Poon /* TODO: How should flush interact with urgent data? */ 658721fffe3SKacheong Poon if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL && 659721fffe3SKacheong Poon !(tcp->tcp_valid_bits & TCP_URG_VALID)) { 660721fffe3SKacheong Poon /* 661721fffe3SKacheong Poon * Flush only data that has not yet been put on the wire. If 662721fffe3SKacheong Poon * we flush data that we have already transmitted, life, as we 663721fffe3SKacheong Poon * know it, may come to an end. 
664721fffe3SKacheong Poon */ 665721fffe3SKacheong Poon tail = tcp->tcp_xmit_tail; 666721fffe3SKacheong Poon tail->b_wptr -= tcp->tcp_xmit_tail_unsent; 667721fffe3SKacheong Poon tcp->tcp_xmit_tail_unsent = 0; 668721fffe3SKacheong Poon tcp->tcp_unsent = 0; 669721fffe3SKacheong Poon if (tail->b_wptr != tail->b_rptr) 670721fffe3SKacheong Poon tail = tail->b_cont; 671721fffe3SKacheong Poon if (tail) { 672721fffe3SKacheong Poon mblk_t **excess = &tcp->tcp_xmit_head; 673721fffe3SKacheong Poon for (;;) { 674721fffe3SKacheong Poon mblk_t *mp1 = *excess; 675721fffe3SKacheong Poon if (mp1 == tail) 676721fffe3SKacheong Poon break; 677721fffe3SKacheong Poon tcp->tcp_xmit_tail = mp1; 678721fffe3SKacheong Poon tcp->tcp_xmit_last = mp1; 679721fffe3SKacheong Poon excess = &mp1->b_cont; 680721fffe3SKacheong Poon } 681721fffe3SKacheong Poon *excess = NULL; 682721fffe3SKacheong Poon tcp_close_mpp(&tail); 683721fffe3SKacheong Poon if (tcp->tcp_snd_zcopy_aware) 684721fffe3SKacheong Poon tcp_zcopy_notify(tcp); 685721fffe3SKacheong Poon } 686721fffe3SKacheong Poon /* 687721fffe3SKacheong Poon * We have no unsent data, so unsent must be less than 688721fffe3SKacheong Poon * conn_sndlowat, so re-enable flow. 689721fffe3SKacheong Poon */ 690721fffe3SKacheong Poon mutex_enter(&tcp->tcp_non_sq_lock); 691721fffe3SKacheong Poon if (tcp->tcp_flow_stopped) { 692721fffe3SKacheong Poon tcp_clrqfull(tcp); 693721fffe3SKacheong Poon } 694721fffe3SKacheong Poon mutex_exit(&tcp->tcp_non_sq_lock); 695721fffe3SKacheong Poon } 696721fffe3SKacheong Poon /* 697721fffe3SKacheong Poon * TODO: you can't just flush these, you have to increase rwnd for one 698721fffe3SKacheong Poon * thing. For another, how should urgent data interact? 
699721fffe3SKacheong Poon */ 700721fffe3SKacheong Poon if (fval & FLUSHR) { 701721fffe3SKacheong Poon *mp->b_rptr = fval & ~FLUSHW; 702721fffe3SKacheong Poon /* XXX */ 703721fffe3SKacheong Poon qreply(q, mp); 704721fffe3SKacheong Poon return; 705721fffe3SKacheong Poon } 706721fffe3SKacheong Poon freemsg(mp); 707721fffe3SKacheong Poon } 708721fffe3SKacheong Poon 709721fffe3SKacheong Poon /* 710721fffe3SKacheong Poon * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA 711721fffe3SKacheong Poon * messages. 712721fffe3SKacheong Poon */ 713721fffe3SKacheong Poon static void 714721fffe3SKacheong Poon tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) 715721fffe3SKacheong Poon { 716721fffe3SKacheong Poon mblk_t *mp1; 717721fffe3SKacheong Poon struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 718721fffe3SKacheong Poon STRUCT_HANDLE(strbuf, sb); 719721fffe3SKacheong Poon uint_t addrlen; 720721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 721721fffe3SKacheong Poon queue_t *q = connp->conn_wq; 722721fffe3SKacheong Poon 723721fffe3SKacheong Poon /* Make sure it is one of ours. */ 724721fffe3SKacheong Poon switch (iocp->ioc_cmd) { 725721fffe3SKacheong Poon case TI_GETMYNAME: 726721fffe3SKacheong Poon case TI_GETPEERNAME: 727721fffe3SKacheong Poon break; 728721fffe3SKacheong Poon default: 729721fffe3SKacheong Poon /* 730721fffe3SKacheong Poon * If the conn is closing, then error the ioctl here. Otherwise 731721fffe3SKacheong Poon * use the CONN_IOCTLREF_* macros to hold off tcp_close until 732721fffe3SKacheong Poon * we're done here. 
733721fffe3SKacheong Poon */ 734721fffe3SKacheong Poon mutex_enter(&connp->conn_lock); 735721fffe3SKacheong Poon if (connp->conn_state_flags & CONN_CLOSING) { 736721fffe3SKacheong Poon mutex_exit(&connp->conn_lock); 737721fffe3SKacheong Poon iocp->ioc_error = EINVAL; 738721fffe3SKacheong Poon mp->b_datap->db_type = M_IOCNAK; 739721fffe3SKacheong Poon iocp->ioc_count = 0; 740721fffe3SKacheong Poon qreply(q, mp); 741721fffe3SKacheong Poon return; 742721fffe3SKacheong Poon } 743721fffe3SKacheong Poon 744721fffe3SKacheong Poon CONN_INC_IOCTLREF_LOCKED(connp); 745721fffe3SKacheong Poon ip_wput_nondata(q, mp); 746721fffe3SKacheong Poon CONN_DEC_IOCTLREF(connp); 747721fffe3SKacheong Poon return; 748721fffe3SKacheong Poon } 749721fffe3SKacheong Poon switch (mi_copy_state(q, mp, &mp1)) { 750721fffe3SKacheong Poon case -1: 751721fffe3SKacheong Poon return; 752721fffe3SKacheong Poon case MI_COPY_CASE(MI_COPY_IN, 1): 753721fffe3SKacheong Poon break; 754721fffe3SKacheong Poon case MI_COPY_CASE(MI_COPY_OUT, 1): 755721fffe3SKacheong Poon /* Copy out the strbuf. */ 756721fffe3SKacheong Poon mi_copyout(q, mp); 757721fffe3SKacheong Poon return; 758721fffe3SKacheong Poon case MI_COPY_CASE(MI_COPY_OUT, 2): 759721fffe3SKacheong Poon /* All done. 
*/ 760721fffe3SKacheong Poon mi_copy_done(q, mp, 0); 761721fffe3SKacheong Poon return; 762721fffe3SKacheong Poon default: 763721fffe3SKacheong Poon mi_copy_done(q, mp, EPROTO); 764721fffe3SKacheong Poon return; 765721fffe3SKacheong Poon } 766721fffe3SKacheong Poon /* Check alignment of the strbuf */ 767721fffe3SKacheong Poon if (!OK_32PTR(mp1->b_rptr)) { 768721fffe3SKacheong Poon mi_copy_done(q, mp, EINVAL); 769721fffe3SKacheong Poon return; 770721fffe3SKacheong Poon } 771721fffe3SKacheong Poon 772721fffe3SKacheong Poon STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); 773721fffe3SKacheong Poon 774721fffe3SKacheong Poon if (connp->conn_family == AF_INET) 775721fffe3SKacheong Poon addrlen = sizeof (sin_t); 776721fffe3SKacheong Poon else 777721fffe3SKacheong Poon addrlen = sizeof (sin6_t); 778721fffe3SKacheong Poon 779721fffe3SKacheong Poon if (STRUCT_FGET(sb, maxlen) < addrlen) { 780721fffe3SKacheong Poon mi_copy_done(q, mp, EINVAL); 781721fffe3SKacheong Poon return; 782721fffe3SKacheong Poon } 783721fffe3SKacheong Poon 784721fffe3SKacheong Poon switch (iocp->ioc_cmd) { 785721fffe3SKacheong Poon case TI_GETMYNAME: 786721fffe3SKacheong Poon break; 787721fffe3SKacheong Poon case TI_GETPEERNAME: 788721fffe3SKacheong Poon if (tcp->tcp_state < TCPS_SYN_RCVD) { 789721fffe3SKacheong Poon mi_copy_done(q, mp, ENOTCONN); 790721fffe3SKacheong Poon return; 791721fffe3SKacheong Poon } 792721fffe3SKacheong Poon break; 793721fffe3SKacheong Poon } 794721fffe3SKacheong Poon mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 795721fffe3SKacheong Poon if (!mp1) 796721fffe3SKacheong Poon return; 797721fffe3SKacheong Poon 798721fffe3SKacheong Poon STRUCT_FSET(sb, len, addrlen); 799721fffe3SKacheong Poon switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 800721fffe3SKacheong Poon case TI_GETMYNAME: 801721fffe3SKacheong Poon (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 802721fffe3SKacheong Poon &addrlen); 803721fffe3SKacheong Poon break; 
804721fffe3SKacheong Poon case TI_GETPEERNAME: 805721fffe3SKacheong Poon (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 806721fffe3SKacheong Poon &addrlen); 807721fffe3SKacheong Poon break; 808721fffe3SKacheong Poon } 809721fffe3SKacheong Poon mp1->b_wptr += addrlen; 810721fffe3SKacheong Poon /* Copy out the address */ 811721fffe3SKacheong Poon mi_copyout(q, mp); 812721fffe3SKacheong Poon } 813721fffe3SKacheong Poon 814721fffe3SKacheong Poon /* 815721fffe3SKacheong Poon * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL 816721fffe3SKacheong Poon * messages. 817721fffe3SKacheong Poon */ 818721fffe3SKacheong Poon /* ARGSUSED */ 819721fffe3SKacheong Poon static void 820721fffe3SKacheong Poon tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 821721fffe3SKacheong Poon { 822721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 823721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 824721fffe3SKacheong Poon queue_t *q = connp->conn_wq; 825721fffe3SKacheong Poon struct iocblk *iocp; 826721fffe3SKacheong Poon 827721fffe3SKacheong Poon ASSERT(DB_TYPE(mp) == M_IOCTL); 828721fffe3SKacheong Poon /* 829721fffe3SKacheong Poon * Try and ASSERT the minimum possible references on the 830721fffe3SKacheong Poon * conn early enough. Since we are executing on write side, 831721fffe3SKacheong Poon * the connection is obviously not detached and that means 832721fffe3SKacheong Poon * there is a ref each for TCP and IP. Since we are behind 833721fffe3SKacheong Poon * the squeue, the minimum references needed are 3. If the 834721fffe3SKacheong Poon * conn is in classifier hash list, there should be an 835721fffe3SKacheong Poon * extra ref for that (we check both the possibilities). 
836721fffe3SKacheong Poon */ 837721fffe3SKacheong Poon ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 838721fffe3SKacheong Poon (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 839721fffe3SKacheong Poon 840721fffe3SKacheong Poon iocp = (struct iocblk *)mp->b_rptr; 841721fffe3SKacheong Poon switch (iocp->ioc_cmd) { 842721fffe3SKacheong Poon case _SIOCSOCKFALLBACK: 843721fffe3SKacheong Poon /* 844721fffe3SKacheong Poon * Either sockmod is about to be popped and the socket 845721fffe3SKacheong Poon * would now be treated as a plain stream, or a module 846721fffe3SKacheong Poon * is about to be pushed so we could no longer use read- 847721fffe3SKacheong Poon * side synchronous streams for fused loopback tcp. 848721fffe3SKacheong Poon * Drain any queued data and disable direct sockfs 849721fffe3SKacheong Poon * interface from now on. 850721fffe3SKacheong Poon */ 851721fffe3SKacheong Poon if (!tcp->tcp_issocket) { 852721fffe3SKacheong Poon DB_TYPE(mp) = M_IOCNAK; 853721fffe3SKacheong Poon iocp->ioc_error = EINVAL; 854721fffe3SKacheong Poon } else { 855721fffe3SKacheong Poon tcp_use_pure_tpi(tcp); 856721fffe3SKacheong Poon DB_TYPE(mp) = M_IOCACK; 857721fffe3SKacheong Poon iocp->ioc_error = 0; 858721fffe3SKacheong Poon } 859721fffe3SKacheong Poon iocp->ioc_count = 0; 860721fffe3SKacheong Poon iocp->ioc_rval = 0; 861721fffe3SKacheong Poon qreply(q, mp); 862721fffe3SKacheong Poon return; 863721fffe3SKacheong Poon } 864721fffe3SKacheong Poon 865721fffe3SKacheong Poon /* 866721fffe3SKacheong Poon * If the conn is closing, then error the ioctl here. Otherwise bump the 867721fffe3SKacheong Poon * conn_ioctlref to hold off tcp_close until we're done here. 
868721fffe3SKacheong Poon */ 869721fffe3SKacheong Poon mutex_enter(&(connp)->conn_lock); 870721fffe3SKacheong Poon if ((connp)->conn_state_flags & CONN_CLOSING) { 871721fffe3SKacheong Poon mutex_exit(&(connp)->conn_lock); 872721fffe3SKacheong Poon iocp->ioc_error = EINVAL; 873721fffe3SKacheong Poon mp->b_datap->db_type = M_IOCNAK; 874721fffe3SKacheong Poon iocp->ioc_count = 0; 875721fffe3SKacheong Poon qreply(q, mp); 876721fffe3SKacheong Poon return; 877721fffe3SKacheong Poon } 878721fffe3SKacheong Poon 879721fffe3SKacheong Poon CONN_INC_IOCTLREF_LOCKED(connp); 880721fffe3SKacheong Poon ip_wput_nondata(q, mp); 881721fffe3SKacheong Poon CONN_DEC_IOCTLREF(connp); 882721fffe3SKacheong Poon } 883721fffe3SKacheong Poon 884721fffe3SKacheong Poon /* 885721fffe3SKacheong Poon * This routine is called by tcp_wput() to handle all TPI requests. 886721fffe3SKacheong Poon */ 887721fffe3SKacheong Poon /* ARGSUSED */ 888721fffe3SKacheong Poon static void 889721fffe3SKacheong Poon tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 890721fffe3SKacheong Poon { 891721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 892721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 893721fffe3SKacheong Poon union T_primitives *tprim = (union T_primitives *)mp->b_rptr; 894721fffe3SKacheong Poon uchar_t *rptr; 895721fffe3SKacheong Poon t_scalar_t type; 896721fffe3SKacheong Poon cred_t *cr; 897721fffe3SKacheong Poon 898721fffe3SKacheong Poon /* 899721fffe3SKacheong Poon * Try and ASSERT the minimum possible references on the 900721fffe3SKacheong Poon * conn early enough. Since we are executing on write side, 901721fffe3SKacheong Poon * the connection is obviously not detached and that means 902721fffe3SKacheong Poon * there is a ref each for TCP and IP. Since we are behind 903721fffe3SKacheong Poon * the squeue, the minimum references needed are 3. 
If the 904721fffe3SKacheong Poon * conn is in classifier hash list, there should be an 905721fffe3SKacheong Poon * extra ref for that (we check both the possibilities). 906721fffe3SKacheong Poon */ 907721fffe3SKacheong Poon ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 908721fffe3SKacheong Poon (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 909721fffe3SKacheong Poon 910721fffe3SKacheong Poon rptr = mp->b_rptr; 911721fffe3SKacheong Poon ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 912721fffe3SKacheong Poon if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 913721fffe3SKacheong Poon type = ((union T_primitives *)rptr)->type; 914721fffe3SKacheong Poon if (type == T_EXDATA_REQ) { 915721fffe3SKacheong Poon tcp_output_urgent(connp, mp, arg2, NULL); 916721fffe3SKacheong Poon } else if (type != T_DATA_REQ) { 917721fffe3SKacheong Poon goto non_urgent_data; 918721fffe3SKacheong Poon } else { 919721fffe3SKacheong Poon /* TODO: options, flags, ... from user */ 920721fffe3SKacheong Poon /* Set length to zero for reclamation below */ 921721fffe3SKacheong Poon tcp_wput_data(tcp, mp->b_cont, B_TRUE); 922721fffe3SKacheong Poon freeb(mp); 923721fffe3SKacheong Poon } 924721fffe3SKacheong Poon return; 925721fffe3SKacheong Poon } else { 926721fffe3SKacheong Poon if (connp->conn_debug) { 927721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 928721fffe3SKacheong Poon "tcp_wput_proto, dropping one..."); 929721fffe3SKacheong Poon } 930721fffe3SKacheong Poon freemsg(mp); 931721fffe3SKacheong Poon return; 932721fffe3SKacheong Poon } 933721fffe3SKacheong Poon 934721fffe3SKacheong Poon non_urgent_data: 935721fffe3SKacheong Poon 936721fffe3SKacheong Poon switch ((int)tprim->type) { 937721fffe3SKacheong Poon case O_T_BIND_REQ: /* bind request */ 938721fffe3SKacheong Poon case T_BIND_REQ: /* new semantics bind request */ 939721fffe3SKacheong Poon tcp_tpi_bind(tcp, mp); 940721fffe3SKacheong Poon break; 941721fffe3SKacheong Poon case 
T_UNBIND_REQ: /* unbind request */ 942721fffe3SKacheong Poon tcp_tpi_unbind(tcp, mp); 943721fffe3SKacheong Poon break; 944721fffe3SKacheong Poon case O_T_CONN_RES: /* old connection response XXX */ 945721fffe3SKacheong Poon case T_CONN_RES: /* connection response */ 946721fffe3SKacheong Poon tcp_tli_accept(tcp, mp); 947721fffe3SKacheong Poon break; 948721fffe3SKacheong Poon case T_CONN_REQ: /* connection request */ 949721fffe3SKacheong Poon tcp_tpi_connect(tcp, mp); 950721fffe3SKacheong Poon break; 951721fffe3SKacheong Poon case T_DISCON_REQ: /* disconnect request */ 952721fffe3SKacheong Poon tcp_disconnect(tcp, mp); 953721fffe3SKacheong Poon break; 954721fffe3SKacheong Poon case T_CAPABILITY_REQ: 955721fffe3SKacheong Poon tcp_capability_req(tcp, mp); /* capability request */ 956721fffe3SKacheong Poon break; 957721fffe3SKacheong Poon case T_INFO_REQ: /* information request */ 958721fffe3SKacheong Poon tcp_info_req(tcp, mp); 959721fffe3SKacheong Poon break; 960721fffe3SKacheong Poon case T_SVR4_OPTMGMT_REQ: /* manage options req */ 961721fffe3SKacheong Poon case T_OPTMGMT_REQ: 962721fffe3SKacheong Poon /* 963721fffe3SKacheong Poon * Note: no support for snmpcom_req() through new 964721fffe3SKacheong Poon * T_OPTMGMT_REQ. See comments in ip.c 965721fffe3SKacheong Poon */ 966721fffe3SKacheong Poon 967721fffe3SKacheong Poon /* 968721fffe3SKacheong Poon * All Solaris components should pass a db_credp 969721fffe3SKacheong Poon * for this TPI message, hence we ASSERT. 970721fffe3SKacheong Poon * But in case there is some other M_PROTO that looks 971721fffe3SKacheong Poon * like a TPI message sent by some other kernel 972721fffe3SKacheong Poon * component, we check and return an error. 
973721fffe3SKacheong Poon */ 974721fffe3SKacheong Poon cr = msg_getcred(mp, NULL); 975721fffe3SKacheong Poon ASSERT(cr != NULL); 976721fffe3SKacheong Poon if (cr == NULL) { 977721fffe3SKacheong Poon tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 978721fffe3SKacheong Poon return; 979721fffe3SKacheong Poon } 980721fffe3SKacheong Poon /* 981721fffe3SKacheong Poon * If EINPROGRESS is returned, the request has been queued 982721fffe3SKacheong Poon * for subsequent processing by ip_restart_optmgmt(), which 983721fffe3SKacheong Poon * will do the CONN_DEC_REF(). 984721fffe3SKacheong Poon */ 985721fffe3SKacheong Poon if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) { 986721fffe3SKacheong Poon svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); 987721fffe3SKacheong Poon } else { 988721fffe3SKacheong Poon tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); 989721fffe3SKacheong Poon } 990721fffe3SKacheong Poon break; 991721fffe3SKacheong Poon 992721fffe3SKacheong Poon case T_UNITDATA_REQ: /* unitdata request */ 993721fffe3SKacheong Poon tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 994721fffe3SKacheong Poon break; 995721fffe3SKacheong Poon case T_ORDREL_REQ: /* orderly release req */ 996721fffe3SKacheong Poon freemsg(mp); 997721fffe3SKacheong Poon 998721fffe3SKacheong Poon if (tcp->tcp_fused) 999721fffe3SKacheong Poon tcp_unfuse(tcp); 1000721fffe3SKacheong Poon 1001721fffe3SKacheong Poon if (tcp_xmit_end(tcp) != 0) { 1002721fffe3SKacheong Poon /* 1003721fffe3SKacheong Poon * We were crossing FINs and got a reset from 1004721fffe3SKacheong Poon * the other side. Just ignore it. 
1005721fffe3SKacheong Poon */ 1006721fffe3SKacheong Poon if (connp->conn_debug) { 1007721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, 1008721fffe3SKacheong Poon SL_ERROR|SL_TRACE, 1009721fffe3SKacheong Poon "tcp_wput_proto, T_ORDREL_REQ out of " 1010721fffe3SKacheong Poon "state %s", 1011721fffe3SKacheong Poon tcp_display(tcp, NULL, 1012721fffe3SKacheong Poon DISP_ADDR_AND_PORT)); 1013721fffe3SKacheong Poon } 1014721fffe3SKacheong Poon } 1015721fffe3SKacheong Poon break; 1016721fffe3SKacheong Poon case T_ADDR_REQ: 1017721fffe3SKacheong Poon tcp_addr_req(tcp, mp); 1018721fffe3SKacheong Poon break; 1019721fffe3SKacheong Poon default: 1020721fffe3SKacheong Poon if (connp->conn_debug) { 1021721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 1022721fffe3SKacheong Poon "tcp_wput_proto, bogus TPI msg, type %d", 1023721fffe3SKacheong Poon tprim->type); 1024721fffe3SKacheong Poon } 1025721fffe3SKacheong Poon /* 1026721fffe3SKacheong Poon * We used to M_ERROR. Sending TNOTSUPPORT gives the user 1027721fffe3SKacheong Poon * to recover. 1028721fffe3SKacheong Poon */ 1029721fffe3SKacheong Poon tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 1030721fffe3SKacheong Poon break; 1031721fffe3SKacheong Poon } 1032721fffe3SKacheong Poon } 1033721fffe3SKacheong Poon 1034721fffe3SKacheong Poon /* 1035721fffe3SKacheong Poon * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
1036721fffe3SKacheong Poon */ 1037721fffe3SKacheong Poon static void 1038721fffe3SKacheong Poon tcp_wput_cmdblk(queue_t *q, mblk_t *mp) 1039721fffe3SKacheong Poon { 1040721fffe3SKacheong Poon void *data; 1041721fffe3SKacheong Poon mblk_t *datamp = mp->b_cont; 1042721fffe3SKacheong Poon conn_t *connp = Q_TO_CONN(q); 1043721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 1044721fffe3SKacheong Poon cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; 1045721fffe3SKacheong Poon 1046721fffe3SKacheong Poon if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { 1047721fffe3SKacheong Poon cmdp->cb_error = EPROTO; 1048721fffe3SKacheong Poon qreply(q, mp); 1049721fffe3SKacheong Poon return; 1050721fffe3SKacheong Poon } 1051721fffe3SKacheong Poon 1052721fffe3SKacheong Poon data = datamp->b_rptr; 1053721fffe3SKacheong Poon 1054721fffe3SKacheong Poon switch (cmdp->cb_cmd) { 1055721fffe3SKacheong Poon case TI_GETPEERNAME: 1056721fffe3SKacheong Poon if (tcp->tcp_state < TCPS_SYN_RCVD) 1057721fffe3SKacheong Poon cmdp->cb_error = ENOTCONN; 1058721fffe3SKacheong Poon else 1059721fffe3SKacheong Poon cmdp->cb_error = conn_getpeername(connp, data, 1060721fffe3SKacheong Poon &cmdp->cb_len); 1061721fffe3SKacheong Poon break; 1062721fffe3SKacheong Poon case TI_GETMYNAME: 1063721fffe3SKacheong Poon cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); 1064721fffe3SKacheong Poon break; 1065721fffe3SKacheong Poon default: 1066721fffe3SKacheong Poon cmdp->cb_error = EINVAL; 1067721fffe3SKacheong Poon break; 1068721fffe3SKacheong Poon } 1069721fffe3SKacheong Poon 1070721fffe3SKacheong Poon qreply(q, mp); 1071721fffe3SKacheong Poon } 1072721fffe3SKacheong Poon 1073721fffe3SKacheong Poon /* 1074721fffe3SKacheong Poon * The TCP fast path write put procedure. 
1075721fffe3SKacheong Poon * NOTE: the logic of the fast path is duplicated from tcp_wput_data() 1076721fffe3SKacheong Poon */ 1077721fffe3SKacheong Poon /* ARGSUSED */ 1078721fffe3SKacheong Poon void 1079721fffe3SKacheong Poon tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1080721fffe3SKacheong Poon { 1081721fffe3SKacheong Poon int len; 1082721fffe3SKacheong Poon int hdrlen; 1083721fffe3SKacheong Poon int plen; 1084721fffe3SKacheong Poon mblk_t *mp1; 1085721fffe3SKacheong Poon uchar_t *rptr; 1086721fffe3SKacheong Poon uint32_t snxt; 1087721fffe3SKacheong Poon tcpha_t *tcpha; 1088721fffe3SKacheong Poon struct datab *db; 1089721fffe3SKacheong Poon uint32_t suna; 1090721fffe3SKacheong Poon uint32_t mss; 1091721fffe3SKacheong Poon ipaddr_t *dst; 1092721fffe3SKacheong Poon ipaddr_t *src; 1093721fffe3SKacheong Poon uint32_t sum; 1094721fffe3SKacheong Poon int usable; 1095721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 1096721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 1097721fffe3SKacheong Poon uint32_t msize; 1098721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 1099721fffe3SKacheong Poon ip_xmit_attr_t *ixa; 1100721fffe3SKacheong Poon clock_t now; 1101721fffe3SKacheong Poon 1102721fffe3SKacheong Poon /* 1103721fffe3SKacheong Poon * Try and ASSERT the minimum possible references on the 1104721fffe3SKacheong Poon * conn early enough. Since we are executing on write side, 1105721fffe3SKacheong Poon * the connection is obviously not detached and that means 1106721fffe3SKacheong Poon * there is a ref each for TCP and IP. Since we are behind 1107721fffe3SKacheong Poon * the squeue, the minimum references needed are 3. If the 1108721fffe3SKacheong Poon * conn is in classifier hash list, there should be an 1109721fffe3SKacheong Poon * extra ref for that (we check both the possibilities). 
1110721fffe3SKacheong Poon */ 1111721fffe3SKacheong Poon ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 1112721fffe3SKacheong Poon (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 1113721fffe3SKacheong Poon 1114721fffe3SKacheong Poon ASSERT(DB_TYPE(mp) == M_DATA); 1115721fffe3SKacheong Poon msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 1116721fffe3SKacheong Poon 1117721fffe3SKacheong Poon mutex_enter(&tcp->tcp_non_sq_lock); 1118721fffe3SKacheong Poon tcp->tcp_squeue_bytes -= msize; 1119721fffe3SKacheong Poon mutex_exit(&tcp->tcp_non_sq_lock); 1120721fffe3SKacheong Poon 1121721fffe3SKacheong Poon /* Bypass tcp protocol for fused tcp loopback */ 1122721fffe3SKacheong Poon if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 1123721fffe3SKacheong Poon return; 1124721fffe3SKacheong Poon 1125721fffe3SKacheong Poon mss = tcp->tcp_mss; 1126721fffe3SKacheong Poon /* 1127721fffe3SKacheong Poon * If ZEROCOPY has turned off, try not to send any zero-copy message 1128721fffe3SKacheong Poon * down. Do backoff, now. 1129721fffe3SKacheong Poon */ 1130721fffe3SKacheong Poon if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on) 1131721fffe3SKacheong Poon mp = tcp_zcopy_backoff(tcp, mp, B_FALSE); 1132721fffe3SKacheong Poon 1133721fffe3SKacheong Poon 1134721fffe3SKacheong Poon ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 1135721fffe3SKacheong Poon len = (int)(mp->b_wptr - mp->b_rptr); 1136721fffe3SKacheong Poon 1137721fffe3SKacheong Poon /* 1138721fffe3SKacheong Poon * Criteria for fast path: 1139721fffe3SKacheong Poon * 1140721fffe3SKacheong Poon * 1. no unsent data 1141721fffe3SKacheong Poon * 2. single mblk in request 1142721fffe3SKacheong Poon * 3. connection established 1143721fffe3SKacheong Poon * 4. data in mblk 1144721fffe3SKacheong Poon * 5. len <= mss 1145721fffe3SKacheong Poon * 6. 
no tcp_valid bits 1146721fffe3SKacheong Poon */ 1147721fffe3SKacheong Poon if ((tcp->tcp_unsent != 0) || 1148721fffe3SKacheong Poon (tcp->tcp_cork) || 1149721fffe3SKacheong Poon (mp->b_cont != NULL) || 1150721fffe3SKacheong Poon (tcp->tcp_state != TCPS_ESTABLISHED) || 1151721fffe3SKacheong Poon (len == 0) || 1152721fffe3SKacheong Poon (len > mss) || 1153721fffe3SKacheong Poon (tcp->tcp_valid_bits != 0)) { 1154721fffe3SKacheong Poon tcp_wput_data(tcp, mp, B_FALSE); 1155721fffe3SKacheong Poon return; 1156721fffe3SKacheong Poon } 1157721fffe3SKacheong Poon 1158721fffe3SKacheong Poon ASSERT(tcp->tcp_xmit_tail_unsent == 0); 1159721fffe3SKacheong Poon ASSERT(tcp->tcp_fin_sent == 0); 1160721fffe3SKacheong Poon 1161721fffe3SKacheong Poon /* queue new packet onto retransmission queue */ 1162721fffe3SKacheong Poon if (tcp->tcp_xmit_head == NULL) { 1163721fffe3SKacheong Poon tcp->tcp_xmit_head = mp; 1164721fffe3SKacheong Poon } else { 1165721fffe3SKacheong Poon tcp->tcp_xmit_last->b_cont = mp; 1166721fffe3SKacheong Poon } 1167721fffe3SKacheong Poon tcp->tcp_xmit_last = mp; 1168721fffe3SKacheong Poon tcp->tcp_xmit_tail = mp; 1169721fffe3SKacheong Poon 1170721fffe3SKacheong Poon /* find out how much we can send */ 1171721fffe3SKacheong Poon /* BEGIN CSTYLED */ 1172721fffe3SKacheong Poon /* 1173721fffe3SKacheong Poon * un-acked usable 1174721fffe3SKacheong Poon * |--------------|-----------------| 1175721fffe3SKacheong Poon * tcp_suna tcp_snxt tcp_suna+tcp_swnd 1176721fffe3SKacheong Poon */ 1177721fffe3SKacheong Poon /* END CSTYLED */ 1178721fffe3SKacheong Poon 1179721fffe3SKacheong Poon /* start sending from tcp_snxt */ 1180721fffe3SKacheong Poon snxt = tcp->tcp_snxt; 1181721fffe3SKacheong Poon 1182721fffe3SKacheong Poon /* 1183721fffe3SKacheong Poon * Check to see if this connection has been idled for some 1184721fffe3SKacheong Poon * time and no ACK is expected. 
If it is, we need to slow 1185721fffe3SKacheong Poon * start again to get back the connection's "self-clock" as 1186721fffe3SKacheong Poon * described in VJ's paper. 1187721fffe3SKacheong Poon * 1188721fffe3SKacheong Poon * Reinitialize tcp_cwnd after idle. 1189721fffe3SKacheong Poon */ 1190721fffe3SKacheong Poon now = LBOLT_FASTPATH; 1191721fffe3SKacheong Poon if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 1192721fffe3SKacheong Poon (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 1193721fffe3SKacheong Poon TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); 1194721fffe3SKacheong Poon } 1195721fffe3SKacheong Poon 1196721fffe3SKacheong Poon usable = tcp->tcp_swnd; /* tcp window size */ 1197721fffe3SKacheong Poon if (usable > tcp->tcp_cwnd) 1198721fffe3SKacheong Poon usable = tcp->tcp_cwnd; /* congestion window smaller */ 1199721fffe3SKacheong Poon usable -= snxt; /* subtract stuff already sent */ 1200721fffe3SKacheong Poon suna = tcp->tcp_suna; 1201721fffe3SKacheong Poon usable += suna; 1202721fffe3SKacheong Poon /* usable can be < 0 if the congestion window is smaller */ 1203721fffe3SKacheong Poon if (len > usable) { 1204721fffe3SKacheong Poon /* Can't send complete M_DATA in one shot */ 1205721fffe3SKacheong Poon goto slow; 1206721fffe3SKacheong Poon } 1207721fffe3SKacheong Poon 1208721fffe3SKacheong Poon mutex_enter(&tcp->tcp_non_sq_lock); 1209721fffe3SKacheong Poon if (tcp->tcp_flow_stopped && 1210721fffe3SKacheong Poon TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 1211721fffe3SKacheong Poon tcp_clrqfull(tcp); 1212721fffe3SKacheong Poon } 1213721fffe3SKacheong Poon mutex_exit(&tcp->tcp_non_sq_lock); 1214721fffe3SKacheong Poon 1215721fffe3SKacheong Poon /* 1216721fffe3SKacheong Poon * determine if anything to send (Nagle). 1217721fffe3SKacheong Poon * 1218721fffe3SKacheong Poon * 1. len < tcp_mss (i.e. small) 1219721fffe3SKacheong Poon * 2. unacknowledged data present 1220721fffe3SKacheong Poon * 3. 
len < nagle limit 1221721fffe3SKacheong Poon * 4. last packet sent < nagle limit (previous packet sent) 1222721fffe3SKacheong Poon */ 1223721fffe3SKacheong Poon if ((len < mss) && (snxt != suna) && 1224721fffe3SKacheong Poon (len < (int)tcp->tcp_naglim) && 1225721fffe3SKacheong Poon (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { 1226721fffe3SKacheong Poon /* 1227721fffe3SKacheong Poon * This was the first unsent packet and normally 1228721fffe3SKacheong Poon * mss < xmit_hiwater so there is no need to worry 1229721fffe3SKacheong Poon * about flow control. The next packet will go 1230721fffe3SKacheong Poon * through the flow control check in tcp_wput_data(). 1231721fffe3SKacheong Poon */ 1232721fffe3SKacheong Poon /* leftover work from above */ 1233721fffe3SKacheong Poon tcp->tcp_unsent = len; 1234721fffe3SKacheong Poon tcp->tcp_xmit_tail_unsent = len; 1235721fffe3SKacheong Poon 1236721fffe3SKacheong Poon return; 1237721fffe3SKacheong Poon } 1238721fffe3SKacheong Poon 1239721fffe3SKacheong Poon /* 1240721fffe3SKacheong Poon * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can 1241721fffe3SKacheong Poon * send now. 
1242721fffe3SKacheong Poon */ 1243721fffe3SKacheong Poon 1244721fffe3SKacheong Poon if (snxt == suna) { 1245721fffe3SKacheong Poon TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 1246721fffe3SKacheong Poon } 1247721fffe3SKacheong Poon 1248721fffe3SKacheong Poon /* we have always sent something */ 1249721fffe3SKacheong Poon tcp->tcp_rack_cnt = 0; 1250721fffe3SKacheong Poon 1251721fffe3SKacheong Poon tcp->tcp_snxt = snxt + len; 1252721fffe3SKacheong Poon tcp->tcp_rack = tcp->tcp_rnxt; 1253721fffe3SKacheong Poon 1254721fffe3SKacheong Poon if ((mp1 = dupb(mp)) == 0) 1255721fffe3SKacheong Poon goto no_memory; 1256721fffe3SKacheong Poon mp->b_prev = (mblk_t *)(uintptr_t)now; 1257721fffe3SKacheong Poon mp->b_next = (mblk_t *)(uintptr_t)snxt; 1258721fffe3SKacheong Poon 1259721fffe3SKacheong Poon /* adjust tcp header information */ 1260721fffe3SKacheong Poon tcpha = tcp->tcp_tcpha; 1261721fffe3SKacheong Poon tcpha->tha_flags = (TH_ACK|TH_PUSH); 1262721fffe3SKacheong Poon 1263721fffe3SKacheong Poon sum = len + connp->conn_ht_ulp_len + connp->conn_sum; 1264721fffe3SKacheong Poon sum = (sum >> 16) + (sum & 0xFFFF); 1265721fffe3SKacheong Poon tcpha->tha_sum = htons(sum); 1266721fffe3SKacheong Poon 1267721fffe3SKacheong Poon tcpha->tha_seq = htonl(snxt); 1268721fffe3SKacheong Poon 1269721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutDataSegs); 1270721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); 1271721fffe3SKacheong Poon BUMP_LOCAL(tcp->tcp_obsegs); 1272721fffe3SKacheong Poon 1273721fffe3SKacheong Poon /* Update the latest receive window size in TCP header. 
*/ 1274721fffe3SKacheong Poon tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 1275721fffe3SKacheong Poon 1276721fffe3SKacheong Poon tcp->tcp_last_sent_len = (ushort_t)len; 1277721fffe3SKacheong Poon 1278721fffe3SKacheong Poon plen = len + connp->conn_ht_iphc_len; 1279721fffe3SKacheong Poon 1280721fffe3SKacheong Poon ixa = connp->conn_ixa; 1281721fffe3SKacheong Poon ixa->ixa_pktlen = plen; 1282721fffe3SKacheong Poon 1283721fffe3SKacheong Poon if (ixa->ixa_flags & IXAF_IS_IPV4) { 1284721fffe3SKacheong Poon tcp->tcp_ipha->ipha_length = htons(plen); 1285721fffe3SKacheong Poon } else { 1286721fffe3SKacheong Poon tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN); 1287721fffe3SKacheong Poon } 1288721fffe3SKacheong Poon 1289721fffe3SKacheong Poon /* see if we need to allocate a mblk for the headers */ 1290721fffe3SKacheong Poon hdrlen = connp->conn_ht_iphc_len; 1291721fffe3SKacheong Poon rptr = mp1->b_rptr - hdrlen; 1292721fffe3SKacheong Poon db = mp1->b_datap; 1293721fffe3SKacheong Poon if ((db->db_ref != 2) || rptr < db->db_base || 1294721fffe3SKacheong Poon (!OK_32PTR(rptr))) { 1295721fffe3SKacheong Poon /* NOTE: we assume allocb returns an OK_32PTR */ 1296721fffe3SKacheong Poon mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED); 1297721fffe3SKacheong Poon if (!mp) { 1298721fffe3SKacheong Poon freemsg(mp1); 1299721fffe3SKacheong Poon goto no_memory; 1300721fffe3SKacheong Poon } 1301721fffe3SKacheong Poon mp->b_cont = mp1; 1302721fffe3SKacheong Poon mp1 = mp; 1303721fffe3SKacheong Poon /* Leave room for Link Level header */ 1304721fffe3SKacheong Poon rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; 1305721fffe3SKacheong Poon mp1->b_wptr = &rptr[hdrlen]; 1306721fffe3SKacheong Poon } 1307721fffe3SKacheong Poon mp1->b_rptr = rptr; 1308721fffe3SKacheong Poon 1309721fffe3SKacheong Poon /* Fill in the timestamp option. 
*/ 1310721fffe3SKacheong Poon if (tcp->tcp_snd_ts_ok) { 1311721fffe3SKacheong Poon uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 1312721fffe3SKacheong Poon 1313721fffe3SKacheong Poon U32_TO_BE32(llbolt, 1314721fffe3SKacheong Poon (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 1315721fffe3SKacheong Poon U32_TO_BE32(tcp->tcp_ts_recent, 1316721fffe3SKacheong Poon (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 1317721fffe3SKacheong Poon } else { 1318721fffe3SKacheong Poon ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); 1319721fffe3SKacheong Poon } 1320721fffe3SKacheong Poon 1321721fffe3SKacheong Poon /* copy header into outgoing packet */ 1322721fffe3SKacheong Poon dst = (ipaddr_t *)rptr; 1323721fffe3SKacheong Poon src = (ipaddr_t *)connp->conn_ht_iphc; 1324721fffe3SKacheong Poon dst[0] = src[0]; 1325721fffe3SKacheong Poon dst[1] = src[1]; 1326721fffe3SKacheong Poon dst[2] = src[2]; 1327721fffe3SKacheong Poon dst[3] = src[3]; 1328721fffe3SKacheong Poon dst[4] = src[4]; 1329721fffe3SKacheong Poon dst[5] = src[5]; 1330721fffe3SKacheong Poon dst[6] = src[6]; 1331721fffe3SKacheong Poon dst[7] = src[7]; 1332721fffe3SKacheong Poon dst[8] = src[8]; 1333721fffe3SKacheong Poon dst[9] = src[9]; 1334721fffe3SKacheong Poon if (hdrlen -= 40) { 1335721fffe3SKacheong Poon hdrlen >>= 2; 1336721fffe3SKacheong Poon dst += 10; 1337721fffe3SKacheong Poon src += 10; 1338721fffe3SKacheong Poon do { 1339721fffe3SKacheong Poon *dst++ = *src++; 1340721fffe3SKacheong Poon } while (--hdrlen); 1341721fffe3SKacheong Poon } 1342721fffe3SKacheong Poon 1343721fffe3SKacheong Poon /* 1344721fffe3SKacheong Poon * Set the ECN info in the TCP header. Note that this 1345721fffe3SKacheong Poon * is not the template header. 
1346721fffe3SKacheong Poon */ 1347721fffe3SKacheong Poon if (tcp->tcp_ecn_ok) { 1348721fffe3SKacheong Poon TCP_SET_ECT(tcp, rptr); 1349721fffe3SKacheong Poon 1350721fffe3SKacheong Poon tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length); 1351721fffe3SKacheong Poon if (tcp->tcp_ecn_echo_on) 1352721fffe3SKacheong Poon tcpha->tha_flags |= TH_ECE; 1353721fffe3SKacheong Poon if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 1354721fffe3SKacheong Poon tcpha->tha_flags |= TH_CWR; 1355721fffe3SKacheong Poon tcp->tcp_ecn_cwr_sent = B_TRUE; 1356721fffe3SKacheong Poon } 1357721fffe3SKacheong Poon } 1358721fffe3SKacheong Poon 1359721fffe3SKacheong Poon if (tcp->tcp_ip_forward_progress) { 1360721fffe3SKacheong Poon tcp->tcp_ip_forward_progress = B_FALSE; 1361721fffe3SKacheong Poon connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 1362721fffe3SKacheong Poon } else { 1363721fffe3SKacheong Poon connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 1364721fffe3SKacheong Poon } 1365721fffe3SKacheong Poon tcp_send_data(tcp, mp1); 1366721fffe3SKacheong Poon return; 1367721fffe3SKacheong Poon 1368721fffe3SKacheong Poon /* 1369721fffe3SKacheong Poon * If we ran out of memory, we pretend to have sent the packet 1370721fffe3SKacheong Poon * and that it was lost on the wire. 
1371721fffe3SKacheong Poon */ 1372721fffe3SKacheong Poon no_memory: 1373721fffe3SKacheong Poon return; 1374721fffe3SKacheong Poon 1375721fffe3SKacheong Poon slow: 1376721fffe3SKacheong Poon /* leftover work from above */ 1377721fffe3SKacheong Poon tcp->tcp_unsent = len; 1378721fffe3SKacheong Poon tcp->tcp_xmit_tail_unsent = len; 1379721fffe3SKacheong Poon tcp_wput_data(tcp, NULL, B_FALSE); 1380721fffe3SKacheong Poon } 1381721fffe3SKacheong Poon 1382721fffe3SKacheong Poon /* ARGSUSED2 */ 1383721fffe3SKacheong Poon void 1384721fffe3SKacheong Poon tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1385721fffe3SKacheong Poon { 1386721fffe3SKacheong Poon int len; 1387721fffe3SKacheong Poon uint32_t msize; 1388721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 1389721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 1390721fffe3SKacheong Poon 1391721fffe3SKacheong Poon msize = msgdsize(mp); 1392721fffe3SKacheong Poon 1393721fffe3SKacheong Poon len = msize - 1; 1394721fffe3SKacheong Poon if (len < 0) { 1395721fffe3SKacheong Poon freemsg(mp); 1396721fffe3SKacheong Poon return; 1397721fffe3SKacheong Poon } 1398721fffe3SKacheong Poon 1399721fffe3SKacheong Poon /* 1400721fffe3SKacheong Poon * Try to force urgent data out on the wire. Even if we have unsent 1401721fffe3SKacheong Poon * data this will at least send the urgent flag. 1402721fffe3SKacheong Poon * XXX does not handle more flag correctly. 
1403721fffe3SKacheong Poon */ 1404721fffe3SKacheong Poon len += tcp->tcp_unsent; 1405721fffe3SKacheong Poon len += tcp->tcp_snxt; 1406721fffe3SKacheong Poon tcp->tcp_urg = len; 1407721fffe3SKacheong Poon tcp->tcp_valid_bits |= TCP_URG_VALID; 1408721fffe3SKacheong Poon 1409721fffe3SKacheong Poon /* Bypass tcp protocol for fused tcp loopback */ 1410721fffe3SKacheong Poon if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 1411721fffe3SKacheong Poon return; 1412721fffe3SKacheong Poon 1413721fffe3SKacheong Poon /* Strip off the T_EXDATA_REQ if the data is from TPI */ 1414721fffe3SKacheong Poon if (DB_TYPE(mp) != M_DATA) { 1415721fffe3SKacheong Poon mblk_t *mp1 = mp; 1416721fffe3SKacheong Poon ASSERT(!IPCL_IS_NONSTR(connp)); 1417721fffe3SKacheong Poon mp = mp->b_cont; 1418721fffe3SKacheong Poon freeb(mp1); 1419721fffe3SKacheong Poon } 1420721fffe3SKacheong Poon tcp_wput_data(tcp, mp, B_TRUE); 1421721fffe3SKacheong Poon } 1422721fffe3SKacheong Poon 1423721fffe3SKacheong Poon /* 1424721fffe3SKacheong Poon * Called by streams close routine via squeues when our client blows off her 1425721fffe3SKacheong Poon * descriptor, we take this to mean: "close the stream state NOW, close the tcp 1426721fffe3SKacheong Poon * connection politely" When SO_LINGER is set (with a non-zero linger time and 1427721fffe3SKacheong Poon * it is not a nonblocking socket) then this routine sleeps until the FIN is 1428721fffe3SKacheong Poon * acked. 1429721fffe3SKacheong Poon * 1430721fffe3SKacheong Poon * NOTE: tcp_close potentially returns error when lingering. 1431721fffe3SKacheong Poon * However, the stream head currently does not pass these errors 1432721fffe3SKacheong Poon * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK 1433721fffe3SKacheong Poon * errors to the application (from tsleep()) and not errors 1434721fffe3SKacheong Poon * like ECONNRESET caused by receiving a reset packet. 
1435721fffe3SKacheong Poon */ 1436721fffe3SKacheong Poon 1437721fffe3SKacheong Poon /* ARGSUSED */ 1438721fffe3SKacheong Poon void 1439721fffe3SKacheong Poon tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1440721fffe3SKacheong Poon { 1441721fffe3SKacheong Poon char *msg; 1442721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 1443721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 1444721fffe3SKacheong Poon clock_t delta = 0; 1445721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 1446721fffe3SKacheong Poon 14473e95bd4aSAnders Persson /* 14483e95bd4aSAnders Persson * When a non-STREAMS socket is being closed, it does not always 14493e95bd4aSAnders Persson * stick around waiting for tcp_close_output to run and can therefore 14503e95bd4aSAnders Persson * have dropped a reference already. So adjust the asserts accordingly. 14513e95bd4aSAnders Persson */ 14523e95bd4aSAnders Persson ASSERT((connp->conn_fanout != NULL && 14533e95bd4aSAnders Persson connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) || 14543e95bd4aSAnders Persson (connp->conn_fanout == NULL && 14553e95bd4aSAnders Persson connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3))); 1456721fffe3SKacheong Poon 1457721fffe3SKacheong Poon mutex_enter(&tcp->tcp_eager_lock); 1458721fffe3SKacheong Poon if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 14593e95bd4aSAnders Persson /* 14603e95bd4aSAnders Persson * Cleanup for listener. For non-STREAM sockets sockfs will 14613e95bd4aSAnders Persson * close all the eagers on 'q', so in that case only deal 14623e95bd4aSAnders Persson * with 'q0'. 14633e95bd4aSAnders Persson */ 14643e95bd4aSAnders Persson tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 
1 : 0); 1465721fffe3SKacheong Poon tcp->tcp_wait_for_eagers = 1; 1466721fffe3SKacheong Poon } 1467721fffe3SKacheong Poon mutex_exit(&tcp->tcp_eager_lock); 1468721fffe3SKacheong Poon 1469721fffe3SKacheong Poon tcp->tcp_lso = B_FALSE; 1470721fffe3SKacheong Poon 1471721fffe3SKacheong Poon msg = NULL; 1472721fffe3SKacheong Poon switch (tcp->tcp_state) { 1473721fffe3SKacheong Poon case TCPS_CLOSED: 1474721fffe3SKacheong Poon case TCPS_IDLE: 1475b1cd7879SAnders Persson break; 1476721fffe3SKacheong Poon case TCPS_BOUND: 1477b1cd7879SAnders Persson if (tcp->tcp_listener != NULL) { 1478b1cd7879SAnders Persson ASSERT(IPCL_IS_NONSTR(connp)); 1479b1cd7879SAnders Persson /* 1480b1cd7879SAnders Persson * Unlink from the listener and drop the reference 1481b1cd7879SAnders Persson * put on it by the eager. tcp_closei_local will not 1482b1cd7879SAnders Persson * do it because tcp_tconnind_started is TRUE. 1483b1cd7879SAnders Persson */ 1484b1cd7879SAnders Persson mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock); 1485b1cd7879SAnders Persson tcp_eager_unlink(tcp); 1486b1cd7879SAnders Persson mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock); 1487b1cd7879SAnders Persson CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 1488b1cd7879SAnders Persson } 1489b1cd7879SAnders Persson break; 1490721fffe3SKacheong Poon case TCPS_LISTEN: 1491721fffe3SKacheong Poon break; 1492721fffe3SKacheong Poon case TCPS_SYN_SENT: 1493721fffe3SKacheong Poon msg = "tcp_close, during connect"; 1494721fffe3SKacheong Poon break; 1495721fffe3SKacheong Poon case TCPS_SYN_RCVD: 1496721fffe3SKacheong Poon /* 1497721fffe3SKacheong Poon * Close during the connect 3-way handshake 1498721fffe3SKacheong Poon * but here there may or may not be pending data 1499721fffe3SKacheong Poon * already on queue. Process almost same as in 1500721fffe3SKacheong Poon * the ESTABLISHED state. 
1501721fffe3SKacheong Poon */ 1502721fffe3SKacheong Poon /* FALLTHRU */ 1503721fffe3SKacheong Poon default: 1504721fffe3SKacheong Poon if (tcp->tcp_fused) 1505721fffe3SKacheong Poon tcp_unfuse(tcp); 1506721fffe3SKacheong Poon 1507721fffe3SKacheong Poon /* 1508721fffe3SKacheong Poon * If SO_LINGER has set a zero linger time, abort the 1509721fffe3SKacheong Poon * connection with a reset. 1510721fffe3SKacheong Poon */ 1511721fffe3SKacheong Poon if (connp->conn_linger && connp->conn_lingertime == 0) { 1512721fffe3SKacheong Poon msg = "tcp_close, zero lingertime"; 1513721fffe3SKacheong Poon break; 1514721fffe3SKacheong Poon } 1515721fffe3SKacheong Poon 1516721fffe3SKacheong Poon /* 1517721fffe3SKacheong Poon * Abort connection if there is unread data queued. 1518721fffe3SKacheong Poon */ 1519721fffe3SKacheong Poon if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { 1520721fffe3SKacheong Poon msg = "tcp_close, unread data"; 1521721fffe3SKacheong Poon break; 1522721fffe3SKacheong Poon } 15233e95bd4aSAnders Persson 1524721fffe3SKacheong Poon /* 15253e95bd4aSAnders Persson * Abort connection if it is being closed without first 15263e95bd4aSAnders Persson * being accepted. This can happen if a listening non-STREAM 15273e95bd4aSAnders Persson * socket wants to get rid of the socket, for example, if the 15283e95bd4aSAnders Persson * listener is closing. 1529721fffe3SKacheong Poon */ 15303e95bd4aSAnders Persson if (tcp->tcp_listener != NULL) { 15313e95bd4aSAnders Persson ASSERT(IPCL_IS_NONSTR(connp)); 15323e95bd4aSAnders Persson msg = "tcp_close, close before accept"; 15333e95bd4aSAnders Persson 15343e95bd4aSAnders Persson /* 15353e95bd4aSAnders Persson * Unlink from the listener and drop the reference 15363e95bd4aSAnders Persson * put on it by the eager. tcp_closei_local will not 15373e95bd4aSAnders Persson * do it because tcp_tconnind_started is TRUE. 
15383e95bd4aSAnders Persson */ 15393e95bd4aSAnders Persson mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock); 15403e95bd4aSAnders Persson tcp_eager_unlink(tcp); 15413e95bd4aSAnders Persson mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock); 15423e95bd4aSAnders Persson CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 1543721fffe3SKacheong Poon break; 15443e95bd4aSAnders Persson } 1545721fffe3SKacheong Poon 1546721fffe3SKacheong Poon /* 1547721fffe3SKacheong Poon * Transmit the FIN before detaching the tcp_t. 1548721fffe3SKacheong Poon * After tcp_detach returns this queue/perimeter 1549721fffe3SKacheong Poon * no longer owns the tcp_t thus others can modify it. 1550721fffe3SKacheong Poon */ 1551721fffe3SKacheong Poon (void) tcp_xmit_end(tcp); 1552721fffe3SKacheong Poon 1553721fffe3SKacheong Poon /* 1554721fffe3SKacheong Poon * If lingering on close then wait until the fin is acked, 1555721fffe3SKacheong Poon * the SO_LINGER time passes, or a reset is sent/received. 1556721fffe3SKacheong Poon */ 1557721fffe3SKacheong Poon if (connp->conn_linger && connp->conn_lingertime > 0 && 1558721fffe3SKacheong Poon !(tcp->tcp_fin_acked) && 1559721fffe3SKacheong Poon tcp->tcp_state >= TCPS_ESTABLISHED) { 1560721fffe3SKacheong Poon if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { 1561721fffe3SKacheong Poon tcp->tcp_client_errno = EWOULDBLOCK; 1562721fffe3SKacheong Poon } else if (tcp->tcp_client_errno == 0) { 1563721fffe3SKacheong Poon 1564721fffe3SKacheong Poon ASSERT(tcp->tcp_linger_tid == 0); 1565721fffe3SKacheong Poon 156666cd0f60SKacheong Poon /* conn_lingertime is in sec. 
*/ 1567721fffe3SKacheong Poon tcp->tcp_linger_tid = TCP_TIMER(tcp, 1568721fffe3SKacheong Poon tcp_close_linger_timeout, 156966cd0f60SKacheong Poon connp->conn_lingertime * MILLISEC); 1570721fffe3SKacheong Poon 1571721fffe3SKacheong Poon /* tcp_close_linger_timeout will finish close */ 1572721fffe3SKacheong Poon if (tcp->tcp_linger_tid == 0) 1573721fffe3SKacheong Poon tcp->tcp_client_errno = ENOSR; 1574721fffe3SKacheong Poon else 1575721fffe3SKacheong Poon return; 1576721fffe3SKacheong Poon } 1577721fffe3SKacheong Poon 1578721fffe3SKacheong Poon /* 1579721fffe3SKacheong Poon * Check if we need to detach or just close 1580721fffe3SKacheong Poon * the instance. 1581721fffe3SKacheong Poon */ 1582721fffe3SKacheong Poon if (tcp->tcp_state <= TCPS_LISTEN) 1583721fffe3SKacheong Poon break; 1584721fffe3SKacheong Poon } 1585721fffe3SKacheong Poon 1586721fffe3SKacheong Poon /* 1587721fffe3SKacheong Poon * Make sure that no other thread will access the conn_rq of 1588721fffe3SKacheong Poon * this instance (through lookups etc.) as conn_rq will go 1589721fffe3SKacheong Poon * away shortly. 1590721fffe3SKacheong Poon */ 1591721fffe3SKacheong Poon tcp_acceptor_hash_remove(tcp); 1592721fffe3SKacheong Poon 1593721fffe3SKacheong Poon mutex_enter(&tcp->tcp_non_sq_lock); 1594721fffe3SKacheong Poon if (tcp->tcp_flow_stopped) { 1595721fffe3SKacheong Poon tcp_clrqfull(tcp); 1596721fffe3SKacheong Poon } 1597721fffe3SKacheong Poon mutex_exit(&tcp->tcp_non_sq_lock); 1598721fffe3SKacheong Poon 1599721fffe3SKacheong Poon if (tcp->tcp_timer_tid != 0) { 1600721fffe3SKacheong Poon delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 1601721fffe3SKacheong Poon tcp->tcp_timer_tid = 0; 1602721fffe3SKacheong Poon } 1603721fffe3SKacheong Poon /* 1604721fffe3SKacheong Poon * Need to cancel those timers which will not be used when 1605721fffe3SKacheong Poon * TCP is detached. This has to be done before the conn_wq 1606721fffe3SKacheong Poon * is set to NULL. 
1607721fffe3SKacheong Poon */ 1608721fffe3SKacheong Poon tcp_timers_stop(tcp); 1609721fffe3SKacheong Poon 1610721fffe3SKacheong Poon tcp->tcp_detached = B_TRUE; 1611721fffe3SKacheong Poon if (tcp->tcp_state == TCPS_TIME_WAIT) { 1612721fffe3SKacheong Poon tcp_time_wait_append(tcp); 1613721fffe3SKacheong Poon TCP_DBGSTAT(tcps, tcp_detach_time_wait); 16143e95bd4aSAnders Persson ASSERT(connp->conn_ref >= 16153e95bd4aSAnders Persson (IPCL_IS_NONSTR(connp) ? 2 : 3)); 1616721fffe3SKacheong Poon goto finish; 1617721fffe3SKacheong Poon } 1618721fffe3SKacheong Poon 1619721fffe3SKacheong Poon /* 1620721fffe3SKacheong Poon * If delta is zero the timer event wasn't executed and was 1621721fffe3SKacheong Poon * successfully canceled. In this case we need to restart it 1622721fffe3SKacheong Poon * with the minimal delta possible. 1623721fffe3SKacheong Poon */ 1624721fffe3SKacheong Poon if (delta >= 0) 1625721fffe3SKacheong Poon tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 1626721fffe3SKacheong Poon delta ? delta : 1); 1627721fffe3SKacheong Poon 16283e95bd4aSAnders Persson ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)); 1629721fffe3SKacheong Poon goto finish; 1630721fffe3SKacheong Poon } 1631721fffe3SKacheong Poon 1632721fffe3SKacheong Poon /* Detach did not complete. Still need to remove q from stream. 
*/ 1633721fffe3SKacheong Poon if (msg) { 1634721fffe3SKacheong Poon if (tcp->tcp_state == TCPS_ESTABLISHED || 1635721fffe3SKacheong Poon tcp->tcp_state == TCPS_CLOSE_WAIT) 1636721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpEstabResets); 1637721fffe3SKacheong Poon if (tcp->tcp_state == TCPS_SYN_SENT || 1638721fffe3SKacheong Poon tcp->tcp_state == TCPS_SYN_RCVD) 1639721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpAttemptFails); 1640721fffe3SKacheong Poon tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); 1641721fffe3SKacheong Poon } 1642721fffe3SKacheong Poon 1643721fffe3SKacheong Poon tcp_closei_local(tcp); 1644721fffe3SKacheong Poon CONN_DEC_REF(connp); 16453e95bd4aSAnders Persson ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2)); 1646721fffe3SKacheong Poon 1647721fffe3SKacheong Poon finish: 1648721fffe3SKacheong Poon /* 1649721fffe3SKacheong Poon * Don't change the queues in the case of a listener that has 1650721fffe3SKacheong Poon * eagers in its q or q0. It could surprise the eagers. 1651721fffe3SKacheong Poon * Instead wait for the eagers outside the squeue. 16523e95bd4aSAnders Persson * 16533e95bd4aSAnders Persson * For non-STREAMS sockets tcp_wait_for_eagers implies that 16543e95bd4aSAnders Persson * we should delay the su_closed upcall until all eagers have 16553e95bd4aSAnders Persson * dropped their references. 
1656721fffe3SKacheong Poon */ 1657721fffe3SKacheong Poon if (!tcp->tcp_wait_for_eagers) { 1658721fffe3SKacheong Poon tcp->tcp_detached = B_TRUE; 1659721fffe3SKacheong Poon connp->conn_rq = NULL; 1660721fffe3SKacheong Poon connp->conn_wq = NULL; 16613e95bd4aSAnders Persson 16623e95bd4aSAnders Persson /* non-STREAM socket, release the upper handle */ 16633e95bd4aSAnders Persson if (IPCL_IS_NONSTR(connp)) { 16643e95bd4aSAnders Persson ASSERT(connp->conn_upper_handle != NULL); 16653e95bd4aSAnders Persson (*connp->conn_upcalls->su_closed) 16663e95bd4aSAnders Persson (connp->conn_upper_handle); 16673e95bd4aSAnders Persson connp->conn_upper_handle = NULL; 16683e95bd4aSAnders Persson connp->conn_upcalls = NULL; 16693e95bd4aSAnders Persson } 1670721fffe3SKacheong Poon } 1671721fffe3SKacheong Poon 1672721fffe3SKacheong Poon /* Signal tcp_close() to finish closing. */ 16733e95bd4aSAnders Persson mutex_enter(&tcp->tcp_closelock); 1674721fffe3SKacheong Poon tcp->tcp_closed = 1; 1675721fffe3SKacheong Poon cv_signal(&tcp->tcp_closecv); 1676721fffe3SKacheong Poon mutex_exit(&tcp->tcp_closelock); 1677721fffe3SKacheong Poon } 1678721fffe3SKacheong Poon 1679721fffe3SKacheong Poon /* ARGSUSED */ 1680721fffe3SKacheong Poon void 1681721fffe3SKacheong Poon tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1682721fffe3SKacheong Poon { 1683721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 1684721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 1685721fffe3SKacheong Poon 1686721fffe3SKacheong Poon freemsg(mp); 1687721fffe3SKacheong Poon 1688721fffe3SKacheong Poon if (tcp->tcp_fused) 1689721fffe3SKacheong Poon tcp_unfuse(tcp); 1690721fffe3SKacheong Poon 1691721fffe3SKacheong Poon if (tcp_xmit_end(tcp) != 0) { 1692721fffe3SKacheong Poon /* 1693721fffe3SKacheong Poon * We were crossing FINs and got a reset from 1694721fffe3SKacheong Poon * the other side. Just ignore it. 
1695721fffe3SKacheong Poon */ 1696721fffe3SKacheong Poon if (connp->conn_debug) { 1697721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, 1698721fffe3SKacheong Poon SL_ERROR|SL_TRACE, 1699721fffe3SKacheong Poon "tcp_shutdown_output() out of state %s", 1700721fffe3SKacheong Poon tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 1701721fffe3SKacheong Poon } 1702721fffe3SKacheong Poon } 1703721fffe3SKacheong Poon } 1704721fffe3SKacheong Poon 1705721fffe3SKacheong Poon #pragma inline(tcp_send_data) 1706721fffe3SKacheong Poon 1707721fffe3SKacheong Poon void 1708721fffe3SKacheong Poon tcp_send_data(tcp_t *tcp, mblk_t *mp) 1709721fffe3SKacheong Poon { 1710721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 1711721fffe3SKacheong Poon 1712721fffe3SKacheong Poon /* 1713721fffe3SKacheong Poon * Check here to avoid sending zero-copy message down to IP when 1714721fffe3SKacheong Poon * ZEROCOPY capability has turned off. We only need to deal with 1715721fffe3SKacheong Poon * the race condition between sockfs and the notification here. 1716721fffe3SKacheong Poon * Since we have tried to backoff the tcp_xmit_head when turning 1717721fffe3SKacheong Poon * zero-copy off and new messages in tcp_output(), we simply drop 1718721fffe3SKacheong Poon * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean 1719721fffe3SKacheong Poon * is not true. 
1720721fffe3SKacheong Poon */ 1721721fffe3SKacheong Poon if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on && 1722721fffe3SKacheong Poon !tcp->tcp_xmit_zc_clean) { 1723721fffe3SKacheong Poon ip_drop_output("TCP ZC was disabled but not clean", mp, NULL); 1724721fffe3SKacheong Poon freemsg(mp); 1725721fffe3SKacheong Poon return; 1726721fffe3SKacheong Poon } 1727721fffe3SKacheong Poon 17289cd928feSAlan Maguire DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa, 17299cd928feSAlan Maguire __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, tcp, 17309cd928feSAlan Maguire __dtrace_tcp_tcph_t *, 17319cd928feSAlan Maguire &mp->b_rptr[connp->conn_ixa->ixa_ip_hdr_length]); 17329cd928feSAlan Maguire 1733721fffe3SKacheong Poon ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp); 1734721fffe3SKacheong Poon (void) conn_ip_output(mp, connp->conn_ixa); 1735721fffe3SKacheong Poon } 1736721fffe3SKacheong Poon 1737721fffe3SKacheong Poon /* ARGSUSED2 */ 1738721fffe3SKacheong Poon void 1739721fffe3SKacheong Poon tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1740721fffe3SKacheong Poon { 1741721fffe3SKacheong Poon conn_t *econnp = (conn_t *)arg; 1742721fffe3SKacheong Poon tcp_t *tcp = econnp->conn_tcp; 1743e5e7971fSErik Nordmark ip_xmit_attr_t *ixa = econnp->conn_ixa; 1744721fffe3SKacheong Poon 1745721fffe3SKacheong Poon /* Guard against a RST having blown it away while on the squeue */ 1746721fffe3SKacheong Poon if (tcp->tcp_state == TCPS_CLOSED) { 1747721fffe3SKacheong Poon freemsg(mp); 1748721fffe3SKacheong Poon return; 1749721fffe3SKacheong Poon } 1750721fffe3SKacheong Poon 1751e5e7971fSErik Nordmark /* 1752e5e7971fSErik Nordmark * In the off-chance that the eager received and responded to 1753e5e7971fSErik Nordmark * some other packet while the SYN|ACK was queued, we recalculate 1754e5e7971fSErik Nordmark * the ixa_pktlen. 
It would be better to fix the SYN/accept 1755e5e7971fSErik Nordmark * multithreading scheme to avoid this complexity. 1756e5e7971fSErik Nordmark */ 1757e5e7971fSErik Nordmark ixa->ixa_pktlen = msgdsize(mp); 1758e5e7971fSErik Nordmark (void) conn_ip_output(mp, ixa); 1759721fffe3SKacheong Poon } 1760721fffe3SKacheong Poon 1761721fffe3SKacheong Poon /* 1762721fffe3SKacheong Poon * tcp_send() is called by tcp_wput_data() and returns one of the following: 1763721fffe3SKacheong Poon * 1764721fffe3SKacheong Poon * -1 = failed allocation. 1765633fc3a6SSebastien Roy * 0 = We've either successfully sent data, or our usable send window is too 1766633fc3a6SSebastien Roy * small and we'd rather wait until later before sending again. 1767721fffe3SKacheong Poon */ 1768721fffe3SKacheong Poon static int 1769721fffe3SKacheong Poon tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, 1770721fffe3SKacheong Poon const int tcp_hdr_len, const int num_sack_blk, int *usable, 1771721fffe3SKacheong Poon uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) 1772721fffe3SKacheong Poon { 1773721fffe3SKacheong Poon int num_lso_seg = 1; 1774721fffe3SKacheong Poon uint_t lso_usable; 1775721fffe3SKacheong Poon boolean_t do_lso_send = B_FALSE; 1776721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 1777721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 1778721fffe3SKacheong Poon ip_xmit_attr_t *ixa = connp->conn_ixa; 1779721fffe3SKacheong Poon 1780721fffe3SKacheong Poon /* 1781721fffe3SKacheong Poon * Check LSO possibility. The value of tcp->tcp_lso indicates whether 1782721fffe3SKacheong Poon * the underlying connection is LSO capable. Will check whether having 1783721fffe3SKacheong Poon * enough available data to initiate LSO transmission in the for(){} 1784721fffe3SKacheong Poon * loops. 
1785721fffe3SKacheong Poon */ 1786721fffe3SKacheong Poon if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0) 1787721fffe3SKacheong Poon do_lso_send = B_TRUE; 1788721fffe3SKacheong Poon 1789721fffe3SKacheong Poon for (;;) { 1790721fffe3SKacheong Poon struct datab *db; 1791721fffe3SKacheong Poon tcpha_t *tcpha; 1792721fffe3SKacheong Poon uint32_t sum; 1793721fffe3SKacheong Poon mblk_t *mp, *mp1; 1794721fffe3SKacheong Poon uchar_t *rptr; 1795721fffe3SKacheong Poon int len; 1796721fffe3SKacheong Poon 1797721fffe3SKacheong Poon /* 1798721fffe3SKacheong Poon * Calculate the maximum payload length we can send at one 1799721fffe3SKacheong Poon * time. 1800721fffe3SKacheong Poon */ 1801721fffe3SKacheong Poon if (do_lso_send) { 1802721fffe3SKacheong Poon /* 1803633fc3a6SSebastien Roy * Determine whether or not it's possible to do LSO, 1804633fc3a6SSebastien Roy * and if so, how much data we can send. 1805721fffe3SKacheong Poon */ 1806721fffe3SKacheong Poon lso_usable = MIN(tcp->tcp_lso_max, *usable); 1807*a7e7a35fSDavid Hanisch if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0 && 1808*a7e7a35fSDavid Hanisch (*snxt + lso_usable) == tcp->tcp_fss && 1809*a7e7a35fSDavid Hanisch lso_usable > mss) { 1810*a7e7a35fSDavid Hanisch /* 1811*a7e7a35fSDavid Hanisch * at the end of a tcp stream with TCP_FSS_VALID 1812*a7e7a35fSDavid Hanisch * set we must leave some data (>0, <=mss) to 1813*a7e7a35fSDavid Hanisch * be sent without LSO through tcp_xmit_mp(), 1814*a7e7a35fSDavid Hanisch * see below 1815*a7e7a35fSDavid Hanisch */ 1816*a7e7a35fSDavid Hanisch if (lso_usable % mss) 1817*a7e7a35fSDavid Hanisch lso_usable -= lso_usable % mss; 1818*a7e7a35fSDavid Hanisch else 1819*a7e7a35fSDavid Hanisch lso_usable -= mss; 1820*a7e7a35fSDavid Hanisch } 1821*a7e7a35fSDavid Hanisch if (lso_usable > mss) { 1822721fffe3SKacheong Poon num_lso_seg = lso_usable / mss; 1823721fffe3SKacheong Poon if (lso_usable % mss) { 1824721fffe3SKacheong Poon num_lso_seg++; 1825721fffe3SKacheong Poon 
tcp->tcp_last_sent_len = (ushort_t) 1826721fffe3SKacheong Poon (lso_usable % mss); 1827721fffe3SKacheong Poon } else { 1828721fffe3SKacheong Poon tcp->tcp_last_sent_len = (ushort_t)mss; 1829721fffe3SKacheong Poon } 1830721fffe3SKacheong Poon } else { 1831721fffe3SKacheong Poon do_lso_send = B_FALSE; 1832721fffe3SKacheong Poon num_lso_seg = 1; 1833721fffe3SKacheong Poon lso_usable = mss; 1834721fffe3SKacheong Poon } 1835721fffe3SKacheong Poon } 1836721fffe3SKacheong Poon 1837721fffe3SKacheong Poon ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); 1838721fffe3SKacheong Poon 1839721fffe3SKacheong Poon len = mss; 1840721fffe3SKacheong Poon if (len > *usable) { 1841721fffe3SKacheong Poon ASSERT(do_lso_send == B_FALSE); 1842721fffe3SKacheong Poon 1843721fffe3SKacheong Poon len = *usable; 1844721fffe3SKacheong Poon if (len <= 0) { 1845721fffe3SKacheong Poon /* Terminate the loop */ 1846721fffe3SKacheong Poon break; /* success; too small */ 1847721fffe3SKacheong Poon } 1848721fffe3SKacheong Poon /* 1849721fffe3SKacheong Poon * Sender silly-window avoidance. 1850721fffe3SKacheong Poon * Ignore this if we are going to send a 1851721fffe3SKacheong Poon * zero window probe out. 1852721fffe3SKacheong Poon * 1853721fffe3SKacheong Poon * TODO: force data into microscopic window? 1854721fffe3SKacheong Poon * ==> (!pushed || (unsent > usable)) 1855721fffe3SKacheong Poon */ 1856721fffe3SKacheong Poon if (len < (tcp->tcp_max_swnd >> 1) && 1857721fffe3SKacheong Poon (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && 1858721fffe3SKacheong Poon !((tcp->tcp_valid_bits & TCP_URG_VALID) && 1859721fffe3SKacheong Poon len == 1) && (! tcp->tcp_zero_win_probe)) { 1860721fffe3SKacheong Poon /* 1861721fffe3SKacheong Poon * If the retransmit timer is not running 1862721fffe3SKacheong Poon * we start it so that we will retransmit 1863721fffe3SKacheong Poon * in the case when the receiver has 1864721fffe3SKacheong Poon * decremented the window. 
1865721fffe3SKacheong Poon */ 1866721fffe3SKacheong Poon if (*snxt == tcp->tcp_snxt && 1867721fffe3SKacheong Poon *snxt == tcp->tcp_suna) { 1868721fffe3SKacheong Poon /* 1869721fffe3SKacheong Poon * We are not supposed to send 1870721fffe3SKacheong Poon * anything. So let's wait a little 1871721fffe3SKacheong Poon * bit longer before breaking SWS 1872721fffe3SKacheong Poon * avoidance. 1873721fffe3SKacheong Poon * 1874721fffe3SKacheong Poon * What should the value be? 1875721fffe3SKacheong Poon * Suggestion: MAX(init rexmit time, 1876721fffe3SKacheong Poon * tcp->tcp_rto) 1877721fffe3SKacheong Poon */ 1878721fffe3SKacheong Poon TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 1879721fffe3SKacheong Poon } 1880721fffe3SKacheong Poon break; /* success; too small */ 1881721fffe3SKacheong Poon } 1882721fffe3SKacheong Poon } 1883721fffe3SKacheong Poon 1884721fffe3SKacheong Poon tcpha = tcp->tcp_tcpha; 1885721fffe3SKacheong Poon 1886721fffe3SKacheong Poon /* 1887721fffe3SKacheong Poon * The reason to adjust len here is that we need to set flags 1888721fffe3SKacheong Poon * and calculate checksum. 1889721fffe3SKacheong Poon */ 1890721fffe3SKacheong Poon if (do_lso_send) 1891721fffe3SKacheong Poon len = lso_usable; 1892721fffe3SKacheong Poon 1893721fffe3SKacheong Poon *usable -= len; /* Approximate - can be adjusted later */ 1894721fffe3SKacheong Poon if (*usable > 0) 1895721fffe3SKacheong Poon tcpha->tha_flags = TH_ACK; 1896721fffe3SKacheong Poon else 1897721fffe3SKacheong Poon tcpha->tha_flags = (TH_ACK | TH_PUSH); 1898721fffe3SKacheong Poon 1899721fffe3SKacheong Poon /* 1900721fffe3SKacheong Poon * Prime pump for IP's checksumming on our behalf. 1901721fffe3SKacheong Poon * Include the adjustment for a source route if any. 1902721fffe3SKacheong Poon * In case of LSO, the partial pseudo-header checksum should 1903721fffe3SKacheong Poon * exclusive TCP length, so zero tha_sum before IP calculate 1904721fffe3SKacheong Poon * pseudo-header checksum for partial checksum offload. 
1905721fffe3SKacheong Poon */ 1906721fffe3SKacheong Poon if (do_lso_send) { 1907721fffe3SKacheong Poon sum = 0; 1908721fffe3SKacheong Poon } else { 1909721fffe3SKacheong Poon sum = len + tcp_hdr_len + connp->conn_sum; 1910721fffe3SKacheong Poon sum = (sum >> 16) + (sum & 0xFFFF); 1911721fffe3SKacheong Poon } 1912721fffe3SKacheong Poon tcpha->tha_sum = htons(sum); 1913721fffe3SKacheong Poon tcpha->tha_seq = htonl(*snxt); 1914721fffe3SKacheong Poon 1915721fffe3SKacheong Poon /* 1916721fffe3SKacheong Poon * Branch off to tcp_xmit_mp() if any of the VALID bits is 1917721fffe3SKacheong Poon * set. For the case when TCP_FSS_VALID is the only valid 1918721fffe3SKacheong Poon * bit (normal active close), branch off only when we think 1919721fffe3SKacheong Poon * that the FIN flag needs to be set. Note for this case, 1920721fffe3SKacheong Poon * that (snxt + len) may not reflect the actual seg_len, 1921721fffe3SKacheong Poon * as len may be further reduced in tcp_xmit_mp(). If len 1922721fffe3SKacheong Poon * gets modified, we will end up here again. 
1923721fffe3SKacheong Poon */ 1924721fffe3SKacheong Poon if (tcp->tcp_valid_bits != 0 && 1925721fffe3SKacheong Poon (tcp->tcp_valid_bits != TCP_FSS_VALID || 1926721fffe3SKacheong Poon ((*snxt + len) == tcp->tcp_fss))) { 1927721fffe3SKacheong Poon uchar_t *prev_rptr; 1928721fffe3SKacheong Poon uint32_t prev_snxt = tcp->tcp_snxt; 1929721fffe3SKacheong Poon 1930721fffe3SKacheong Poon if (*tail_unsent == 0) { 1931721fffe3SKacheong Poon ASSERT((*xmit_tail)->b_cont != NULL); 1932721fffe3SKacheong Poon *xmit_tail = (*xmit_tail)->b_cont; 1933721fffe3SKacheong Poon prev_rptr = (*xmit_tail)->b_rptr; 1934721fffe3SKacheong Poon *tail_unsent = (int)((*xmit_tail)->b_wptr - 1935721fffe3SKacheong Poon (*xmit_tail)->b_rptr); 1936721fffe3SKacheong Poon } else { 1937721fffe3SKacheong Poon prev_rptr = (*xmit_tail)->b_rptr; 1938721fffe3SKacheong Poon (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - 1939721fffe3SKacheong Poon *tail_unsent; 1940721fffe3SKacheong Poon } 1941721fffe3SKacheong Poon mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, 1942721fffe3SKacheong Poon *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 1943721fffe3SKacheong Poon /* Restore tcp_snxt so we get amount sent right. */ 1944721fffe3SKacheong Poon tcp->tcp_snxt = prev_snxt; 1945721fffe3SKacheong Poon if (prev_rptr == (*xmit_tail)->b_rptr) { 1946721fffe3SKacheong Poon /* 1947721fffe3SKacheong Poon * If the previous timestamp is still in use, 1948721fffe3SKacheong Poon * don't stomp on it. 
1949721fffe3SKacheong Poon */ 1950721fffe3SKacheong Poon if ((*xmit_tail)->b_next == NULL) { 1951721fffe3SKacheong Poon (*xmit_tail)->b_prev = local_time; 1952721fffe3SKacheong Poon (*xmit_tail)->b_next = 1953721fffe3SKacheong Poon (mblk_t *)(uintptr_t)(*snxt); 1954721fffe3SKacheong Poon } 1955721fffe3SKacheong Poon } else 1956721fffe3SKacheong Poon (*xmit_tail)->b_rptr = prev_rptr; 1957721fffe3SKacheong Poon 1958721fffe3SKacheong Poon if (mp == NULL) { 1959721fffe3SKacheong Poon return (-1); 1960721fffe3SKacheong Poon } 1961721fffe3SKacheong Poon mp1 = mp->b_cont; 1962721fffe3SKacheong Poon 1963721fffe3SKacheong Poon if (len <= mss) /* LSO is unusable (!do_lso_send) */ 1964721fffe3SKacheong Poon tcp->tcp_last_sent_len = (ushort_t)len; 1965721fffe3SKacheong Poon while (mp1->b_cont) { 1966721fffe3SKacheong Poon *xmit_tail = (*xmit_tail)->b_cont; 1967721fffe3SKacheong Poon (*xmit_tail)->b_prev = local_time; 1968721fffe3SKacheong Poon (*xmit_tail)->b_next = 1969721fffe3SKacheong Poon (mblk_t *)(uintptr_t)(*snxt); 1970721fffe3SKacheong Poon mp1 = mp1->b_cont; 1971721fffe3SKacheong Poon } 1972721fffe3SKacheong Poon *snxt += len; 1973721fffe3SKacheong Poon *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; 1974721fffe3SKacheong Poon BUMP_LOCAL(tcp->tcp_obsegs); 1975721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutDataSegs); 1976721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); 1977721fffe3SKacheong Poon tcp_send_data(tcp, mp); 1978721fffe3SKacheong Poon continue; 1979721fffe3SKacheong Poon } 1980721fffe3SKacheong Poon 1981721fffe3SKacheong Poon *snxt += len; /* Adjust later if we don't send all of len */ 1982721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutDataSegs); 1983721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); 1984721fffe3SKacheong Poon 1985721fffe3SKacheong Poon if (*tail_unsent) { 1986721fffe3SKacheong Poon /* Are the bytes above us in flight? 
*/ 1987721fffe3SKacheong Poon rptr = (*xmit_tail)->b_wptr - *tail_unsent; 1988721fffe3SKacheong Poon if (rptr != (*xmit_tail)->b_rptr) { 1989721fffe3SKacheong Poon *tail_unsent -= len; 1990721fffe3SKacheong Poon if (len <= mss) /* LSO is unusable */ 1991721fffe3SKacheong Poon tcp->tcp_last_sent_len = (ushort_t)len; 1992721fffe3SKacheong Poon len += total_hdr_len; 1993721fffe3SKacheong Poon ixa->ixa_pktlen = len; 1994721fffe3SKacheong Poon 1995721fffe3SKacheong Poon if (ixa->ixa_flags & IXAF_IS_IPV4) { 1996721fffe3SKacheong Poon tcp->tcp_ipha->ipha_length = htons(len); 1997721fffe3SKacheong Poon } else { 1998721fffe3SKacheong Poon tcp->tcp_ip6h->ip6_plen = 1999721fffe3SKacheong Poon htons(len - IPV6_HDR_LEN); 2000721fffe3SKacheong Poon } 2001721fffe3SKacheong Poon 2002721fffe3SKacheong Poon mp = dupb(*xmit_tail); 2003721fffe3SKacheong Poon if (mp == NULL) { 2004721fffe3SKacheong Poon return (-1); /* out_of_mem */ 2005721fffe3SKacheong Poon } 2006721fffe3SKacheong Poon mp->b_rptr = rptr; 2007721fffe3SKacheong Poon /* 2008721fffe3SKacheong Poon * If the old timestamp is no longer in use, 2009721fffe3SKacheong Poon * sample a new timestamp now. 
2010721fffe3SKacheong Poon */ 2011721fffe3SKacheong Poon if ((*xmit_tail)->b_next == NULL) { 2012721fffe3SKacheong Poon (*xmit_tail)->b_prev = local_time; 2013721fffe3SKacheong Poon (*xmit_tail)->b_next = 2014721fffe3SKacheong Poon (mblk_t *)(uintptr_t)(*snxt-len); 2015721fffe3SKacheong Poon } 2016721fffe3SKacheong Poon goto must_alloc; 2017721fffe3SKacheong Poon } 2018721fffe3SKacheong Poon } else { 2019721fffe3SKacheong Poon *xmit_tail = (*xmit_tail)->b_cont; 2020721fffe3SKacheong Poon ASSERT((uintptr_t)((*xmit_tail)->b_wptr - 2021721fffe3SKacheong Poon (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); 2022721fffe3SKacheong Poon *tail_unsent = (int)((*xmit_tail)->b_wptr - 2023721fffe3SKacheong Poon (*xmit_tail)->b_rptr); 2024721fffe3SKacheong Poon } 2025721fffe3SKacheong Poon 2026721fffe3SKacheong Poon (*xmit_tail)->b_prev = local_time; 2027721fffe3SKacheong Poon (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); 2028721fffe3SKacheong Poon 2029721fffe3SKacheong Poon *tail_unsent -= len; 2030721fffe3SKacheong Poon if (len <= mss) /* LSO is unusable (!do_lso_send) */ 2031721fffe3SKacheong Poon tcp->tcp_last_sent_len = (ushort_t)len; 2032721fffe3SKacheong Poon 2033721fffe3SKacheong Poon len += total_hdr_len; 2034721fffe3SKacheong Poon ixa->ixa_pktlen = len; 2035721fffe3SKacheong Poon 2036721fffe3SKacheong Poon if (ixa->ixa_flags & IXAF_IS_IPV4) { 2037721fffe3SKacheong Poon tcp->tcp_ipha->ipha_length = htons(len); 2038721fffe3SKacheong Poon } else { 2039721fffe3SKacheong Poon tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2040721fffe3SKacheong Poon } 2041721fffe3SKacheong Poon 2042721fffe3SKacheong Poon mp = dupb(*xmit_tail); 2043721fffe3SKacheong Poon if (mp == NULL) { 2044721fffe3SKacheong Poon return (-1); /* out_of_mem */ 2045721fffe3SKacheong Poon } 2046721fffe3SKacheong Poon 2047721fffe3SKacheong Poon len = total_hdr_len; 2048721fffe3SKacheong Poon /* 2049721fffe3SKacheong Poon * There are four reasons to allocate a new hdr mblk: 2050721fffe3SKacheong 
Poon * 1) The bytes above us are in use by another packet 2051721fffe3SKacheong Poon * 2) We don't have good alignment 2052721fffe3SKacheong Poon * 3) The mblk is being shared 2053721fffe3SKacheong Poon * 4) We don't have enough room for a header 2054721fffe3SKacheong Poon */ 2055721fffe3SKacheong Poon rptr = mp->b_rptr - len; 2056721fffe3SKacheong Poon if (!OK_32PTR(rptr) || 2057721fffe3SKacheong Poon ((db = mp->b_datap), db->db_ref != 2) || 2058721fffe3SKacheong Poon rptr < db->db_base) { 2059721fffe3SKacheong Poon /* NOTE: we assume allocb returns an OK_32PTR */ 2060721fffe3SKacheong Poon 2061721fffe3SKacheong Poon must_alloc:; 2062721fffe3SKacheong Poon mp1 = allocb(connp->conn_ht_iphc_allocated + 2063721fffe3SKacheong Poon tcps->tcps_wroff_xtra, BPRI_MED); 2064721fffe3SKacheong Poon if (mp1 == NULL) { 2065721fffe3SKacheong Poon freemsg(mp); 2066721fffe3SKacheong Poon return (-1); /* out_of_mem */ 2067721fffe3SKacheong Poon } 2068721fffe3SKacheong Poon mp1->b_cont = mp; 2069721fffe3SKacheong Poon mp = mp1; 2070721fffe3SKacheong Poon /* Leave room for Link Level header */ 2071721fffe3SKacheong Poon len = total_hdr_len; 2072721fffe3SKacheong Poon rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; 2073721fffe3SKacheong Poon mp->b_wptr = &rptr[len]; 2074721fffe3SKacheong Poon } 2075721fffe3SKacheong Poon 2076721fffe3SKacheong Poon /* 2077721fffe3SKacheong Poon * Fill in the header using the template header, and add 2078721fffe3SKacheong Poon * options such as time-stamp, ECN and/or SACK, as needed. 
2079721fffe3SKacheong Poon */ 2080721fffe3SKacheong Poon tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); 2081721fffe3SKacheong Poon 2082721fffe3SKacheong Poon mp->b_rptr = rptr; 2083721fffe3SKacheong Poon 2084721fffe3SKacheong Poon if (*tail_unsent) { 2085721fffe3SKacheong Poon int spill = *tail_unsent; 2086721fffe3SKacheong Poon 2087721fffe3SKacheong Poon mp1 = mp->b_cont; 2088721fffe3SKacheong Poon if (mp1 == NULL) 2089721fffe3SKacheong Poon mp1 = mp; 2090721fffe3SKacheong Poon 2091721fffe3SKacheong Poon /* 2092721fffe3SKacheong Poon * If we're a little short, tack on more mblks until 2093721fffe3SKacheong Poon * there is no more spillover. 2094721fffe3SKacheong Poon */ 2095721fffe3SKacheong Poon while (spill < 0) { 2096721fffe3SKacheong Poon mblk_t *nmp; 2097721fffe3SKacheong Poon int nmpsz; 2098721fffe3SKacheong Poon 2099721fffe3SKacheong Poon nmp = (*xmit_tail)->b_cont; 2100721fffe3SKacheong Poon nmpsz = MBLKL(nmp); 2101721fffe3SKacheong Poon 2102721fffe3SKacheong Poon /* 2103721fffe3SKacheong Poon * Excess data in mblk; can we split it? 2104721fffe3SKacheong Poon * If LSO is enabled for the connection, 2105721fffe3SKacheong Poon * keep on splitting as this is a transient 2106721fffe3SKacheong Poon * send path. 2107721fffe3SKacheong Poon */ 2108721fffe3SKacheong Poon if (!do_lso_send && (spill + nmpsz > 0)) { 2109721fffe3SKacheong Poon /* 2110721fffe3SKacheong Poon * Don't split if stream head was 2111721fffe3SKacheong Poon * told to break up larger writes 2112721fffe3SKacheong Poon * into smaller ones. 2113721fffe3SKacheong Poon */ 2114721fffe3SKacheong Poon if (tcp->tcp_maxpsz_multiplier > 0) 2115721fffe3SKacheong Poon break; 2116721fffe3SKacheong Poon 2117721fffe3SKacheong Poon /* 2118721fffe3SKacheong Poon * Next mblk is less than SMSS/2 2119721fffe3SKacheong Poon * rounded up to nearest 64-byte; 2120721fffe3SKacheong Poon * let it get sent as part of the 2121721fffe3SKacheong Poon * next segment. 
2122721fffe3SKacheong Poon */ 2123721fffe3SKacheong Poon if (tcp->tcp_localnet && 2124721fffe3SKacheong Poon !tcp->tcp_cork && 2125721fffe3SKacheong Poon (nmpsz < roundup((mss >> 1), 64))) 2126721fffe3SKacheong Poon break; 2127721fffe3SKacheong Poon } 2128721fffe3SKacheong Poon 2129721fffe3SKacheong Poon *xmit_tail = nmp; 2130721fffe3SKacheong Poon ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); 2131721fffe3SKacheong Poon /* Stash for rtt use later */ 2132721fffe3SKacheong Poon (*xmit_tail)->b_prev = local_time; 2133721fffe3SKacheong Poon (*xmit_tail)->b_next = 2134721fffe3SKacheong Poon (mblk_t *)(uintptr_t)(*snxt - len); 2135721fffe3SKacheong Poon mp1->b_cont = dupb(*xmit_tail); 2136721fffe3SKacheong Poon mp1 = mp1->b_cont; 2137721fffe3SKacheong Poon 2138721fffe3SKacheong Poon spill += nmpsz; 2139721fffe3SKacheong Poon if (mp1 == NULL) { 2140721fffe3SKacheong Poon *tail_unsent = spill; 2141721fffe3SKacheong Poon freemsg(mp); 2142721fffe3SKacheong Poon return (-1); /* out_of_mem */ 2143721fffe3SKacheong Poon } 2144721fffe3SKacheong Poon } 2145721fffe3SKacheong Poon 2146721fffe3SKacheong Poon /* Trim back any surplus on the last mblk */ 2147721fffe3SKacheong Poon if (spill >= 0) { 2148721fffe3SKacheong Poon mp1->b_wptr -= spill; 2149721fffe3SKacheong Poon *tail_unsent = spill; 2150721fffe3SKacheong Poon } else { 2151721fffe3SKacheong Poon /* 2152721fffe3SKacheong Poon * We did not send everything we could in 2153721fffe3SKacheong Poon * order to remain within the b_cont limit. 
2154721fffe3SKacheong Poon */ 2155721fffe3SKacheong Poon *usable -= spill; 2156721fffe3SKacheong Poon *snxt += spill; 2157721fffe3SKacheong Poon tcp->tcp_last_sent_len += spill; 2158721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill); 2159721fffe3SKacheong Poon /* 2160721fffe3SKacheong Poon * Adjust the checksum 2161721fffe3SKacheong Poon */ 2162721fffe3SKacheong Poon tcpha = (tcpha_t *)(rptr + 2163721fffe3SKacheong Poon ixa->ixa_ip_hdr_length); 2164721fffe3SKacheong Poon sum += spill; 2165721fffe3SKacheong Poon sum = (sum >> 16) + (sum & 0xFFFF); 2166721fffe3SKacheong Poon tcpha->tha_sum = htons(sum); 2167721fffe3SKacheong Poon if (connp->conn_ipversion == IPV4_VERSION) { 2168721fffe3SKacheong Poon sum = ntohs( 2169721fffe3SKacheong Poon ((ipha_t *)rptr)->ipha_length) + 2170721fffe3SKacheong Poon spill; 2171721fffe3SKacheong Poon ((ipha_t *)rptr)->ipha_length = 2172721fffe3SKacheong Poon htons(sum); 2173721fffe3SKacheong Poon } else { 2174721fffe3SKacheong Poon sum = ntohs( 2175721fffe3SKacheong Poon ((ip6_t *)rptr)->ip6_plen) + 2176721fffe3SKacheong Poon spill; 2177721fffe3SKacheong Poon ((ip6_t *)rptr)->ip6_plen = 2178721fffe3SKacheong Poon htons(sum); 2179721fffe3SKacheong Poon } 2180721fffe3SKacheong Poon ixa->ixa_pktlen += spill; 2181721fffe3SKacheong Poon *tail_unsent = 0; 2182721fffe3SKacheong Poon } 2183721fffe3SKacheong Poon } 2184721fffe3SKacheong Poon if (tcp->tcp_ip_forward_progress) { 2185721fffe3SKacheong Poon tcp->tcp_ip_forward_progress = B_FALSE; 2186721fffe3SKacheong Poon ixa->ixa_flags |= IXAF_REACH_CONF; 2187721fffe3SKacheong Poon } else { 2188721fffe3SKacheong Poon ixa->ixa_flags &= ~IXAF_REACH_CONF; 2189721fffe3SKacheong Poon } 2190721fffe3SKacheong Poon 2191721fffe3SKacheong Poon if (do_lso_send) { 2192721fffe3SKacheong Poon /* Append LSO information to the mp. 
*/ 2193721fffe3SKacheong Poon lso_info_set(mp, mss, HW_LSO); 2194721fffe3SKacheong Poon ixa->ixa_fragsize = IP_MAXPACKET; 2195721fffe3SKacheong Poon ixa->ixa_extra_ident = num_lso_seg - 1; 2196721fffe3SKacheong Poon 2197721fffe3SKacheong Poon DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, 2198721fffe3SKacheong Poon boolean_t, B_TRUE); 2199721fffe3SKacheong Poon 2200721fffe3SKacheong Poon tcp_send_data(tcp, mp); 2201721fffe3SKacheong Poon 2202721fffe3SKacheong Poon /* 2203721fffe3SKacheong Poon * Restore values of ixa_fragsize and ixa_extra_ident. 2204721fffe3SKacheong Poon */ 2205721fffe3SKacheong Poon ixa->ixa_fragsize = ixa->ixa_pmtu; 2206721fffe3SKacheong Poon ixa->ixa_extra_ident = 0; 2207721fffe3SKacheong Poon tcp->tcp_obsegs += num_lso_seg; 2208721fffe3SKacheong Poon TCP_STAT(tcps, tcp_lso_times); 2209721fffe3SKacheong Poon TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); 2210721fffe3SKacheong Poon } else { 2211721fffe3SKacheong Poon /* 2212721fffe3SKacheong Poon * Make sure to clean up LSO information. Wherever a 2213721fffe3SKacheong Poon * new mp uses the prepended header room after dupb(), 2214721fffe3SKacheong Poon * lso_info_cleanup() should be called. 2215721fffe3SKacheong Poon */ 2216721fffe3SKacheong Poon lso_info_cleanup(mp); 2217721fffe3SKacheong Poon tcp_send_data(tcp, mp); 2218721fffe3SKacheong Poon BUMP_LOCAL(tcp->tcp_obsegs); 2219721fffe3SKacheong Poon } 2220721fffe3SKacheong Poon } 2221721fffe3SKacheong Poon 2222721fffe3SKacheong Poon return (0); 2223721fffe3SKacheong Poon } 2224721fffe3SKacheong Poon 2225721fffe3SKacheong Poon /* 2226721fffe3SKacheong Poon * Initiate closedown sequence on an active connection. (May be called as 2227721fffe3SKacheong Poon * writer.) Return value zero for OK return, non-zero for error return. 
2228721fffe3SKacheong Poon */ 2229721fffe3SKacheong Poon static int 2230721fffe3SKacheong Poon tcp_xmit_end(tcp_t *tcp) 2231721fffe3SKacheong Poon { 2232721fffe3SKacheong Poon mblk_t *mp; 2233721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 2234721fffe3SKacheong Poon iulp_t uinfo; 2235721fffe3SKacheong Poon ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 2236721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 2237721fffe3SKacheong Poon 2238721fffe3SKacheong Poon if (tcp->tcp_state < TCPS_SYN_RCVD || 2239721fffe3SKacheong Poon tcp->tcp_state > TCPS_CLOSE_WAIT) { 2240721fffe3SKacheong Poon /* 2241721fffe3SKacheong Poon * Invalid state, only states TCPS_SYN_RCVD, 2242721fffe3SKacheong Poon * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 2243721fffe3SKacheong Poon */ 2244721fffe3SKacheong Poon return (-1); 2245721fffe3SKacheong Poon } 2246721fffe3SKacheong Poon 2247721fffe3SKacheong Poon tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 2248721fffe3SKacheong Poon tcp->tcp_valid_bits |= TCP_FSS_VALID; 2249721fffe3SKacheong Poon /* 2250721fffe3SKacheong Poon * If there is nothing more unsent, send the FIN now. 2251721fffe3SKacheong Poon * Otherwise, it will go out with the last segment. 2252721fffe3SKacheong Poon */ 2253721fffe3SKacheong Poon if (tcp->tcp_unsent == 0) { 2254721fffe3SKacheong Poon mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 2255721fffe3SKacheong Poon tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 2256721fffe3SKacheong Poon 2257721fffe3SKacheong Poon if (mp) { 2258721fffe3SKacheong Poon tcp_send_data(tcp, mp); 2259721fffe3SKacheong Poon } else { 2260721fffe3SKacheong Poon /* 2261721fffe3SKacheong Poon * Couldn't allocate msg. Pretend we got it out. 2262721fffe3SKacheong Poon * Wait for rexmit timeout. 
2263721fffe3SKacheong Poon */ 2264721fffe3SKacheong Poon tcp->tcp_snxt = tcp->tcp_fss + 1; 2265721fffe3SKacheong Poon TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2266721fffe3SKacheong Poon } 2267721fffe3SKacheong Poon 2268721fffe3SKacheong Poon /* 2269721fffe3SKacheong Poon * If needed, update tcp_rexmit_snxt as tcp_snxt is 2270721fffe3SKacheong Poon * changed. 2271721fffe3SKacheong Poon */ 2272721fffe3SKacheong Poon if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 2273721fffe3SKacheong Poon tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2274721fffe3SKacheong Poon } 2275721fffe3SKacheong Poon } else { 2276721fffe3SKacheong Poon /* 2277721fffe3SKacheong Poon * If tcp->tcp_cork is set, then the data will not get sent, 2278721fffe3SKacheong Poon * so we have to check that and unset it first. 2279721fffe3SKacheong Poon */ 2280721fffe3SKacheong Poon if (tcp->tcp_cork) 2281721fffe3SKacheong Poon tcp->tcp_cork = B_FALSE; 2282721fffe3SKacheong Poon tcp_wput_data(tcp, NULL, B_FALSE); 2283721fffe3SKacheong Poon } 2284721fffe3SKacheong Poon 2285721fffe3SKacheong Poon /* 2286721fffe3SKacheong Poon * If TCP does not get enough samples of RTT or tcp_rtt_updates 2287721fffe3SKacheong Poon * is 0, don't update the cache. 2288721fffe3SKacheong Poon */ 2289721fffe3SKacheong Poon if (tcps->tcps_rtt_updates == 0 || 2290721fffe3SKacheong Poon tcp->tcp_rtt_update < tcps->tcps_rtt_updates) 2291721fffe3SKacheong Poon return (0); 2292721fffe3SKacheong Poon 2293721fffe3SKacheong Poon /* 2294721fffe3SKacheong Poon * We do not have a good algorithm to update ssthresh at this time. 2295721fffe3SKacheong Poon * So don't do any update. 2296721fffe3SKacheong Poon */ 2297721fffe3SKacheong Poon bzero(&uinfo, sizeof (uinfo)); 2298721fffe3SKacheong Poon uinfo.iulp_rtt = tcp->tcp_rtt_sa; 2299721fffe3SKacheong Poon uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; 2300721fffe3SKacheong Poon 2301721fffe3SKacheong Poon /* 2302721fffe3SKacheong Poon * Note that uinfo is kept for conn_faddr in the DCE. 
Could update even 2303721fffe3SKacheong Poon * if source routed but we don't. 2304721fffe3SKacheong Poon */ 2305721fffe3SKacheong Poon if (connp->conn_ipversion == IPV4_VERSION) { 2306721fffe3SKacheong Poon if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { 2307721fffe3SKacheong Poon return (0); 2308721fffe3SKacheong Poon } 2309721fffe3SKacheong Poon (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); 2310721fffe3SKacheong Poon } else { 2311721fffe3SKacheong Poon uint_t ifindex; 2312721fffe3SKacheong Poon 2313721fffe3SKacheong Poon if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 2314721fffe3SKacheong Poon &tcp->tcp_ip6h->ip6_dst))) { 2315721fffe3SKacheong Poon return (0); 2316721fffe3SKacheong Poon } 2317721fffe3SKacheong Poon ifindex = 0; 2318721fffe3SKacheong Poon if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { 2319721fffe3SKacheong Poon ip_xmit_attr_t *ixa = connp->conn_ixa; 2320721fffe3SKacheong Poon 2321721fffe3SKacheong Poon /* 2322721fffe3SKacheong Poon * If we are going to create a DCE we'd better have 2323721fffe3SKacheong Poon * an ifindex 2324721fffe3SKacheong Poon */ 2325721fffe3SKacheong Poon if (ixa->ixa_nce != NULL) { 2326721fffe3SKacheong Poon ifindex = ixa->ixa_nce->nce_common->ncec_ill-> 2327721fffe3SKacheong Poon ill_phyint->phyint_ifindex; 2328721fffe3SKacheong Poon } else { 2329721fffe3SKacheong Poon return (0); 2330721fffe3SKacheong Poon } 2331721fffe3SKacheong Poon } 2332721fffe3SKacheong Poon 2333721fffe3SKacheong Poon (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, 2334721fffe3SKacheong Poon ipst); 2335721fffe3SKacheong Poon } 2336721fffe3SKacheong Poon return (0); 2337721fffe3SKacheong Poon } 2338721fffe3SKacheong Poon 2339721fffe3SKacheong Poon /* 2340721fffe3SKacheong Poon * Send out a control packet on the tcp connection specified. This routine 2341721fffe3SKacheong Poon * is typically called where we need a simple ACK or RST generated. 
2342721fffe3SKacheong Poon */ 2343721fffe3SKacheong Poon void 2344721fffe3SKacheong Poon tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) 2345721fffe3SKacheong Poon { 2346721fffe3SKacheong Poon uchar_t *rptr; 2347721fffe3SKacheong Poon tcpha_t *tcpha; 2348721fffe3SKacheong Poon ipha_t *ipha = NULL; 2349721fffe3SKacheong Poon ip6_t *ip6h = NULL; 2350721fffe3SKacheong Poon uint32_t sum; 2351721fffe3SKacheong Poon int total_hdr_len; 2352721fffe3SKacheong Poon int ip_hdr_len; 2353721fffe3SKacheong Poon mblk_t *mp; 2354721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 2355721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 2356721fffe3SKacheong Poon ip_xmit_attr_t *ixa = connp->conn_ixa; 2357721fffe3SKacheong Poon 2358721fffe3SKacheong Poon /* 2359721fffe3SKacheong Poon * Save sum for use in source route later. 2360721fffe3SKacheong Poon */ 2361721fffe3SKacheong Poon sum = connp->conn_ht_ulp_len + connp->conn_sum; 2362721fffe3SKacheong Poon total_hdr_len = connp->conn_ht_iphc_len; 2363721fffe3SKacheong Poon ip_hdr_len = ixa->ixa_ip_hdr_length; 2364721fffe3SKacheong Poon 2365721fffe3SKacheong Poon /* If a text string is passed in with the request, pass it to strlog. 
*/ 2366721fffe3SKacheong Poon if (str != NULL && connp->conn_debug) { 2367721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 2368721fffe3SKacheong Poon "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", 2369721fffe3SKacheong Poon str, seq, ack, ctl); 2370721fffe3SKacheong Poon } 2371721fffe3SKacheong Poon mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, 2372721fffe3SKacheong Poon BPRI_MED); 2373721fffe3SKacheong Poon if (mp == NULL) { 2374721fffe3SKacheong Poon return; 2375721fffe3SKacheong Poon } 2376721fffe3SKacheong Poon rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; 2377721fffe3SKacheong Poon mp->b_rptr = rptr; 2378721fffe3SKacheong Poon mp->b_wptr = &rptr[total_hdr_len]; 2379721fffe3SKacheong Poon bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); 2380721fffe3SKacheong Poon 2381721fffe3SKacheong Poon ixa->ixa_pktlen = total_hdr_len; 2382721fffe3SKacheong Poon 2383721fffe3SKacheong Poon if (ixa->ixa_flags & IXAF_IS_IPV4) { 2384721fffe3SKacheong Poon ipha = (ipha_t *)rptr; 2385721fffe3SKacheong Poon ipha->ipha_length = htons(total_hdr_len); 2386721fffe3SKacheong Poon } else { 2387721fffe3SKacheong Poon ip6h = (ip6_t *)rptr; 2388721fffe3SKacheong Poon ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); 2389721fffe3SKacheong Poon } 2390721fffe3SKacheong Poon tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2391721fffe3SKacheong Poon tcpha->tha_flags = (uint8_t)ctl; 2392721fffe3SKacheong Poon if (ctl & TH_RST) { 2393721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutRsts); 2394721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutControl); 2395721fffe3SKacheong Poon /* 2396721fffe3SKacheong Poon * Don't send TSopt w/ TH_RST packets per RFC 1323. 
2397721fffe3SKacheong Poon */ 2398721fffe3SKacheong Poon if (tcp->tcp_snd_ts_ok && 2399721fffe3SKacheong Poon tcp->tcp_state > TCPS_SYN_SENT) { 2400721fffe3SKacheong Poon mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; 2401721fffe3SKacheong Poon *(mp->b_wptr) = TCPOPT_EOL; 2402721fffe3SKacheong Poon 2403721fffe3SKacheong Poon ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; 2404721fffe3SKacheong Poon 2405721fffe3SKacheong Poon if (connp->conn_ipversion == IPV4_VERSION) { 2406721fffe3SKacheong Poon ipha->ipha_length = htons(total_hdr_len - 2407721fffe3SKacheong Poon TCPOPT_REAL_TS_LEN); 2408721fffe3SKacheong Poon } else { 2409721fffe3SKacheong Poon ip6h->ip6_plen = htons(total_hdr_len - 2410721fffe3SKacheong Poon IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); 2411721fffe3SKacheong Poon } 2412721fffe3SKacheong Poon tcpha->tha_offset_and_reserved -= (3 << 4); 2413721fffe3SKacheong Poon sum -= TCPOPT_REAL_TS_LEN; 2414721fffe3SKacheong Poon } 2415721fffe3SKacheong Poon } 2416721fffe3SKacheong Poon if (ctl & TH_ACK) { 2417721fffe3SKacheong Poon if (tcp->tcp_snd_ts_ok) { 2418721fffe3SKacheong Poon uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 2419721fffe3SKacheong Poon 2420721fffe3SKacheong Poon U32_TO_BE32(llbolt, 2421721fffe3SKacheong Poon (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 2422721fffe3SKacheong Poon U32_TO_BE32(tcp->tcp_ts_recent, 2423721fffe3SKacheong Poon (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 2424721fffe3SKacheong Poon } 2425721fffe3SKacheong Poon 2426721fffe3SKacheong Poon /* Update the latest receive window size in TCP header. 
*/ 2427721fffe3SKacheong Poon tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 2428721fffe3SKacheong Poon /* Track what we sent to the peer */ 2429721fffe3SKacheong Poon tcp->tcp_tcpha->tha_win = tcpha->tha_win; 2430721fffe3SKacheong Poon tcp->tcp_rack = ack; 2431721fffe3SKacheong Poon tcp->tcp_rack_cnt = 0; 2432721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutAck); 2433721fffe3SKacheong Poon } 2434721fffe3SKacheong Poon BUMP_LOCAL(tcp->tcp_obsegs); 2435721fffe3SKacheong Poon tcpha->tha_seq = htonl(seq); 2436721fffe3SKacheong Poon tcpha->tha_ack = htonl(ack); 2437721fffe3SKacheong Poon /* 2438721fffe3SKacheong Poon * Include the adjustment for a source route if any. 2439721fffe3SKacheong Poon */ 2440721fffe3SKacheong Poon sum = (sum >> 16) + (sum & 0xFFFF); 2441721fffe3SKacheong Poon tcpha->tha_sum = htons(sum); 2442721fffe3SKacheong Poon tcp_send_data(tcp, mp); 2443721fffe3SKacheong Poon } 2444721fffe3SKacheong Poon 2445721fffe3SKacheong Poon /* 2446721fffe3SKacheong Poon * Generate a reset based on an inbound packet, connp is set by caller 2447721fffe3SKacheong Poon * when RST is in response to an unexpected inbound packet for which 2448721fffe3SKacheong Poon * there is active tcp state in the system. 2449721fffe3SKacheong Poon * 2450721fffe3SKacheong Poon * IPSEC NOTE : Try to send the reply with the same protection as it came 2451721fffe3SKacheong Poon * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. 2452721fffe3SKacheong Poon * That way the packet will go out at the same level of protection as it 2453721fffe3SKacheong Poon * came in with. 
2454721fffe3SKacheong Poon */ 2455721fffe3SKacheong Poon static void 2456721fffe3SKacheong Poon tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, 2457721fffe3SKacheong Poon ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) 2458721fffe3SKacheong Poon { 2459721fffe3SKacheong Poon ipha_t *ipha = NULL; 2460721fffe3SKacheong Poon ip6_t *ip6h = NULL; 2461721fffe3SKacheong Poon ushort_t len; 2462721fffe3SKacheong Poon tcpha_t *tcpha; 2463721fffe3SKacheong Poon int i; 2464721fffe3SKacheong Poon ipaddr_t v4addr; 2465721fffe3SKacheong Poon in6_addr_t v6addr; 2466721fffe3SKacheong Poon netstack_t *ns = ipst->ips_netstack; 2467721fffe3SKacheong Poon tcp_stack_t *tcps = ns->netstack_tcp; 2468721fffe3SKacheong Poon ip_xmit_attr_t ixas, *ixa; 2469721fffe3SKacheong Poon uint_t ip_hdr_len = ira->ira_ip_hdr_length; 2470721fffe3SKacheong Poon boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ 2471721fffe3SKacheong Poon ushort_t port; 2472721fffe3SKacheong Poon 2473721fffe3SKacheong Poon if (!tcp_send_rst_chk(tcps)) { 2474721fffe3SKacheong Poon TCP_STAT(tcps, tcp_rst_unsent); 2475721fffe3SKacheong Poon freemsg(mp); 2476721fffe3SKacheong Poon return; 2477721fffe3SKacheong Poon } 2478721fffe3SKacheong Poon 2479721fffe3SKacheong Poon /* 2480721fffe3SKacheong Poon * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other 2481721fffe3SKacheong Poon * options from the listener. In that case the caller must ensure that 2482721fffe3SKacheong Poon * we are running on the listener = connp squeue. 2483721fffe3SKacheong Poon * 2484721fffe3SKacheong Poon * We get a safe copy of conn_ixa so we don't need to restore anything 2485721fffe3SKacheong Poon * we or ip_output_simple might change in the ixa. 
2486721fffe3SKacheong Poon */ 2487721fffe3SKacheong Poon if (connp != NULL) { 2488721fffe3SKacheong Poon ASSERT(connp->conn_on_sqp); 2489721fffe3SKacheong Poon 2490721fffe3SKacheong Poon ixa = conn_get_ixa_exclusive(connp); 2491721fffe3SKacheong Poon if (ixa == NULL) { 2492721fffe3SKacheong Poon TCP_STAT(tcps, tcp_rst_unsent); 2493721fffe3SKacheong Poon freemsg(mp); 2494721fffe3SKacheong Poon return; 2495721fffe3SKacheong Poon } 2496721fffe3SKacheong Poon need_refrele = B_TRUE; 2497721fffe3SKacheong Poon } else { 2498721fffe3SKacheong Poon bzero(&ixas, sizeof (ixas)); 2499721fffe3SKacheong Poon ixa = &ixas; 2500721fffe3SKacheong Poon /* 2501721fffe3SKacheong Poon * IXAF_VERIFY_SOURCE is overkill since we know the 2502721fffe3SKacheong Poon * packet was for us. 2503721fffe3SKacheong Poon */ 2504721fffe3SKacheong Poon ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; 2505721fffe3SKacheong Poon ixa->ixa_protocol = IPPROTO_TCP; 2506721fffe3SKacheong Poon ixa->ixa_zoneid = ira->ira_zoneid; 2507721fffe3SKacheong Poon ixa->ixa_ifindex = 0; 2508721fffe3SKacheong Poon ixa->ixa_ipst = ipst; 2509721fffe3SKacheong Poon ixa->ixa_cred = kcred; 2510721fffe3SKacheong Poon ixa->ixa_cpid = NOPID; 2511721fffe3SKacheong Poon } 2512721fffe3SKacheong Poon 2513721fffe3SKacheong Poon if (str && tcps->tcps_dbg) { 2514721fffe3SKacheong Poon (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 2515721fffe3SKacheong Poon "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 2516721fffe3SKacheong Poon "flags 0x%x", 2517721fffe3SKacheong Poon str, seq, ack, ctl); 2518721fffe3SKacheong Poon } 2519721fffe3SKacheong Poon if (mp->b_datap->db_ref != 1) { 2520721fffe3SKacheong Poon mblk_t *mp1 = copyb(mp); 2521721fffe3SKacheong Poon freemsg(mp); 2522721fffe3SKacheong Poon mp = mp1; 2523721fffe3SKacheong Poon if (mp == NULL) 2524721fffe3SKacheong Poon goto done; 2525721fffe3SKacheong Poon } else if (mp->b_cont) { 2526721fffe3SKacheong Poon freemsg(mp->b_cont); 2527721fffe3SKacheong Poon mp->b_cont = NULL; 
2528721fffe3SKacheong Poon DB_CKSUMFLAGS(mp) = 0; 2529721fffe3SKacheong Poon } 2530721fffe3SKacheong Poon /* 2531721fffe3SKacheong Poon * We skip reversing source route here. 2532721fffe3SKacheong Poon * (for now we replace all IP options with EOL) 2533721fffe3SKacheong Poon */ 2534721fffe3SKacheong Poon if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 2535721fffe3SKacheong Poon ipha = (ipha_t *)mp->b_rptr; 2536721fffe3SKacheong Poon for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 2537721fffe3SKacheong Poon mp->b_rptr[i] = IPOPT_EOL; 2538721fffe3SKacheong Poon /* 2539721fffe3SKacheong Poon * Make sure that src address isn't flagrantly invalid. 2540721fffe3SKacheong Poon * Not all broadcast address checking for the src address 2541721fffe3SKacheong Poon * is possible, since we don't know the netmask of the src 2542721fffe3SKacheong Poon * addr. No check for destination address is done, since 2543721fffe3SKacheong Poon * IP will not pass up a packet with a broadcast dest 2544721fffe3SKacheong Poon * address to TCP. Similar checks are done below for IPv6. 
2545721fffe3SKacheong Poon */ 2546721fffe3SKacheong Poon if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || 2547721fffe3SKacheong Poon CLASSD(ipha->ipha_src)) { 2548721fffe3SKacheong Poon BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 2549721fffe3SKacheong Poon ip_drop_input("ipIfStatsInDiscards", mp, NULL); 2550721fffe3SKacheong Poon freemsg(mp); 2551721fffe3SKacheong Poon goto done; 2552721fffe3SKacheong Poon } 2553721fffe3SKacheong Poon } else { 2554721fffe3SKacheong Poon ip6h = (ip6_t *)mp->b_rptr; 2555721fffe3SKacheong Poon 2556721fffe3SKacheong Poon if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || 2557721fffe3SKacheong Poon IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { 2558721fffe3SKacheong Poon BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); 2559721fffe3SKacheong Poon ip_drop_input("ipIfStatsInDiscards", mp, NULL); 2560721fffe3SKacheong Poon freemsg(mp); 2561721fffe3SKacheong Poon goto done; 2562721fffe3SKacheong Poon } 2563721fffe3SKacheong Poon 2564721fffe3SKacheong Poon /* Remove any extension headers assuming partial overlay */ 2565721fffe3SKacheong Poon if (ip_hdr_len > IPV6_HDR_LEN) { 2566721fffe3SKacheong Poon uint8_t *to; 2567721fffe3SKacheong Poon 2568721fffe3SKacheong Poon to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; 2569721fffe3SKacheong Poon ovbcopy(ip6h, to, IPV6_HDR_LEN); 2570721fffe3SKacheong Poon mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; 2571721fffe3SKacheong Poon ip_hdr_len = IPV6_HDR_LEN; 2572721fffe3SKacheong Poon ip6h = (ip6_t *)mp->b_rptr; 2573721fffe3SKacheong Poon ip6h->ip6_nxt = IPPROTO_TCP; 2574721fffe3SKacheong Poon } 2575721fffe3SKacheong Poon } 2576721fffe3SKacheong Poon tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; 2577721fffe3SKacheong Poon if (tcpha->tha_flags & TH_RST) { 2578721fffe3SKacheong Poon freemsg(mp); 2579721fffe3SKacheong Poon goto done; 2580721fffe3SKacheong Poon } 2581721fffe3SKacheong Poon tcpha->tha_offset_and_reserved = (5 << 4); 2582721fffe3SKacheong Poon len = ip_hdr_len + sizeof (tcpha_t); 
2583721fffe3SKacheong Poon mp->b_wptr = &mp->b_rptr[len]; 2584721fffe3SKacheong Poon if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 2585721fffe3SKacheong Poon ipha->ipha_length = htons(len); 2586721fffe3SKacheong Poon /* Swap addresses */ 2587721fffe3SKacheong Poon v4addr = ipha->ipha_src; 2588721fffe3SKacheong Poon ipha->ipha_src = ipha->ipha_dst; 2589721fffe3SKacheong Poon ipha->ipha_dst = v4addr; 2590721fffe3SKacheong Poon ipha->ipha_ident = 0; 2591721fffe3SKacheong Poon ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; 2592721fffe3SKacheong Poon ixa->ixa_flags |= IXAF_IS_IPV4; 2593721fffe3SKacheong Poon ixa->ixa_ip_hdr_length = ip_hdr_len; 2594721fffe3SKacheong Poon } else { 2595721fffe3SKacheong Poon ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2596721fffe3SKacheong Poon /* Swap addresses */ 2597721fffe3SKacheong Poon v6addr = ip6h->ip6_src; 2598721fffe3SKacheong Poon ip6h->ip6_src = ip6h->ip6_dst; 2599721fffe3SKacheong Poon ip6h->ip6_dst = v6addr; 2600721fffe3SKacheong Poon ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; 2601721fffe3SKacheong Poon ixa->ixa_flags &= ~IXAF_IS_IPV4; 2602721fffe3SKacheong Poon 2603721fffe3SKacheong Poon if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { 2604721fffe3SKacheong Poon ixa->ixa_flags |= IXAF_SCOPEID_SET; 2605721fffe3SKacheong Poon ixa->ixa_scopeid = ira->ira_ruifindex; 2606721fffe3SKacheong Poon } 2607721fffe3SKacheong Poon ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; 2608721fffe3SKacheong Poon } 2609721fffe3SKacheong Poon ixa->ixa_pktlen = len; 2610721fffe3SKacheong Poon 2611721fffe3SKacheong Poon /* Swap the ports */ 2612721fffe3SKacheong Poon port = tcpha->tha_fport; 2613721fffe3SKacheong Poon tcpha->tha_fport = tcpha->tha_lport; 2614721fffe3SKacheong Poon tcpha->tha_lport = port; 2615721fffe3SKacheong Poon 2616721fffe3SKacheong Poon tcpha->tha_ack = htonl(ack); 2617721fffe3SKacheong Poon tcpha->tha_seq = htonl(seq); 2618721fffe3SKacheong Poon tcpha->tha_win = 0; 2619721fffe3SKacheong Poon tcpha->tha_sum = htons(sizeof 
(tcpha_t)); 2620721fffe3SKacheong Poon tcpha->tha_flags = (uint8_t)ctl; 2621721fffe3SKacheong Poon if (ctl & TH_RST) { 26229cd928feSAlan Maguire if (ctl & TH_ACK) { 26239cd928feSAlan Maguire /* 26249cd928feSAlan Maguire * Probe connection rejection here. 26259cd928feSAlan Maguire * tcp_xmit_listeners_reset() drops non-SYN segments 26269cd928feSAlan Maguire * that do not specify TH_ACK in their flags without 26279cd928feSAlan Maguire * calling this function. As a consequence, if this 26289cd928feSAlan Maguire * function is called with a TH_RST|TH_ACK ctl argument, 26299cd928feSAlan Maguire * it is being called in response to a SYN segment 26309cd928feSAlan Maguire * and thus the tcp:::accept-refused probe point 26319cd928feSAlan Maguire * is valid here. 26329cd928feSAlan Maguire */ 26339cd928feSAlan Maguire DTRACE_TCP5(accept__refused, mblk_t *, NULL, 26349cd928feSAlan Maguire void, NULL, void_ip_t *, mp->b_rptr, tcp_t *, NULL, 26359cd928feSAlan Maguire tcph_t *, tcpha); 26369cd928feSAlan Maguire } 2637721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutRsts); 2638721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutControl); 2639721fffe3SKacheong Poon } 2640721fffe3SKacheong Poon 2641721fffe3SKacheong Poon /* Discard any old label */ 2642721fffe3SKacheong Poon if (ixa->ixa_free_flags & IXA_FREE_TSL) { 2643721fffe3SKacheong Poon ASSERT(ixa->ixa_tsl != NULL); 2644721fffe3SKacheong Poon label_rele(ixa->ixa_tsl); 2645721fffe3SKacheong Poon ixa->ixa_free_flags &= ~IXA_FREE_TSL; 2646721fffe3SKacheong Poon } 2647721fffe3SKacheong Poon ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ 2648721fffe3SKacheong Poon 2649721fffe3SKacheong Poon if (ira->ira_flags & IRAF_IPSEC_SECURE) { 2650721fffe3SKacheong Poon /* 2651721fffe3SKacheong Poon * Apply IPsec based on how IPsec was applied to 2652721fffe3SKacheong Poon * the packet that caused the RST. 
2653721fffe3SKacheong Poon */ 2654721fffe3SKacheong Poon if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { 2655721fffe3SKacheong Poon BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 2656721fffe3SKacheong Poon /* Note: mp already consumed and ip_drop_packet done */ 2657721fffe3SKacheong Poon goto done; 2658721fffe3SKacheong Poon } 2659721fffe3SKacheong Poon } else { 2660721fffe3SKacheong Poon /* 2661721fffe3SKacheong Poon * This is in clear. The RST message we are building 2662721fffe3SKacheong Poon * here should go out in clear, independent of our policy. 2663721fffe3SKacheong Poon */ 2664721fffe3SKacheong Poon ixa->ixa_flags |= IXAF_NO_IPSEC; 2665721fffe3SKacheong Poon } 2666721fffe3SKacheong Poon 26679cd928feSAlan Maguire DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa, 26689cd928feSAlan Maguire __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL, 26699cd928feSAlan Maguire __dtrace_tcp_tcph_t *, tcpha); 26709cd928feSAlan Maguire 2671721fffe3SKacheong Poon /* 2672721fffe3SKacheong Poon * NOTE: one might consider tracing a TCP packet here, but 2673721fffe3SKacheong Poon * this function has no active TCP state and no tcp structure 2674721fffe3SKacheong Poon * that has a trace buffer. If we traced here, we would have 2675721fffe3SKacheong Poon * to keep a local trace buffer in tcp_record_trace(). 2676721fffe3SKacheong Poon */ 2677721fffe3SKacheong Poon 2678721fffe3SKacheong Poon (void) ip_output_simple(mp, ixa); 2679721fffe3SKacheong Poon done: 2680721fffe3SKacheong Poon ixa_cleanup(ixa); 2681721fffe3SKacheong Poon if (need_refrele) { 2682721fffe3SKacheong Poon ASSERT(ixa != &ixas); 2683721fffe3SKacheong Poon ixa_refrele(ixa); 2684721fffe3SKacheong Poon } 2685721fffe3SKacheong Poon } 2686721fffe3SKacheong Poon 2687721fffe3SKacheong Poon /* 2688721fffe3SKacheong Poon * Generate a "no listener here" RST in response to an "unknown" segment. 
2689721fffe3SKacheong Poon * connp is set by caller when RST is in response to an unexpected 2690721fffe3SKacheong Poon * inbound packet for which there is active tcp state in the system. 2691721fffe3SKacheong Poon * Note that we are reusing the incoming mp to construct the outgoing RST. 2692721fffe3SKacheong Poon */ 2693721fffe3SKacheong Poon void 2694721fffe3SKacheong Poon tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, 2695721fffe3SKacheong Poon conn_t *connp) 2696721fffe3SKacheong Poon { 2697721fffe3SKacheong Poon uchar_t *rptr; 2698721fffe3SKacheong Poon uint32_t seg_len; 2699721fffe3SKacheong Poon tcpha_t *tcpha; 2700721fffe3SKacheong Poon uint32_t seg_seq; 2701721fffe3SKacheong Poon uint32_t seg_ack; 2702721fffe3SKacheong Poon uint_t flags; 2703721fffe3SKacheong Poon ipha_t *ipha; 2704721fffe3SKacheong Poon ip6_t *ip6h; 2705721fffe3SKacheong Poon boolean_t policy_present; 2706721fffe3SKacheong Poon netstack_t *ns = ipst->ips_netstack; 2707721fffe3SKacheong Poon tcp_stack_t *tcps = ns->netstack_tcp; 2708721fffe3SKacheong Poon ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; 2709721fffe3SKacheong Poon uint_t ip_hdr_len = ira->ira_ip_hdr_length; 2710721fffe3SKacheong Poon 2711721fffe3SKacheong Poon TCP_STAT(tcps, tcp_no_listener); 2712721fffe3SKacheong Poon 27139cd928feSAlan Maguire /* 27149cd928feSAlan Maguire * DTrace this "unknown" segment as a tcp:::receive, as we did 27159cd928feSAlan Maguire * just receive something that was TCP. 
27169cd928feSAlan Maguire */ 27179cd928feSAlan Maguire DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, NULL, 27189cd928feSAlan Maguire __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL, 27199cd928feSAlan Maguire __dtrace_tcp_tcph_t *, &mp->b_rptr[ip_hdr_len]); 27209cd928feSAlan Maguire 2721721fffe3SKacheong Poon if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 2722721fffe3SKacheong Poon policy_present = ipss->ipsec_inbound_v4_policy_present; 2723721fffe3SKacheong Poon ipha = (ipha_t *)mp->b_rptr; 2724721fffe3SKacheong Poon ip6h = NULL; 2725721fffe3SKacheong Poon } else { 2726721fffe3SKacheong Poon policy_present = ipss->ipsec_inbound_v6_policy_present; 2727721fffe3SKacheong Poon ipha = NULL; 2728721fffe3SKacheong Poon ip6h = (ip6_t *)mp->b_rptr; 2729721fffe3SKacheong Poon } 2730721fffe3SKacheong Poon 2731721fffe3SKacheong Poon if (policy_present) { 2732721fffe3SKacheong Poon /* 2733721fffe3SKacheong Poon * The conn_t parameter is NULL because we already know 2734721fffe3SKacheong Poon * nobody's home. 
2735721fffe3SKacheong Poon */ 2736721fffe3SKacheong Poon mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, 2737721fffe3SKacheong Poon ira, ns); 2738721fffe3SKacheong Poon if (mp == NULL) 2739721fffe3SKacheong Poon return; 2740721fffe3SKacheong Poon } 2741721fffe3SKacheong Poon if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { 2742721fffe3SKacheong Poon DTRACE_PROBE2( 2743721fffe3SKacheong Poon tx__ip__log__error__nolistener__tcp, 2744721fffe3SKacheong Poon char *, "Could not reply with RST to mp(1)", 2745721fffe3SKacheong Poon mblk_t *, mp); 2746721fffe3SKacheong Poon ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); 2747721fffe3SKacheong Poon freemsg(mp); 2748721fffe3SKacheong Poon return; 2749721fffe3SKacheong Poon } 2750721fffe3SKacheong Poon 2751721fffe3SKacheong Poon rptr = mp->b_rptr; 2752721fffe3SKacheong Poon 2753721fffe3SKacheong Poon tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2754721fffe3SKacheong Poon seg_seq = ntohl(tcpha->tha_seq); 2755721fffe3SKacheong Poon seg_ack = ntohl(tcpha->tha_ack); 2756721fffe3SKacheong Poon flags = tcpha->tha_flags; 2757721fffe3SKacheong Poon 2758721fffe3SKacheong Poon seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); 2759721fffe3SKacheong Poon if (flags & TH_RST) { 2760721fffe3SKacheong Poon freemsg(mp); 2761721fffe3SKacheong Poon } else if (flags & TH_ACK) { 2762721fffe3SKacheong Poon tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, 2763721fffe3SKacheong Poon ira, ipst, connp); 2764721fffe3SKacheong Poon } else { 2765721fffe3SKacheong Poon if (flags & TH_SYN) { 2766721fffe3SKacheong Poon seg_len++; 2767721fffe3SKacheong Poon } else { 2768721fffe3SKacheong Poon /* 2769721fffe3SKacheong Poon * Here we violate the RFC. Note that a normal 2770721fffe3SKacheong Poon * TCP will never send a segment without the ACK 2771721fffe3SKacheong Poon * flag, except for RST or SYN segment. This 2772721fffe3SKacheong Poon * segment is neither. 
Just drop it on the 2773721fffe3SKacheong Poon * floor. 2774721fffe3SKacheong Poon */ 2775721fffe3SKacheong Poon freemsg(mp); 2776721fffe3SKacheong Poon TCP_STAT(tcps, tcp_rst_unsent); 2777721fffe3SKacheong Poon return; 2778721fffe3SKacheong Poon } 2779721fffe3SKacheong Poon 2780721fffe3SKacheong Poon tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, 2781721fffe3SKacheong Poon seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); 2782721fffe3SKacheong Poon } 2783721fffe3SKacheong Poon } 2784721fffe3SKacheong Poon 2785721fffe3SKacheong Poon /* 2786b7de80edSKacheong Poon * Helper function for tcp_xmit_mp() in handling connection set up flag 2787b7de80edSKacheong Poon * options setting. 2788b7de80edSKacheong Poon */ 2789b7de80edSKacheong Poon static void 2790b7de80edSKacheong Poon tcp_xmit_mp_aux_iss(tcp_t *tcp, conn_t *connp, tcpha_t *tcpha, mblk_t *mp, 2791b7de80edSKacheong Poon uint_t *flags) 2792b7de80edSKacheong Poon { 2793b7de80edSKacheong Poon uint32_t u1; 2794b7de80edSKacheong Poon uint8_t *wptr = mp->b_wptr; 2795b7de80edSKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 2796b7de80edSKacheong Poon boolean_t add_sack = B_FALSE; 2797b7de80edSKacheong Poon 2798b7de80edSKacheong Poon /* 2799b7de80edSKacheong Poon * If TCP_ISS_VALID and the seq number is tcp_iss, 2800b7de80edSKacheong Poon * TCP can only be in SYN-SENT, SYN-RCVD or 2801b7de80edSKacheong Poon * FIN-WAIT-1 state. It can be FIN-WAIT-1 if 2802b7de80edSKacheong Poon * our SYN is not ack'ed but the app closes this 2803b7de80edSKacheong Poon * TCP connection. 2804b7de80edSKacheong Poon */ 2805b7de80edSKacheong Poon ASSERT(tcp->tcp_state == TCPS_SYN_SENT || 2806b7de80edSKacheong Poon tcp->tcp_state == TCPS_SYN_RCVD || 2807b7de80edSKacheong Poon tcp->tcp_state == TCPS_FIN_WAIT_1); 2808b7de80edSKacheong Poon 2809b7de80edSKacheong Poon /* 2810b7de80edSKacheong Poon * Tack on the MSS option. It is always needed 2811b7de80edSKacheong Poon * for both active and passive open. 
2812b7de80edSKacheong Poon * 2813b7de80edSKacheong Poon * MSS option value should be interface MTU - MIN 2814b7de80edSKacheong Poon * TCP/IP header according to RFC 793 as it means 2815b7de80edSKacheong Poon * the maximum segment size TCP can receive. But 2816b7de80edSKacheong Poon * to get around some broken middle boxes/end hosts 2817b7de80edSKacheong Poon * out there, we allow the option value to be the 2818b7de80edSKacheong Poon * same as the MSS option size on the peer side. 2819b7de80edSKacheong Poon * In this way, the other side will not send 2820b7de80edSKacheong Poon * anything larger than they can receive. 2821b7de80edSKacheong Poon * 2822b7de80edSKacheong Poon * Note that for SYN_SENT state, the ndd param 2823b7de80edSKacheong Poon * tcp_use_smss_as_mss_opt has no effect as we 2824b7de80edSKacheong Poon * don't know the peer's MSS option value. So 2825b7de80edSKacheong Poon * the only case we need to take care of is in 2826b7de80edSKacheong Poon * SYN_RCVD state, which is done later. 2827b7de80edSKacheong Poon */ 2828b7de80edSKacheong Poon wptr[0] = TCPOPT_MAXSEG; 2829b7de80edSKacheong Poon wptr[1] = TCPOPT_MAXSEG_LEN; 2830b7de80edSKacheong Poon wptr += 2; 2831b7de80edSKacheong Poon u1 = tcp->tcp_initial_pmtu - (connp->conn_ipversion == IPV4_VERSION ? 
2832b7de80edSKacheong Poon IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH; 2833b7de80edSKacheong Poon U16_TO_BE16(u1, wptr); 2834b7de80edSKacheong Poon wptr += 2; 2835b7de80edSKacheong Poon 2836b7de80edSKacheong Poon /* Update the offset to cover the additional word */ 2837b7de80edSKacheong Poon tcpha->tha_offset_and_reserved += (1 << 4); 2838b7de80edSKacheong Poon 2839b7de80edSKacheong Poon switch (tcp->tcp_state) { 2840b7de80edSKacheong Poon case TCPS_SYN_SENT: 2841b7de80edSKacheong Poon *flags = TH_SYN; 2842b7de80edSKacheong Poon 2843b7de80edSKacheong Poon if (tcp->tcp_snd_sack_ok) 2844b7de80edSKacheong Poon add_sack = B_TRUE; 2845b7de80edSKacheong Poon 2846b7de80edSKacheong Poon if (tcp->tcp_snd_ts_ok) { 2847b7de80edSKacheong Poon uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 2848b7de80edSKacheong Poon 2849b7de80edSKacheong Poon if (add_sack) { 2850b7de80edSKacheong Poon wptr[0] = TCPOPT_SACK_PERMITTED; 2851b7de80edSKacheong Poon wptr[1] = TCPOPT_SACK_OK_LEN; 2852b7de80edSKacheong Poon add_sack = B_FALSE; 2853b7de80edSKacheong Poon } else { 2854b7de80edSKacheong Poon wptr[0] = TCPOPT_NOP; 2855b7de80edSKacheong Poon wptr[1] = TCPOPT_NOP; 2856b7de80edSKacheong Poon } 2857b7de80edSKacheong Poon wptr[2] = TCPOPT_TSTAMP; 2858b7de80edSKacheong Poon wptr[3] = TCPOPT_TSTAMP_LEN; 2859b7de80edSKacheong Poon wptr += 4; 2860b7de80edSKacheong Poon U32_TO_BE32(llbolt, wptr); 2861b7de80edSKacheong Poon wptr += 4; 2862b7de80edSKacheong Poon ASSERT(tcp->tcp_ts_recent == 0); 2863b7de80edSKacheong Poon U32_TO_BE32(0L, wptr); 2864b7de80edSKacheong Poon wptr += 4; 2865b7de80edSKacheong Poon tcpha->tha_offset_and_reserved += (3 << 4); 2866b7de80edSKacheong Poon } 2867b7de80edSKacheong Poon 2868b7de80edSKacheong Poon /* 2869b7de80edSKacheong Poon * Set up all the bits to tell other side 2870b7de80edSKacheong Poon * we are ECN capable. 
2871b7de80edSKacheong Poon */ 2872b7de80edSKacheong Poon if (tcp->tcp_ecn_ok) 2873b7de80edSKacheong Poon *flags |= (TH_ECE | TH_CWR); 2874b7de80edSKacheong Poon 2875b7de80edSKacheong Poon break; 2876b7de80edSKacheong Poon 2877b7de80edSKacheong Poon case TCPS_SYN_RCVD: 2878b7de80edSKacheong Poon *flags |= TH_SYN; 2879b7de80edSKacheong Poon 2880b7de80edSKacheong Poon /* 2881b7de80edSKacheong Poon * Reset the MSS option value to be SMSS 2882b7de80edSKacheong Poon * We should probably add back the bytes 2883b7de80edSKacheong Poon * for timestamp option and IPsec. We 2884b7de80edSKacheong Poon * don't do that as this is a workaround 2885b7de80edSKacheong Poon * for broken middle boxes/end hosts, it 2886b7de80edSKacheong Poon * is better for us to be more cautious. 2887b7de80edSKacheong Poon * They may not take these things into 2888b7de80edSKacheong Poon * account in their SMSS calculation. Thus 2889b7de80edSKacheong Poon * the peer's calculated SMSS may be smaller 2890b7de80edSKacheong Poon * than what it can be. This should be OK. 2891b7de80edSKacheong Poon */ 2892b7de80edSKacheong Poon if (tcps->tcps_use_smss_as_mss_opt) { 2893b7de80edSKacheong Poon u1 = tcp->tcp_mss; 2894b7de80edSKacheong Poon /* 2895b7de80edSKacheong Poon * Note that wptr points just past the MSS 2896b7de80edSKacheong Poon * option value. 2897b7de80edSKacheong Poon */ 2898b7de80edSKacheong Poon U16_TO_BE16(u1, wptr - 2); 2899b7de80edSKacheong Poon } 2900b7de80edSKacheong Poon 2901b7de80edSKacheong Poon /* 2902b7de80edSKacheong Poon * tcp_snd_ts_ok can only be set in TCPS_SYN_RCVD 2903b7de80edSKacheong Poon * when the peer also uses timestamps option. And 2904b7de80edSKacheong Poon * the TCP header template must have already been 2905b7de80edSKacheong Poon * updated to include the timestamps option. 
2906b7de80edSKacheong Poon */ 2907b7de80edSKacheong Poon if (tcp->tcp_snd_sack_ok) { 2908b7de80edSKacheong Poon if (tcp->tcp_snd_ts_ok) { 2909b7de80edSKacheong Poon uint8_t *tmp_wptr; 2910b7de80edSKacheong Poon 2911b7de80edSKacheong Poon /* 2912b7de80edSKacheong Poon * Use the NOP in the header just 2913b7de80edSKacheong Poon * before timestamps opton. 2914b7de80edSKacheong Poon */ 2915b7de80edSKacheong Poon tmp_wptr = (uint8_t *)tcpha + 2916b7de80edSKacheong Poon TCP_MIN_HEADER_LENGTH; 2917b7de80edSKacheong Poon ASSERT(tmp_wptr[0] == TCPOPT_NOP && 2918b7de80edSKacheong Poon tmp_wptr[1] == TCPOPT_NOP); 2919b7de80edSKacheong Poon tmp_wptr[0] = TCPOPT_SACK_PERMITTED; 2920b7de80edSKacheong Poon tmp_wptr[1] = TCPOPT_SACK_OK_LEN; 2921b7de80edSKacheong Poon } else { 2922b7de80edSKacheong Poon add_sack = B_TRUE; 2923b7de80edSKacheong Poon } 2924b7de80edSKacheong Poon } 2925b7de80edSKacheong Poon 2926b7de80edSKacheong Poon 2927b7de80edSKacheong Poon /* 2928b7de80edSKacheong Poon * If the other side is ECN capable, reply 2929b7de80edSKacheong Poon * that we are also ECN capable. 2930b7de80edSKacheong Poon */ 2931b7de80edSKacheong Poon if (tcp->tcp_ecn_ok) 2932b7de80edSKacheong Poon *flags |= TH_ECE; 2933b7de80edSKacheong Poon break; 2934b7de80edSKacheong Poon 2935b7de80edSKacheong Poon default: 2936b7de80edSKacheong Poon /* 2937b7de80edSKacheong Poon * The above ASSERT() makes sure that this 2938b7de80edSKacheong Poon * must be FIN-WAIT-1 state. Our SYN has 2939b7de80edSKacheong Poon * not been ack'ed so retransmit it. 
2940b7de80edSKacheong Poon */ 2941b7de80edSKacheong Poon *flags |= TH_SYN; 2942b7de80edSKacheong Poon break; 2943b7de80edSKacheong Poon } 2944b7de80edSKacheong Poon 2945b7de80edSKacheong Poon if (add_sack) { 2946b7de80edSKacheong Poon wptr[0] = TCPOPT_NOP; 2947b7de80edSKacheong Poon wptr[1] = TCPOPT_NOP; 2948b7de80edSKacheong Poon wptr[2] = TCPOPT_SACK_PERMITTED; 2949b7de80edSKacheong Poon wptr[3] = TCPOPT_SACK_OK_LEN; 2950b7de80edSKacheong Poon wptr += TCPOPT_REAL_SACK_OK_LEN; 2951b7de80edSKacheong Poon tcpha->tha_offset_and_reserved += (1 << 4); 2952b7de80edSKacheong Poon } 2953b7de80edSKacheong Poon 2954b7de80edSKacheong Poon if (tcp->tcp_snd_ws_ok) { 2955b7de80edSKacheong Poon wptr[0] = TCPOPT_NOP; 2956b7de80edSKacheong Poon wptr[1] = TCPOPT_WSCALE; 2957b7de80edSKacheong Poon wptr[2] = TCPOPT_WS_LEN; 2958b7de80edSKacheong Poon wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 2959b7de80edSKacheong Poon wptr += TCPOPT_REAL_WS_LEN; 2960b7de80edSKacheong Poon tcpha->tha_offset_and_reserved += (1 << 4); 2961b7de80edSKacheong Poon } 2962b7de80edSKacheong Poon 2963b7de80edSKacheong Poon mp->b_wptr = wptr; 2964b7de80edSKacheong Poon u1 = (int)(mp->b_wptr - mp->b_rptr); 2965b7de80edSKacheong Poon /* 2966b7de80edSKacheong Poon * Get IP set to checksum on our behalf 2967b7de80edSKacheong Poon * Include the adjustment for a source route if any. 2968b7de80edSKacheong Poon */ 2969b7de80edSKacheong Poon u1 += connp->conn_sum; 2970b7de80edSKacheong Poon u1 = (u1 >> 16) + (u1 & 0xFFFF); 2971b7de80edSKacheong Poon tcpha->tha_sum = htons(u1); 2972b7de80edSKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutControl); 2973b7de80edSKacheong Poon } 2974b7de80edSKacheong Poon 2975b7de80edSKacheong Poon /* 2976b7de80edSKacheong Poon * Helper function for tcp_xmit_mp() in handling connection tear down 2977b7de80edSKacheong Poon * flag setting and state changes. 
2978b7de80edSKacheong Poon */ 2979b7de80edSKacheong Poon static void 2980b7de80edSKacheong Poon tcp_xmit_mp_aux_fss(tcp_t *tcp, ip_xmit_attr_t *ixa, uint_t *flags) 2981b7de80edSKacheong Poon { 2982b7de80edSKacheong Poon if (!tcp->tcp_fin_acked) { 2983b7de80edSKacheong Poon *flags |= TH_FIN; 2984b7de80edSKacheong Poon TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutControl); 2985b7de80edSKacheong Poon } 2986b7de80edSKacheong Poon if (!tcp->tcp_fin_sent) { 2987b7de80edSKacheong Poon tcp->tcp_fin_sent = B_TRUE; 2988b7de80edSKacheong Poon switch (tcp->tcp_state) { 2989b7de80edSKacheong Poon case TCPS_SYN_RCVD: 2990b7de80edSKacheong Poon tcp->tcp_state = TCPS_FIN_WAIT_1; 2991b7de80edSKacheong Poon DTRACE_TCP6(state__change, void, NULL, 2992b7de80edSKacheong Poon ip_xmit_attr_t *, ixa, void, NULL, 2993b7de80edSKacheong Poon tcp_t *, tcp, void, NULL, 2994b7de80edSKacheong Poon int32_t, TCPS_SYN_RCVD); 2995b7de80edSKacheong Poon break; 2996b7de80edSKacheong Poon case TCPS_ESTABLISHED: 2997b7de80edSKacheong Poon tcp->tcp_state = TCPS_FIN_WAIT_1; 2998b7de80edSKacheong Poon DTRACE_TCP6(state__change, void, NULL, 2999b7de80edSKacheong Poon ip_xmit_attr_t *, ixa, void, NULL, 3000b7de80edSKacheong Poon tcp_t *, tcp, void, NULL, 3001b7de80edSKacheong Poon int32_t, TCPS_ESTABLISHED); 3002b7de80edSKacheong Poon break; 3003b7de80edSKacheong Poon case TCPS_CLOSE_WAIT: 3004b7de80edSKacheong Poon tcp->tcp_state = TCPS_LAST_ACK; 3005b7de80edSKacheong Poon DTRACE_TCP6(state__change, void, NULL, 3006b7de80edSKacheong Poon ip_xmit_attr_t *, ixa, void, NULL, 3007b7de80edSKacheong Poon tcp_t *, tcp, void, NULL, 3008b7de80edSKacheong Poon int32_t, TCPS_CLOSE_WAIT); 3009b7de80edSKacheong Poon break; 3010b7de80edSKacheong Poon } 3011b7de80edSKacheong Poon if (tcp->tcp_suna == tcp->tcp_snxt) 3012b7de80edSKacheong Poon TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3013b7de80edSKacheong Poon tcp->tcp_snxt = tcp->tcp_fss + 1; 3014b7de80edSKacheong Poon } 3015b7de80edSKacheong Poon } 3016b7de80edSKacheong Poon 
3017b7de80edSKacheong Poon /* 3018721fffe3SKacheong Poon * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 3019721fffe3SKacheong Poon * ip and tcp header ready to pass down to IP. If the mp passed in is 3020721fffe3SKacheong Poon * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 3021721fffe3SKacheong Poon * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 3022721fffe3SKacheong Poon * otherwise it will dup partial mblks.) 3023721fffe3SKacheong Poon * Otherwise, an appropriate ACK packet will be generated. This 3024721fffe3SKacheong Poon * routine is not usually called to send new data for the first time. It 3025721fffe3SKacheong Poon * is mostly called out of the timer for retransmits, and to generate ACKs. 3026721fffe3SKacheong Poon * 3027721fffe3SKacheong Poon * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 3028721fffe3SKacheong Poon * be adjusted by *offset. And after dupb(), the offset and the ending mblk 3029721fffe3SKacheong Poon * of the original mblk chain will be returned in *offset and *end_mp. 
3030721fffe3SKacheong Poon */ 3031721fffe3SKacheong Poon mblk_t * 3032721fffe3SKacheong Poon tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 3033721fffe3SKacheong Poon mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 3034721fffe3SKacheong Poon boolean_t rexmit) 3035721fffe3SKacheong Poon { 3036721fffe3SKacheong Poon int data_length; 3037721fffe3SKacheong Poon int32_t off = 0; 3038721fffe3SKacheong Poon uint_t flags; 3039721fffe3SKacheong Poon mblk_t *mp1; 3040721fffe3SKacheong Poon mblk_t *mp2; 3041721fffe3SKacheong Poon uchar_t *rptr; 3042721fffe3SKacheong Poon tcpha_t *tcpha; 3043721fffe3SKacheong Poon int32_t num_sack_blk = 0; 3044721fffe3SKacheong Poon int32_t sack_opt_len = 0; 3045721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 3046721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 3047721fffe3SKacheong Poon ip_xmit_attr_t *ixa = connp->conn_ixa; 3048721fffe3SKacheong Poon 3049721fffe3SKacheong Poon /* Allocate for our maximum TCP header + link-level */ 3050721fffe3SKacheong Poon mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, 3051721fffe3SKacheong Poon BPRI_MED); 3052b7de80edSKacheong Poon if (mp1 == NULL) 3053721fffe3SKacheong Poon return (NULL); 3054721fffe3SKacheong Poon data_length = 0; 3055721fffe3SKacheong Poon 3056721fffe3SKacheong Poon /* 3057721fffe3SKacheong Poon * Note that tcp_mss has been adjusted to take into account the 3058721fffe3SKacheong Poon * timestamp option if applicable. Because SACK options do not 3059721fffe3SKacheong Poon * appear in every TCP segments and they are of variable lengths, 3060721fffe3SKacheong Poon * they cannot be included in tcp_mss. Thus we need to calculate 3061721fffe3SKacheong Poon * the actual segment length when we need to send a segment which 3062721fffe3SKacheong Poon * includes SACK options. 
3063721fffe3SKacheong Poon */ 3064721fffe3SKacheong Poon if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 3065721fffe3SKacheong Poon num_sack_blk = MIN(tcp->tcp_max_sack_blk, 3066721fffe3SKacheong Poon tcp->tcp_num_sack_blk); 3067721fffe3SKacheong Poon sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 3068721fffe3SKacheong Poon TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 3069721fffe3SKacheong Poon if (max_to_send + sack_opt_len > tcp->tcp_mss) 3070721fffe3SKacheong Poon max_to_send -= sack_opt_len; 3071721fffe3SKacheong Poon } 3072721fffe3SKacheong Poon 3073721fffe3SKacheong Poon if (offset != NULL) { 3074721fffe3SKacheong Poon off = *offset; 3075721fffe3SKacheong Poon /* We use offset as an indicator that end_mp is not NULL. */ 3076721fffe3SKacheong Poon *end_mp = NULL; 3077721fffe3SKacheong Poon } 3078721fffe3SKacheong Poon for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 3079721fffe3SKacheong Poon /* This could be faster with cooperation from downstream */ 3080721fffe3SKacheong Poon if (mp2 != mp1 && !sendall && 3081721fffe3SKacheong Poon data_length + (int)(mp->b_wptr - mp->b_rptr) > 3082721fffe3SKacheong Poon max_to_send) 3083721fffe3SKacheong Poon /* 3084721fffe3SKacheong Poon * Don't send the next mblk since the whole mblk 3085721fffe3SKacheong Poon * does not fit. 
3086721fffe3SKacheong Poon */ 3087721fffe3SKacheong Poon break; 3088721fffe3SKacheong Poon mp2->b_cont = dupb(mp); 3089721fffe3SKacheong Poon mp2 = mp2->b_cont; 3090721fffe3SKacheong Poon if (!mp2) { 3091721fffe3SKacheong Poon freemsg(mp1); 3092721fffe3SKacheong Poon return (NULL); 3093721fffe3SKacheong Poon } 3094721fffe3SKacheong Poon mp2->b_rptr += off; 3095721fffe3SKacheong Poon ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 3096721fffe3SKacheong Poon (uintptr_t)INT_MAX); 3097721fffe3SKacheong Poon 3098721fffe3SKacheong Poon data_length += (int)(mp2->b_wptr - mp2->b_rptr); 3099721fffe3SKacheong Poon if (data_length > max_to_send) { 3100721fffe3SKacheong Poon mp2->b_wptr -= data_length - max_to_send; 3101721fffe3SKacheong Poon data_length = max_to_send; 3102721fffe3SKacheong Poon off = mp2->b_wptr - mp->b_rptr; 3103721fffe3SKacheong Poon break; 3104721fffe3SKacheong Poon } else { 3105721fffe3SKacheong Poon off = 0; 3106721fffe3SKacheong Poon } 3107721fffe3SKacheong Poon } 3108721fffe3SKacheong Poon if (offset != NULL) { 3109721fffe3SKacheong Poon *offset = off; 3110721fffe3SKacheong Poon *end_mp = mp; 3111721fffe3SKacheong Poon } 3112721fffe3SKacheong Poon if (seg_len != NULL) { 3113721fffe3SKacheong Poon *seg_len = data_length; 3114721fffe3SKacheong Poon } 3115721fffe3SKacheong Poon 3116721fffe3SKacheong Poon /* Update the latest receive window size in TCP header. 
*/ 3117721fffe3SKacheong Poon tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 3118721fffe3SKacheong Poon 3119721fffe3SKacheong Poon rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 3120721fffe3SKacheong Poon mp1->b_rptr = rptr; 3121721fffe3SKacheong Poon mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; 3122721fffe3SKacheong Poon bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); 3123721fffe3SKacheong Poon tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; 3124721fffe3SKacheong Poon tcpha->tha_seq = htonl(seq); 3125721fffe3SKacheong Poon 3126721fffe3SKacheong Poon /* 3127721fffe3SKacheong Poon * Use tcp_unsent to determine if the PUSH bit should be used assumes 3128721fffe3SKacheong Poon * that this function was called from tcp_wput_data. Thus, when called 3129721fffe3SKacheong Poon * to retransmit data the setting of the PUSH bit may appear some 3130721fffe3SKacheong Poon * what random in that it might get set when it should not. This 3131721fffe3SKacheong Poon * should not pose any performance issues. 3132721fffe3SKacheong Poon */ 3133721fffe3SKacheong Poon if (data_length != 0 && (tcp->tcp_unsent == 0 || 3134721fffe3SKacheong Poon tcp->tcp_unsent == data_length)) { 3135721fffe3SKacheong Poon flags = TH_ACK | TH_PUSH; 3136721fffe3SKacheong Poon } else { 3137721fffe3SKacheong Poon flags = TH_ACK; 3138721fffe3SKacheong Poon } 3139721fffe3SKacheong Poon 3140721fffe3SKacheong Poon if (tcp->tcp_ecn_ok) { 3141721fffe3SKacheong Poon if (tcp->tcp_ecn_echo_on) 3142721fffe3SKacheong Poon flags |= TH_ECE; 3143721fffe3SKacheong Poon 3144721fffe3SKacheong Poon /* 3145721fffe3SKacheong Poon * Only set ECT bit and ECN_CWR if a segment contains new data. 3146721fffe3SKacheong Poon * There is no TCP flow control for non-data segments, and 3147721fffe3SKacheong Poon * only data segment is transmitted reliably. 
3148721fffe3SKacheong Poon */ 3149721fffe3SKacheong Poon if (data_length > 0 && !rexmit) { 3150721fffe3SKacheong Poon TCP_SET_ECT(tcp, rptr); 3151721fffe3SKacheong Poon if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 3152721fffe3SKacheong Poon flags |= TH_CWR; 3153721fffe3SKacheong Poon tcp->tcp_ecn_cwr_sent = B_TRUE; 3154721fffe3SKacheong Poon } 3155721fffe3SKacheong Poon } 3156721fffe3SKacheong Poon } 3157721fffe3SKacheong Poon 3158b7de80edSKacheong Poon /* Check if there is any special processing needs to be done. */ 3159721fffe3SKacheong Poon if (tcp->tcp_valid_bits) { 3160721fffe3SKacheong Poon uint32_t u1; 3161721fffe3SKacheong Poon 3162b7de80edSKacheong Poon /* We don't allow having SYN and FIN in the same segment... */ 3163721fffe3SKacheong Poon if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 3164721fffe3SKacheong Poon seq == tcp->tcp_iss) { 3165b7de80edSKacheong Poon /* Need to do connection set up processing. */ 3166b7de80edSKacheong Poon tcp_xmit_mp_aux_iss(tcp, connp, tcpha, mp1, &flags); 3167b7de80edSKacheong Poon } else if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 3168721fffe3SKacheong Poon (seq + data_length) == tcp->tcp_fss) { 3169b7de80edSKacheong Poon /* Need to do connection tear down processing. */ 3170b7de80edSKacheong Poon tcp_xmit_mp_aux_fss(tcp, ixa, &flags); 3171721fffe3SKacheong Poon } 3172b7de80edSKacheong Poon 3173721fffe3SKacheong Poon /* 3174b7de80edSKacheong Poon * Need to do urgent pointer processing. 3175b7de80edSKacheong Poon * 3176721fffe3SKacheong Poon * Note the trick here. u1 is unsigned. When tcp_urg 3177721fffe3SKacheong Poon * is smaller than seq, u1 will become a very huge value. 3178721fffe3SKacheong Poon * So the comparison will fail. Also note that tcp_urp 3179721fffe3SKacheong Poon * should be positive, see RFC 793 page 17. 
3180721fffe3SKacheong Poon */ 3181721fffe3SKacheong Poon u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; 3182721fffe3SKacheong Poon if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && 3183721fffe3SKacheong Poon u1 < (uint32_t)(64 * 1024)) { 3184721fffe3SKacheong Poon flags |= TH_URG; 3185721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutUrg); 3186721fffe3SKacheong Poon tcpha->tha_urp = htons(u1); 3187721fffe3SKacheong Poon } 3188721fffe3SKacheong Poon } 3189721fffe3SKacheong Poon tcpha->tha_flags = (uchar_t)flags; 3190721fffe3SKacheong Poon tcp->tcp_rack = tcp->tcp_rnxt; 3191721fffe3SKacheong Poon tcp->tcp_rack_cnt = 0; 3192721fffe3SKacheong Poon 3193b7de80edSKacheong Poon /* Fill in the current value of timestamps option. */ 3194721fffe3SKacheong Poon if (tcp->tcp_snd_ts_ok) { 3195721fffe3SKacheong Poon if (tcp->tcp_state != TCPS_SYN_SENT) { 3196721fffe3SKacheong Poon uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 3197721fffe3SKacheong Poon 3198721fffe3SKacheong Poon U32_TO_BE32(llbolt, 3199721fffe3SKacheong Poon (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 3200721fffe3SKacheong Poon U32_TO_BE32(tcp->tcp_ts_recent, 3201721fffe3SKacheong Poon (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 3202721fffe3SKacheong Poon } 3203721fffe3SKacheong Poon } 3204721fffe3SKacheong Poon 3205b7de80edSKacheong Poon /* Fill in the SACK blocks. 
*/ 3206721fffe3SKacheong Poon if (num_sack_blk > 0) { 3207721fffe3SKacheong Poon uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; 3208721fffe3SKacheong Poon sack_blk_t *tmp; 3209721fffe3SKacheong Poon int32_t i; 3210721fffe3SKacheong Poon 3211721fffe3SKacheong Poon wptr[0] = TCPOPT_NOP; 3212721fffe3SKacheong Poon wptr[1] = TCPOPT_NOP; 3213721fffe3SKacheong Poon wptr[2] = TCPOPT_SACK; 3214721fffe3SKacheong Poon wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 3215721fffe3SKacheong Poon sizeof (sack_blk_t); 3216721fffe3SKacheong Poon wptr += TCPOPT_REAL_SACK_LEN; 3217721fffe3SKacheong Poon 3218721fffe3SKacheong Poon tmp = tcp->tcp_sack_list; 3219721fffe3SKacheong Poon for (i = 0; i < num_sack_blk; i++) { 3220721fffe3SKacheong Poon U32_TO_BE32(tmp[i].begin, wptr); 3221721fffe3SKacheong Poon wptr += sizeof (tcp_seq); 3222721fffe3SKacheong Poon U32_TO_BE32(tmp[i].end, wptr); 3223721fffe3SKacheong Poon wptr += sizeof (tcp_seq); 3224721fffe3SKacheong Poon } 3225721fffe3SKacheong Poon tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); 3226721fffe3SKacheong Poon } 3227721fffe3SKacheong Poon ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 3228721fffe3SKacheong Poon data_length += (int)(mp1->b_wptr - rptr); 3229721fffe3SKacheong Poon 3230721fffe3SKacheong Poon ixa->ixa_pktlen = data_length; 3231721fffe3SKacheong Poon 3232721fffe3SKacheong Poon if (ixa->ixa_flags & IXAF_IS_IPV4) { 3233721fffe3SKacheong Poon ((ipha_t *)rptr)->ipha_length = htons(data_length); 3234721fffe3SKacheong Poon } else { 3235721fffe3SKacheong Poon ip6_t *ip6 = (ip6_t *)rptr; 3236721fffe3SKacheong Poon 3237721fffe3SKacheong Poon ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); 3238721fffe3SKacheong Poon } 3239721fffe3SKacheong Poon 3240721fffe3SKacheong Poon /* 3241721fffe3SKacheong Poon * Prime pump for IP 3242721fffe3SKacheong Poon * Include the adjustment for a source route if any. 
3243721fffe3SKacheong Poon */ 3244721fffe3SKacheong Poon data_length -= ixa->ixa_ip_hdr_length; 3245721fffe3SKacheong Poon data_length += connp->conn_sum; 3246721fffe3SKacheong Poon data_length = (data_length >> 16) + (data_length & 0xFFFF); 3247721fffe3SKacheong Poon tcpha->tha_sum = htons(data_length); 3248721fffe3SKacheong Poon if (tcp->tcp_ip_forward_progress) { 3249721fffe3SKacheong Poon tcp->tcp_ip_forward_progress = B_FALSE; 3250721fffe3SKacheong Poon connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 3251721fffe3SKacheong Poon } else { 3252721fffe3SKacheong Poon connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 3253721fffe3SKacheong Poon } 3254721fffe3SKacheong Poon return (mp1); 3255721fffe3SKacheong Poon } 3256721fffe3SKacheong Poon 3257721fffe3SKacheong Poon /* 3258721fffe3SKacheong Poon * If this routine returns B_TRUE, TCP can generate a RST in response 3259721fffe3SKacheong Poon * to a segment. If it returns B_FALSE, TCP should not respond. 3260721fffe3SKacheong Poon */ 3261721fffe3SKacheong Poon static boolean_t 3262721fffe3SKacheong Poon tcp_send_rst_chk(tcp_stack_t *tcps) 3263721fffe3SKacheong Poon { 3264721fffe3SKacheong Poon int64_t now; 3265721fffe3SKacheong Poon 3266721fffe3SKacheong Poon /* 3267721fffe3SKacheong Poon * TCP needs to protect itself from generating too many RSTs. 3268721fffe3SKacheong Poon * This can be a DoS attack by sending us random segments 3269721fffe3SKacheong Poon * soliciting RSTs. 3270721fffe3SKacheong Poon * 3271721fffe3SKacheong Poon * What we do here is to have a limit of tcp_rst_sent_rate RSTs 3272721fffe3SKacheong Poon * in each 1 second interval. In this way, TCP still generate 3273721fffe3SKacheong Poon * RSTs in normal cases but when under attack, the impact is 3274721fffe3SKacheong Poon * limited. 
3275721fffe3SKacheong Poon */ 3276721fffe3SKacheong Poon if (tcps->tcps_rst_sent_rate_enabled != 0) { 3277721fffe3SKacheong Poon now = ddi_get_lbolt64(); 3278721fffe3SKacheong Poon if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) > 3279721fffe3SKacheong Poon 1*SECONDS) { 3280721fffe3SKacheong Poon tcps->tcps_last_rst_intrvl = now; 3281721fffe3SKacheong Poon tcps->tcps_rst_cnt = 1; 3282721fffe3SKacheong Poon } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) { 3283721fffe3SKacheong Poon return (B_FALSE); 3284721fffe3SKacheong Poon } 3285721fffe3SKacheong Poon } 3286721fffe3SKacheong Poon return (B_TRUE); 3287721fffe3SKacheong Poon } 3288721fffe3SKacheong Poon 3289721fffe3SKacheong Poon /* 3290721fffe3SKacheong Poon * This function handles all retransmissions if SACK is enabled for this 3291721fffe3SKacheong Poon * connection. First it calculates how many segments can be retransmitted 3292721fffe3SKacheong Poon * based on tcp_pipe. Then it goes thru the notsack list to find eligible 3293721fffe3SKacheong Poon * segments. A segment is eligible if sack_cnt for that segment is greater 3294721fffe3SKacheong Poon * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 3295721fffe3SKacheong Poon * all eligible segments, it checks to see if TCP can send some new segments 3296721fffe3SKacheong Poon * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). 3297721fffe3SKacheong Poon * 3298721fffe3SKacheong Poon * Parameters: 3299721fffe3SKacheong Poon * tcp_t *tcp: the tcp structure of the connection. 3300721fffe3SKacheong Poon * uint_t *flags: in return, appropriate value will be set for 3301721fffe3SKacheong Poon * tcp_input_data(). 
3302721fffe3SKacheong Poon */ 3303721fffe3SKacheong Poon void 3304721fffe3SKacheong Poon tcp_sack_rexmit(tcp_t *tcp, uint_t *flags) 3305721fffe3SKacheong Poon { 3306721fffe3SKacheong Poon notsack_blk_t *notsack_blk; 3307721fffe3SKacheong Poon int32_t usable_swnd; 3308721fffe3SKacheong Poon int32_t mss; 3309721fffe3SKacheong Poon uint32_t seg_len; 3310721fffe3SKacheong Poon mblk_t *xmit_mp; 3311721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 3312721fffe3SKacheong Poon 3313721fffe3SKacheong Poon ASSERT(tcp->tcp_notsack_list != NULL); 3314721fffe3SKacheong Poon ASSERT(tcp->tcp_rexmit == B_FALSE); 3315721fffe3SKacheong Poon 3316721fffe3SKacheong Poon /* Defensive coding in case there is a bug... */ 3317721fffe3SKacheong Poon if (tcp->tcp_notsack_list == NULL) { 3318721fffe3SKacheong Poon return; 3319721fffe3SKacheong Poon } 3320721fffe3SKacheong Poon notsack_blk = tcp->tcp_notsack_list; 3321721fffe3SKacheong Poon mss = tcp->tcp_mss; 3322721fffe3SKacheong Poon 3323721fffe3SKacheong Poon /* 3324721fffe3SKacheong Poon * Limit the num of outstanding data in the network to be 3325721fffe3SKacheong Poon * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 3326721fffe3SKacheong Poon */ 3327721fffe3SKacheong Poon usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 3328721fffe3SKacheong Poon 3329721fffe3SKacheong Poon /* At least retransmit 1 MSS of data. */ 3330721fffe3SKacheong Poon if (usable_swnd <= 0) { 3331721fffe3SKacheong Poon usable_swnd = mss; 3332721fffe3SKacheong Poon } 3333721fffe3SKacheong Poon 3334721fffe3SKacheong Poon /* Make sure no new RTT samples will be taken. 
*/ 3335721fffe3SKacheong Poon tcp->tcp_csuna = tcp->tcp_snxt; 3336721fffe3SKacheong Poon 3337721fffe3SKacheong Poon notsack_blk = tcp->tcp_notsack_list; 3338721fffe3SKacheong Poon while (usable_swnd > 0) { 3339721fffe3SKacheong Poon mblk_t *snxt_mp, *tmp_mp; 3340721fffe3SKacheong Poon tcp_seq begin = tcp->tcp_sack_snxt; 3341721fffe3SKacheong Poon tcp_seq end; 3342721fffe3SKacheong Poon int32_t off; 3343721fffe3SKacheong Poon 3344721fffe3SKacheong Poon for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 3345721fffe3SKacheong Poon if (SEQ_GT(notsack_blk->end, begin) && 3346721fffe3SKacheong Poon (notsack_blk->sack_cnt >= 3347721fffe3SKacheong Poon tcps->tcps_dupack_fast_retransmit)) { 3348721fffe3SKacheong Poon end = notsack_blk->end; 3349721fffe3SKacheong Poon if (SEQ_LT(begin, notsack_blk->begin)) { 3350721fffe3SKacheong Poon begin = notsack_blk->begin; 3351721fffe3SKacheong Poon } 3352721fffe3SKacheong Poon break; 3353721fffe3SKacheong Poon } 3354721fffe3SKacheong Poon } 3355721fffe3SKacheong Poon /* 3356721fffe3SKacheong Poon * All holes are filled. Manipulate tcp_cwnd to send more 3357721fffe3SKacheong Poon * if we can. Note that after the SACK recovery, tcp_cwnd is 3358721fffe3SKacheong Poon * set to tcp_cwnd_ssthresh. 
3359721fffe3SKacheong Poon */ 3360721fffe3SKacheong Poon if (notsack_blk == NULL) { 3361721fffe3SKacheong Poon usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 3362721fffe3SKacheong Poon if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { 3363721fffe3SKacheong Poon tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 3364721fffe3SKacheong Poon ASSERT(tcp->tcp_cwnd > 0); 3365721fffe3SKacheong Poon return; 3366721fffe3SKacheong Poon } else { 3367721fffe3SKacheong Poon usable_swnd = usable_swnd / mss; 3368721fffe3SKacheong Poon tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 3369721fffe3SKacheong Poon MAX(usable_swnd * mss, mss); 3370721fffe3SKacheong Poon *flags |= TH_XMIT_NEEDED; 3371721fffe3SKacheong Poon return; 3372721fffe3SKacheong Poon } 3373721fffe3SKacheong Poon } 3374721fffe3SKacheong Poon 3375721fffe3SKacheong Poon /* 3376721fffe3SKacheong Poon * Note that we may send more than usable_swnd allows here 3377721fffe3SKacheong Poon * because of round off, but no more than 1 MSS of data. 3378721fffe3SKacheong Poon */ 3379721fffe3SKacheong Poon seg_len = end - begin; 3380721fffe3SKacheong Poon if (seg_len > mss) 3381721fffe3SKacheong Poon seg_len = mss; 3382721fffe3SKacheong Poon snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 3383721fffe3SKacheong Poon ASSERT(snxt_mp != NULL); 3384721fffe3SKacheong Poon /* This should not happen. Defensive coding again... 
*/ 3385721fffe3SKacheong Poon if (snxt_mp == NULL) { 3386721fffe3SKacheong Poon return; 3387721fffe3SKacheong Poon } 3388721fffe3SKacheong Poon 3389721fffe3SKacheong Poon xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 3390721fffe3SKacheong Poon &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 3391721fffe3SKacheong Poon if (xmit_mp == NULL) 3392721fffe3SKacheong Poon return; 3393721fffe3SKacheong Poon 3394721fffe3SKacheong Poon usable_swnd -= seg_len; 3395721fffe3SKacheong Poon tcp->tcp_pipe += seg_len; 3396721fffe3SKacheong Poon tcp->tcp_sack_snxt = begin + seg_len; 3397721fffe3SKacheong Poon 3398721fffe3SKacheong Poon tcp_send_data(tcp, xmit_mp); 3399721fffe3SKacheong Poon 3400721fffe3SKacheong Poon /* 3401721fffe3SKacheong Poon * Update the send timestamp to avoid false retransmission. 3402721fffe3SKacheong Poon */ 3403721fffe3SKacheong Poon snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); 3404721fffe3SKacheong Poon 3405721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpRetransSegs); 3406721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len); 3407721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs); 3408721fffe3SKacheong Poon /* 3409721fffe3SKacheong Poon * Update tcp_rexmit_max to extend this SACK recovery phase. 3410721fffe3SKacheong Poon * This happens when new data sent during fast recovery is 3411721fffe3SKacheong Poon * also lost. If TCP retransmits those new data, it needs 3412721fffe3SKacheong Poon * to extend SACK recover phase to avoid starting another 3413721fffe3SKacheong Poon * fast retransmit/recovery unnecessarily. 
3414721fffe3SKacheong Poon */ 3415721fffe3SKacheong Poon if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 3416721fffe3SKacheong Poon tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 3417721fffe3SKacheong Poon } 3418721fffe3SKacheong Poon } 3419721fffe3SKacheong Poon } 3420721fffe3SKacheong Poon 3421721fffe3SKacheong Poon /* 3422721fffe3SKacheong Poon * tcp_ss_rexmit() is called to do slow start retransmission after a timeout 3423721fffe3SKacheong Poon * or ICMP errors. 3424721fffe3SKacheong Poon */ 3425721fffe3SKacheong Poon void 3426721fffe3SKacheong Poon tcp_ss_rexmit(tcp_t *tcp) 3427721fffe3SKacheong Poon { 3428721fffe3SKacheong Poon uint32_t snxt; 3429721fffe3SKacheong Poon uint32_t smax; 3430721fffe3SKacheong Poon int32_t win; 3431721fffe3SKacheong Poon int32_t mss; 3432721fffe3SKacheong Poon int32_t off; 3433721fffe3SKacheong Poon mblk_t *snxt_mp; 3434721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 3435721fffe3SKacheong Poon 3436721fffe3SKacheong Poon /* 3437721fffe3SKacheong Poon * Note that tcp_rexmit can be set even though TCP has retransmitted 3438721fffe3SKacheong Poon * all unack'ed segments. 
3439721fffe3SKacheong Poon */ 3440721fffe3SKacheong Poon if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 3441721fffe3SKacheong Poon smax = tcp->tcp_rexmit_max; 3442721fffe3SKacheong Poon snxt = tcp->tcp_rexmit_nxt; 3443721fffe3SKacheong Poon if (SEQ_LT(snxt, tcp->tcp_suna)) { 3444721fffe3SKacheong Poon snxt = tcp->tcp_suna; 3445721fffe3SKacheong Poon } 3446721fffe3SKacheong Poon win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 3447721fffe3SKacheong Poon win -= snxt - tcp->tcp_suna; 3448721fffe3SKacheong Poon mss = tcp->tcp_mss; 3449721fffe3SKacheong Poon snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 3450721fffe3SKacheong Poon 3451633fc3a6SSebastien Roy while (SEQ_LT(snxt, smax) && (win > 0) && (snxt_mp != NULL)) { 3452721fffe3SKacheong Poon mblk_t *xmit_mp; 3453721fffe3SKacheong Poon mblk_t *old_snxt_mp = snxt_mp; 3454721fffe3SKacheong Poon uint32_t cnt = mss; 3455721fffe3SKacheong Poon 3456721fffe3SKacheong Poon if (win < cnt) { 3457721fffe3SKacheong Poon cnt = win; 3458721fffe3SKacheong Poon } 3459721fffe3SKacheong Poon if (SEQ_GT(snxt + cnt, smax)) { 3460721fffe3SKacheong Poon cnt = smax - snxt; 3461721fffe3SKacheong Poon } 3462721fffe3SKacheong Poon xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 3463721fffe3SKacheong Poon &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 3464721fffe3SKacheong Poon if (xmit_mp == NULL) 3465721fffe3SKacheong Poon return; 3466721fffe3SKacheong Poon 3467721fffe3SKacheong Poon tcp_send_data(tcp, xmit_mp); 3468721fffe3SKacheong Poon 3469721fffe3SKacheong Poon snxt += cnt; 3470721fffe3SKacheong Poon win -= cnt; 3471721fffe3SKacheong Poon /* 3472721fffe3SKacheong Poon * Update the send timestamp to avoid false 3473721fffe3SKacheong Poon * retransmission. 
3474721fffe3SKacheong Poon */ 3475721fffe3SKacheong Poon old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); 3476721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpRetransSegs); 3477721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt); 3478721fffe3SKacheong Poon 3479721fffe3SKacheong Poon tcp->tcp_rexmit_nxt = snxt; 3480721fffe3SKacheong Poon } 3481721fffe3SKacheong Poon /* 3482721fffe3SKacheong Poon * If we have transmitted all we have at the time 3483721fffe3SKacheong Poon * we started the retranmission, we can leave 3484721fffe3SKacheong Poon * the rest of the job to tcp_wput_data(). But we 3485721fffe3SKacheong Poon * need to check the send window first. If the 3486721fffe3SKacheong Poon * win is not 0, go on with tcp_wput_data(). 3487721fffe3SKacheong Poon */ 3488721fffe3SKacheong Poon if (SEQ_LT(snxt, smax) || win == 0) { 3489721fffe3SKacheong Poon return; 3490721fffe3SKacheong Poon } 3491721fffe3SKacheong Poon } 3492721fffe3SKacheong Poon /* Only call tcp_wput_data() if there is data to be sent. */ 3493721fffe3SKacheong Poon if (tcp->tcp_unsent) { 3494721fffe3SKacheong Poon tcp_wput_data(tcp, NULL, B_FALSE); 3495721fffe3SKacheong Poon } 3496721fffe3SKacheong Poon } 3497721fffe3SKacheong Poon 3498721fffe3SKacheong Poon /* 3499721fffe3SKacheong Poon * Do slow start retransmission after ICMP errors of PMTU changes. 3500721fffe3SKacheong Poon */ 3501721fffe3SKacheong Poon void 3502721fffe3SKacheong Poon tcp_rexmit_after_error(tcp_t *tcp) 3503721fffe3SKacheong Poon { 3504721fffe3SKacheong Poon /* 3505721fffe3SKacheong Poon * All sent data has been acknowledged or no data left to send, just 3506721fffe3SKacheong Poon * to return. 
3507721fffe3SKacheong Poon */ 3508721fffe3SKacheong Poon if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) || 3509721fffe3SKacheong Poon (tcp->tcp_xmit_head == NULL)) 3510721fffe3SKacheong Poon return; 3511721fffe3SKacheong Poon 3512721fffe3SKacheong Poon if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0)) 3513721fffe3SKacheong Poon tcp->tcp_rexmit_max = tcp->tcp_fss; 3514721fffe3SKacheong Poon else 3515721fffe3SKacheong Poon tcp->tcp_rexmit_max = tcp->tcp_snxt; 3516721fffe3SKacheong Poon 3517721fffe3SKacheong Poon tcp->tcp_rexmit_nxt = tcp->tcp_suna; 3518721fffe3SKacheong Poon tcp->tcp_rexmit = B_TRUE; 3519721fffe3SKacheong Poon tcp->tcp_dupack_cnt = 0; 3520721fffe3SKacheong Poon tcp_ss_rexmit(tcp); 3521721fffe3SKacheong Poon } 3522721fffe3SKacheong Poon 3523721fffe3SKacheong Poon /* 3524721fffe3SKacheong Poon * tcp_get_seg_mp() is called to get the pointer to a segment in the 3525721fffe3SKacheong Poon * send queue which starts at the given sequence number. If the given 3526721fffe3SKacheong Poon * sequence number is equal to last valid sequence number (tcp_snxt), the 3527721fffe3SKacheong Poon * returned mblk is the last valid mblk, and off is set to the length of 3528721fffe3SKacheong Poon * that mblk. 3529721fffe3SKacheong Poon * 3530721fffe3SKacheong Poon * send queue which starts at the given seq. no. 3531721fffe3SKacheong Poon * 3532721fffe3SKacheong Poon * Parameters: 3533721fffe3SKacheong Poon * tcp_t *tcp: the tcp instance pointer. 3534721fffe3SKacheong Poon * uint32_t seq: the starting seq. no of the requested segment. 3535721fffe3SKacheong Poon * int32_t *off: after the execution, *off will be the offset to 3536721fffe3SKacheong Poon * the returned mblk which points to the requested seq no. 3537721fffe3SKacheong Poon * It is the caller's responsibility to send in a non-null off. 3538721fffe3SKacheong Poon * 3539721fffe3SKacheong Poon * Return: 3540721fffe3SKacheong Poon * A mblk_t pointer pointing to the requested segment in send queue. 
3541721fffe3SKacheong Poon */ 3542721fffe3SKacheong Poon static mblk_t * 3543721fffe3SKacheong Poon tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 3544721fffe3SKacheong Poon { 3545721fffe3SKacheong Poon int32_t cnt; 3546721fffe3SKacheong Poon mblk_t *mp; 3547721fffe3SKacheong Poon 3548721fffe3SKacheong Poon /* Defensive coding. Make sure we don't send incorrect data. */ 3549721fffe3SKacheong Poon if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt)) 3550721fffe3SKacheong Poon return (NULL); 3551721fffe3SKacheong Poon 3552721fffe3SKacheong Poon cnt = seq - tcp->tcp_suna; 3553721fffe3SKacheong Poon mp = tcp->tcp_xmit_head; 3554721fffe3SKacheong Poon while (cnt > 0 && mp != NULL) { 3555721fffe3SKacheong Poon cnt -= mp->b_wptr - mp->b_rptr; 3556721fffe3SKacheong Poon if (cnt <= 0) { 3557721fffe3SKacheong Poon cnt += mp->b_wptr - mp->b_rptr; 3558721fffe3SKacheong Poon break; 3559721fffe3SKacheong Poon } 3560721fffe3SKacheong Poon mp = mp->b_cont; 3561721fffe3SKacheong Poon } 3562721fffe3SKacheong Poon ASSERT(mp != NULL); 3563721fffe3SKacheong Poon *off = cnt; 3564721fffe3SKacheong Poon return (mp); 3565721fffe3SKacheong Poon } 3566721fffe3SKacheong Poon 3567721fffe3SKacheong Poon /* 3568721fffe3SKacheong Poon * This routine adjusts next-to-send sequence number variables, in the 3569721fffe3SKacheong Poon * case where the reciever has shrunk it's window. 
3570721fffe3SKacheong Poon */ 3571721fffe3SKacheong Poon void 3572721fffe3SKacheong Poon tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt) 3573721fffe3SKacheong Poon { 3574721fffe3SKacheong Poon mblk_t *xmit_tail; 3575721fffe3SKacheong Poon int32_t offset; 3576721fffe3SKacheong Poon 3577721fffe3SKacheong Poon tcp->tcp_snxt = snxt; 3578721fffe3SKacheong Poon 3579721fffe3SKacheong Poon /* Get the mblk, and the offset in it, as per the shrunk window */ 3580721fffe3SKacheong Poon xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset); 3581721fffe3SKacheong Poon ASSERT(xmit_tail != NULL); 3582721fffe3SKacheong Poon tcp->tcp_xmit_tail = xmit_tail; 3583721fffe3SKacheong Poon tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - 3584721fffe3SKacheong Poon xmit_tail->b_rptr - offset; 3585721fffe3SKacheong Poon } 3586721fffe3SKacheong Poon 3587721fffe3SKacheong Poon /* 3588721fffe3SKacheong Poon * This handles the case when the receiver has shrunk its win. Per RFC 1122 3589721fffe3SKacheong Poon * if the receiver shrinks the window, i.e. moves the right window to the 3590721fffe3SKacheong Poon * left, the we should not send new data, but should retransmit normally the 3591721fffe3SKacheong Poon * old unacked data between suna and suna + swnd. We might has sent data 3592721fffe3SKacheong Poon * that is now outside the new window, pretend that we didn't send it. 
3593721fffe3SKacheong Poon */ 3594721fffe3SKacheong Poon static void 3595721fffe3SKacheong Poon tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) 3596721fffe3SKacheong Poon { 3597721fffe3SKacheong Poon uint32_t snxt = tcp->tcp_snxt; 3598721fffe3SKacheong Poon 3599721fffe3SKacheong Poon ASSERT(shrunk_count > 0); 3600721fffe3SKacheong Poon 3601721fffe3SKacheong Poon if (!tcp->tcp_is_wnd_shrnk) { 3602721fffe3SKacheong Poon tcp->tcp_snxt_shrunk = snxt; 3603721fffe3SKacheong Poon tcp->tcp_is_wnd_shrnk = B_TRUE; 3604721fffe3SKacheong Poon } else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) { 3605721fffe3SKacheong Poon tcp->tcp_snxt_shrunk = snxt; 3606721fffe3SKacheong Poon } 3607721fffe3SKacheong Poon 3608721fffe3SKacheong Poon /* Pretend we didn't send the data outside the window */ 3609721fffe3SKacheong Poon snxt -= shrunk_count; 3610721fffe3SKacheong Poon 3611721fffe3SKacheong Poon /* Reset all the values per the now shrunk window */ 3612721fffe3SKacheong Poon tcp_update_xmit_tail(tcp, snxt); 3613721fffe3SKacheong Poon tcp->tcp_unsent += shrunk_count; 3614721fffe3SKacheong Poon 3615721fffe3SKacheong Poon /* 3616721fffe3SKacheong Poon * If the SACK option is set, delete the entire list of 3617721fffe3SKacheong Poon * notsack'ed blocks. 3618721fffe3SKacheong Poon */ 3619721fffe3SKacheong Poon TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 3620721fffe3SKacheong Poon 3621721fffe3SKacheong Poon if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0) 3622721fffe3SKacheong Poon /* 3623721fffe3SKacheong Poon * Make sure the timer is running so that we will probe a zero 3624721fffe3SKacheong Poon * window. 
3625721fffe3SKacheong Poon */ 3626721fffe3SKacheong Poon TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3627721fffe3SKacheong Poon } 3628721fffe3SKacheong Poon 3629721fffe3SKacheong Poon /* 3630721fffe3SKacheong Poon * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header 3631721fffe3SKacheong Poon * with the template header, as well as other options such as time-stamp, 3632721fffe3SKacheong Poon * ECN and/or SACK. 3633721fffe3SKacheong Poon */ 3634721fffe3SKacheong Poon static void 3635721fffe3SKacheong Poon tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) 3636721fffe3SKacheong Poon { 3637721fffe3SKacheong Poon tcpha_t *tcp_tmpl, *tcpha; 3638721fffe3SKacheong Poon uint32_t *dst, *src; 3639721fffe3SKacheong Poon int hdrlen; 3640721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 3641721fffe3SKacheong Poon 3642721fffe3SKacheong Poon ASSERT(OK_32PTR(rptr)); 3643721fffe3SKacheong Poon 3644721fffe3SKacheong Poon /* Template header */ 3645721fffe3SKacheong Poon tcp_tmpl = tcp->tcp_tcpha; 3646721fffe3SKacheong Poon 3647721fffe3SKacheong Poon /* Header of outgoing packet */ 3648721fffe3SKacheong Poon tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); 3649721fffe3SKacheong Poon 3650721fffe3SKacheong Poon /* dst and src are opaque 32-bit fields, used for copying */ 3651721fffe3SKacheong Poon dst = (uint32_t *)rptr; 3652721fffe3SKacheong Poon src = (uint32_t *)connp->conn_ht_iphc; 3653721fffe3SKacheong Poon hdrlen = connp->conn_ht_iphc_len; 3654721fffe3SKacheong Poon 3655721fffe3SKacheong Poon /* Fill time-stamp option if needed */ 3656721fffe3SKacheong Poon if (tcp->tcp_snd_ts_ok) { 3657721fffe3SKacheong Poon U32_TO_BE32((uint32_t)now, 3658721fffe3SKacheong Poon (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); 3659721fffe3SKacheong Poon U32_TO_BE32(tcp->tcp_ts_recent, 3660721fffe3SKacheong Poon (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); 3661721fffe3SKacheong Poon } else { 3662721fffe3SKacheong Poon 
ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); 3663721fffe3SKacheong Poon } 3664721fffe3SKacheong Poon 3665721fffe3SKacheong Poon /* 3666721fffe3SKacheong Poon * Copy the template header; is this really more efficient than 3667721fffe3SKacheong Poon * calling bcopy()? For simple IPv4/TCP, it may be the case, 3668721fffe3SKacheong Poon * but perhaps not for other scenarios. 3669721fffe3SKacheong Poon */ 3670721fffe3SKacheong Poon dst[0] = src[0]; 3671721fffe3SKacheong Poon dst[1] = src[1]; 3672721fffe3SKacheong Poon dst[2] = src[2]; 3673721fffe3SKacheong Poon dst[3] = src[3]; 3674721fffe3SKacheong Poon dst[4] = src[4]; 3675721fffe3SKacheong Poon dst[5] = src[5]; 3676721fffe3SKacheong Poon dst[6] = src[6]; 3677721fffe3SKacheong Poon dst[7] = src[7]; 3678721fffe3SKacheong Poon dst[8] = src[8]; 3679721fffe3SKacheong Poon dst[9] = src[9]; 3680721fffe3SKacheong Poon if (hdrlen -= 40) { 3681721fffe3SKacheong Poon hdrlen >>= 2; 3682721fffe3SKacheong Poon dst += 10; 3683721fffe3SKacheong Poon src += 10; 3684721fffe3SKacheong Poon do { 3685721fffe3SKacheong Poon *dst++ = *src++; 3686721fffe3SKacheong Poon } while (--hdrlen); 3687721fffe3SKacheong Poon } 3688721fffe3SKacheong Poon 3689721fffe3SKacheong Poon /* 3690721fffe3SKacheong Poon * Set the ECN info in the TCP header if it is not a zero 3691721fffe3SKacheong Poon * window probe. Zero window probe is only sent in 3692721fffe3SKacheong Poon * tcp_wput_data() and tcp_timer(). 
3693721fffe3SKacheong Poon */ 3694721fffe3SKacheong Poon if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 3695721fffe3SKacheong Poon TCP_SET_ECT(tcp, rptr); 3696721fffe3SKacheong Poon 3697721fffe3SKacheong Poon if (tcp->tcp_ecn_echo_on) 3698721fffe3SKacheong Poon tcpha->tha_flags |= TH_ECE; 3699721fffe3SKacheong Poon if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 3700721fffe3SKacheong Poon tcpha->tha_flags |= TH_CWR; 3701721fffe3SKacheong Poon tcp->tcp_ecn_cwr_sent = B_TRUE; 3702721fffe3SKacheong Poon } 3703721fffe3SKacheong Poon } 3704721fffe3SKacheong Poon 3705721fffe3SKacheong Poon /* Fill in SACK options */ 3706721fffe3SKacheong Poon if (num_sack_blk > 0) { 3707721fffe3SKacheong Poon uchar_t *wptr = rptr + connp->conn_ht_iphc_len; 3708721fffe3SKacheong Poon sack_blk_t *tmp; 3709721fffe3SKacheong Poon int32_t i; 3710721fffe3SKacheong Poon 3711721fffe3SKacheong Poon wptr[0] = TCPOPT_NOP; 3712721fffe3SKacheong Poon wptr[1] = TCPOPT_NOP; 3713721fffe3SKacheong Poon wptr[2] = TCPOPT_SACK; 3714721fffe3SKacheong Poon wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 3715721fffe3SKacheong Poon sizeof (sack_blk_t); 3716721fffe3SKacheong Poon wptr += TCPOPT_REAL_SACK_LEN; 3717721fffe3SKacheong Poon 3718721fffe3SKacheong Poon tmp = tcp->tcp_sack_list; 3719721fffe3SKacheong Poon for (i = 0; i < num_sack_blk; i++) { 3720721fffe3SKacheong Poon U32_TO_BE32(tmp[i].begin, wptr); 3721721fffe3SKacheong Poon wptr += sizeof (tcp_seq); 3722721fffe3SKacheong Poon U32_TO_BE32(tmp[i].end, wptr); 3723721fffe3SKacheong Poon wptr += sizeof (tcp_seq); 3724721fffe3SKacheong Poon } 3725721fffe3SKacheong Poon tcpha->tha_offset_and_reserved += 3726721fffe3SKacheong Poon ((num_sack_blk * 2 + 1) << 4); 3727721fffe3SKacheong Poon } 3728721fffe3SKacheong Poon } 3729