17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5adccf2f2Skcpoon * Common Development and Distribution License (the "License"). 6adccf2f2Skcpoon * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 21adccf2f2Skcpoon 227c478bd9Sstevel@tonic-gate /* 2366cd0f60SKacheong Poon * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 24*4778e36eSJohn Levon * Copyright 2019 Joyent, Inc. 253d0a255cSGarrett D'Amore * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 26633fc3a6SSebastien Roy * Copyright (c) 2013,2014 by Delphix. All rights reserved. 27a1ca8b43SDan McDonald * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. 287c478bd9Sstevel@tonic-gate */ 297c478bd9Sstevel@tonic-gate /* Copyright (c) 1990 Mentat Inc. 
*/ 307c478bd9Sstevel@tonic-gate 317c478bd9Sstevel@tonic-gate #include <sys/types.h> 327c478bd9Sstevel@tonic-gate #include <sys/stream.h> 337c478bd9Sstevel@tonic-gate #include <sys/strsun.h> 347c478bd9Sstevel@tonic-gate #include <sys/strsubr.h> 357c478bd9Sstevel@tonic-gate #include <sys/stropts.h> 367c478bd9Sstevel@tonic-gate #include <sys/strlog.h> 377c478bd9Sstevel@tonic-gate #define _SUN_TPI_VERSION 2 387c478bd9Sstevel@tonic-gate #include <sys/tihdr.h> 397c478bd9Sstevel@tonic-gate #include <sys/timod.h> 407c478bd9Sstevel@tonic-gate #include <sys/ddi.h> 417c478bd9Sstevel@tonic-gate #include <sys/sunddi.h> 427c478bd9Sstevel@tonic-gate #include <sys/suntpi.h> 437c478bd9Sstevel@tonic-gate #include <sys/xti_inet.h> 447c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 457c478bd9Sstevel@tonic-gate #include <sys/debug.h> 46381a2a9aSdr146992 #include <sys/sdt.h> 477c478bd9Sstevel@tonic-gate #include <sys/vtrace.h> 487c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 497c478bd9Sstevel@tonic-gate #include <sys/ethernet.h> 507c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 517c478bd9Sstevel@tonic-gate #include <sys/dlpi.h> 527c478bd9Sstevel@tonic-gate #include <sys/pattr.h> 537c478bd9Sstevel@tonic-gate #include <sys/policy.h> 5445916cd2Sjpk #include <sys/priv.h> 557c478bd9Sstevel@tonic-gate #include <sys/zone.h> 56f4b3ec61Sdh155122 #include <sys/sunldi.h> 577c478bd9Sstevel@tonic-gate 587c478bd9Sstevel@tonic-gate #include <sys/errno.h> 597c478bd9Sstevel@tonic-gate #include <sys/signal.h> 607c478bd9Sstevel@tonic-gate #include <sys/socket.h> 610f1702c5SYu Xiangning #include <sys/socketvar.h> 627c478bd9Sstevel@tonic-gate #include <sys/sockio.h> 637c478bd9Sstevel@tonic-gate #include <sys/isa_defs.h> 647c478bd9Sstevel@tonic-gate #include <sys/md5.h> 657c478bd9Sstevel@tonic-gate #include <sys/random.h> 6617169044Sbrutus #include <sys/uio.h> 6787a18d3fSMadhavan Venkataraman #include <sys/systm.h> 687c478bd9Sstevel@tonic-gate #include <netinet/in.h> 697c478bd9Sstevel@tonic-gate 
#include <netinet/tcp.h> 707c478bd9Sstevel@tonic-gate #include <netinet/ip6.h> 717c478bd9Sstevel@tonic-gate #include <netinet/icmp6.h> 727c478bd9Sstevel@tonic-gate #include <net/if.h> 737c478bd9Sstevel@tonic-gate #include <net/route.h> 747c478bd9Sstevel@tonic-gate #include <inet/ipsec_impl.h> 757c478bd9Sstevel@tonic-gate 767c478bd9Sstevel@tonic-gate #include <inet/common.h> 777c478bd9Sstevel@tonic-gate #include <inet/ip.h> 78ff550d0eSmasputra #include <inet/ip_impl.h> 797c478bd9Sstevel@tonic-gate #include <inet/ip6.h> 807c478bd9Sstevel@tonic-gate #include <inet/ip_ndp.h> 810f1702c5SYu Xiangning #include <inet/proto_set.h> 827c478bd9Sstevel@tonic-gate #include <inet/mib2.h> 837c478bd9Sstevel@tonic-gate #include <inet/optcom.h> 847c478bd9Sstevel@tonic-gate #include <inet/snmpcom.h> 857c478bd9Sstevel@tonic-gate #include <inet/kstatcom.h> 867c478bd9Sstevel@tonic-gate #include <inet/tcp.h> 87ff550d0eSmasputra #include <inet/tcp_impl.h> 88721fffe3SKacheong Poon #include <inet/tcp_cluster.h> 89ae6aa22aSVenugopal Iyer #include <inet/udp_impl.h> 907c478bd9Sstevel@tonic-gate #include <net/pfkeyv2.h> 917c478bd9Sstevel@tonic-gate #include <inet/ipdrop.h> 927c478bd9Sstevel@tonic-gate 937c478bd9Sstevel@tonic-gate #include <inet/ipclassifier.h> 947c478bd9Sstevel@tonic-gate #include <inet/ip_ire.h> 95c793af95Ssangeeta #include <inet/ip_ftable.h> 967c478bd9Sstevel@tonic-gate #include <inet/ip_if.h> 977c478bd9Sstevel@tonic-gate #include <inet/ipp_common.h> 98bd670b35SErik Nordmark #include <inet/ip_rts.h> 99381a2a9aSdr146992 #include <inet/ip_netinfo.h> 100da14cebeSEric Cheng #include <sys/squeue_impl.h> 1017c478bd9Sstevel@tonic-gate #include <sys/squeue.h> 10245916cd2Sjpk #include <sys/tsol/label.h> 10345916cd2Sjpk #include <sys/tsol/tnet.h> 10445916cd2Sjpk #include <rpc/pmap_prot.h> 10587a18d3fSMadhavan Venkataraman #include <sys/callo.h> 1067c478bd9Sstevel@tonic-gate 1077c478bd9Sstevel@tonic-gate /* 1087c478bd9Sstevel@tonic-gate * TCP Notes: aka FireEngine Phase I (PSARC 
2002/433)
 *
 * (Read the detailed design doc in PSARC case directory)
 *
 * The entire tcp state is contained in tcp_t and conn_t structures
 * which are allocated in tandem using ipcl_conn_create() and passing
 * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect
 * the references on the tcp_t. The tcp_t structure is never compressed
 * and packets always land on the correct TCP perimeter from the time
 * eager is created till the time tcp_t dies (as such the old mentat
 * TCP global queue is not used for detached state and no IPSEC checking
 * is required). The global queue is still allocated to send out resets
 * for connections which have no listeners and IP directly calls
 * tcp_xmit_listeners_reset() which does any policy check.
 *
 * Protection and Synchronisation mechanism:
 *
 * The tcp data structure does not use any kind of lock for protecting
 * its state but instead uses 'squeues' for mutual exclusion from various
 * read and write side threads. To access a tcp member, the thread should
 * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
 * or SQ_NODRAIN).
 * Since the squeues allow a direct function call, the caller
 * can pass any tcp function having prototype of edesc_t as argument
 * (different from traditional STREAMs model where packets come in only
 * designated entry points). The functions that can be directly
 * called via squeue are listed before the usual function prototypes.
 *
 * Referencing:
 *
 * TCP is MT-Hot and we use a reference based scheme to make sure that the
 * tcp structure doesn't disappear when it's needed. When the application
 * creates an outgoing connection or accepts an incoming connection, we
 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
 * The IP reference is just a symbolic reference since ip_tcpclose()
 * looks at tcp structure after tcp_close_output() returns which could
 * have dropped the last TCP reference. So as long as the connection is
 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
 * conn_t. The classifier puts its own reference when the connection is
 * inserted in listen or connected hash. Anytime a thread needs to enter
 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
 * on write side or by doing a classify on read side and then puts a
 * reference on the conn before doing squeue_enter/tryenter/fill.
 * For read side, the classifier itself puts the reference under fanout lock
 * to make sure that tcp can't disappear before it gets processed. The
 * squeue will drop this reference automatically so the called function
 * doesn't have to do a DEC_REF.
 *
 * Opening a new connection:
 *
 * The outgoing connection open is pretty simple. tcp_open() does the
 * work in creating the conn/tcp structure and initializing it. The
 * squeue assignment is done based on the CPU the application
 * is running on. So for outbound connections, processing is always done
 * on application CPU which might be different from the incoming CPU
 * being interrupted by the NIC. An optimal way would be to figure out
 * the NIC <-> CPU binding at listen time, and assign the outgoing
 * connection to the squeue attached to the CPU that will be interrupted
 * for incoming packets (we know the NIC based on the bind IP address).
 * This might seem like a problem if more data is going out but the
 * fact is that in most cases the transmit is ACK driven transmit where
 * the outgoing data normally sits on TCP's xmit queue waiting to be
 * transmitted.
 *
 * Accepting a connection:
 *
 * This is a more interesting case because of various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_input_listener(). But briefly, the squeue is picked by
 * ip_fanout based on the ring or the sender (if loopback).
 *
 * Closing a connection:
 *
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via squeue to do the close and mark the tcp as detached if the connection
 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps
 * its reference but tcp_close() always drops IP's reference. So if tcp was
 * not killed, it is sitting in time_wait list with 2 references - 1 for TCP
 * and 1 because it is in classifier's connected hash. This is the condition
 * we use to determine that it's OK to clean up the tcp outside of squeue
 * when time wait expires (check the ref under fanout and conn_lock and
 * if it is 2, remove it from fanout hash and kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know the tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag are used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in acceptor hash since a sockfs listener can never
 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
 * since eager has already been allocated and the accept now happens
 * on acceptor STREAM. There is a big blob of comment on top of
 * tcp_input_listener explaining the new accept. When socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to old
 * behaviour. Once tcp_issocket is unset, it's never set again for the
 * life of that connection.
2117c478bd9Sstevel@tonic-gate * 2127c478bd9Sstevel@tonic-gate * IPsec notes : 2137c478bd9Sstevel@tonic-gate * 2147c478bd9Sstevel@tonic-gate * Since a packet is always executed on the correct TCP perimeter 2157c478bd9Sstevel@tonic-gate * all IPsec processing is defered to IP including checking new 2167c478bd9Sstevel@tonic-gate * connections and setting IPSEC policies for new connection. The 2177c478bd9Sstevel@tonic-gate * only exception is tcp_xmit_listeners_reset() which is called 2187c478bd9Sstevel@tonic-gate * directly from IP and needs to policy check to see if TH_RST 2197c478bd9Sstevel@tonic-gate * can be sent out. 2207c478bd9Sstevel@tonic-gate */ 2217c478bd9Sstevel@tonic-gate 2227c478bd9Sstevel@tonic-gate /* 2237c478bd9Sstevel@tonic-gate * Values for squeue switch: 224da14cebeSEric Cheng * 1: SQ_NODRAIN 225da14cebeSEric Cheng * 2: SQ_PROCESS 226da14cebeSEric Cheng * 3: SQ_FILL 2277c478bd9Sstevel@tonic-gate */ 228da14cebeSEric Cheng int tcp_squeue_wput = 2; /* /etc/systems */ 229da14cebeSEric Cheng int tcp_squeue_flag; 2307c478bd9Sstevel@tonic-gate 2317c478bd9Sstevel@tonic-gate /* 2321dbf515bSethindra * To prevent memory hog, limit the number of entries in tcp_free_list 2331dbf515bSethindra * to 1% of available memory / number of cpus 2341dbf515bSethindra */ 2351dbf515bSethindra uint_t tcp_free_list_max_cnt = 0; 2367c478bd9Sstevel@tonic-gate 2377c478bd9Sstevel@tonic-gate #define TIDUSZ 4096 /* transport interface data unit size */ 2387c478bd9Sstevel@tonic-gate 2397c478bd9Sstevel@tonic-gate /* 24093fcb0b9SKacheong Poon * Size of acceptor hash list. It has to be a power of 2 for hashing. 
2417c478bd9Sstevel@tonic-gate */ 242aec2a073SJerry Jelinek #define TCP_ACCEPTOR_FANOUT_SIZE 512 2437c478bd9Sstevel@tonic-gate 2447c478bd9Sstevel@tonic-gate #ifdef _ILP32 2457c478bd9Sstevel@tonic-gate #define TCP_ACCEPTOR_HASH(accid) \ 24693fcb0b9SKacheong Poon (((uint_t)(accid) >> 8) & (TCP_ACCEPTOR_FANOUT_SIZE - 1)) 2477c478bd9Sstevel@tonic-gate #else 2487c478bd9Sstevel@tonic-gate #define TCP_ACCEPTOR_HASH(accid) \ 24993fcb0b9SKacheong Poon ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1)) 2507c478bd9Sstevel@tonic-gate #endif /* _ILP32 */ 2517c478bd9Sstevel@tonic-gate 2525dd46ab5SKacheong Poon /* 2535dd46ab5SKacheong Poon * Minimum number of connections which can be created per listener. Used 2545dd46ab5SKacheong Poon * when the listener connection count is in effect. 2555dd46ab5SKacheong Poon */ 256721fffe3SKacheong Poon static uint32_t tcp_min_conn_listener = 2; 25793fcb0b9SKacheong Poon 25893fcb0b9SKacheong Poon uint32_t tcp_early_abort = 30; 25993fcb0b9SKacheong Poon 2607c478bd9Sstevel@tonic-gate /* TCP Timer control structure */ 2617c478bd9Sstevel@tonic-gate typedef struct tcpt_s { 2627c478bd9Sstevel@tonic-gate pfv_t tcpt_pfv; /* The routine we are to call */ 2637c478bd9Sstevel@tonic-gate tcp_t *tcpt_tcp; /* The parameter we are to pass in */ 2647c478bd9Sstevel@tonic-gate } tcpt_t; 2657c478bd9Sstevel@tonic-gate 2667c478bd9Sstevel@tonic-gate /* 2677c478bd9Sstevel@tonic-gate * Functions called directly via squeue having a prototype of edesc_t. 
2687c478bd9Sstevel@tonic-gate */ 269bd670b35SErik Nordmark void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, 270bd670b35SErik Nordmark ip_recv_attr_t *ira); 271bd670b35SErik Nordmark void tcp_input_data(void *arg, mblk_t *mp, void *arg2, 272bd670b35SErik Nordmark ip_recv_attr_t *ira); 273bd670b35SErik Nordmark static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, 274bd670b35SErik Nordmark ip_recv_attr_t *dummy); 2757c478bd9Sstevel@tonic-gate 2767c478bd9Sstevel@tonic-gate 2777c478bd9Sstevel@tonic-gate /* Prototype for TCP functions */ 2787c478bd9Sstevel@tonic-gate static void tcp_random_init(void); 2797c478bd9Sstevel@tonic-gate int tcp_random(void); 2800f1702c5SYu Xiangning static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, 281bd670b35SErik Nordmark in_port_t dstport, uint_t srcid); 2820f1702c5SYu Xiangning static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, 283bd670b35SErik Nordmark in_port_t dstport, uint32_t flowinfo, 284bd670b35SErik Nordmark uint_t srcid, uint32_t scope_id); 2857c478bd9Sstevel@tonic-gate static void tcp_iss_init(tcp_t *tcp); 2867c478bd9Sstevel@tonic-gate static void tcp_reinit(tcp_t *tcp); 2877c478bd9Sstevel@tonic-gate static void tcp_reinit_values(tcp_t *tcp); 2887c478bd9Sstevel@tonic-gate 2897c478bd9Sstevel@tonic-gate static void tcp_wsrv(queue_t *q); 290bd670b35SErik Nordmark static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa); 291bd670b35SErik Nordmark static void tcp_update_zcopy(tcp_t *tcp); 292bd670b35SErik Nordmark static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, 293bd670b35SErik Nordmark ixa_notify_arg_t); 294f4b3ec61Sdh155122 static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); 295f4b3ec61Sdh155122 static void tcp_stack_fini(netstackid_t stackid, void *arg); 296721fffe3SKacheong Poon 297da14cebeSEric Cheng static int tcp_squeue_switch(int); 2987c478bd9Sstevel@tonic-gate 299fc80c0dfSnordmark static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, 
boolean_t); 300fc80c0dfSnordmark static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *); 301fc80c0dfSnordmark static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *); 3027c478bd9Sstevel@tonic-gate 3037c478bd9Sstevel@tonic-gate static void tcp_squeue_add(squeue_t *); 3047c478bd9Sstevel@tonic-gate 305721fffe3SKacheong Poon struct module_info tcp_rinfo = { 306ff550d0eSmasputra TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER 3077c478bd9Sstevel@tonic-gate }; 3087c478bd9Sstevel@tonic-gate 3097c478bd9Sstevel@tonic-gate static struct module_info tcp_winfo = { 310ff550d0eSmasputra TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16 3117c478bd9Sstevel@tonic-gate }; 3127c478bd9Sstevel@tonic-gate 3137c478bd9Sstevel@tonic-gate /* 3147c478bd9Sstevel@tonic-gate * Entry points for TCP as a device. The normal case which supports 3157c478bd9Sstevel@tonic-gate * the TCP functionality. 316fc80c0dfSnordmark * We have separate open functions for the /dev/tcp and /dev/tcp6 devices. 3177c478bd9Sstevel@tonic-gate */ 318fc80c0dfSnordmark struct qinit tcp_rinitv4 = { 3190f1702c5SYu Xiangning NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo 320fc80c0dfSnordmark }; 321fc80c0dfSnordmark 322fc80c0dfSnordmark struct qinit tcp_rinitv6 = { 3230f1702c5SYu Xiangning NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo 3247c478bd9Sstevel@tonic-gate }; 3257c478bd9Sstevel@tonic-gate 3267c478bd9Sstevel@tonic-gate struct qinit tcp_winit = { 3277c478bd9Sstevel@tonic-gate (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 3287c478bd9Sstevel@tonic-gate }; 3297c478bd9Sstevel@tonic-gate 3307c478bd9Sstevel@tonic-gate /* Initial entry point for TCP in socket mode. 
*/ 3317c478bd9Sstevel@tonic-gate struct qinit tcp_sock_winit = { 3327c478bd9Sstevel@tonic-gate (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 3337c478bd9Sstevel@tonic-gate }; 3347c478bd9Sstevel@tonic-gate 3350f1702c5SYu Xiangning /* TCP entry point during fallback */ 3360f1702c5SYu Xiangning struct qinit tcp_fallback_sock_winit = { 3370f1702c5SYu Xiangning (pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo 3380f1702c5SYu Xiangning }; 3390f1702c5SYu Xiangning 3407c478bd9Sstevel@tonic-gate /* 3417c478bd9Sstevel@tonic-gate * Entry points for TCP as a acceptor STREAM opened by sockfs when doing 3427c478bd9Sstevel@tonic-gate * an accept. Avoid allocating data structures since eager has already 3437c478bd9Sstevel@tonic-gate * been created. 3447c478bd9Sstevel@tonic-gate */ 3457c478bd9Sstevel@tonic-gate struct qinit tcp_acceptor_rinit = { 346eead73cfSRao Shoaib NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo 3477c478bd9Sstevel@tonic-gate }; 3487c478bd9Sstevel@tonic-gate 3497c478bd9Sstevel@tonic-gate struct qinit tcp_acceptor_winit = { 3500f1702c5SYu Xiangning (pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo 3517c478bd9Sstevel@tonic-gate }; 3527c478bd9Sstevel@tonic-gate 353fc80c0dfSnordmark /* For AF_INET aka /dev/tcp */ 354fc80c0dfSnordmark struct streamtab tcpinfov4 = { 355fc80c0dfSnordmark &tcp_rinitv4, &tcp_winit 356fc80c0dfSnordmark }; 357fc80c0dfSnordmark 358fc80c0dfSnordmark /* For AF_INET6 aka /dev/tcp6 */ 359fc80c0dfSnordmark struct streamtab tcpinfov6 = { 360fc80c0dfSnordmark &tcp_rinitv6, &tcp_winit 3617c478bd9Sstevel@tonic-gate }; 3627c478bd9Sstevel@tonic-gate 3637c478bd9Sstevel@tonic-gate /* 3647c478bd9Sstevel@tonic-gate * Following assumes TPI alignment requirements stay along 32 bit 3657c478bd9Sstevel@tonic-gate * boundaries 3667c478bd9Sstevel@tonic-gate */ 3677c478bd9Sstevel@tonic-gate #define ROUNDUP32(x) \ 3687c478bd9Sstevel@tonic-gate (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1)) 
3697c478bd9Sstevel@tonic-gate 3707c478bd9Sstevel@tonic-gate /* Template for response to info request. */ 371721fffe3SKacheong Poon struct T_info_ack tcp_g_t_info_ack = { 3727c478bd9Sstevel@tonic-gate T_INFO_ACK, /* PRIM_type */ 3737c478bd9Sstevel@tonic-gate 0, /* TSDU_size */ 3747c478bd9Sstevel@tonic-gate T_INFINITE, /* ETSDU_size */ 3757c478bd9Sstevel@tonic-gate T_INVALID, /* CDATA_size */ 3767c478bd9Sstevel@tonic-gate T_INVALID, /* DDATA_size */ 3777c478bd9Sstevel@tonic-gate sizeof (sin_t), /* ADDR_size */ 3787c478bd9Sstevel@tonic-gate 0, /* OPT_size - not initialized here */ 3797c478bd9Sstevel@tonic-gate TIDUSZ, /* TIDU_size */ 3807c478bd9Sstevel@tonic-gate T_COTS_ORD, /* SERV_type */ 3817c478bd9Sstevel@tonic-gate TCPS_IDLE, /* CURRENT_state */ 3827c478bd9Sstevel@tonic-gate (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 3837c478bd9Sstevel@tonic-gate }; 3847c478bd9Sstevel@tonic-gate 385721fffe3SKacheong Poon struct T_info_ack tcp_g_t_info_ack_v6 = { 3867c478bd9Sstevel@tonic-gate T_INFO_ACK, /* PRIM_type */ 3877c478bd9Sstevel@tonic-gate 0, /* TSDU_size */ 3887c478bd9Sstevel@tonic-gate T_INFINITE, /* ETSDU_size */ 3897c478bd9Sstevel@tonic-gate T_INVALID, /* CDATA_size */ 3907c478bd9Sstevel@tonic-gate T_INVALID, /* DDATA_size */ 3917c478bd9Sstevel@tonic-gate sizeof (sin6_t), /* ADDR_size */ 3927c478bd9Sstevel@tonic-gate 0, /* OPT_size - not initialized here */ 3937c478bd9Sstevel@tonic-gate TIDUSZ, /* TIDU_size */ 3947c478bd9Sstevel@tonic-gate T_COTS_ORD, /* SERV_type */ 3957c478bd9Sstevel@tonic-gate TCPS_IDLE, /* CURRENT_state */ 3967c478bd9Sstevel@tonic-gate (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 3977c478bd9Sstevel@tonic-gate }; 3987c478bd9Sstevel@tonic-gate 3997c478bd9Sstevel@tonic-gate /* 4006e91bba0SGirish Moodalbail * TCP tunables related declarations. 
Definitions are in tcp_tunables.c 4017c478bd9Sstevel@tonic-gate */ 4026e91bba0SGirish Moodalbail extern mod_prop_info_t tcp_propinfo_tbl[]; 4036e91bba0SGirish Moodalbail extern int tcp_propinfo_count; 4047c478bd9Sstevel@tonic-gate 4057c478bd9Sstevel@tonic-gate #define IS_VMLOANED_MBLK(mp) \ 4067c478bd9Sstevel@tonic-gate (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 4077c478bd9Sstevel@tonic-gate 4087c478bd9Sstevel@tonic-gate uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 4097c478bd9Sstevel@tonic-gate 4107c478bd9Sstevel@tonic-gate /* 411f4b3ec61Sdh155122 * Forces all connections to obey the value of the tcps_maxpsz_multiplier 4127c478bd9Sstevel@tonic-gate * tunable settable via NDD. Otherwise, the per-connection behavior is 413bd670b35SErik Nordmark * determined dynamically during tcp_set_destination(), which is the default. 4147c478bd9Sstevel@tonic-gate */ 4157c478bd9Sstevel@tonic-gate boolean_t tcp_static_maxpsz = B_FALSE; 4167c478bd9Sstevel@tonic-gate 4177c478bd9Sstevel@tonic-gate /* 418721fffe3SKacheong Poon * If the receive buffer size is changed, this function is called to update 419721fffe3SKacheong Poon * the upper socket layer on the new delayed receive wake up threshold. 
420866ba9ddSjprakash */ 42179c0745dSRao Shoaib static void 42279c0745dSRao Shoaib tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) 42379c0745dSRao Shoaib { 42479c0745dSRao Shoaib uint32_t default_threshold = SOCKET_RECVHIWATER >> 3; 42579c0745dSRao Shoaib 42679c0745dSRao Shoaib if (IPCL_IS_NONSTR(tcp->tcp_connp)) { 42779c0745dSRao Shoaib conn_t *connp = tcp->tcp_connp; 42879c0745dSRao Shoaib struct sock_proto_props sopp; 42979c0745dSRao Shoaib 43079c0745dSRao Shoaib /* 43179c0745dSRao Shoaib * only increase rcvthresh upto default_threshold 43279c0745dSRao Shoaib */ 43379c0745dSRao Shoaib if (new_rcvthresh > default_threshold) 43479c0745dSRao Shoaib new_rcvthresh = default_threshold; 43579c0745dSRao Shoaib 43679c0745dSRao Shoaib sopp.sopp_flags = SOCKOPT_RCVTHRESH; 43779c0745dSRao Shoaib sopp.sopp_rcvthresh = new_rcvthresh; 43879c0745dSRao Shoaib 43979c0745dSRao Shoaib (*connp->conn_upcalls->su_set_proto_props) 44079c0745dSRao Shoaib (connp->conn_upper_handle, &sopp); 44179c0745dSRao Shoaib } 44279c0745dSRao Shoaib } 443721fffe3SKacheong Poon 4447c478bd9Sstevel@tonic-gate /* 4457c478bd9Sstevel@tonic-gate * Figure out the value of window scale opton. Note that the rwnd is 4467c478bd9Sstevel@tonic-gate * ASSUMED to be rounded up to the nearest MSS before the calculation. 4477c478bd9Sstevel@tonic-gate * We cannot find the scale value and then do a round up of tcp_rwnd 4487c478bd9Sstevel@tonic-gate * because the scale value may not be correct after that. 4497c478bd9Sstevel@tonic-gate * 4507c478bd9Sstevel@tonic-gate * Set the compiler flag to make this function inline. 
4517c478bd9Sstevel@tonic-gate */ 452721fffe3SKacheong Poon void 4537c478bd9Sstevel@tonic-gate tcp_set_ws_value(tcp_t *tcp) 4547c478bd9Sstevel@tonic-gate { 4557c478bd9Sstevel@tonic-gate int i; 4567c478bd9Sstevel@tonic-gate uint32_t rwnd = tcp->tcp_rwnd; 4577c478bd9Sstevel@tonic-gate 4587c478bd9Sstevel@tonic-gate for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 4597c478bd9Sstevel@tonic-gate i++, rwnd >>= 1) 4607c478bd9Sstevel@tonic-gate ; 4617c478bd9Sstevel@tonic-gate tcp->tcp_rcv_ws = i; 4627c478bd9Sstevel@tonic-gate } 4637c478bd9Sstevel@tonic-gate 4647c478bd9Sstevel@tonic-gate /* 465f4b3ec61Sdh155122 * Remove cached/latched IPsec references. 466f4b3ec61Sdh155122 */ 467f4b3ec61Sdh155122 void 468f4b3ec61Sdh155122 tcp_ipsec_cleanup(tcp_t *tcp) 469f4b3ec61Sdh155122 { 470f4b3ec61Sdh155122 conn_t *connp = tcp->tcp_connp; 471f4b3ec61Sdh155122 472fc80c0dfSnordmark ASSERT(connp->conn_flags & IPCL_TCPCONN); 473fc80c0dfSnordmark 474f4b3ec61Sdh155122 if (connp->conn_latch != NULL) { 475bd670b35SErik Nordmark IPLATCH_REFRELE(connp->conn_latch); 476f4b3ec61Sdh155122 connp->conn_latch = NULL; 477f4b3ec61Sdh155122 } 478bd670b35SErik Nordmark if (connp->conn_latch_in_policy != NULL) { 479bd670b35SErik Nordmark IPPOL_REFRELE(connp->conn_latch_in_policy); 480bd670b35SErik Nordmark connp->conn_latch_in_policy = NULL; 481bd670b35SErik Nordmark } 482bd670b35SErik Nordmark if (connp->conn_latch_in_action != NULL) { 483bd670b35SErik Nordmark IPACT_REFRELE(connp->conn_latch_in_action); 484bd670b35SErik Nordmark connp->conn_latch_in_action = NULL; 485bd670b35SErik Nordmark } 486f4b3ec61Sdh155122 if (connp->conn_policy != NULL) { 487f4b3ec61Sdh155122 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 488f4b3ec61Sdh155122 connp->conn_policy = NULL; 489f4b3ec61Sdh155122 } 490f4b3ec61Sdh155122 } 491f4b3ec61Sdh155122 492f4b3ec61Sdh155122 /* 493f4b3ec61Sdh155122 * Cleaup before placing on free list. 
494f4b3ec61Sdh155122 * Disassociate from the netstack/tcp_stack_t since the freelist 495f4b3ec61Sdh155122 * is per squeue and not per netstack. 496f4b3ec61Sdh155122 */ 4977c478bd9Sstevel@tonic-gate void 4987c478bd9Sstevel@tonic-gate tcp_cleanup(tcp_t *tcp) 4997c478bd9Sstevel@tonic-gate { 5007c478bd9Sstevel@tonic-gate mblk_t *mp; 5017c478bd9Sstevel@tonic-gate conn_t *connp = tcp->tcp_connp; 502f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 503f4b3ec61Sdh155122 netstack_t *ns = tcps->tcps_netstack; 504f7f8e53dSKacheong Poon mblk_t *tcp_rsrv_mp; 5057c478bd9Sstevel@tonic-gate 5067c478bd9Sstevel@tonic-gate tcp_bind_hash_remove(tcp); 507f4b3ec61Sdh155122 508f4b3ec61Sdh155122 /* Cleanup that which needs the netstack first */ 509f4b3ec61Sdh155122 tcp_ipsec_cleanup(tcp); 510bd670b35SErik Nordmark ixa_cleanup(connp->conn_ixa); 511bd670b35SErik Nordmark 512bd670b35SErik Nordmark if (connp->conn_ht_iphc != NULL) { 513bd670b35SErik Nordmark kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 514bd670b35SErik Nordmark connp->conn_ht_iphc = NULL; 515bd670b35SErik Nordmark connp->conn_ht_iphc_allocated = 0; 516bd670b35SErik Nordmark connp->conn_ht_iphc_len = 0; 517bd670b35SErik Nordmark connp->conn_ht_ulp = NULL; 518bd670b35SErik Nordmark connp->conn_ht_ulp_len = 0; 519bd670b35SErik Nordmark tcp->tcp_ipha = NULL; 520bd670b35SErik Nordmark tcp->tcp_ip6h = NULL; 521bd670b35SErik Nordmark tcp->tcp_tcpha = NULL; 522bd670b35SErik Nordmark } 523bd670b35SErik Nordmark 524bd670b35SErik Nordmark /* We clear any IP_OPTIONS and extension headers */ 525bd670b35SErik Nordmark ip_pkt_free(&connp->conn_xmit_ipp); 526f4b3ec61Sdh155122 5277c478bd9Sstevel@tonic-gate tcp_free(tcp); 5287c478bd9Sstevel@tonic-gate 5297c478bd9Sstevel@tonic-gate /* 5307c478bd9Sstevel@tonic-gate * Since we will bzero the entire structure, we need to 5317c478bd9Sstevel@tonic-gate * remove it and reinsert it in global hash list. 
We 5327c478bd9Sstevel@tonic-gate * know the walkers can't get to this conn because we 5337c478bd9Sstevel@tonic-gate * had set CONDEMNED flag earlier and checked reference 5347c478bd9Sstevel@tonic-gate * under conn_lock so walker won't pick it and when we 5357c478bd9Sstevel@tonic-gate * go the ipcl_globalhash_remove() below, no walker 5367c478bd9Sstevel@tonic-gate * can get to it. 5377c478bd9Sstevel@tonic-gate */ 5387c478bd9Sstevel@tonic-gate ipcl_globalhash_remove(connp); 5397c478bd9Sstevel@tonic-gate 5407c478bd9Sstevel@tonic-gate /* Save some state */ 5417c478bd9Sstevel@tonic-gate mp = tcp->tcp_timercache; 5427c478bd9Sstevel@tonic-gate 543f7f8e53dSKacheong Poon tcp_rsrv_mp = tcp->tcp_rsrv_mp; 5447c478bd9Sstevel@tonic-gate 545fc80c0dfSnordmark if (connp->conn_cred != NULL) { 54645916cd2Sjpk crfree(connp->conn_cred); 547fc80c0dfSnordmark connp->conn_cred = NULL; 548fc80c0dfSnordmark } 549fc80c0dfSnordmark ipcl_conn_cleanup(connp); 550fc80c0dfSnordmark connp->conn_flags = IPCL_TCPCONN; 551bd670b35SErik Nordmark 552bd670b35SErik Nordmark /* 553bd670b35SErik Nordmark * Now it is safe to decrement the reference counts. 554bd670b35SErik Nordmark * This might be the last reference on the netstack 555bd670b35SErik Nordmark * in which case it will cause the freeing of the IP Instance. 
556bd670b35SErik Nordmark */ 557bd670b35SErik Nordmark connp->conn_netstack = NULL; 558bd670b35SErik Nordmark connp->conn_ixa->ixa_ipst = NULL; 559bd670b35SErik Nordmark netstack_rele(ns); 560bd670b35SErik Nordmark ASSERT(tcps != NULL); 561bd670b35SErik Nordmark tcp->tcp_tcps = NULL; 562bd670b35SErik Nordmark 5637c478bd9Sstevel@tonic-gate bzero(tcp, sizeof (tcp_t)); 5647c478bd9Sstevel@tonic-gate 5657c478bd9Sstevel@tonic-gate /* restore the state */ 5667c478bd9Sstevel@tonic-gate tcp->tcp_timercache = mp; 5677c478bd9Sstevel@tonic-gate 568f7f8e53dSKacheong Poon tcp->tcp_rsrv_mp = tcp_rsrv_mp; 5697c478bd9Sstevel@tonic-gate 5707c478bd9Sstevel@tonic-gate tcp->tcp_connp = connp; 5717c478bd9Sstevel@tonic-gate 572fc80c0dfSnordmark ASSERT(connp->conn_tcp == tcp); 573fc80c0dfSnordmark ASSERT(connp->conn_flags & IPCL_TCPCONN); 5747c478bd9Sstevel@tonic-gate connp->conn_state_flags = CONN_INCIPIENT; 575bd670b35SErik Nordmark ASSERT(connp->conn_proto == IPPROTO_TCP); 576fc80c0dfSnordmark ASSERT(connp->conn_ref == 1); 5777c478bd9Sstevel@tonic-gate } 5787c478bd9Sstevel@tonic-gate 5797c478bd9Sstevel@tonic-gate /* 5807c478bd9Sstevel@tonic-gate * Adapt to the information, such as rtt and rtt_sd, provided from the 581bd670b35SErik Nordmark * DCE and IRE maintained by IP. 5827c478bd9Sstevel@tonic-gate * 5837c478bd9Sstevel@tonic-gate * Checks for multicast and broadcast destination address. 584bd670b35SErik Nordmark * Returns zero if ok; an errno on failure. 5857c478bd9Sstevel@tonic-gate * 5867c478bd9Sstevel@tonic-gate * Note that the MSS calculation here is based on the info given in 587bd670b35SErik Nordmark * the DCE and IRE. We do not do any calculation based on TCP options. They 588bd670b35SErik Nordmark * will be handled in tcp_input_data() when TCP knows which options to use. 5897c478bd9Sstevel@tonic-gate * 5907c478bd9Sstevel@tonic-gate * Note on how TCP gets its parameters for a connection. 
5917c478bd9Sstevel@tonic-gate * 5927c478bd9Sstevel@tonic-gate * When a tcp_t structure is allocated, it gets all the default parameters. 593bd670b35SErik Nordmark * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd, 5947c478bd9Sstevel@tonic-gate * spipe, rpipe, ... from the route metrics. Route metric overrides the 595fab254e2SAruna Ramakrishna * default. 5967c478bd9Sstevel@tonic-gate * 597bd670b35SErik Nordmark * An incoming SYN with a multicast or broadcast destination address is dropped 598bd670b35SErik Nordmark * in ip_fanout_v4/v6. 5997c478bd9Sstevel@tonic-gate * 6007c478bd9Sstevel@tonic-gate * An incoming SYN with a multicast or broadcast source address is always 601bd670b35SErik Nordmark * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in 602bd670b35SErik Nordmark * conn_connect. 603bd670b35SErik Nordmark * The same logic in tcp_set_destination also serves to 6047c478bd9Sstevel@tonic-gate * reject an attempt to connect to a broadcast or multicast (destination) 6057c478bd9Sstevel@tonic-gate * address. 6067c478bd9Sstevel@tonic-gate */ 607721fffe3SKacheong Poon int 608bd670b35SErik Nordmark tcp_set_destination(tcp_t *tcp) 6097c478bd9Sstevel@tonic-gate { 6107c478bd9Sstevel@tonic-gate uint32_t mss_max; 6117c478bd9Sstevel@tonic-gate uint32_t mss; 6127c478bd9Sstevel@tonic-gate boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 6137c478bd9Sstevel@tonic-gate conn_t *connp = tcp->tcp_connp; 614f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 615bd670b35SErik Nordmark iulp_t uinfo; 616bd670b35SErik Nordmark int error; 617bd670b35SErik Nordmark uint32_t flags; 6187c478bd9Sstevel@tonic-gate 619bd670b35SErik Nordmark flags = IPDF_LSO | IPDF_ZCOPY; 62043d18f1cSpriyanka /* 621bd670b35SErik Nordmark * Make sure we have a dce for the destination to avoid dce_ident 622bd670b35SErik Nordmark * contention for connected sockets. 
62343d18f1cSpriyanka */ 624bd670b35SErik Nordmark flags |= IPDF_UNIQUE_DCE; 6257c478bd9Sstevel@tonic-gate 626bd670b35SErik Nordmark if (!tcps->tcps_ignore_path_mtu) 627bd670b35SErik Nordmark connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 6287c478bd9Sstevel@tonic-gate 629bd670b35SErik Nordmark /* Use conn_lock to satify ASSERT; tcp is already serialized */ 630bd670b35SErik Nordmark mutex_enter(&connp->conn_lock); 631bd670b35SErik Nordmark error = conn_connect(connp, &uinfo, flags); 632bd670b35SErik Nordmark mutex_exit(&connp->conn_lock); 633bd670b35SErik Nordmark if (error != 0) 634bd670b35SErik Nordmark return (error); 6357c478bd9Sstevel@tonic-gate 636bd670b35SErik Nordmark error = tcp_build_hdrs(tcp); 637bd670b35SErik Nordmark if (error != 0) 638bd670b35SErik Nordmark return (error); 6397c478bd9Sstevel@tonic-gate 640bd670b35SErik Nordmark tcp->tcp_localnet = uinfo.iulp_localnet; 6417c478bd9Sstevel@tonic-gate 642bd670b35SErik Nordmark if (uinfo.iulp_rtt != 0) { 6437c478bd9Sstevel@tonic-gate clock_t rto; 6447c478bd9Sstevel@tonic-gate 645bd670b35SErik Nordmark tcp->tcp_rtt_sa = uinfo.iulp_rtt; 646bd670b35SErik Nordmark tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; 6477c478bd9Sstevel@tonic-gate rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 648f4b3ec61Sdh155122 tcps->tcps_rexmit_interval_extra + 649f4b3ec61Sdh155122 (tcp->tcp_rtt_sa >> 5); 6507c478bd9Sstevel@tonic-gate 651707e74bcSKacheong Poon TCP_SET_RTO(tcp, rto); 6527c478bd9Sstevel@tonic-gate } 653bd670b35SErik Nordmark if (uinfo.iulp_ssthresh != 0) 654bd670b35SErik Nordmark tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; 6557c478bd9Sstevel@tonic-gate else 6567c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 657bd670b35SErik Nordmark if (uinfo.iulp_spipe > 0) { 658bd670b35SErik Nordmark connp->conn_sndbuf = MIN(uinfo.iulp_spipe, 659f4b3ec61Sdh155122 tcps->tcps_max_buf); 660bd670b35SErik Nordmark if (tcps->tcps_snd_lowat_fraction != 0) { 661bd670b35SErik Nordmark connp->conn_sndlowat = 
connp->conn_sndbuf / 662f4b3ec61Sdh155122 tcps->tcps_snd_lowat_fraction; 663bd670b35SErik Nordmark } 6647c478bd9Sstevel@tonic-gate (void) tcp_maxpsz_set(tcp, B_TRUE); 6657c478bd9Sstevel@tonic-gate } 6667c478bd9Sstevel@tonic-gate /* 6677c478bd9Sstevel@tonic-gate * Note that up till now, acceptor always inherits receive 66843d18f1cSpriyanka * window from the listener. But if there is a metrics 66943d18f1cSpriyanka * associated with a host, we should use that instead of 67043d18f1cSpriyanka * inheriting it from listener. Thus we need to pass this 67143d18f1cSpriyanka * info back to the caller. 6727c478bd9Sstevel@tonic-gate */ 673bd670b35SErik Nordmark if (uinfo.iulp_rpipe > 0) { 674bd670b35SErik Nordmark tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe, 675f4b3ec61Sdh155122 tcps->tcps_max_buf); 6767c478bd9Sstevel@tonic-gate } 67743d18f1cSpriyanka 678bd670b35SErik Nordmark if (uinfo.iulp_rtomax > 0) { 67943d18f1cSpriyanka tcp->tcp_second_timer_threshold = 680bd670b35SErik Nordmark uinfo.iulp_rtomax; 6817c478bd9Sstevel@tonic-gate } 6827c478bd9Sstevel@tonic-gate 6837c478bd9Sstevel@tonic-gate /* 68443d18f1cSpriyanka * Use the metric option settings, iulp_tstamp_ok and 68543d18f1cSpriyanka * iulp_wscale_ok, only for active open. What this means 68643d18f1cSpriyanka * is that if the other side uses timestamp or window 68743d18f1cSpriyanka * scale option, TCP will also use those options. That 68843d18f1cSpriyanka * is for passive open. If the application sets a 68943d18f1cSpriyanka * large window, window scale is enabled regardless of 69043d18f1cSpriyanka * the value in iulp_wscale_ok. This is the behavior 69143d18f1cSpriyanka * since 2.6. So we keep it. 69243d18f1cSpriyanka * The only case left in passive open processing is the 69343d18f1cSpriyanka * check for SACK. 69443d18f1cSpriyanka * For ECN, it should probably be like SACK. But the 69543d18f1cSpriyanka * current value is binary, so we treat it like the other 69643d18f1cSpriyanka * cases. 
The metric only controls active open.For passive 69743d18f1cSpriyanka * open, the ndd param, tcp_ecn_permitted, controls the 69843d18f1cSpriyanka * behavior. 6997c478bd9Sstevel@tonic-gate */ 7007c478bd9Sstevel@tonic-gate if (!tcp_detached) { 7017c478bd9Sstevel@tonic-gate /* 70243d18f1cSpriyanka * The if check means that the following can only 70343d18f1cSpriyanka * be turned on by the metrics only IRE, but not off. 7047c478bd9Sstevel@tonic-gate */ 705bd670b35SErik Nordmark if (uinfo.iulp_tstamp_ok) 7067c478bd9Sstevel@tonic-gate tcp->tcp_snd_ts_ok = B_TRUE; 707bd670b35SErik Nordmark if (uinfo.iulp_wscale_ok) 7087c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws_ok = B_TRUE; 709bd670b35SErik Nordmark if (uinfo.iulp_sack == 2) 7107c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_TRUE; 711bd670b35SErik Nordmark if (uinfo.iulp_ecn_ok) 7127c478bd9Sstevel@tonic-gate tcp->tcp_ecn_ok = B_TRUE; 7137c478bd9Sstevel@tonic-gate } else { 7147c478bd9Sstevel@tonic-gate /* 7157c478bd9Sstevel@tonic-gate * Passive open. 7167c478bd9Sstevel@tonic-gate * 7177c478bd9Sstevel@tonic-gate * As above, the if check means that SACK can only be 7187c478bd9Sstevel@tonic-gate * turned on by the metric only IRE. 7197c478bd9Sstevel@tonic-gate */ 720bd670b35SErik Nordmark if (uinfo.iulp_sack > 0) { 7217c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_TRUE; 7227c478bd9Sstevel@tonic-gate } 7237c478bd9Sstevel@tonic-gate } 7247c478bd9Sstevel@tonic-gate 7257c478bd9Sstevel@tonic-gate /* 726bd670b35SErik Nordmark * XXX Note that currently, iulp_mtu can be as small as 68 7277c478bd9Sstevel@tonic-gate * because of PMTUd. So tcp_mss may go to negative if combined 7287c478bd9Sstevel@tonic-gate * length of all those options exceeds 28 bytes. But because 7297c478bd9Sstevel@tonic-gate * of the tcp_mss_min check below, we may not have a problem if 7307c478bd9Sstevel@tonic-gate * tcp_mss_min is of a reasonable value. The default is 1 so 7317c478bd9Sstevel@tonic-gate * the negative problem still exists. 
And the check defeats PMTUd. 7327c478bd9Sstevel@tonic-gate * In fact, if PMTUd finds that the MSS should be smaller than 7337c478bd9Sstevel@tonic-gate * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 7347c478bd9Sstevel@tonic-gate * value. 7357c478bd9Sstevel@tonic-gate * 7367c478bd9Sstevel@tonic-gate * We do not deal with that now. All those problems related to 7377c478bd9Sstevel@tonic-gate * PMTUd will be fixed later. 7387c478bd9Sstevel@tonic-gate */ 739bd670b35SErik Nordmark ASSERT(uinfo.iulp_mtu != 0); 740bd670b35SErik Nordmark mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu; 7417c478bd9Sstevel@tonic-gate 7427c478bd9Sstevel@tonic-gate /* Sanity check for MSS value. */ 743bd670b35SErik Nordmark if (connp->conn_ipversion == IPV4_VERSION) 744f4b3ec61Sdh155122 mss_max = tcps->tcps_mss_max_ipv4; 7457c478bd9Sstevel@tonic-gate else 746f4b3ec61Sdh155122 mss_max = tcps->tcps_mss_max_ipv6; 7477c478bd9Sstevel@tonic-gate 7487c478bd9Sstevel@tonic-gate if (tcp->tcp_ipsec_overhead == 0) 7497c478bd9Sstevel@tonic-gate tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 7507c478bd9Sstevel@tonic-gate 7517c478bd9Sstevel@tonic-gate mss -= tcp->tcp_ipsec_overhead; 7527c478bd9Sstevel@tonic-gate 753f4b3ec61Sdh155122 if (mss < tcps->tcps_mss_min) 754f4b3ec61Sdh155122 mss = tcps->tcps_mss_min; 7557c478bd9Sstevel@tonic-gate if (mss > mss_max) 7567c478bd9Sstevel@tonic-gate mss = mss_max; 7577c478bd9Sstevel@tonic-gate 7587c478bd9Sstevel@tonic-gate /* Note that this is the maximum MSS, excluding all options. */ 7597c478bd9Sstevel@tonic-gate tcp->tcp_mss = mss; 7607c478bd9Sstevel@tonic-gate 7617c478bd9Sstevel@tonic-gate /* 762bd670b35SErik Nordmark * Update the tcp connection with LSO capability. 763bd670b35SErik Nordmark */ 764bd670b35SErik Nordmark tcp_update_lso(tcp, connp->conn_ixa); 765bd670b35SErik Nordmark 766bd670b35SErik Nordmark /* 7677c478bd9Sstevel@tonic-gate * Initialize the ISS here now that we have the full connection ID. 
7687c478bd9Sstevel@tonic-gate * The RFC 1948 method of initial sequence number generation requires 7697c478bd9Sstevel@tonic-gate * knowledge of the full connection ID before setting the ISS. 7707c478bd9Sstevel@tonic-gate */ 7717c478bd9Sstevel@tonic-gate tcp_iss_init(tcp); 7727c478bd9Sstevel@tonic-gate 773bd670b35SErik Nordmark tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local); 7747c478bd9Sstevel@tonic-gate 7757c478bd9Sstevel@tonic-gate /* 7767c478bd9Sstevel@tonic-gate * Make sure that conn is not marked incipient 7777c478bd9Sstevel@tonic-gate * for incoming connections. A blind 7787c478bd9Sstevel@tonic-gate * removal of incipient flag is cheaper than 7797c478bd9Sstevel@tonic-gate * check and removal. 7807c478bd9Sstevel@tonic-gate */ 781bd670b35SErik Nordmark mutex_enter(&connp->conn_lock); 7827c478bd9Sstevel@tonic-gate connp->conn_state_flags &= ~CONN_INCIPIENT; 7837c478bd9Sstevel@tonic-gate mutex_exit(&connp->conn_lock); 7847c478bd9Sstevel@tonic-gate return (0); 7857c478bd9Sstevel@tonic-gate } 7867c478bd9Sstevel@tonic-gate 7877c478bd9Sstevel@tonic-gate /* 788866ba9ddSjprakash * tcp_clean_death / tcp_close_detached must not be called more than once 789866ba9ddSjprakash * on a tcp. Thus every function that potentially calls tcp_clean_death 790866ba9ddSjprakash * must check for the tcp state before calling tcp_clean_death. 791bd670b35SErik Nordmark * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper, 792866ba9ddSjprakash * tcp_timer_handler, all check for the tcp state. 
793866ba9ddSjprakash */ 794866ba9ddSjprakash /* ARGSUSED */ 795866ba9ddSjprakash void 796bd670b35SErik Nordmark tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, 797bd670b35SErik Nordmark ip_recv_attr_t *dummy) 798866ba9ddSjprakash { 799866ba9ddSjprakash tcp_t *tcp = ((conn_t *)arg)->conn_tcp; 800866ba9ddSjprakash 801866ba9ddSjprakash freemsg(mp); 802866ba9ddSjprakash if (tcp->tcp_state > TCPS_BOUND) 803721fffe3SKacheong Poon (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, ETIMEDOUT); 804866ba9ddSjprakash } 805866ba9ddSjprakash 806866ba9ddSjprakash /* 8077c478bd9Sstevel@tonic-gate * We are dying for some reason. Try to do it gracefully. (May be called 8087c478bd9Sstevel@tonic-gate * as writer.) 8097c478bd9Sstevel@tonic-gate * 8107c478bd9Sstevel@tonic-gate * Return -1 if the structure was not cleaned up (if the cleanup had to be 8117c478bd9Sstevel@tonic-gate * done by a service procedure). 8127c478bd9Sstevel@tonic-gate * TBD - Should the return value distinguish between the tcp_t being 8137c478bd9Sstevel@tonic-gate * freed and it being reinitialized? 
8147c478bd9Sstevel@tonic-gate */ 815721fffe3SKacheong Poon int 816721fffe3SKacheong Poon tcp_clean_death(tcp_t *tcp, int err) 8177c478bd9Sstevel@tonic-gate { 8187c478bd9Sstevel@tonic-gate mblk_t *mp; 8197c478bd9Sstevel@tonic-gate queue_t *q; 8200f1702c5SYu Xiangning conn_t *connp = tcp->tcp_connp; 821f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 8227c478bd9Sstevel@tonic-gate 823d0ab37afSethindra if (tcp->tcp_fused) 824d0ab37afSethindra tcp_unfuse(tcp); 825d0ab37afSethindra 8267c478bd9Sstevel@tonic-gate if (tcp->tcp_linger_tid != 0 && 8277c478bd9Sstevel@tonic-gate TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 8287c478bd9Sstevel@tonic-gate tcp_stop_lingering(tcp); 8297c478bd9Sstevel@tonic-gate } 8307c478bd9Sstevel@tonic-gate 8317c478bd9Sstevel@tonic-gate ASSERT(tcp != NULL); 832bd670b35SErik Nordmark ASSERT((connp->conn_family == AF_INET && 833bd670b35SErik Nordmark connp->conn_ipversion == IPV4_VERSION) || 834bd670b35SErik Nordmark (connp->conn_family == AF_INET6 && 835bd670b35SErik Nordmark (connp->conn_ipversion == IPV4_VERSION || 836bd670b35SErik Nordmark connp->conn_ipversion == IPV6_VERSION))); 8377c478bd9Sstevel@tonic-gate 8387c478bd9Sstevel@tonic-gate if (TCP_IS_DETACHED(tcp)) { 8397c478bd9Sstevel@tonic-gate if (tcp->tcp_hard_binding) { 8407c478bd9Sstevel@tonic-gate /* 8417c478bd9Sstevel@tonic-gate * Its an eager that we are dealing with. We close the 8427c478bd9Sstevel@tonic-gate * eager but in case a conn_ind has already gone to the 8437c478bd9Sstevel@tonic-gate * listener, let tcp_accept_finish() send a discon_ind 8447c478bd9Sstevel@tonic-gate * to the listener and drop the last reference. If the 8457c478bd9Sstevel@tonic-gate * listener doesn't even know about the eager i.e. the 8467c478bd9Sstevel@tonic-gate * conn_ind hasn't gone up, blow away the eager and drop 8477c478bd9Sstevel@tonic-gate * the last reference as well. If the conn_ind has gone 8487c478bd9Sstevel@tonic-gate * up, state should be BOUND. 
tcp_accept_finish 8497c478bd9Sstevel@tonic-gate * will figure out that the connection has received a 8507c478bd9Sstevel@tonic-gate * RST and will send a DISCON_IND to the application. 8517c478bd9Sstevel@tonic-gate */ 8527c478bd9Sstevel@tonic-gate tcp_closei_local(tcp); 853866ba9ddSjprakash if (!tcp->tcp_tconnind_started) { 8540f1702c5SYu Xiangning CONN_DEC_REF(connp); 8557c478bd9Sstevel@tonic-gate } else { 8567c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_BOUND; 8579cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, 8589cd928feSAlan Maguire ip_xmit_attr_t *, connp->conn_ixa, 8599cd928feSAlan Maguire void, NULL, tcp_t *, tcp, void, NULL, 860eb239265SAnders Persson int32_t, TCPS_CLOSED); 8617c478bd9Sstevel@tonic-gate } 8627c478bd9Sstevel@tonic-gate } else { 8637c478bd9Sstevel@tonic-gate tcp_close_detached(tcp); 8647c478bd9Sstevel@tonic-gate } 8657c478bd9Sstevel@tonic-gate return (0); 8667c478bd9Sstevel@tonic-gate } 8677c478bd9Sstevel@tonic-gate 868f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_clean_death_nondetached); 8697c478bd9Sstevel@tonic-gate 87093fcb0b9SKacheong Poon /* 87193fcb0b9SKacheong Poon * The connection is dead. Decrement listener connection counter if 87293fcb0b9SKacheong Poon * necessary. 87393fcb0b9SKacheong Poon */ 87493fcb0b9SKacheong Poon if (tcp->tcp_listen_cnt != NULL) 87593fcb0b9SKacheong Poon TCP_DECR_LISTEN_CNT(tcp); 87693fcb0b9SKacheong Poon 877721fffe3SKacheong Poon /* 878721fffe3SKacheong Poon * When a connection is moved to TIME_WAIT state, the connection 879721fffe3SKacheong Poon * counter is already decremented. So no need to decrement here 880721fffe3SKacheong Poon * again. See SET_TIME_WAIT() macro. 
881721fffe3SKacheong Poon */ 882721fffe3SKacheong Poon if (tcp->tcp_state >= TCPS_ESTABLISHED && 883721fffe3SKacheong Poon tcp->tcp_state < TCPS_TIME_WAIT) { 884721fffe3SKacheong Poon TCPS_CONN_DEC(tcps); 885721fffe3SKacheong Poon } 886721fffe3SKacheong Poon 887bd670b35SErik Nordmark q = connp->conn_rq; 8887c478bd9Sstevel@tonic-gate 8897c478bd9Sstevel@tonic-gate /* Trash all inbound data */ 8900f1702c5SYu Xiangning if (!IPCL_IS_NONSTR(connp)) { 8910f1702c5SYu Xiangning ASSERT(q != NULL); 8927c478bd9Sstevel@tonic-gate flushq(q, FLUSHALL); 8930f1702c5SYu Xiangning } 8947c478bd9Sstevel@tonic-gate 8957c478bd9Sstevel@tonic-gate /* 8967c478bd9Sstevel@tonic-gate * If we are at least part way open and there is error 8977c478bd9Sstevel@tonic-gate * (err==0 implies no error) 8987c478bd9Sstevel@tonic-gate * notify our client by a T_DISCON_IND. 8997c478bd9Sstevel@tonic-gate */ 9007c478bd9Sstevel@tonic-gate if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 9017c478bd9Sstevel@tonic-gate if (tcp->tcp_state >= TCPS_ESTABLISHED && 9027c478bd9Sstevel@tonic-gate !TCP_IS_SOCKET(tcp)) { 9037c478bd9Sstevel@tonic-gate /* 9047c478bd9Sstevel@tonic-gate * Send M_FLUSH according to TPI. Because sockets will 9057c478bd9Sstevel@tonic-gate * (and must) ignore FLUSHR we do that only for TPI 9067c478bd9Sstevel@tonic-gate * endpoints and sockets in STREAMS mode. 
9077c478bd9Sstevel@tonic-gate */ 9087c478bd9Sstevel@tonic-gate (void) putnextctl1(q, M_FLUSH, FLUSHR); 9097c478bd9Sstevel@tonic-gate } 910bd670b35SErik Nordmark if (connp->conn_debug) { 911ff550d0eSmasputra (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 9127c478bd9Sstevel@tonic-gate "tcp_clean_death: discon err %d", err); 9137c478bd9Sstevel@tonic-gate } 9140f1702c5SYu Xiangning if (IPCL_IS_NONSTR(connp)) { 9150f1702c5SYu Xiangning /* Direct socket, use upcall */ 9160f1702c5SYu Xiangning (*connp->conn_upcalls->su_disconnected)( 9170f1702c5SYu Xiangning connp->conn_upper_handle, tcp->tcp_connid, err); 9180f1702c5SYu Xiangning } else { 9197c478bd9Sstevel@tonic-gate mp = mi_tpi_discon_ind(NULL, err, 0); 9207c478bd9Sstevel@tonic-gate if (mp != NULL) { 9217c478bd9Sstevel@tonic-gate putnext(q, mp); 9227c478bd9Sstevel@tonic-gate } else { 923bd670b35SErik Nordmark if (connp->conn_debug) { 924ff550d0eSmasputra (void) strlog(TCP_MOD_ID, 0, 1, 9257c478bd9Sstevel@tonic-gate SL_ERROR|SL_TRACE, 9267c478bd9Sstevel@tonic-gate "tcp_clean_death, sending M_ERROR"); 9277c478bd9Sstevel@tonic-gate } 9287c478bd9Sstevel@tonic-gate (void) putnextctl1(q, M_ERROR, EPROTO); 9297c478bd9Sstevel@tonic-gate } 9300f1702c5SYu Xiangning } 9317c478bd9Sstevel@tonic-gate if (tcp->tcp_state <= TCPS_SYN_RCVD) { 9327c478bd9Sstevel@tonic-gate /* SYN_SENT or SYN_RCVD */ 933721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpAttemptFails); 9347c478bd9Sstevel@tonic-gate } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 9357c478bd9Sstevel@tonic-gate /* ESTABLISHED or CLOSE_WAIT */ 936721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpEstabResets); 9377c478bd9Sstevel@tonic-gate } 9387c478bd9Sstevel@tonic-gate } 9397c478bd9Sstevel@tonic-gate 9403e95bd4aSAnders Persson /* 9413e95bd4aSAnders Persson * ESTABLISHED non-STREAMS eagers are not 'detached' because 9423e95bd4aSAnders Persson * an upper handle is obtained when the SYN-ACK comes in. 
So it 9433e95bd4aSAnders Persson * should receive the 'disconnected' upcall, but tcp_reinit should 9443e95bd4aSAnders Persson * not be called since this is an eager. 9453e95bd4aSAnders Persson */ 9463e95bd4aSAnders Persson if (tcp->tcp_listener != NULL && IPCL_IS_NONSTR(connp)) { 9473e95bd4aSAnders Persson tcp_closei_local(tcp); 9483e95bd4aSAnders Persson tcp->tcp_state = TCPS_BOUND; 949eb239265SAnders Persson DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 950eb239265SAnders Persson connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 951eb239265SAnders Persson int32_t, TCPS_CLOSED); 9523e95bd4aSAnders Persson return (0); 9533e95bd4aSAnders Persson } 9543e95bd4aSAnders Persson 9557c478bd9Sstevel@tonic-gate tcp_reinit(tcp); 9560f1702c5SYu Xiangning if (IPCL_IS_NONSTR(connp)) 9570f1702c5SYu Xiangning (void) tcp_do_unbind(connp); 9580f1702c5SYu Xiangning 9597c478bd9Sstevel@tonic-gate return (-1); 9607c478bd9Sstevel@tonic-gate } 9617c478bd9Sstevel@tonic-gate 9627c478bd9Sstevel@tonic-gate /* 9637c478bd9Sstevel@tonic-gate * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 9647c478bd9Sstevel@tonic-gate * to expire, stop the wait and finish the close. 
9657c478bd9Sstevel@tonic-gate */ 966721fffe3SKacheong Poon void 9677c478bd9Sstevel@tonic-gate tcp_stop_lingering(tcp_t *tcp) 9687c478bd9Sstevel@tonic-gate { 9697c478bd9Sstevel@tonic-gate clock_t delta = 0; 970f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 971bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 9727c478bd9Sstevel@tonic-gate 9737c478bd9Sstevel@tonic-gate tcp->tcp_linger_tid = 0; 9747c478bd9Sstevel@tonic-gate if (tcp->tcp_state > TCPS_LISTEN) { 9757c478bd9Sstevel@tonic-gate tcp_acceptor_hash_remove(tcp); 976e0968231Svi117747 mutex_enter(&tcp->tcp_non_sq_lock); 9777c478bd9Sstevel@tonic-gate if (tcp->tcp_flow_stopped) { 9787c478bd9Sstevel@tonic-gate tcp_clrqfull(tcp); 9797c478bd9Sstevel@tonic-gate } 980e0968231Svi117747 mutex_exit(&tcp->tcp_non_sq_lock); 9817c478bd9Sstevel@tonic-gate 9827c478bd9Sstevel@tonic-gate if (tcp->tcp_timer_tid != 0) { 9837c478bd9Sstevel@tonic-gate delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 9847c478bd9Sstevel@tonic-gate tcp->tcp_timer_tid = 0; 9857c478bd9Sstevel@tonic-gate } 9867c478bd9Sstevel@tonic-gate /* 9877c478bd9Sstevel@tonic-gate * Need to cancel those timers which will not be used when 988bd670b35SErik Nordmark * TCP is detached. This has to be done before the conn_wq 989bd670b35SErik Nordmark * is cleared. 
9907c478bd9Sstevel@tonic-gate */ 9917c478bd9Sstevel@tonic-gate tcp_timers_stop(tcp); 9927c478bd9Sstevel@tonic-gate 9937c478bd9Sstevel@tonic-gate tcp->tcp_detached = B_TRUE; 994bd670b35SErik Nordmark connp->conn_rq = NULL; 995bd670b35SErik Nordmark connp->conn_wq = NULL; 9967c478bd9Sstevel@tonic-gate 9977c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_TIME_WAIT) { 9987c478bd9Sstevel@tonic-gate tcp_time_wait_append(tcp); 999f4b3ec61Sdh155122 TCP_DBGSTAT(tcps, tcp_detach_time_wait); 10007c478bd9Sstevel@tonic-gate goto finish; 10017c478bd9Sstevel@tonic-gate } 10027c478bd9Sstevel@tonic-gate 10037c478bd9Sstevel@tonic-gate /* 10047c478bd9Sstevel@tonic-gate * If delta is zero the timer event wasn't executed and was 10057c478bd9Sstevel@tonic-gate * successfully canceled. In this case we need to restart it 10067c478bd9Sstevel@tonic-gate * with the minimal delta possible. 10077c478bd9Sstevel@tonic-gate */ 10087c478bd9Sstevel@tonic-gate if (delta >= 0) { 10097c478bd9Sstevel@tonic-gate tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 10107c478bd9Sstevel@tonic-gate delta ? 
delta : 1); 10117c478bd9Sstevel@tonic-gate } 10127c478bd9Sstevel@tonic-gate } else { 10137c478bd9Sstevel@tonic-gate tcp_closei_local(tcp); 1014bd670b35SErik Nordmark CONN_DEC_REF(connp); 10157c478bd9Sstevel@tonic-gate } 10167c478bd9Sstevel@tonic-gate finish: 10177c478bd9Sstevel@tonic-gate tcp->tcp_detached = B_TRUE; 1018bd670b35SErik Nordmark connp->conn_rq = NULL; 1019bd670b35SErik Nordmark connp->conn_wq = NULL; 10200f1702c5SYu Xiangning 10213e95bd4aSAnders Persson /* Signal closing thread that it can complete close */ 10223e95bd4aSAnders Persson mutex_enter(&tcp->tcp_closelock); 10237c478bd9Sstevel@tonic-gate tcp->tcp_closed = 1; 10247c478bd9Sstevel@tonic-gate cv_signal(&tcp->tcp_closecv); 10257c478bd9Sstevel@tonic-gate mutex_exit(&tcp->tcp_closelock); 10263e95bd4aSAnders Persson 10273e95bd4aSAnders Persson /* If we have an upper handle (socket), release it */ 10283e95bd4aSAnders Persson if (IPCL_IS_NONSTR(connp)) { 10293e95bd4aSAnders Persson ASSERT(connp->conn_upper_handle != NULL); 10303e95bd4aSAnders Persson (*connp->conn_upcalls->su_closed)(connp->conn_upper_handle); 10313e95bd4aSAnders Persson connp->conn_upper_handle = NULL; 10323e95bd4aSAnders Persson connp->conn_upcalls = NULL; 10333e95bd4aSAnders Persson } 10347c478bd9Sstevel@tonic-gate } 10357c478bd9Sstevel@tonic-gate 1036721fffe3SKacheong Poon void 10370f1702c5SYu Xiangning tcp_close_common(conn_t *connp, int flags) 10387c478bd9Sstevel@tonic-gate { 10397c478bd9Sstevel@tonic-gate tcp_t *tcp = connp->conn_tcp; 10407c478bd9Sstevel@tonic-gate mblk_t *mp = &tcp->tcp_closemp; 10417c478bd9Sstevel@tonic-gate boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 104259031702Speterte mblk_t *bp; 10437c478bd9Sstevel@tonic-gate 10447c478bd9Sstevel@tonic-gate ASSERT(connp->conn_ref >= 2); 10457c478bd9Sstevel@tonic-gate 10467c478bd9Sstevel@tonic-gate /* 1047bd670b35SErik Nordmark * Mark the conn as closing. 
ipsq_pending_mp_add will not 10487c478bd9Sstevel@tonic-gate * add any mp to the pending mp list, after this conn has 1049bd670b35SErik Nordmark * started closing. 10507c478bd9Sstevel@tonic-gate */ 10517c478bd9Sstevel@tonic-gate mutex_enter(&connp->conn_lock); 10527c478bd9Sstevel@tonic-gate connp->conn_state_flags |= CONN_CLOSING; 10537c478bd9Sstevel@tonic-gate if (connp->conn_oper_pending_ill != NULL) 10547c478bd9Sstevel@tonic-gate conn_ioctl_cleanup_reqd = B_TRUE; 10557c478bd9Sstevel@tonic-gate CONN_INC_REF_LOCKED(connp); 10567c478bd9Sstevel@tonic-gate mutex_exit(&connp->conn_lock); 10577c478bd9Sstevel@tonic-gate tcp->tcp_closeflags = (uint8_t)flags; 10587c478bd9Sstevel@tonic-gate ASSERT(connp->conn_ref >= 3); 10597c478bd9Sstevel@tonic-gate 1060866ba9ddSjprakash /* 1061866ba9ddSjprakash * tcp_closemp_used is used below without any protection of a lock 1062866ba9ddSjprakash * as we don't expect any one else to use it concurrently at this 10630163a147Sjprakash * point otherwise it would be a major defect. 1064866ba9ddSjprakash */ 1065866ba9ddSjprakash 10660163a147Sjprakash if (mp->b_prev == NULL) 10670163a147Sjprakash tcp->tcp_closemp_used = B_TRUE; 10680163a147Sjprakash else 10690163a147Sjprakash cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: " 10700163a147Sjprakash "connp %p tcp %p\n", (void *)connp, (void *)tcp); 1071866ba9ddSjprakash 1072866ba9ddSjprakash TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1073866ba9ddSjprakash 1074838a4ffaSBrian Ruthven /* 1075838a4ffaSBrian Ruthven * Cleanup any queued ioctls here. This must be done before the wq/rq 1076838a4ffaSBrian Ruthven * are re-written by tcp_close_output(). 
1077838a4ffaSBrian Ruthven */ 1078838a4ffaSBrian Ruthven if (conn_ioctl_cleanup_reqd) 1079838a4ffaSBrian Ruthven conn_ioctl_cleanup(connp); 1080838a4ffaSBrian Ruthven 1081838a4ffaSBrian Ruthven /* 1082838a4ffaSBrian Ruthven * As CONN_CLOSING is set, no further ioctls should be passed down to 1083838a4ffaSBrian Ruthven * IP for this conn (see the guards in tcp_ioctl, tcp_wput_ioctl and 1084838a4ffaSBrian Ruthven * tcp_wput_iocdata). If the ioctl was queued on an ipsq, 1085838a4ffaSBrian Ruthven * conn_ioctl_cleanup should have found it and removed it. If the ioctl 1086838a4ffaSBrian Ruthven * was still in flight at the time, we wait for it here. See comments 1087838a4ffaSBrian Ruthven * for CONN_INC_IOCTLREF in ip.h for details. 1088838a4ffaSBrian Ruthven */ 1089838a4ffaSBrian Ruthven mutex_enter(&connp->conn_lock); 1090838a4ffaSBrian Ruthven while (connp->conn_ioctlref > 0) 1091838a4ffaSBrian Ruthven cv_wait(&connp->conn_cv, &connp->conn_lock); 1092838a4ffaSBrian Ruthven ASSERT(connp->conn_ioctlref == 0); 1093838a4ffaSBrian Ruthven ASSERT(connp->conn_oper_pending_ill == NULL); 1094838a4ffaSBrian Ruthven mutex_exit(&connp->conn_lock); 1095838a4ffaSBrian Ruthven 1096da14cebeSEric Cheng SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp, 1097bd670b35SErik Nordmark NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); 10987c478bd9Sstevel@tonic-gate 10993e95bd4aSAnders Persson /* 11003e95bd4aSAnders Persson * For non-STREAMS sockets, the normal case is that the conn makes 11013e95bd4aSAnders Persson * an upcall when it's finally closed, so there is no need to wait 11023e95bd4aSAnders Persson * in the protocol. But in case of SO_LINGER the thread sleeps here 11033e95bd4aSAnders Persson * so it can properly deal with the thread being interrupted. 
11043e95bd4aSAnders Persson */ 11053e95bd4aSAnders Persson if (IPCL_IS_NONSTR(connp) && connp->conn_linger == 0) 11063e95bd4aSAnders Persson goto nowait; 11073e95bd4aSAnders Persson 11087c478bd9Sstevel@tonic-gate mutex_enter(&tcp->tcp_closelock); 110959031702Speterte while (!tcp->tcp_closed) { 111059031702Speterte if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) { 111159031702Speterte /* 11126a43307cSja97890 * The cv_wait_sig() was interrupted. We now do the 11136a43307cSja97890 * following: 11146a43307cSja97890 * 11156a43307cSja97890 * 1) If the endpoint was lingering, we allow this 11166a43307cSja97890 * to be interrupted by cancelling the linger timeout 11176a43307cSja97890 * and closing normally. 11186a43307cSja97890 * 11196a43307cSja97890 * 2) Revert to calling cv_wait() 11206a43307cSja97890 * 11216a43307cSja97890 * We revert to using cv_wait() to avoid an 11226a43307cSja97890 * infinite loop which can occur if the calling 11236a43307cSja97890 * thread is higher priority than the squeue worker 11246a43307cSja97890 * thread and is bound to the same cpu. 112559031702Speterte */ 1126bd670b35SErik Nordmark if (connp->conn_linger && connp->conn_lingertime > 0) { 11277c478bd9Sstevel@tonic-gate mutex_exit(&tcp->tcp_closelock); 112859031702Speterte /* Entering squeue, bump ref count. 
*/ 112959031702Speterte CONN_INC_REF(connp); 113059031702Speterte bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 1131da14cebeSEric Cheng SQUEUE_ENTER_ONE(connp->conn_sqp, bp, 1132bd670b35SErik Nordmark tcp_linger_interrupted, connp, NULL, 1133da14cebeSEric Cheng tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); 113459031702Speterte mutex_enter(&tcp->tcp_closelock); 113559031702Speterte } 11366a43307cSja97890 break; 113759031702Speterte } 113859031702Speterte } 11396a43307cSja97890 while (!tcp->tcp_closed) 11406a43307cSja97890 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock); 114159031702Speterte mutex_exit(&tcp->tcp_closelock); 114259031702Speterte 11437c478bd9Sstevel@tonic-gate /* 11447c478bd9Sstevel@tonic-gate * In the case of listener streams that have eagers in the q or q0 1145bd670b35SErik Nordmark * we wait for the eagers to drop their reference to us. conn_rq and 1146bd670b35SErik Nordmark * conn_wq of the eagers point to our queues. By waiting for the 11477c478bd9Sstevel@tonic-gate * refcnt to drop to 1, we are sure that the eagers have cleaned 11487c478bd9Sstevel@tonic-gate * up their queue pointers and also dropped their references to us. 11493e95bd4aSAnders Persson * 11503e95bd4aSAnders Persson * For non-STREAMS sockets we do not have to wait here; the 11513e95bd4aSAnders Persson * listener will instead make a su_closed upcall when the last 11523e95bd4aSAnders Persson * reference is dropped. 
11537c478bd9Sstevel@tonic-gate */ 11543e95bd4aSAnders Persson if (tcp->tcp_wait_for_eagers && !IPCL_IS_NONSTR(connp)) { 11557c478bd9Sstevel@tonic-gate mutex_enter(&connp->conn_lock); 11567c478bd9Sstevel@tonic-gate while (connp->conn_ref != 1) { 11577c478bd9Sstevel@tonic-gate cv_wait(&connp->conn_cv, &connp->conn_lock); 11587c478bd9Sstevel@tonic-gate } 11597c478bd9Sstevel@tonic-gate mutex_exit(&connp->conn_lock); 11607c478bd9Sstevel@tonic-gate } 11617c478bd9Sstevel@tonic-gate 11623e95bd4aSAnders Persson nowait: 1163bd670b35SErik Nordmark connp->conn_cpid = NOPID; 11640f1702c5SYu Xiangning } 11650f1702c5SYu Xiangning 116659031702Speterte /* 116759031702Speterte * Called by tcp_close() routine via squeue when lingering is 116859031702Speterte * interrupted by a signal. 116959031702Speterte */ 117059031702Speterte 117159031702Speterte /* ARGSUSED */ 117259031702Speterte static void 1173bd670b35SErik Nordmark tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 117459031702Speterte { 117559031702Speterte conn_t *connp = (conn_t *)arg; 117659031702Speterte tcp_t *tcp = connp->conn_tcp; 117759031702Speterte 117859031702Speterte freeb(mp); 117959031702Speterte if (tcp->tcp_linger_tid != 0 && 118059031702Speterte TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 118159031702Speterte tcp_stop_lingering(tcp); 118259031702Speterte tcp->tcp_client_errno = EINTR; 118359031702Speterte } 118459031702Speterte } 11857c478bd9Sstevel@tonic-gate 11867c478bd9Sstevel@tonic-gate /* 11877c478bd9Sstevel@tonic-gate * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp. 11887c478bd9Sstevel@tonic-gate * Some stream heads get upset if they see these later on as anything but NULL. 
11897c478bd9Sstevel@tonic-gate */ 1190721fffe3SKacheong Poon void 11917c478bd9Sstevel@tonic-gate tcp_close_mpp(mblk_t **mpp) 11927c478bd9Sstevel@tonic-gate { 11937c478bd9Sstevel@tonic-gate mblk_t *mp; 11947c478bd9Sstevel@tonic-gate 11957c478bd9Sstevel@tonic-gate if ((mp = *mpp) != NULL) { 11967c478bd9Sstevel@tonic-gate do { 11977c478bd9Sstevel@tonic-gate mp->b_next = NULL; 11987c478bd9Sstevel@tonic-gate mp->b_prev = NULL; 11997c478bd9Sstevel@tonic-gate } while ((mp = mp->b_cont) != NULL); 12007c478bd9Sstevel@tonic-gate 12017c478bd9Sstevel@tonic-gate mp = *mpp; 12027c478bd9Sstevel@tonic-gate *mpp = NULL; 12037c478bd9Sstevel@tonic-gate freemsg(mp); 12047c478bd9Sstevel@tonic-gate } 12057c478bd9Sstevel@tonic-gate } 12067c478bd9Sstevel@tonic-gate 12077c478bd9Sstevel@tonic-gate /* Do detached close. */ 1208721fffe3SKacheong Poon void 12097c478bd9Sstevel@tonic-gate tcp_close_detached(tcp_t *tcp) 12107c478bd9Sstevel@tonic-gate { 12117c478bd9Sstevel@tonic-gate if (tcp->tcp_fused) 12127c478bd9Sstevel@tonic-gate tcp_unfuse(tcp); 12137c478bd9Sstevel@tonic-gate 12147c478bd9Sstevel@tonic-gate /* 12157c478bd9Sstevel@tonic-gate * Clustering code serializes TCP disconnect callbacks and 12167c478bd9Sstevel@tonic-gate * cluster tcp list walks by blocking a TCP disconnect callback 12177c478bd9Sstevel@tonic-gate * if a cluster tcp list walk is in progress. This ensures 12187c478bd9Sstevel@tonic-gate * accurate accounting of TCPs in the cluster code even though 12197c478bd9Sstevel@tonic-gate * the TCP list walk itself is not atomic. 12207c478bd9Sstevel@tonic-gate */ 12217c478bd9Sstevel@tonic-gate tcp_closei_local(tcp); 12227c478bd9Sstevel@tonic-gate CONN_DEC_REF(tcp->tcp_connp); 12237c478bd9Sstevel@tonic-gate } 12247c478bd9Sstevel@tonic-gate 12257c478bd9Sstevel@tonic-gate /* 12267c478bd9Sstevel@tonic-gate * The tcp_t is going away. Remove it from all lists and set it 12277c478bd9Sstevel@tonic-gate * to TCPS_CLOSED. 
The freeing up of memory is deferred until 12287c478bd9Sstevel@tonic-gate * tcp_inactive. This is needed since a thread in tcp_rput might have 12297c478bd9Sstevel@tonic-gate * done a CONN_INC_REF on this structure before it was removed from the 12307c478bd9Sstevel@tonic-gate * hashes. 12317c478bd9Sstevel@tonic-gate */ 1232721fffe3SKacheong Poon void 12337c478bd9Sstevel@tonic-gate tcp_closei_local(tcp_t *tcp) 12347c478bd9Sstevel@tonic-gate { 12357c478bd9Sstevel@tonic-gate conn_t *connp = tcp->tcp_connp; 1236f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 12379cd928feSAlan Maguire int32_t oldstate; 12387c478bd9Sstevel@tonic-gate 12397c478bd9Sstevel@tonic-gate if (!TCP_IS_SOCKET(tcp)) 12407c478bd9Sstevel@tonic-gate tcp_acceptor_hash_remove(tcp); 12417c478bd9Sstevel@tonic-gate 1242721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); 12437c478bd9Sstevel@tonic-gate tcp->tcp_ibsegs = 0; 1244721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); 12457c478bd9Sstevel@tonic-gate tcp->tcp_obsegs = 0; 1246c28749e9Skais 12477c478bd9Sstevel@tonic-gate /* 1248721fffe3SKacheong Poon * This can be called via tcp_time_wait_processing() if TCP gets a 1249721fffe3SKacheong Poon * SYN with sequence number outside the TIME-WAIT connection's 1250721fffe3SKacheong Poon * window. So we need to check for TIME-WAIT state here as the 1251721fffe3SKacheong Poon * connection counter is already decremented. 
See SET_TIME_WAIT() 1252721fffe3SKacheong Poon * macro 1253721fffe3SKacheong Poon */ 1254721fffe3SKacheong Poon if (tcp->tcp_state >= TCPS_ESTABLISHED && 1255721fffe3SKacheong Poon tcp->tcp_state < TCPS_TIME_WAIT) { 1256721fffe3SKacheong Poon TCPS_CONN_DEC(tcps); 1257721fffe3SKacheong Poon } 1258721fffe3SKacheong Poon 1259721fffe3SKacheong Poon /* 12607c478bd9Sstevel@tonic-gate * If we are an eager connection hanging off a listener that 12617c478bd9Sstevel@tonic-gate * hasn't formally accepted the connection yet, get off his 12627c478bd9Sstevel@tonic-gate * list and blow off any data that we have accumulated. 12637c478bd9Sstevel@tonic-gate */ 12647c478bd9Sstevel@tonic-gate if (tcp->tcp_listener != NULL) { 12657c478bd9Sstevel@tonic-gate tcp_t *listener = tcp->tcp_listener; 12667c478bd9Sstevel@tonic-gate mutex_enter(&listener->tcp_eager_lock); 12677c478bd9Sstevel@tonic-gate /* 1268866ba9ddSjprakash * tcp_tconnind_started == B_TRUE means that the 12697c478bd9Sstevel@tonic-gate * conn_ind has already gone to listener. At 12707c478bd9Sstevel@tonic-gate * this point, eager will be closed but we 12717c478bd9Sstevel@tonic-gate * leave it in listeners eager list so that 12727c478bd9Sstevel@tonic-gate * if listener decides to close without doing 1273bd670b35SErik Nordmark * accept, we can clean this up. In tcp_tli_accept 1274866ba9ddSjprakash * we take care of the case of accept on closed 12757c478bd9Sstevel@tonic-gate * eager. 
12767c478bd9Sstevel@tonic-gate */ 1277866ba9ddSjprakash if (!tcp->tcp_tconnind_started) { 12787c478bd9Sstevel@tonic-gate tcp_eager_unlink(tcp); 12797c478bd9Sstevel@tonic-gate mutex_exit(&listener->tcp_eager_lock); 12807c478bd9Sstevel@tonic-gate /* 12817c478bd9Sstevel@tonic-gate * We don't want to have any pointers to the 12827c478bd9Sstevel@tonic-gate * listener queue, after we have released our 12837c478bd9Sstevel@tonic-gate * reference on the listener 12847c478bd9Sstevel@tonic-gate */ 1285bd670b35SErik Nordmark ASSERT(tcp->tcp_detached); 1286bd670b35SErik Nordmark connp->conn_rq = NULL; 1287bd670b35SErik Nordmark connp->conn_wq = NULL; 12887c478bd9Sstevel@tonic-gate CONN_DEC_REF(listener->tcp_connp); 12897c478bd9Sstevel@tonic-gate } else { 12907c478bd9Sstevel@tonic-gate mutex_exit(&listener->tcp_eager_lock); 12917c478bd9Sstevel@tonic-gate } 12927c478bd9Sstevel@tonic-gate } 12937c478bd9Sstevel@tonic-gate 12947c478bd9Sstevel@tonic-gate /* Stop all the timers */ 12957c478bd9Sstevel@tonic-gate tcp_timers_stop(tcp); 12967c478bd9Sstevel@tonic-gate 12977c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_LISTEN) { 12987c478bd9Sstevel@tonic-gate if (tcp->tcp_ip_addr_cache) { 12997c478bd9Sstevel@tonic-gate kmem_free((void *)tcp->tcp_ip_addr_cache, 13007c478bd9Sstevel@tonic-gate IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 13017c478bd9Sstevel@tonic-gate tcp->tcp_ip_addr_cache = NULL; 13027c478bd9Sstevel@tonic-gate } 13037c478bd9Sstevel@tonic-gate } 130493fcb0b9SKacheong Poon 130593fcb0b9SKacheong Poon /* Decrement listerner connection counter if necessary. 
*/ 130693fcb0b9SKacheong Poon if (tcp->tcp_listen_cnt != NULL) 130793fcb0b9SKacheong Poon TCP_DECR_LISTEN_CNT(tcp); 130893fcb0b9SKacheong Poon 1309e0968231Svi117747 mutex_enter(&tcp->tcp_non_sq_lock); 13107c478bd9Sstevel@tonic-gate if (tcp->tcp_flow_stopped) 13117c478bd9Sstevel@tonic-gate tcp_clrqfull(tcp); 1312e0968231Svi117747 mutex_exit(&tcp->tcp_non_sq_lock); 13137c478bd9Sstevel@tonic-gate 13147c478bd9Sstevel@tonic-gate tcp_bind_hash_remove(tcp); 13157c478bd9Sstevel@tonic-gate /* 13167c478bd9Sstevel@tonic-gate * If the tcp_time_wait_collector (which runs outside the squeue) 13177c478bd9Sstevel@tonic-gate * is trying to remove this tcp from the time wait list, we will 13187c478bd9Sstevel@tonic-gate * block in tcp_time_wait_remove while trying to acquire the 13197c478bd9Sstevel@tonic-gate * tcp_time_wait_lock. The logic in tcp_time_wait_collector also 13207c478bd9Sstevel@tonic-gate * requires the ipcl_hash_remove to be ordered after the 13217c478bd9Sstevel@tonic-gate * tcp_time_wait_remove for the refcnt checks to work correctly. 
13227c478bd9Sstevel@tonic-gate */ 13237c478bd9Sstevel@tonic-gate if (tcp->tcp_state == TCPS_TIME_WAIT) 1324866ba9ddSjprakash (void) tcp_time_wait_remove(tcp, NULL); 1325bd670b35SErik Nordmark CL_INET_DISCONNECT(connp); 13267c478bd9Sstevel@tonic-gate ipcl_hash_remove(connp); 13279cd928feSAlan Maguire oldstate = tcp->tcp_state; 13289cd928feSAlan Maguire tcp->tcp_state = TCPS_CLOSED; 13299cd928feSAlan Maguire /* Need to probe before ixa_cleanup() is called */ 13309cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 13319cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 13329cd928feSAlan Maguire int32_t, oldstate); 1333bd670b35SErik Nordmark ixa_cleanup(connp->conn_ixa); 13347c478bd9Sstevel@tonic-gate 13357c478bd9Sstevel@tonic-gate /* 1336bd670b35SErik Nordmark * Mark the conn as CONDEMNED 13377c478bd9Sstevel@tonic-gate */ 13387c478bd9Sstevel@tonic-gate mutex_enter(&connp->conn_lock); 13397c478bd9Sstevel@tonic-gate connp->conn_state_flags |= CONN_CONDEMNED; 13407c478bd9Sstevel@tonic-gate mutex_exit(&connp->conn_lock); 13417c478bd9Sstevel@tonic-gate 13427c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_next == NULL); 13437c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_prev == NULL); 13447c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_expire == 0); 1345c28749e9Skais 1346f4b3ec61Sdh155122 tcp_ipsec_cleanup(tcp); 13477c478bd9Sstevel@tonic-gate } 13487c478bd9Sstevel@tonic-gate 13497c478bd9Sstevel@tonic-gate /* 13507c478bd9Sstevel@tonic-gate * tcp is dying (called from ipcl_conn_destroy and error cases). 13517c478bd9Sstevel@tonic-gate * Free the tcp_t in either case. 
13527c478bd9Sstevel@tonic-gate */ 13537c478bd9Sstevel@tonic-gate void 13547c478bd9Sstevel@tonic-gate tcp_free(tcp_t *tcp) 13557c478bd9Sstevel@tonic-gate { 13567c478bd9Sstevel@tonic-gate mblk_t *mp; 1357bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 13587c478bd9Sstevel@tonic-gate 13597c478bd9Sstevel@tonic-gate ASSERT(tcp != NULL); 13607c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); 13617c478bd9Sstevel@tonic-gate 1362bd670b35SErik Nordmark connp->conn_rq = NULL; 1363bd670b35SErik Nordmark connp->conn_wq = NULL; 13647c478bd9Sstevel@tonic-gate 13657c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_xmit_head); 13667c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_reass_head); 13677c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) { 13687c478bd9Sstevel@tonic-gate /* Free b_next chain */ 13697c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_rcv_list); 13707c478bd9Sstevel@tonic-gate } 13717c478bd9Sstevel@tonic-gate if ((mp = tcp->tcp_urp_mp) != NULL) { 13727c478bd9Sstevel@tonic-gate freemsg(mp); 13737c478bd9Sstevel@tonic-gate } 13747c478bd9Sstevel@tonic-gate if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 13757c478bd9Sstevel@tonic-gate freemsg(mp); 13767c478bd9Sstevel@tonic-gate } 13777c478bd9Sstevel@tonic-gate 13787c478bd9Sstevel@tonic-gate if (tcp->tcp_fused_sigurg_mp != NULL) { 13790f1702c5SYu Xiangning ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 13807c478bd9Sstevel@tonic-gate freeb(tcp->tcp_fused_sigurg_mp); 13817c478bd9Sstevel@tonic-gate tcp->tcp_fused_sigurg_mp = NULL; 13827c478bd9Sstevel@tonic-gate } 13837c478bd9Sstevel@tonic-gate 1384a6fd05c9SKacheong Poon if (tcp->tcp_ordrel_mp != NULL) { 13850f1702c5SYu Xiangning ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 1386a6fd05c9SKacheong Poon freeb(tcp->tcp_ordrel_mp); 1387a6fd05c9SKacheong Poon tcp->tcp_ordrel_mp = NULL; 1388a6fd05c9SKacheong Poon } 1389a6fd05c9SKacheong Poon 139066cd0f60SKacheong Poon TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 
139166cd0f60SKacheong Poon bzero(&tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 13927c478bd9Sstevel@tonic-gate 13937c478bd9Sstevel@tonic-gate if (tcp->tcp_hopopts != NULL) { 13947c478bd9Sstevel@tonic-gate mi_free(tcp->tcp_hopopts); 13957c478bd9Sstevel@tonic-gate tcp->tcp_hopopts = NULL; 13967c478bd9Sstevel@tonic-gate tcp->tcp_hopoptslen = 0; 13977c478bd9Sstevel@tonic-gate } 13987c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_hopoptslen == 0); 13997c478bd9Sstevel@tonic-gate if (tcp->tcp_dstopts != NULL) { 14007c478bd9Sstevel@tonic-gate mi_free(tcp->tcp_dstopts); 14017c478bd9Sstevel@tonic-gate tcp->tcp_dstopts = NULL; 14027c478bd9Sstevel@tonic-gate tcp->tcp_dstoptslen = 0; 14037c478bd9Sstevel@tonic-gate } 14047c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_dstoptslen == 0); 1405bd670b35SErik Nordmark if (tcp->tcp_rthdrdstopts != NULL) { 1406bd670b35SErik Nordmark mi_free(tcp->tcp_rthdrdstopts); 1407bd670b35SErik Nordmark tcp->tcp_rthdrdstopts = NULL; 1408bd670b35SErik Nordmark tcp->tcp_rthdrdstoptslen = 0; 14097c478bd9Sstevel@tonic-gate } 1410bd670b35SErik Nordmark ASSERT(tcp->tcp_rthdrdstoptslen == 0); 14117c478bd9Sstevel@tonic-gate if (tcp->tcp_rthdr != NULL) { 14127c478bd9Sstevel@tonic-gate mi_free(tcp->tcp_rthdr); 14137c478bd9Sstevel@tonic-gate tcp->tcp_rthdr = NULL; 14147c478bd9Sstevel@tonic-gate tcp->tcp_rthdrlen = 0; 14157c478bd9Sstevel@tonic-gate } 14167c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_rthdrlen == 0); 14177c478bd9Sstevel@tonic-gate 14187c478bd9Sstevel@tonic-gate /* 14197c478bd9Sstevel@tonic-gate * Following is really a blowing away a union. 14207c478bd9Sstevel@tonic-gate * It happens to have exactly two members of identical size 14217c478bd9Sstevel@tonic-gate * the following code is enough. 
14227c478bd9Sstevel@tonic-gate */ 14237c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 14243e95bd4aSAnders Persson 14253e95bd4aSAnders Persson /* 14263e95bd4aSAnders Persson * If this is a non-STREAM socket still holding on to an upper 14273e95bd4aSAnders Persson * handle, release it. As a result of fallback we might also see 14283e95bd4aSAnders Persson * STREAMS based conns with upper handles, in which case there is 14293e95bd4aSAnders Persson * nothing to do other than clearing the field. 14303e95bd4aSAnders Persson */ 14313e95bd4aSAnders Persson if (connp->conn_upper_handle != NULL) { 14323e95bd4aSAnders Persson if (IPCL_IS_NONSTR(connp)) { 14333e95bd4aSAnders Persson (*connp->conn_upcalls->su_closed)( 14343e95bd4aSAnders Persson connp->conn_upper_handle); 14353e95bd4aSAnders Persson tcp->tcp_detached = B_TRUE; 14363e95bd4aSAnders Persson } 14373e95bd4aSAnders Persson connp->conn_upper_handle = NULL; 14383e95bd4aSAnders Persson connp->conn_upcalls = NULL; 14393e95bd4aSAnders Persson } 14407c478bd9Sstevel@tonic-gate } 14417c478bd9Sstevel@tonic-gate 14427c478bd9Sstevel@tonic-gate /* 14437c478bd9Sstevel@tonic-gate * tcp_get_conn/tcp_free_conn 14447c478bd9Sstevel@tonic-gate * 14457c478bd9Sstevel@tonic-gate * tcp_get_conn is used to get a clean tcp connection structure. 14467c478bd9Sstevel@tonic-gate * It tries to reuse the connections put on the freelist by the 14477c478bd9Sstevel@tonic-gate * time_wait_collector failing which it goes to kmem_cache. This 14487c478bd9Sstevel@tonic-gate * way has two benefits compared to just allocating from and 14497c478bd9Sstevel@tonic-gate * freeing to kmem_cache. 14507c478bd9Sstevel@tonic-gate * 1) The time_wait_collector can free (which includes the cleanup) 14517c478bd9Sstevel@tonic-gate * outside the squeue. So when the interrupt comes, we have a clean 14527c478bd9Sstevel@tonic-gate * connection sitting in the freelist. Obviously, this buys us 14537c478bd9Sstevel@tonic-gate * performance. 
14547c478bd9Sstevel@tonic-gate * 1455bd670b35SErik Nordmark * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener 1456bd670b35SErik Nordmark * has multiple disadvantages - tying up the squeue during alloc. 14577c478bd9Sstevel@tonic-gate * But allocating the conn/tcp in IP land is also not the best since 14587c478bd9Sstevel@tonic-gate * we can't check the 'q' and 'q0' which are protected by squeue and 14597c478bd9Sstevel@tonic-gate * blindly allocate memory which might have to be freed here if we are 14607c478bd9Sstevel@tonic-gate * not allowed to accept the connection. By using the freelist and 14617c478bd9Sstevel@tonic-gate * putting the conn/tcp back in freelist, we don't pay a penalty for 14627c478bd9Sstevel@tonic-gate * allocating memory without checking 'q/q0' and freeing it if we can't 14637c478bd9Sstevel@tonic-gate * accept the connection. 14647c478bd9Sstevel@tonic-gate * 14657c478bd9Sstevel@tonic-gate * Care should be taken to put the conn back in the same squeue's freelist 14667c478bd9Sstevel@tonic-gate * from which it was allocated. Best results are obtained if conn is 14677c478bd9Sstevel@tonic-gate * allocated from listener's squeue and freed to the same. Time wait 14687c478bd9Sstevel@tonic-gate * collector will free up the freelist is the connection ends up sitting 14697c478bd9Sstevel@tonic-gate * there for too long. 
14707c478bd9Sstevel@tonic-gate */ 14717c478bd9Sstevel@tonic-gate void * 1472f3124163SAnders Persson tcp_get_conn(void *arg, tcp_stack_t *tcps) 14737c478bd9Sstevel@tonic-gate { 14747c478bd9Sstevel@tonic-gate tcp_t *tcp = NULL; 14757c478bd9Sstevel@tonic-gate conn_t *connp = NULL; 14767c478bd9Sstevel@tonic-gate squeue_t *sqp = (squeue_t *)arg; 14777c478bd9Sstevel@tonic-gate tcp_squeue_priv_t *tcp_time_wait; 1478f4b3ec61Sdh155122 netstack_t *ns; 1479eead73cfSRao Shoaib mblk_t *tcp_rsrv_mp = NULL; 14807c478bd9Sstevel@tonic-gate 14817c478bd9Sstevel@tonic-gate tcp_time_wait = 14827c478bd9Sstevel@tonic-gate *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 14837c478bd9Sstevel@tonic-gate 14847c478bd9Sstevel@tonic-gate mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 14857c478bd9Sstevel@tonic-gate tcp = tcp_time_wait->tcp_free_list; 14861dbf515bSethindra ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0)); 14877c478bd9Sstevel@tonic-gate if (tcp != NULL) { 14887c478bd9Sstevel@tonic-gate tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 14891dbf515bSethindra tcp_time_wait->tcp_free_list_cnt--; 14907c478bd9Sstevel@tonic-gate mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 14917c478bd9Sstevel@tonic-gate tcp->tcp_time_wait_next = NULL; 14927c478bd9Sstevel@tonic-gate connp = tcp->tcp_connp; 14937c478bd9Sstevel@tonic-gate connp->conn_flags |= IPCL_REUSED; 1494f4b3ec61Sdh155122 1495f4b3ec61Sdh155122 ASSERT(tcp->tcp_tcps == NULL); 1496f4b3ec61Sdh155122 ASSERT(connp->conn_netstack == NULL); 1497f3124163SAnders Persson ASSERT(tcp->tcp_rsrv_mp != NULL); 1498f4b3ec61Sdh155122 ns = tcps->tcps_netstack; 1499f4b3ec61Sdh155122 netstack_hold(ns); 1500f4b3ec61Sdh155122 connp->conn_netstack = ns; 1501bd670b35SErik Nordmark connp->conn_ixa->ixa_ipst = ns->netstack_ip; 1502f4b3ec61Sdh155122 tcp->tcp_tcps = tcps; 1503f4b3ec61Sdh155122 ipcl_globalhash_insert(connp); 1504bd670b35SErik Nordmark 1505bd670b35SErik Nordmark connp->conn_ixa->ixa_notify_cookie = tcp; 
1506bd670b35SErik Nordmark ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); 1507bd670b35SErik Nordmark connp->conn_recv = tcp_input_data; 1508bd670b35SErik Nordmark ASSERT(connp->conn_recvicmp == tcp_icmp_input); 1509bd670b35SErik Nordmark ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); 15107c478bd9Sstevel@tonic-gate return ((void *)connp); 15117c478bd9Sstevel@tonic-gate } 15127c478bd9Sstevel@tonic-gate mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1513f7f8e53dSKacheong Poon /* 1514f3124163SAnders Persson * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until 1515f3124163SAnders Persson * this conn_t/tcp_t is freed at ipcl_conn_destroy(). 1516f7f8e53dSKacheong Poon */ 1517eead73cfSRao Shoaib tcp_rsrv_mp = allocb(0, BPRI_HI); 1518eead73cfSRao Shoaib if (tcp_rsrv_mp == NULL) 1519f7f8e53dSKacheong Poon return (NULL); 1520f3124163SAnders Persson 1521eead73cfSRao Shoaib if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, 1522eead73cfSRao Shoaib tcps->tcps_netstack)) == NULL) { 1523eead73cfSRao Shoaib freeb(tcp_rsrv_mp); 1524eead73cfSRao Shoaib return (NULL); 1525eead73cfSRao Shoaib } 1526aa4b59d3SKacheong Poon 1527eead73cfSRao Shoaib tcp = connp->conn_tcp; 1528eead73cfSRao Shoaib tcp->tcp_rsrv_mp = tcp_rsrv_mp; 1529f7f8e53dSKacheong Poon mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL); 1530eead73cfSRao Shoaib 1531f4b3ec61Sdh155122 tcp->tcp_tcps = tcps; 1532f7f8e53dSKacheong Poon 1533bd670b35SErik Nordmark connp->conn_recv = tcp_input_data; 1534bd670b35SErik Nordmark connp->conn_recvicmp = tcp_icmp_input; 1535bd670b35SErik Nordmark connp->conn_verifyicmp = tcp_verifyicmp; 15367c478bd9Sstevel@tonic-gate 153745916cd2Sjpk /* 1538bd670b35SErik Nordmark * Register tcp_notify to listen to capability changes detected by IP. 1539bd670b35SErik Nordmark * This upcall is made in the context of the call to conn_ip_output 1540bd670b35SErik Nordmark * thus it is inside the squeue. 
154145916cd2Sjpk */ 1542bd670b35SErik Nordmark connp->conn_ixa->ixa_notify = tcp_notify; 1543bd670b35SErik Nordmark connp->conn_ixa->ixa_notify_cookie = tcp; 154445916cd2Sjpk 1545bd670b35SErik Nordmark return ((void *)connp); 154645916cd2Sjpk } 154745916cd2Sjpk 15487c478bd9Sstevel@tonic-gate /* 15497c478bd9Sstevel@tonic-gate * Handle connect to IPv4 destinations, including connections for AF_INET6 15507c478bd9Sstevel@tonic-gate * sockets connecting to IPv4 mapped IPv6 destinations. 1551bd670b35SErik Nordmark * Returns zero if OK, a positive errno, or a negative TLI error. 15527c478bd9Sstevel@tonic-gate */ 15530f1702c5SYu Xiangning static int 15540f1702c5SYu Xiangning tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, 1555bd670b35SErik Nordmark uint_t srcid) 15567c478bd9Sstevel@tonic-gate { 15577c478bd9Sstevel@tonic-gate ipaddr_t dstaddr = *dstaddrp; 155842274831Sgt145670 uint16_t lport; 1559bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 1560f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 1561bd670b35SErik Nordmark int error; 15627c478bd9Sstevel@tonic-gate 1563bd670b35SErik Nordmark ASSERT(connp->conn_ipversion == IPV4_VERSION); 15647c478bd9Sstevel@tonic-gate 15657c478bd9Sstevel@tonic-gate /* Check for attempt to connect to INADDR_ANY */ 15667c478bd9Sstevel@tonic-gate if (dstaddr == INADDR_ANY) { 15677c478bd9Sstevel@tonic-gate /* 15687c478bd9Sstevel@tonic-gate * SunOS 4.x and 4.3 BSD allow an application 15697c478bd9Sstevel@tonic-gate * to connect a TCP socket to INADDR_ANY. 15707c478bd9Sstevel@tonic-gate * When they do this, the kernel picks the 15717c478bd9Sstevel@tonic-gate * address of one interface and uses it 15727c478bd9Sstevel@tonic-gate * instead. The kernel usually ends up 15737c478bd9Sstevel@tonic-gate * picking the address of the loopback 15747c478bd9Sstevel@tonic-gate * interface. This is an undocumented feature. 
15757c478bd9Sstevel@tonic-gate * However, we provide the same thing here 15767c478bd9Sstevel@tonic-gate * in order to have source and binary 15777c478bd9Sstevel@tonic-gate * compatibility with SunOS 4.x. 15787c478bd9Sstevel@tonic-gate * Update the T_CONN_REQ (sin/sin6) since it is used to 15797c478bd9Sstevel@tonic-gate * generate the T_CONN_CON. 15807c478bd9Sstevel@tonic-gate */ 15817c478bd9Sstevel@tonic-gate dstaddr = htonl(INADDR_LOOPBACK); 15827c478bd9Sstevel@tonic-gate *dstaddrp = dstaddr; 15837c478bd9Sstevel@tonic-gate } 15847c478bd9Sstevel@tonic-gate 15857c478bd9Sstevel@tonic-gate /* Handle __sin6_src_id if socket not bound to an IP address */ 1586bd670b35SErik Nordmark if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) { 1587a1ca8b43SDan McDonald if (!ip_srcid_find_id(srcid, &connp->conn_laddr_v6, 1588a1ca8b43SDan McDonald IPCL_ZONEID(connp), B_TRUE, tcps->tcps_netstack)) { 1589a1ca8b43SDan McDonald /* Mismatch - conn_laddr_v6 would be v6 address. */ 1590a1ca8b43SDan McDonald return (EADDRNOTAVAIL); 1591a1ca8b43SDan McDonald } 1592bd670b35SErik Nordmark connp->conn_saddr_v6 = connp->conn_laddr_v6; 15937c478bd9Sstevel@tonic-gate } 15947c478bd9Sstevel@tonic-gate 1595bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6); 1596bd670b35SErik Nordmark connp->conn_fport = dstport; 15977c478bd9Sstevel@tonic-gate 15985f9878b0Sken Powell - Sun Microsystem /* 159942274831Sgt145670 * At this point the remote destination address and remote port fields 160042274831Sgt145670 * in the tcp-four-tuple have been filled in the tcp structure. Now we 1601bd670b35SErik Nordmark * have to see which state tcp was in so we can take appropriate action. 
160242274831Sgt145670 */ 1603bd670b35SErik Nordmark if (tcp->tcp_state == TCPS_IDLE) { 160442274831Sgt145670 /* 160542274831Sgt145670 * We support a quick connect capability here, allowing 160642274831Sgt145670 * clients to transition directly from IDLE to SYN_SENT 160742274831Sgt145670 * tcp_bindi will pick an unused port, insert the connection 160842274831Sgt145670 * in the bind hash and transition to BOUND state. 160942274831Sgt145670 */ 1610f4b3ec61Sdh155122 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 1611f4b3ec61Sdh155122 tcp, B_TRUE); 1612bd670b35SErik Nordmark lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, 161342274831Sgt145670 B_FALSE, B_FALSE); 1614bd670b35SErik Nordmark if (lport == 0) 1615bd670b35SErik Nordmark return (-TNOADDR); 161642274831Sgt145670 } 16170f1702c5SYu Xiangning 1618fc80c0dfSnordmark /* 1619bd670b35SErik Nordmark * Lookup the route to determine a source address and the uinfo. 1620bd670b35SErik Nordmark * Setup TCP parameters based on the metrics/DCE. 1621fc80c0dfSnordmark */ 1622bd670b35SErik Nordmark error = tcp_set_destination(tcp); 1623bd670b35SErik Nordmark if (error != 0) 16240f1702c5SYu Xiangning return (error); 1625bd670b35SErik Nordmark 1626bd670b35SErik Nordmark /* 1627bd670b35SErik Nordmark * Don't let an endpoint connect to itself. 1628bd670b35SErik Nordmark */ 1629bd670b35SErik Nordmark if (connp->conn_faddr_v4 == connp->conn_laddr_v4 && 1630bd670b35SErik Nordmark connp->conn_fport == connp->conn_lport) 1631bd670b35SErik Nordmark return (-TBADADDR); 1632bd670b35SErik Nordmark 1633bd670b35SErik Nordmark tcp->tcp_state = TCPS_SYN_SENT; 1634bd670b35SErik Nordmark 1635bd670b35SErik Nordmark return (ipcl_conn_insert_v4(connp)); 16367c478bd9Sstevel@tonic-gate } 16377c478bd9Sstevel@tonic-gate 16387c478bd9Sstevel@tonic-gate /* 16397c478bd9Sstevel@tonic-gate * Handle connect to IPv6 destinations. 1640bd670b35SErik Nordmark * Returns zero if OK, a positive errno, or a negative TLI error. 
16417c478bd9Sstevel@tonic-gate */ 16420f1702c5SYu Xiangning static int 16430f1702c5SYu Xiangning tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, 1644bd670b35SErik Nordmark uint32_t flowinfo, uint_t srcid, uint32_t scope_id) 16457c478bd9Sstevel@tonic-gate { 164642274831Sgt145670 uint16_t lport; 16470f1702c5SYu Xiangning conn_t *connp = tcp->tcp_connp; 1648bd670b35SErik Nordmark tcp_stack_t *tcps = tcp->tcp_tcps; 1649bd670b35SErik Nordmark int error; 16507c478bd9Sstevel@tonic-gate 1651bd670b35SErik Nordmark ASSERT(connp->conn_family == AF_INET6); 16527c478bd9Sstevel@tonic-gate 16537c478bd9Sstevel@tonic-gate /* 16547c478bd9Sstevel@tonic-gate * If we're here, it means that the destination address is a native 1655bd670b35SErik Nordmark * IPv6 address. Return an error if conn_ipversion is not IPv6. A 16567c478bd9Sstevel@tonic-gate * reason why it might not be IPv6 is if the socket was bound to an 16577c478bd9Sstevel@tonic-gate * IPv4-mapped IPv6 address. 16587c478bd9Sstevel@tonic-gate */ 1659bd670b35SErik Nordmark if (connp->conn_ipversion != IPV6_VERSION) 16600f1702c5SYu Xiangning return (-TBADADDR); 16617c478bd9Sstevel@tonic-gate 16627c478bd9Sstevel@tonic-gate /* 16637c478bd9Sstevel@tonic-gate * Interpret a zero destination to mean loopback. 16647c478bd9Sstevel@tonic-gate * Update the T_CONN_REQ (sin/sin6) since it is used to 16657c478bd9Sstevel@tonic-gate * generate the T_CONN_CON. 
16667c478bd9Sstevel@tonic-gate */ 1667bd670b35SErik Nordmark if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) 16687c478bd9Sstevel@tonic-gate *dstaddrp = ipv6_loopback; 16697c478bd9Sstevel@tonic-gate 16707c478bd9Sstevel@tonic-gate /* Handle __sin6_src_id if socket not bound to an IP address */ 1671bd670b35SErik Nordmark if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1672a1ca8b43SDan McDonald if (!ip_srcid_find_id(srcid, &connp->conn_laddr_v6, 1673a1ca8b43SDan McDonald IPCL_ZONEID(connp), B_FALSE, tcps->tcps_netstack)) { 1674a1ca8b43SDan McDonald /* Mismatch - conn_laddr_v6 would be v4-mapped. */ 1675a1ca8b43SDan McDonald return (EADDRNOTAVAIL); 1676a1ca8b43SDan McDonald } 1677bd670b35SErik Nordmark connp->conn_saddr_v6 = connp->conn_laddr_v6; 16787c478bd9Sstevel@tonic-gate } 16797c478bd9Sstevel@tonic-gate 16807c478bd9Sstevel@tonic-gate /* 1681bd670b35SErik Nordmark * Take care of the scope_id now. 16827c478bd9Sstevel@tonic-gate */ 1683bd670b35SErik Nordmark if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { 1684bd670b35SErik Nordmark connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 1685bd670b35SErik Nordmark connp->conn_ixa->ixa_scopeid = scope_id; 16867c478bd9Sstevel@tonic-gate } else { 1687bd670b35SErik Nordmark connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 16887c478bd9Sstevel@tonic-gate } 16897c478bd9Sstevel@tonic-gate 1690bd670b35SErik Nordmark connp->conn_flowinfo = flowinfo; 1691bd670b35SErik Nordmark connp->conn_faddr_v6 = *dstaddrp; 1692bd670b35SErik Nordmark connp->conn_fport = dstport; 16937c478bd9Sstevel@tonic-gate 16945f9878b0Sken Powell - Sun Microsystem /* 169542274831Sgt145670 * At this point the remote destination address and remote port fields 169642274831Sgt145670 * in the tcp-four-tuple have been filled in the tcp structure. Now we 1697bd670b35SErik Nordmark * have to see which state tcp was in so we can take appropriate action. 
169842274831Sgt145670 */ 1699bd670b35SErik Nordmark if (tcp->tcp_state == TCPS_IDLE) { 170042274831Sgt145670 /* 170142274831Sgt145670 * We support a quick connect capability here, allowing 170242274831Sgt145670 * clients to transition directly from IDLE to SYN_SENT 170342274831Sgt145670 * tcp_bindi will pick an unused port, insert the connection 170442274831Sgt145670 * in the bind hash and transition to BOUND state. 170542274831Sgt145670 */ 1706f4b3ec61Sdh155122 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 1707f4b3ec61Sdh155122 tcp, B_TRUE); 1708bd670b35SErik Nordmark lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, 170942274831Sgt145670 B_FALSE, B_FALSE); 1710bd670b35SErik Nordmark if (lport == 0) 1711bd670b35SErik Nordmark return (-TNOADDR); 171242274831Sgt145670 } 1713bd670b35SErik Nordmark 1714bd670b35SErik Nordmark /* 1715bd670b35SErik Nordmark * Lookup the route to determine a source address and the uinfo. 1716bd670b35SErik Nordmark * Setup TCP parameters based on the metrics/DCE. 1717bd670b35SErik Nordmark */ 1718bd670b35SErik Nordmark error = tcp_set_destination(tcp); 1719bd670b35SErik Nordmark if (error != 0) 1720bd670b35SErik Nordmark return (error); 1721bd670b35SErik Nordmark 1722bd670b35SErik Nordmark /* 1723bd670b35SErik Nordmark * Don't let an endpoint connect to itself. 
1724bd670b35SErik Nordmark */ 1725bd670b35SErik Nordmark if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) && 1726bd670b35SErik Nordmark connp->conn_fport == connp->conn_lport) 1727bd670b35SErik Nordmark return (-TBADADDR); 1728bd670b35SErik Nordmark 17297c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_SYN_SENT; 17300f1702c5SYu Xiangning 1731bd670b35SErik Nordmark return (ipcl_conn_insert_v6(connp)); 17327c478bd9Sstevel@tonic-gate } 17337c478bd9Sstevel@tonic-gate 17347c478bd9Sstevel@tonic-gate /* 1735bd670b35SErik Nordmark * Disconnect 1736bd670b35SErik Nordmark * Note that unlike other functions this returns a positive tli error 1737bd670b35SErik Nordmark * when it fails; it never returns an errno. 17387c478bd9Sstevel@tonic-gate */ 17390f1702c5SYu Xiangning static int 17400f1702c5SYu Xiangning tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) 17417c478bd9Sstevel@tonic-gate { 1742bd670b35SErik Nordmark conn_t *lconnp; 1743f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 1744bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 17457c478bd9Sstevel@tonic-gate 17467c478bd9Sstevel@tonic-gate /* 17477c478bd9Sstevel@tonic-gate * Right now, upper modules pass down a T_DISCON_REQ to TCP, 17487c478bd9Sstevel@tonic-gate * when the stream is in BOUND state. Do not send a reset, 17497c478bd9Sstevel@tonic-gate * since the destination IP address is not valid, and it can 17507c478bd9Sstevel@tonic-gate * be the initialized value of all zeros (broadcast address). 
17517c478bd9Sstevel@tonic-gate */ 17520e0e37a8SErik Nordmark if (tcp->tcp_state <= TCPS_BOUND) { 1753bd670b35SErik Nordmark if (connp->conn_debug) { 1754ff550d0eSmasputra (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 17557c478bd9Sstevel@tonic-gate "tcp_disconnect: bad state, %d", tcp->tcp_state); 17567c478bd9Sstevel@tonic-gate } 17570f1702c5SYu Xiangning return (TOUTSTATE); 1758721fffe3SKacheong Poon } else if (tcp->tcp_state >= TCPS_ESTABLISHED) { 1759721fffe3SKacheong Poon TCPS_CONN_DEC(tcps); 17607c478bd9Sstevel@tonic-gate } 17617c478bd9Sstevel@tonic-gate 17627c478bd9Sstevel@tonic-gate if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { 17637c478bd9Sstevel@tonic-gate 17647c478bd9Sstevel@tonic-gate /* 17657c478bd9Sstevel@tonic-gate * According to TPI, for non-listeners, ignore seqnum 17667c478bd9Sstevel@tonic-gate * and disconnect. 17677c478bd9Sstevel@tonic-gate * Following interpretation of -1 seqnum is historical 17687c478bd9Sstevel@tonic-gate * and implied TPI ? (TPI only states that for T_CONN_IND, 17697c478bd9Sstevel@tonic-gate * a valid seqnum should not be -1). 17707c478bd9Sstevel@tonic-gate * 17717c478bd9Sstevel@tonic-gate * -1 means disconnect everything 17727c478bd9Sstevel@tonic-gate * regardless even on a listener. 17737c478bd9Sstevel@tonic-gate */ 17747c478bd9Sstevel@tonic-gate 17757c478bd9Sstevel@tonic-gate int old_state = tcp->tcp_state; 1776f4b3ec61Sdh155122 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 17777c478bd9Sstevel@tonic-gate 17787c478bd9Sstevel@tonic-gate /* 17797c478bd9Sstevel@tonic-gate * The connection can't be on the tcp_time_wait_head list 17807c478bd9Sstevel@tonic-gate * since it is not detached. 
17817c478bd9Sstevel@tonic-gate */ 17827c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_next == NULL); 17837c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_prev == NULL); 17847c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_expire == 0); 17857c478bd9Sstevel@tonic-gate /* 17867c478bd9Sstevel@tonic-gate * If it used to be a listener, check to make sure no one else 17877c478bd9Sstevel@tonic-gate * has taken the port before switching back to LISTEN state. 17887c478bd9Sstevel@tonic-gate */ 1789bd670b35SErik Nordmark if (connp->conn_ipversion == IPV4_VERSION) { 1790bd670b35SErik Nordmark lconnp = ipcl_lookup_listener_v4(connp->conn_lport, 1791bd670b35SErik Nordmark connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst); 17927c478bd9Sstevel@tonic-gate } else { 1793bd670b35SErik Nordmark uint_t ifindex = 0; 1794bd670b35SErik Nordmark 1795bd670b35SErik Nordmark if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) 1796bd670b35SErik Nordmark ifindex = connp->conn_ixa->ixa_scopeid; 1797bd670b35SErik Nordmark 1798bd670b35SErik Nordmark /* Allow conn_bound_if listeners? 
*/ 1799bd670b35SErik Nordmark lconnp = ipcl_lookup_listener_v6(connp->conn_lport, 1800bd670b35SErik Nordmark &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp), 1801bd670b35SErik Nordmark ipst); 18027c478bd9Sstevel@tonic-gate } 18030e0e37a8SErik Nordmark if (tcp->tcp_conn_req_max && lconnp == NULL) { 18047c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_LISTEN; 18059cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 18069cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, void, 18079cd928feSAlan Maguire NULL, int32_t, old_state); 18087c478bd9Sstevel@tonic-gate } else if (old_state > TCPS_BOUND) { 18097c478bd9Sstevel@tonic-gate tcp->tcp_conn_req_max = 0; 18107c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_BOUND; 18119cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 18129cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, void, 18139cd928feSAlan Maguire NULL, int32_t, old_state); 181493fcb0b9SKacheong Poon 181593fcb0b9SKacheong Poon /* 181693fcb0b9SKacheong Poon * If this end point is not going to become a listener, 181793fcb0b9SKacheong Poon * decrement the listener connection count if 181893fcb0b9SKacheong Poon * necessary. Note that we do not do this if it is 181993fcb0b9SKacheong Poon * going to be a listner (the above if case) since 182093fcb0b9SKacheong Poon * then it may remove the counter struct. 
182193fcb0b9SKacheong Poon */ 182293fcb0b9SKacheong Poon if (tcp->tcp_listen_cnt != NULL) 182393fcb0b9SKacheong Poon TCP_DECR_LISTEN_CNT(tcp); 18247c478bd9Sstevel@tonic-gate } 18250e0e37a8SErik Nordmark if (lconnp != NULL) 1826bd670b35SErik Nordmark CONN_DEC_REF(lconnp); 1827721fffe3SKacheong Poon switch (old_state) { 1828721fffe3SKacheong Poon case TCPS_SYN_SENT: 1829721fffe3SKacheong Poon case TCPS_SYN_RCVD: 1830721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpAttemptFails); 1831721fffe3SKacheong Poon break; 1832721fffe3SKacheong Poon case TCPS_ESTABLISHED: 1833721fffe3SKacheong Poon case TCPS_CLOSE_WAIT: 1834721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpEstabResets); 1835721fffe3SKacheong Poon break; 18367c478bd9Sstevel@tonic-gate } 18377c478bd9Sstevel@tonic-gate 18387c478bd9Sstevel@tonic-gate if (tcp->tcp_fused) 18397c478bd9Sstevel@tonic-gate tcp_unfuse(tcp); 18407c478bd9Sstevel@tonic-gate 18417c478bd9Sstevel@tonic-gate mutex_enter(&tcp->tcp_eager_lock); 18427c478bd9Sstevel@tonic-gate if ((tcp->tcp_conn_req_cnt_q0 != 0) || 18437c478bd9Sstevel@tonic-gate (tcp->tcp_conn_req_cnt_q != 0)) { 18447c478bd9Sstevel@tonic-gate tcp_eager_cleanup(tcp, 0); 18457c478bd9Sstevel@tonic-gate } 18467c478bd9Sstevel@tonic-gate mutex_exit(&tcp->tcp_eager_lock); 18477c478bd9Sstevel@tonic-gate 18487c478bd9Sstevel@tonic-gate tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt, 18497c478bd9Sstevel@tonic-gate tcp->tcp_rnxt, TH_RST | TH_ACK); 18507c478bd9Sstevel@tonic-gate 18517c478bd9Sstevel@tonic-gate tcp_reinit(tcp); 18527c478bd9Sstevel@tonic-gate 18530f1702c5SYu Xiangning return (0); 18547c478bd9Sstevel@tonic-gate } else if (!tcp_eager_blowoff(tcp, seqnum)) { 18550f1702c5SYu Xiangning return (TBADSEQ); 18560f1702c5SYu Xiangning } 18570f1702c5SYu Xiangning return (0); 18580f1702c5SYu Xiangning } 18590f1702c5SYu Xiangning 18600f1702c5SYu Xiangning /* 18610f1702c5SYu Xiangning * Our client hereby directs us to reject the connection request 1862bd670b35SErik Nordmark * that 
tcp_input_listener() marked with 'seqnum'. Rejection consists 18630f1702c5SYu Xiangning * of sending the appropriate RST, not an ICMP error. 18640f1702c5SYu Xiangning */ 1865721fffe3SKacheong Poon void 18660f1702c5SYu Xiangning tcp_disconnect(tcp_t *tcp, mblk_t *mp) 18670f1702c5SYu Xiangning { 18680f1702c5SYu Xiangning t_scalar_t seqnum; 18690f1702c5SYu Xiangning int error; 1870bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 18710f1702c5SYu Xiangning 18720f1702c5SYu Xiangning ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 18730f1702c5SYu Xiangning if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { 18740f1702c5SYu Xiangning tcp_err_ack(tcp, mp, TPROTO, 0); 18757c478bd9Sstevel@tonic-gate return; 18767c478bd9Sstevel@tonic-gate } 18770f1702c5SYu Xiangning seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; 18780f1702c5SYu Xiangning error = tcp_disconnect_common(tcp, seqnum); 18790f1702c5SYu Xiangning if (error != 0) 18800f1702c5SYu Xiangning tcp_err_ack(tcp, mp, error, 0); 18810f1702c5SYu Xiangning else { 18827c478bd9Sstevel@tonic-gate if (tcp->tcp_state >= TCPS_ESTABLISHED) { 18837c478bd9Sstevel@tonic-gate /* Send M_FLUSH according to TPI */ 1884bd670b35SErik Nordmark (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 18857c478bd9Sstevel@tonic-gate } 18867c478bd9Sstevel@tonic-gate mp = mi_tpi_ok_ack_alloc(mp); 1887bd670b35SErik Nordmark if (mp != NULL) 1888bd670b35SErik Nordmark putnext(connp->conn_rq, mp); 18897c478bd9Sstevel@tonic-gate } 18900f1702c5SYu Xiangning } 18917c478bd9Sstevel@tonic-gate 18927c478bd9Sstevel@tonic-gate /* 18937c478bd9Sstevel@tonic-gate * Handle reinitialization of a tcp structure. 18947c478bd9Sstevel@tonic-gate * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE. 
18957c478bd9Sstevel@tonic-gate */ 18967c478bd9Sstevel@tonic-gate static void 18977c478bd9Sstevel@tonic-gate tcp_reinit(tcp_t *tcp) 18987c478bd9Sstevel@tonic-gate { 18997c478bd9Sstevel@tonic-gate mblk_t *mp; 1900f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 1901bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 19029cd928feSAlan Maguire int32_t oldstate; 19037c478bd9Sstevel@tonic-gate 19047c478bd9Sstevel@tonic-gate /* tcp_reinit should never be called for detached tcp_t's */ 19057c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_listener == NULL); 1906bd670b35SErik Nordmark ASSERT((connp->conn_family == AF_INET && 1907bd670b35SErik Nordmark connp->conn_ipversion == IPV4_VERSION) || 1908bd670b35SErik Nordmark (connp->conn_family == AF_INET6 && 1909bd670b35SErik Nordmark (connp->conn_ipversion == IPV4_VERSION || 1910bd670b35SErik Nordmark connp->conn_ipversion == IPV6_VERSION))); 19117c478bd9Sstevel@tonic-gate 19127c478bd9Sstevel@tonic-gate /* Cancel outstanding timers */ 19137c478bd9Sstevel@tonic-gate tcp_timers_stop(tcp); 19147c478bd9Sstevel@tonic-gate 19157c478bd9Sstevel@tonic-gate /* 19167c478bd9Sstevel@tonic-gate * Reset everything in the state vector, after updating global 19177c478bd9Sstevel@tonic-gate * MIB data from instance counters. 
19187c478bd9Sstevel@tonic-gate */ 1919721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); 19207c478bd9Sstevel@tonic-gate tcp->tcp_ibsegs = 0; 1921721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); 19227c478bd9Sstevel@tonic-gate tcp->tcp_obsegs = 0; 19237c478bd9Sstevel@tonic-gate 19247c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_xmit_head); 19257c478bd9Sstevel@tonic-gate if (tcp->tcp_snd_zcopy_aware) 19267c478bd9Sstevel@tonic-gate tcp_zcopy_notify(tcp); 19277c478bd9Sstevel@tonic-gate tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; 19287c478bd9Sstevel@tonic-gate tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; 1929e0968231Svi117747 mutex_enter(&tcp->tcp_non_sq_lock); 1930ff550d0eSmasputra if (tcp->tcp_flow_stopped && 1931bd670b35SErik Nordmark TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 1932ff550d0eSmasputra tcp_clrqfull(tcp); 1933ff550d0eSmasputra } 1934e0968231Svi117747 mutex_exit(&tcp->tcp_non_sq_lock); 19357c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_reass_head); 19367c478bd9Sstevel@tonic-gate tcp->tcp_reass_tail = NULL; 19377c478bd9Sstevel@tonic-gate if (tcp->tcp_rcv_list != NULL) { 19387c478bd9Sstevel@tonic-gate /* Free b_next chain */ 19397c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_rcv_list); 19407c478bd9Sstevel@tonic-gate tcp->tcp_rcv_last_head = NULL; 19417c478bd9Sstevel@tonic-gate tcp->tcp_rcv_last_tail = NULL; 19427c478bd9Sstevel@tonic-gate tcp->tcp_rcv_cnt = 0; 19437c478bd9Sstevel@tonic-gate } 19447c478bd9Sstevel@tonic-gate tcp->tcp_rcv_last_tail = NULL; 19457c478bd9Sstevel@tonic-gate 19467c478bd9Sstevel@tonic-gate if ((mp = tcp->tcp_urp_mp) != NULL) { 19477c478bd9Sstevel@tonic-gate freemsg(mp); 19487c478bd9Sstevel@tonic-gate tcp->tcp_urp_mp = NULL; 19497c478bd9Sstevel@tonic-gate } 19507c478bd9Sstevel@tonic-gate if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 19517c478bd9Sstevel@tonic-gate freemsg(mp); 19527c478bd9Sstevel@tonic-gate tcp->tcp_urp_mark_mp = NULL; 
19537c478bd9Sstevel@tonic-gate } 19547c478bd9Sstevel@tonic-gate if (tcp->tcp_fused_sigurg_mp != NULL) { 19550f1702c5SYu Xiangning ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 19567c478bd9Sstevel@tonic-gate freeb(tcp->tcp_fused_sigurg_mp); 19577c478bd9Sstevel@tonic-gate tcp->tcp_fused_sigurg_mp = NULL; 19587c478bd9Sstevel@tonic-gate } 1959f7f8e53dSKacheong Poon if (tcp->tcp_ordrel_mp != NULL) { 19600f1702c5SYu Xiangning ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 1961f7f8e53dSKacheong Poon freeb(tcp->tcp_ordrel_mp); 1962f7f8e53dSKacheong Poon tcp->tcp_ordrel_mp = NULL; 1963f7f8e53dSKacheong Poon } 19647c478bd9Sstevel@tonic-gate 19657c478bd9Sstevel@tonic-gate /* 19667c478bd9Sstevel@tonic-gate * Following is a union with two members which are 19677c478bd9Sstevel@tonic-gate * identical types and size so the following cleanup 19687c478bd9Sstevel@tonic-gate * is enough. 19697c478bd9Sstevel@tonic-gate */ 19707c478bd9Sstevel@tonic-gate tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 19717c478bd9Sstevel@tonic-gate 1972bd670b35SErik Nordmark CL_INET_DISCONNECT(connp); 19737c478bd9Sstevel@tonic-gate 19747c478bd9Sstevel@tonic-gate /* 19757c478bd9Sstevel@tonic-gate * The connection can't be on the tcp_time_wait_head list 19767c478bd9Sstevel@tonic-gate * since it is not detached. 
19777c478bd9Sstevel@tonic-gate */ 19787c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_next == NULL); 19797c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_prev == NULL); 19807c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_expire == 0); 19817c478bd9Sstevel@tonic-gate 19827c478bd9Sstevel@tonic-gate /* 19837c478bd9Sstevel@tonic-gate * Reset/preserve other values 19847c478bd9Sstevel@tonic-gate */ 19857c478bd9Sstevel@tonic-gate tcp_reinit_values(tcp); 1986bd670b35SErik Nordmark ipcl_hash_remove(connp); 198776a1033eSErik Nordmark /* Note that ixa_cred gets cleared in ixa_cleanup */ 1988bd670b35SErik Nordmark ixa_cleanup(connp->conn_ixa); 1989f4b3ec61Sdh155122 tcp_ipsec_cleanup(tcp); 19907c478bd9Sstevel@tonic-gate 1991bd670b35SErik Nordmark connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1992bd670b35SErik Nordmark connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 19939cd928feSAlan Maguire oldstate = tcp->tcp_state; 19945f9878b0Sken Powell - Sun Microsystem 19957c478bd9Sstevel@tonic-gate if (tcp->tcp_conn_req_max != 0) { 19967c478bd9Sstevel@tonic-gate /* 19977c478bd9Sstevel@tonic-gate * This is the case when a TLI program uses the same 19987c478bd9Sstevel@tonic-gate * transport end point to accept a connection. This 19997c478bd9Sstevel@tonic-gate * makes the TCP both a listener and acceptor. When 20007c478bd9Sstevel@tonic-gate * this connection is closed, we need to set the state 20017c478bd9Sstevel@tonic-gate * back to TCPS_LISTEN. Make sure that the eager list 20027c478bd9Sstevel@tonic-gate * is reinitialized. 20037c478bd9Sstevel@tonic-gate * 20047c478bd9Sstevel@tonic-gate * Note that this stream is still bound to the four 20057c478bd9Sstevel@tonic-gate * tuples of the previous connection in IP. If a new 20067c478bd9Sstevel@tonic-gate * SYN with different foreign address comes in, IP will 20077c478bd9Sstevel@tonic-gate * not find it and will send it to the global queue. 
In 20087c478bd9Sstevel@tonic-gate * the global queue, TCP will do a tcp_lookup_listener() 20097c478bd9Sstevel@tonic-gate * to find this stream. This works because this stream 20107c478bd9Sstevel@tonic-gate * is only removed from connected hash. 20117c478bd9Sstevel@tonic-gate * 20127c478bd9Sstevel@tonic-gate */ 20137c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_LISTEN; 20147c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 2015866ba9ddSjprakash tcp->tcp_eager_next_drop_q0 = tcp; 2016866ba9ddSjprakash tcp->tcp_eager_prev_drop_q0 = tcp; 2017bd670b35SErik Nordmark /* 2018bd670b35SErik Nordmark * Initially set conn_recv to tcp_input_listener_unbound to try 2019bd670b35SErik Nordmark * to pick a good squeue for the listener when the first SYN 2020bd670b35SErik Nordmark * arrives. tcp_input_listener_unbound sets it to 2021bd670b35SErik Nordmark * tcp_input_listener on that first SYN. 2022bd670b35SErik Nordmark */ 2023bd670b35SErik Nordmark connp->conn_recv = tcp_input_listener_unbound; 2024bd670b35SErik Nordmark 2025bd670b35SErik Nordmark connp->conn_proto = IPPROTO_TCP; 2026bd670b35SErik Nordmark connp->conn_faddr_v6 = ipv6_all_zeros; 2027bd670b35SErik Nordmark connp->conn_fport = 0; 2028bd670b35SErik Nordmark 2029bd670b35SErik Nordmark (void) ipcl_bind_insert(connp); 20307c478bd9Sstevel@tonic-gate } else { 20317c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_BOUND; 20327c478bd9Sstevel@tonic-gate } 20337c478bd9Sstevel@tonic-gate 20347c478bd9Sstevel@tonic-gate /* 20357c478bd9Sstevel@tonic-gate * Initialize to default values 20367c478bd9Sstevel@tonic-gate */ 2037707e74bcSKacheong Poon tcp_init_values(tcp, NULL); 20387c478bd9Sstevel@tonic-gate 20399cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 20409cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 20419cd928feSAlan Maguire int32_t, oldstate); 20429cd928feSAlan Maguire 20437c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_ptpbhn != NULL); 
2044bd670b35SErik Nordmark tcp->tcp_rwnd = connp->conn_rcvbuf; 2045bd670b35SErik Nordmark tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ? 2046f4b3ec61Sdh155122 tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; 20477c478bd9Sstevel@tonic-gate } 20487c478bd9Sstevel@tonic-gate 20497c478bd9Sstevel@tonic-gate /* 20507c478bd9Sstevel@tonic-gate * Force values to zero that need be zero. 20517c478bd9Sstevel@tonic-gate * Do not touch values asociated with the BOUND or LISTEN state 20527c478bd9Sstevel@tonic-gate * since the connection will end up in that state after the reinit. 20537c478bd9Sstevel@tonic-gate * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t 20547c478bd9Sstevel@tonic-gate * structure! 20557c478bd9Sstevel@tonic-gate */ 20567c478bd9Sstevel@tonic-gate static void 20577c478bd9Sstevel@tonic-gate tcp_reinit_values(tcp) 20587c478bd9Sstevel@tonic-gate tcp_t *tcp; 20597c478bd9Sstevel@tonic-gate { 2060f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 2061bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 2062f4b3ec61Sdh155122 20637c478bd9Sstevel@tonic-gate #ifndef lint 20647c478bd9Sstevel@tonic-gate #define DONTCARE(x) 20657c478bd9Sstevel@tonic-gate #define PRESERVE(x) 20667c478bd9Sstevel@tonic-gate #else 20677c478bd9Sstevel@tonic-gate #define DONTCARE(x) ((x) = (x)) 20687c478bd9Sstevel@tonic-gate #define PRESERVE(x) ((x) = (x)) 20697c478bd9Sstevel@tonic-gate #endif /* lint */ 20707c478bd9Sstevel@tonic-gate 20710f1702c5SYu Xiangning PRESERVE(tcp->tcp_bind_hash_port); 20727c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_bind_hash); 20737c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_ptpbhn); 20747c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_acceptor_hash); 20757c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_ptpahn); 20767c478bd9Sstevel@tonic-gate 20777c478bd9Sstevel@tonic-gate /* Should be ASSERT NULL on these with new code! 
*/ 20787c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_next == NULL); 20797c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_prev == NULL); 20807c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_time_wait_expire == 0); 20817c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_state); 2082bd670b35SErik Nordmark PRESERVE(connp->conn_rq); 2083bd670b35SErik Nordmark PRESERVE(connp->conn_wq); 20847c478bd9Sstevel@tonic-gate 20857c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_xmit_head == NULL); 20867c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_xmit_last == NULL); 20877c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_unsent == 0); 20887c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_xmit_tail == NULL); 20897c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_xmit_tail_unsent == 0); 20907c478bd9Sstevel@tonic-gate 20917c478bd9Sstevel@tonic-gate tcp->tcp_snxt = 0; /* Displayed in mib */ 20927c478bd9Sstevel@tonic-gate tcp->tcp_suna = 0; /* Displayed in mib */ 20937c478bd9Sstevel@tonic-gate tcp->tcp_swnd = 0; 2094bd670b35SErik Nordmark DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ 20957c478bd9Sstevel@tonic-gate 20967c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_ibsegs == 0); 20977c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_obsegs == 0); 20987c478bd9Sstevel@tonic-gate 2099bd670b35SErik Nordmark if (connp->conn_ht_iphc != NULL) { 2100bd670b35SErik Nordmark kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 2101bd670b35SErik Nordmark connp->conn_ht_iphc = NULL; 2102bd670b35SErik Nordmark connp->conn_ht_iphc_allocated = 0; 2103bd670b35SErik Nordmark connp->conn_ht_iphc_len = 0; 2104bd670b35SErik Nordmark connp->conn_ht_ulp = NULL; 2105bd670b35SErik Nordmark connp->conn_ht_ulp_len = 0; 2106bd670b35SErik Nordmark tcp->tcp_ipha = NULL; 2107bd670b35SErik Nordmark tcp->tcp_ip6h = NULL; 2108bd670b35SErik Nordmark tcp->tcp_tcpha = NULL; 21097c478bd9Sstevel@tonic-gate } 21107c478bd9Sstevel@tonic-gate 2111bd670b35SErik Nordmark /* We clear any IP_OPTIONS and extension headers */ 2112bd670b35SErik Nordmark 
ip_pkt_free(&connp->conn_xmit_ipp); 2113bd670b35SErik Nordmark 21147c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ 21157c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_ipha); 21167c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_ip6h); 2117bd670b35SErik Nordmark DONTCARE(tcp->tcp_tcpha); 21187c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits = 0; 21197c478bd9Sstevel@tonic-gate 21207c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ 21217c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ 21227c478bd9Sstevel@tonic-gate tcp->tcp_last_rcv_lbolt = 0; 21237c478bd9Sstevel@tonic-gate 21247c478bd9Sstevel@tonic-gate tcp->tcp_init_cwnd = 0; 21257c478bd9Sstevel@tonic-gate 21267c478bd9Sstevel@tonic-gate tcp->tcp_urp_last_valid = 0; 21277c478bd9Sstevel@tonic-gate tcp->tcp_hard_binding = 0; 21287c478bd9Sstevel@tonic-gate 21297c478bd9Sstevel@tonic-gate tcp->tcp_fin_acked = 0; 21307c478bd9Sstevel@tonic-gate tcp->tcp_fin_rcvd = 0; 21317c478bd9Sstevel@tonic-gate tcp->tcp_fin_sent = 0; 21327c478bd9Sstevel@tonic-gate tcp->tcp_ordrel_done = 0; 21337c478bd9Sstevel@tonic-gate 21347c478bd9Sstevel@tonic-gate tcp->tcp_detached = 0; 21357c478bd9Sstevel@tonic-gate 21367c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws_ok = B_FALSE; 21377c478bd9Sstevel@tonic-gate tcp->tcp_snd_ts_ok = B_FALSE; 21387c478bd9Sstevel@tonic-gate tcp->tcp_zero_win_probe = 0; 21397c478bd9Sstevel@tonic-gate 21407c478bd9Sstevel@tonic-gate tcp->tcp_loopback = 0; 21417c478bd9Sstevel@tonic-gate tcp->tcp_localnet = 0; 21427c478bd9Sstevel@tonic-gate tcp->tcp_syn_defense = 0; 21437c478bd9Sstevel@tonic-gate tcp->tcp_set_timer = 0; 21447c478bd9Sstevel@tonic-gate 21457c478bd9Sstevel@tonic-gate tcp->tcp_active_open = 0; 21467c478bd9Sstevel@tonic-gate tcp->tcp_rexmit = B_FALSE; 21477c478bd9Sstevel@tonic-gate tcp->tcp_xmit_zc_clean = B_FALSE; 21487c478bd9Sstevel@tonic-gate 21497c478bd9Sstevel@tonic-gate tcp->tcp_snd_sack_ok = B_FALSE; 
21507c478bd9Sstevel@tonic-gate tcp->tcp_hwcksum = B_FALSE; 21517c478bd9Sstevel@tonic-gate 2152bd670b35SErik Nordmark DONTCARE(tcp->tcp_maxpsz_multiplier); /* Init in tcp_init_values */ 21537c478bd9Sstevel@tonic-gate 21547c478bd9Sstevel@tonic-gate tcp->tcp_conn_def_q0 = 0; 21557c478bd9Sstevel@tonic-gate tcp->tcp_ip_forward_progress = B_FALSE; 21567c478bd9Sstevel@tonic-gate tcp->tcp_ecn_ok = B_FALSE; 21577c478bd9Sstevel@tonic-gate 21587c478bd9Sstevel@tonic-gate tcp->tcp_cwr = B_FALSE; 21597c478bd9Sstevel@tonic-gate tcp->tcp_ecn_echo_on = B_FALSE; 2160410734d4SPhilip Kirk tcp->tcp_is_wnd_shrnk = B_FALSE; 21617c478bd9Sstevel@tonic-gate 216266cd0f60SKacheong Poon TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 216366cd0f60SKacheong Poon bzero(&tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 21647c478bd9Sstevel@tonic-gate 21657c478bd9Sstevel@tonic-gate tcp->tcp_rcv_ws = 0; 21667c478bd9Sstevel@tonic-gate tcp->tcp_snd_ws = 0; 21677c478bd9Sstevel@tonic-gate tcp->tcp_ts_recent = 0; 21687c478bd9Sstevel@tonic-gate tcp->tcp_rnxt = 0; /* Displayed in mib */ 21697c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ 2170bd670b35SErik Nordmark tcp->tcp_initial_pmtu = 0; 21717c478bd9Sstevel@tonic-gate 21727c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_reass_head == NULL); 21737c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_reass_tail == NULL); 21747c478bd9Sstevel@tonic-gate 21757c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_cnt = 0; 21767c478bd9Sstevel@tonic-gate 21777c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_rcv_list == NULL); 21787c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_rcv_last_head == NULL); 21797c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_rcv_last_tail == NULL); 21807c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_rcv_cnt == 0); 21817c478bd9Sstevel@tonic-gate 2182bd670b35SErik Nordmark DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */ 21837c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ 21847c478bd9Sstevel@tonic-gate 
tcp->tcp_csuna = 0; 21857c478bd9Sstevel@tonic-gate 21867c478bd9Sstevel@tonic-gate tcp->tcp_rto = 0; /* Displayed in MIB */ 21877c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ 21887c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ 21897c478bd9Sstevel@tonic-gate tcp->tcp_rtt_update = 0; 21907c478bd9Sstevel@tonic-gate 21917c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 21927c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 21937c478bd9Sstevel@tonic-gate 21947c478bd9Sstevel@tonic-gate tcp->tcp_rack = 0; /* Displayed in mib */ 21957c478bd9Sstevel@tonic-gate tcp->tcp_rack_cnt = 0; 21967c478bd9Sstevel@tonic-gate tcp->tcp_rack_cur_max = 0; 21977c478bd9Sstevel@tonic-gate tcp->tcp_rack_abs_max = 0; 21987c478bd9Sstevel@tonic-gate 21997c478bd9Sstevel@tonic-gate tcp->tcp_max_swnd = 0; 22007c478bd9Sstevel@tonic-gate 22017c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_listener == NULL); 22027c478bd9Sstevel@tonic-gate 22037c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ 22047c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ 22057c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ 22067c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */ 22077c478bd9Sstevel@tonic-gate 22087c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_conn_req_cnt_q == 0); 22097c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_conn_req_cnt_q0 == 0); 22107c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_conn_req_max); 22117c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_conn_req_seqnum); 22127c478bd9Sstevel@tonic-gate 22137c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ 22147c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ 22157c478bd9Sstevel@tonic-gate 
DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ 22167c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ 22177c478bd9Sstevel@tonic-gate 22187c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ 22197c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_urp_mp == NULL); 22207c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_urp_mark_mp == NULL); 22217c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 22227c478bd9Sstevel@tonic-gate 22237c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_eager_next_q == NULL); 22247c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_eager_last_q == NULL); 22257c478bd9Sstevel@tonic-gate ASSERT((tcp->tcp_eager_next_q0 == NULL && 22267c478bd9Sstevel@tonic-gate tcp->tcp_eager_prev_q0 == NULL) || 22277c478bd9Sstevel@tonic-gate tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); 22287c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 22297c478bd9Sstevel@tonic-gate 2230866ba9ddSjprakash ASSERT((tcp->tcp_eager_next_drop_q0 == NULL && 2231866ba9ddSjprakash tcp->tcp_eager_prev_drop_q0 == NULL) || 2232866ba9ddSjprakash tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0); 2233866ba9ddSjprakash 223418576aacSDan McDonald DONTCARE(tcp->tcp_ka_rinterval); /* Init in tcp_init_values */ 223518576aacSDan McDonald DONTCARE(tcp->tcp_ka_abort_thres); /* Init in tcp_init_values */ 223618576aacSDan McDonald DONTCARE(tcp->tcp_ka_cnt); /* Init in tcp_init_values */ 223718576aacSDan McDonald 22387c478bd9Sstevel@tonic-gate tcp->tcp_client_errno = 0; 22397c478bd9Sstevel@tonic-gate 2240bd670b35SErik Nordmark DONTCARE(connp->conn_sum); /* Init in tcp_init_values */ 22417c478bd9Sstevel@tonic-gate 2242bd670b35SErik Nordmark connp->conn_faddr_v6 = ipv6_all_zeros; /* Displayed in MIB */ 22437c478bd9Sstevel@tonic-gate 2244bd670b35SErik Nordmark PRESERVE(connp->conn_bound_addr_v6); 22457c478bd9Sstevel@tonic-gate tcp->tcp_last_sent_len = 0; 
22467c478bd9Sstevel@tonic-gate tcp->tcp_dupack_cnt = 0; 22477c478bd9Sstevel@tonic-gate 2248bd670b35SErik Nordmark connp->conn_fport = 0; /* Displayed in MIB */ 2249bd670b35SErik Nordmark PRESERVE(connp->conn_lport); 22507c478bd9Sstevel@tonic-gate 22517c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_acceptor_lockp); 22527c478bd9Sstevel@tonic-gate 2253f7f8e53dSKacheong Poon ASSERT(tcp->tcp_ordrel_mp == NULL); 22547c478bd9Sstevel@tonic-gate PRESERVE(tcp->tcp_acceptor_id); 22557c478bd9Sstevel@tonic-gate DONTCARE(tcp->tcp_ipsec_overhead); 22567c478bd9Sstevel@tonic-gate 2257bd670b35SErik Nordmark PRESERVE(connp->conn_family); 2258bd670b35SErik Nordmark /* Remove any remnants of mapped address binding */ 2259bd670b35SErik Nordmark if (connp->conn_family == AF_INET6) { 2260bd670b35SErik Nordmark connp->conn_ipversion = IPV6_VERSION; 2261f4b3ec61Sdh155122 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 22627c478bd9Sstevel@tonic-gate } else { 2263bd670b35SErik Nordmark connp->conn_ipversion = IPV4_VERSION; 2264f4b3ec61Sdh155122 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 22657c478bd9Sstevel@tonic-gate } 22667c478bd9Sstevel@tonic-gate 2267bd670b35SErik Nordmark connp->conn_bound_if = 0; 2268bd670b35SErik Nordmark connp->conn_recv_ancillary.crb_all = 0; 22697c478bd9Sstevel@tonic-gate tcp->tcp_recvifindex = 0; 22707c478bd9Sstevel@tonic-gate tcp->tcp_recvhops = 0; 22717c478bd9Sstevel@tonic-gate tcp->tcp_closed = 0; 22727c478bd9Sstevel@tonic-gate if (tcp->tcp_hopopts != NULL) { 22737c478bd9Sstevel@tonic-gate mi_free(tcp->tcp_hopopts); 22747c478bd9Sstevel@tonic-gate tcp->tcp_hopopts = NULL; 22757c478bd9Sstevel@tonic-gate tcp->tcp_hopoptslen = 0; 22767c478bd9Sstevel@tonic-gate } 22777c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_hopoptslen == 0); 22787c478bd9Sstevel@tonic-gate if (tcp->tcp_dstopts != NULL) { 22797c478bd9Sstevel@tonic-gate mi_free(tcp->tcp_dstopts); 22807c478bd9Sstevel@tonic-gate tcp->tcp_dstopts = NULL; 22817c478bd9Sstevel@tonic-gate tcp->tcp_dstoptslen = 0; 
22827c478bd9Sstevel@tonic-gate } 22837c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_dstoptslen == 0); 2284bd670b35SErik Nordmark if (tcp->tcp_rthdrdstopts != NULL) { 2285bd670b35SErik Nordmark mi_free(tcp->tcp_rthdrdstopts); 2286bd670b35SErik Nordmark tcp->tcp_rthdrdstopts = NULL; 2287bd670b35SErik Nordmark tcp->tcp_rthdrdstoptslen = 0; 22887c478bd9Sstevel@tonic-gate } 2289bd670b35SErik Nordmark ASSERT(tcp->tcp_rthdrdstoptslen == 0); 22907c478bd9Sstevel@tonic-gate if (tcp->tcp_rthdr != NULL) { 22917c478bd9Sstevel@tonic-gate mi_free(tcp->tcp_rthdr); 22927c478bd9Sstevel@tonic-gate tcp->tcp_rthdr = NULL; 22937c478bd9Sstevel@tonic-gate tcp->tcp_rthdrlen = 0; 22947c478bd9Sstevel@tonic-gate } 22957c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_rthdrlen == 0); 22967c478bd9Sstevel@tonic-gate 2297ff550d0eSmasputra /* Reset fusion-related fields */ 22987c478bd9Sstevel@tonic-gate tcp->tcp_fused = B_FALSE; 22997c478bd9Sstevel@tonic-gate tcp->tcp_unfusable = B_FALSE; 23007c478bd9Sstevel@tonic-gate tcp->tcp_fused_sigurg = B_FALSE; 23017c478bd9Sstevel@tonic-gate tcp->tcp_loopback_peer = NULL; 23027c478bd9Sstevel@tonic-gate 23038347601bSyl150051 tcp->tcp_lso = B_FALSE; 23048347601bSyl150051 23057c478bd9Sstevel@tonic-gate tcp->tcp_in_ack_unsent = 0; 23067c478bd9Sstevel@tonic-gate tcp->tcp_cork = B_FALSE; 2307866ba9ddSjprakash tcp->tcp_tconnind_started = B_FALSE; 23087c478bd9Sstevel@tonic-gate 2309ff550d0eSmasputra PRESERVE(tcp->tcp_squeue_bytes); 2310ff550d0eSmasputra 23110163a147Sjprakash tcp->tcp_closemp_used = B_FALSE; 2312866ba9ddSjprakash 2313f7f8e53dSKacheong Poon PRESERVE(tcp->tcp_rsrv_mp); 2314f7f8e53dSKacheong Poon PRESERVE(tcp->tcp_rsrv_mp_lock); 2315f7f8e53dSKacheong Poon 2316866ba9ddSjprakash #ifdef DEBUG 2317866ba9ddSjprakash DONTCARE(tcp->tcmp_stk[0]); 2318866ba9ddSjprakash #endif 2319866ba9ddSjprakash 23200f1702c5SYu Xiangning PRESERVE(tcp->tcp_connid); 23210f1702c5SYu Xiangning 232293fcb0b9SKacheong Poon ASSERT(tcp->tcp_listen_cnt == NULL); 232393fcb0b9SKacheong Poon 
ASSERT(tcp->tcp_reass_tid == 0); 2324866ba9ddSjprakash 23257c478bd9Sstevel@tonic-gate #undef DONTCARE 23267c478bd9Sstevel@tonic-gate #undef PRESERVE 23277c478bd9Sstevel@tonic-gate } 23287c478bd9Sstevel@tonic-gate 2329707e74bcSKacheong Poon /* 2330707e74bcSKacheong Poon * Initialize the various fields in tcp_t. If parent (the listener) is non 2331707e74bcSKacheong Poon * NULL, certain values will be inheritted from it. 2332707e74bcSKacheong Poon */ 2333721fffe3SKacheong Poon void 2334707e74bcSKacheong Poon tcp_init_values(tcp_t *tcp, tcp_t *parent) 23357c478bd9Sstevel@tonic-gate { 2336f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 2337bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 2338707e74bcSKacheong Poon clock_t rto; 23397c478bd9Sstevel@tonic-gate 2340bd670b35SErik Nordmark ASSERT((connp->conn_family == AF_INET && 2341bd670b35SErik Nordmark connp->conn_ipversion == IPV4_VERSION) || 2342bd670b35SErik Nordmark (connp->conn_family == AF_INET6 && 2343bd670b35SErik Nordmark (connp->conn_ipversion == IPV4_VERSION || 2344bd670b35SErik Nordmark connp->conn_ipversion == IPV6_VERSION))); 23457c478bd9Sstevel@tonic-gate 2346707e74bcSKacheong Poon if (parent == NULL) { 2347707e74bcSKacheong Poon tcp->tcp_naglim = tcps->tcps_naglim_def; 2348707e74bcSKacheong Poon 2349707e74bcSKacheong Poon tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial; 2350707e74bcSKacheong Poon tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min; 2351707e74bcSKacheong Poon tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max; 2352707e74bcSKacheong Poon 2353707e74bcSKacheong Poon tcp->tcp_first_ctimer_threshold = 2354707e74bcSKacheong Poon tcps->tcps_ip_notify_cinterval; 2355707e74bcSKacheong Poon tcp->tcp_second_ctimer_threshold = 2356707e74bcSKacheong Poon tcps->tcps_ip_abort_cinterval; 2357707e74bcSKacheong Poon tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; 2358707e74bcSKacheong Poon tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval; 2359707e74bcSKacheong 
Poon 2360707e74bcSKacheong Poon tcp->tcp_fin_wait_2_flush_interval = 2361707e74bcSKacheong Poon tcps->tcps_fin_wait_2_flush_interval; 2362707e74bcSKacheong Poon 2363707e74bcSKacheong Poon tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; 2364707e74bcSKacheong Poon tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; 23653d0a255cSGarrett D'Amore tcp->tcp_ka_cnt = 0; 23663d0a255cSGarrett D'Amore tcp->tcp_ka_rinterval = 0; 2367707e74bcSKacheong Poon 2368707e74bcSKacheong Poon /* 2369707e74bcSKacheong Poon * Default value of tcp_init_cwnd is 0, so no need to set here 2370707e74bcSKacheong Poon * if parent is NULL. But we need to inherit it from parent. 2371707e74bcSKacheong Poon */ 2372707e74bcSKacheong Poon } else { 2373707e74bcSKacheong Poon /* Inherit various TCP parameters from the parent. */ 2374707e74bcSKacheong Poon tcp->tcp_naglim = parent->tcp_naglim; 2375707e74bcSKacheong Poon 2376707e74bcSKacheong Poon tcp->tcp_rto_initial = parent->tcp_rto_initial; 2377707e74bcSKacheong Poon tcp->tcp_rto_min = parent->tcp_rto_min; 2378707e74bcSKacheong Poon tcp->tcp_rto_max = parent->tcp_rto_max; 2379707e74bcSKacheong Poon 2380707e74bcSKacheong Poon tcp->tcp_first_ctimer_threshold = 2381707e74bcSKacheong Poon parent->tcp_first_ctimer_threshold; 2382707e74bcSKacheong Poon tcp->tcp_second_ctimer_threshold = 2383707e74bcSKacheong Poon parent->tcp_second_ctimer_threshold; 2384707e74bcSKacheong Poon tcp->tcp_first_timer_threshold = 2385707e74bcSKacheong Poon parent->tcp_first_timer_threshold; 2386707e74bcSKacheong Poon tcp->tcp_second_timer_threshold = 2387707e74bcSKacheong Poon parent->tcp_second_timer_threshold; 2388707e74bcSKacheong Poon 2389707e74bcSKacheong Poon tcp->tcp_fin_wait_2_flush_interval = 2390707e74bcSKacheong Poon parent->tcp_fin_wait_2_flush_interval; 2391707e74bcSKacheong Poon 2392707e74bcSKacheong Poon tcp->tcp_ka_interval = parent->tcp_ka_interval; 2393707e74bcSKacheong Poon tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres; 239418576aacSDan 
McDonald tcp->tcp_ka_cnt = parent->tcp_ka_cnt; 239518576aacSDan McDonald tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval; 2396707e74bcSKacheong Poon 2397707e74bcSKacheong Poon tcp->tcp_init_cwnd = parent->tcp_init_cwnd; 2398707e74bcSKacheong Poon } 2399707e74bcSKacheong Poon 24007c478bd9Sstevel@tonic-gate /* 24017c478bd9Sstevel@tonic-gate * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 24027c478bd9Sstevel@tonic-gate * will be close to tcp_rexmit_interval_initial. By doing this, we 24037c478bd9Sstevel@tonic-gate * allow the algorithm to adjust slowly to large fluctuations of RTT 24047c478bd9Sstevel@tonic-gate * during first few transmissions of a connection as seen in slow 24057c478bd9Sstevel@tonic-gate * links. 24067c478bd9Sstevel@tonic-gate */ 2407707e74bcSKacheong Poon tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; 2408707e74bcSKacheong Poon tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; 2409707e74bcSKacheong Poon rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2410f4b3ec61Sdh155122 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 2411f4b3ec61Sdh155122 tcps->tcps_conn_grace_period; 2412707e74bcSKacheong Poon TCP_SET_RTO(tcp, rto); 2413707e74bcSKacheong Poon 24147c478bd9Sstevel@tonic-gate tcp->tcp_timer_backoff = 0; 24157c478bd9Sstevel@tonic-gate tcp->tcp_ms_we_have_waited = 0; 2416d3d50737SRafael Vanoni tcp->tcp_last_recv_time = ddi_get_lbolt(); 2417f4b3ec61Sdh155122 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_; 241843d18f1cSpriyanka tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 24197c478bd9Sstevel@tonic-gate 2420bd670b35SErik Nordmark tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier; 24217c478bd9Sstevel@tonic-gate 2422bd670b35SErik Nordmark /* NOTE: ISS is now set in tcp_set_destination(). 
*/ 24237c478bd9Sstevel@tonic-gate 2424ff550d0eSmasputra /* Reset fusion-related fields */ 24257c478bd9Sstevel@tonic-gate tcp->tcp_fused = B_FALSE; 24267c478bd9Sstevel@tonic-gate tcp->tcp_unfusable = B_FALSE; 24277c478bd9Sstevel@tonic-gate tcp->tcp_fused_sigurg = B_FALSE; 24287c478bd9Sstevel@tonic-gate tcp->tcp_loopback_peer = NULL; 24297c478bd9Sstevel@tonic-gate 2430bd670b35SErik Nordmark /* We rebuild the header template on the next connect/conn_request */ 2431bd670b35SErik Nordmark 2432bd670b35SErik Nordmark connp->conn_mlp_type = mlptSingle; 24337c478bd9Sstevel@tonic-gate 24347c478bd9Sstevel@tonic-gate /* 24357c478bd9Sstevel@tonic-gate * Init the window scale to the max so tcp_rwnd_set() won't pare 2436bd670b35SErik Nordmark * down tcp_rwnd. tcp_set_destination() will set the right value later. 24377c478bd9Sstevel@tonic-gate */ 24387c478bd9Sstevel@tonic-gate tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 2439bd670b35SErik Nordmark tcp->tcp_rwnd = connp->conn_rcvbuf; 24407c478bd9Sstevel@tonic-gate 24417c478bd9Sstevel@tonic-gate tcp->tcp_cork = B_FALSE; 24427c478bd9Sstevel@tonic-gate /* 2443bd670b35SErik Nordmark * Init the tcp_debug option if it wasn't already set. This value 2444bd670b35SErik Nordmark * determines whether TCP 24457c478bd9Sstevel@tonic-gate * calls strlog() to print out debug messages. Doing this 24467c478bd9Sstevel@tonic-gate * initialization here means that this value is not inherited thru 24477c478bd9Sstevel@tonic-gate * tcp_reinit(). 24487c478bd9Sstevel@tonic-gate */ 2449bd670b35SErik Nordmark if (!connp->conn_debug) 2450bd670b35SErik Nordmark connp->conn_debug = tcps->tcps_dbg; 24517c478bd9Sstevel@tonic-gate } 24527c478bd9Sstevel@tonic-gate 2453bd670b35SErik Nordmark /* 2454bd670b35SErik Nordmark * Update the TCP connection according to change of PMTU. 
2455bd670b35SErik Nordmark * 2456bd670b35SErik Nordmark * Path MTU might have changed by either increase or decrease, so need to 2457bd670b35SErik Nordmark * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny 2458bd670b35SErik Nordmark * or negative MSS, since tcp_mss_set() will do it. 2459bd670b35SErik Nordmark */ 2460721fffe3SKacheong Poon void 2461bd670b35SErik Nordmark tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) 2462bd670b35SErik Nordmark { 2463bd670b35SErik Nordmark uint32_t pmtu; 2464bd670b35SErik Nordmark int32_t mss; 2465bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 2466bd670b35SErik Nordmark ip_xmit_attr_t *ixa = connp->conn_ixa; 2467bd670b35SErik Nordmark iaflags_t ixaflags; 2468bd670b35SErik Nordmark 2469bd670b35SErik Nordmark if (tcp->tcp_tcps->tcps_ignore_path_mtu) 2470bd670b35SErik Nordmark return; 2471bd670b35SErik Nordmark 2472bd670b35SErik Nordmark if (tcp->tcp_state < TCPS_ESTABLISHED) 2473bd670b35SErik Nordmark return; 2474bd670b35SErik Nordmark 2475bd670b35SErik Nordmark /* 2476bd670b35SErik Nordmark * Always call ip_get_pmtu() to make sure that IP has updated 2477bd670b35SErik Nordmark * ixa_flags properly. 2478bd670b35SErik Nordmark */ 2479bd670b35SErik Nordmark pmtu = ip_get_pmtu(ixa); 2480bd670b35SErik Nordmark ixaflags = ixa->ixa_flags; 2481bd670b35SErik Nordmark 2482bd670b35SErik Nordmark /* 2483bd670b35SErik Nordmark * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and 2484bd670b35SErik Nordmark * IPsec overhead if applied. Make sure to use the most recent 2485bd670b35SErik Nordmark * IPsec information. 2486bd670b35SErik Nordmark */ 2487bd670b35SErik Nordmark mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp); 2488bd670b35SErik Nordmark 2489bd670b35SErik Nordmark /* 2490bd670b35SErik Nordmark * Nothing to change, so just return. 
2491bd670b35SErik Nordmark */ 2492bd670b35SErik Nordmark if (mss == tcp->tcp_mss) 2493bd670b35SErik Nordmark return; 2494bd670b35SErik Nordmark 2495bd670b35SErik Nordmark /* 2496bd670b35SErik Nordmark * Currently, for ICMP errors, only PMTU decrease is handled. 2497bd670b35SErik Nordmark */ 2498bd670b35SErik Nordmark if (mss > tcp->tcp_mss && decrease_only) 2499bd670b35SErik Nordmark return; 2500bd670b35SErik Nordmark 2501bd670b35SErik Nordmark DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); 2502bd670b35SErik Nordmark 2503bd670b35SErik Nordmark /* 2504bd670b35SErik Nordmark * Update ixa_fragsize and ixa_pmtu. 2505bd670b35SErik Nordmark */ 2506bd670b35SErik Nordmark ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; 2507bd670b35SErik Nordmark 2508bd670b35SErik Nordmark /* 2509bd670b35SErik Nordmark * Adjust MSS and all relevant variables. 2510bd670b35SErik Nordmark */ 2511bd670b35SErik Nordmark tcp_mss_set(tcp, mss); 2512bd670b35SErik Nordmark 2513bd670b35SErik Nordmark /* 2514bd670b35SErik Nordmark * If the PMTU is below the min size maintained by IP, then ip_get_pmtu 2515bd670b35SErik Nordmark * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP 2516bd670b35SErik Nordmark * has a (potentially different) min size we do the same. Make sure to 2517bd670b35SErik Nordmark * clear IXAF_DONTFRAG, which is used by IP to decide whether to 2518bd670b35SErik Nordmark * fragment the packet. 2519bd670b35SErik Nordmark * 2520bd670b35SErik Nordmark * LSO over IPv6 can not be fragmented. So need to disable LSO 2521bd670b35SErik Nordmark * when IPv6 fragmentation is needed. 
2522bd670b35SErik Nordmark */ 2523bd670b35SErik Nordmark if (mss < tcp->tcp_tcps->tcps_mss_min) 2524bd670b35SErik Nordmark ixaflags |= IXAF_PMTU_TOO_SMALL; 2525bd670b35SErik Nordmark 2526bd670b35SErik Nordmark if (ixaflags & IXAF_PMTU_TOO_SMALL) 2527bd670b35SErik Nordmark ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 2528bd670b35SErik Nordmark 2529bd670b35SErik Nordmark if ((connp->conn_ipversion == IPV4_VERSION) && 2530bd670b35SErik Nordmark !(ixaflags & IXAF_PMTU_IPV4_DF)) { 2531bd670b35SErik Nordmark tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; 2532bd670b35SErik Nordmark } 2533bd670b35SErik Nordmark ixa->ixa_flags = ixaflags; 2534bd670b35SErik Nordmark } 2535bd670b35SErik Nordmark 2536ff550d0eSmasputra int 25377c478bd9Sstevel@tonic-gate tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) 25387c478bd9Sstevel@tonic-gate { 2539bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 2540bd670b35SErik Nordmark queue_t *q = connp->conn_rq; 25417c478bd9Sstevel@tonic-gate int32_t mss = tcp->tcp_mss; 25427c478bd9Sstevel@tonic-gate int maxpsz; 25437c478bd9Sstevel@tonic-gate 25447c478bd9Sstevel@tonic-gate if (TCP_IS_DETACHED(tcp)) 25457c478bd9Sstevel@tonic-gate return (mss); 2546ff550d0eSmasputra if (tcp->tcp_fused) { 254779c0745dSRao Shoaib maxpsz = tcp_fuse_maxpsz(tcp); 2548ff550d0eSmasputra mss = INFPSZ; 2549bd670b35SErik Nordmark } else if (tcp->tcp_maxpsz_multiplier == 0) { 25507c478bd9Sstevel@tonic-gate /* 25517c478bd9Sstevel@tonic-gate * Set the sd_qn_maxpsz according to the socket send buffer 25527c478bd9Sstevel@tonic-gate * size, and sd_maxblk to INFPSZ (-1). This will essentially 25537c478bd9Sstevel@tonic-gate * instruct the stream head to copyin user data into contiguous 25547c478bd9Sstevel@tonic-gate * kernel-allocated buffers without breaking it up into smaller 25557c478bd9Sstevel@tonic-gate * chunks. We round up the buffer size to the nearest SMSS. 
25567c478bd9Sstevel@tonic-gate */ 2557bd670b35SErik Nordmark maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss); 25587c478bd9Sstevel@tonic-gate mss = INFPSZ; 25597c478bd9Sstevel@tonic-gate } else { 25607c478bd9Sstevel@tonic-gate /* 25617c478bd9Sstevel@tonic-gate * Set sd_qn_maxpsz to approx half the (receivers) buffer 25627c478bd9Sstevel@tonic-gate * (and a multiple of the mss). This instructs the stream 25637c478bd9Sstevel@tonic-gate * head to break down larger than SMSS writes into SMSS- 25647c478bd9Sstevel@tonic-gate * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 25657c478bd9Sstevel@tonic-gate */ 2566bd670b35SErik Nordmark maxpsz = tcp->tcp_maxpsz_multiplier * mss; 2567bd670b35SErik Nordmark if (maxpsz > connp->conn_sndbuf / 2) { 2568bd670b35SErik Nordmark maxpsz = connp->conn_sndbuf / 2; 25697c478bd9Sstevel@tonic-gate /* Round up to nearest mss */ 25707c478bd9Sstevel@tonic-gate maxpsz = MSS_ROUNDUP(maxpsz, mss); 25717c478bd9Sstevel@tonic-gate } 25727c478bd9Sstevel@tonic-gate } 25730f1702c5SYu Xiangning 25740f1702c5SYu Xiangning (void) proto_set_maxpsz(q, connp, maxpsz); 2575bd670b35SErik Nordmark if (!(IPCL_IS_NONSTR(connp))) 2576bd670b35SErik Nordmark connp->conn_wq->q_maxpsz = maxpsz; 25777c478bd9Sstevel@tonic-gate if (set_maxblk) 25780f1702c5SYu Xiangning (void) proto_set_tx_maxblk(q, connp, mss); 25797c478bd9Sstevel@tonic-gate return (mss); 25807c478bd9Sstevel@tonic-gate } 25817c478bd9Sstevel@tonic-gate 2582fc80c0dfSnordmark /* For /dev/tcp aka AF_INET open */ 25837c478bd9Sstevel@tonic-gate static int 2584fc80c0dfSnordmark tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 2585fc80c0dfSnordmark { 2586fc80c0dfSnordmark return (tcp_open(q, devp, flag, sflag, credp, B_FALSE)); 2587fc80c0dfSnordmark } 2588fc80c0dfSnordmark 2589fc80c0dfSnordmark /* For /dev/tcp6 aka AF_INET6 open */ 2590fc80c0dfSnordmark static int 2591fc80c0dfSnordmark tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 2592fc80c0dfSnordmark { 
	return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
}

/*
 * Allocate and initialize the conn_t/tcp_t pair for a new TCP endpoint.
 *
 * credp	credentials of the opener; a hold is taken on them and they
 *		are stored in conn_cred.
 * isv6		B_TRUE creates an AF_INET6/IPv6 endpoint, otherwise an
 *		AF_INET/IPv4 one.
 * issocket	when set, tcp_issocket is set on the new tcp_t.
 * errorp	must be non-NULL; receives the error code on failure.
 *
 * Returns the new conn_t, left in TCPS_IDLE with default values applied by
 * tcp_init_values().  Returns NULL with *errorp set to the error from
 * secpolicy_basic_net_access(), or to ENOSR if tcp_get_conn() fails.
 */
conn_t *
tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
    int *errorp)
{
	tcp_t		*tcp = NULL;
	conn_t		*connp;
	zoneid_t	zoneid;
	tcp_stack_t	*tcps;
	squeue_t	*sqp;

	ASSERT(errorp != NULL);
	/*
	 * Find the proper zoneid and netstack.
	 */
	/*
	 * Special case for install: miniroot needs to be able to
	 * access files via NFS as though it were always in the
	 * global zone.
	 */
	if (credp == kcred && nfs_global_client_only != 0) {
		/* Note: this path skips the secpolicy_basic_net_access check. */
		zoneid = GLOBAL_ZONEID;
		tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
		    netstack_tcp;
		ASSERT(tcps != NULL);
	} else {
		netstack_t *ns;
		int err;

		/* The opener must be allowed basic network access. */
		if ((err = secpolicy_basic_net_access(credp)) != 0) {
			*errorp = err;
			return (NULL);
		}

		ns = netstack_find_by_cred(credp);
		ASSERT(ns != NULL);
		tcps = ns->netstack_tcp;
		ASSERT(tcps != NULL);

		/*
		 * For exclusive stacks we set the zoneid to zero
		 * to make TCP operate as if in the global zone.
		 */
		if (tcps->tcps_netstack->netstack_stackid !=
		    GLOBAL_NETSTACKID)
			zoneid = GLOBAL_ZONEID;
		else
			zoneid = crgetzoneid(credp);
	}

	/* The squeue is picked using the current hrtime as the hint. */
	sqp = IP_SQUEUE_GET((uint_t)gethrtime());
	connp = (conn_t *)tcp_get_conn(sqp, tcps);
	/*
	 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
	 * so we drop it by one.
	 */
	/*
	 * NOTE(review): presumably netstack_find_by_stackid() in the install
	 * special case takes the same hold -- confirm.  The release is done
	 * before the NULL check so the failure path does not leak it.
	 */
	netstack_rele(tcps->tcps_netstack);
	if (connp == NULL) {
		*errorp = ENOSR;
		return (NULL);
	}
	ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);

	connp->conn_sqp = sqp;
	connp->conn_initial_sqp = connp->conn_sqp;
	connp->conn_ixa->ixa_sqp = connp->conn_sqp;
	tcp = connp->conn_tcp;

	/*
	 * Besides asking IP to set the checksum for us, have conn_ip_output
	 * to do the following checks when necessary:
	 *
	 * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid
	 * IXAF_VERIFY_PMTU: verify PMTU changes
	 * IXAF_VERIFY_LSO: verify LSO capability changes
	 */
	connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO;

	if (!tcps->tcps_dev_flow_ctl)
		connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;

	/* Address family, IP version, default MSS and TTL/hop limit. */
	if (isv6) {
		connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
		connp->conn_ipversion = IPV6_VERSION;
		connp->conn_family = AF_INET6;
		tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
		connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit;
	} else {
		connp->conn_ipversion = IPV4_VERSION;
		connp->conn_family = AF_INET;
		tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
		connp->conn_default_ttl = tcps->tcps_ipv4_ttl;
	}
	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;

	/* The conn_t keeps its own hold on the opener's credentials. */
	crhold(credp);
	connp->conn_cred = credp;
	connp->conn_cpid = curproc->p_pid;
	connp->conn_open_time = ddi_get_lbolt64();

	/* Cache things in the ixa without any refhold */
	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
	connp->conn_ixa->ixa_cred = credp;
	connp->conn_ixa->ixa_cpid = connp->conn_cpid;

	connp->conn_zoneid = zoneid;
	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
	connp->conn_ixa->ixa_zoneid = zoneid;
	connp->conn_mlp_type = mlptSingle;
	ASSERT(connp->conn_netstack == tcps->tcps_netstack);
	ASSERT(tcp->tcp_tcps == tcps);

	/*
	 * If the caller has the process-wide flag set, then default to MAC
	 * exempt mode.  This allows read-down to unlabeled hosts.
	 */
	if (getpflags(NET_MAC_AWARE, credp) != 0)
		connp->conn_mac_mode = CONN_MAC_AWARE;

	/*
	 * Uses the credential's real zone, not the zoneid computed above
	 * (which is forced to GLOBAL_ZONEID for exclusive stacks).
	 */
	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);

	if (issocket) {
		tcp->tcp_issocket = 1;
	}

	/* Buffer sizes and send low-water mark from the stack defaults. */
	connp->conn_rcvbuf = tcps->tcps_recv_hiwat;
	connp->conn_sndbuf = tcps->tcps_xmit_hiwat;
	if (tcps->tcps_snd_lowat_fraction != 0) {
		connp->conn_sndlowat = connp->conn_sndbuf /
		    tcps->tcps_snd_lowat_fraction;
	} else {
		connp->conn_sndlowat = tcps->tcps_xmit_lowat;
	}
	connp->conn_so_type = SOCK_STREAM;
	connp->conn_wroff = connp->conn_ht_iphc_allocated +
	    tcps->tcps_wroff_xtra;

	SOCK_CONNID_INIT(tcp->tcp_connid);
	/* DTrace ignores this - it isn't a tcp:::state-change */
	tcp->tcp_state = TCPS_IDLE;
	/* No listener to inherit from: take the stack defaults. */
	tcp_init_values(tcp, NULL);
	return (connp);
}

static int
tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
    boolean_t isv6)
{
	tcp_t		*tcp = NULL;
	conn_t		*connp = NULL;
	int		err;
	vmem_t		*minor_arena = NULL;
	dev_t		conn_dev;
	boolean_t	issocket;

27510f1702c5SYu Xiangning if (q->q_ptr != NULL) 27520f1702c5SYu Xiangning return (0); 27530f1702c5SYu Xiangning 27540f1702c5SYu Xiangning if (sflag == MODOPEN) 27550f1702c5SYu Xiangning return (EINVAL); 27560f1702c5SYu Xiangning 27570f1702c5SYu Xiangning if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && 27580f1702c5SYu Xiangning ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { 27590f1702c5SYu Xiangning minor_arena = ip_minor_arena_la; 27600f1702c5SYu Xiangning } else { 27610f1702c5SYu Xiangning /* 27620f1702c5SYu Xiangning * Either minor numbers in the large arena were exhausted 27630f1702c5SYu Xiangning * or a non socket application is doing the open. 27640f1702c5SYu Xiangning * Try to allocate from the small arena. 27650f1702c5SYu Xiangning */ 27660f1702c5SYu Xiangning if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 27670f1702c5SYu Xiangning return (EBUSY); 27680f1702c5SYu Xiangning } 27690f1702c5SYu Xiangning minor_arena = ip_minor_arena_sa; 27700f1702c5SYu Xiangning } 27710f1702c5SYu Xiangning 27720f1702c5SYu Xiangning ASSERT(minor_arena != NULL); 27730f1702c5SYu Xiangning 27740f1702c5SYu Xiangning *devp = makedevice(getmajor(*devp), (minor_t)conn_dev); 27750f1702c5SYu Xiangning 27760f1702c5SYu Xiangning if (flag & SO_FALLBACK) { 27770f1702c5SYu Xiangning /* 27780f1702c5SYu Xiangning * Non streams socket needs a stream to fallback to 27790f1702c5SYu Xiangning */ 27800f1702c5SYu Xiangning RD(q)->q_ptr = (void *)conn_dev; 27810f1702c5SYu Xiangning WR(q)->q_qinfo = &tcp_fallback_sock_winit; 27820f1702c5SYu Xiangning WR(q)->q_ptr = (void *)minor_arena; 27830f1702c5SYu Xiangning qprocson(q); 27840f1702c5SYu Xiangning return (0); 27850f1702c5SYu Xiangning } else if (flag & SO_ACCEPTOR) { 27860f1702c5SYu Xiangning q->q_qinfo = &tcp_acceptor_rinit; 27870f1702c5SYu Xiangning /* 27880f1702c5SYu Xiangning * the conn_dev and minor_arena will be subsequently used by 2789bd670b35SErik Nordmark * tcp_tli_accept() and tcp_tpi_close_accept() to 
figure out 2790eead73cfSRao Shoaib * the minor device number for this connection from the q_ptr. 27910f1702c5SYu Xiangning */ 27920f1702c5SYu Xiangning RD(q)->q_ptr = (void *)conn_dev; 27930f1702c5SYu Xiangning WR(q)->q_qinfo = &tcp_acceptor_winit; 27940f1702c5SYu Xiangning WR(q)->q_ptr = (void *)minor_arena; 27950f1702c5SYu Xiangning qprocson(q); 27960f1702c5SYu Xiangning return (0); 27970f1702c5SYu Xiangning } 27980f1702c5SYu Xiangning 27990f1702c5SYu Xiangning issocket = flag & SO_SOCKSTR; 2800bd670b35SErik Nordmark connp = tcp_create_common(credp, isv6, issocket, &err); 28010f1702c5SYu Xiangning 28020f1702c5SYu Xiangning if (connp == NULL) { 28030f1702c5SYu Xiangning inet_minor_free(minor_arena, conn_dev); 28040f1702c5SYu Xiangning q->q_ptr = WR(q)->q_ptr = NULL; 28050f1702c5SYu Xiangning return (err); 28060f1702c5SYu Xiangning } 28070f1702c5SYu Xiangning 2808bd670b35SErik Nordmark connp->conn_rq = q; 2809bd670b35SErik Nordmark connp->conn_wq = WR(q); 28100f1702c5SYu Xiangning q->q_ptr = WR(q)->q_ptr = connp; 28110f1702c5SYu Xiangning 28127c478bd9Sstevel@tonic-gate connp->conn_dev = conn_dev; 2813aa92d85bSgt145670 connp->conn_minor_arena = minor_arena; 28147c478bd9Sstevel@tonic-gate 2815fc80c0dfSnordmark ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6); 28167c478bd9Sstevel@tonic-gate ASSERT(WR(q)->q_qinfo == &tcp_winit); 28177c478bd9Sstevel@tonic-gate 2818eead73cfSRao Shoaib tcp = connp->conn_tcp; 2819eead73cfSRao Shoaib 28200f1702c5SYu Xiangning if (issocket) { 28217c478bd9Sstevel@tonic-gate WR(q)->q_qinfo = &tcp_sock_winit; 28227c478bd9Sstevel@tonic-gate } else { 28237c478bd9Sstevel@tonic-gate #ifdef _ILP32 28247c478bd9Sstevel@tonic-gate tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 28257c478bd9Sstevel@tonic-gate #else 28267c478bd9Sstevel@tonic-gate tcp->tcp_acceptor_id = conn_dev; 28277c478bd9Sstevel@tonic-gate #endif /* _ILP32 */ 28287c478bd9Sstevel@tonic-gate tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 28297c478bd9Sstevel@tonic-gate } 
28307c478bd9Sstevel@tonic-gate 28317c478bd9Sstevel@tonic-gate /* 28327c478bd9Sstevel@tonic-gate * Put the ref for TCP. Ref for IP was already put 28337c478bd9Sstevel@tonic-gate * by ipcl_conn_create. Also Make the conn_t globally 28347c478bd9Sstevel@tonic-gate * visible to walkers 28357c478bd9Sstevel@tonic-gate */ 28367c478bd9Sstevel@tonic-gate mutex_enter(&connp->conn_lock); 28377c478bd9Sstevel@tonic-gate CONN_INC_REF_LOCKED(connp); 28387c478bd9Sstevel@tonic-gate ASSERT(connp->conn_ref == 2); 28397c478bd9Sstevel@tonic-gate connp->conn_state_flags &= ~CONN_INCIPIENT; 28407c478bd9Sstevel@tonic-gate mutex_exit(&connp->conn_lock); 28417c478bd9Sstevel@tonic-gate 28427c478bd9Sstevel@tonic-gate qprocson(q); 28437c478bd9Sstevel@tonic-gate return (0); 28447c478bd9Sstevel@tonic-gate } 28457c478bd9Sstevel@tonic-gate 28467c478bd9Sstevel@tonic-gate /* 2847bd670b35SErik Nordmark * Build/update the tcp header template (in conn_ht_iphc) based on 2848bd670b35SErik Nordmark * conn_xmit_ipp. The headers include ip6_t, any extension 28497c478bd9Sstevel@tonic-gate * headers, and the maximum size tcp header (to avoid reallocation 28507c478bd9Sstevel@tonic-gate * on the fly for additional tcp options). 2851bd670b35SErik Nordmark * 2852bd670b35SErik Nordmark * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}. 28537c478bd9Sstevel@tonic-gate * Returns failure if can't allocate memory. 
28547c478bd9Sstevel@tonic-gate */ 2855721fffe3SKacheong Poon int 28560f1702c5SYu Xiangning tcp_build_hdrs(tcp_t *tcp) 28577c478bd9Sstevel@tonic-gate { 2858f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 28590f1702c5SYu Xiangning conn_t *connp = tcp->tcp_connp; 2860294f39e4SErik Nordmark char buf[TCP_MAX_HDR_LENGTH]; 2861294f39e4SErik Nordmark uint_t buflen; 2862294f39e4SErik Nordmark uint_t ulplen = TCP_MIN_HEADER_LENGTH; 2863294f39e4SErik Nordmark uint_t extralen = TCP_MAX_TCP_OPTIONS_LENGTH; 2864bd670b35SErik Nordmark tcpha_t *tcpha; 2865bd670b35SErik Nordmark uint32_t cksum; 2866bd670b35SErik Nordmark int error; 2867bd670b35SErik Nordmark 2868294f39e4SErik Nordmark /* 2869294f39e4SErik Nordmark * We might be called after the connection is set up, and we might 2870294f39e4SErik Nordmark * have TS options already in the TCP header. Thus we save any 2871294f39e4SErik Nordmark * existing tcp header. 2872294f39e4SErik Nordmark */ 2873294f39e4SErik Nordmark buflen = connp->conn_ht_ulp_len; 2874294f39e4SErik Nordmark if (buflen != 0) { 2875294f39e4SErik Nordmark bcopy(connp->conn_ht_ulp, buf, buflen); 2876294f39e4SErik Nordmark extralen -= buflen - ulplen; 2877294f39e4SErik Nordmark ulplen = buflen; 2878294f39e4SErik Nordmark } 2879294f39e4SErik Nordmark 2880bd670b35SErik Nordmark /* Grab lock to satisfy ASSERT; TCP is serialized using squeue */ 2881bd670b35SErik Nordmark mutex_enter(&connp->conn_lock); 2882294f39e4SErik Nordmark error = conn_build_hdr_template(connp, ulplen, extralen, 2883294f39e4SErik Nordmark &connp->conn_laddr_v6, &connp->conn_faddr_v6, connp->conn_flowinfo); 2884bd670b35SErik Nordmark mutex_exit(&connp->conn_lock); 2885bd670b35SErik Nordmark if (error != 0) 2886bd670b35SErik Nordmark return (error); 28877c478bd9Sstevel@tonic-gate 28887c478bd9Sstevel@tonic-gate /* 2889bd670b35SErik Nordmark * Any routing header/option has been massaged. The checksum difference 2890bd670b35SErik Nordmark * is stored in conn_sum for later use. 
28917c478bd9Sstevel@tonic-gate */ 2892bd670b35SErik Nordmark tcpha = (tcpha_t *)connp->conn_ht_ulp; 2893bd670b35SErik Nordmark tcp->tcp_tcpha = tcpha; 28947c478bd9Sstevel@tonic-gate 2895294f39e4SErik Nordmark /* restore any old tcp header */ 2896294f39e4SErik Nordmark if (buflen != 0) { 2897294f39e4SErik Nordmark bcopy(buf, connp->conn_ht_ulp, buflen); 2898294f39e4SErik Nordmark } else { 2899bd670b35SErik Nordmark tcpha->tha_sum = 0; 29007426816eSMark Fenwick tcpha->tha_urp = 0; 29017426816eSMark Fenwick tcpha->tha_ack = 0; 2902bd670b35SErik Nordmark tcpha->tha_offset_and_reserved = (5 << 4); 290343e4e284SErik Nordmark tcpha->tha_lport = connp->conn_lport; 290443e4e284SErik Nordmark tcpha->tha_fport = connp->conn_fport; 29056b7506c7SErik Nordmark } 29067c478bd9Sstevel@tonic-gate 29077c478bd9Sstevel@tonic-gate /* 2908bd670b35SErik Nordmark * IP wants our header length in the checksum field to 2909bd670b35SErik Nordmark * allow it to perform a single pseudo-header+checksum 2910bd670b35SErik Nordmark * calculation on behalf of TCP. 2911bd670b35SErik Nordmark * Include the adjustment for a source route once IP_OPTIONS is set. 
29127c478bd9Sstevel@tonic-gate */ 2913bd670b35SErik Nordmark cksum = sizeof (tcpha_t) + connp->conn_sum; 2914bd670b35SErik Nordmark cksum = (cksum >> 16) + (cksum & 0xFFFF); 2915bd670b35SErik Nordmark ASSERT(cksum < 0x10000); 2916bd670b35SErik Nordmark tcpha->tha_sum = htons(cksum); 29177c478bd9Sstevel@tonic-gate 2918bd670b35SErik Nordmark if (connp->conn_ipversion == IPV4_VERSION) 2919bd670b35SErik Nordmark tcp->tcp_ipha = (ipha_t *)connp->conn_ht_iphc; 2920bd670b35SErik Nordmark else 2921bd670b35SErik Nordmark tcp->tcp_ip6h = (ip6_t *)connp->conn_ht_iphc; 29227c478bd9Sstevel@tonic-gate 2923bd670b35SErik Nordmark if (connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra > 2924bd670b35SErik Nordmark connp->conn_wroff) { 2925bd670b35SErik Nordmark connp->conn_wroff = connp->conn_ht_iphc_allocated + 2926bd670b35SErik Nordmark tcps->tcps_wroff_xtra; 2927bd670b35SErik Nordmark (void) proto_set_tx_wroff(connp->conn_rq, connp, 2928bd670b35SErik Nordmark connp->conn_wroff); 29297c478bd9Sstevel@tonic-gate } 29307c478bd9Sstevel@tonic-gate return (0); 29317c478bd9Sstevel@tonic-gate } 29327c478bd9Sstevel@tonic-gate 29337c478bd9Sstevel@tonic-gate /* 29347c478bd9Sstevel@tonic-gate * tcp_rwnd_set() is called to adjust the receive window to a desired value. 29357c478bd9Sstevel@tonic-gate * We do not allow the receive window to shrink. After setting rwnd, 29367c478bd9Sstevel@tonic-gate * set the flow control hiwat of the stream. 29377c478bd9Sstevel@tonic-gate * 29387c478bd9Sstevel@tonic-gate * This function is called in 2 cases: 29397c478bd9Sstevel@tonic-gate * 2940bd670b35SErik Nordmark * 1) Before data transfer begins, in tcp_input_listener() for accepting a 2941bd670b35SErik Nordmark * connection (passive open) and in tcp_input_data() for active connect. 29427c478bd9Sstevel@tonic-gate * This is called after tcp_mss_set() when the desired MSS value is known. 
29437c478bd9Sstevel@tonic-gate * This makes sure that our window size is a mutiple of the other side's 29447c478bd9Sstevel@tonic-gate * MSS. 29457c478bd9Sstevel@tonic-gate * 2) Handling SO_RCVBUF option. 29467c478bd9Sstevel@tonic-gate * 29477c478bd9Sstevel@tonic-gate * It is ASSUMED that the requested size is a multiple of the current MSS. 29487c478bd9Sstevel@tonic-gate * 29497c478bd9Sstevel@tonic-gate * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the 29507c478bd9Sstevel@tonic-gate * user requests so. 29517c478bd9Sstevel@tonic-gate */ 295279c0745dSRao Shoaib int 29537c478bd9Sstevel@tonic-gate tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) 29547c478bd9Sstevel@tonic-gate { 29557c478bd9Sstevel@tonic-gate uint32_t mss = tcp->tcp_mss; 29567c478bd9Sstevel@tonic-gate uint32_t old_max_rwnd; 29577c478bd9Sstevel@tonic-gate uint32_t max_transmittable_rwnd; 29587c478bd9Sstevel@tonic-gate boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2959f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 2960bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 29617c478bd9Sstevel@tonic-gate 29621b2702b4SRao Shoaib /* 29631b2702b4SRao Shoaib * Insist on a receive window that is at least 29641b2702b4SRao Shoaib * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid 29651b2702b4SRao Shoaib * funny TCP interactions of Nagle algorithm, SWS avoidance 29661b2702b4SRao Shoaib * and delayed acknowledgement. 
29671b2702b4SRao Shoaib */ 29681b2702b4SRao Shoaib rwnd = MAX(rwnd, tcps->tcps_recv_hiwat_minmss * mss); 29691b2702b4SRao Shoaib 2970ff550d0eSmasputra if (tcp->tcp_fused) { 2971ff550d0eSmasputra size_t sth_hiwat; 2972ff550d0eSmasputra tcp_t *peer_tcp = tcp->tcp_loopback_peer; 2973ff550d0eSmasputra 2974ff550d0eSmasputra ASSERT(peer_tcp != NULL); 2975ff550d0eSmasputra sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); 29760f1702c5SYu Xiangning if (!tcp_detached) { 2977bd670b35SErik Nordmark (void) proto_set_rx_hiwat(connp->conn_rq, connp, 29780f1702c5SYu Xiangning sth_hiwat); 297979c0745dSRao Shoaib tcp_set_recv_threshold(tcp, sth_hiwat >> 3); 29800f1702c5SYu Xiangning } 2981ff550d0eSmasputra 29826b7506c7SErik Nordmark /* Caller could have changed tcp_rwnd; update tha_win */ 29836b7506c7SErik Nordmark if (tcp->tcp_tcpha != NULL) { 29846b7506c7SErik Nordmark tcp->tcp_tcpha->tha_win = 29856b7506c7SErik Nordmark htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 29866b7506c7SErik Nordmark } 29876b7506c7SErik Nordmark if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 29886b7506c7SErik Nordmark tcp->tcp_cwnd_max = rwnd; 29896b7506c7SErik Nordmark 2990ff550d0eSmasputra /* 2991ff550d0eSmasputra * In the fusion case, the maxpsz stream head value of 2992ff550d0eSmasputra * our peer is set according to its send buffer size 2993ff550d0eSmasputra * and our receive buffer size; since the latter may 2994ff550d0eSmasputra * have changed we need to update the peer's maxpsz. 
2995ff550d0eSmasputra */ 2996ff550d0eSmasputra (void) tcp_maxpsz_set(peer_tcp, B_TRUE); 299779c0745dSRao Shoaib return (sth_hiwat); 2998ff550d0eSmasputra } 2999ff550d0eSmasputra 3000bd670b35SErik Nordmark if (tcp_detached) 30017c478bd9Sstevel@tonic-gate old_max_rwnd = tcp->tcp_rwnd; 3002bd670b35SErik Nordmark else 3003bd670b35SErik Nordmark old_max_rwnd = connp->conn_rcvbuf; 30047c478bd9Sstevel@tonic-gate 30057c478bd9Sstevel@tonic-gate 30067c478bd9Sstevel@tonic-gate /* 30077c478bd9Sstevel@tonic-gate * If window size info has already been exchanged, TCP should not 30087c478bd9Sstevel@tonic-gate * shrink the window. Shrinking window is doable if done carefully. 30097c478bd9Sstevel@tonic-gate * We may add that support later. But so far there is not a real 30107c478bd9Sstevel@tonic-gate * need to do that. 30117c478bd9Sstevel@tonic-gate */ 30127c478bd9Sstevel@tonic-gate if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) { 30137c478bd9Sstevel@tonic-gate /* MSS may have changed, do a round up again. */ 30147c478bd9Sstevel@tonic-gate rwnd = MSS_ROUNDUP(old_max_rwnd, mss); 30157c478bd9Sstevel@tonic-gate } 30167c478bd9Sstevel@tonic-gate 30177c478bd9Sstevel@tonic-gate /* 30187c478bd9Sstevel@tonic-gate * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check 30197c478bd9Sstevel@tonic-gate * can be applied even before the window scale option is decided. 30207c478bd9Sstevel@tonic-gate */ 30217c478bd9Sstevel@tonic-gate max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 30227c478bd9Sstevel@tonic-gate if (rwnd > max_transmittable_rwnd) { 30237c478bd9Sstevel@tonic-gate rwnd = max_transmittable_rwnd - 30247c478bd9Sstevel@tonic-gate (max_transmittable_rwnd % mss); 30257c478bd9Sstevel@tonic-gate if (rwnd < mss) 30267c478bd9Sstevel@tonic-gate rwnd = max_transmittable_rwnd; 30277c478bd9Sstevel@tonic-gate /* 30287c478bd9Sstevel@tonic-gate * If we're over the limit we may have to back down tcp_rwnd. 
30297c478bd9Sstevel@tonic-gate * The increment below won't work for us. So we set all three 30307c478bd9Sstevel@tonic-gate * here and the increment below will have no effect. 30317c478bd9Sstevel@tonic-gate */ 30327c478bd9Sstevel@tonic-gate tcp->tcp_rwnd = old_max_rwnd = rwnd; 30337c478bd9Sstevel@tonic-gate } 30347c478bd9Sstevel@tonic-gate if (tcp->tcp_localnet) { 30357c478bd9Sstevel@tonic-gate tcp->tcp_rack_abs_max = 3036f4b3ec61Sdh155122 MIN(tcps->tcps_local_dacks_max, rwnd / mss / 2); 30377c478bd9Sstevel@tonic-gate } else { 30387c478bd9Sstevel@tonic-gate /* 30397c478bd9Sstevel@tonic-gate * For a remote host on a different subnet (through a router), 30407c478bd9Sstevel@tonic-gate * we ack every other packet to be conforming to RFC1122. 30417c478bd9Sstevel@tonic-gate * tcp_deferred_acks_max is default to 2. 30427c478bd9Sstevel@tonic-gate */ 30437c478bd9Sstevel@tonic-gate tcp->tcp_rack_abs_max = 3044f4b3ec61Sdh155122 MIN(tcps->tcps_deferred_acks_max, rwnd / mss / 2); 30457c478bd9Sstevel@tonic-gate } 30467c478bd9Sstevel@tonic-gate if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max) 30477c478bd9Sstevel@tonic-gate tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 30487c478bd9Sstevel@tonic-gate else 30497c478bd9Sstevel@tonic-gate tcp->tcp_rack_cur_max = 0; 30507c478bd9Sstevel@tonic-gate /* 30517c478bd9Sstevel@tonic-gate * Increment the current rwnd by the amount the maximum grew (we 30527c478bd9Sstevel@tonic-gate * can not overwrite it since we might be in the middle of a 30537c478bd9Sstevel@tonic-gate * connection.) 30547c478bd9Sstevel@tonic-gate */ 30557c478bd9Sstevel@tonic-gate tcp->tcp_rwnd += rwnd - old_max_rwnd; 3056bd670b35SErik Nordmark connp->conn_rcvbuf = rwnd; 30571b2702b4SRao Shoaib 3058bd670b35SErik Nordmark /* Are we already connected? 
*/ 3059bd670b35SErik Nordmark if (tcp->tcp_tcpha != NULL) { 3060bd670b35SErik Nordmark tcp->tcp_tcpha->tha_win = 3061bd670b35SErik Nordmark htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 3062bd670b35SErik Nordmark } 3063bd670b35SErik Nordmark 30647c478bd9Sstevel@tonic-gate if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 30657c478bd9Sstevel@tonic-gate tcp->tcp_cwnd_max = rwnd; 30667c478bd9Sstevel@tonic-gate 30677c478bd9Sstevel@tonic-gate if (tcp_detached) 30687c478bd9Sstevel@tonic-gate return (rwnd); 306979c0745dSRao Shoaib 307079c0745dSRao Shoaib tcp_set_recv_threshold(tcp, rwnd >> 3); 307179c0745dSRao Shoaib 3072bd670b35SErik Nordmark (void) proto_set_rx_hiwat(connp->conn_rq, connp, rwnd); 30737c478bd9Sstevel@tonic-gate return (rwnd); 30747c478bd9Sstevel@tonic-gate } 30757c478bd9Sstevel@tonic-gate 3076ff550d0eSmasputra int 30770f1702c5SYu Xiangning tcp_do_unbind(conn_t *connp) 30787c478bd9Sstevel@tonic-gate { 30790f1702c5SYu Xiangning tcp_t *tcp = connp->conn_tcp; 30809cd928feSAlan Maguire int32_t oldstate; 30817c478bd9Sstevel@tonic-gate 30827c478bd9Sstevel@tonic-gate switch (tcp->tcp_state) { 30837c478bd9Sstevel@tonic-gate case TCPS_BOUND: 30847c478bd9Sstevel@tonic-gate case TCPS_LISTEN: 30857c478bd9Sstevel@tonic-gate break; 30867c478bd9Sstevel@tonic-gate default: 30870f1702c5SYu Xiangning return (-TOUTSTATE); 30887c478bd9Sstevel@tonic-gate } 30897c478bd9Sstevel@tonic-gate 30907c478bd9Sstevel@tonic-gate /* 30917c478bd9Sstevel@tonic-gate * Need to clean up all the eagers since after the unbind, segments 30927c478bd9Sstevel@tonic-gate * will no longer be delivered to this listener stream. 
30937c478bd9Sstevel@tonic-gate */ 30947c478bd9Sstevel@tonic-gate mutex_enter(&tcp->tcp_eager_lock); 30957c478bd9Sstevel@tonic-gate if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 30967c478bd9Sstevel@tonic-gate tcp_eager_cleanup(tcp, 0); 30977c478bd9Sstevel@tonic-gate } 30987c478bd9Sstevel@tonic-gate mutex_exit(&tcp->tcp_eager_lock); 30997c478bd9Sstevel@tonic-gate 310093fcb0b9SKacheong Poon /* Clean up the listener connection counter if necessary. */ 310193fcb0b9SKacheong Poon if (tcp->tcp_listen_cnt != NULL) 310293fcb0b9SKacheong Poon TCP_DECR_LISTEN_CNT(tcp); 3103bd670b35SErik Nordmark connp->conn_laddr_v6 = ipv6_all_zeros; 3104bd670b35SErik Nordmark connp->conn_saddr_v6 = ipv6_all_zeros; 31057c478bd9Sstevel@tonic-gate tcp_bind_hash_remove(tcp); 31069cd928feSAlan Maguire oldstate = tcp->tcp_state; 31077c478bd9Sstevel@tonic-gate tcp->tcp_state = TCPS_IDLE; 31089cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 31099cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 31109cd928feSAlan Maguire int32_t, oldstate); 31110f1702c5SYu Xiangning 3112bd670b35SErik Nordmark ip_unbind(connp); 31137c478bd9Sstevel@tonic-gate bzero(&connp->conn_ports, sizeof (connp->conn_ports)); 31140f1702c5SYu Xiangning 3115bd670b35SErik Nordmark return (0); 31160f1702c5SYu Xiangning } 31170f1702c5SYu Xiangning 3118bd670b35SErik Nordmark /* 31193e95bd4aSAnders Persson * Collect protocol properties to send to the upper handle. 
3120bd670b35SErik Nordmark */ 31217c478bd9Sstevel@tonic-gate void 31223e95bd4aSAnders Persson tcp_get_proto_props(tcp_t *tcp, struct sock_proto_props *sopp) 31237c478bd9Sstevel@tonic-gate { 31243e95bd4aSAnders Persson conn_t *connp = tcp->tcp_connp; 31250f1702c5SYu Xiangning 31263e95bd4aSAnders Persson sopp->sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; 31273e95bd4aSAnders Persson sopp->sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 31280f1702c5SYu Xiangning 31293e95bd4aSAnders Persson sopp->sopp_rxhiwat = tcp->tcp_fused ? 3130bd670b35SErik Nordmark tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) : 3131bd670b35SErik Nordmark connp->conn_rcvbuf; 31328c0bf406Sja97890 /* 31338c0bf406Sja97890 * Determine what write offset value to use depending on SACK and 31348c0bf406Sja97890 * whether the endpoint is fused or not. 31358c0bf406Sja97890 */ 31367c478bd9Sstevel@tonic-gate if (tcp->tcp_fused) { 31377c478bd9Sstevel@tonic-gate ASSERT(tcp->tcp_loopback); 3138ff550d0eSmasputra ASSERT(tcp->tcp_loopback_peer != NULL); 31397c478bd9Sstevel@tonic-gate /* 31407c478bd9Sstevel@tonic-gate * For fused tcp loopback, set the stream head's write 31417c478bd9Sstevel@tonic-gate * offset value to zero since we won't be needing any room 31427c478bd9Sstevel@tonic-gate * for TCP/IP headers. This would also improve performance 31437c478bd9Sstevel@tonic-gate * since it would reduce the amount of work done by kmem. 31447c478bd9Sstevel@tonic-gate * Non-fused tcp loopback case is handled separately below. 31457c478bd9Sstevel@tonic-gate */ 31463e95bd4aSAnders Persson sopp->sopp_wroff = 0; 31477c478bd9Sstevel@tonic-gate /* 3148ff550d0eSmasputra * Update the peer's transmit parameters according to 3149ff550d0eSmasputra * our recently calculated high water mark value. 
3150ff550d0eSmasputra */ 3151ff550d0eSmasputra (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); 31527c478bd9Sstevel@tonic-gate } else if (tcp->tcp_snd_sack_ok) { 31533e95bd4aSAnders Persson sopp->sopp_wroff = connp->conn_ht_iphc_allocated + 31543e95bd4aSAnders Persson (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); 31557c478bd9Sstevel@tonic-gate } else { 31563e95bd4aSAnders Persson sopp->sopp_wroff = connp->conn_ht_iphc_len + 31573e95bd4aSAnders Persson (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); 31587c478bd9Sstevel@tonic-gate } 31597c478bd9Sstevel@tonic-gate 3160081c0aa8SAnders Persson if (tcp->tcp_loopback) { 31613e95bd4aSAnders Persson sopp->sopp_flags |= SOCKOPT_LOOPBACK; 31623e95bd4aSAnders Persson sopp->sopp_loopback = B_TRUE; 3163081c0aa8SAnders Persson } 31640f1702c5SYu Xiangning } 31650f1702c5SYu Xiangning 3166bd670b35SErik Nordmark /* 3167bd670b35SErik Nordmark * Check the usability of ZEROCOPY. It's instead checking the flag set by IP. 3168bd670b35SErik Nordmark */ 3169721fffe3SKacheong Poon boolean_t 31707c478bd9Sstevel@tonic-gate tcp_zcopy_check(tcp_t *tcp) 31717c478bd9Sstevel@tonic-gate { 31727c478bd9Sstevel@tonic-gate conn_t *connp = tcp->tcp_connp; 3173bd670b35SErik Nordmark ip_xmit_attr_t *ixa = connp->conn_ixa; 31747c478bd9Sstevel@tonic-gate boolean_t zc_enabled = B_FALSE; 3175f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 31767c478bd9Sstevel@tonic-gate 31777c478bd9Sstevel@tonic-gate if (do_tcpzcopy == 2) 31787c478bd9Sstevel@tonic-gate zc_enabled = B_TRUE; 3179bd670b35SErik Nordmark else if ((do_tcpzcopy == 1) && (ixa->ixa_flags & IXAF_ZCOPY_CAPAB)) 3180bd670b35SErik Nordmark zc_enabled = B_TRUE; 31817c478bd9Sstevel@tonic-gate 31827c478bd9Sstevel@tonic-gate tcp->tcp_snd_zcopy_on = zc_enabled; 31837c478bd9Sstevel@tonic-gate if (!TCP_IS_DETACHED(tcp)) { 31847c478bd9Sstevel@tonic-gate if (zc_enabled) { 3185bd670b35SErik Nordmark ixa->ixa_flags |= IXAF_VERIFY_ZCOPY; 3186bd670b35SErik Nordmark (void) 
proto_set_tx_copyopt(connp->conn_rq, connp, 31870f1702c5SYu Xiangning ZCVMSAFE); 3188f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_zcopy_on); 31897c478bd9Sstevel@tonic-gate } else { 3190bd670b35SErik Nordmark ixa->ixa_flags &= ~IXAF_VERIFY_ZCOPY; 3191bd670b35SErik Nordmark (void) proto_set_tx_copyopt(connp->conn_rq, connp, 31920f1702c5SYu Xiangning ZCVMUNSAFE); 3193f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_zcopy_off); 31947c478bd9Sstevel@tonic-gate } 31957c478bd9Sstevel@tonic-gate } 31967c478bd9Sstevel@tonic-gate return (zc_enabled); 31977c478bd9Sstevel@tonic-gate } 31987c478bd9Sstevel@tonic-gate 31997c478bd9Sstevel@tonic-gate /* 3200bd670b35SErik Nordmark * Backoff from a zero-copy message by copying data to a new allocated 3201bd670b35SErik Nordmark * message and freeing the original desballoca'ed segmapped message. 3202bd670b35SErik Nordmark * 3203bd670b35SErik Nordmark * This function is called by following two callers: 3204bd670b35SErik Nordmark * 1. tcp_timer: fix_xmitlist is set to B_TRUE, because it's safe to free 3205bd670b35SErik Nordmark * the origial desballoca'ed message and notify sockfs. This is in re- 3206bd670b35SErik Nordmark * transmit state. 3207bd670b35SErik Nordmark * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need 3208bd670b35SErik Nordmark * to be copied to new message. 
32097c478bd9Sstevel@tonic-gate */ 3210721fffe3SKacheong Poon mblk_t * 3211bd670b35SErik Nordmark tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) 32127c478bd9Sstevel@tonic-gate { 3213bd670b35SErik Nordmark mblk_t *nbp; 3214bd670b35SErik Nordmark mblk_t *head = NULL; 3215bd670b35SErik Nordmark mblk_t *tail = NULL; 3216f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 3217f4b3ec61Sdh155122 3218bd670b35SErik Nordmark ASSERT(bp != NULL); 3219bd670b35SErik Nordmark while (bp != NULL) { 32207c478bd9Sstevel@tonic-gate if (IS_VMLOANED_MBLK(bp)) { 3221f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_zcopy_backoff); 3222bd670b35SErik Nordmark if ((nbp = copyb(bp)) == NULL) { 32237c478bd9Sstevel@tonic-gate tcp->tcp_xmit_zc_clean = B_FALSE; 3224bd670b35SErik Nordmark if (tail != NULL) 3225bd670b35SErik Nordmark tail->b_cont = bp; 3226bd670b35SErik Nordmark return ((head == NULL) ? bp : head); 32277c478bd9Sstevel@tonic-gate } 3228bd670b35SErik Nordmark 32297c478bd9Sstevel@tonic-gate if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 32307c478bd9Sstevel@tonic-gate if (fix_xmitlist) 32317c478bd9Sstevel@tonic-gate tcp_zcopy_notify(tcp); 32327c478bd9Sstevel@tonic-gate else 3233bd670b35SErik Nordmark nbp->b_datap->db_struioflag |= 32347c478bd9Sstevel@tonic-gate STRUIO_ZCNOTIFY; 32357c478bd9Sstevel@tonic-gate } 3236bd670b35SErik Nordmark nbp->b_cont = bp->b_cont; 3237bd670b35SErik Nordmark 3238bd670b35SErik Nordmark /* 3239bd670b35SErik Nordmark * Copy saved information and adjust tcp_xmit_tail 3240bd670b35SErik Nordmark * if needed. 3241bd670b35SErik Nordmark */ 32427c478bd9Sstevel@tonic-gate if (fix_xmitlist) { 3243bd670b35SErik Nordmark nbp->b_prev = bp->b_prev; 3244bd670b35SErik Nordmark nbp->b_next = bp->b_next; 3245bd670b35SErik Nordmark 32467c478bd9Sstevel@tonic-gate if (tcp->tcp_xmit_tail == bp) 3247bd670b35SErik Nordmark tcp->tcp_xmit_tail = nbp; 32487c478bd9Sstevel@tonic-gate } 3249bd670b35SErik Nordmark 3250bd670b35SErik Nordmark /* Free the original message. 
*/ 32517c478bd9Sstevel@tonic-gate bp->b_prev = NULL; 3252bd670b35SErik Nordmark bp->b_next = NULL; 32537c478bd9Sstevel@tonic-gate freeb(bp); 3254bd670b35SErik Nordmark 3255bd670b35SErik Nordmark bp = nbp; 3256bd670b35SErik Nordmark } 3257bd670b35SErik Nordmark 3258bd670b35SErik Nordmark if (head == NULL) { 3259bd670b35SErik Nordmark head = bp; 3260bd670b35SErik Nordmark } 3261bd670b35SErik Nordmark if (tail == NULL) { 3262bd670b35SErik Nordmark tail = bp; 32637c478bd9Sstevel@tonic-gate } else { 3264bd670b35SErik Nordmark tail->b_cont = bp; 3265bd670b35SErik Nordmark tail = bp; 32667c478bd9Sstevel@tonic-gate } 3267bd670b35SErik Nordmark 3268bd670b35SErik Nordmark /* Move forward. */ 3269bd670b35SErik Nordmark bp = bp->b_cont; 32707c478bd9Sstevel@tonic-gate } 3271bd670b35SErik Nordmark 32727c478bd9Sstevel@tonic-gate if (fix_xmitlist) { 32737c478bd9Sstevel@tonic-gate tcp->tcp_xmit_last = tail; 32747c478bd9Sstevel@tonic-gate tcp->tcp_xmit_zc_clean = B_TRUE; 32757c478bd9Sstevel@tonic-gate } 3276bd670b35SErik Nordmark 32777c478bd9Sstevel@tonic-gate return (head); 32787c478bd9Sstevel@tonic-gate } 32797c478bd9Sstevel@tonic-gate 3280721fffe3SKacheong Poon void 32817c478bd9Sstevel@tonic-gate tcp_zcopy_notify(tcp_t *tcp) 32827c478bd9Sstevel@tonic-gate { 32837c478bd9Sstevel@tonic-gate struct stdata *stp; 32840f1702c5SYu Xiangning conn_t *connp; 32857c478bd9Sstevel@tonic-gate 32867c478bd9Sstevel@tonic-gate if (tcp->tcp_detached) 32877c478bd9Sstevel@tonic-gate return; 32880f1702c5SYu Xiangning connp = tcp->tcp_connp; 32890f1702c5SYu Xiangning if (IPCL_IS_NONSTR(connp)) { 32900f1702c5SYu Xiangning (*connp->conn_upcalls->su_zcopy_notify) 32910f1702c5SYu Xiangning (connp->conn_upper_handle); 32920f1702c5SYu Xiangning return; 32930f1702c5SYu Xiangning } 3294bd670b35SErik Nordmark stp = STREAM(connp->conn_rq); 32957c478bd9Sstevel@tonic-gate mutex_enter(&stp->sd_lock); 32967c478bd9Sstevel@tonic-gate stp->sd_flag |= STZCNOTIFY; 32977c478bd9Sstevel@tonic-gate 
cv_broadcast(&stp->sd_zcopy_wait); 32987c478bd9Sstevel@tonic-gate mutex_exit(&stp->sd_lock); 32997c478bd9Sstevel@tonic-gate } 33007c478bd9Sstevel@tonic-gate 3301bd670b35SErik Nordmark /* 3302bd670b35SErik Nordmark * Update the TCP connection according to change of LSO capability. 3303bd670b35SErik Nordmark */ 3304bd670b35SErik Nordmark static void 3305bd670b35SErik Nordmark tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa) 33067c478bd9Sstevel@tonic-gate { 3307bd670b35SErik Nordmark /* 3308bd670b35SErik Nordmark * We check against IPv4 header length to preserve the old behavior 3309bd670b35SErik Nordmark * of only enabling LSO when there are no IP options. 3310bd670b35SErik Nordmark * But this restriction might not be necessary at all. Before removing 3311bd670b35SErik Nordmark * it, need to verify how LSO is handled for source routing case, with 3312bd670b35SErik Nordmark * which IP does software checksum. 3313bd670b35SErik Nordmark * 3314bd670b35SErik Nordmark * For IPv6, whenever any extension header is needed, LSO is supressed. 3315bd670b35SErik Nordmark */ 3316bd670b35SErik Nordmark if (ixa->ixa_ip_hdr_length != ((ixa->ixa_flags & IXAF_IS_IPV4) ? 3317bd670b35SErik Nordmark IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN)) 3318bd670b35SErik Nordmark return; 3319bd670b35SErik Nordmark 3320bd670b35SErik Nordmark /* 3321bd670b35SErik Nordmark * Either the LSO capability newly became usable, or it has changed. 
3322bd670b35SErik Nordmark */ 3323bd670b35SErik Nordmark if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 3324bd670b35SErik Nordmark ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 3325bd670b35SErik Nordmark 3326bd670b35SErik Nordmark ASSERT(lsoc->ill_lso_max > 0); 3327bd670b35SErik Nordmark tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max); 3328bd670b35SErik Nordmark 3329bd670b35SErik Nordmark DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, 3330bd670b35SErik Nordmark boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max); 3331bd670b35SErik Nordmark 3332bd670b35SErik Nordmark /* 3333bd670b35SErik Nordmark * If LSO to be enabled, notify the STREAM header with larger 3334bd670b35SErik Nordmark * data block. 3335bd670b35SErik Nordmark */ 3336bd670b35SErik Nordmark if (!tcp->tcp_lso) 3337bd670b35SErik Nordmark tcp->tcp_maxpsz_multiplier = 0; 3338bd670b35SErik Nordmark 3339bd670b35SErik Nordmark tcp->tcp_lso = B_TRUE; 3340bd670b35SErik Nordmark TCP_STAT(tcp->tcp_tcps, tcp_lso_enabled); 3341bd670b35SErik Nordmark } else { /* LSO capability is not usable any more. */ 3342bd670b35SErik Nordmark DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, 3343bd670b35SErik Nordmark boolean_t, B_FALSE, uint32_t, tcp->tcp_lso_max); 3344bd670b35SErik Nordmark 3345bd670b35SErik Nordmark /* 3346bd670b35SErik Nordmark * If LSO to be disabled, notify the STREAM header with smaller 3347bd670b35SErik Nordmark * data block. And need to restore fragsize to PMTU. 
3348bd670b35SErik Nordmark */ 3349bd670b35SErik Nordmark if (tcp->tcp_lso) { 3350bd670b35SErik Nordmark tcp->tcp_maxpsz_multiplier = 3351bd670b35SErik Nordmark tcp->tcp_tcps->tcps_maxpsz_multiplier; 3352bd670b35SErik Nordmark ixa->ixa_fragsize = ixa->ixa_pmtu; 3353bd670b35SErik Nordmark tcp->tcp_lso = B_FALSE; 3354bd670b35SErik Nordmark TCP_STAT(tcp->tcp_tcps, tcp_lso_disabled); 3355bd670b35SErik Nordmark } 3356bd670b35SErik Nordmark } 3357bd670b35SErik Nordmark 3358bd670b35SErik Nordmark (void) tcp_maxpsz_set(tcp, B_TRUE); 3359bd670b35SErik Nordmark } 3360bd670b35SErik Nordmark 3361bd670b35SErik Nordmark /* 3362bd670b35SErik Nordmark * Update the TCP connection according to change of ZEROCOPY capability. 3363bd670b35SErik Nordmark */ 3364bd670b35SErik Nordmark static void 3365bd670b35SErik Nordmark tcp_update_zcopy(tcp_t *tcp) 3366bd670b35SErik Nordmark { 33677c478bd9Sstevel@tonic-gate conn_t *connp = tcp->tcp_connp; 3368f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 33697c478bd9Sstevel@tonic-gate 3370bd670b35SErik Nordmark if (tcp->tcp_snd_zcopy_on) { 3371bd670b35SErik Nordmark tcp->tcp_snd_zcopy_on = B_FALSE; 3372bd670b35SErik Nordmark if (!TCP_IS_DETACHED(tcp)) { 3373bd670b35SErik Nordmark (void) proto_set_tx_copyopt(connp->conn_rq, connp, 3374bd670b35SErik Nordmark ZCVMUNSAFE); 3375bd670b35SErik Nordmark TCP_STAT(tcps, tcp_zcopy_off); 3376bd670b35SErik Nordmark } 33777c478bd9Sstevel@tonic-gate } else { 3378bd670b35SErik Nordmark tcp->tcp_snd_zcopy_on = B_TRUE; 3379bd670b35SErik Nordmark if (!TCP_IS_DETACHED(tcp)) { 3380bd670b35SErik Nordmark (void) proto_set_tx_copyopt(connp->conn_rq, connp, 3381bd670b35SErik Nordmark ZCVMSAFE); 3382bd670b35SErik Nordmark TCP_STAT(tcps, tcp_zcopy_on); 33837c478bd9Sstevel@tonic-gate } 33847c478bd9Sstevel@tonic-gate } 33858347601bSyl150051 } 33868347601bSyl150051 3387c793af95Ssangeeta /* 3388bd670b35SErik Nordmark * Notify function registered with ip_xmit_attr_t. 
It's called in the squeue 3389bd670b35SErik Nordmark * so it's safe to update the TCP connection. 3390c793af95Ssangeeta */ 3391bd670b35SErik Nordmark /* ARGSUSED1 */ 3392bd670b35SErik Nordmark static void 3393bd670b35SErik Nordmark tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, 3394bd670b35SErik Nordmark ixa_notify_arg_t narg) 33958347601bSyl150051 { 3396bd670b35SErik Nordmark tcp_t *tcp = (tcp_t *)arg; 3397bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 33988347601bSyl150051 3399bd670b35SErik Nordmark switch (ntype) { 3400bd670b35SErik Nordmark case IXAN_LSO: 3401bd670b35SErik Nordmark tcp_update_lso(tcp, connp->conn_ixa); 3402bd670b35SErik Nordmark break; 3403bd670b35SErik Nordmark case IXAN_PMTU: 3404bd670b35SErik Nordmark tcp_update_pmtu(tcp, B_FALSE); 3405bd670b35SErik Nordmark break; 3406bd670b35SErik Nordmark case IXAN_ZCOPY: 3407bd670b35SErik Nordmark tcp_update_zcopy(tcp); 3408bd670b35SErik Nordmark break; 3409bd670b35SErik Nordmark default: 3410bd670b35SErik Nordmark break; 34117c478bd9Sstevel@tonic-gate } 34128347601bSyl150051 } 34138347601bSyl150051 34147c478bd9Sstevel@tonic-gate /* 34157c478bd9Sstevel@tonic-gate * The TCP write service routine should never be called... 34167c478bd9Sstevel@tonic-gate */ 34177c478bd9Sstevel@tonic-gate /* ARGSUSED */ 34187c478bd9Sstevel@tonic-gate static void 34197c478bd9Sstevel@tonic-gate tcp_wsrv(queue_t *q) 34207c478bd9Sstevel@tonic-gate { 3421f4b3ec61Sdh155122 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 3422f4b3ec61Sdh155122 3423f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_wsrv_called); 34247c478bd9Sstevel@tonic-gate } 34257c478bd9Sstevel@tonic-gate 34267c478bd9Sstevel@tonic-gate /* 34277c478bd9Sstevel@tonic-gate * Hash list lookup routine for tcp_t structures. 34287c478bd9Sstevel@tonic-gate * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. 
34297c478bd9Sstevel@tonic-gate */ 3430721fffe3SKacheong Poon tcp_t * 3431f4b3ec61Sdh155122 tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *tcps) 34327c478bd9Sstevel@tonic-gate { 34337c478bd9Sstevel@tonic-gate tf_t *tf; 34347c478bd9Sstevel@tonic-gate tcp_t *tcp; 34357c478bd9Sstevel@tonic-gate 3436f4b3ec61Sdh155122 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 34377c478bd9Sstevel@tonic-gate mutex_enter(&tf->tf_lock); 34387c478bd9Sstevel@tonic-gate for (tcp = tf->tf_tcp; tcp != NULL; 34397c478bd9Sstevel@tonic-gate tcp = tcp->tcp_acceptor_hash) { 34407c478bd9Sstevel@tonic-gate if (tcp->tcp_acceptor_id == id) { 34417c478bd9Sstevel@tonic-gate CONN_INC_REF(tcp->tcp_connp); 34427c478bd9Sstevel@tonic-gate mutex_exit(&tf->tf_lock); 34437c478bd9Sstevel@tonic-gate return (tcp); 34447c478bd9Sstevel@tonic-gate } 34457c478bd9Sstevel@tonic-gate } 34467c478bd9Sstevel@tonic-gate mutex_exit(&tf->tf_lock); 34477c478bd9Sstevel@tonic-gate return (NULL); 34487c478bd9Sstevel@tonic-gate } 34497c478bd9Sstevel@tonic-gate 34507c478bd9Sstevel@tonic-gate /* 34517c478bd9Sstevel@tonic-gate * Hash list insertion routine for tcp_t structures. 
34527c478bd9Sstevel@tonic-gate */ 34537c478bd9Sstevel@tonic-gate void 34547c478bd9Sstevel@tonic-gate tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) 34557c478bd9Sstevel@tonic-gate { 34567c478bd9Sstevel@tonic-gate tf_t *tf; 34577c478bd9Sstevel@tonic-gate tcp_t **tcpp; 34587c478bd9Sstevel@tonic-gate tcp_t *tcpnext; 3459f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 34607c478bd9Sstevel@tonic-gate 3461f4b3ec61Sdh155122 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 34627c478bd9Sstevel@tonic-gate 34637c478bd9Sstevel@tonic-gate if (tcp->tcp_ptpahn != NULL) 34647c478bd9Sstevel@tonic-gate tcp_acceptor_hash_remove(tcp); 34657c478bd9Sstevel@tonic-gate tcpp = &tf->tf_tcp; 34667c478bd9Sstevel@tonic-gate mutex_enter(&tf->tf_lock); 34677c478bd9Sstevel@tonic-gate tcpnext = tcpp[0]; 34687c478bd9Sstevel@tonic-gate if (tcpnext) 34697c478bd9Sstevel@tonic-gate tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash; 34707c478bd9Sstevel@tonic-gate tcp->tcp_acceptor_hash = tcpnext; 34717c478bd9Sstevel@tonic-gate tcp->tcp_ptpahn = tcpp; 34727c478bd9Sstevel@tonic-gate tcpp[0] = tcp; 34737c478bd9Sstevel@tonic-gate tcp->tcp_acceptor_lockp = &tf->tf_lock; /* For tcp_*_hash_remove */ 34747c478bd9Sstevel@tonic-gate mutex_exit(&tf->tf_lock); 34757c478bd9Sstevel@tonic-gate } 34767c478bd9Sstevel@tonic-gate 34777c478bd9Sstevel@tonic-gate /* 34787c478bd9Sstevel@tonic-gate * Hash list removal routine for tcp_t structures. 34797c478bd9Sstevel@tonic-gate */ 3480721fffe3SKacheong Poon void 34817c478bd9Sstevel@tonic-gate tcp_acceptor_hash_remove(tcp_t *tcp) 34827c478bd9Sstevel@tonic-gate { 34837c478bd9Sstevel@tonic-gate tcp_t *tcpnext; 34847c478bd9Sstevel@tonic-gate kmutex_t *lockp; 34857c478bd9Sstevel@tonic-gate 34867c478bd9Sstevel@tonic-gate /* 34877c478bd9Sstevel@tonic-gate * Extract the lock pointer in case there are concurrent 34887c478bd9Sstevel@tonic-gate * hash_remove's for this instance. 
34897c478bd9Sstevel@tonic-gate */ 34907c478bd9Sstevel@tonic-gate lockp = tcp->tcp_acceptor_lockp; 34917c478bd9Sstevel@tonic-gate 34927c478bd9Sstevel@tonic-gate if (tcp->tcp_ptpahn == NULL) 34937c478bd9Sstevel@tonic-gate return; 34947c478bd9Sstevel@tonic-gate 34957c478bd9Sstevel@tonic-gate ASSERT(lockp != NULL); 34967c478bd9Sstevel@tonic-gate mutex_enter(lockp); 34977c478bd9Sstevel@tonic-gate if (tcp->tcp_ptpahn) { 34987c478bd9Sstevel@tonic-gate tcpnext = tcp->tcp_acceptor_hash; 34997c478bd9Sstevel@tonic-gate if (tcpnext) { 35007c478bd9Sstevel@tonic-gate tcpnext->tcp_ptpahn = tcp->tcp_ptpahn; 35017c478bd9Sstevel@tonic-gate tcp->tcp_acceptor_hash = NULL; 35027c478bd9Sstevel@tonic-gate } 35037c478bd9Sstevel@tonic-gate *tcp->tcp_ptpahn = tcpnext; 35047c478bd9Sstevel@tonic-gate tcp->tcp_ptpahn = NULL; 35057c478bd9Sstevel@tonic-gate } 35067c478bd9Sstevel@tonic-gate mutex_exit(lockp); 35077c478bd9Sstevel@tonic-gate tcp->tcp_acceptor_lockp = NULL; 35087c478bd9Sstevel@tonic-gate } 35097c478bd9Sstevel@tonic-gate 35107c478bd9Sstevel@tonic-gate /* 35117c478bd9Sstevel@tonic-gate * Type three generator adapted from the random() function in 4.4 BSD: 35127c478bd9Sstevel@tonic-gate */ 35137c478bd9Sstevel@tonic-gate 35147c478bd9Sstevel@tonic-gate /* 35157c478bd9Sstevel@tonic-gate * Copyright (c) 1983, 1993 35167c478bd9Sstevel@tonic-gate * The Regents of the University of California. All rights reserved. 35177c478bd9Sstevel@tonic-gate * 35187c478bd9Sstevel@tonic-gate * Redistribution and use in source and binary forms, with or without 35197c478bd9Sstevel@tonic-gate * modification, are permitted provided that the following conditions 35207c478bd9Sstevel@tonic-gate * are met: 35217c478bd9Sstevel@tonic-gate * 1. Redistributions of source code must retain the above copyright 35227c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer. 35237c478bd9Sstevel@tonic-gate * 2. 
Redistributions in binary form must reproduce the above copyright 35247c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer in the 35257c478bd9Sstevel@tonic-gate * documentation and/or other materials provided with the distribution. 35267c478bd9Sstevel@tonic-gate * 3. All advertising materials mentioning features or use of this software 35277c478bd9Sstevel@tonic-gate * must display the following acknowledgement: 35287c478bd9Sstevel@tonic-gate * This product includes software developed by the University of 35297c478bd9Sstevel@tonic-gate * California, Berkeley and its contributors. 35307c478bd9Sstevel@tonic-gate * 4. Neither the name of the University nor the names of its contributors 35317c478bd9Sstevel@tonic-gate * may be used to endorse or promote products derived from this software 35327c478bd9Sstevel@tonic-gate * without specific prior written permission. 35337c478bd9Sstevel@tonic-gate * 35347c478bd9Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 35357c478bd9Sstevel@tonic-gate * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35367c478bd9Sstevel@tonic-gate * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35377c478bd9Sstevel@tonic-gate * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35387c478bd9Sstevel@tonic-gate * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35397c478bd9Sstevel@tonic-gate * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35407c478bd9Sstevel@tonic-gate * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35417c478bd9Sstevel@tonic-gate * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35427c478bd9Sstevel@tonic-gate * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35437c478bd9Sstevel@tonic-gate * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35447c478bd9Sstevel@tonic-gate * SUCH DAMAGE. 35457c478bd9Sstevel@tonic-gate */ 35467c478bd9Sstevel@tonic-gate 35477c478bd9Sstevel@tonic-gate /* Type 3 -- x**31 + x**3 + 1 */ 35487c478bd9Sstevel@tonic-gate #define DEG_3 31 35497c478bd9Sstevel@tonic-gate #define SEP_3 3 35507c478bd9Sstevel@tonic-gate 35517c478bd9Sstevel@tonic-gate 35527c478bd9Sstevel@tonic-gate /* Protected by tcp_random_lock */ 35537c478bd9Sstevel@tonic-gate static int tcp_randtbl[DEG_3 + 1]; 35547c478bd9Sstevel@tonic-gate 35557c478bd9Sstevel@tonic-gate static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 35567c478bd9Sstevel@tonic-gate static int *tcp_random_rptr = &tcp_randtbl[1]; 35577c478bd9Sstevel@tonic-gate 35587c478bd9Sstevel@tonic-gate static int *tcp_random_state = &tcp_randtbl[1]; 35597c478bd9Sstevel@tonic-gate static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 35607c478bd9Sstevel@tonic-gate 35617c478bd9Sstevel@tonic-gate kmutex_t tcp_random_lock; 35627c478bd9Sstevel@tonic-gate 35637c478bd9Sstevel@tonic-gate void 35647c478bd9Sstevel@tonic-gate tcp_random_init(void) 35657c478bd9Sstevel@tonic-gate { 35667c478bd9Sstevel@tonic-gate int i; 35677c478bd9Sstevel@tonic-gate hrtime_t hrt; 35687c478bd9Sstevel@tonic-gate time_t wallclock; 35697c478bd9Sstevel@tonic-gate uint64_t result; 
35707c478bd9Sstevel@tonic-gate 35717c478bd9Sstevel@tonic-gate /* 35727c478bd9Sstevel@tonic-gate * Use high-res timer and current time for seed. Gethrtime() returns 35737c478bd9Sstevel@tonic-gate * a longlong, which may contain resolution down to nanoseconds. 35747c478bd9Sstevel@tonic-gate * The current time will either be a 32-bit or a 64-bit quantity. 35757c478bd9Sstevel@tonic-gate * XOR the two together in a 64-bit result variable. 35767c478bd9Sstevel@tonic-gate * Convert the result to a 32-bit value by multiplying the high-order 35777c478bd9Sstevel@tonic-gate * 32-bits by the low-order 32-bits. 35787c478bd9Sstevel@tonic-gate */ 35797c478bd9Sstevel@tonic-gate 35807c478bd9Sstevel@tonic-gate hrt = gethrtime(); 35817c478bd9Sstevel@tonic-gate (void) drv_getparm(TIME, &wallclock); 35827c478bd9Sstevel@tonic-gate result = (uint64_t)wallclock ^ (uint64_t)hrt; 35837c478bd9Sstevel@tonic-gate mutex_enter(&tcp_random_lock); 35847c478bd9Sstevel@tonic-gate tcp_random_state[0] = ((result >> 32) & 0xffffffff) * 35857c478bd9Sstevel@tonic-gate (result & 0xffffffff); 35867c478bd9Sstevel@tonic-gate 35877c478bd9Sstevel@tonic-gate for (i = 1; i < DEG_3; i++) 35887c478bd9Sstevel@tonic-gate tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 35897c478bd9Sstevel@tonic-gate + 12345; 35907c478bd9Sstevel@tonic-gate tcp_random_fptr = &tcp_random_state[SEP_3]; 35917c478bd9Sstevel@tonic-gate tcp_random_rptr = &tcp_random_state[0]; 35927c478bd9Sstevel@tonic-gate mutex_exit(&tcp_random_lock); 35937c478bd9Sstevel@tonic-gate for (i = 0; i < 10 * DEG_3; i++) 35947c478bd9Sstevel@tonic-gate (void) tcp_random(); 35957c478bd9Sstevel@tonic-gate } 35967c478bd9Sstevel@tonic-gate 35977c478bd9Sstevel@tonic-gate /* 35987c478bd9Sstevel@tonic-gate * tcp_random: Return a random number in the range [1 - (128K + 1)]. 35997c478bd9Sstevel@tonic-gate * This range is selected to be approximately centered on TCP_ISS / 2, 36007c478bd9Sstevel@tonic-gate * and easy to compute. 
We get this value by generating a 32-bit random 36017c478bd9Sstevel@tonic-gate * number, selecting out the high-order 17 bits, and then adding one so 36027c478bd9Sstevel@tonic-gate * that we never return zero. 36037c478bd9Sstevel@tonic-gate */ 36047c478bd9Sstevel@tonic-gate int 36057c478bd9Sstevel@tonic-gate tcp_random(void) 36067c478bd9Sstevel@tonic-gate { 36077c478bd9Sstevel@tonic-gate int i; 36087c478bd9Sstevel@tonic-gate 36097c478bd9Sstevel@tonic-gate mutex_enter(&tcp_random_lock); 36107c478bd9Sstevel@tonic-gate *tcp_random_fptr += *tcp_random_rptr; 36117c478bd9Sstevel@tonic-gate 36127c478bd9Sstevel@tonic-gate /* 36137c478bd9Sstevel@tonic-gate * The high-order bits are more random than the low-order bits, 36147c478bd9Sstevel@tonic-gate * so we select out the high-order 17 bits and add one so that 36157c478bd9Sstevel@tonic-gate * we never return zero. 36167c478bd9Sstevel@tonic-gate */ 36177c478bd9Sstevel@tonic-gate i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 36187c478bd9Sstevel@tonic-gate if (++tcp_random_fptr >= tcp_random_end_ptr) { 36197c478bd9Sstevel@tonic-gate tcp_random_fptr = tcp_random_state; 36207c478bd9Sstevel@tonic-gate ++tcp_random_rptr; 36217c478bd9Sstevel@tonic-gate } else if (++tcp_random_rptr >= tcp_random_end_ptr) 36227c478bd9Sstevel@tonic-gate tcp_random_rptr = tcp_random_state; 36237c478bd9Sstevel@tonic-gate 36247c478bd9Sstevel@tonic-gate mutex_exit(&tcp_random_lock); 36257c478bd9Sstevel@tonic-gate return (i); 36267c478bd9Sstevel@tonic-gate } 36277c478bd9Sstevel@tonic-gate 36287c478bd9Sstevel@tonic-gate /* 36297c478bd9Sstevel@tonic-gate * Split this function out so that if the secret changes, I'm okay. 36307c478bd9Sstevel@tonic-gate * 36317c478bd9Sstevel@tonic-gate * Initialize the tcp_iss_cookie and tcp_iss_key. 
36327c478bd9Sstevel@tonic-gate */ 36337c478bd9Sstevel@tonic-gate 36347c478bd9Sstevel@tonic-gate #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ 36357c478bd9Sstevel@tonic-gate 36366e91bba0SGirish Moodalbail void 3637f4b3ec61Sdh155122 tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *tcps) 36387c478bd9Sstevel@tonic-gate { 36397c478bd9Sstevel@tonic-gate struct { 36407c478bd9Sstevel@tonic-gate int32_t current_time; 36417c478bd9Sstevel@tonic-gate uint32_t randnum; 36427c478bd9Sstevel@tonic-gate uint16_t pad; 36437c478bd9Sstevel@tonic-gate uint8_t ether[6]; 36447c478bd9Sstevel@tonic-gate uint8_t passwd[PASSWD_SIZE]; 36457c478bd9Sstevel@tonic-gate } tcp_iss_cookie; 36467c478bd9Sstevel@tonic-gate time_t t; 36477c478bd9Sstevel@tonic-gate 36487c478bd9Sstevel@tonic-gate /* 36497c478bd9Sstevel@tonic-gate * Start with the current absolute time. 36507c478bd9Sstevel@tonic-gate */ 36517c478bd9Sstevel@tonic-gate (void) drv_getparm(TIME, &t); 36527c478bd9Sstevel@tonic-gate tcp_iss_cookie.current_time = t; 36537c478bd9Sstevel@tonic-gate 36547c478bd9Sstevel@tonic-gate /* 36557c478bd9Sstevel@tonic-gate * XXX - Need a more random number per RFC 1750, not this crap. 36567c478bd9Sstevel@tonic-gate * OTOH, if what follows is pretty random, then I'm in better shape. 36577c478bd9Sstevel@tonic-gate */ 36587c478bd9Sstevel@tonic-gate tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random()); 36597c478bd9Sstevel@tonic-gate tcp_iss_cookie.pad = 0x365c; /* Picked from HMAC pad values. */ 36607c478bd9Sstevel@tonic-gate 36617c478bd9Sstevel@tonic-gate /* 36627c478bd9Sstevel@tonic-gate * The cpu_type_info is pretty non-random. Ugggh. It does serve 36637c478bd9Sstevel@tonic-gate * as a good template. 
36647c478bd9Sstevel@tonic-gate */ 36657c478bd9Sstevel@tonic-gate bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd, 36667c478bd9Sstevel@tonic-gate min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info))); 36677c478bd9Sstevel@tonic-gate 36687c478bd9Sstevel@tonic-gate /* 36697c478bd9Sstevel@tonic-gate * The pass-phrase. Normally this is supplied by user-called NDD. 36707c478bd9Sstevel@tonic-gate */ 36717c478bd9Sstevel@tonic-gate bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len)); 36727c478bd9Sstevel@tonic-gate 36737c478bd9Sstevel@tonic-gate /* 36747c478bd9Sstevel@tonic-gate * See 4010593 if this section becomes a problem again, 36757c478bd9Sstevel@tonic-gate * but the local ethernet address is useful here. 36767c478bd9Sstevel@tonic-gate */ 36777c478bd9Sstevel@tonic-gate (void) localetheraddr(NULL, 36787c478bd9Sstevel@tonic-gate (struct ether_addr *)&tcp_iss_cookie.ether); 36797c478bd9Sstevel@tonic-gate 36807c478bd9Sstevel@tonic-gate /* 36817c478bd9Sstevel@tonic-gate * Hash 'em all together. The MD5Final is called per-connection. 
36827c478bd9Sstevel@tonic-gate */ 3683f4b3ec61Sdh155122 mutex_enter(&tcps->tcps_iss_key_lock); 3684f4b3ec61Sdh155122 MD5Init(&tcps->tcps_iss_key); 3685f4b3ec61Sdh155122 MD5Update(&tcps->tcps_iss_key, (uchar_t *)&tcp_iss_cookie, 36867c478bd9Sstevel@tonic-gate sizeof (tcp_iss_cookie)); 3687f4b3ec61Sdh155122 mutex_exit(&tcps->tcps_iss_key_lock); 36887c478bd9Sstevel@tonic-gate } 36897c478bd9Sstevel@tonic-gate 36907c478bd9Sstevel@tonic-gate /* 3691f4b3ec61Sdh155122 * Called by IP when IP is loaded into the kernel 36927c478bd9Sstevel@tonic-gate */ 3693f4b3ec61Sdh155122 void 3694f4b3ec61Sdh155122 tcp_ddi_g_init(void) 3695f4b3ec61Sdh155122 { 36967c478bd9Sstevel@tonic-gate tcp_timercache = kmem_cache_create("tcp_timercache", 36977c478bd9Sstevel@tonic-gate sizeof (tcp_timer_t) + sizeof (mblk_t), 0, 36987c478bd9Sstevel@tonic-gate NULL, NULL, NULL, NULL, NULL, 0); 36997c478bd9Sstevel@tonic-gate 370066cd0f60SKacheong Poon tcp_notsack_blk_cache = kmem_cache_create("tcp_notsack_blk_cache", 370166cd0f60SKacheong Poon sizeof (notsack_blk_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 37027c478bd9Sstevel@tonic-gate 3703f4b3ec61Sdh155122 mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); 37047c478bd9Sstevel@tonic-gate 37057c478bd9Sstevel@tonic-gate /* Initialize the random number generator */ 37067c478bd9Sstevel@tonic-gate tcp_random_init(); 37077c478bd9Sstevel@tonic-gate 3708f4b3ec61Sdh155122 /* A single callback independently of how many netstacks we have */ 3709f4b3ec61Sdh155122 ip_squeue_init(tcp_squeue_add); 3710f4b3ec61Sdh155122 3711f4b3ec61Sdh155122 tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); 3712f4b3ec61Sdh155122 3713da14cebeSEric Cheng tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); 3714da14cebeSEric Cheng 3715f4b3ec61Sdh155122 /* 3716f4b3ec61Sdh155122 * We want to be informed each time a stack is created or 3717f4b3ec61Sdh155122 * destroyed in the kernel, so we can maintain the 3718f4b3ec61Sdh155122 * set of tcp_stack_t's. 
3719f4b3ec61Sdh155122 */ 3720bd670b35SErik Nordmark netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini); 3721f4b3ec61Sdh155122 } 3722f4b3ec61Sdh155122 3723f4b3ec61Sdh155122 37240f1702c5SYu Xiangning #define INET_NAME "ip" 37250f1702c5SYu Xiangning 3726f4b3ec61Sdh155122 /* 3727f4b3ec61Sdh155122 * Initialize the TCP stack instance. 3728f4b3ec61Sdh155122 */ 3729f4b3ec61Sdh155122 static void * 3730f4b3ec61Sdh155122 tcp_stack_init(netstackid_t stackid, netstack_t *ns) 3731f4b3ec61Sdh155122 { 3732f4b3ec61Sdh155122 tcp_stack_t *tcps; 3733f4b3ec61Sdh155122 int i; 37340f1702c5SYu Xiangning int error = 0; 37350f1702c5SYu Xiangning major_t major; 37366e91bba0SGirish Moodalbail size_t arrsz; 3737f4b3ec61Sdh155122 3738f4b3ec61Sdh155122 tcps = (tcp_stack_t *)kmem_zalloc(sizeof (*tcps), KM_SLEEP); 3739f4b3ec61Sdh155122 tcps->tcps_netstack = ns; 3740f4b3ec61Sdh155122 3741f4b3ec61Sdh155122 /* Initialize locks */ 3742f4b3ec61Sdh155122 mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); 3743f4b3ec61Sdh155122 mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); 3744f4b3ec61Sdh155122 3745f4b3ec61Sdh155122 tcps->tcps_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; 37466e91bba0SGirish Moodalbail tcps->tcps_g_epriv_ports[0] = ULP_DEF_EPRIV_PORT1; 37476e91bba0SGirish Moodalbail tcps->tcps_g_epriv_ports[1] = ULP_DEF_EPRIV_PORT2; 3748f4b3ec61Sdh155122 tcps->tcps_min_anonpriv_port = 512; 3749f4b3ec61Sdh155122 3750f4b3ec61Sdh155122 tcps->tcps_bind_fanout = kmem_zalloc(sizeof (tf_t) * 3751f4b3ec61Sdh155122 TCP_BIND_FANOUT_SIZE, KM_SLEEP); 3752f4b3ec61Sdh155122 tcps->tcps_acceptor_fanout = kmem_zalloc(sizeof (tf_t) * 375393fcb0b9SKacheong Poon TCP_ACCEPTOR_FANOUT_SIZE, KM_SLEEP); 3754f4b3ec61Sdh155122 3755f4b3ec61Sdh155122 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 3756f4b3ec61Sdh155122 mutex_init(&tcps->tcps_bind_fanout[i].tf_lock, NULL, 3757f4b3ec61Sdh155122 MUTEX_DEFAULT, NULL); 3758f4b3ec61Sdh155122 } 3759f4b3ec61Sdh155122 376093fcb0b9SKacheong Poon for (i 
= 0; i < TCP_ACCEPTOR_FANOUT_SIZE; i++) { 3761f4b3ec61Sdh155122 mutex_init(&tcps->tcps_acceptor_fanout[i].tf_lock, NULL, 3762f4b3ec61Sdh155122 MUTEX_DEFAULT, NULL); 3763f4b3ec61Sdh155122 } 3764f4b3ec61Sdh155122 3765f4b3ec61Sdh155122 /* TCP's IPsec code calls the packet dropper. */ 3766f4b3ec61Sdh155122 ip_drop_register(&tcps->tcps_dropper, "TCP IPsec policy enforcement"); 3767f4b3ec61Sdh155122 37686e91bba0SGirish Moodalbail arrsz = tcp_propinfo_count * sizeof (mod_prop_info_t); 37696e91bba0SGirish Moodalbail tcps->tcps_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, 37706e91bba0SGirish Moodalbail KM_SLEEP); 37716e91bba0SGirish Moodalbail bcopy(tcp_propinfo_tbl, tcps->tcps_propinfo_tbl, arrsz); 3772f4b3ec61Sdh155122 3773f4b3ec61Sdh155122 /* 3774f4b3ec61Sdh155122 * Note: To really walk the device tree you need the devinfo 3775f4b3ec61Sdh155122 * pointer to your device which is only available after probe/attach. 3776f4b3ec61Sdh155122 * The following is safe only because it uses ddi_root_node() 3777f4b3ec61Sdh155122 */ 3778f4b3ec61Sdh155122 tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, 3779f4b3ec61Sdh155122 tcp_opt_obj.odb_opt_arr_cnt); 3780f4b3ec61Sdh155122 37817c478bd9Sstevel@tonic-gate /* 37827c478bd9Sstevel@tonic-gate * Initialize RFC 1948 secret values. This will probably be reset once 37837c478bd9Sstevel@tonic-gate * by the boot scripts. 37847c478bd9Sstevel@tonic-gate * 37857c478bd9Sstevel@tonic-gate * Use NULL name, as the name is caught by the new lockstats. 37867c478bd9Sstevel@tonic-gate * 37877c478bd9Sstevel@tonic-gate * Initialize with some random, non-guessable string, like the global 37887c478bd9Sstevel@tonic-gate * T_INFO_ACK. 
37897c478bd9Sstevel@tonic-gate */ 37907c478bd9Sstevel@tonic-gate 37917c478bd9Sstevel@tonic-gate tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, 3792f4b3ec61Sdh155122 sizeof (tcp_g_t_info_ack), tcps); 37937c478bd9Sstevel@tonic-gate 3794721fffe3SKacheong Poon tcps->tcps_kstat = tcp_kstat2_init(stackid); 3795721fffe3SKacheong Poon tcps->tcps_mibkp = tcp_kstat_init(stackid); 3796f4b3ec61Sdh155122 37970f1702c5SYu Xiangning major = mod_name_to_major(INET_NAME); 37980f1702c5SYu Xiangning error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); 37990f1702c5SYu Xiangning ASSERT(error == 0); 3800bd670b35SErik Nordmark tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); 3801bd670b35SErik Nordmark ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); 38027c6d7024SJerry Jelinek cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); 38037c6d7024SJerry Jelinek cv_init(&tcps->tcps_ixa_cleanup_done_cv, NULL, CV_DEFAULT, NULL); 3804bd670b35SErik Nordmark mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); 3805bd670b35SErik Nordmark 380693fcb0b9SKacheong Poon mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 380793fcb0b9SKacheong Poon tcps->tcps_reclaim = B_FALSE; 380893fcb0b9SKacheong Poon tcps->tcps_reclaim_tid = 0; 3809721fffe3SKacheong Poon tcps->tcps_reclaim_period = tcps->tcps_rexmit_interval_max; 3810721fffe3SKacheong Poon 3811721fffe3SKacheong Poon /* 3812721fffe3SKacheong Poon * ncpus is the current number of CPUs, which can be bigger than 3813721fffe3SKacheong Poon * boot_ncpus. But we don't want to use ncpus to allocate all the 3814721fffe3SKacheong Poon * tcp_stats_cpu_t at system boot up time since it will be 1. While 3815721fffe3SKacheong Poon * we handle adding CPU in tcp_cpu_update(), it will be slow if 3816721fffe3SKacheong Poon * there are many CPUs as we will be adding them 1 by 1. 
3817721fffe3SKacheong Poon * 3818721fffe3SKacheong Poon * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers 3819721fffe3SKacheong Poon * are not freed until the stack is going away. So there is no need 3820721fffe3SKacheong Poon * to grab a lock to access the per CPU tcps_sc[x] pointer. 3821721fffe3SKacheong Poon */ 38225dd46ab5SKacheong Poon mutex_enter(&cpu_lock); 3823721fffe3SKacheong Poon tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus); 38245dd46ab5SKacheong Poon mutex_exit(&cpu_lock); 3825721fffe3SKacheong Poon tcps->tcps_sc = kmem_zalloc(max_ncpus * sizeof (tcp_stats_cpu_t *), 3826721fffe3SKacheong Poon KM_SLEEP); 3827721fffe3SKacheong Poon for (i = 0; i < tcps->tcps_sc_cnt; i++) { 3828721fffe3SKacheong Poon tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t), 3829721fffe3SKacheong Poon KM_SLEEP); 3830721fffe3SKacheong Poon } 383193fcb0b9SKacheong Poon 383293fcb0b9SKacheong Poon mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL); 383393fcb0b9SKacheong Poon list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t), 383493fcb0b9SKacheong Poon offsetof(tcp_listener_t, tl_link)); 383593fcb0b9SKacheong Poon 3836f4b3ec61Sdh155122 return (tcps); 38377c478bd9Sstevel@tonic-gate } 3838ff550d0eSmasputra 3839f4b3ec61Sdh155122 /* 3840f4b3ec61Sdh155122 * Called when the IP module is about to be unloaded. 
3841f4b3ec61Sdh155122 */ 38427c478bd9Sstevel@tonic-gate void 3843f4b3ec61Sdh155122 tcp_ddi_g_destroy(void) 38447c478bd9Sstevel@tonic-gate { 3845f4b3ec61Sdh155122 tcp_g_kstat_fini(tcp_g_kstat); 3846f4b3ec61Sdh155122 tcp_g_kstat = NULL; 3847f4b3ec61Sdh155122 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics)); 38487c478bd9Sstevel@tonic-gate 38497c478bd9Sstevel@tonic-gate mutex_destroy(&tcp_random_lock); 38507c478bd9Sstevel@tonic-gate 38517c478bd9Sstevel@tonic-gate kmem_cache_destroy(tcp_timercache); 385266cd0f60SKacheong Poon kmem_cache_destroy(tcp_notsack_blk_cache); 38537c478bd9Sstevel@tonic-gate 3854f4b3ec61Sdh155122 netstack_unregister(NS_TCP); 3855f4b3ec61Sdh155122 } 3856f4b3ec61Sdh155122 3857f4b3ec61Sdh155122 /* 3858f4b3ec61Sdh155122 * Free the TCP stack instance. 3859f4b3ec61Sdh155122 */ 3860f4b3ec61Sdh155122 static void 3861f4b3ec61Sdh155122 tcp_stack_fini(netstackid_t stackid, void *arg) 3862f4b3ec61Sdh155122 { 3863f4b3ec61Sdh155122 tcp_stack_t *tcps = (tcp_stack_t *)arg; 3864f4b3ec61Sdh155122 int i; 3865f4b3ec61Sdh155122 3866bd670b35SErik Nordmark freeb(tcps->tcps_ixa_cleanup_mp); 3867bd670b35SErik Nordmark tcps->tcps_ixa_cleanup_mp = NULL; 38687c6d7024SJerry Jelinek cv_destroy(&tcps->tcps_ixa_cleanup_ready_cv); 38697c6d7024SJerry Jelinek cv_destroy(&tcps->tcps_ixa_cleanup_done_cv); 3870bd670b35SErik Nordmark mutex_destroy(&tcps->tcps_ixa_cleanup_lock); 3871bd670b35SErik Nordmark 3872721fffe3SKacheong Poon /* 3873721fffe3SKacheong Poon * Set tcps_reclaim to false tells tcp_reclaim_timer() not to restart 3874721fffe3SKacheong Poon * the timer. 
3875721fffe3SKacheong Poon */ 3876721fffe3SKacheong Poon mutex_enter(&tcps->tcps_reclaim_lock); 3877721fffe3SKacheong Poon tcps->tcps_reclaim = B_FALSE; 3878721fffe3SKacheong Poon mutex_exit(&tcps->tcps_reclaim_lock); 387993fcb0b9SKacheong Poon if (tcps->tcps_reclaim_tid != 0) 388093fcb0b9SKacheong Poon (void) untimeout(tcps->tcps_reclaim_tid); 388193fcb0b9SKacheong Poon mutex_destroy(&tcps->tcps_reclaim_lock); 388293fcb0b9SKacheong Poon 388393fcb0b9SKacheong Poon tcp_listener_conf_cleanup(tcps); 388493fcb0b9SKacheong Poon 3885721fffe3SKacheong Poon for (i = 0; i < tcps->tcps_sc_cnt; i++) 3886721fffe3SKacheong Poon kmem_free(tcps->tcps_sc[i], sizeof (tcp_stats_cpu_t)); 3887721fffe3SKacheong Poon kmem_free(tcps->tcps_sc, max_ncpus * sizeof (tcp_stats_cpu_t *)); 3888721fffe3SKacheong Poon 38896e91bba0SGirish Moodalbail kmem_free(tcps->tcps_propinfo_tbl, 38906e91bba0SGirish Moodalbail tcp_propinfo_count * sizeof (mod_prop_info_t)); 38916e91bba0SGirish Moodalbail tcps->tcps_propinfo_tbl = NULL; 3892f4b3ec61Sdh155122 3893f4b3ec61Sdh155122 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 3894f4b3ec61Sdh155122 ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); 3895f4b3ec61Sdh155122 mutex_destroy(&tcps->tcps_bind_fanout[i].tf_lock); 3896f4b3ec61Sdh155122 } 3897f4b3ec61Sdh155122 389893fcb0b9SKacheong Poon for (i = 0; i < TCP_ACCEPTOR_FANOUT_SIZE; i++) { 3899f4b3ec61Sdh155122 ASSERT(tcps->tcps_acceptor_fanout[i].tf_tcp == NULL); 3900f4b3ec61Sdh155122 mutex_destroy(&tcps->tcps_acceptor_fanout[i].tf_lock); 3901f4b3ec61Sdh155122 } 3902f4b3ec61Sdh155122 3903f4b3ec61Sdh155122 kmem_free(tcps->tcps_bind_fanout, sizeof (tf_t) * TCP_BIND_FANOUT_SIZE); 3904f4b3ec61Sdh155122 tcps->tcps_bind_fanout = NULL; 3905f4b3ec61Sdh155122 390693fcb0b9SKacheong Poon kmem_free(tcps->tcps_acceptor_fanout, sizeof (tf_t) * 390793fcb0b9SKacheong Poon TCP_ACCEPTOR_FANOUT_SIZE); 3908f4b3ec61Sdh155122 tcps->tcps_acceptor_fanout = NULL; 3909f4b3ec61Sdh155122 3910f4b3ec61Sdh155122 
mutex_destroy(&tcps->tcps_iss_key_lock); 3911f4b3ec61Sdh155122 mutex_destroy(&tcps->tcps_epriv_port_lock); 3912f4b3ec61Sdh155122 3913f4b3ec61Sdh155122 ip_drop_unregister(&tcps->tcps_dropper); 3914f4b3ec61Sdh155122 3915f4b3ec61Sdh155122 tcp_kstat2_fini(stackid, tcps->tcps_kstat); 3916f4b3ec61Sdh155122 tcps->tcps_kstat = NULL; 3917f4b3ec61Sdh155122 3918f4b3ec61Sdh155122 tcp_kstat_fini(stackid, tcps->tcps_mibkp); 3919f4b3ec61Sdh155122 tcps->tcps_mibkp = NULL; 3920f4b3ec61Sdh155122 39210f1702c5SYu Xiangning ldi_ident_release(tcps->tcps_ldi_ident); 3922f4b3ec61Sdh155122 kmem_free(tcps, sizeof (*tcps)); 39237c478bd9Sstevel@tonic-gate } 39247c478bd9Sstevel@tonic-gate 39257c478bd9Sstevel@tonic-gate /* 39267c478bd9Sstevel@tonic-gate * Generate ISS, taking into account NDD changes may happen halfway through. 39277c478bd9Sstevel@tonic-gate * (If the iss is not zero, set it.) 39287c478bd9Sstevel@tonic-gate */ 39297c478bd9Sstevel@tonic-gate 39307c478bd9Sstevel@tonic-gate static void 39317c478bd9Sstevel@tonic-gate tcp_iss_init(tcp_t *tcp) 39327c478bd9Sstevel@tonic-gate { 39337c478bd9Sstevel@tonic-gate MD5_CTX context; 39347c478bd9Sstevel@tonic-gate struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; 39357c478bd9Sstevel@tonic-gate uint32_t answer[4]; 3936f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 3937bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 39387c478bd9Sstevel@tonic-gate 3939c0e6663fSJerry Jelinek tcps->tcps_iss_incr_extra += (tcps->tcps_iss_incr >> 1); 3940f4b3ec61Sdh155122 tcp->tcp_iss = tcps->tcps_iss_incr_extra; 3941f4b3ec61Sdh155122 switch (tcps->tcps_strong_iss) { 39427c478bd9Sstevel@tonic-gate case 2: 3943f4b3ec61Sdh155122 mutex_enter(&tcps->tcps_iss_key_lock); 3944f4b3ec61Sdh155122 context = tcps->tcps_iss_key; 3945f4b3ec61Sdh155122 mutex_exit(&tcps->tcps_iss_key_lock); 3946bd670b35SErik Nordmark arg.ports = connp->conn_ports; 3947bd670b35SErik Nordmark arg.src = connp->conn_laddr_v6; 3948bd670b35SErik Nordmark arg.dst = 
connp->conn_faddr_v6; 39497c478bd9Sstevel@tonic-gate MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); 39507c478bd9Sstevel@tonic-gate MD5Final((uchar_t *)answer, &context); 39517c478bd9Sstevel@tonic-gate tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; 39527c478bd9Sstevel@tonic-gate /* 39537c478bd9Sstevel@tonic-gate * Now that we've hashed into a unique per-connection sequence 39547c478bd9Sstevel@tonic-gate * space, add a random increment per strong_iss == 1. So I 39557c478bd9Sstevel@tonic-gate * guess we'll have to... 39567c478bd9Sstevel@tonic-gate */ 39577c478bd9Sstevel@tonic-gate /* FALLTHRU */ 39587c478bd9Sstevel@tonic-gate case 1: 39597c478bd9Sstevel@tonic-gate tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random(); 39607c478bd9Sstevel@tonic-gate break; 39617c478bd9Sstevel@tonic-gate default: 3962c0e6663fSJerry Jelinek tcp->tcp_iss += (uint32_t)gethrestime_sec() * 3963c0e6663fSJerry Jelinek tcps->tcps_iss_incr; 39647c478bd9Sstevel@tonic-gate break; 39657c478bd9Sstevel@tonic-gate } 39667c478bd9Sstevel@tonic-gate tcp->tcp_valid_bits = TCP_ISS_VALID; 39677c478bd9Sstevel@tonic-gate tcp->tcp_fss = tcp->tcp_iss - 1; 39687c478bd9Sstevel@tonic-gate tcp->tcp_suna = tcp->tcp_iss; 39697c478bd9Sstevel@tonic-gate tcp->tcp_snxt = tcp->tcp_iss + 1; 39707c478bd9Sstevel@tonic-gate tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 39717c478bd9Sstevel@tonic-gate tcp->tcp_csuna = tcp->tcp_snxt; 39727c478bd9Sstevel@tonic-gate } 39737c478bd9Sstevel@tonic-gate 39747c478bd9Sstevel@tonic-gate /* 3975ff550d0eSmasputra * tcp_{set,clr}qfull() functions are used to either set or clear QFULL 3976ff550d0eSmasputra * on the specified backing STREAMS q. Note, the caller may make the 3977ff550d0eSmasputra * decision to call based on the tcp_t.tcp_flow_stopped value which 3978ff550d0eSmasputra * when check outside the q's lock is only an advisory check ... 
3979ff550d0eSmasputra */ 3980ff550d0eSmasputra void 39817c478bd9Sstevel@tonic-gate tcp_setqfull(tcp_t *tcp) 39827c478bd9Sstevel@tonic-gate { 3983f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 39840f1702c5SYu Xiangning conn_t *connp = tcp->tcp_connp; 39850f1702c5SYu Xiangning 39860f1702c5SYu Xiangning if (tcp->tcp_closed) 39870f1702c5SYu Xiangning return; 39880f1702c5SYu Xiangning 3989bd670b35SErik Nordmark conn_setqfull(connp, &tcp->tcp_flow_stopped); 3990bd670b35SErik Nordmark if (tcp->tcp_flow_stopped) 3991f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_flwctl_on); 39920f1702c5SYu Xiangning } 39937c478bd9Sstevel@tonic-gate 3994ff550d0eSmasputra void 39957c478bd9Sstevel@tonic-gate tcp_clrqfull(tcp_t *tcp) 39967c478bd9Sstevel@tonic-gate { 39970f1702c5SYu Xiangning conn_t *connp = tcp->tcp_connp; 39980f1702c5SYu Xiangning 39990f1702c5SYu Xiangning if (tcp->tcp_closed) 40000f1702c5SYu Xiangning return; 4001bd670b35SErik Nordmark conn_clrqfull(connp, &tcp->tcp_flow_stopped); 40020f1702c5SYu Xiangning } 4003f4b3ec61Sdh155122 4004da14cebeSEric Cheng static int 40057c478bd9Sstevel@tonic-gate tcp_squeue_switch(int val) 40067c478bd9Sstevel@tonic-gate { 4007da14cebeSEric Cheng int rval = SQ_FILL; 40087c478bd9Sstevel@tonic-gate 40097c478bd9Sstevel@tonic-gate switch (val) { 40107c478bd9Sstevel@tonic-gate case 1: 4011da14cebeSEric Cheng rval = SQ_NODRAIN; 40127c478bd9Sstevel@tonic-gate break; 40137c478bd9Sstevel@tonic-gate case 2: 4014da14cebeSEric Cheng rval = SQ_PROCESS; 40157c478bd9Sstevel@tonic-gate break; 40167c478bd9Sstevel@tonic-gate default: 40177c478bd9Sstevel@tonic-gate break; 40187c478bd9Sstevel@tonic-gate } 40197c478bd9Sstevel@tonic-gate return (rval); 40207c478bd9Sstevel@tonic-gate } 40217c478bd9Sstevel@tonic-gate 4022f4b3ec61Sdh155122 /* 4023f4b3ec61Sdh155122 * This is called once for each squeue - globally for all stack 4024f4b3ec61Sdh155122 * instances. 
4025f4b3ec61Sdh155122 */ 40267c478bd9Sstevel@tonic-gate static void 40277c478bd9Sstevel@tonic-gate tcp_squeue_add(squeue_t *sqp) 40287c478bd9Sstevel@tonic-gate { 40297c478bd9Sstevel@tonic-gate tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc( 40307c478bd9Sstevel@tonic-gate sizeof (tcp_squeue_priv_t), KM_SLEEP); 40317c478bd9Sstevel@tonic-gate 40327c478bd9Sstevel@tonic-gate *squeue_getprivate(sqp, SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; 40331dbf515bSethindra if (tcp_free_list_max_cnt == 0) { 40341dbf515bSethindra int tcp_ncpus = ((boot_max_ncpus == -1) ? 40351dbf515bSethindra max_ncpus : boot_max_ncpus); 40361dbf515bSethindra 40371dbf515bSethindra /* 40381dbf515bSethindra * Limit number of entries to 1% of availble memory / tcp_ncpus 40391dbf515bSethindra */ 40401dbf515bSethindra tcp_free_list_max_cnt = (freemem * PAGESIZE) / 40411dbf515bSethindra (tcp_ncpus * sizeof (tcp_t) * 100); 40421dbf515bSethindra } 40431dbf515bSethindra tcp_time_wait->tcp_free_list_cnt = 0; 40447c478bd9Sstevel@tonic-gate } 40450f1702c5SYu Xiangning /* 40460f1702c5SYu Xiangning * Return unix error is tli error is TSYSERR, otherwise return a negative 40470f1702c5SYu Xiangning * tli error. 
40480f1702c5SYu Xiangning */ 40490f1702c5SYu Xiangning int 40500f1702c5SYu Xiangning tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, 40510f1702c5SYu Xiangning boolean_t bind_to_req_port_only) 40520f1702c5SYu Xiangning { 40530f1702c5SYu Xiangning int error; 40540f1702c5SYu Xiangning tcp_t *tcp = connp->conn_tcp; 40550f1702c5SYu Xiangning 40560f1702c5SYu Xiangning if (tcp->tcp_state >= TCPS_BOUND) { 4057bd670b35SErik Nordmark if (connp->conn_debug) { 40580f1702c5SYu Xiangning (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 40590f1702c5SYu Xiangning "tcp_bind: bad state, %d", tcp->tcp_state); 40600f1702c5SYu Xiangning } 40610f1702c5SYu Xiangning return (-TOUTSTATE); 40620f1702c5SYu Xiangning } 40630f1702c5SYu Xiangning 40640f1702c5SYu Xiangning error = tcp_bind_check(connp, sa, len, cr, bind_to_req_port_only); 40650f1702c5SYu Xiangning if (error != 0) 40660f1702c5SYu Xiangning return (error); 40670f1702c5SYu Xiangning 40680f1702c5SYu Xiangning ASSERT(tcp->tcp_state == TCPS_BOUND); 40690f1702c5SYu Xiangning tcp->tcp_conn_req_max = 0; 4070bd670b35SErik Nordmark return (0); 40710f1702c5SYu Xiangning } 40720f1702c5SYu Xiangning 40730f1702c5SYu Xiangning /* 40740f1702c5SYu Xiangning * If the return value from this function is positive, it's a UNIX error. 40750f1702c5SYu Xiangning * Otherwise, if it's negative, then the absolute value is a TLI error. 40760f1702c5SYu Xiangning * the TPI routine tcp_tpi_connect() is a wrapper function for this. 
40770f1702c5SYu Xiangning */ 40780f1702c5SYu Xiangning int 40790f1702c5SYu Xiangning tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 40800f1702c5SYu Xiangning cred_t *cr, pid_t pid) 40810f1702c5SYu Xiangning { 40820f1702c5SYu Xiangning tcp_t *tcp = connp->conn_tcp; 40830f1702c5SYu Xiangning sin_t *sin = (sin_t *)sa; 40840f1702c5SYu Xiangning sin6_t *sin6 = (sin6_t *)sa; 40850f1702c5SYu Xiangning ipaddr_t *dstaddrp; 40860f1702c5SYu Xiangning in_port_t dstport; 40870f1702c5SYu Xiangning uint_t srcid; 4088bd670b35SErik Nordmark int error; 4089bd670b35SErik Nordmark uint32_t mss; 4090bd670b35SErik Nordmark mblk_t *syn_mp; 4091bd670b35SErik Nordmark tcp_stack_t *tcps = tcp->tcp_tcps; 4092bd670b35SErik Nordmark int32_t oldstate; 4093bd670b35SErik Nordmark ip_xmit_attr_t *ixa = connp->conn_ixa; 4094bd670b35SErik Nordmark 4095bd670b35SErik Nordmark oldstate = tcp->tcp_state; 40960f1702c5SYu Xiangning 40970f1702c5SYu Xiangning switch (len) { 40980f1702c5SYu Xiangning default: 40990f1702c5SYu Xiangning /* 41000f1702c5SYu Xiangning * Should never happen 41010f1702c5SYu Xiangning */ 41020f1702c5SYu Xiangning return (EINVAL); 41030f1702c5SYu Xiangning 41040f1702c5SYu Xiangning case sizeof (sin_t): 41050f1702c5SYu Xiangning sin = (sin_t *)sa; 41060f1702c5SYu Xiangning if (sin->sin_port == 0) { 41070f1702c5SYu Xiangning return (-TBADADDR); 41080f1702c5SYu Xiangning } 4109bd670b35SErik Nordmark if (connp->conn_ipv6_v6only) { 41100f1702c5SYu Xiangning return (EAFNOSUPPORT); 41110f1702c5SYu Xiangning } 41120f1702c5SYu Xiangning break; 41130f1702c5SYu Xiangning 41140f1702c5SYu Xiangning case sizeof (sin6_t): 41150f1702c5SYu Xiangning sin6 = (sin6_t *)sa; 41160f1702c5SYu Xiangning if (sin6->sin6_port == 0) { 41170f1702c5SYu Xiangning return (-TBADADDR); 41180f1702c5SYu Xiangning } 41190f1702c5SYu Xiangning break; 41200f1702c5SYu Xiangning } 41210f1702c5SYu Xiangning /* 41220f1702c5SYu Xiangning * If we're connecting to an IPv4-mapped IPv6 address, we need to 
4123bd670b35SErik Nordmark * make sure that the conn_ipversion is IPV4_VERSION. We 41240f1702c5SYu Xiangning * need to this before we call tcp_bindi() so that the port lookup 41250f1702c5SYu Xiangning * code will look for ports in the correct port space (IPv4 and 41260f1702c5SYu Xiangning * IPv6 have separate port spaces). 41270f1702c5SYu Xiangning */ 4128bd670b35SErik Nordmark if (connp->conn_family == AF_INET6 && 4129bd670b35SErik Nordmark connp->conn_ipversion == IPV6_VERSION && 41300f1702c5SYu Xiangning IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4131bd670b35SErik Nordmark if (connp->conn_ipv6_v6only) 4132bd670b35SErik Nordmark return (EADDRNOTAVAIL); 41330f1702c5SYu Xiangning 4134bd670b35SErik Nordmark connp->conn_ipversion = IPV4_VERSION; 41350f1702c5SYu Xiangning } 41360f1702c5SYu Xiangning 41370f1702c5SYu Xiangning switch (tcp->tcp_state) { 41380f1702c5SYu Xiangning case TCPS_LISTEN: 41390f1702c5SYu Xiangning /* 41400f1702c5SYu Xiangning * Listening sockets are not allowed to issue connect(). 
41410f1702c5SYu Xiangning */ 41420f1702c5SYu Xiangning if (IPCL_IS_NONSTR(connp)) 41430f1702c5SYu Xiangning return (EOPNOTSUPP); 41440f1702c5SYu Xiangning /* FALLTHRU */ 41450f1702c5SYu Xiangning case TCPS_IDLE: 41460f1702c5SYu Xiangning /* 41470f1702c5SYu Xiangning * We support quick connect, refer to comments in 41480f1702c5SYu Xiangning * tcp_connect_*() 41490f1702c5SYu Xiangning */ 41500f1702c5SYu Xiangning /* FALLTHRU */ 41510f1702c5SYu Xiangning case TCPS_BOUND: 4152bd670b35SErik Nordmark break; 4153bd670b35SErik Nordmark default: 4154bd670b35SErik Nordmark return (-TOUTSTATE); 41550f1702c5SYu Xiangning } 4156bd670b35SErik Nordmark 4157bd670b35SErik Nordmark /* 4158bd670b35SErik Nordmark * We update our cred/cpid based on the caller of connect 4159bd670b35SErik Nordmark */ 4160bd670b35SErik Nordmark if (connp->conn_cred != cr) { 4161bd670b35SErik Nordmark crhold(cr); 4162bd670b35SErik Nordmark crfree(connp->conn_cred); 4163bd670b35SErik Nordmark connp->conn_cred = cr; 4164bd670b35SErik Nordmark } 4165bd670b35SErik Nordmark connp->conn_cpid = pid; 4166bd670b35SErik Nordmark 4167bd670b35SErik Nordmark /* Cache things in the ixa without any refhold */ 4168be4c8f74SErik Nordmark ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4169bd670b35SErik Nordmark ixa->ixa_cred = cr; 4170bd670b35SErik Nordmark ixa->ixa_cpid = pid; 4171bd670b35SErik Nordmark if (is_system_labeled()) { 4172bd670b35SErik Nordmark /* We need to restart with a label based on the cred */ 4173bd670b35SErik Nordmark ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4174bd670b35SErik Nordmark } 4175bd670b35SErik Nordmark 4176bd670b35SErik Nordmark if (connp->conn_family == AF_INET6) { 4177bd670b35SErik Nordmark if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4178bd670b35SErik Nordmark error = tcp_connect_ipv6(tcp, &sin6->sin6_addr, 4179bd670b35SErik Nordmark sin6->sin6_port, sin6->sin6_flowinfo, 4180bd670b35SErik Nordmark sin6->__sin6_src_id, sin6->sin6_scope_id); 4181bd670b35SErik Nordmark } else { 
41820f1702c5SYu Xiangning /* 41830f1702c5SYu Xiangning * Destination adress is mapped IPv6 address. 41840f1702c5SYu Xiangning * Source bound address should be unspecified or 41850f1702c5SYu Xiangning * IPv6 mapped address as well. 41860f1702c5SYu Xiangning */ 41870f1702c5SYu Xiangning if (!IN6_IS_ADDR_UNSPECIFIED( 4188bd670b35SErik Nordmark &connp->conn_bound_addr_v6) && 4189bd670b35SErik Nordmark !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { 41900f1702c5SYu Xiangning return (EADDRNOTAVAIL); 41910f1702c5SYu Xiangning } 41920f1702c5SYu Xiangning dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); 41930f1702c5SYu Xiangning dstport = sin6->sin6_port; 41940f1702c5SYu Xiangning srcid = sin6->__sin6_src_id; 4195bd670b35SErik Nordmark error = tcp_connect_ipv4(tcp, dstaddrp, dstport, 4196bd670b35SErik Nordmark srcid); 4197bd670b35SErik Nordmark } 41980f1702c5SYu Xiangning } else { 41990f1702c5SYu Xiangning dstaddrp = &sin->sin_addr.s_addr; 42000f1702c5SYu Xiangning dstport = sin->sin_port; 42010f1702c5SYu Xiangning srcid = 0; 4202bd670b35SErik Nordmark error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid); 42030f1702c5SYu Xiangning } 42040f1702c5SYu Xiangning 4205bd670b35SErik Nordmark if (error != 0) 4206bd670b35SErik Nordmark goto connect_failed; 4207bd670b35SErik Nordmark 4208bd670b35SErik Nordmark CL_INET_CONNECT(connp, B_TRUE, error); 4209bd670b35SErik Nordmark if (error != 0) 4210bd670b35SErik Nordmark goto connect_failed; 4211bd670b35SErik Nordmark 4212bd670b35SErik Nordmark /* connect succeeded */ 4213721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpActiveOpens); 4214bd670b35SErik Nordmark tcp->tcp_active_open = 1; 4215bd670b35SErik Nordmark 42160f1702c5SYu Xiangning /* 4217bd670b35SErik Nordmark * tcp_set_destination() does not adjust for TCP/IP header length. 
42180f1702c5SYu Xiangning */ 4219bd670b35SErik Nordmark mss = tcp->tcp_mss - connp->conn_ht_iphc_len; 4220bd670b35SErik Nordmark 4221bd670b35SErik Nordmark /* 4222bd670b35SErik Nordmark * Just make sure our rwnd is at least rcvbuf * MSS large, and round up 4223bd670b35SErik Nordmark * to the nearest MSS. 4224bd670b35SErik Nordmark * 4225bd670b35SErik Nordmark * We do the round up here because we need to get the interface MTU 4226bd670b35SErik Nordmark * first before we can do the round up. 4227bd670b35SErik Nordmark */ 4228bd670b35SErik Nordmark tcp->tcp_rwnd = connp->conn_rcvbuf; 4229bd670b35SErik Nordmark tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 4230bd670b35SErik Nordmark tcps->tcps_recv_hiwat_minmss * mss); 4231bd670b35SErik Nordmark connp->conn_rcvbuf = tcp->tcp_rwnd; 4232bd670b35SErik Nordmark tcp_set_ws_value(tcp); 4233bd670b35SErik Nordmark tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 4234bd670b35SErik Nordmark if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) 4235bd670b35SErik Nordmark tcp->tcp_snd_ws_ok = B_TRUE; 4236bd670b35SErik Nordmark 4237bd670b35SErik Nordmark /* 4238bd670b35SErik Nordmark * Set tcp_snd_ts_ok to true 4239bd670b35SErik Nordmark * so that tcp_xmit_mp will 4240bd670b35SErik Nordmark * include the timestamp 4241bd670b35SErik Nordmark * option in the SYN segment. 4242bd670b35SErik Nordmark */ 4243bd670b35SErik Nordmark if (tcps->tcps_tstamp_always || 4244bd670b35SErik Nordmark (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { 4245bd670b35SErik Nordmark tcp->tcp_snd_ts_ok = B_TRUE; 4246bd670b35SErik Nordmark } 4247bd670b35SErik Nordmark 4248bd670b35SErik Nordmark /* 424966cd0f60SKacheong Poon * Note that tcp_snd_sack_ok can be set in tcp_set_destination() if 425066cd0f60SKacheong Poon * the SACK metric is set. So here we just check the per stack SACK 425166cd0f60SKacheong Poon * permitted param. 
4252bd670b35SErik Nordmark */ 425366cd0f60SKacheong Poon if (tcps->tcps_sack_permitted == 2) { 425466cd0f60SKacheong Poon ASSERT(tcp->tcp_num_sack_blk == 0); 425566cd0f60SKacheong Poon ASSERT(tcp->tcp_notsack_list == NULL); 4256bd670b35SErik Nordmark tcp->tcp_snd_sack_ok = B_TRUE; 4257bd670b35SErik Nordmark } 4258bd670b35SErik Nordmark 4259bd670b35SErik Nordmark /* 4260bd670b35SErik Nordmark * Should we use ECN? Note that the current 4261bd670b35SErik Nordmark * default value (SunOS 5.9) of tcp_ecn_permitted 4262bd670b35SErik Nordmark * is 1. The reason for doing this is that there 4263bd670b35SErik Nordmark * are equipments out there that will drop ECN 4264bd670b35SErik Nordmark * enabled IP packets. Setting it to 1 avoids 4265bd670b35SErik Nordmark * compatibility problems. 4266bd670b35SErik Nordmark */ 4267bd670b35SErik Nordmark if (tcps->tcps_ecn_permitted == 2) 4268bd670b35SErik Nordmark tcp->tcp_ecn_ok = B_TRUE; 4269bd670b35SErik Nordmark 42709cd928feSAlan Maguire /* Trace change from BOUND -> SYN_SENT here */ 42719cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 42729cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 42739cd928feSAlan Maguire int32_t, TCPS_BOUND); 42749cd928feSAlan Maguire 4275bd670b35SErik Nordmark TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4276bd670b35SErik Nordmark syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 4277bd670b35SErik Nordmark tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 4278bd670b35SErik Nordmark if (syn_mp != NULL) { 4279bd670b35SErik Nordmark /* 4280bd670b35SErik Nordmark * We must bump the generation before sending the syn 4281bd670b35SErik Nordmark * to ensure that we use the right generation in case 4282bd670b35SErik Nordmark * this thread issues a "connected" up call. 
4283bd670b35SErik Nordmark */ 4284bd670b35SErik Nordmark SOCK_CONNID_BUMP(tcp->tcp_connid); 42859cd928feSAlan Maguire /* 42869cd928feSAlan Maguire * DTrace sending the first SYN as a 42879cd928feSAlan Maguire * tcp:::connect-request event. 42889cd928feSAlan Maguire */ 42899cd928feSAlan Maguire DTRACE_TCP5(connect__request, mblk_t *, NULL, 42909cd928feSAlan Maguire ip_xmit_attr_t *, connp->conn_ixa, 42919cd928feSAlan Maguire void_ip_t *, syn_mp->b_rptr, tcp_t *, tcp, 42929cd928feSAlan Maguire tcph_t *, 42939cd928feSAlan Maguire &syn_mp->b_rptr[connp->conn_ixa->ixa_ip_hdr_length]); 4294bd670b35SErik Nordmark tcp_send_data(tcp, syn_mp); 4295bd670b35SErik Nordmark } 4296bd670b35SErik Nordmark 4297bd670b35SErik Nordmark if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4298bd670b35SErik Nordmark tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4299bd670b35SErik Nordmark return (0); 4300bd670b35SErik Nordmark 43010f1702c5SYu Xiangning connect_failed: 4302bd670b35SErik Nordmark connp->conn_faddr_v6 = ipv6_all_zeros; 4303bd670b35SErik Nordmark connp->conn_fport = 0; 4304bd670b35SErik Nordmark tcp->tcp_state = oldstate; 43050f1702c5SYu Xiangning if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 43060f1702c5SYu Xiangning tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 43070f1702c5SYu Xiangning return (error); 43080f1702c5SYu Xiangning } 43090f1702c5SYu Xiangning 43100f1702c5SYu Xiangning int 4311eead73cfSRao Shoaib tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, 4312eead73cfSRao Shoaib int backlog, cred_t *cr, boolean_t bind_to_req_port_only) 43130f1702c5SYu Xiangning { 43140f1702c5SYu Xiangning tcp_t *tcp = connp->conn_tcp; 43150f1702c5SYu Xiangning int error = 0; 43160f1702c5SYu Xiangning tcp_stack_t *tcps = tcp->tcp_tcps; 43179cd928feSAlan Maguire int32_t oldstate; 43180f1702c5SYu Xiangning 4319de8c4a14SErik Nordmark /* All Solaris components should pass a cred for this operation. 
*/ 4320de8c4a14SErik Nordmark ASSERT(cr != NULL); 4321de8c4a14SErik Nordmark 43220f1702c5SYu Xiangning if (tcp->tcp_state >= TCPS_BOUND) { 43230f1702c5SYu Xiangning if ((tcp->tcp_state == TCPS_BOUND || 4324a5adac4dSYu Xiangning tcp->tcp_state == TCPS_LISTEN) && backlog > 0) { 43250f1702c5SYu Xiangning /* 43260f1702c5SYu Xiangning * Handle listen() increasing backlog. 43270f1702c5SYu Xiangning * This is more "liberal" then what the TPI spec 43280f1702c5SYu Xiangning * requires but is needed to avoid a t_unbind 43290f1702c5SYu Xiangning * when handling listen() since the port number 43300f1702c5SYu Xiangning * might be "stolen" between the unbind and bind. 43310f1702c5SYu Xiangning */ 43320f1702c5SYu Xiangning goto do_listen; 43330f1702c5SYu Xiangning } 4334bd670b35SErik Nordmark if (connp->conn_debug) { 43350f1702c5SYu Xiangning (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 4336a5adac4dSYu Xiangning "tcp_listen: bad state, %d", tcp->tcp_state); 43370f1702c5SYu Xiangning } 43380f1702c5SYu Xiangning return (-TOUTSTATE); 43390f1702c5SYu Xiangning } else { 43400f1702c5SYu Xiangning sin6_t addr; 4341eead73cfSRao Shoaib sin_t *sin; 4342eead73cfSRao Shoaib sin6_t *sin6; 4343eead73cfSRao Shoaib 4344*4778e36eSJohn Levon if (sa == NULL) { 4345eead73cfSRao Shoaib ASSERT(IPCL_IS_NONSTR(connp)); 43460f1702c5SYu Xiangning /* Do an implicit bind: Request for a generic port. 
*/ 4347bd670b35SErik Nordmark if (connp->conn_family == AF_INET) { 43480f1702c5SYu Xiangning len = sizeof (sin_t); 43490f1702c5SYu Xiangning sin = (sin_t *)&addr; 43500f1702c5SYu Xiangning *sin = sin_null; 43510f1702c5SYu Xiangning sin->sin_family = AF_INET; 43520f1702c5SYu Xiangning } else { 4353bd670b35SErik Nordmark ASSERT(connp->conn_family == AF_INET6); 43540f1702c5SYu Xiangning len = sizeof (sin6_t); 43550f1702c5SYu Xiangning sin6 = (sin6_t *)&addr; 43560f1702c5SYu Xiangning *sin6 = sin6_null; 43570f1702c5SYu Xiangning sin6->sin6_family = AF_INET6; 43580f1702c5SYu Xiangning } 4359eead73cfSRao Shoaib sa = (struct sockaddr *)&addr; 4360eead73cfSRao Shoaib } 43610f1702c5SYu Xiangning 4362eead73cfSRao Shoaib error = tcp_bind_check(connp, sa, len, cr, 4363eead73cfSRao Shoaib bind_to_req_port_only); 43640f1702c5SYu Xiangning if (error) 43650f1702c5SYu Xiangning return (error); 43660f1702c5SYu Xiangning /* Fall through and do the fanout insertion */ 43670f1702c5SYu Xiangning } 43680f1702c5SYu Xiangning 43690f1702c5SYu Xiangning do_listen: 43700f1702c5SYu Xiangning ASSERT(tcp->tcp_state == TCPS_BOUND || tcp->tcp_state == TCPS_LISTEN); 43710f1702c5SYu Xiangning tcp->tcp_conn_req_max = backlog; 43720f1702c5SYu Xiangning if (tcp->tcp_conn_req_max) { 43730f1702c5SYu Xiangning if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min) 43740f1702c5SYu Xiangning tcp->tcp_conn_req_max = tcps->tcps_conn_req_min; 43750f1702c5SYu Xiangning if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q) 43760f1702c5SYu Xiangning tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q; 43770f1702c5SYu Xiangning /* 43780f1702c5SYu Xiangning * If this is a listener, do not reset the eager list 43790f1702c5SYu Xiangning * and other stuffs. Note that we don't check if the 43800f1702c5SYu Xiangning * existing eager list meets the new tcp_conn_req_max 43810f1702c5SYu Xiangning * requirement. 
43820f1702c5SYu Xiangning */ 43830f1702c5SYu Xiangning if (tcp->tcp_state != TCPS_LISTEN) { 43840f1702c5SYu Xiangning tcp->tcp_state = TCPS_LISTEN; 43859cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 43869cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, 43879cd928feSAlan Maguire void, NULL, int32_t, TCPS_BOUND); 43880f1702c5SYu Xiangning /* Initialize the chain. Don't need the eager_lock */ 43890f1702c5SYu Xiangning tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 43900f1702c5SYu Xiangning tcp->tcp_eager_next_drop_q0 = tcp; 43910f1702c5SYu Xiangning tcp->tcp_eager_prev_drop_q0 = tcp; 43920f1702c5SYu Xiangning tcp->tcp_second_ctimer_threshold = 43930f1702c5SYu Xiangning tcps->tcps_ip_abort_linterval; 43940f1702c5SYu Xiangning } 43950f1702c5SYu Xiangning } 43960f1702c5SYu Xiangning 43970f1702c5SYu Xiangning /* 43980f1702c5SYu Xiangning * We need to make sure that the conn_recv is set to a non-null 43990f1702c5SYu Xiangning * value before we insert the conn into the classifier table. 44000f1702c5SYu Xiangning * This is to avoid a race with an incoming packet which does an 44010f1702c5SYu Xiangning * ipcl_classify(). 4402bd670b35SErik Nordmark * We initially set it to tcp_input_listener_unbound to try to 4403bd670b35SErik Nordmark * pick a good squeue for the listener when the first SYN arrives. 4404bd670b35SErik Nordmark * tcp_input_listener_unbound sets it to tcp_input_listener on that 4405bd670b35SErik Nordmark * first SYN. 
44060f1702c5SYu Xiangning */ 4407bd670b35SErik Nordmark connp->conn_recv = tcp_input_listener_unbound; 4408bd670b35SErik Nordmark 4409bd670b35SErik Nordmark /* Insert the listener in the classifier table */ 4410bd670b35SErik Nordmark error = ip_laddr_fanout_insert(connp); 4411bd670b35SErik Nordmark if (error != 0) { 4412bd670b35SErik Nordmark /* Undo the bind - release the port number */ 44139cd928feSAlan Maguire oldstate = tcp->tcp_state; 4414bd670b35SErik Nordmark tcp->tcp_state = TCPS_IDLE; 44159cd928feSAlan Maguire DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 44169cd928feSAlan Maguire connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 44179cd928feSAlan Maguire int32_t, oldstate); 4418bd670b35SErik Nordmark connp->conn_bound_addr_v6 = ipv6_all_zeros; 4419bd670b35SErik Nordmark 4420bd670b35SErik Nordmark connp->conn_laddr_v6 = ipv6_all_zeros; 4421bd670b35SErik Nordmark connp->conn_saddr_v6 = ipv6_all_zeros; 4422bd670b35SErik Nordmark connp->conn_ports = 0; 4423bd670b35SErik Nordmark 4424bd670b35SErik Nordmark if (connp->conn_anon_port) { 4425bd670b35SErik Nordmark zone_t *zone; 4426bd670b35SErik Nordmark 4427bd670b35SErik Nordmark zone = crgetzone(cr); 4428bd670b35SErik Nordmark connp->conn_anon_port = B_FALSE; 4429bd670b35SErik Nordmark (void) tsol_mlp_anon(zone, connp->conn_mlp_type, 4430bd670b35SErik Nordmark connp->conn_proto, connp->conn_lport, B_FALSE); 44310f1702c5SYu Xiangning } 4432bd670b35SErik Nordmark connp->conn_mlp_type = mlptSingle; 4433bd670b35SErik Nordmark 4434bd670b35SErik Nordmark tcp_bind_hash_remove(tcp); 4435bd670b35SErik Nordmark return (error); 443693fcb0b9SKacheong Poon } else { 443793fcb0b9SKacheong Poon /* 443893fcb0b9SKacheong Poon * If there is a connection limit, allocate and initialize 443993fcb0b9SKacheong Poon * the counter struct. Note that since listen can be called 444093fcb0b9SKacheong Poon * multiple times, the struct may have been allready allocated. 
444193fcb0b9SKacheong Poon */ 444293fcb0b9SKacheong Poon if (!list_is_empty(&tcps->tcps_listener_conf) && 444393fcb0b9SKacheong Poon tcp->tcp_listen_cnt == NULL) { 444493fcb0b9SKacheong Poon tcp_listen_cnt_t *tlc; 444593fcb0b9SKacheong Poon uint32_t ratio; 444693fcb0b9SKacheong Poon 444793fcb0b9SKacheong Poon ratio = tcp_find_listener_conf(tcps, 444893fcb0b9SKacheong Poon ntohs(connp->conn_lport)); 444993fcb0b9SKacheong Poon if (ratio != 0) { 445093fcb0b9SKacheong Poon uint32_t mem_ratio, tot_buf; 445193fcb0b9SKacheong Poon 445293fcb0b9SKacheong Poon tlc = kmem_alloc(sizeof (tcp_listen_cnt_t), 445393fcb0b9SKacheong Poon KM_SLEEP); 445493fcb0b9SKacheong Poon /* 445593fcb0b9SKacheong Poon * Calculate the connection limit based on 445693fcb0b9SKacheong Poon * the configured ratio and maxusers. Maxusers 445793fcb0b9SKacheong Poon * are calculated based on memory size, 445893fcb0b9SKacheong Poon * ~ 1 user per MB. Note that the conn_rcvbuf 445993fcb0b9SKacheong Poon * and conn_sndbuf may change after a 446093fcb0b9SKacheong Poon * connection is accepted. So what we have 446193fcb0b9SKacheong Poon * is only an approximation. 446293fcb0b9SKacheong Poon */ 446393fcb0b9SKacheong Poon if ((tot_buf = connp->conn_rcvbuf + 446493fcb0b9SKacheong Poon connp->conn_sndbuf) < MB) { 446593fcb0b9SKacheong Poon mem_ratio = MB / tot_buf; 446693fcb0b9SKacheong Poon tlc->tlc_max = maxusers / ratio * 446793fcb0b9SKacheong Poon mem_ratio; 446893fcb0b9SKacheong Poon } else { 446993fcb0b9SKacheong Poon mem_ratio = tot_buf / MB; 447093fcb0b9SKacheong Poon tlc->tlc_max = maxusers / ratio / 447193fcb0b9SKacheong Poon mem_ratio; 447293fcb0b9SKacheong Poon } 447393fcb0b9SKacheong Poon /* At least we should allow two connections! 
*/ 447493fcb0b9SKacheong Poon if (tlc->tlc_max <= tcp_min_conn_listener) 447593fcb0b9SKacheong Poon tlc->tlc_max = tcp_min_conn_listener; 447693fcb0b9SKacheong Poon tlc->tlc_cnt = 1; 447793fcb0b9SKacheong Poon tlc->tlc_drop = 0; 447893fcb0b9SKacheong Poon tcp->tcp_listen_cnt = tlc; 447993fcb0b9SKacheong Poon } 448093fcb0b9SKacheong Poon } 4481bd670b35SErik Nordmark } 4482bd670b35SErik Nordmark return (error); 44830f1702c5SYu Xiangning } 4484