1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, Joyent Inc. All rights reserved. 25 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 27 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. 28 */ 29 /* Copyright (c) 1990 Mentat Inc. 
*/ 30 31 #include <sys/types.h> 32 #include <sys/stream.h> 33 #include <sys/strsun.h> 34 #include <sys/strsubr.h> 35 #include <sys/stropts.h> 36 #include <sys/strlog.h> 37 #define _SUN_TPI_VERSION 2 38 #include <sys/tihdr.h> 39 #include <sys/timod.h> 40 #include <sys/ddi.h> 41 #include <sys/sunddi.h> 42 #include <sys/suntpi.h> 43 #include <sys/xti_inet.h> 44 #include <sys/cmn_err.h> 45 #include <sys/debug.h> 46 #include <sys/sdt.h> 47 #include <sys/vtrace.h> 48 #include <sys/kmem.h> 49 #include <sys/ethernet.h> 50 #include <sys/cpuvar.h> 51 #include <sys/dlpi.h> 52 #include <sys/pattr.h> 53 #include <sys/policy.h> 54 #include <sys/priv.h> 55 #include <sys/zone.h> 56 #include <sys/sunldi.h> 57 58 #include <sys/errno.h> 59 #include <sys/signal.h> 60 #include <sys/socket.h> 61 #include <sys/socketvar.h> 62 #include <sys/sockio.h> 63 #include <sys/isa_defs.h> 64 #include <sys/md5.h> 65 #include <sys/random.h> 66 #include <sys/uio.h> 67 #include <sys/systm.h> 68 #include <netinet/in.h> 69 #include <netinet/tcp.h> 70 #include <netinet/ip6.h> 71 #include <netinet/icmp6.h> 72 #include <net/if.h> 73 #include <net/route.h> 74 #include <inet/ipsec_impl.h> 75 76 #include <inet/common.h> 77 #include <inet/ip.h> 78 #include <inet/ip_impl.h> 79 #include <inet/ip6.h> 80 #include <inet/ip_ndp.h> 81 #include <inet/proto_set.h> 82 #include <inet/mib2.h> 83 #include <inet/optcom.h> 84 #include <inet/snmpcom.h> 85 #include <inet/kstatcom.h> 86 #include <inet/tcp.h> 87 #include <inet/tcp_impl.h> 88 #include <inet/tcp_cluster.h> 89 #include <inet/udp_impl.h> 90 #include <net/pfkeyv2.h> 91 #include <inet/ipdrop.h> 92 93 #include <inet/ipclassifier.h> 94 #include <inet/ip_ire.h> 95 #include <inet/ip_ftable.h> 96 #include <inet/ip_if.h> 97 #include <inet/ipp_common.h> 98 #include <inet/ip_rts.h> 99 #include <inet/ip_netinfo.h> 100 #include <sys/squeue_impl.h> 101 #include <sys/squeue.h> 102 #include <sys/tsol/label.h> 103 #include <sys/tsol/tnet.h> 104 #include <rpc/pmap_prot.h> 105 
#include <sys/callo.h> 106 107 /* 108 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) 109 * 110 * (Read the detailed design doc in PSARC case directory) 111 * 112 * The entire tcp state is contained in tcp_t and conn_t structure 113 * which are allocated in tandem using ipcl_conn_create() and passing 114 * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect 115 * the references on the tcp_t. The tcp_t structure is never compressed 116 * and packets always land on the correct TCP perimeter from the time 117 * eager is created till the time tcp_t dies (as such the old mentat 118 * TCP global queue is not used for detached state and no IPSEC checking 119 * is required). The global queue is still allocated to send out resets 120 * for connection which have no listeners and IP directly calls 121 * tcp_xmit_listeners_reset() which does any policy check. 122 * 123 * Protection and Synchronisation mechanism: 124 * 125 * The tcp data structure does not use any kind of lock for protecting 126 * its state but instead uses 'squeues' for mutual exclusion from various 127 * read and write side threads. To access a tcp member, the thread should 128 * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS, 129 * or SQ_NODRAIN). Since the squeues allow a direct function call, caller 130 * can pass any tcp function having prototype of edesc_t as argument 131 * (different from traditional STREAMs model where packets come in only 132 * designated entry points). The list of functions that can be directly 133 * called via squeue are listed before the usual function prototype. 134 * 135 * Referencing: 136 * 137 * TCP is MT-Hot and we use a reference based scheme to make sure that the 138 * tcp structure doesn't disappear when its needed. When the application 139 * creates an outgoing connection or accepts an incoming connection, we 140 * start out with 2 references on 'conn_ref'. One for TCP and one for IP. 
141 * The IP reference is just a symbolic reference since ip_tcpclose() 142 * looks at tcp structure after tcp_close_output() returns which could 143 * have dropped the last TCP reference. So as long as the connection is 144 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the 145 * conn_t. The classifier puts its own reference when the connection is 146 * inserted in listen or connected hash. Anytime a thread needs to enter 147 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr 148 * on write side or by doing a classify on read side and then puts a 149 * reference on the conn before doing squeue_enter/tryenter/fill. For 150 * read side, the classifier itself puts the reference under fanout lock 151 * to make sure that tcp can't disappear before it gets processed. The 152 * squeue will drop this reference automatically so the called function 153 * doesn't have to do a DEC_REF. 154 * 155 * Opening a new connection: 156 * 157 * The outgoing connection open is pretty simple. tcp_open() does the 158 * work in creating the conn/tcp structure and initializing it. The 159 * squeue assignment is done based on the CPU the application 160 * is running on. So for outbound connections, processing is always done 161 * on application CPU which might be different from the incoming CPU 162 * being interrupted by the NIC. An optimal way would be to figure out 163 * the NIC <-> CPU binding at listen time, and assign the outgoing 164 * connection to the squeue attached to the CPU that will be interrupted 165 * for incoming packets (we know the NIC based on the bind IP address). 166 * This might seem like a problem if more data is going out but the 167 * fact is that in most cases the transmit is ACK driven transmit where 168 * the outgoing data normally sits on TCP's xmit queue waiting to be 169 * transmitted. 
170 * 171 * Accepting a connection: 172 * 173 * This is a more interesting case because of various races involved in 174 * establishing an eager in its own perimeter. Read the meta comment on 175 * top of tcp_input_listener(). But briefly, the squeue is picked by 176 * ip_fanout based on the ring or the sender (if loopback). 177 * 178 * Closing a connection: 179 * 180 * The close is fairly straightforward. tcp_close() calls tcp_close_output() 181 * via squeue to do the close and mark the tcp as detached if the connection 182 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its 183 * reference but tcp_close() always drops IP's reference. So if tcp was 184 * not killed, it is sitting in time_wait list with 2 references - 1 for TCP 185 * and 1 because it is in classifier's connected hash. This is the condition 186 * we use to determine that it's OK to clean up the tcp outside of squeue 187 * when time wait expires (check the ref under fanout and conn_lock and 188 * if it is 2, remove it from fanout hash and kill it). 189 * 190 * Although close just drops the necessary references and marks the 191 * tcp_detached state, tcp_close needs to know the tcp_detached has been 192 * set (under squeue) before letting the STREAM go away (because an 193 * inbound packet might attempt to go up the STREAM while the close 194 * has happened and tcp_detached is not set). So a special lock and 195 * flag is used along with a condition variable (tcp_closelock, tcp_closed, 196 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked 197 * tcp_detached. 198 * 199 * Special provisions and fast paths: 200 * 201 * We make special provisions for sockfs by marking tcp_issocket 202 * whenever we have only sockfs on top of TCP. 
This allows us to skip
 * putting the tcp in acceptor hash since a sockfs listener can never
 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
 * since eager has already been allocated and the accept now happens
 * on acceptor STREAM. There is a big blob of comment on top of
 * tcp_input_listener explaining the new accept. When socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to old
 * behaviour. Once tcp_issocket is unset, it is never set again for the
 * life of that connection.
 *
 * IPsec notes :
 *
 * Since a packet is always executed on the correct TCP perimeter
 * all IPsec processing is deferred to IP including checking new
 * connections and setting IPSEC policies for new connection. The
 * only exception is tcp_xmit_listeners_reset() which is called
 * directly from IP and needs to do a policy check to see if TH_RST
 * can be sent out.
 */

/*
 * Values for squeue switch:
 * 1: SQ_NODRAIN
 * 2: SQ_PROCESS
 * 3: SQ_FILL
 */
int tcp_squeue_wput = 2;	/* /etc/systems */
/*
 * NOTE(review): presumably holds the SQ_* flag derived from tcp_squeue_wput
 * (see the tcp_squeue_switch() prototype below) -- confirm where it is set.
 */
int tcp_squeue_flag;

/*
 * To prevent memory hog, limit the number of entries in tcp_free_list
 * to 1% of available memory / number of cpus
 */
uint_t tcp_free_list_max_cnt = 0;

#define	TIDUSZ	4096	/* transport interface data unit size */

/*
 * Size of acceptor hash list.  It has to be a power of 2 for hashing.
 */
#define	TCP_ACCEPTOR_FANOUT_SIZE		512

/*
 * Map an acceptor id to a bucket index in [0, TCP_ACCEPTOR_FANOUT_SIZE).
 * The ILP32 variant discards the low 8 bits of the id before masking; the
 * other variant masks the id directly.
 */
#ifdef	_ILP32
#define	TCP_ACCEPTOR_HASH(accid)					\
		(((uint_t)(accid) >> 8) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
#else
#define	TCP_ACCEPTOR_HASH(accid)					\
		((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
#endif	/* _ILP32 */

/*
 * Minimum number of connections which can be created per listener.  Used
 * when the listener connection count is in effect.
 */
static uint32_t tcp_min_conn_listener = 2;

/* NOTE(review): meaning and units are not evident from this section. */
uint32_t tcp_early_abort = 30;

/* TCP Timer control structure */
typedef struct tcpt_s {
	pfv_t	tcpt_pfv;	/* The routine we are to call */
	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
} tcpt_t;

/*
 * Functions called directly via squeue having a prototype of edesc_t.
 */
void		tcp_input_data(void *arg, mblk_t *mp, void *arg2,
		    ip_recv_attr_t *ira);
static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
		    ip_recv_attr_t *dummy);


/* Prototype for TCP functions */
static void	tcp_random_init(void);
int		tcp_random(void);
static int	tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
		    in_port_t dstport, uint_t srcid);
static int	tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
		    in_port_t dstport, uint32_t flowinfo,
		    uint_t srcid, uint32_t scope_id);
static void	tcp_iss_init(tcp_t *tcp);
static void	tcp_reinit(tcp_t *tcp);
static void	tcp_reinit_values(tcp_t *tcp);

static int	tcp_wsrv(queue_t *q);
static void	tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
static void	tcp_update_zcopy(tcp_t *tcp);
static void	tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
		    ixa_notify_arg_t);
static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
static void	tcp_stack_fini(netstackid_t stackid, void *arg);

static int	tcp_squeue_switch(int);

static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);

static void	tcp_squeue_add(squeue_t *);

/*
 * Module information shared by the read-side qinit tables below; the last
 * two members are the receive high and low water marks.
 */
struct module_info tcp_rinfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};

/*
 * Module information used by the write-side qinit tables below (and by the
 * acceptor read side); same layout as tcp_rinfo, with literal water marks.
 */
static struct module_info tcp_winfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};

/*
 * Entry points for TCP as a device. The normal case which supports
 * the TCP functionality.
 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
 */
struct qinit tcp_rinitv4 = {
	NULL, tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
};

struct qinit tcp_rinitv6 = {
	NULL, tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
};

struct qinit tcp_winit = {
	tcp_wput, tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/* Initial entry point for TCP in socket mode. */
struct qinit tcp_sock_winit = {
	tcp_wput_sock, tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/* TCP entry point during fallback */
struct qinit tcp_fallback_sock_winit = {
	tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
};

/*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
 * an accept. Avoid allocating data structures since eager has already
 * been created.
 */
struct qinit tcp_acceptor_rinit = {
	NULL, tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
};

struct qinit tcp_acceptor_winit = {
	tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};

/* For AF_INET aka /dev/tcp */
struct streamtab tcpinfov4 = {
	&tcp_rinitv4, &tcp_winit
};

/* For AF_INET6 aka /dev/tcp6 */
struct streamtab tcpinfov6 = {
	&tcp_rinitv6, &tcp_winit
};

/*
 * Following assumes TPI alignment requirements stay along 32 bit
 * boundaries
 */
#define	ROUNDUP32(x) \
	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))

/* Template for response to info request. */
struct T_info_ack tcp_g_t_info_ack = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin_t),		/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

/* Same template for IPv6 endpoints; only ADDR_size differs (sin6_t). */
struct T_info_ack tcp_g_t_info_ack_v6 = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin6_t),	/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

/*
 * TCP tunables related declarations. Definitions are in tcp_tunables.c
 */
extern mod_prop_info_t tcp_propinfo_tbl[];
extern int tcp_propinfo_count;

/* True when the mblk's data block has STRUIO_ZC set in db_struioflag. */
#define	IS_VMLOANED_MBLK(mp) \
	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)

uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable, 2: force */

/*
 * Forces all connections to obey the value of the tcps_maxpsz_multiplier
 * tunable settable via NDD.  Otherwise, the per-connection behavior is
 * determined dynamically during tcp_set_destination(), which is the default.
 */
boolean_t tcp_static_maxpsz = B_FALSE;

/*
 * If the receive buffer size is changed, this function is called to update
 * the upper socket layer on the new delayed receive wake up threshold. 
418 */ 419 static void 420 tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) 421 { 422 uint32_t default_threshold = SOCKET_RECVHIWATER >> 3; 423 424 if (IPCL_IS_NONSTR(tcp->tcp_connp)) { 425 conn_t *connp = tcp->tcp_connp; 426 struct sock_proto_props sopp; 427 428 /* 429 * only increase rcvthresh upto default_threshold 430 */ 431 if (new_rcvthresh > default_threshold) 432 new_rcvthresh = default_threshold; 433 434 sopp.sopp_flags = SOCKOPT_RCVTHRESH; 435 sopp.sopp_rcvthresh = new_rcvthresh; 436 437 (*connp->conn_upcalls->su_set_proto_props) 438 (connp->conn_upper_handle, &sopp); 439 } 440 } 441 442 /* 443 * Figure out the value of window scale opton. Note that the rwnd is 444 * ASSUMED to be rounded up to the nearest MSS before the calculation. 445 * We cannot find the scale value and then do a round up of tcp_rwnd 446 * because the scale value may not be correct after that. 447 * 448 * Set the compiler flag to make this function inline. 449 */ 450 void 451 tcp_set_ws_value(tcp_t *tcp) 452 { 453 int i; 454 uint32_t rwnd = tcp->tcp_rwnd; 455 456 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 457 i++, rwnd >>= 1) 458 ; 459 tcp->tcp_rcv_ws = i; 460 } 461 462 /* 463 * Remove cached/latched IPsec references. 464 */ 465 void 466 tcp_ipsec_cleanup(tcp_t *tcp) 467 { 468 conn_t *connp = tcp->tcp_connp; 469 470 ASSERT(connp->conn_flags & IPCL_TCPCONN); 471 472 if (connp->conn_latch != NULL) { 473 IPLATCH_REFRELE(connp->conn_latch); 474 connp->conn_latch = NULL; 475 } 476 if (connp->conn_latch_in_policy != NULL) { 477 IPPOL_REFRELE(connp->conn_latch_in_policy); 478 connp->conn_latch_in_policy = NULL; 479 } 480 if (connp->conn_latch_in_action != NULL) { 481 IPACT_REFRELE(connp->conn_latch_in_action); 482 connp->conn_latch_in_action = NULL; 483 } 484 if (connp->conn_policy != NULL) { 485 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 486 connp->conn_policy = NULL; 487 } 488 } 489 490 /* 491 * Cleaup before placing on free list. 
492 * Disassociate from the netstack/tcp_stack_t since the freelist 493 * is per squeue and not per netstack. 494 */ 495 void 496 tcp_cleanup(tcp_t *tcp) 497 { 498 mblk_t *mp; 499 conn_t *connp = tcp->tcp_connp; 500 tcp_stack_t *tcps = tcp->tcp_tcps; 501 netstack_t *ns = tcps->tcps_netstack; 502 mblk_t *tcp_rsrv_mp; 503 504 tcp_bind_hash_remove(tcp); 505 506 /* Cleanup that which needs the netstack first */ 507 tcp_ipsec_cleanup(tcp); 508 ixa_cleanup(connp->conn_ixa); 509 510 if (connp->conn_ht_iphc != NULL) { 511 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 512 connp->conn_ht_iphc = NULL; 513 connp->conn_ht_iphc_allocated = 0; 514 connp->conn_ht_iphc_len = 0; 515 connp->conn_ht_ulp = NULL; 516 connp->conn_ht_ulp_len = 0; 517 tcp->tcp_ipha = NULL; 518 tcp->tcp_ip6h = NULL; 519 tcp->tcp_tcpha = NULL; 520 } 521 522 /* We clear any IP_OPTIONS and extension headers */ 523 ip_pkt_free(&connp->conn_xmit_ipp); 524 525 tcp_free(tcp); 526 527 /* 528 * Since we will bzero the entire structure, we need to 529 * remove it and reinsert it in global hash list. We 530 * know the walkers can't get to this conn because we 531 * had set CONDEMNED flag earlier and checked reference 532 * under conn_lock so walker won't pick it and when we 533 * go the ipcl_globalhash_remove() below, no walker 534 * can get to it. 535 */ 536 ipcl_globalhash_remove(connp); 537 538 /* Save some state */ 539 mp = tcp->tcp_timercache; 540 541 tcp_rsrv_mp = tcp->tcp_rsrv_mp; 542 543 if (connp->conn_cred != NULL) { 544 crfree(connp->conn_cred); 545 connp->conn_cred = NULL; 546 } 547 ipcl_conn_cleanup(connp); 548 connp->conn_flags = IPCL_TCPCONN; 549 550 /* 551 * Now it is safe to decrement the reference counts. 552 * This might be the last reference on the netstack 553 * in which case it will cause the freeing of the IP Instance. 
554 */ 555 connp->conn_netstack = NULL; 556 connp->conn_ixa->ixa_ipst = NULL; 557 netstack_rele(ns); 558 ASSERT(tcps != NULL); 559 tcp->tcp_tcps = NULL; 560 561 bzero(tcp, sizeof (tcp_t)); 562 563 /* restore the state */ 564 tcp->tcp_timercache = mp; 565 566 tcp->tcp_rsrv_mp = tcp_rsrv_mp; 567 568 tcp->tcp_connp = connp; 569 570 ASSERT(connp->conn_tcp == tcp); 571 ASSERT(connp->conn_flags & IPCL_TCPCONN); 572 connp->conn_state_flags = CONN_INCIPIENT; 573 ASSERT(connp->conn_proto == IPPROTO_TCP); 574 ASSERT(connp->conn_ref == 1); 575 } 576 577 /* 578 * Adapt to the information, such as rtt and rtt_sd, provided from the 579 * DCE and IRE maintained by IP. 580 * 581 * Checks for multicast and broadcast destination address. 582 * Returns zero if ok; an errno on failure. 583 * 584 * Note that the MSS calculation here is based on the info given in 585 * the DCE and IRE. We do not do any calculation based on TCP options. They 586 * will be handled in tcp_input_data() when TCP knows which options to use. 587 * 588 * Note on how TCP gets its parameters for a connection. 589 * 590 * When a tcp_t structure is allocated, it gets all the default parameters. 591 * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd, 592 * spipe, rpipe, ... from the route metrics. Route metric overrides the 593 * default. 594 * 595 * An incoming SYN with a multicast or broadcast destination address is dropped 596 * in ip_fanout_v4/v6. 597 * 598 * An incoming SYN with a multicast or broadcast source address is always 599 * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in 600 * conn_connect. 601 * The same logic in tcp_set_destination also serves to 602 * reject an attempt to connect to a broadcast or multicast (destination) 603 * address. 
604 */ 605 int 606 tcp_set_destination(tcp_t *tcp) 607 { 608 uint32_t mss_max; 609 uint32_t mss; 610 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 611 conn_t *connp = tcp->tcp_connp; 612 tcp_stack_t *tcps = tcp->tcp_tcps; 613 iulp_t uinfo; 614 int error; 615 uint32_t flags; 616 617 flags = IPDF_LSO | IPDF_ZCOPY; 618 /* 619 * Make sure we have a dce for the destination to avoid dce_ident 620 * contention for connected sockets. 621 */ 622 flags |= IPDF_UNIQUE_DCE; 623 624 if (!tcps->tcps_ignore_path_mtu) 625 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 626 627 /* Use conn_lock to satify ASSERT; tcp is already serialized */ 628 mutex_enter(&connp->conn_lock); 629 error = conn_connect(connp, &uinfo, flags); 630 mutex_exit(&connp->conn_lock); 631 if (error != 0) 632 return (error); 633 634 error = tcp_build_hdrs(tcp); 635 if (error != 0) 636 return (error); 637 638 tcp->tcp_localnet = uinfo.iulp_localnet; 639 640 if (uinfo.iulp_rtt != 0) { 641 tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt); 642 tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd); 643 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0); 644 } 645 if (uinfo.iulp_ssthresh != 0) 646 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; 647 else 648 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 649 if (uinfo.iulp_spipe > 0) { 650 connp->conn_sndbuf = MIN(uinfo.iulp_spipe, 651 tcps->tcps_max_buf); 652 if (tcps->tcps_snd_lowat_fraction != 0) { 653 connp->conn_sndlowat = connp->conn_sndbuf / 654 tcps->tcps_snd_lowat_fraction; 655 } 656 (void) tcp_maxpsz_set(tcp, B_TRUE); 657 } 658 /* 659 * Note that up till now, acceptor always inherits receive 660 * window from the listener. But if there is a metrics 661 * associated with a host, we should use that instead of 662 * inheriting it from listener. Thus we need to pass this 663 * info back to the caller. 
664 */ 665 if (uinfo.iulp_rpipe > 0) { 666 tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe, 667 tcps->tcps_max_buf); 668 } 669 670 if (uinfo.iulp_rtomax > 0) { 671 tcp->tcp_second_timer_threshold = 672 uinfo.iulp_rtomax; 673 } 674 675 /* 676 * Use the metric option settings, iulp_tstamp_ok and 677 * iulp_wscale_ok, only for active open. What this means 678 * is that if the other side uses timestamp or window 679 * scale option, TCP will also use those options. That 680 * is for passive open. If the application sets a 681 * large window, window scale is enabled regardless of 682 * the value in iulp_wscale_ok. This is the behavior 683 * since 2.6. So we keep it. 684 * The only case left in passive open processing is the 685 * check for SACK. 686 * For ECN, it should probably be like SACK. But the 687 * current value is binary, so we treat it like the other 688 * cases. The metric only controls active open.For passive 689 * open, the ndd param, tcp_ecn_permitted, controls the 690 * behavior. 691 */ 692 if (!tcp_detached) { 693 /* 694 * The if check means that the following can only 695 * be turned on by the metrics only IRE, but not off. 696 */ 697 if (uinfo.iulp_tstamp_ok) 698 tcp->tcp_snd_ts_ok = B_TRUE; 699 if (uinfo.iulp_wscale_ok) 700 tcp->tcp_snd_ws_ok = B_TRUE; 701 if (uinfo.iulp_sack == 2) 702 tcp->tcp_snd_sack_ok = B_TRUE; 703 if (uinfo.iulp_ecn_ok) 704 tcp->tcp_ecn_ok = B_TRUE; 705 } else { 706 /* 707 * Passive open. 708 * 709 * As above, the if check means that SACK can only be 710 * turned on by the metric only IRE. 711 */ 712 if (uinfo.iulp_sack > 0) { 713 tcp->tcp_snd_sack_ok = B_TRUE; 714 } 715 } 716 717 /* 718 * XXX Note that currently, iulp_mtu can be as small as 68 719 * because of PMTUd. So tcp_mss may go to negative if combined 720 * length of all those options exceeds 28 bytes. But because 721 * of the tcp_mss_min check below, we may not have a problem if 722 * tcp_mss_min is of a reasonable value. The default is 1 so 723 * the negative problem still exists. 
And the check defeats PMTUd. 724 * In fact, if PMTUd finds that the MSS should be smaller than 725 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 726 * value. 727 * 728 * We do not deal with that now. All those problems related to 729 * PMTUd will be fixed later. 730 */ 731 ASSERT(uinfo.iulp_mtu != 0); 732 mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu; 733 734 /* Sanity check for MSS value. */ 735 if (connp->conn_ipversion == IPV4_VERSION) 736 mss_max = tcps->tcps_mss_max_ipv4; 737 else 738 mss_max = tcps->tcps_mss_max_ipv6; 739 740 if (tcp->tcp_ipsec_overhead == 0) 741 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 742 743 mss -= tcp->tcp_ipsec_overhead; 744 745 if (mss < tcps->tcps_mss_min) 746 mss = tcps->tcps_mss_min; 747 if (mss > mss_max) 748 mss = mss_max; 749 750 /* Note that this is the maximum MSS, excluding all options. */ 751 tcp->tcp_mss = mss; 752 753 /* 754 * Update the tcp connection with LSO capability. 755 */ 756 tcp_update_lso(tcp, connp->conn_ixa); 757 758 /* 759 * Initialize the ISS here now that we have the full connection ID. 760 * The RFC 1948 method of initial sequence number generation requires 761 * knowledge of the full connection ID before setting the ISS. 762 */ 763 tcp_iss_init(tcp); 764 765 tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local); 766 767 /* 768 * Make sure that conn is not marked incipient 769 * for incoming connections. A blind 770 * removal of incipient flag is cheaper than 771 * check and removal. 772 */ 773 mutex_enter(&connp->conn_lock); 774 connp->conn_state_flags &= ~CONN_INCIPIENT; 775 mutex_exit(&connp->conn_lock); 776 return (0); 777 } 778 779 /* 780 * tcp_clean_death / tcp_close_detached must not be called more than once 781 * on a tcp. Thus every function that potentially calls tcp_clean_death 782 * must check for the tcp state before calling tcp_clean_death. 783 * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper, 784 * tcp_timer_handler, all check for the tcp state. 
785 */ 786 /* ARGSUSED */ 787 void 788 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, 789 ip_recv_attr_t *dummy) 790 { 791 tcp_t *tcp = ((conn_t *)arg)->conn_tcp; 792 793 freemsg(mp); 794 if (tcp->tcp_state > TCPS_BOUND) 795 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, ETIMEDOUT); 796 } 797 798 /* 799 * We are dying for some reason. Try to do it gracefully. (May be called 800 * as writer.) 801 * 802 * Return -1 if the structure was not cleaned up (if the cleanup had to be 803 * done by a service procedure). 804 * TBD - Should the return value distinguish between the tcp_t being 805 * freed and it being reinitialized? 806 */ 807 int 808 tcp_clean_death(tcp_t *tcp, int err) 809 { 810 mblk_t *mp; 811 queue_t *q; 812 conn_t *connp = tcp->tcp_connp; 813 tcp_stack_t *tcps = tcp->tcp_tcps; 814 815 if (tcp->tcp_fused) 816 tcp_unfuse(tcp); 817 818 if (tcp->tcp_linger_tid != 0 && 819 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 820 tcp_stop_lingering(tcp); 821 } 822 823 ASSERT(tcp != NULL); 824 ASSERT((connp->conn_family == AF_INET && 825 connp->conn_ipversion == IPV4_VERSION) || 826 (connp->conn_family == AF_INET6 && 827 (connp->conn_ipversion == IPV4_VERSION || 828 connp->conn_ipversion == IPV6_VERSION))); 829 830 if (TCP_IS_DETACHED(tcp)) { 831 if (tcp->tcp_hard_binding) { 832 /* 833 * Its an eager that we are dealing with. We close the 834 * eager but in case a conn_ind has already gone to the 835 * listener, let tcp_accept_finish() send a discon_ind 836 * to the listener and drop the last reference. If the 837 * listener doesn't even know about the eager i.e. the 838 * conn_ind hasn't gone up, blow away the eager and drop 839 * the last reference as well. If the conn_ind has gone 840 * up, state should be BOUND. tcp_accept_finish 841 * will figure out that the connection has received a 842 * RST and will send a DISCON_IND to the application. 
843 */ 844 tcp_closei_local(tcp); 845 if (!tcp->tcp_tconnind_started) { 846 CONN_DEC_REF(connp); 847 } else { 848 tcp->tcp_state = TCPS_BOUND; 849 DTRACE_TCP6(state__change, void, NULL, 850 ip_xmit_attr_t *, connp->conn_ixa, 851 void, NULL, tcp_t *, tcp, void, NULL, 852 int32_t, TCPS_CLOSED); 853 } 854 } else { 855 tcp_close_detached(tcp); 856 } 857 return (0); 858 } 859 860 TCP_STAT(tcps, tcp_clean_death_nondetached); 861 862 /* 863 * The connection is dead. Decrement listener connection counter if 864 * necessary. 865 */ 866 if (tcp->tcp_listen_cnt != NULL) 867 TCP_DECR_LISTEN_CNT(tcp); 868 869 /* 870 * When a connection is moved to TIME_WAIT state, the connection 871 * counter is already decremented. So no need to decrement here 872 * again. See SET_TIME_WAIT() macro. 873 */ 874 if (tcp->tcp_state >= TCPS_ESTABLISHED && 875 tcp->tcp_state < TCPS_TIME_WAIT) { 876 TCPS_CONN_DEC(tcps); 877 } 878 879 q = connp->conn_rq; 880 881 /* Trash all inbound data */ 882 if (!IPCL_IS_NONSTR(connp)) { 883 ASSERT(q != NULL); 884 flushq(q, FLUSHALL); 885 } 886 887 /* 888 * If we are at least part way open and there is error 889 * (err==0 implies no error) 890 * notify our client by a T_DISCON_IND. 891 */ 892 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 893 if (tcp->tcp_state >= TCPS_ESTABLISHED && 894 !TCP_IS_SOCKET(tcp)) { 895 /* 896 * Send M_FLUSH according to TPI. Because sockets will 897 * (and must) ignore FLUSHR we do that only for TPI 898 * endpoints and sockets in STREAMS mode. 
899 */ 900 (void) putnextctl1(q, M_FLUSH, FLUSHR); 901 } 902 if (connp->conn_debug) { 903 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 904 "tcp_clean_death: discon err %d", err); 905 } 906 if (IPCL_IS_NONSTR(connp)) { 907 /* Direct socket, use upcall */ 908 (*connp->conn_upcalls->su_disconnected)( 909 connp->conn_upper_handle, tcp->tcp_connid, err); 910 } else { 911 mp = mi_tpi_discon_ind(NULL, err, 0); 912 if (mp != NULL) { 913 putnext(q, mp); 914 } else { 915 if (connp->conn_debug) { 916 (void) strlog(TCP_MOD_ID, 0, 1, 917 SL_ERROR|SL_TRACE, 918 "tcp_clean_death, sending M_ERROR"); 919 } 920 (void) putnextctl1(q, M_ERROR, EPROTO); 921 } 922 } 923 if (tcp->tcp_state <= TCPS_SYN_RCVD) { 924 /* SYN_SENT or SYN_RCVD */ 925 TCPS_BUMP_MIB(tcps, tcpAttemptFails); 926 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 927 /* ESTABLISHED or CLOSE_WAIT */ 928 TCPS_BUMP_MIB(tcps, tcpEstabResets); 929 } 930 } 931 932 /* 933 * ESTABLISHED non-STREAMS eagers are not 'detached' because 934 * an upper handle is obtained when the SYN-ACK comes in. So it 935 * should receive the 'disconnected' upcall, but tcp_reinit should 936 * not be called since this is an eager. 937 */ 938 if (tcp->tcp_listener != NULL && IPCL_IS_NONSTR(connp)) { 939 tcp_closei_local(tcp); 940 tcp->tcp_state = TCPS_BOUND; 941 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 942 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 943 int32_t, TCPS_CLOSED); 944 return (0); 945 } 946 947 tcp_reinit(tcp); 948 if (IPCL_IS_NONSTR(connp)) 949 (void) tcp_do_unbind(connp); 950 951 return (-1); 952 } 953 954 /* 955 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 956 * to expire, stop the wait and finish the close. 
 *
 * Clears tcp_linger_tid.  For an endpoint beyond the LISTEN state the
 * tcp is marked detached and its queue pointers are cleared; it is then
 * either appended to the time-wait list (TIME_WAIT) or has tcp_timer
 * re-armed.  For any other state the tcp is closed locally and a conn
 * reference is dropped.  In all cases tcp_closed is set and tcp_closecv
 * is signalled (tcp_close_common() waits on it), and a non-STREAMS
 * socket gets its su_closed upcall.
 */
void
tcp_stop_lingering(tcp_t *tcp)
{
	clock_t		delta = 0;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

	tcp->tcp_linger_tid = 0;
	if (tcp->tcp_state > TCPS_LISTEN) {
		tcp_acceptor_hash_remove(tcp);
		mutex_enter(&tcp->tcp_non_sq_lock);
		if (tcp->tcp_flow_stopped) {
			tcp_clrqfull(tcp);
		}
		mutex_exit(&tcp->tcp_non_sq_lock);

		/* Remember the cancel result; it decides the re-arm below. */
		if (tcp->tcp_timer_tid != 0) {
			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
			tcp->tcp_timer_tid = 0;
		}
		/*
		 * Need to cancel those timers which will not be used when
		 * TCP is detached.  This has to be done before the conn_wq
		 * is cleared.
		 */
		tcp_timers_stop(tcp);

		tcp->tcp_detached = B_TRUE;
		connp->conn_rq = NULL;
		connp->conn_wq = NULL;

		if (tcp->tcp_state == TCPS_TIME_WAIT) {
			tcp_time_wait_append(tcp);
			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
			goto finish;
		}

		/*
		 * If delta is zero the timer event wasn't executed and was
		 * successfully canceled. In this case we need to restart it
		 * with the minimal delta possible.
		 */
		if (delta >= 0) {
			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
			    delta ? delta : 1);
		}
	} else {
		tcp_closei_local(tcp);
		CONN_DEC_REF(connp);
	}
finish:
	/* Repeated here so the non-detaching branch above is covered too. */
	tcp->tcp_detached = B_TRUE;
	connp->conn_rq = NULL;
	connp->conn_wq = NULL;

	/* Signal closing thread that it can complete close */
	mutex_enter(&tcp->tcp_closelock);
	tcp->tcp_closed = 1;
	cv_signal(&tcp->tcp_closecv);
	mutex_exit(&tcp->tcp_closelock);

	/* If we have an upper handle (socket), release it */
	if (IPCL_IS_NONSTR(connp)) {
		ASSERT(connp->conn_upper_handle != NULL);
		(*connp->conn_upcalls->su_closed)(connp->conn_upper_handle);
		connp->conn_upper_handle = NULL;
		connp->conn_upcalls = NULL;
	}
}

/*
 * Common close path.
 *
 * Marks the conn CONN_CLOSING, takes an extra conn reference, cleans up
 * any pending ioctl, waits for conn_ioctlref to reach zero and then
 * queues tcp_close_output() on the conn's squeue using the embedded
 * tcp_closemp.  Unless this is a non-STREAMS socket with conn_linger
 * clear, the caller then blocks until tcp_closed is set.  A STREAMS
 * endpoint with tcp_wait_for_eagers set additionally waits for conn_ref
 * to drop to 1.
 *
 * 'flags' is stored, narrowed to uint8_t, in tcp_closeflags.
 */
void
tcp_close_common(conn_t *connp, int flags)
{
	tcp_t		*tcp = connp->conn_tcp;
	mblk_t		*mp = &tcp->tcp_closemp;
	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
	mblk_t		*bp;

	ASSERT(connp->conn_ref >= 2);

	/*
	 * Mark the conn as closing. ipsq_pending_mp_add will not
	 * add any mp to the pending mp list, after this conn has
	 * started closing.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CLOSING;
	if (connp->conn_oper_pending_ill != NULL)
		conn_ioctl_cleanup_reqd = B_TRUE;
	CONN_INC_REF_LOCKED(connp);
	mutex_exit(&connp->conn_lock);
	tcp->tcp_closeflags = (uint8_t)flags;
	ASSERT(connp->conn_ref >= 3);

	/*
	 * tcp_closemp_used is used below without any protection of a lock
	 * as we don't expect any one else to use it concurrently at this
	 * point otherwise it would be a major defect.
	 */

	/* A non-NULL b_prev means tcp_closemp is already in use: panic. */
	if (mp->b_prev == NULL)
		tcp->tcp_closemp_used = B_TRUE;
	else
		cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);

	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);

	/*
	 * Cleanup any queued ioctls here. This must be done before the wq/rq
	 * are re-written by tcp_close_output().
	 */
	if (conn_ioctl_cleanup_reqd)
		conn_ioctl_cleanup(connp);

	/*
	 * As CONN_CLOSING is set, no further ioctls should be passed down to
	 * IP for this conn (see the guards in tcp_ioctl, tcp_wput_ioctl and
	 * tcp_wput_iocdata). If the ioctl was queued on an ipsq,
	 * conn_ioctl_cleanup should have found it and removed it. If the ioctl
	 * was still in flight at the time, we wait for it here. See comments
	 * for CONN_INC_IOCTLREF in ip.h for details.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_ioctlref > 0)
		cv_wait(&connp->conn_cv, &connp->conn_lock);
	ASSERT(connp->conn_ioctlref == 0);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	mutex_exit(&connp->conn_lock);

	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
	    NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);

	/*
	 * For non-STREAMS sockets, the normal case is that the conn makes
	 * an upcall when it's finally closed, so there is no need to wait
	 * in the protocol. But in case of SO_LINGER the thread sleeps here
	 * so it can properly deal with the thread being interrupted.
	 */
	if (IPCL_IS_NONSTR(connp) && connp->conn_linger == 0)
		goto nowait;

	mutex_enter(&tcp->tcp_closelock);
	while (!tcp->tcp_closed) {
		if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
			/*
			 * The cv_wait_sig() was interrupted. We now do the
			 * following:
			 *
			 * 1) If the endpoint was lingering, we allow this
			 * to be interrupted by cancelling the linger timeout
			 * and closing normally.
			 *
			 * 2) Revert to calling cv_wait()
			 *
			 * We revert to using cv_wait() to avoid an
			 * infinite loop which can occur if the calling
			 * thread is higher priority than the squeue worker
			 * thread and is bound to the same cpu.
			 */
			if (connp->conn_linger && connp->conn_lingertime > 0) {
				/* tcp_closelock is dropped around the squeue entry. */
				mutex_exit(&tcp->tcp_closelock);
				/* Entering squeue, bump ref count. */
				CONN_INC_REF(connp);
				bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
				SQUEUE_ENTER_ONE(connp->conn_sqp, bp,
				    tcp_linger_interrupted, connp, NULL,
				    tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
				mutex_enter(&tcp->tcp_closelock);
			}
			break;
		}
	}
	/* Non-interruptible wait for the remainder (see comment above). */
	while (!tcp->tcp_closed)
		cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
	mutex_exit(&tcp->tcp_closelock);

	/*
	 * In the case of listener streams that have eagers in the q or q0
	 * we wait for the eagers to drop their reference to us. conn_rq and
	 * conn_wq of the eagers point to our queues. By waiting for the
	 * refcnt to drop to 1, we are sure that the eagers have cleaned
	 * up their queue pointers and also dropped their references to us.
	 *
	 * For non-STREAMS sockets we do not have to wait here; the
	 * listener will instead make a su_closed upcall when the last
	 * reference is dropped.
	 */
	if (tcp->tcp_wait_for_eagers && !IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		while (connp->conn_ref != 1) {
			cv_wait(&connp->conn_cv, &connp->conn_lock);
		}
		mutex_exit(&connp->conn_lock);
	}

nowait:
	connp->conn_cpid = NOPID;
}

/*
 * Called by tcp_close() routine via squeue when lingering is
 * interrupted by a signal.
 *
 * Frees the carrier mblk.  If a linger timer is outstanding and
 * TCP_TIMER_CANCEL() returns a non-negative value, lingering is stopped
 * and tcp_client_errno is set to EINTR.
 */

/* ARGSUSED */
static void
tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	freeb(mp);
	if (tcp->tcp_linger_tid != 0 &&
	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
		tcp_stop_lingering(tcp);
		tcp->tcp_client_errno = EINTR;
	}
}

/*
 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp.
 * Some stream heads get upset if they see these later on as anything but NULL.
 *
 * The walk follows the b_cont chain of *mpp.  Afterwards *mpp is set to
 * NULL and the message is released with freemsg().  A NULL *mpp is a no-op.
 */
void
tcp_close_mpp(mblk_t **mpp)
{
	mblk_t	*mp;

	if ((mp = *mpp) != NULL) {
		do {
			mp->b_next = NULL;
			mp->b_prev = NULL;
		} while ((mp = mp->b_cont) != NULL);

		/* Clear the caller's pointer before freeing the message. */
		mp = *mpp;
		*mpp = NULL;
		freemsg(mp);
	}
}

/*
 * Do detached close: unfuse if necessary, close the tcp locally and drop
 * one reference on its conn.
 */
void
tcp_close_detached(tcp_t *tcp)
{
	if (tcp->tcp_fused)
		tcp_unfuse(tcp);

	/*
	 * Clustering code serializes TCP disconnect callbacks and
	 * cluster tcp list walks by blocking a TCP disconnect callback
	 * if a cluster tcp list walk is in progress. This ensures
	 * accurate accounting of TCPs in the cluster code even though
	 * the TCP list walk itself is not atomic.
	 */
	tcp_closei_local(tcp);
	CONN_DEC_REF(tcp->tcp_connp);
}

/*
 * The tcp_t is going away. Remove it from all lists and set it
 * to TCPS_CLOSED. The freeing up of memory is deferred until
 * tcp_inactive. This is needed since a thread in tcp_rput might have
 * done a CONN_INC_REF on this structure before it was removed from the
 * hashes.
 */
void
tcp_closei_local(tcp_t *tcp)
{
	conn_t		*connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	int32_t		oldstate;

	if (!TCP_IS_SOCKET(tcp))
		tcp_acceptor_hash_remove(tcp);

	/*
	 * This can be called via tcp_time_wait_processing() if TCP gets a
	 * SYN with sequence number outside the TIME-WAIT connection's
	 * window.  So we need to check for TIME-WAIT state here as the
	 * connection counter is already decremented.  See SET_TIME_WAIT()
	 * macro.
	 */
	if (tcp->tcp_state >= TCPS_ESTABLISHED &&
	    tcp->tcp_state < TCPS_TIME_WAIT) {
		TCPS_CONN_DEC(tcps);
	}

	/*
	 * If we are an eager connection hanging off a listener that
	 * hasn't formally accepted the connection yet, get off its
	 * list and blow off any data that we have accumulated.
	 */
	if (tcp->tcp_listener != NULL) {
		tcp_t	*listener = tcp->tcp_listener;
		mutex_enter(&listener->tcp_eager_lock);
		/*
		 * tcp_tconnind_started == B_TRUE means that the
		 * conn_ind has already gone to listener. At
		 * this point, eager will be closed but we
		 * leave it in listeners eager list so that
		 * if listener decides to close without doing
		 * accept, we can clean this up. In tcp_tli_accept
		 * we take care of the case of accept on closed
		 * eager.
		 */
		if (!tcp->tcp_tconnind_started) {
			tcp_eager_unlink(tcp);
			mutex_exit(&listener->tcp_eager_lock);
			/*
			 * We don't want to have any pointers to the
			 * listener queue, after we have released our
			 * reference on the listener
			 */
			ASSERT(tcp->tcp_detached);
			connp->conn_rq = NULL;
			connp->conn_wq = NULL;
			CONN_DEC_REF(listener->tcp_connp);
		} else {
			mutex_exit(&listener->tcp_eager_lock);
		}
	}

	/* Stop all the timers */
	tcp_timers_stop(tcp);

	/* A listener may own an IP address cache; release it. */
	if (tcp->tcp_state == TCPS_LISTEN) {
		if (tcp->tcp_ip_addr_cache) {
			kmem_free((void *)tcp->tcp_ip_addr_cache,
			    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
			tcp->tcp_ip_addr_cache = NULL;
		}
	}

	/* Decrement listener connection counter if necessary. */
	if (tcp->tcp_listen_cnt != NULL)
		TCP_DECR_LISTEN_CNT(tcp);

	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped)
		tcp_clrqfull(tcp);
	mutex_exit(&tcp->tcp_non_sq_lock);

	tcp_bind_hash_remove(tcp);
	/*
	 * If the tcp_time_wait_collector (which runs outside the squeue)
	 * is trying to remove this tcp from the time wait list, we will
	 * block in tcp_time_wait_remove while trying to acquire the
	 * tcp_time_wait_lock.  The logic in tcp_time_wait_collector also
	 * requires the ipcl_hash_remove to be ordered after the
	 * tcp_time_wait_remove for the refcnt checks to work correctly.
	 */
	if (tcp->tcp_state == TCPS_TIME_WAIT)
		(void) tcp_time_wait_remove(tcp, NULL);
	CL_INET_DISCONNECT(connp);
	ipcl_hash_remove(connp);
	/* Keep the previous state so the probe below can report it. */
	oldstate = tcp->tcp_state;
	tcp->tcp_state = TCPS_CLOSED;
	/* Need to probe before ixa_cleanup() is called */
	DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
	    connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
	    int32_t, oldstate);
	ixa_cleanup(connp->conn_ixa);

	/*
	 * Mark the conn as CONDEMNED
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	mutex_exit(&connp->conn_lock);

	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);

	tcp_ipsec_cleanup(tcp);
}

/*
 * tcp is dying (called from ipcl_conn_destroy and error cases).
 * Free the tcp_t in either case.
1339 */ 1340 void 1341 tcp_free(tcp_t *tcp) 1342 { 1343 mblk_t *mp; 1344 conn_t *connp = tcp->tcp_connp; 1345 1346 ASSERT(tcp != NULL); 1347 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); 1348 1349 connp->conn_rq = NULL; 1350 connp->conn_wq = NULL; 1351 1352 tcp_close_mpp(&tcp->tcp_xmit_head); 1353 tcp_close_mpp(&tcp->tcp_reass_head); 1354 if (tcp->tcp_rcv_list != NULL) { 1355 /* Free b_next chain */ 1356 tcp_close_mpp(&tcp->tcp_rcv_list); 1357 } 1358 if ((mp = tcp->tcp_urp_mp) != NULL) { 1359 freemsg(mp); 1360 } 1361 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 1362 freemsg(mp); 1363 } 1364 1365 if (tcp->tcp_fused_sigurg_mp != NULL) { 1366 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 1367 freeb(tcp->tcp_fused_sigurg_mp); 1368 tcp->tcp_fused_sigurg_mp = NULL; 1369 } 1370 1371 if (tcp->tcp_ordrel_mp != NULL) { 1372 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 1373 freeb(tcp->tcp_ordrel_mp); 1374 tcp->tcp_ordrel_mp = NULL; 1375 } 1376 1377 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 1378 bzero(&tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 1379 1380 if (tcp->tcp_hopopts != NULL) { 1381 mi_free(tcp->tcp_hopopts); 1382 tcp->tcp_hopopts = NULL; 1383 tcp->tcp_hopoptslen = 0; 1384 } 1385 ASSERT(tcp->tcp_hopoptslen == 0); 1386 if (tcp->tcp_dstopts != NULL) { 1387 mi_free(tcp->tcp_dstopts); 1388 tcp->tcp_dstopts = NULL; 1389 tcp->tcp_dstoptslen = 0; 1390 } 1391 ASSERT(tcp->tcp_dstoptslen == 0); 1392 if (tcp->tcp_rthdrdstopts != NULL) { 1393 mi_free(tcp->tcp_rthdrdstopts); 1394 tcp->tcp_rthdrdstopts = NULL; 1395 tcp->tcp_rthdrdstoptslen = 0; 1396 } 1397 ASSERT(tcp->tcp_rthdrdstoptslen == 0); 1398 if (tcp->tcp_rthdr != NULL) { 1399 mi_free(tcp->tcp_rthdr); 1400 tcp->tcp_rthdr = NULL; 1401 tcp->tcp_rthdrlen = 0; 1402 } 1403 ASSERT(tcp->tcp_rthdrlen == 0); 1404 1405 /* 1406 * Following is really a blowing away a union. 1407 * It happens to have exactly two members of identical size 1408 * the following code is enough. 
1409 */ 1410 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 1411 1412 /* 1413 * If this is a non-STREAM socket still holding on to an upper 1414 * handle, release it. As a result of fallback we might also see 1415 * STREAMS based conns with upper handles, in which case there is 1416 * nothing to do other than clearing the field. 1417 */ 1418 if (connp->conn_upper_handle != NULL) { 1419 if (IPCL_IS_NONSTR(connp)) { 1420 (*connp->conn_upcalls->su_closed)( 1421 connp->conn_upper_handle); 1422 tcp->tcp_detached = B_TRUE; 1423 } 1424 connp->conn_upper_handle = NULL; 1425 connp->conn_upcalls = NULL; 1426 } 1427 } 1428 1429 /* 1430 * tcp_get_conn/tcp_free_conn 1431 * 1432 * tcp_get_conn is used to get a clean tcp connection structure. 1433 * It tries to reuse the connections put on the freelist by the 1434 * time_wait_collector failing which it goes to kmem_cache. This 1435 * way has two benefits compared to just allocating from and 1436 * freeing to kmem_cache. 1437 * 1) The time_wait_collector can free (which includes the cleanup) 1438 * outside the squeue. So when the interrupt comes, we have a clean 1439 * connection sitting in the freelist. Obviously, this buys us 1440 * performance. 1441 * 1442 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener 1443 * has multiple disadvantages - tying up the squeue during alloc. 1444 * But allocating the conn/tcp in IP land is also not the best since 1445 * we can't check the 'q' and 'q0' which are protected by squeue and 1446 * blindly allocate memory which might have to be freed here if we are 1447 * not allowed to accept the connection. By using the freelist and 1448 * putting the conn/tcp back in freelist, we don't pay a penalty for 1449 * allocating memory without checking 'q/q0' and freeing it if we can't 1450 * accept the connection. 1451 * 1452 * Care should be taken to put the conn back in the same squeue's freelist 1453 * from which it was allocated. 
Best results are obtained if conn is 1454 * allocated from listener's squeue and freed to the same. Time wait 1455 * collector will free up the freelist is the connection ends up sitting 1456 * there for too long. 1457 */ 1458 void * 1459 tcp_get_conn(void *arg, tcp_stack_t *tcps) 1460 { 1461 tcp_t *tcp = NULL; 1462 conn_t *connp = NULL; 1463 squeue_t *sqp = (squeue_t *)arg; 1464 tcp_squeue_priv_t *tcp_time_wait; 1465 netstack_t *ns; 1466 mblk_t *tcp_rsrv_mp = NULL; 1467 1468 tcp_time_wait = 1469 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 1470 1471 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1472 tcp = tcp_time_wait->tcp_free_list; 1473 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0)); 1474 if (tcp != NULL) { 1475 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 1476 tcp_time_wait->tcp_free_list_cnt--; 1477 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1478 tcp->tcp_time_wait_next = NULL; 1479 connp = tcp->tcp_connp; 1480 connp->conn_flags |= IPCL_REUSED; 1481 1482 ASSERT(tcp->tcp_tcps == NULL); 1483 ASSERT(connp->conn_netstack == NULL); 1484 ASSERT(tcp->tcp_rsrv_mp != NULL); 1485 ns = tcps->tcps_netstack; 1486 netstack_hold(ns); 1487 connp->conn_netstack = ns; 1488 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 1489 tcp->tcp_tcps = tcps; 1490 ipcl_globalhash_insert(connp); 1491 1492 connp->conn_ixa->ixa_notify_cookie = tcp; 1493 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); 1494 connp->conn_recv = tcp_input_data; 1495 ASSERT(connp->conn_recvicmp == tcp_icmp_input); 1496 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); 1497 return ((void *)connp); 1498 } 1499 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1500 /* 1501 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until 1502 * this conn_t/tcp_t is freed at ipcl_conn_destroy(). 
1503 */ 1504 tcp_rsrv_mp = allocb(0, BPRI_HI); 1505 if (tcp_rsrv_mp == NULL) 1506 return (NULL); 1507 1508 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, 1509 tcps->tcps_netstack)) == NULL) { 1510 freeb(tcp_rsrv_mp); 1511 return (NULL); 1512 } 1513 1514 tcp = connp->conn_tcp; 1515 tcp->tcp_rsrv_mp = tcp_rsrv_mp; 1516 mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL); 1517 1518 tcp->tcp_tcps = tcps; 1519 1520 connp->conn_recv = tcp_input_data; 1521 connp->conn_recvicmp = tcp_icmp_input; 1522 connp->conn_verifyicmp = tcp_verifyicmp; 1523 1524 /* 1525 * Register tcp_notify to listen to capability changes detected by IP. 1526 * This upcall is made in the context of the call to conn_ip_output 1527 * thus it is inside the squeue. 1528 */ 1529 connp->conn_ixa->ixa_notify = tcp_notify; 1530 connp->conn_ixa->ixa_notify_cookie = tcp; 1531 1532 return ((void *)connp); 1533 } 1534 1535 /* 1536 * Handle connect to IPv4 destinations, including connections for AF_INET6 1537 * sockets connecting to IPv4 mapped IPv6 destinations. 1538 * Returns zero if OK, a positive errno, or a negative TLI error. 1539 */ 1540 static int 1541 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, 1542 uint_t srcid) 1543 { 1544 ipaddr_t dstaddr = *dstaddrp; 1545 uint16_t lport; 1546 conn_t *connp = tcp->tcp_connp; 1547 tcp_stack_t *tcps = tcp->tcp_tcps; 1548 int error; 1549 1550 ASSERT(connp->conn_ipversion == IPV4_VERSION); 1551 1552 /* Check for attempt to connect to INADDR_ANY */ 1553 if (dstaddr == INADDR_ANY) { 1554 /* 1555 * SunOS 4.x and 4.3 BSD allow an application 1556 * to connect a TCP socket to INADDR_ANY. 1557 * When they do this, the kernel picks the 1558 * address of one interface and uses it 1559 * instead. The kernel usually ends up 1560 * picking the address of the loopback 1561 * interface. This is an undocumented feature. 1562 * However, we provide the same thing here 1563 * in order to have source and binary 1564 * compatibility with SunOS 4.x. 
1565 * Update the T_CONN_REQ (sin/sin6) since it is used to 1566 * generate the T_CONN_CON. 1567 */ 1568 dstaddr = htonl(INADDR_LOOPBACK); 1569 *dstaddrp = dstaddr; 1570 } 1571 1572 /* Handle __sin6_src_id if socket not bound to an IP address */ 1573 if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) { 1574 if (!ip_srcid_find_id(srcid, &connp->conn_laddr_v6, 1575 IPCL_ZONEID(connp), B_TRUE, tcps->tcps_netstack)) { 1576 /* Mismatch - conn_laddr_v6 would be v6 address. */ 1577 return (EADDRNOTAVAIL); 1578 } 1579 connp->conn_saddr_v6 = connp->conn_laddr_v6; 1580 } 1581 1582 IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6); 1583 connp->conn_fport = dstport; 1584 1585 /* 1586 * At this point the remote destination address and remote port fields 1587 * in the tcp-four-tuple have been filled in the tcp structure. Now we 1588 * have to see which state tcp was in so we can take appropriate action. 1589 */ 1590 if (tcp->tcp_state == TCPS_IDLE) { 1591 /* 1592 * We support a quick connect capability here, allowing 1593 * clients to transition directly from IDLE to SYN_SENT 1594 * tcp_bindi will pick an unused port, insert the connection 1595 * in the bind hash and transition to BOUND state. 1596 */ 1597 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 1598 tcp, B_TRUE); 1599 lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, 1600 B_FALSE, B_FALSE); 1601 if (lport == 0) 1602 return (-TNOADDR); 1603 } 1604 1605 /* 1606 * Lookup the route to determine a source address and the uinfo. 1607 * Setup TCP parameters based on the metrics/DCE. 1608 */ 1609 error = tcp_set_destination(tcp); 1610 if (error != 0) 1611 return (error); 1612 1613 /* 1614 * Don't let an endpoint connect to itself. 
1615 */ 1616 if (connp->conn_faddr_v4 == connp->conn_laddr_v4 && 1617 connp->conn_fport == connp->conn_lport) 1618 return (-TBADADDR); 1619 1620 tcp->tcp_state = TCPS_SYN_SENT; 1621 1622 return (ipcl_conn_insert_v4(connp)); 1623 } 1624 1625 /* 1626 * Handle connect to IPv6 destinations. 1627 * Returns zero if OK, a positive errno, or a negative TLI error. 1628 */ 1629 static int 1630 tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, 1631 uint32_t flowinfo, uint_t srcid, uint32_t scope_id) 1632 { 1633 uint16_t lport; 1634 conn_t *connp = tcp->tcp_connp; 1635 tcp_stack_t *tcps = tcp->tcp_tcps; 1636 int error; 1637 1638 ASSERT(connp->conn_family == AF_INET6); 1639 1640 /* 1641 * If we're here, it means that the destination address is a native 1642 * IPv6 address. Return an error if conn_ipversion is not IPv6. A 1643 * reason why it might not be IPv6 is if the socket was bound to an 1644 * IPv4-mapped IPv6 address. 1645 */ 1646 if (connp->conn_ipversion != IPV6_VERSION) 1647 return (-TBADADDR); 1648 1649 /* 1650 * Interpret a zero destination to mean loopback. 1651 * Update the T_CONN_REQ (sin/sin6) since it is used to 1652 * generate the T_CONN_CON. 1653 */ 1654 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) 1655 *dstaddrp = ipv6_loopback; 1656 1657 /* Handle __sin6_src_id if socket not bound to an IP address */ 1658 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1659 if (!ip_srcid_find_id(srcid, &connp->conn_laddr_v6, 1660 IPCL_ZONEID(connp), B_FALSE, tcps->tcps_netstack)) { 1661 /* Mismatch - conn_laddr_v6 would be v4-mapped. */ 1662 return (EADDRNOTAVAIL); 1663 } 1664 connp->conn_saddr_v6 = connp->conn_laddr_v6; 1665 } 1666 1667 /* 1668 * Take care of the scope_id now. 
1669 */ 1670 if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { 1671 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 1672 connp->conn_ixa->ixa_scopeid = scope_id; 1673 } else { 1674 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 1675 } 1676 1677 connp->conn_flowinfo = flowinfo; 1678 connp->conn_faddr_v6 = *dstaddrp; 1679 connp->conn_fport = dstport; 1680 1681 /* 1682 * At this point the remote destination address and remote port fields 1683 * in the tcp-four-tuple have been filled in the tcp structure. Now we 1684 * have to see which state tcp was in so we can take appropriate action. 1685 */ 1686 if (tcp->tcp_state == TCPS_IDLE) { 1687 /* 1688 * We support a quick connect capability here, allowing 1689 * clients to transition directly from IDLE to SYN_SENT 1690 * tcp_bindi will pick an unused port, insert the connection 1691 * in the bind hash and transition to BOUND state. 1692 */ 1693 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 1694 tcp, B_TRUE); 1695 lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, 1696 B_FALSE, B_FALSE); 1697 if (lport == 0) 1698 return (-TNOADDR); 1699 } 1700 1701 /* 1702 * Lookup the route to determine a source address and the uinfo. 1703 * Setup TCP parameters based on the metrics/DCE. 1704 */ 1705 error = tcp_set_destination(tcp); 1706 if (error != 0) 1707 return (error); 1708 1709 /* 1710 * Don't let an endpoint connect to itself. 1711 */ 1712 if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) && 1713 connp->conn_fport == connp->conn_lport) 1714 return (-TBADADDR); 1715 1716 tcp->tcp_state = TCPS_SYN_SENT; 1717 1718 return (ipcl_conn_insert_v6(connp)); 1719 } 1720 1721 /* 1722 * Disconnect 1723 * Note that unlike other functions this returns a positive tli error 1724 * when it fails; it never returns an errno. 
1725 */ 1726 static int 1727 tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) 1728 { 1729 conn_t *lconnp; 1730 tcp_stack_t *tcps = tcp->tcp_tcps; 1731 conn_t *connp = tcp->tcp_connp; 1732 1733 /* 1734 * Right now, upper modules pass down a T_DISCON_REQ to TCP, 1735 * when the stream is in BOUND state. Do not send a reset, 1736 * since the destination IP address is not valid, and it can 1737 * be the initialized value of all zeros (broadcast address). 1738 */ 1739 if (tcp->tcp_state <= TCPS_BOUND) { 1740 if (connp->conn_debug) { 1741 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 1742 "tcp_disconnect: bad state, %d", tcp->tcp_state); 1743 } 1744 return (TOUTSTATE); 1745 } else if (tcp->tcp_state >= TCPS_ESTABLISHED) { 1746 TCPS_CONN_DEC(tcps); 1747 } 1748 1749 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { 1750 1751 /* 1752 * According to TPI, for non-listeners, ignore seqnum 1753 * and disconnect. 1754 * Following interpretation of -1 seqnum is historical 1755 * and implied TPI ? (TPI only states that for T_CONN_IND, 1756 * a valid seqnum should not be -1). 1757 * 1758 * -1 means disconnect everything 1759 * regardless even on a listener. 1760 */ 1761 1762 int old_state = tcp->tcp_state; 1763 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 1764 1765 /* 1766 * The connection can't be on the tcp_time_wait_head list 1767 * since it is not detached. 1768 */ 1769 ASSERT(tcp->tcp_time_wait_next == NULL); 1770 ASSERT(tcp->tcp_time_wait_prev == NULL); 1771 ASSERT(tcp->tcp_time_wait_expire == 0); 1772 /* 1773 * If it used to be a listener, check to make sure no one else 1774 * has taken the port before switching back to LISTEN state. 
1775 */ 1776 if (connp->conn_ipversion == IPV4_VERSION) { 1777 lconnp = ipcl_lookup_listener_v4(connp->conn_lport, 1778 connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst); 1779 } else { 1780 uint_t ifindex = 0; 1781 1782 if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) 1783 ifindex = connp->conn_ixa->ixa_scopeid; 1784 1785 /* Allow conn_bound_if listeners? */ 1786 lconnp = ipcl_lookup_listener_v6(connp->conn_lport, 1787 &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp), 1788 ipst); 1789 } 1790 if (tcp->tcp_conn_req_max && lconnp == NULL) { 1791 tcp->tcp_state = TCPS_LISTEN; 1792 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 1793 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, 1794 NULL, int32_t, old_state); 1795 } else if (old_state > TCPS_BOUND) { 1796 tcp->tcp_conn_req_max = 0; 1797 tcp->tcp_state = TCPS_BOUND; 1798 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 1799 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, 1800 NULL, int32_t, old_state); 1801 1802 /* 1803 * If this end point is not going to become a listener, 1804 * decrement the listener connection count if 1805 * necessary. Note that we do not do this if it is 1806 * going to be a listner (the above if case) since 1807 * then it may remove the counter struct. 
1808 */ 1809 if (tcp->tcp_listen_cnt != NULL) 1810 TCP_DECR_LISTEN_CNT(tcp); 1811 } 1812 if (lconnp != NULL) 1813 CONN_DEC_REF(lconnp); 1814 switch (old_state) { 1815 case TCPS_SYN_SENT: 1816 case TCPS_SYN_RCVD: 1817 TCPS_BUMP_MIB(tcps, tcpAttemptFails); 1818 break; 1819 case TCPS_ESTABLISHED: 1820 case TCPS_CLOSE_WAIT: 1821 TCPS_BUMP_MIB(tcps, tcpEstabResets); 1822 break; 1823 } 1824 1825 if (tcp->tcp_fused) 1826 tcp_unfuse(tcp); 1827 1828 mutex_enter(&tcp->tcp_eager_lock); 1829 if ((tcp->tcp_conn_req_cnt_q0 != 0) || 1830 (tcp->tcp_conn_req_cnt_q != 0)) { 1831 tcp_eager_cleanup(tcp, 0); 1832 } 1833 mutex_exit(&tcp->tcp_eager_lock); 1834 1835 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt, 1836 tcp->tcp_rnxt, TH_RST | TH_ACK); 1837 1838 tcp_reinit(tcp); 1839 1840 return (0); 1841 } else if (!tcp_eager_blowoff(tcp, seqnum)) { 1842 return (TBADSEQ); 1843 } 1844 return (0); 1845 } 1846 1847 /* 1848 * Our client hereby directs us to reject the connection request 1849 * that tcp_input_listener() marked with 'seqnum'. Rejection consists 1850 * of sending the appropriate RST, not an ICMP error. 1851 */ 1852 void 1853 tcp_disconnect(tcp_t *tcp, mblk_t *mp) 1854 { 1855 t_scalar_t seqnum; 1856 int error; 1857 conn_t *connp = tcp->tcp_connp; 1858 1859 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 1860 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { 1861 tcp_err_ack(tcp, mp, TPROTO, 0); 1862 return; 1863 } 1864 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; 1865 error = tcp_disconnect_common(tcp, seqnum); 1866 if (error != 0) 1867 tcp_err_ack(tcp, mp, error, 0); 1868 else { 1869 if (tcp->tcp_state >= TCPS_ESTABLISHED) { 1870 /* Send M_FLUSH according to TPI */ 1871 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 1872 } 1873 mp = mi_tpi_ok_ack_alloc(mp); 1874 if (mp != NULL) 1875 putnext(connp->conn_rq, mp); 1876 } 1877 } 1878 1879 /* 1880 * Handle reinitialization of a tcp structure. 
 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
 *
 * Stops the timers, frees the queued transmit, reassembly, receive and
 * urgent-data mblks, removes the conn from the classifier hash and
 * restores the local/source addresses from conn_bound_addr_v6.  An
 * endpoint with a non-zero tcp_conn_req_max goes back to TCPS_LISTEN and
 * is re-inserted with ipcl_bind_insert(); any other endpoint ends up in
 * TCPS_BOUND.
 */
static void
tcp_reinit(tcp_t *tcp)
{
	mblk_t		*mp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;
	int32_t		oldstate;

	/* tcp_reinit should never be called for detached tcp_t's */
	ASSERT(tcp->tcp_listener == NULL);
	ASSERT((connp->conn_family == AF_INET &&
	    connp->conn_ipversion == IPV4_VERSION) ||
	    (connp->conn_family == AF_INET6 &&
	    (connp->conn_ipversion == IPV4_VERSION ||
	    connp->conn_ipversion == IPV6_VERSION)));

	/* Cancel outstanding timers */
	tcp_timers_stop(tcp);

	/* Discard everything still queued for transmission. */
	tcp_close_mpp(&tcp->tcp_xmit_head);
	if (tcp->tcp_snd_zcopy_aware)
		tcp_zcopy_notify(tcp);
	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
		tcp_clrqfull(tcp);
	}
	mutex_exit(&tcp->tcp_non_sq_lock);
	tcp_close_mpp(&tcp->tcp_reass_head);
	tcp->tcp_reass_tail = NULL;
	if (tcp->tcp_rcv_list != NULL) {
		/* Free b_next chain */
		tcp_close_mpp(&tcp->tcp_rcv_list);
		tcp->tcp_rcv_last_head = NULL;
		tcp->tcp_rcv_last_tail = NULL;
		tcp->tcp_rcv_cnt = 0;
	}
	tcp->tcp_rcv_last_tail = NULL;

	if ((mp = tcp->tcp_urp_mp) != NULL) {
		freemsg(mp);
		tcp->tcp_urp_mp = NULL;
	}
	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
		freemsg(mp);
		tcp->tcp_urp_mark_mp = NULL;
	}
	if (tcp->tcp_fused_sigurg_mp != NULL) {
		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
		freeb(tcp->tcp_fused_sigurg_mp);
		tcp->tcp_fused_sigurg_mp = NULL;
	}
	if (tcp->tcp_ordrel_mp != NULL) {
		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
		freeb(tcp->tcp_ordrel_mp);
		tcp->tcp_ordrel_mp = NULL;
	}

	/*
	 * Following is a union with two members which are
	 * identical types and size so the following cleanup
	 * is enough.
	 */
	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);

	CL_INET_DISCONNECT(connp);

	/*
	 * The connection can't be on the tcp_time_wait_head list
	 * since it is not detached.
	 */
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);

	/*
	 * Reset/preserve other values
	 */
	tcp_reinit_values(tcp);
	ipcl_hash_remove(connp);
	/* Note that ixa_cred gets cleared in ixa_cleanup */
	ixa_cleanup(connp->conn_ixa);
	tcp_ipsec_cleanup(tcp);

	/* Fall back to the address the endpoint was originally bound to. */
	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
	connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
	/* Saved for the state-change probe fired further down. */
	oldstate = tcp->tcp_state;

	if (tcp->tcp_conn_req_max != 0) {
		/*
		 * This is the case when a TLI program uses the same
		 * transport end point to accept a connection.  This
		 * makes the TCP both a listener and acceptor.  When
		 * this connection is closed, we need to set the state
		 * back to TCPS_LISTEN.  Make sure that the eager list
		 * is reinitialized.
		 *
		 * Note that this stream is still bound to the four
		 * tuples of the previous connection in IP.  If a new
		 * SYN with different foreign address comes in, IP will
		 * not find it and will send it to the global queue.  In
		 * the global queue, TCP will do a tcp_lookup_listener()
		 * to find this stream.  This works because this stream
		 * is only removed from connected hash.
		 *
		 */
		tcp->tcp_state = TCPS_LISTEN;
		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
		tcp->tcp_eager_next_drop_q0 = tcp;
		tcp->tcp_eager_prev_drop_q0 = tcp;
		/*
		 * Initially set conn_recv to tcp_input_listener_unbound to try
		 * to pick a good squeue for the listener when the first SYN
		 * arrives. tcp_input_listener_unbound sets it to
		 * tcp_input_listener on that first SYN.
		 */
		connp->conn_recv = tcp_input_listener_unbound;

		connp->conn_proto = IPPROTO_TCP;
		connp->conn_faddr_v6 = ipv6_all_zeros;
		connp->conn_fport = 0;

		(void) ipcl_bind_insert(connp);
	} else {
		tcp->tcp_state = TCPS_BOUND;
	}

	/*
	 * Initialize to default values
	 */
	tcp_init_values(tcp, NULL);

	DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
	    connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
	    int32_t, oldstate);

	ASSERT(tcp->tcp_ptpbhn != NULL);
	tcp->tcp_rwnd = connp->conn_rcvbuf;
	/* Default MSS depends on the IP version of the endpoint. */
	tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ?
	    tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
}

/*
 * Force values to zero that need be zero.
 * Do not touch values asociated with the BOUND or LISTEN state
 * since the connection will end up in that state after the reinit.
 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
 * structure!
 */
static void
tcp_reinit_values(tcp_t *tcp)
{
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

#ifndef	lint
#define	DONTCARE(x)
#define	PRESERVE(x)
#else
#define	DONTCARE(x)	((x) = (x))
#define	PRESERVE(x)	((x) = (x))
#endif	/* lint */

	PRESERVE(tcp->tcp_bind_hash_port);
	PRESERVE(tcp->tcp_bind_hash);
	PRESERVE(tcp->tcp_ptpbhn);
	PRESERVE(tcp->tcp_acceptor_hash);
	PRESERVE(tcp->tcp_ptpahn);

	/* Should be ASSERT NULL on these with new code!
*/ 2055 ASSERT(tcp->tcp_time_wait_next == NULL); 2056 ASSERT(tcp->tcp_time_wait_prev == NULL); 2057 ASSERT(tcp->tcp_time_wait_expire == 0); 2058 PRESERVE(tcp->tcp_state); 2059 PRESERVE(connp->conn_rq); 2060 PRESERVE(connp->conn_wq); 2061 2062 ASSERT(tcp->tcp_xmit_head == NULL); 2063 ASSERT(tcp->tcp_xmit_last == NULL); 2064 ASSERT(tcp->tcp_unsent == 0); 2065 ASSERT(tcp->tcp_xmit_tail == NULL); 2066 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 2067 2068 tcp->tcp_snxt = 0; /* Displayed in mib */ 2069 tcp->tcp_suna = 0; /* Displayed in mib */ 2070 tcp->tcp_swnd = 0; 2071 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ 2072 2073 if (connp->conn_ht_iphc != NULL) { 2074 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 2075 connp->conn_ht_iphc = NULL; 2076 connp->conn_ht_iphc_allocated = 0; 2077 connp->conn_ht_iphc_len = 0; 2078 connp->conn_ht_ulp = NULL; 2079 connp->conn_ht_ulp_len = 0; 2080 tcp->tcp_ipha = NULL; 2081 tcp->tcp_ip6h = NULL; 2082 tcp->tcp_tcpha = NULL; 2083 } 2084 2085 /* We clear any IP_OPTIONS and extension headers */ 2086 ip_pkt_free(&connp->conn_xmit_ipp); 2087 2088 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ 2089 DONTCARE(tcp->tcp_ipha); 2090 DONTCARE(tcp->tcp_ip6h); 2091 DONTCARE(tcp->tcp_tcpha); 2092 tcp->tcp_valid_bits = 0; 2093 2094 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ 2095 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ 2096 tcp->tcp_last_rcv_lbolt = 0; 2097 2098 tcp->tcp_init_cwnd = 0; 2099 2100 tcp->tcp_urp_last_valid = 0; 2101 tcp->tcp_hard_binding = 0; 2102 2103 tcp->tcp_fin_acked = 0; 2104 tcp->tcp_fin_rcvd = 0; 2105 tcp->tcp_fin_sent = 0; 2106 tcp->tcp_ordrel_done = 0; 2107 2108 tcp->tcp_detached = 0; 2109 2110 tcp->tcp_snd_ws_ok = B_FALSE; 2111 tcp->tcp_snd_ts_ok = B_FALSE; 2112 tcp->tcp_zero_win_probe = 0; 2113 2114 tcp->tcp_loopback = 0; 2115 tcp->tcp_localnet = 0; 2116 tcp->tcp_syn_defense = 0; 2117 tcp->tcp_set_timer = 0; 2118 2119 tcp->tcp_active_open = 0; 
2120 tcp->tcp_rexmit = B_FALSE; 2121 tcp->tcp_xmit_zc_clean = B_FALSE; 2122 2123 tcp->tcp_snd_sack_ok = B_FALSE; 2124 tcp->tcp_hwcksum = B_FALSE; 2125 2126 DONTCARE(tcp->tcp_maxpsz_multiplier); /* Init in tcp_init_values */ 2127 2128 tcp->tcp_conn_def_q0 = 0; 2129 tcp->tcp_ip_forward_progress = B_FALSE; 2130 tcp->tcp_ecn_ok = B_FALSE; 2131 2132 tcp->tcp_cwr = B_FALSE; 2133 tcp->tcp_ecn_echo_on = B_FALSE; 2134 tcp->tcp_is_wnd_shrnk = B_FALSE; 2135 2136 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 2137 bzero(&tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 2138 2139 tcp->tcp_rcv_ws = 0; 2140 tcp->tcp_snd_ws = 0; 2141 tcp->tcp_ts_recent = 0; 2142 tcp->tcp_rnxt = 0; /* Displayed in mib */ 2143 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ 2144 tcp->tcp_initial_pmtu = 0; 2145 2146 ASSERT(tcp->tcp_reass_head == NULL); 2147 ASSERT(tcp->tcp_reass_tail == NULL); 2148 2149 tcp->tcp_cwnd_cnt = 0; 2150 2151 ASSERT(tcp->tcp_rcv_list == NULL); 2152 ASSERT(tcp->tcp_rcv_last_head == NULL); 2153 ASSERT(tcp->tcp_rcv_last_tail == NULL); 2154 ASSERT(tcp->tcp_rcv_cnt == 0); 2155 2156 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */ 2157 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ 2158 tcp->tcp_csuna = 0; 2159 2160 tcp->tcp_rto = 0; /* Displayed in MIB */ 2161 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ 2162 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ 2163 tcp->tcp_rtt_update = 0; 2164 tcp->tcp_rtt_sum = 0; 2165 tcp->tcp_rtt_cnt = 0; 2166 2167 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 2168 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 2169 2170 tcp->tcp_rack = 0; /* Displayed in mib */ 2171 tcp->tcp_rack_cnt = 0; 2172 tcp->tcp_rack_cur_max = 0; 2173 tcp->tcp_rack_abs_max = 0; 2174 2175 tcp->tcp_max_swnd = 0; 2176 2177 ASSERT(tcp->tcp_listener == NULL); 2178 2179 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ 2180 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits 
cleared */ 2181 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ 2182 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */ 2183 2184 ASSERT(tcp->tcp_conn_req_cnt_q == 0); 2185 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0); 2186 PRESERVE(tcp->tcp_conn_req_max); 2187 PRESERVE(tcp->tcp_conn_req_seqnum); 2188 2189 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ 2190 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ 2191 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ 2192 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ 2193 2194 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ 2195 ASSERT(tcp->tcp_urp_mp == NULL); 2196 ASSERT(tcp->tcp_urp_mark_mp == NULL); 2197 ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 2198 2199 ASSERT(tcp->tcp_eager_next_q == NULL); 2200 ASSERT(tcp->tcp_eager_last_q == NULL); 2201 ASSERT((tcp->tcp_eager_next_q0 == NULL && 2202 tcp->tcp_eager_prev_q0 == NULL) || 2203 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); 2204 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 2205 2206 ASSERT((tcp->tcp_eager_next_drop_q0 == NULL && 2207 tcp->tcp_eager_prev_drop_q0 == NULL) || 2208 tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0); 2209 2210 DONTCARE(tcp->tcp_ka_rinterval); /* Init in tcp_init_values */ 2211 DONTCARE(tcp->tcp_ka_abort_thres); /* Init in tcp_init_values */ 2212 DONTCARE(tcp->tcp_ka_cnt); /* Init in tcp_init_values */ 2213 2214 tcp->tcp_client_errno = 0; 2215 2216 DONTCARE(connp->conn_sum); /* Init in tcp_init_values */ 2217 2218 connp->conn_faddr_v6 = ipv6_all_zeros; /* Displayed in MIB */ 2219 2220 PRESERVE(connp->conn_bound_addr_v6); 2221 tcp->tcp_last_sent_len = 0; 2222 tcp->tcp_dupack_cnt = 0; 2223 2224 connp->conn_fport = 0; /* Displayed in MIB */ 2225 PRESERVE(connp->conn_lport); 2226 2227 PRESERVE(tcp->tcp_acceptor_lockp); 2228 2229 ASSERT(tcp->tcp_ordrel_mp == NULL); 2230 PRESERVE(tcp->tcp_acceptor_id); 2231 
DONTCARE(tcp->tcp_ipsec_overhead); 2232 2233 PRESERVE(connp->conn_family); 2234 /* Remove any remnants of mapped address binding */ 2235 if (connp->conn_family == AF_INET6) { 2236 connp->conn_ipversion = IPV6_VERSION; 2237 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 2238 } else { 2239 connp->conn_ipversion = IPV4_VERSION; 2240 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 2241 } 2242 2243 connp->conn_bound_if = 0; 2244 connp->conn_recv_ancillary.crb_all = 0; 2245 tcp->tcp_recvifindex = 0; 2246 tcp->tcp_recvhops = 0; 2247 tcp->tcp_closed = 0; 2248 if (tcp->tcp_hopopts != NULL) { 2249 mi_free(tcp->tcp_hopopts); 2250 tcp->tcp_hopopts = NULL; 2251 tcp->tcp_hopoptslen = 0; 2252 } 2253 ASSERT(tcp->tcp_hopoptslen == 0); 2254 if (tcp->tcp_dstopts != NULL) { 2255 mi_free(tcp->tcp_dstopts); 2256 tcp->tcp_dstopts = NULL; 2257 tcp->tcp_dstoptslen = 0; 2258 } 2259 ASSERT(tcp->tcp_dstoptslen == 0); 2260 if (tcp->tcp_rthdrdstopts != NULL) { 2261 mi_free(tcp->tcp_rthdrdstopts); 2262 tcp->tcp_rthdrdstopts = NULL; 2263 tcp->tcp_rthdrdstoptslen = 0; 2264 } 2265 ASSERT(tcp->tcp_rthdrdstoptslen == 0); 2266 if (tcp->tcp_rthdr != NULL) { 2267 mi_free(tcp->tcp_rthdr); 2268 tcp->tcp_rthdr = NULL; 2269 tcp->tcp_rthdrlen = 0; 2270 } 2271 ASSERT(tcp->tcp_rthdrlen == 0); 2272 2273 /* Reset fusion-related fields */ 2274 tcp->tcp_fused = B_FALSE; 2275 tcp->tcp_unfusable = B_FALSE; 2276 tcp->tcp_fused_sigurg = B_FALSE; 2277 tcp->tcp_loopback_peer = NULL; 2278 2279 tcp->tcp_lso = B_FALSE; 2280 2281 tcp->tcp_in_ack_unsent = 0; 2282 tcp->tcp_cork = B_FALSE; 2283 tcp->tcp_tconnind_started = B_FALSE; 2284 2285 PRESERVE(tcp->tcp_squeue_bytes); 2286 2287 tcp->tcp_closemp_used = B_FALSE; 2288 2289 PRESERVE(tcp->tcp_rsrv_mp); 2290 PRESERVE(tcp->tcp_rsrv_mp_lock); 2291 2292 #ifdef DEBUG 2293 DONTCARE(tcp->tcmp_stk[0]); 2294 #endif 2295 2296 PRESERVE(tcp->tcp_connid); 2297 2298 ASSERT(tcp->tcp_listen_cnt == NULL); 2299 ASSERT(tcp->tcp_reass_tid == 0); 2300 2301 #undef DONTCARE 2302 #undef PRESERVE 2303 } 2304 2305 /* 
 * Initialize the various fields in tcp_t.  If parent (the listener) is non
 * NULL, certain values will be inherited from it.
 *
 * tcp	  - endpoint being initialized.
 * parent - listener to inherit from, or NULL to take the defaults from the
 *	    endpoint's tcp_stack_t (tcp->tcp_tcps).
 */
void
tcp_init_values(tcp_t *tcp, tcp_t *parent)
{
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

	ASSERT((connp->conn_family == AF_INET &&
	    connp->conn_ipversion == IPV4_VERSION) ||
	    (connp->conn_family == AF_INET6 &&
	    (connp->conn_ipversion == IPV4_VERSION ||
	    connp->conn_ipversion == IPV6_VERSION)));

	if (parent == NULL) {
		tcp->tcp_naglim = tcps->tcps_naglim_def;

		tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
		tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
		tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;

		tcp->tcp_first_ctimer_threshold =
		    tcps->tcps_ip_notify_cinterval;
		tcp->tcp_second_ctimer_threshold =
		    tcps->tcps_ip_abort_cinterval;
		tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
		tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;

		tcp->tcp_fin_wait_2_flush_interval =
		    tcps->tcps_fin_wait_2_flush_interval;

		tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
		tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
		tcp->tcp_ka_cnt = 0;
		tcp->tcp_ka_rinterval = 0;

		/*
		 * Default value of tcp_init_cwnd is 0, so no need to set here
		 * if parent is NULL.  But we need to inherit it from parent.
		 */
	} else {
		/* Inherit various TCP parameters from the parent. */
		tcp->tcp_naglim = parent->tcp_naglim;

		tcp->tcp_rto_initial = parent->tcp_rto_initial;
		tcp->tcp_rto_min = parent->tcp_rto_min;
		tcp->tcp_rto_max = parent->tcp_rto_max;

		tcp->tcp_first_ctimer_threshold =
		    parent->tcp_first_ctimer_threshold;
		tcp->tcp_second_ctimer_threshold =
		    parent->tcp_second_ctimer_threshold;
		tcp->tcp_first_timer_threshold =
		    parent->tcp_first_timer_threshold;
		tcp->tcp_second_timer_threshold =
		    parent->tcp_second_timer_threshold;

		tcp->tcp_fin_wait_2_flush_interval =
		    parent->tcp_fin_wait_2_flush_interval;

		tcp->tcp_ka_interval = parent->tcp_ka_interval;
		tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
		tcp->tcp_ka_cnt = parent->tcp_ka_cnt;
		tcp->tcp_ka_rinterval = parent->tcp_ka_rinterval;

		tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
	}

	/*
	 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
	 * will be close to tcp_rexmit_interval_initial.  By doing this, we
	 * allow the algorithm to adjust slowly to large fluctuations of RTT
	 * during first few transmissions of a connection as seen in slow
	 * links.
	 */
	tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
	tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
	tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
	    tcps->tcps_conn_grace_period);

	tcp->tcp_timer_backoff = 0;
	tcp->tcp_ms_we_have_waited = 0;
	tcp->tcp_last_recv_time = ddi_get_lbolt();
	tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
	tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;

	tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;

	/* NOTE: ISS is now set in tcp_set_destination(). */

	/* Reset fusion-related fields */
	tcp->tcp_fused = B_FALSE;
	tcp->tcp_unfusable = B_FALSE;
	tcp->tcp_fused_sigurg = B_FALSE;
	tcp->tcp_loopback_peer = NULL;

	/* We rebuild the header template on the next connect/conn_request */

	connp->conn_mlp_type = mlptSingle;

	/*
	 * Init the window scale to the max so tcp_rwnd_set() won't pare
	 * down tcp_rwnd. tcp_set_destination() will set the right value later.
	 */
	tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
	tcp->tcp_rwnd = connp->conn_rcvbuf;

	tcp->tcp_cork = B_FALSE;
	/*
	 * Init the tcp_debug option if it wasn't already set.  This value
	 * determines whether TCP calls strlog() to print out debug
	 * messages.  Doing this initialization here means that this value
	 * is not inherited thru tcp_reinit().
	 */
	if (!connp->conn_debug)
		connp->conn_debug = tcps->tcps_dbg;
}

/*
 * Update the TCP connection according to change of PMTU.
 *
 * Path MTU might have changed by either increase or decrease, so need to
 * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
 * or negative MSS, since tcp_mss_set() will do it.
 *
 * Nothing is done when tcps_ignore_path_mtu is set or while tcp_state is
 * below TCPS_ESTABLISHED.  When decrease_only is set, a computed MSS that
 * is larger than the current tcp_mss is ignored.
 */
void
tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
{
	uint32_t	pmtu;
	int32_t		mss;
	conn_t		*connp = tcp->tcp_connp;
	ip_xmit_attr_t	*ixa = connp->conn_ixa;
	iaflags_t	ixaflags;

	if (tcp->tcp_tcps->tcps_ignore_path_mtu)
		return;

	if (tcp->tcp_state < TCPS_ESTABLISHED)
		return;

	/*
	 * Always call ip_get_pmtu() to make sure that IP has updated
	 * ixa_flags properly.
	 */
	pmtu = ip_get_pmtu(ixa);
	/* Work on a local copy; it is written back at the very end. */
	ixaflags = ixa->ixa_flags;

	/*
	 * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and
	 * IPsec overhead if applied. Make sure to use the most recent
	 * IPsec information.
	 */
	mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp);

	/*
	 * Nothing to change, so just return.
	 */
	if (mss == tcp->tcp_mss)
		return;

	/*
	 * Currently, for ICMP errors, only PMTU decrease is handled.
	 */
	if (mss > tcp->tcp_mss && decrease_only)
		return;

	DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);

	/*
	 * Update ixa_fragsize and ixa_pmtu.
	 */
	ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;

	/*
	 * Adjust MSS and all relevant variables.
	 */
	tcp_mss_set(tcp, mss);

	/*
	 * If the PMTU is below the min size maintained by IP, then ip_get_pmtu
	 * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP
	 * has a (potentially different) min size we do the same. Make sure to
	 * clear IXAF_DONTFRAG, which is used by IP to decide whether to
	 * fragment the packet.
	 *
	 * LSO over IPv6 can not be fragmented. So need to disable LSO
	 * when IPv6 fragmentation is needed.
	 */
	if (mss < tcp->tcp_tcps->tcps_mss_min)
		ixaflags |= IXAF_PMTU_TOO_SMALL;

	if (ixaflags & IXAF_PMTU_TOO_SMALL)
		ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);

	/* For IPv4 without IXAF_PMTU_IPV4_DF, clear the template's flags. */
	if ((connp->conn_ipversion == IPV4_VERSION) &&
	    !(ixaflags & IXAF_PMTU_IPV4_DF)) {
		tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
	}
	ixa->ixa_flags = ixaflags;
}

/*
 * Compute the maximum packet size for the endpoint, hand it to
 * proto_set_maxpsz() (and store it in conn_wq->q_maxpsz unless the conn is
 * IPCL_IS_NONSTR()), and, when set_maxblk is true, also call
 * proto_set_tx_maxblk().
 *
 * Returns the value that is (or would be) passed to proto_set_tx_maxblk():
 * INFPSZ for a fused connection or when tcp_maxpsz_multiplier is 0,
 * otherwise tcp_mss.  A detached tcp_t is left untouched and tcp_mss is
 * returned.
 */
int
tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
{
	conn_t		*connp = tcp->tcp_connp;
	queue_t		*q = connp->conn_rq;
	int32_t		mss = tcp->tcp_mss;
	int		maxpsz;

	if (TCP_IS_DETACHED(tcp))
		return (mss);
	if (tcp->tcp_fused) {
		maxpsz = tcp_fuse_maxpsz(tcp);
		mss = INFPSZ;
	} else if (tcp->tcp_maxpsz_multiplier == 0) {
		/*
		 * Set the sd_qn_maxpsz according to the socket send buffer
		 * size, and sd_maxblk to INFPSZ (-1).  This will essentially
		 * instruct the stream head to copyin user data into contiguous
		 * kernel-allocated buffers without breaking it up into smaller
		 * chunks.  We round up the buffer size to the nearest SMSS.
		 */
		maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss);
		mss = INFPSZ;
	} else {
		/*
		 * Set sd_qn_maxpsz to approx half the (receivers) buffer
		 * (and a multiple of the mss).  This instructs the stream
		 * head to break down larger than SMSS writes into SMSS-
		 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
		 */
		maxpsz = tcp->tcp_maxpsz_multiplier * mss;
		if (maxpsz > connp->conn_sndbuf / 2) {
			maxpsz = connp->conn_sndbuf / 2;
			/* Round up to nearest mss */
			maxpsz = MSS_ROUNDUP(maxpsz, mss);
		}
	}

	(void) proto_set_maxpsz(q, connp, maxpsz);
	if (!(IPCL_IS_NONSTR(connp)))
		connp->conn_wq->q_maxpsz = maxpsz;
	if (set_maxblk)
		(void) proto_set_tx_maxblk(q, connp, mss);
	return (mss);
}

/* For /dev/tcp aka AF_INET open */
static int
tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	return (tcp_open(q, devp, flag, sflag, credp, B_FALSE));
}

/* For /dev/tcp6 aka AF_INET6 open */
static int
tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
}

/*
 * Allocate (via tcp_get_conn()) and initialize the conn_t/tcp_t pair for a
 * new TCP endpoint.
 *
 * credp    - credentials of the opener; a hold is taken with crhold() and
 *	      the pointer is stored in conn_cred (and cached, without an
 *	      extra hold, in the ixa).
 * isv6	    - B_TRUE selects the AF_INET6/IPV6_VERSION defaults, otherwise
 *	      AF_INET/IPV4_VERSION.
 * issocket - when non-zero, tcp_issocket is set.
 * errorp   - must be non-NULL; written only on failure.
 *
 * Returns the new conn_t with its tcp_t in TCPS_IDLE, or NULL with *errorp
 * set to the secpolicy_basic_net_access() error or to ENOSR when
 * tcp_get_conn() fails.
 */
conn_t *
tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
    int *errorp)
{
	tcp_t		*tcp = NULL;
	conn_t		*connp;
	zoneid_t	zoneid;
	tcp_stack_t	*tcps;
	squeue_t	*sqp;

	ASSERT(errorp != NULL);
	/*
	 * Find the proper zoneid and netstack.
	 */
	/*
	 * Special case for install: miniroot needs to be able to
	 * access files via NFS as though it were always in the
	 * global zone.
	 */
	if (credp == kcred && nfs_global_client_only != 0) {
		zoneid = GLOBAL_ZONEID;
		tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
		    netstack_tcp;
		ASSERT(tcps != NULL);
	} else {
		netstack_t *ns;
		int err;

		if ((err = secpolicy_basic_net_access(credp)) != 0) {
			*errorp = err;
			return (NULL);
		}

		ns = netstack_find_by_cred(credp);
		ASSERT(ns != NULL);
		tcps = ns->netstack_tcp;
		ASSERT(tcps != NULL);

		/*
		 * For exclusive stacks we set the zoneid to zero
		 * to make TCP operate as if in the global zone.
		 */
		if (tcps->tcps_netstack->netstack_stackid !=
		    GLOBAL_NETSTACKID)
			zoneid = GLOBAL_ZONEID;
		else
			zoneid = crgetzoneid(credp);
	}

	/* The squeue is selected from the current high-resolution time. */
	sqp = IP_SQUEUE_GET((uint_t)gethrtime());
	connp = (conn_t *)tcp_get_conn(sqp, tcps);
	/*
	 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
	 * so we drop it by one.
	 */
	netstack_rele(tcps->tcps_netstack);
	if (connp == NULL) {
		*errorp = ENOSR;
		return (NULL);
	}
	ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);

	connp->conn_sqp = sqp;
	connp->conn_initial_sqp = connp->conn_sqp;
	connp->conn_ixa->ixa_sqp = connp->conn_sqp;
	tcp = connp->conn_tcp;

	/*
	 * Besides asking IP to set the checksum for us, have conn_ip_output
	 * to do the following checks when necessary:
	 *
	 * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid
	 * IXAF_VERIFY_PMTU: verify PMTU changes
	 * IXAF_VERIFY_LSO: verify LSO capability changes
	 */
	connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
	    IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO;

	if (!tcps->tcps_dev_flow_ctl)
		connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;

	/* Per-address-family defaults: version, family, MSS and TTL. */
	if (isv6) {
		connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
		connp->conn_ipversion = IPV6_VERSION;
		connp->conn_family = AF_INET6;
		tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
		connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit;
	} else {
		connp->conn_ipversion = IPV4_VERSION;
		connp->conn_family = AF_INET;
		tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
		connp->conn_default_ttl = tcps->tcps_ipv4_ttl;
	}
	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;

	crhold(credp);
	connp->conn_cred = credp;
	connp->conn_cpid = curproc->p_pid;
	connp->conn_open_time = ddi_get_lbolt64();

	/* Cache things in the ixa without any refhold */
	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
	connp->conn_ixa->ixa_cred = credp;
	connp->conn_ixa->ixa_cpid = connp->conn_cpid;

	connp->conn_zoneid = zoneid;
	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
	connp->conn_ixa->ixa_zoneid = zoneid;
	connp->conn_mlp_type = mlptSingle;
	ASSERT(connp->conn_netstack == tcps->tcps_netstack);
	ASSERT(tcp->tcp_tcps == tcps);

	/*
	 * If the caller has the process-wide flag set, then default to MAC
	 * exempt mode.  This allows read-down to unlabeled hosts.
	 */
	if (getpflags(NET_MAC_AWARE, credp) != 0)
		connp->conn_mac_mode = CONN_MAC_AWARE;

	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);

	if (issocket) {
		tcp->tcp_issocket = 1;
	}

	connp->conn_rcvbuf = tcps->tcps_recv_hiwat;
	connp->conn_sndbuf = tcps->tcps_xmit_hiwat;
	/* A zero fraction selects the fixed tcps_xmit_lowat instead. */
	if (tcps->tcps_snd_lowat_fraction != 0) {
		connp->conn_sndlowat = connp->conn_sndbuf /
		    tcps->tcps_snd_lowat_fraction;
	} else {
		connp->conn_sndlowat = tcps->tcps_xmit_lowat;
	}
	connp->conn_so_type = SOCK_STREAM;
	connp->conn_wroff = connp->conn_ht_iphc_allocated +
	    tcps->tcps_wroff_xtra;

	SOCK_CONNID_INIT(tcp->tcp_connid);
	/* DTrace ignores this - it isn't a tcp:::state-change */
	tcp->tcp_state = TCPS_IDLE;
	tcp_init_values(tcp, NULL);
	return (connp);
}

/*
 * Common open routine behind tcp_openv4() and tcp_openv6().
 *
 * Returns 0 on success (and immediately when q->q_ptr is already set),
 * EINVAL when sflag is MODOPEN, EBUSY when no minor number can be
 * allocated, or the error reported by tcp_create_common().
 *
 * SO_FALLBACK and SO_ACCEPTOR opens do not create a conn_t here; they only
 * store the minor number in RD(q)->q_ptr and the arena in WR(q)->q_ptr and
 * switch the queue's qinit before calling qprocson().
 */
static int
tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
    boolean_t isv6)
{
	tcp_t		*tcp = NULL;
	conn_t		*connp = NULL;
	int		err;
	vmem_t		*minor_arena = NULL;
	dev_t		conn_dev;
	boolean_t	issocket;

	if (q->q_ptr != NULL)
		return (0);

	if (sflag == MODOPEN)
		return (EINVAL);

	if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
	    ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
		minor_arena = ip_minor_arena_la;
	} else {
		/*
		 * Either minor numbers in the large arena were exhausted
		 * or a non socket application is doing the open.
		 * Try to allocate from the small arena.
		 */
		if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
			return (EBUSY);
		}
		minor_arena = ip_minor_arena_sa;
	}

	ASSERT(minor_arena != NULL);

	*devp = makedevice(getmajor(*devp), (minor_t)conn_dev);

	if (flag & SO_FALLBACK) {
		/*
		 * Non streams socket needs a stream to fallback to
		 */
		RD(q)->q_ptr = (void *)conn_dev;
		WR(q)->q_qinfo = &tcp_fallback_sock_winit;
		WR(q)->q_ptr = (void *)minor_arena;
		qprocson(q);
		return (0);
	} else if (flag & SO_ACCEPTOR) {
		q->q_qinfo = &tcp_acceptor_rinit;
		/*
		 * the conn_dev and minor_arena will be subsequently used by
		 * tcp_tli_accept() and tcp_tpi_close_accept() to figure out
		 * the minor device number for this connection from the q_ptr.
		 */
		RD(q)->q_ptr = (void *)conn_dev;
		WR(q)->q_qinfo = &tcp_acceptor_winit;
		WR(q)->q_ptr = (void *)minor_arena;
		qprocson(q);
		return (0);
	}

	/*
	 * NOTE(review): this stores the raw SO_SOCKSTR bit in a boolean_t
	 * rather than B_TRUE/B_FALSE; the uses visible in this file only
	 * test it for truth.
	 */
	issocket = flag & SO_SOCKSTR;
	connp = tcp_create_common(credp, isv6, issocket, &err);

	if (connp == NULL) {
		/* Give the minor number back before failing the open. */
		inet_minor_free(minor_arena, conn_dev);
		q->q_ptr = WR(q)->q_ptr = NULL;
		return (err);
	}

	connp->conn_rq = q;
	connp->conn_wq = WR(q);
	q->q_ptr = WR(q)->q_ptr = connp;

	connp->conn_dev = conn_dev;
	connp->conn_minor_arena = minor_arena;

	ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6);
	ASSERT(WR(q)->q_qinfo == &tcp_winit);

	tcp = connp->conn_tcp;

	if (issocket) {
		WR(q)->q_qinfo = &tcp_sock_winit;
	} else {
		/* The acceptor id is the read queue on ILP32, else the minor. */
#ifdef  _ILP32
		tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
#else
		tcp->tcp_acceptor_id = conn_dev;
#endif  /* _ILP32 */
		tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
	}

	/*
	 * Put the ref for TCP. Ref for IP was already put
	 * by ipcl_conn_create. Also Make the conn_t globally
	 * visible to walkers
	 */
	mutex_enter(&connp->conn_lock);
	CONN_INC_REF_LOCKED(connp);
	ASSERT(connp->conn_ref == 2);
	connp->conn_state_flags &= ~CONN_INCIPIENT;
	mutex_exit(&connp->conn_lock);

	qprocson(q);
	return (0);
}

/*
 * Build/update the tcp header template (in conn_ht_iphc) based on
 * conn_xmit_ipp. The headers include ip6_t, any extension
 * headers, and the maximum size tcp header (to avoid reallocation
 * on the fly for additional tcp options).
 *
 * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}.
 * Returns failure if can't allocate memory.
 *
 * More precisely, the return value is 0 on success or whatever non-zero
 * error conn_build_hdr_template() reported.
 */
int
tcp_build_hdrs(tcp_t *tcp)
{
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;
	char		buf[TCP_MAX_HDR_LENGTH];
	uint_t		buflen;
	uint_t		ulplen = TCP_MIN_HEADER_LENGTH;
	uint_t		extralen = TCP_MAX_TCP_OPTIONS_LENGTH;
	tcpha_t		*tcpha;
	uint32_t	cksum;
	int		error;

	/*
	 * We might be called after the connection is set up, and we might
	 * have TS options already in the TCP header. Thus we save any
	 * existing tcp header.
	 *
	 * NOTE(review): assumes conn_ht_ulp_len never exceeds
	 * TCP_MAX_HDR_LENGTH (the size of buf) and is at least
	 * TCP_MIN_HEADER_LENGTH when non-zero -- confirm against the code
	 * that sets conn_ht_ulp_len.
	 */
	buflen = connp->conn_ht_ulp_len;
	if (buflen != 0) {
		bcopy(connp->conn_ht_ulp, buf, buflen);
		/* Options already present reduce the extra room requested. */
		extralen -= buflen - ulplen;
		ulplen = buflen;
	}

	/* Grab lock to satisfy ASSERT; TCP is serialized using squeue */
	mutex_enter(&connp->conn_lock);
	error = conn_build_hdr_template(connp, ulplen, extralen,
	    &connp->conn_laddr_v6, &connp->conn_faddr_v6, connp->conn_flowinfo);
	mutex_exit(&connp->conn_lock);
	if (error != 0)
		return (error);

	/*
	 * Any routing header/option has been massaged. The checksum difference
	 * is stored in conn_sum for later use.
	 */
	tcpha = (tcpha_t *)connp->conn_ht_ulp;
	tcp->tcp_tcpha = tcpha;

	/* restore any old tcp header */
	if (buflen != 0) {
		bcopy(buf, connp->conn_ht_ulp, buflen);
	} else {
		tcpha->tha_sum = 0;
		tcpha->tha_urp = 0;
		tcpha->tha_ack = 0;
		/* Data offset of 5 32-bit words, i.e. no TCP options. */
		tcpha->tha_offset_and_reserved = (5 << 4);
		tcpha->tha_lport = connp->conn_lport;
		tcpha->tha_fport = connp->conn_fport;
	}

	/*
	 * IP wants our header length in the checksum field to
	 * allow it to perform a single pseudo-header+checksum
	 * calculation on behalf of TCP.
	 * Include the adjustment for a source route once IP_OPTIONS is set.
	 */
	cksum = sizeof (tcpha_t) + connp->conn_sum;
	/* Fold the carry back into the low 16 bits. */
	cksum = (cksum >> 16) + (cksum & 0xFFFF);
	ASSERT(cksum < 0x10000);
	tcpha->tha_sum = htons(cksum);

	if (connp->conn_ipversion == IPV4_VERSION)
		tcp->tcp_ipha = (ipha_t *)connp->conn_ht_iphc;
	else
		tcp->tcp_ip6h = (ip6_t *)connp->conn_ht_iphc;

	/* The write offset is only ever increased here, never reduced. */
	if (connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra >
	    connp->conn_wroff) {
		connp->conn_wroff = connp->conn_ht_iphc_allocated +
		    tcps->tcps_wroff_xtra;
		(void) proto_set_tx_wroff(connp->conn_rq, connp,
		    connp->conn_wroff);
	}
	return (0);
}

/*
 * tcp_rwnd_set() is called to adjust the receive window to a desired value.
 * We do not allow the receive window to shrink.  After setting rwnd,
 * set the flow control hiwat of the stream.
 *
 * This function is called in 2 cases:
 *
 * 1) Before data transfer begins, in tcp_input_listener() for accepting a
 *    connection (passive open) and in tcp_input_data() for active connect.
 *    This is called after tcp_mss_set() when the desired MSS value is known.
 *    This makes sure that our window size is a multiple of the other side's
 *    MSS.
 * 2) Handling SO_RCVBUF option.
 *
 * It is ASSUMED that the requested size is a multiple of the current MSS.
 *
 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
 *	user requests so.
 *
 * Returns the window actually set: for a fused connection the value
 * computed by tcp_fuse_set_rcv_hiwat(), otherwise the (possibly adjusted)
 * rwnd.
 */
int
tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
{
	uint32_t	mss = tcp->tcp_mss;
	uint32_t	old_max_rwnd;
	uint32_t	max_transmittable_rwnd;
	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

	/*
	 * Insist on a receive window that is at least
	 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
	 * funny TCP interactions of Nagle algorithm, SWS avoidance
	 * and delayed acknowledgement.
	 */
	rwnd = MAX(rwnd, tcps->tcps_recv_hiwat_minmss * mss);

	if (tcp->tcp_fused) {
		size_t sth_hiwat;
		tcp_t *peer_tcp = tcp->tcp_loopback_peer;

		ASSERT(peer_tcp != NULL);
		sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
		if (!tcp_detached) {
			(void) proto_set_rx_hiwat(connp->conn_rq, connp,
			    sth_hiwat);
			tcp_set_recv_threshold(tcp, sth_hiwat >> 3);
		}

		/* Caller could have changed tcp_rwnd; update tha_win */
		if (tcp->tcp_tcpha != NULL) {
			tcp->tcp_tcpha->tha_win =
			    htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
		}
		if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max)
			tcp->tcp_cwnd_max = rwnd;

		/*
		 * In the fusion case, the maxpsz stream head value of
		 * our peer is set according to its send buffer size
		 * and our receive buffer size; since the latter may
		 * have changed we need to update the peer's maxpsz.
		 */
		(void) tcp_maxpsz_set(peer_tcp, B_TRUE);
		return (sth_hiwat);
	}

	/* Previous maximum: tcp_rwnd when detached, else conn_rcvbuf. */
	if (tcp_detached)
		old_max_rwnd = tcp->tcp_rwnd;
	else
		old_max_rwnd = connp->conn_rcvbuf;


	/*
	 * If window size info has already been exchanged, TCP should not
	 * shrink the window.  Shrinking window is doable if done carefully.
	 * We may add that support later.  But so far there is not a real
	 * need to do that.
	 */
	if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) {
		/* MSS may have changed, do a round up again. */
		rwnd = MSS_ROUNDUP(old_max_rwnd, mss);
	}

	/*
	 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
	 * can be applied even before the window scale option is decided.
	 */
	max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws;
	if (rwnd > max_transmittable_rwnd) {
		/* Largest multiple of mss that still fits, if there is one. */
		rwnd = max_transmittable_rwnd -
		    (max_transmittable_rwnd % mss);
		if (rwnd < mss)
			rwnd = max_transmittable_rwnd;
		/*
		 * If we're over the limit we may have to back down tcp_rwnd.
		 * The increment below won't work for us. So we set all three
		 * here and the increment below will have no effect.
		 */
		tcp->tcp_rwnd = old_max_rwnd = rwnd;
	}
	if (tcp->tcp_localnet) {
		tcp->tcp_rack_abs_max =
		    MIN(tcps->tcps_local_dacks_max, rwnd / mss / 2);
	} else {
		/*
		 * For a remote host on a different subnet (through a router),
		 * we ack every other packet to be conforming to RFC1122.
		 * tcp_deferred_acks_max is default to 2.
		 */
		tcp->tcp_rack_abs_max =
		    MIN(tcps->tcps_deferred_acks_max, rwnd / mss / 2);
	}
	/*
	 * Clamp tcp_rack_cur_max to the new absolute maximum; when it was
	 * not above that maximum it is reset to 0.
	 */
	if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max)
		tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max;
	else
		tcp->tcp_rack_cur_max = 0;
	/*
	 * Increment the current rwnd by the amount the maximum grew (we
	 * can not overwrite it since we might be in the middle of a
	 * connection.)
	 */
	tcp->tcp_rwnd += rwnd - old_max_rwnd;
	connp->conn_rcvbuf = rwnd;

	/* Are we already connected? */
	if (tcp->tcp_tcpha != NULL) {
		tcp->tcp_tcpha->tha_win =
		    htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
	}

	if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max)
		tcp->tcp_cwnd_max = rwnd;

	/* A detached tcp_t skips the threshold and rx hiwat updates below. */
	if (tcp_detached)
		return (rwnd);

	tcp_set_recv_threshold(tcp, rwnd >> 3);

	(void) proto_set_rx_hiwat(connp->conn_rq, connp, rwnd);
	return (rwnd);
}

/*
 * Unbind the endpoint.  Only valid in TCPS_BOUND or TCPS_LISTEN; any other
 * state returns -TOUTSTATE.  On success the local and source addresses and
 * conn_ports are zeroed, the state becomes TCPS_IDLE (reported through the
 * state__change DTrace probe) and 0 is returned.
 */
int
tcp_do_unbind(conn_t *connp)
{
	tcp_t *tcp = connp->conn_tcp;
	int32_t oldstate;

	switch (tcp->tcp_state) {
	case TCPS_BOUND:
	case TCPS_LISTEN:
		break;
	default:
		return (-TOUTSTATE);
	}

	/*
	 * Need to clean up all the eagers since after the unbind, segments
	 * will no longer be delivered to this listener stream.
	 */
	mutex_enter(&tcp->tcp_eager_lock);
	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
		tcp_eager_cleanup(tcp, 0);
	}
	mutex_exit(&tcp->tcp_eager_lock);

	/* Clean up the listener connection counter if necessary. */
	if (tcp->tcp_listen_cnt != NULL)
		TCP_DECR_LISTEN_CNT(tcp);
	connp->conn_laddr_v6 = ipv6_all_zeros;
	connp->conn_saddr_v6 = ipv6_all_zeros;
	tcp_bind_hash_remove(tcp);
	/* Saved so the probe below can report the state being left. */
	oldstate = tcp->tcp_state;
	tcp->tcp_state = TCPS_IDLE;
	DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
	    connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
	    int32_t, oldstate);

	ip_unbind(connp);
	bzero(&connp->conn_ports, sizeof (connp->conn_ports));

	return (0);
}

/*
 * Collect protocol properties to send to the upper handle.
 */
void
tcp_get_proto_props(tcp_t *tcp, struct sock_proto_props *sopp)
{
	conn_t *connp = tcp->tcp_connp;

	sopp->sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
	sopp->sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);

	sopp->sopp_rxhiwat = tcp->tcp_fused ?
3103 tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) : 3104 connp->conn_rcvbuf; 3105 /* 3106 * Determine what write offset value to use depending on SACK and 3107 * whether the endpoint is fused or not. 3108 */ 3109 if (tcp->tcp_fused) { 3110 ASSERT(tcp->tcp_loopback); 3111 ASSERT(tcp->tcp_loopback_peer != NULL); 3112 /* 3113 * For fused tcp loopback, set the stream head's write 3114 * offset value to zero since we won't be needing any room 3115 * for TCP/IP headers. This would also improve performance 3116 * since it would reduce the amount of work done by kmem. 3117 * Non-fused tcp loopback case is handled separately below. 3118 */ 3119 sopp->sopp_wroff = 0; 3120 /* 3121 * Update the peer's transmit parameters according to 3122 * our recently calculated high water mark value. 3123 */ 3124 (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); 3125 } else if (tcp->tcp_snd_sack_ok) { 3126 sopp->sopp_wroff = connp->conn_ht_iphc_allocated + 3127 (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); 3128 } else { 3129 sopp->sopp_wroff = connp->conn_ht_iphc_len + 3130 (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); 3131 } 3132 3133 if (tcp->tcp_loopback) { 3134 sopp->sopp_flags |= SOCKOPT_LOOPBACK; 3135 sopp->sopp_loopback = B_TRUE; 3136 } 3137 } 3138 3139 /* 3140 * Check the usability of ZEROCOPY. It's instead checking the flag set by IP. 
3141 */ 3142 boolean_t 3143 tcp_zcopy_check(tcp_t *tcp) 3144 { 3145 conn_t *connp = tcp->tcp_connp; 3146 ip_xmit_attr_t *ixa = connp->conn_ixa; 3147 boolean_t zc_enabled = B_FALSE; 3148 tcp_stack_t *tcps = tcp->tcp_tcps; 3149 3150 if (do_tcpzcopy == 2) 3151 zc_enabled = B_TRUE; 3152 else if ((do_tcpzcopy == 1) && (ixa->ixa_flags & IXAF_ZCOPY_CAPAB)) 3153 zc_enabled = B_TRUE; 3154 3155 tcp->tcp_snd_zcopy_on = zc_enabled; 3156 if (!TCP_IS_DETACHED(tcp)) { 3157 if (zc_enabled) { 3158 ixa->ixa_flags |= IXAF_VERIFY_ZCOPY; 3159 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 3160 ZCVMSAFE); 3161 TCP_STAT(tcps, tcp_zcopy_on); 3162 } else { 3163 ixa->ixa_flags &= ~IXAF_VERIFY_ZCOPY; 3164 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 3165 ZCVMUNSAFE); 3166 TCP_STAT(tcps, tcp_zcopy_off); 3167 } 3168 } 3169 return (zc_enabled); 3170 } 3171 3172 /* 3173 * Backoff from a zero-copy message by copying data to a new allocated 3174 * message and freeing the original desballoca'ed segmapped message. 3175 * 3176 * This function is called by following two callers: 3177 * 1. tcp_timer: fix_xmitlist is set to B_TRUE, because it's safe to free 3178 * the origial desballoca'ed message and notify sockfs. This is in re- 3179 * transmit state. 3180 * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need 3181 * to be copied to new message. 3182 */ 3183 mblk_t * 3184 tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) 3185 { 3186 mblk_t *nbp; 3187 mblk_t *head = NULL; 3188 mblk_t *tail = NULL; 3189 tcp_stack_t *tcps = tcp->tcp_tcps; 3190 3191 ASSERT(bp != NULL); 3192 while (bp != NULL) { 3193 if (IS_VMLOANED_MBLK(bp)) { 3194 TCP_STAT(tcps, tcp_zcopy_backoff); 3195 if ((nbp = copyb(bp)) == NULL) { 3196 tcp->tcp_xmit_zc_clean = B_FALSE; 3197 if (tail != NULL) 3198 tail->b_cont = bp; 3199 return ((head == NULL) ? 
bp : head); 3200 } 3201 3202 if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 3203 if (fix_xmitlist) 3204 tcp_zcopy_notify(tcp); 3205 else 3206 nbp->b_datap->db_struioflag |= 3207 STRUIO_ZCNOTIFY; 3208 } 3209 nbp->b_cont = bp->b_cont; 3210 3211 /* 3212 * Copy saved information and adjust tcp_xmit_tail 3213 * if needed. 3214 */ 3215 if (fix_xmitlist) { 3216 nbp->b_prev = bp->b_prev; 3217 nbp->b_next = bp->b_next; 3218 3219 if (tcp->tcp_xmit_tail == bp) 3220 tcp->tcp_xmit_tail = nbp; 3221 } 3222 3223 /* Free the original message. */ 3224 bp->b_prev = NULL; 3225 bp->b_next = NULL; 3226 freeb(bp); 3227 3228 bp = nbp; 3229 } 3230 3231 if (head == NULL) { 3232 head = bp; 3233 } 3234 if (tail == NULL) { 3235 tail = bp; 3236 } else { 3237 tail->b_cont = bp; 3238 tail = bp; 3239 } 3240 3241 /* Move forward. */ 3242 bp = bp->b_cont; 3243 } 3244 3245 if (fix_xmitlist) { 3246 tcp->tcp_xmit_last = tail; 3247 tcp->tcp_xmit_zc_clean = B_TRUE; 3248 } 3249 3250 return (head); 3251 } 3252 3253 void 3254 tcp_zcopy_notify(tcp_t *tcp) 3255 { 3256 struct stdata *stp; 3257 conn_t *connp; 3258 3259 if (tcp->tcp_detached) 3260 return; 3261 connp = tcp->tcp_connp; 3262 if (IPCL_IS_NONSTR(connp)) { 3263 (*connp->conn_upcalls->su_zcopy_notify) 3264 (connp->conn_upper_handle); 3265 return; 3266 } 3267 stp = STREAM(connp->conn_rq); 3268 mutex_enter(&stp->sd_lock); 3269 stp->sd_flag |= STZCNOTIFY; 3270 cv_broadcast(&stp->sd_zcopy_wait); 3271 mutex_exit(&stp->sd_lock); 3272 } 3273 3274 /* 3275 * Update the TCP connection according to change of LSO capability. 3276 */ 3277 static void 3278 tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa) 3279 { 3280 /* 3281 * We check against IPv4 header length to preserve the old behavior 3282 * of only enabling LSO when there are no IP options. 3283 * But this restriction might not be necessary at all. Before removing 3284 * it, need to verify how LSO is handled for source routing case, with 3285 * which IP does software checksum. 
3286 * 3287 * For IPv6, whenever any extension header is needed, LSO is supressed. 3288 */ 3289 if (ixa->ixa_ip_hdr_length != ((ixa->ixa_flags & IXAF_IS_IPV4) ? 3290 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN)) 3291 return; 3292 3293 /* 3294 * Either the LSO capability newly became usable, or it has changed. 3295 */ 3296 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 3297 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 3298 3299 ASSERT(lsoc->ill_lso_max > 0); 3300 tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max); 3301 3302 DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, 3303 boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max); 3304 3305 /* 3306 * If LSO to be enabled, notify the STREAM header with larger 3307 * data block. 3308 */ 3309 if (!tcp->tcp_lso) 3310 tcp->tcp_maxpsz_multiplier = 0; 3311 3312 tcp->tcp_lso = B_TRUE; 3313 TCP_STAT(tcp->tcp_tcps, tcp_lso_enabled); 3314 } else { /* LSO capability is not usable any more. */ 3315 DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, 3316 boolean_t, B_FALSE, uint32_t, tcp->tcp_lso_max); 3317 3318 /* 3319 * If LSO to be disabled, notify the STREAM header with smaller 3320 * data block. And need to restore fragsize to PMTU. 3321 */ 3322 if (tcp->tcp_lso) { 3323 tcp->tcp_maxpsz_multiplier = 3324 tcp->tcp_tcps->tcps_maxpsz_multiplier; 3325 ixa->ixa_fragsize = ixa->ixa_pmtu; 3326 tcp->tcp_lso = B_FALSE; 3327 TCP_STAT(tcp->tcp_tcps, tcp_lso_disabled); 3328 } 3329 } 3330 3331 (void) tcp_maxpsz_set(tcp, B_TRUE); 3332 } 3333 3334 /* 3335 * Update the TCP connection according to change of ZEROCOPY capability. 
3336 */ 3337 static void 3338 tcp_update_zcopy(tcp_t *tcp) 3339 { 3340 conn_t *connp = tcp->tcp_connp; 3341 tcp_stack_t *tcps = tcp->tcp_tcps; 3342 3343 if (tcp->tcp_snd_zcopy_on) { 3344 tcp->tcp_snd_zcopy_on = B_FALSE; 3345 if (!TCP_IS_DETACHED(tcp)) { 3346 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 3347 ZCVMUNSAFE); 3348 TCP_STAT(tcps, tcp_zcopy_off); 3349 } 3350 } else { 3351 tcp->tcp_snd_zcopy_on = B_TRUE; 3352 if (!TCP_IS_DETACHED(tcp)) { 3353 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 3354 ZCVMSAFE); 3355 TCP_STAT(tcps, tcp_zcopy_on); 3356 } 3357 } 3358 } 3359 3360 /* 3361 * Notify function registered with ip_xmit_attr_t. It's called in the squeue 3362 * so it's safe to update the TCP connection. 3363 */ 3364 /* ARGSUSED1 */ 3365 static void 3366 tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, 3367 ixa_notify_arg_t narg) 3368 { 3369 tcp_t *tcp = (tcp_t *)arg; 3370 conn_t *connp = tcp->tcp_connp; 3371 3372 switch (ntype) { 3373 case IXAN_LSO: 3374 tcp_update_lso(tcp, connp->conn_ixa); 3375 break; 3376 case IXAN_PMTU: 3377 tcp_update_pmtu(tcp, B_FALSE); 3378 break; 3379 case IXAN_ZCOPY: 3380 tcp_update_zcopy(tcp); 3381 break; 3382 default: 3383 break; 3384 } 3385 } 3386 3387 /* 3388 * The TCP write service routine should never be called... 3389 */ 3390 /* ARGSUSED */ 3391 static int 3392 tcp_wsrv(queue_t *q) 3393 { 3394 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 3395 3396 TCP_STAT(tcps, tcp_wsrv_called); 3397 return (0); 3398 } 3399 3400 /* 3401 * Hash list lookup routine for tcp_t structures. 3402 * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. 
3403 */ 3404 tcp_t * 3405 tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *tcps) 3406 { 3407 tf_t *tf; 3408 tcp_t *tcp; 3409 3410 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 3411 mutex_enter(&tf->tf_lock); 3412 for (tcp = tf->tf_tcp; tcp != NULL; 3413 tcp = tcp->tcp_acceptor_hash) { 3414 if (tcp->tcp_acceptor_id == id) { 3415 CONN_INC_REF(tcp->tcp_connp); 3416 mutex_exit(&tf->tf_lock); 3417 return (tcp); 3418 } 3419 } 3420 mutex_exit(&tf->tf_lock); 3421 return (NULL); 3422 } 3423 3424 /* 3425 * Hash list insertion routine for tcp_t structures. 3426 */ 3427 void 3428 tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) 3429 { 3430 tf_t *tf; 3431 tcp_t **tcpp; 3432 tcp_t *tcpnext; 3433 tcp_stack_t *tcps = tcp->tcp_tcps; 3434 3435 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 3436 3437 if (tcp->tcp_ptpahn != NULL) 3438 tcp_acceptor_hash_remove(tcp); 3439 tcpp = &tf->tf_tcp; 3440 mutex_enter(&tf->tf_lock); 3441 tcpnext = tcpp[0]; 3442 if (tcpnext) 3443 tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash; 3444 tcp->tcp_acceptor_hash = tcpnext; 3445 tcp->tcp_ptpahn = tcpp; 3446 tcpp[0] = tcp; 3447 tcp->tcp_acceptor_lockp = &tf->tf_lock; /* For tcp_*_hash_remove */ 3448 mutex_exit(&tf->tf_lock); 3449 } 3450 3451 /* 3452 * Hash list removal routine for tcp_t structures. 3453 */ 3454 void 3455 tcp_acceptor_hash_remove(tcp_t *tcp) 3456 { 3457 tcp_t *tcpnext; 3458 kmutex_t *lockp; 3459 3460 /* 3461 * Extract the lock pointer in case there are concurrent 3462 * hash_remove's for this instance. 
3463 */ 3464 lockp = tcp->tcp_acceptor_lockp; 3465 3466 if (tcp->tcp_ptpahn == NULL) 3467 return; 3468 3469 ASSERT(lockp != NULL); 3470 mutex_enter(lockp); 3471 if (tcp->tcp_ptpahn) { 3472 tcpnext = tcp->tcp_acceptor_hash; 3473 if (tcpnext) { 3474 tcpnext->tcp_ptpahn = tcp->tcp_ptpahn; 3475 tcp->tcp_acceptor_hash = NULL; 3476 } 3477 *tcp->tcp_ptpahn = tcpnext; 3478 tcp->tcp_ptpahn = NULL; 3479 } 3480 mutex_exit(lockp); 3481 tcp->tcp_acceptor_lockp = NULL; 3482 } 3483 3484 /* 3485 * Type three generator adapted from the random() function in 4.4 BSD: 3486 */ 3487 3488 /* 3489 * Copyright (c) 1983, 1993 3490 * The Regents of the University of California. All rights reserved. 3491 * 3492 * Redistribution and use in source and binary forms, with or without 3493 * modification, are permitted provided that the following conditions 3494 * are met: 3495 * 1. Redistributions of source code must retain the above copyright 3496 * notice, this list of conditions and the following disclaimer. 3497 * 2. Redistributions in binary form must reproduce the above copyright 3498 * notice, this list of conditions and the following disclaimer in the 3499 * documentation and/or other materials provided with the distribution. 3500 * 3. All advertising materials mentioning features or use of this software 3501 * must display the following acknowledgement: 3502 * This product includes software developed by the University of 3503 * California, Berkeley and its contributors. 3504 * 4. Neither the name of the University nor the names of its contributors 3505 * may be used to endorse or promote products derived from this software 3506 * without specific prior written permission. 3507 * 3508 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 3509 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 3510 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 3511 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 3512 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 3513 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 3514 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 3515 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 3516 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3517 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3518 * SUCH DAMAGE. 3519 */ 3520 3521 /* Type 3 -- x**31 + x**3 + 1 */ 3522 #define DEG_3 31 3523 #define SEP_3 3 3524 3525 3526 /* Protected by tcp_random_lock */ 3527 static int tcp_randtbl[DEG_3 + 1]; 3528 3529 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 3530 static int *tcp_random_rptr = &tcp_randtbl[1]; 3531 3532 static int *tcp_random_state = &tcp_randtbl[1]; 3533 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 3534 3535 kmutex_t tcp_random_lock; 3536 3537 void 3538 tcp_random_init(void) 3539 { 3540 int i; 3541 hrtime_t hrt; 3542 time_t wallclock; 3543 uint64_t result; 3544 3545 /* 3546 * Use high-res timer and current time for seed. Gethrtime() returns 3547 * a longlong, which may contain resolution down to nanoseconds. 3548 * The current time will either be a 32-bit or a 64-bit quantity. 3549 * XOR the two together in a 64-bit result variable. 3550 * Convert the result to a 32-bit value by multiplying the high-order 3551 * 32-bits by the low-order 32-bits. 
3552 */ 3553 3554 hrt = gethrtime(); 3555 (void) drv_getparm(TIME, &wallclock); 3556 result = (uint64_t)wallclock ^ (uint64_t)hrt; 3557 mutex_enter(&tcp_random_lock); 3558 tcp_random_state[0] = ((result >> 32) & 0xffffffff) * 3559 (result & 0xffffffff); 3560 3561 for (i = 1; i < DEG_3; i++) 3562 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 3563 + 12345; 3564 tcp_random_fptr = &tcp_random_state[SEP_3]; 3565 tcp_random_rptr = &tcp_random_state[0]; 3566 mutex_exit(&tcp_random_lock); 3567 for (i = 0; i < 10 * DEG_3; i++) 3568 (void) tcp_random(); 3569 } 3570 3571 /* 3572 * tcp_random: Return a random number in the range [1 - (128K + 1)]. 3573 * This range is selected to be approximately centered on TCP_ISS / 2, 3574 * and easy to compute. We get this value by generating a 32-bit random 3575 * number, selecting out the high-order 17 bits, and then adding one so 3576 * that we never return zero. 3577 */ 3578 int 3579 tcp_random(void) 3580 { 3581 int i; 3582 3583 mutex_enter(&tcp_random_lock); 3584 *tcp_random_fptr += *tcp_random_rptr; 3585 3586 /* 3587 * The high-order bits are more random than the low-order bits, 3588 * so we select out the high-order 17 bits and add one so that 3589 * we never return zero. 3590 */ 3591 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 3592 if (++tcp_random_fptr >= tcp_random_end_ptr) { 3593 tcp_random_fptr = tcp_random_state; 3594 ++tcp_random_rptr; 3595 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 3596 tcp_random_rptr = tcp_random_state; 3597 3598 mutex_exit(&tcp_random_lock); 3599 return (i); 3600 } 3601 3602 /* 3603 * Split this function out so that if the secret changes, I'm okay. 3604 * 3605 * Initialize the tcp_iss_cookie and tcp_iss_key. 
3606 */ 3607 3608 #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ 3609 3610 void 3611 tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *tcps) 3612 { 3613 struct { 3614 int32_t current_time; 3615 uint32_t randnum; 3616 uint16_t pad; 3617 uint8_t ether[6]; 3618 uint8_t passwd[PASSWD_SIZE]; 3619 } tcp_iss_cookie; 3620 time_t t; 3621 3622 /* 3623 * Start with the current absolute time. 3624 */ 3625 (void) drv_getparm(TIME, &t); 3626 tcp_iss_cookie.current_time = t; 3627 3628 /* 3629 * XXX - Need a more random number per RFC 1750, not this crap. 3630 * OTOH, if what follows is pretty random, then I'm in better shape. 3631 */ 3632 tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random()); 3633 tcp_iss_cookie.pad = 0x365c; /* Picked from HMAC pad values. */ 3634 3635 /* 3636 * The cpu_type_info is pretty non-random. Ugggh. It does serve 3637 * as a good template. 3638 */ 3639 bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd, 3640 min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info))); 3641 3642 /* 3643 * The pass-phrase. Normally this is supplied by user-called NDD. 3644 */ 3645 bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len)); 3646 3647 /* 3648 * See 4010593 if this section becomes a problem again, 3649 * but the local ethernet address is useful here. 3650 */ 3651 (void) localetheraddr(NULL, 3652 (struct ether_addr *)&tcp_iss_cookie.ether); 3653 3654 /* 3655 * Hash 'em all together. The MD5Final is called per-connection. 
3656 */ 3657 mutex_enter(&tcps->tcps_iss_key_lock); 3658 MD5Init(&tcps->tcps_iss_key); 3659 MD5Update(&tcps->tcps_iss_key, (uchar_t *)&tcp_iss_cookie, 3660 sizeof (tcp_iss_cookie)); 3661 mutex_exit(&tcps->tcps_iss_key_lock); 3662 } 3663 3664 /* 3665 * Called by IP when IP is loaded into the kernel 3666 */ 3667 void 3668 tcp_ddi_g_init(void) 3669 { 3670 tcp_timercache = kmem_cache_create("tcp_timercache", 3671 sizeof (tcp_timer_t) + sizeof (mblk_t), 0, 3672 NULL, NULL, NULL, NULL, NULL, 0); 3673 3674 tcp_notsack_blk_cache = kmem_cache_create("tcp_notsack_blk_cache", 3675 sizeof (notsack_blk_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3676 3677 mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); 3678 3679 /* Initialize the random number generator */ 3680 tcp_random_init(); 3681 3682 /* A single callback independently of how many netstacks we have */ 3683 ip_squeue_init(tcp_squeue_add); 3684 3685 tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); 3686 3687 tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); 3688 3689 /* 3690 * We want to be informed each time a stack is created or 3691 * destroyed in the kernel, so we can maintain the 3692 * set of tcp_stack_t's. 3693 */ 3694 netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini); 3695 } 3696 3697 3698 #define INET_NAME "ip" 3699 3700 /* 3701 * Initialize the TCP stack instance. 
3702 */ 3703 static void * 3704 tcp_stack_init(netstackid_t stackid, netstack_t *ns) 3705 { 3706 tcp_stack_t *tcps; 3707 int i; 3708 int error = 0; 3709 major_t major; 3710 size_t arrsz; 3711 3712 tcps = (tcp_stack_t *)kmem_zalloc(sizeof (*tcps), KM_SLEEP); 3713 tcps->tcps_netstack = ns; 3714 3715 /* Initialize locks */ 3716 mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); 3717 mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); 3718 3719 tcps->tcps_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; 3720 tcps->tcps_g_epriv_ports[0] = ULP_DEF_EPRIV_PORT1; 3721 tcps->tcps_g_epriv_ports[1] = ULP_DEF_EPRIV_PORT2; 3722 tcps->tcps_min_anonpriv_port = 512; 3723 3724 tcps->tcps_bind_fanout = kmem_zalloc(sizeof (tf_t) * 3725 TCP_BIND_FANOUT_SIZE, KM_SLEEP); 3726 tcps->tcps_acceptor_fanout = kmem_zalloc(sizeof (tf_t) * 3727 TCP_ACCEPTOR_FANOUT_SIZE, KM_SLEEP); 3728 3729 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 3730 mutex_init(&tcps->tcps_bind_fanout[i].tf_lock, NULL, 3731 MUTEX_DEFAULT, NULL); 3732 } 3733 3734 for (i = 0; i < TCP_ACCEPTOR_FANOUT_SIZE; i++) { 3735 mutex_init(&tcps->tcps_acceptor_fanout[i].tf_lock, NULL, 3736 MUTEX_DEFAULT, NULL); 3737 } 3738 3739 /* TCP's IPsec code calls the packet dropper. */ 3740 ip_drop_register(&tcps->tcps_dropper, "TCP IPsec policy enforcement"); 3741 3742 arrsz = tcp_propinfo_count * sizeof (mod_prop_info_t); 3743 tcps->tcps_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, 3744 KM_SLEEP); 3745 bcopy(tcp_propinfo_tbl, tcps->tcps_propinfo_tbl, arrsz); 3746 3747 /* 3748 * Note: To really walk the device tree you need the devinfo 3749 * pointer to your device which is only available after probe/attach. 3750 * The following is safe only because it uses ddi_root_node() 3751 */ 3752 tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, 3753 tcp_opt_obj.odb_opt_arr_cnt); 3754 3755 /* 3756 * Initialize RFC 1948 secret values. This will probably be reset once 3757 * by the boot scripts. 
3758 * 3759 * Use NULL name, as the name is caught by the new lockstats. 3760 * 3761 * Initialize with some random, non-guessable string, like the global 3762 * T_INFO_ACK. 3763 */ 3764 3765 tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, 3766 sizeof (tcp_g_t_info_ack), tcps); 3767 3768 tcps->tcps_kstat = tcp_kstat2_init(stackid); 3769 tcps->tcps_mibkp = tcp_kstat_init(stackid); 3770 3771 major = mod_name_to_major(INET_NAME); 3772 error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); 3773 ASSERT(error == 0); 3774 tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); 3775 ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); 3776 cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); 3777 cv_init(&tcps->tcps_ixa_cleanup_done_cv, NULL, CV_DEFAULT, NULL); 3778 mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); 3779 3780 mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 3781 tcps->tcps_reclaim = B_FALSE; 3782 tcps->tcps_reclaim_tid = 0; 3783 tcps->tcps_reclaim_period = tcps->tcps_rexmit_interval_max; 3784 3785 /* 3786 * ncpus is the current number of CPUs, which can be bigger than 3787 * boot_ncpus. But we don't want to use ncpus to allocate all the 3788 * tcp_stats_cpu_t at system boot up time since it will be 1. While 3789 * we handle adding CPU in tcp_cpu_update(), it will be slow if 3790 * there are many CPUs as we will be adding them 1 by 1. 3791 * 3792 * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers 3793 * are not freed until the stack is going away. So there is no need 3794 * to grab a lock to access the per CPU tcps_sc[x] pointer. 
3795 */ 3796 mutex_enter(&cpu_lock); 3797 tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus); 3798 mutex_exit(&cpu_lock); 3799 tcps->tcps_sc = kmem_zalloc(max_ncpus * sizeof (tcp_stats_cpu_t *), 3800 KM_SLEEP); 3801 for (i = 0; i < tcps->tcps_sc_cnt; i++) { 3802 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t), 3803 KM_SLEEP); 3804 } 3805 3806 mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL); 3807 list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t), 3808 offsetof(tcp_listener_t, tl_link)); 3809 3810 return (tcps); 3811 } 3812 3813 /* 3814 * Called when the IP module is about to be unloaded. 3815 */ 3816 void 3817 tcp_ddi_g_destroy(void) 3818 { 3819 tcp_g_kstat_fini(tcp_g_kstat); 3820 tcp_g_kstat = NULL; 3821 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics)); 3822 3823 mutex_destroy(&tcp_random_lock); 3824 3825 kmem_cache_destroy(tcp_timercache); 3826 kmem_cache_destroy(tcp_notsack_blk_cache); 3827 3828 netstack_unregister(NS_TCP); 3829 } 3830 3831 /* 3832 * Free the TCP stack instance. 3833 */ 3834 static void 3835 tcp_stack_fini(netstackid_t stackid, void *arg) 3836 { 3837 tcp_stack_t *tcps = (tcp_stack_t *)arg; 3838 int i; 3839 3840 freeb(tcps->tcps_ixa_cleanup_mp); 3841 tcps->tcps_ixa_cleanup_mp = NULL; 3842 cv_destroy(&tcps->tcps_ixa_cleanup_ready_cv); 3843 cv_destroy(&tcps->tcps_ixa_cleanup_done_cv); 3844 mutex_destroy(&tcps->tcps_ixa_cleanup_lock); 3845 3846 /* 3847 * Set tcps_reclaim to false tells tcp_reclaim_timer() not to restart 3848 * the timer. 
3849 */ 3850 mutex_enter(&tcps->tcps_reclaim_lock); 3851 tcps->tcps_reclaim = B_FALSE; 3852 mutex_exit(&tcps->tcps_reclaim_lock); 3853 if (tcps->tcps_reclaim_tid != 0) 3854 (void) untimeout(tcps->tcps_reclaim_tid); 3855 mutex_destroy(&tcps->tcps_reclaim_lock); 3856 3857 tcp_listener_conf_cleanup(tcps); 3858 3859 for (i = 0; i < tcps->tcps_sc_cnt; i++) 3860 kmem_free(tcps->tcps_sc[i], sizeof (tcp_stats_cpu_t)); 3861 kmem_free(tcps->tcps_sc, max_ncpus * sizeof (tcp_stats_cpu_t *)); 3862 3863 kmem_free(tcps->tcps_propinfo_tbl, 3864 tcp_propinfo_count * sizeof (mod_prop_info_t)); 3865 tcps->tcps_propinfo_tbl = NULL; 3866 3867 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 3868 ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); 3869 mutex_destroy(&tcps->tcps_bind_fanout[i].tf_lock); 3870 } 3871 3872 for (i = 0; i < TCP_ACCEPTOR_FANOUT_SIZE; i++) { 3873 ASSERT(tcps->tcps_acceptor_fanout[i].tf_tcp == NULL); 3874 mutex_destroy(&tcps->tcps_acceptor_fanout[i].tf_lock); 3875 } 3876 3877 kmem_free(tcps->tcps_bind_fanout, sizeof (tf_t) * TCP_BIND_FANOUT_SIZE); 3878 tcps->tcps_bind_fanout = NULL; 3879 3880 kmem_free(tcps->tcps_acceptor_fanout, sizeof (tf_t) * 3881 TCP_ACCEPTOR_FANOUT_SIZE); 3882 tcps->tcps_acceptor_fanout = NULL; 3883 3884 mutex_destroy(&tcps->tcps_iss_key_lock); 3885 mutex_destroy(&tcps->tcps_epriv_port_lock); 3886 3887 ip_drop_unregister(&tcps->tcps_dropper); 3888 3889 tcp_kstat2_fini(stackid, tcps->tcps_kstat); 3890 tcps->tcps_kstat = NULL; 3891 3892 tcp_kstat_fini(stackid, tcps->tcps_mibkp); 3893 tcps->tcps_mibkp = NULL; 3894 3895 ldi_ident_release(tcps->tcps_ldi_ident); 3896 kmem_free(tcps, sizeof (*tcps)); 3897 } 3898 3899 /* 3900 * Generate ISS, taking into account NDD changes may happen halfway through. 3901 * (If the iss is not zero, set it.) 
3902 */ 3903 3904 static void 3905 tcp_iss_init(tcp_t *tcp) 3906 { 3907 MD5_CTX context; 3908 struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; 3909 uint32_t answer[4]; 3910 tcp_stack_t *tcps = tcp->tcp_tcps; 3911 conn_t *connp = tcp->tcp_connp; 3912 3913 tcps->tcps_iss_incr_extra += (tcps->tcps_iss_incr >> 1); 3914 tcp->tcp_iss = tcps->tcps_iss_incr_extra; 3915 switch (tcps->tcps_strong_iss) { 3916 case 2: 3917 mutex_enter(&tcps->tcps_iss_key_lock); 3918 context = tcps->tcps_iss_key; 3919 mutex_exit(&tcps->tcps_iss_key_lock); 3920 arg.ports = connp->conn_ports; 3921 arg.src = connp->conn_laddr_v6; 3922 arg.dst = connp->conn_faddr_v6; 3923 MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); 3924 MD5Final((uchar_t *)answer, &context); 3925 tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; 3926 /* 3927 * Now that we've hashed into a unique per-connection sequence 3928 * space, add a random increment per strong_iss == 1. So I 3929 * guess we'll have to... 3930 */ 3931 /* FALLTHRU */ 3932 case 1: 3933 tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random(); 3934 break; 3935 default: 3936 tcp->tcp_iss += (uint32_t)gethrestime_sec() * 3937 tcps->tcps_iss_incr; 3938 break; 3939 } 3940 tcp->tcp_valid_bits = TCP_ISS_VALID; 3941 tcp->tcp_fss = tcp->tcp_iss - 1; 3942 tcp->tcp_suna = tcp->tcp_iss; 3943 tcp->tcp_snxt = tcp->tcp_iss + 1; 3944 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3945 tcp->tcp_csuna = tcp->tcp_snxt; 3946 } 3947 3948 /* 3949 * tcp_{set,clr}qfull() functions are used to either set or clear QFULL 3950 * on the specified backing STREAMS q. Note, the caller may make the 3951 * decision to call based on the tcp_t.tcp_flow_stopped value which 3952 * when check outside the q's lock is only an advisory check ... 
3953 */ 3954 void 3955 tcp_setqfull(tcp_t *tcp) 3956 { 3957 tcp_stack_t *tcps = tcp->tcp_tcps; 3958 conn_t *connp = tcp->tcp_connp; 3959 3960 if (tcp->tcp_closed) 3961 return; 3962 3963 conn_setqfull(connp, &tcp->tcp_flow_stopped); 3964 if (tcp->tcp_flow_stopped) 3965 TCP_STAT(tcps, tcp_flwctl_on); 3966 } 3967 3968 void 3969 tcp_clrqfull(tcp_t *tcp) 3970 { 3971 conn_t *connp = tcp->tcp_connp; 3972 3973 if (tcp->tcp_closed) 3974 return; 3975 conn_clrqfull(connp, &tcp->tcp_flow_stopped); 3976 } 3977 3978 static int 3979 tcp_squeue_switch(int val) 3980 { 3981 int rval = SQ_FILL; 3982 3983 switch (val) { 3984 case 1: 3985 rval = SQ_NODRAIN; 3986 break; 3987 case 2: 3988 rval = SQ_PROCESS; 3989 break; 3990 default: 3991 break; 3992 } 3993 return (rval); 3994 } 3995 3996 /* 3997 * This is called once for each squeue - globally for all stack 3998 * instances. 3999 */ 4000 static void 4001 tcp_squeue_add(squeue_t *sqp) 4002 { 4003 tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc( 4004 sizeof (tcp_squeue_priv_t), KM_SLEEP); 4005 4006 *squeue_getprivate(sqp, SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; 4007 if (tcp_free_list_max_cnt == 0) { 4008 int tcp_ncpus = ((boot_max_ncpus == -1) ? 4009 max_ncpus : boot_max_ncpus); 4010 4011 /* 4012 * Limit number of entries to 1% of availble memory / tcp_ncpus 4013 */ 4014 tcp_free_list_max_cnt = (freemem * PAGESIZE) / 4015 (tcp_ncpus * sizeof (tcp_t) * 100); 4016 } 4017 tcp_time_wait->tcp_free_list_cnt = 0; 4018 } 4019 /* 4020 * Return unix error is tli error is TSYSERR, otherwise return a negative 4021 * tli error. 
4022 */ 4023 int 4024 tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, 4025 boolean_t bind_to_req_port_only) 4026 { 4027 int error; 4028 tcp_t *tcp = connp->conn_tcp; 4029 4030 if (tcp->tcp_state >= TCPS_BOUND) { 4031 if (connp->conn_debug) { 4032 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 4033 "tcp_bind: bad state, %d", tcp->tcp_state); 4034 } 4035 return (-TOUTSTATE); 4036 } 4037 4038 error = tcp_bind_check(connp, sa, len, cr, bind_to_req_port_only); 4039 if (error != 0) 4040 return (error); 4041 4042 ASSERT(tcp->tcp_state == TCPS_BOUND); 4043 tcp->tcp_conn_req_max = 0; 4044 return (0); 4045 } 4046 4047 /* 4048 * If the return value from this function is positive, it's a UNIX error. 4049 * Otherwise, if it's negative, then the absolute value is a TLI error. 4050 * the TPI routine tcp_tpi_connect() is a wrapper function for this. 4051 */ 4052 int 4053 tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 4054 cred_t *cr, pid_t pid) 4055 { 4056 tcp_t *tcp = connp->conn_tcp; 4057 sin_t *sin = (sin_t *)sa; 4058 sin6_t *sin6 = (sin6_t *)sa; 4059 ipaddr_t *dstaddrp; 4060 in_port_t dstport; 4061 uint_t srcid; 4062 int error; 4063 uint32_t mss; 4064 mblk_t *syn_mp; 4065 tcp_stack_t *tcps = tcp->tcp_tcps; 4066 int32_t oldstate; 4067 ip_xmit_attr_t *ixa = connp->conn_ixa; 4068 4069 oldstate = tcp->tcp_state; 4070 4071 switch (len) { 4072 default: 4073 /* 4074 * Should never happen 4075 */ 4076 return (EINVAL); 4077 4078 case sizeof (sin_t): 4079 sin = (sin_t *)sa; 4080 if (sin->sin_port == 0) { 4081 return (-TBADADDR); 4082 } 4083 if (connp->conn_ipv6_v6only) { 4084 return (EAFNOSUPPORT); 4085 } 4086 break; 4087 4088 case sizeof (sin6_t): 4089 sin6 = (sin6_t *)sa; 4090 if (sin6->sin6_port == 0) { 4091 return (-TBADADDR); 4092 } 4093 break; 4094 } 4095 /* 4096 * If we're connecting to an IPv4-mapped IPv6 address, we need to 4097 * make sure that the conn_ipversion is IPV4_VERSION. 
We 4098 * need to this before we call tcp_bindi() so that the port lookup 4099 * code will look for ports in the correct port space (IPv4 and 4100 * IPv6 have separate port spaces). 4101 */ 4102 if (connp->conn_family == AF_INET6 && 4103 connp->conn_ipversion == IPV6_VERSION && 4104 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4105 if (connp->conn_ipv6_v6only) 4106 return (EADDRNOTAVAIL); 4107 4108 connp->conn_ipversion = IPV4_VERSION; 4109 } 4110 4111 switch (tcp->tcp_state) { 4112 case TCPS_LISTEN: 4113 /* 4114 * Listening sockets are not allowed to issue connect(). 4115 */ 4116 if (IPCL_IS_NONSTR(connp)) 4117 return (EOPNOTSUPP); 4118 /* FALLTHRU */ 4119 case TCPS_IDLE: 4120 /* 4121 * We support quick connect, refer to comments in 4122 * tcp_connect_*() 4123 */ 4124 /* FALLTHRU */ 4125 case TCPS_BOUND: 4126 break; 4127 default: 4128 return (-TOUTSTATE); 4129 } 4130 4131 /* 4132 * We update our cred/cpid based on the caller of connect 4133 */ 4134 if (connp->conn_cred != cr) { 4135 crhold(cr); 4136 crfree(connp->conn_cred); 4137 connp->conn_cred = cr; 4138 } 4139 connp->conn_cpid = pid; 4140 4141 /* Cache things in the ixa without any refhold */ 4142 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4143 ixa->ixa_cred = cr; 4144 ixa->ixa_cpid = pid; 4145 if (is_system_labeled()) { 4146 /* We need to restart with a label based on the cred */ 4147 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4148 } 4149 4150 if (connp->conn_family == AF_INET6) { 4151 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4152 error = tcp_connect_ipv6(tcp, &sin6->sin6_addr, 4153 sin6->sin6_port, sin6->sin6_flowinfo, 4154 sin6->__sin6_src_id, sin6->sin6_scope_id); 4155 } else { 4156 /* 4157 * Destination adress is mapped IPv6 address. 4158 * Source bound address should be unspecified or 4159 * IPv6 mapped address as well. 
4160 */ 4161 if (!IN6_IS_ADDR_UNSPECIFIED( 4162 &connp->conn_bound_addr_v6) && 4163 !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { 4164 return (EADDRNOTAVAIL); 4165 } 4166 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); 4167 dstport = sin6->sin6_port; 4168 srcid = sin6->__sin6_src_id; 4169 error = tcp_connect_ipv4(tcp, dstaddrp, dstport, 4170 srcid); 4171 } 4172 } else { 4173 dstaddrp = &sin->sin_addr.s_addr; 4174 dstport = sin->sin_port; 4175 srcid = 0; 4176 error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid); 4177 } 4178 4179 if (error != 0) 4180 goto connect_failed; 4181 4182 CL_INET_CONNECT(connp, B_TRUE, error); 4183 if (error != 0) 4184 goto connect_failed; 4185 4186 /* connect succeeded */ 4187 TCPS_BUMP_MIB(tcps, tcpActiveOpens); 4188 tcp->tcp_active_open = 1; 4189 4190 /* 4191 * tcp_set_destination() does not adjust for TCP/IP header length. 4192 */ 4193 mss = tcp->tcp_mss - connp->conn_ht_iphc_len; 4194 4195 /* 4196 * Just make sure our rwnd is at least rcvbuf * MSS large, and round up 4197 * to the nearest MSS. 4198 * 4199 * We do the round up here because we need to get the interface MTU 4200 * first before we can do the round up. 4201 */ 4202 tcp->tcp_rwnd = connp->conn_rcvbuf; 4203 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 4204 tcps->tcps_recv_hiwat_minmss * mss); 4205 connp->conn_rcvbuf = tcp->tcp_rwnd; 4206 tcp_set_ws_value(tcp); 4207 tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 4208 if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) 4209 tcp->tcp_snd_ws_ok = B_TRUE; 4210 4211 /* 4212 * Set tcp_snd_ts_ok to true 4213 * so that tcp_xmit_mp will 4214 * include the timestamp 4215 * option in the SYN segment. 4216 */ 4217 if (tcps->tcps_tstamp_always || 4218 (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { 4219 tcp->tcp_snd_ts_ok = B_TRUE; 4220 } 4221 4222 /* 4223 * Note that tcp_snd_sack_ok can be set in tcp_set_destination() if 4224 * the SACK metric is set. 
So here we just check the per stack SACK 4225 * permitted param. 4226 */ 4227 if (tcps->tcps_sack_permitted == 2) { 4228 ASSERT(tcp->tcp_num_sack_blk == 0); 4229 ASSERT(tcp->tcp_notsack_list == NULL); 4230 tcp->tcp_snd_sack_ok = B_TRUE; 4231 } 4232 4233 /* 4234 * Should we use ECN? Note that the current 4235 * default value (SunOS 5.9) of tcp_ecn_permitted 4236 * is 1. The reason for doing this is that there 4237 * are equipments out there that will drop ECN 4238 * enabled IP packets. Setting it to 1 avoids 4239 * compatibility problems. 4240 */ 4241 if (tcps->tcps_ecn_permitted == 2) 4242 tcp->tcp_ecn_ok = B_TRUE; 4243 4244 /* Trace change from BOUND -> SYN_SENT here */ 4245 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 4246 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 4247 int32_t, TCPS_BOUND); 4248 4249 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4250 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 4251 tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 4252 if (syn_mp != NULL) { 4253 /* 4254 * We must bump the generation before sending the syn 4255 * to ensure that we use the right generation in case 4256 * this thread issues a "connected" up call. 4257 */ 4258 SOCK_CONNID_BUMP(tcp->tcp_connid); 4259 /* 4260 * DTrace sending the first SYN as a 4261 * tcp:::connect-request event. 
4262 */ 4263 DTRACE_TCP5(connect__request, mblk_t *, NULL, 4264 ip_xmit_attr_t *, connp->conn_ixa, 4265 void_ip_t *, syn_mp->b_rptr, tcp_t *, tcp, 4266 tcph_t *, 4267 &syn_mp->b_rptr[connp->conn_ixa->ixa_ip_hdr_length]); 4268 tcp_send_data(tcp, syn_mp); 4269 } 4270 4271 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4272 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4273 return (0); 4274 4275 connect_failed: 4276 connp->conn_faddr_v6 = ipv6_all_zeros; 4277 connp->conn_fport = 0; 4278 tcp->tcp_state = oldstate; 4279 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4280 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4281 return (error); 4282 } 4283 4284 int 4285 tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, 4286 int backlog, cred_t *cr, boolean_t bind_to_req_port_only) 4287 { 4288 tcp_t *tcp = connp->conn_tcp; 4289 int error = 0; 4290 tcp_stack_t *tcps = tcp->tcp_tcps; 4291 int32_t oldstate; 4292 4293 /* All Solaris components should pass a cred for this operation. */ 4294 ASSERT(cr != NULL); 4295 4296 if (tcp->tcp_state >= TCPS_BOUND) { 4297 if ((tcp->tcp_state == TCPS_BOUND || 4298 tcp->tcp_state == TCPS_LISTEN) && backlog > 0) { 4299 /* 4300 * Handle listen() increasing backlog. 4301 * This is more "liberal" then what the TPI spec 4302 * requires but is needed to avoid a t_unbind 4303 * when handling listen() since the port number 4304 * might be "stolen" between the unbind and bind. 4305 */ 4306 goto do_listen; 4307 } 4308 if (connp->conn_debug) { 4309 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 4310 "tcp_listen: bad state, %d", tcp->tcp_state); 4311 } 4312 return (-TOUTSTATE); 4313 } else { 4314 if (sa == NULL) { 4315 sin6_t addr; 4316 sin_t *sin; 4317 sin6_t *sin6; 4318 4319 ASSERT(IPCL_IS_NONSTR(connp)); 4320 /* Do an implicit bind: Request for a generic port. 
*/ 4321 if (connp->conn_family == AF_INET) { 4322 len = sizeof (sin_t); 4323 sin = (sin_t *)&addr; 4324 *sin = sin_null; 4325 sin->sin_family = AF_INET; 4326 } else { 4327 ASSERT(connp->conn_family == AF_INET6); 4328 len = sizeof (sin6_t); 4329 sin6 = (sin6_t *)&addr; 4330 *sin6 = sin6_null; 4331 sin6->sin6_family = AF_INET6; 4332 } 4333 sa = (struct sockaddr *)&addr; 4334 } 4335 4336 error = tcp_bind_check(connp, sa, len, cr, 4337 bind_to_req_port_only); 4338 if (error) 4339 return (error); 4340 /* Fall through and do the fanout insertion */ 4341 } 4342 4343 do_listen: 4344 ASSERT(tcp->tcp_state == TCPS_BOUND || tcp->tcp_state == TCPS_LISTEN); 4345 tcp->tcp_conn_req_max = backlog; 4346 if (tcp->tcp_conn_req_max) { 4347 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min) 4348 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min; 4349 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q) 4350 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q; 4351 /* 4352 * If this is a listener, do not reset the eager list 4353 * and other stuffs. Note that we don't check if the 4354 * existing eager list meets the new tcp_conn_req_max 4355 * requirement. 4356 */ 4357 if (tcp->tcp_state != TCPS_LISTEN) { 4358 tcp->tcp_state = TCPS_LISTEN; 4359 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 4360 connp->conn_ixa, void, NULL, tcp_t *, tcp, 4361 void, NULL, int32_t, TCPS_BOUND); 4362 /* Initialize the chain. Don't need the eager_lock */ 4363 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 4364 tcp->tcp_eager_next_drop_q0 = tcp; 4365 tcp->tcp_eager_prev_drop_q0 = tcp; 4366 tcp->tcp_second_ctimer_threshold = 4367 tcps->tcps_ip_abort_linterval; 4368 } 4369 } 4370 4371 /* 4372 * We need to make sure that the conn_recv is set to a non-null 4373 * value before we insert the conn into the classifier table. 4374 * This is to avoid a race with an incoming packet which does an 4375 * ipcl_classify(). 
4376 * We initially set it to tcp_input_listener_unbound to try to 4377 * pick a good squeue for the listener when the first SYN arrives. 4378 * tcp_input_listener_unbound sets it to tcp_input_listener on that 4379 * first SYN. 4380 */ 4381 connp->conn_recv = tcp_input_listener_unbound; 4382 4383 /* Insert the listener in the classifier table */ 4384 error = ip_laddr_fanout_insert(connp); 4385 if (error != 0) { 4386 /* Undo the bind - release the port number */ 4387 oldstate = tcp->tcp_state; 4388 tcp->tcp_state = TCPS_IDLE; 4389 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 4390 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 4391 int32_t, oldstate); 4392 connp->conn_bound_addr_v6 = ipv6_all_zeros; 4393 4394 connp->conn_laddr_v6 = ipv6_all_zeros; 4395 connp->conn_saddr_v6 = ipv6_all_zeros; 4396 connp->conn_ports = 0; 4397 4398 if (connp->conn_anon_port) { 4399 zone_t *zone; 4400 4401 zone = crgetzone(cr); 4402 connp->conn_anon_port = B_FALSE; 4403 (void) tsol_mlp_anon(zone, connp->conn_mlp_type, 4404 connp->conn_proto, connp->conn_lport, B_FALSE); 4405 } 4406 connp->conn_mlp_type = mlptSingle; 4407 4408 tcp_bind_hash_remove(tcp); 4409 return (error); 4410 } else { 4411 /* 4412 * If there is a connection limit, allocate and initialize 4413 * the counter struct. Note that since listen can be called 4414 * multiple times, the struct may have been allready allocated. 4415 */ 4416 if (!list_is_empty(&tcps->tcps_listener_conf) && 4417 tcp->tcp_listen_cnt == NULL) { 4418 tcp_listen_cnt_t *tlc; 4419 uint32_t ratio; 4420 4421 ratio = tcp_find_listener_conf(tcps, 4422 ntohs(connp->conn_lport)); 4423 if (ratio != 0) { 4424 uint32_t mem_ratio, tot_buf; 4425 4426 tlc = kmem_alloc(sizeof (tcp_listen_cnt_t), 4427 KM_SLEEP); 4428 /* 4429 * Calculate the connection limit based on 4430 * the configured ratio and maxusers. Maxusers 4431 * are calculated based on memory size, 4432 * ~ 1 user per MB. 
Note that the conn_rcvbuf 4433 * and conn_sndbuf may change after a 4434 * connection is accepted. So what we have 4435 * is only an approximation. 4436 */ 4437 if ((tot_buf = connp->conn_rcvbuf + 4438 connp->conn_sndbuf) < MB) { 4439 mem_ratio = MB / tot_buf; 4440 tlc->tlc_max = maxusers / ratio * 4441 mem_ratio; 4442 } else { 4443 mem_ratio = tot_buf / MB; 4444 tlc->tlc_max = maxusers / ratio / 4445 mem_ratio; 4446 } 4447 /* At least we should allow two connections! */ 4448 if (tlc->tlc_max <= tcp_min_conn_listener) 4449 tlc->tlc_max = tcp_min_conn_listener; 4450 tlc->tlc_cnt = 1; 4451 tlc->tlc_drop = 0; 4452 tcp->tcp_listen_cnt = tlc; 4453 } 4454 } 4455 } 4456 return (error); 4457 } 4458