1721fffe3SKacheong Poon /* 2721fffe3SKacheong Poon * CDDL HEADER START 3721fffe3SKacheong Poon * 4721fffe3SKacheong Poon * The contents of this file are subject to the terms of the 5721fffe3SKacheong Poon * Common Development and Distribution License (the "License"). 6721fffe3SKacheong Poon * You may not use this file except in compliance with the License. 7721fffe3SKacheong Poon * 8721fffe3SKacheong Poon * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9721fffe3SKacheong Poon * or http://www.opensolaris.org/os/licensing. 10721fffe3SKacheong Poon * See the License for the specific language governing permissions 11721fffe3SKacheong Poon * and limitations under the License. 12721fffe3SKacheong Poon * 13721fffe3SKacheong Poon * When distributing Covered Code, include this CDDL HEADER in each 14721fffe3SKacheong Poon * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15721fffe3SKacheong Poon * If applicable, add the following below this CDDL HEADER, with the 16721fffe3SKacheong Poon * fields enclosed by brackets "[]" replaced with your own identifying 17721fffe3SKacheong Poon * information: Portions Copyright [yyyy] [name of copyright owner] 18721fffe3SKacheong Poon * 19721fffe3SKacheong Poon * CDDL HEADER END 20721fffe3SKacheong Poon */ 21721fffe3SKacheong Poon 22721fffe3SKacheong Poon /* 2366cd0f60SKacheong Poon * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24*c0e6663fSJerry Jelinek * Copyright (c) 2011, Joyent Inc. All rights reserved. 25721fffe3SKacheong Poon */ 26721fffe3SKacheong Poon 27721fffe3SKacheong Poon /* 28721fffe3SKacheong Poon * This file contains functions related to TCP time wait processing. Also 29721fffe3SKacheong Poon * refer to the time wait handling comments in tcp_impl.h. 30721fffe3SKacheong Poon */ 31721fffe3SKacheong Poon 32721fffe3SKacheong Poon #include <sys/types.h> 33721fffe3SKacheong Poon #include <sys/strsun.h> 34721fffe3SKacheong Poon #include <sys/squeue_impl.h> 35721fffe3SKacheong Poon #include <sys/squeue.h> 36721fffe3SKacheong Poon #include <sys/callo.h> 37721fffe3SKacheong Poon 38721fffe3SKacheong Poon #include <inet/common.h> 39721fffe3SKacheong Poon #include <inet/ip.h> 40721fffe3SKacheong Poon #include <inet/tcp.h> 41721fffe3SKacheong Poon #include <inet/tcp_impl.h> 42721fffe3SKacheong Poon #include <inet/tcp_cluster.h> 43721fffe3SKacheong Poon 44721fffe3SKacheong Poon static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *); 45721fffe3SKacheong Poon 46721fffe3SKacheong Poon /* 47721fffe3SKacheong Poon * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. 48721fffe3SKacheong Poon * Running it every 5 seconds seems to give the best results. 49721fffe3SKacheong Poon */ 50721fffe3SKacheong Poon #define TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC) 51721fffe3SKacheong Poon 52721fffe3SKacheong Poon /* 53721fffe3SKacheong Poon * Remove a connection from the list of detached TIME_WAIT connections. 54721fffe3SKacheong Poon * It returns B_FALSE if it can't remove the connection from the list 55721fffe3SKacheong Poon * as the connection has already been removed from the list due to an 56721fffe3SKacheong Poon * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 57721fffe3SKacheong Poon */ 58721fffe3SKacheong Poon boolean_t 59721fffe3SKacheong Poon tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 60721fffe3SKacheong Poon { 61721fffe3SKacheong Poon boolean_t locked = B_FALSE; 62721fffe3SKacheong Poon 63721fffe3SKacheong Poon if (tcp_time_wait == NULL) { 64721fffe3SKacheong Poon tcp_time_wait = *((tcp_squeue_priv_t **) 65721fffe3SKacheong Poon squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 66721fffe3SKacheong Poon mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 67721fffe3SKacheong Poon locked = B_TRUE; 68721fffe3SKacheong Poon } else { 69721fffe3SKacheong Poon ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); 70721fffe3SKacheong Poon } 71721fffe3SKacheong Poon 72721fffe3SKacheong Poon /* 0 means that the tcp_t has not been added to the time wait list. */ 73721fffe3SKacheong Poon if (tcp->tcp_time_wait_expire == 0) { 74721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_next == NULL); 75721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_prev == NULL); 76721fffe3SKacheong Poon if (locked) 77721fffe3SKacheong Poon mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 78721fffe3SKacheong Poon return (B_FALSE); 79721fffe3SKacheong Poon } 80721fffe3SKacheong Poon ASSERT(TCP_IS_DETACHED(tcp)); 81721fffe3SKacheong Poon ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 82721fffe3SKacheong Poon 83721fffe3SKacheong Poon if (tcp == tcp_time_wait->tcp_time_wait_head) { 84721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_prev == NULL); 85721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 86721fffe3SKacheong Poon if (tcp_time_wait->tcp_time_wait_head != NULL) { 87721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 88721fffe3SKacheong Poon NULL; 89721fffe3SKacheong Poon } else { 90721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_tail = NULL; 91721fffe3SKacheong Poon } 92721fffe3SKacheong Poon } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 93721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_next == NULL); 94721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 95721fffe3SKacheong Poon ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 96721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 97721fffe3SKacheong Poon } else { 98721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 99721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 100721fffe3SKacheong Poon tcp->tcp_time_wait_prev->tcp_time_wait_next = 101721fffe3SKacheong Poon tcp->tcp_time_wait_next; 102721fffe3SKacheong Poon tcp->tcp_time_wait_next->tcp_time_wait_prev = 103721fffe3SKacheong Poon tcp->tcp_time_wait_prev; 104721fffe3SKacheong Poon } 105721fffe3SKacheong Poon tcp->tcp_time_wait_next = NULL; 106721fffe3SKacheong Poon tcp->tcp_time_wait_prev = NULL; 107721fffe3SKacheong Poon tcp->tcp_time_wait_expire = 0; 108721fffe3SKacheong Poon 109721fffe3SKacheong Poon if (locked) 110721fffe3SKacheong Poon mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 111721fffe3SKacheong Poon return (B_TRUE); 112721fffe3SKacheong Poon } 113721fffe3SKacheong Poon 114721fffe3SKacheong Poon /* 115721fffe3SKacheong Poon * Add a connection to the list of detached TIME_WAIT connections 116721fffe3SKacheong Poon * and set its time to expire. 117721fffe3SKacheong Poon */ 118721fffe3SKacheong Poon void 119721fffe3SKacheong Poon tcp_time_wait_append(tcp_t *tcp) 120721fffe3SKacheong Poon { 121721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 12266cd0f60SKacheong Poon squeue_t *sqp = tcp->tcp_connp->conn_sqp; 123721fffe3SKacheong Poon tcp_squeue_priv_t *tcp_time_wait = 12466cd0f60SKacheong Poon *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 125721fffe3SKacheong Poon 126721fffe3SKacheong Poon tcp_timers_stop(tcp); 127721fffe3SKacheong Poon 128721fffe3SKacheong Poon /* Freed above */ 129721fffe3SKacheong Poon ASSERT(tcp->tcp_timer_tid == 0); 130721fffe3SKacheong Poon ASSERT(tcp->tcp_ack_tid == 0); 131721fffe3SKacheong Poon 132721fffe3SKacheong Poon /* must have happened at the time of detaching the tcp */ 133721fffe3SKacheong Poon ASSERT(tcp->tcp_ptpahn == NULL); 134721fffe3SKacheong Poon ASSERT(tcp->tcp_flow_stopped == 0); 135721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_next == NULL); 136721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_prev == NULL); 13766cd0f60SKacheong Poon ASSERT(tcp->tcp_time_wait_expire == 0); 138721fffe3SKacheong Poon ASSERT(tcp->tcp_listener == NULL); 139721fffe3SKacheong Poon 14066cd0f60SKacheong Poon tcp->tcp_time_wait_expire = ddi_get_lbolt64(); 141721fffe3SKacheong Poon /* 14266cd0f60SKacheong Poon * Since tcp_time_wait_expire is lbolt64, it should not wrap around 14366cd0f60SKacheong Poon * in practice. Hence it cannot be 0. Note that zero means that the 14466cd0f60SKacheong Poon * tcp_t is not in the TIME_WAIT list. 145721fffe3SKacheong Poon */ 146721fffe3SKacheong Poon tcp->tcp_time_wait_expire += MSEC_TO_TICK( 147721fffe3SKacheong Poon tcps->tcps_time_wait_interval); 148721fffe3SKacheong Poon 149721fffe3SKacheong Poon ASSERT(TCP_IS_DETACHED(tcp)); 150721fffe3SKacheong Poon ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 151721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_next == NULL); 152721fffe3SKacheong Poon ASSERT(tcp->tcp_time_wait_prev == NULL); 153721fffe3SKacheong Poon TCP_DBGSTAT(tcps, tcp_time_wait); 154721fffe3SKacheong Poon 155721fffe3SKacheong Poon mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 156721fffe3SKacheong Poon if (tcp_time_wait->tcp_time_wait_head == NULL) { 157721fffe3SKacheong Poon ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 158721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_head = tcp; 15966cd0f60SKacheong Poon 16066cd0f60SKacheong Poon /* 16166cd0f60SKacheong Poon * Even if the list was empty before, there may be a timer 16266cd0f60SKacheong Poon * running since a tcp_t can be removed from the list 16366cd0f60SKacheong Poon * in other places, such as tcp_clean_death(). So check if 16466cd0f60SKacheong Poon * a timer is needed. 16566cd0f60SKacheong Poon */ 16666cd0f60SKacheong Poon if (tcp_time_wait->tcp_time_wait_tid == 0) { 16766cd0f60SKacheong Poon tcp_time_wait->tcp_time_wait_tid = 16866cd0f60SKacheong Poon timeout_generic(CALLOUT_NORMAL, 16966cd0f60SKacheong Poon tcp_time_wait_collector, sqp, 17066cd0f60SKacheong Poon (hrtime_t)(tcps->tcps_time_wait_interval + 1) * 17166cd0f60SKacheong Poon MICROSEC, CALLOUT_TCP_RESOLUTION, 17266cd0f60SKacheong Poon CALLOUT_FLAG_ROUNDUP); 17366cd0f60SKacheong Poon } 174721fffe3SKacheong Poon } else { 1750870f17bSKacheong Poon /* 1760870f17bSKacheong Poon * The list is not empty, so a timer must be running. If not, 1770870f17bSKacheong Poon * tcp_time_wait_collector() must be running on this 1780870f17bSKacheong Poon * tcp_time_wait list at the same time. 1790870f17bSKacheong Poon */ 1800870f17bSKacheong Poon ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 || 1810870f17bSKacheong Poon tcp_time_wait->tcp_time_wait_running); 182721fffe3SKacheong Poon ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 183721fffe3SKacheong Poon ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 184721fffe3SKacheong Poon TCPS_TIME_WAIT); 185721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 186721fffe3SKacheong Poon tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 18766cd0f60SKacheong Poon 188721fffe3SKacheong Poon } 189721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_tail = tcp; 190721fffe3SKacheong Poon mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 191721fffe3SKacheong Poon } 192721fffe3SKacheong Poon 193721fffe3SKacheong Poon /* 194721fffe3SKacheong Poon * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT 195721fffe3SKacheong Poon * tcp_t. Used in tcp_time_wait_collector(). 196721fffe3SKacheong Poon */ 197721fffe3SKacheong Poon /* ARGSUSED */ 198721fffe3SKacheong Poon static void 199721fffe3SKacheong Poon tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 200721fffe3SKacheong Poon { 201721fffe3SKacheong Poon conn_t *connp = (conn_t *)arg; 202721fffe3SKacheong Poon tcp_t *tcp = connp->conn_tcp; 203721fffe3SKacheong Poon 204721fffe3SKacheong Poon ASSERT(tcp != NULL); 205721fffe3SKacheong Poon if (tcp->tcp_state == TCPS_CLOSED) { 206721fffe3SKacheong Poon return; 207721fffe3SKacheong Poon } 208721fffe3SKacheong Poon 209721fffe3SKacheong Poon ASSERT((connp->conn_family == AF_INET && 210721fffe3SKacheong Poon connp->conn_ipversion == IPV4_VERSION) || 211721fffe3SKacheong Poon (connp->conn_family == AF_INET6 && 212721fffe3SKacheong Poon (connp->conn_ipversion == IPV4_VERSION || 213721fffe3SKacheong Poon connp->conn_ipversion == IPV6_VERSION))); 214721fffe3SKacheong Poon ASSERT(!tcp->tcp_listener); 215721fffe3SKacheong Poon 216721fffe3SKacheong Poon ASSERT(TCP_IS_DETACHED(tcp)); 217721fffe3SKacheong Poon 218721fffe3SKacheong Poon /* 219721fffe3SKacheong Poon * Because they have no upstream client to rebind or tcp_close() 220721fffe3SKacheong Poon * them later, we axe the connection here and now. 221721fffe3SKacheong Poon */ 222721fffe3SKacheong Poon tcp_close_detached(tcp); 223721fffe3SKacheong Poon } 224721fffe3SKacheong Poon 225721fffe3SKacheong Poon /* 226721fffe3SKacheong Poon * Blows away all tcps whose TIME_WAIT has expired. List traversal 227721fffe3SKacheong Poon * is done forwards from the head. 228721fffe3SKacheong Poon * This walks all stack instances since 229721fffe3SKacheong Poon * tcp_time_wait remains global across all stacks. 230721fffe3SKacheong Poon */ 231721fffe3SKacheong Poon /* ARGSUSED */ 232721fffe3SKacheong Poon void 233721fffe3SKacheong Poon tcp_time_wait_collector(void *arg) 234721fffe3SKacheong Poon { 235721fffe3SKacheong Poon tcp_t *tcp; 23666cd0f60SKacheong Poon int64_t now; 237721fffe3SKacheong Poon mblk_t *mp; 238721fffe3SKacheong Poon conn_t *connp; 239721fffe3SKacheong Poon kmutex_t *lock; 240721fffe3SKacheong Poon boolean_t removed; 241721fffe3SKacheong Poon extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t, 242721fffe3SKacheong Poon uint8_t *, in_port_t, uint8_t *, in_port_t, void *); 243721fffe3SKacheong Poon 244721fffe3SKacheong Poon squeue_t *sqp = (squeue_t *)arg; 245721fffe3SKacheong Poon tcp_squeue_priv_t *tcp_time_wait = 246721fffe3SKacheong Poon *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 247721fffe3SKacheong Poon 248721fffe3SKacheong Poon mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 249721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_tid = 0; 2500870f17bSKacheong Poon #ifdef DEBUG 2510870f17bSKacheong Poon tcp_time_wait->tcp_time_wait_running = B_TRUE; 2520870f17bSKacheong Poon #endif 253721fffe3SKacheong Poon 254721fffe3SKacheong Poon if (tcp_time_wait->tcp_free_list != NULL && 255721fffe3SKacheong Poon tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { 256721fffe3SKacheong Poon TCP_G_STAT(tcp_freelist_cleanup); 257721fffe3SKacheong Poon while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { 258721fffe3SKacheong Poon tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 259721fffe3SKacheong Poon tcp->tcp_time_wait_next = NULL; 260721fffe3SKacheong Poon tcp_time_wait->tcp_free_list_cnt--; 261721fffe3SKacheong Poon ASSERT(tcp->tcp_tcps == NULL); 262721fffe3SKacheong Poon CONN_DEC_REF(tcp->tcp_connp); 263721fffe3SKacheong Poon } 264721fffe3SKacheong Poon ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); 265721fffe3SKacheong Poon } 266721fffe3SKacheong Poon 267721fffe3SKacheong Poon /* 268721fffe3SKacheong Poon * In order to reap time waits reliably, we should use a 269721fffe3SKacheong Poon * source of time that is not adjustable by the user -- hence 27066cd0f60SKacheong Poon * the call to ddi_get_lbolt64(). 271721fffe3SKacheong Poon */ 27266cd0f60SKacheong Poon now = ddi_get_lbolt64(); 273721fffe3SKacheong Poon while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { 274721fffe3SKacheong Poon /* 27566cd0f60SKacheong Poon * lbolt64 should not wrap around in practice... So we can 27666cd0f60SKacheong Poon * do a direct comparison. 277721fffe3SKacheong Poon */ 27866cd0f60SKacheong Poon if (now < tcp->tcp_time_wait_expire) 279721fffe3SKacheong Poon break; 280721fffe3SKacheong Poon 281721fffe3SKacheong Poon removed = tcp_time_wait_remove(tcp, tcp_time_wait); 282721fffe3SKacheong Poon ASSERT(removed); 283721fffe3SKacheong Poon 284721fffe3SKacheong Poon connp = tcp->tcp_connp; 285721fffe3SKacheong Poon ASSERT(connp->conn_fanout != NULL); 286721fffe3SKacheong Poon lock = &connp->conn_fanout->connf_lock; 287721fffe3SKacheong Poon /* 288721fffe3SKacheong Poon * This is essentially a TW reclaim fast path optimization for 289721fffe3SKacheong Poon * performance where the timewait collector checks under the 290721fffe3SKacheong Poon * fanout lock (so that no one else can get access to the 291721fffe3SKacheong Poon * conn_t) that the refcnt is 2 i.e. one for TCP and one for 292721fffe3SKacheong Poon * the classifier hash list. If ref count is indeed 2, we can 293721fffe3SKacheong Poon * just remove the conn under the fanout lock and avoid 294721fffe3SKacheong Poon * cleaning up the conn under the squeue, provided that 295721fffe3SKacheong Poon * clustering callbacks are not enabled. If clustering is 296721fffe3SKacheong Poon * enabled, we need to make the clustering callback before 297721fffe3SKacheong Poon * setting the CONDEMNED flag and after dropping all locks and 298721fffe3SKacheong Poon * so we forego this optimization and fall back to the slow 299721fffe3SKacheong Poon * path. Also please see the comments in tcp_closei_local 300721fffe3SKacheong Poon * regarding the refcnt logic. 301721fffe3SKacheong Poon * 302721fffe3SKacheong Poon * Since we are holding the tcp_time_wait_lock, its better 303721fffe3SKacheong Poon * not to block on the fanout_lock because other connections 304721fffe3SKacheong Poon * can't add themselves to time_wait list. So we do a 305721fffe3SKacheong Poon * tryenter instead of mutex_enter. 306721fffe3SKacheong Poon */ 307721fffe3SKacheong Poon if (mutex_tryenter(lock)) { 308721fffe3SKacheong Poon mutex_enter(&connp->conn_lock); 309721fffe3SKacheong Poon if ((connp->conn_ref == 2) && 310721fffe3SKacheong Poon (cl_inet_disconnect == NULL)) { 311721fffe3SKacheong Poon ipcl_hash_remove_locked(connp, 312721fffe3SKacheong Poon connp->conn_fanout); 313721fffe3SKacheong Poon /* 314721fffe3SKacheong Poon * Set the CONDEMNED flag now itself so that 315721fffe3SKacheong Poon * the refcnt cannot increase due to any 316721fffe3SKacheong Poon * walker. 317721fffe3SKacheong Poon */ 318721fffe3SKacheong Poon connp->conn_state_flags |= CONN_CONDEMNED; 319721fffe3SKacheong Poon mutex_exit(lock); 320721fffe3SKacheong Poon mutex_exit(&connp->conn_lock); 321721fffe3SKacheong Poon if (tcp_time_wait->tcp_free_list_cnt < 322721fffe3SKacheong Poon tcp_free_list_max_cnt) { 323721fffe3SKacheong Poon /* Add to head of tcp_free_list */ 324721fffe3SKacheong Poon mutex_exit( 325721fffe3SKacheong Poon &tcp_time_wait->tcp_time_wait_lock); 326721fffe3SKacheong Poon tcp_cleanup(tcp); 327721fffe3SKacheong Poon ASSERT(connp->conn_latch == NULL); 328721fffe3SKacheong Poon ASSERT(connp->conn_policy == NULL); 329721fffe3SKacheong Poon ASSERT(tcp->tcp_tcps == NULL); 330721fffe3SKacheong Poon ASSERT(connp->conn_netstack == NULL); 331721fffe3SKacheong Poon 332721fffe3SKacheong Poon mutex_enter( 333721fffe3SKacheong Poon &tcp_time_wait->tcp_time_wait_lock); 334721fffe3SKacheong Poon tcp->tcp_time_wait_next = 335721fffe3SKacheong Poon tcp_time_wait->tcp_free_list; 336721fffe3SKacheong Poon tcp_time_wait->tcp_free_list = tcp; 337721fffe3SKacheong Poon tcp_time_wait->tcp_free_list_cnt++; 338721fffe3SKacheong Poon continue; 339721fffe3SKacheong Poon } else { 340721fffe3SKacheong Poon /* Do not add to tcp_free_list */ 341721fffe3SKacheong Poon mutex_exit( 342721fffe3SKacheong Poon &tcp_time_wait->tcp_time_wait_lock); 343721fffe3SKacheong Poon tcp_bind_hash_remove(tcp); 344721fffe3SKacheong Poon ixa_cleanup(tcp->tcp_connp->conn_ixa); 345721fffe3SKacheong Poon tcp_ipsec_cleanup(tcp); 346721fffe3SKacheong Poon CONN_DEC_REF(tcp->tcp_connp); 347721fffe3SKacheong Poon } 348721fffe3SKacheong Poon } else { 349721fffe3SKacheong Poon CONN_INC_REF_LOCKED(connp); 350721fffe3SKacheong Poon mutex_exit(lock); 351721fffe3SKacheong Poon mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 352721fffe3SKacheong Poon mutex_exit(&connp->conn_lock); 353721fffe3SKacheong Poon /* 354721fffe3SKacheong Poon * We can reuse the closemp here since conn has 355721fffe3SKacheong Poon * detached (otherwise we wouldn't even be in 356721fffe3SKacheong Poon * time_wait list). tcp_closemp_used can safely 357721fffe3SKacheong Poon * be changed without taking a lock as no other 358721fffe3SKacheong Poon * thread can concurrently access it at this 359721fffe3SKacheong Poon * point in the connection lifecycle. 360721fffe3SKacheong Poon */ 361721fffe3SKacheong Poon 362721fffe3SKacheong Poon if (tcp->tcp_closemp.b_prev == NULL) 363721fffe3SKacheong Poon tcp->tcp_closemp_used = B_TRUE; 364721fffe3SKacheong Poon else 365721fffe3SKacheong Poon cmn_err(CE_PANIC, 366721fffe3SKacheong Poon "tcp_timewait_collector: " 367721fffe3SKacheong Poon "concurrent use of tcp_closemp: " 368721fffe3SKacheong Poon "connp %p tcp %p\n", (void *)connp, 369721fffe3SKacheong Poon (void *)tcp); 370721fffe3SKacheong Poon 371721fffe3SKacheong Poon TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 372721fffe3SKacheong Poon mp = &tcp->tcp_closemp; 373721fffe3SKacheong Poon SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 374721fffe3SKacheong Poon tcp_timewait_close, connp, NULL, 375721fffe3SKacheong Poon SQ_FILL, SQTAG_TCP_TIMEWAIT); 376721fffe3SKacheong Poon } 377721fffe3SKacheong Poon } else { 378721fffe3SKacheong Poon mutex_enter(&connp->conn_lock); 379721fffe3SKacheong Poon CONN_INC_REF_LOCKED(connp); 380721fffe3SKacheong Poon mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 381721fffe3SKacheong Poon mutex_exit(&connp->conn_lock); 382721fffe3SKacheong Poon /* 383721fffe3SKacheong Poon * We can reuse the closemp here since conn has 384721fffe3SKacheong Poon * detached (otherwise we wouldn't even be in 385721fffe3SKacheong Poon * time_wait list). tcp_closemp_used can safely 386721fffe3SKacheong Poon * be changed without taking a lock as no other 387721fffe3SKacheong Poon * thread can concurrently access it at this 388721fffe3SKacheong Poon * point in the connection lifecycle. 389721fffe3SKacheong Poon */ 390721fffe3SKacheong Poon 391721fffe3SKacheong Poon if (tcp->tcp_closemp.b_prev == NULL) 392721fffe3SKacheong Poon tcp->tcp_closemp_used = B_TRUE; 393721fffe3SKacheong Poon else 394721fffe3SKacheong Poon cmn_err(CE_PANIC, "tcp_timewait_collector: " 395721fffe3SKacheong Poon "concurrent use of tcp_closemp: " 396721fffe3SKacheong Poon "connp %p tcp %p\n", (void *)connp, 397721fffe3SKacheong Poon (void *)tcp); 398721fffe3SKacheong Poon 399721fffe3SKacheong Poon TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 400721fffe3SKacheong Poon mp = &tcp->tcp_closemp; 401721fffe3SKacheong Poon SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 402721fffe3SKacheong Poon tcp_timewait_close, connp, NULL, 403721fffe3SKacheong Poon SQ_FILL, SQTAG_TCP_TIMEWAIT); 404721fffe3SKacheong Poon } 405721fffe3SKacheong Poon mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 406721fffe3SKacheong Poon } 407721fffe3SKacheong Poon 408721fffe3SKacheong Poon if (tcp_time_wait->tcp_free_list != NULL) 409721fffe3SKacheong Poon tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; 410721fffe3SKacheong Poon 41166cd0f60SKacheong Poon /* 41266cd0f60SKacheong Poon * If the time wait list is not empty and there is no timer running, 41366cd0f60SKacheong Poon * restart it. 41466cd0f60SKacheong Poon */ 41566cd0f60SKacheong Poon if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL && 41666cd0f60SKacheong Poon tcp_time_wait->tcp_time_wait_tid == 0) { 41766cd0f60SKacheong Poon hrtime_t firetime; 41866cd0f60SKacheong Poon 41966cd0f60SKacheong Poon firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now); 42066cd0f60SKacheong Poon /* This ensures that we won't wake up too often. */ 42166cd0f60SKacheong Poon firetime = MAX(TCP_TIME_WAIT_DELAY, firetime); 422721fffe3SKacheong Poon tcp_time_wait->tcp_time_wait_tid = 42366cd0f60SKacheong Poon timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, 42466cd0f60SKacheong Poon sqp, firetime, CALLOUT_TCP_RESOLUTION, 425721fffe3SKacheong Poon CALLOUT_FLAG_ROUNDUP); 42666cd0f60SKacheong Poon } 4270870f17bSKacheong Poon #ifdef DEBUG 4280870f17bSKacheong Poon tcp_time_wait->tcp_time_wait_running = B_FALSE; 4290870f17bSKacheong Poon #endif 430721fffe3SKacheong Poon mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 431721fffe3SKacheong Poon } 432721fffe3SKacheong Poon 433721fffe3SKacheong Poon /* 434721fffe3SKacheong Poon * tcp_time_wait_processing() handles processing of incoming packets when 435721fffe3SKacheong Poon * the tcp_t is in the TIME_WAIT state. 436721fffe3SKacheong Poon * 437721fffe3SKacheong Poon * A TIME_WAIT tcp_t that has an associated open TCP end point (not in 438721fffe3SKacheong Poon * detached state) is never put on the time wait list. 439721fffe3SKacheong Poon */ 440721fffe3SKacheong Poon void 441721fffe3SKacheong Poon tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, 442721fffe3SKacheong Poon uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) 443721fffe3SKacheong Poon { 444721fffe3SKacheong Poon int32_t bytes_acked; 445721fffe3SKacheong Poon int32_t gap; 446721fffe3SKacheong Poon int32_t rgap; 447721fffe3SKacheong Poon tcp_opt_t tcpopt; 448721fffe3SKacheong Poon uint_t flags; 449721fffe3SKacheong Poon uint32_t new_swnd = 0; 450721fffe3SKacheong Poon conn_t *nconnp; 451721fffe3SKacheong Poon conn_t *connp = tcp->tcp_connp; 452721fffe3SKacheong Poon tcp_stack_t *tcps = tcp->tcp_tcps; 453721fffe3SKacheong Poon 454721fffe3SKacheong Poon BUMP_LOCAL(tcp->tcp_ibsegs); 455721fffe3SKacheong Poon DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 456721fffe3SKacheong Poon 457721fffe3SKacheong Poon flags = (unsigned int)tcpha->tha_flags & 0xFF; 458721fffe3SKacheong Poon new_swnd = ntohs(tcpha->tha_win) << 459721fffe3SKacheong Poon ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); 460721fffe3SKacheong Poon if (tcp->tcp_snd_ts_ok) { 461721fffe3SKacheong Poon if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { 462721fffe3SKacheong Poon tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 463721fffe3SKacheong Poon tcp->tcp_rnxt, TH_ACK); 464721fffe3SKacheong Poon goto done; 465721fffe3SKacheong Poon } 466721fffe3SKacheong Poon } 467721fffe3SKacheong Poon gap = seg_seq - tcp->tcp_rnxt; 468721fffe3SKacheong Poon rgap = tcp->tcp_rwnd - (gap + seg_len); 469721fffe3SKacheong Poon if (gap < 0) { 470721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); 471721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, 472721fffe3SKacheong Poon (seg_len > -gap ? -gap : seg_len)); 473721fffe3SKacheong Poon seg_len += gap; 474721fffe3SKacheong Poon if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 475721fffe3SKacheong Poon if (flags & TH_RST) { 476721fffe3SKacheong Poon goto done; 477721fffe3SKacheong Poon } 478721fffe3SKacheong Poon if ((flags & TH_FIN) && seg_len == -1) { 479721fffe3SKacheong Poon /* 480721fffe3SKacheong Poon * When TCP receives a duplicate FIN in 481721fffe3SKacheong Poon * TIME_WAIT state, restart the 2 MSL timer. 482721fffe3SKacheong Poon * See page 73 in RFC 793. Make sure this TCP 483721fffe3SKacheong Poon * is already on the TIME_WAIT list. If not, 484721fffe3SKacheong Poon * just restart the timer. 485721fffe3SKacheong Poon */ 486721fffe3SKacheong Poon if (TCP_IS_DETACHED(tcp)) { 487721fffe3SKacheong Poon if (tcp_time_wait_remove(tcp, NULL) == 488721fffe3SKacheong Poon B_TRUE) { 489721fffe3SKacheong Poon tcp_time_wait_append(tcp); 490721fffe3SKacheong Poon TCP_DBGSTAT(tcps, 491721fffe3SKacheong Poon tcp_rput_time_wait); 492721fffe3SKacheong Poon } 493721fffe3SKacheong Poon } else { 494721fffe3SKacheong Poon ASSERT(tcp != NULL); 495721fffe3SKacheong Poon TCP_TIMER_RESTART(tcp, 496721fffe3SKacheong Poon tcps->tcps_time_wait_interval); 497721fffe3SKacheong Poon } 498721fffe3SKacheong Poon tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 499721fffe3SKacheong Poon tcp->tcp_rnxt, TH_ACK); 500721fffe3SKacheong Poon goto done; 501721fffe3SKacheong Poon } 502721fffe3SKacheong Poon flags |= TH_ACK_NEEDED; 503721fffe3SKacheong Poon seg_len = 0; 504721fffe3SKacheong Poon goto process_ack; 505721fffe3SKacheong Poon } 506721fffe3SKacheong Poon 507721fffe3SKacheong Poon /* Fix seg_seq, and chew the gap off the front. */ 508721fffe3SKacheong Poon seg_seq = tcp->tcp_rnxt; 509721fffe3SKacheong Poon } 510721fffe3SKacheong Poon 511721fffe3SKacheong Poon if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 512721fffe3SKacheong Poon /* 513721fffe3SKacheong Poon * Make sure that when we accept the connection, pick 514*c0e6663fSJerry Jelinek * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the 515721fffe3SKacheong Poon * old connection. 516721fffe3SKacheong Poon * 517721fffe3SKacheong Poon * The next ISS generated is equal to tcp_iss_incr_extra 518*c0e6663fSJerry Jelinek * + tcp_iss_incr/2 + other components depending on the 519721fffe3SKacheong Poon * value of tcp_strong_iss. We pre-calculate the new 520721fffe3SKacheong Poon * ISS here and compare with tcp_snxt to determine if 521721fffe3SKacheong Poon * we need to make adjustment to tcp_iss_incr_extra. 522721fffe3SKacheong Poon * 523721fffe3SKacheong Poon * The above calculation is ugly and is a 524721fffe3SKacheong Poon * waste of CPU cycles... 525721fffe3SKacheong Poon */ 526721fffe3SKacheong Poon uint32_t new_iss = tcps->tcps_iss_incr_extra; 527721fffe3SKacheong Poon int32_t adj; 528721fffe3SKacheong Poon ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 529721fffe3SKacheong Poon 530721fffe3SKacheong Poon switch (tcps->tcps_strong_iss) { 531721fffe3SKacheong Poon case 2: { 532721fffe3SKacheong Poon /* Add time and MD5 components. */ 533721fffe3SKacheong Poon uint32_t answer[4]; 534721fffe3SKacheong Poon struct { 535721fffe3SKacheong Poon uint32_t ports; 536721fffe3SKacheong Poon in6_addr_t src; 537721fffe3SKacheong Poon in6_addr_t dst; 538721fffe3SKacheong Poon } arg; 539721fffe3SKacheong Poon MD5_CTX context; 540721fffe3SKacheong Poon 541721fffe3SKacheong Poon mutex_enter(&tcps->tcps_iss_key_lock); 542721fffe3SKacheong Poon context = tcps->tcps_iss_key; 543721fffe3SKacheong Poon mutex_exit(&tcps->tcps_iss_key_lock); 544721fffe3SKacheong Poon arg.ports = connp->conn_ports; 545721fffe3SKacheong Poon /* We use MAPPED addresses in tcp_iss_init */ 546721fffe3SKacheong Poon arg.src = connp->conn_laddr_v6; 547721fffe3SKacheong Poon arg.dst = connp->conn_faddr_v6; 548721fffe3SKacheong Poon MD5Update(&context, (uchar_t *)&arg, 549721fffe3SKacheong Poon sizeof (arg)); 550721fffe3SKacheong Poon MD5Final((uchar_t *)answer, &context); 551721fffe3SKacheong Poon answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; 552721fffe3SKacheong Poon new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; 553721fffe3SKacheong Poon break; 554721fffe3SKacheong Poon } 555721fffe3SKacheong Poon case 1: 556721fffe3SKacheong Poon /* Add time component and min random (i.e. 1). */ 557721fffe3SKacheong Poon new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; 558721fffe3SKacheong Poon break; 559721fffe3SKacheong Poon default: 560721fffe3SKacheong Poon /* Add only time component. */ 561*c0e6663fSJerry Jelinek new_iss += (uint32_t)gethrestime_sec() * 562*c0e6663fSJerry Jelinek tcps->tcps_iss_incr; 563721fffe3SKacheong Poon break; 564721fffe3SKacheong Poon } 565721fffe3SKacheong Poon if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 566721fffe3SKacheong Poon /* 567*c0e6663fSJerry Jelinek * New ISS not guaranteed to be tcp_iss_incr/2 568721fffe3SKacheong Poon * ahead of the current tcp_snxt, so add the 569721fffe3SKacheong Poon * difference to tcp_iss_incr_extra. 570721fffe3SKacheong Poon */ 571721fffe3SKacheong Poon tcps->tcps_iss_incr_extra += adj; 572721fffe3SKacheong Poon } 573721fffe3SKacheong Poon /* 574721fffe3SKacheong Poon * If tcp_clean_death() can not perform the task now, 575721fffe3SKacheong Poon * drop the SYN packet and let the other side re-xmit. 576721fffe3SKacheong Poon * Otherwise pass the SYN packet back in, since the 577721fffe3SKacheong Poon * old tcp state has been cleaned up or freed. 578721fffe3SKacheong Poon */ 579721fffe3SKacheong Poon if (tcp_clean_death(tcp, 0) == -1) 580721fffe3SKacheong Poon goto done; 581721fffe3SKacheong Poon nconnp = ipcl_classify(mp, ira, ipst); 582721fffe3SKacheong Poon if (nconnp != NULL) { 583721fffe3SKacheong Poon TCP_STAT(tcps, tcp_time_wait_syn_success); 584721fffe3SKacheong Poon /* Drops ref on nconnp */ 585721fffe3SKacheong Poon tcp_reinput(nconnp, mp, ira, ipst); 586721fffe3SKacheong Poon return; 587721fffe3SKacheong Poon } 588721fffe3SKacheong Poon goto done; 589721fffe3SKacheong Poon } 590721fffe3SKacheong Poon 591721fffe3SKacheong Poon /* 592721fffe3SKacheong Poon * rgap is the amount of stuff received out of window. A negative 593721fffe3SKacheong Poon * value is the amount out of window. 594721fffe3SKacheong Poon */ 595721fffe3SKacheong Poon if (rgap < 0) { 596721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs); 597721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap); 598721fffe3SKacheong Poon /* Fix seg_len and make sure there is something left. */ 599721fffe3SKacheong Poon seg_len += rgap; 600721fffe3SKacheong Poon if (seg_len <= 0) { 601721fffe3SKacheong Poon if (flags & TH_RST) { 602721fffe3SKacheong Poon goto done; 603721fffe3SKacheong Poon } 604721fffe3SKacheong Poon flags |= TH_ACK_NEEDED; 605721fffe3SKacheong Poon seg_len = 0; 606721fffe3SKacheong Poon goto process_ack; 607721fffe3SKacheong Poon } 608721fffe3SKacheong Poon } 609721fffe3SKacheong Poon /* 610721fffe3SKacheong Poon * Check whether we can update tcp_ts_recent. This test is 611721fffe3SKacheong Poon * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 612721fffe3SKacheong Poon * Extensions for High Performance: An Update", Internet Draft. 613721fffe3SKacheong Poon */ 614721fffe3SKacheong Poon if (tcp->tcp_snd_ts_ok && 615721fffe3SKacheong Poon TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 616721fffe3SKacheong Poon SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 617721fffe3SKacheong Poon tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 618721fffe3SKacheong Poon tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 619721fffe3SKacheong Poon } 620721fffe3SKacheong Poon 621721fffe3SKacheong Poon if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 622721fffe3SKacheong Poon /* Always ack out of order packets */ 623721fffe3SKacheong Poon flags |= TH_ACK_NEEDED; 624721fffe3SKacheong Poon seg_len = 0; 625721fffe3SKacheong Poon } else if (seg_len > 0) { 626721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpInClosed); 627721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); 628721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); 629721fffe3SKacheong Poon } 630721fffe3SKacheong Poon if (flags & TH_RST) { 631721fffe3SKacheong Poon (void) tcp_clean_death(tcp, 0); 632721fffe3SKacheong Poon goto done; 633721fffe3SKacheong Poon } 634721fffe3SKacheong Poon if (flags & TH_SYN) { 635721fffe3SKacheong Poon tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 636721fffe3SKacheong Poon TH_RST|TH_ACK); 637721fffe3SKacheong Poon /* 638721fffe3SKacheong Poon * Do not delete the TCP structure if it is in 639721fffe3SKacheong Poon * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 640721fffe3SKacheong Poon */ 641721fffe3SKacheong Poon goto done; 642721fffe3SKacheong Poon } 643721fffe3SKacheong Poon process_ack: 644721fffe3SKacheong Poon if (flags & TH_ACK) { 645721fffe3SKacheong Poon bytes_acked = (int)(seg_ack - tcp->tcp_suna); 646721fffe3SKacheong Poon if (bytes_acked <= 0) { 647721fffe3SKacheong Poon if (bytes_acked == 0 && seg_len == 0 && 648721fffe3SKacheong Poon new_swnd == tcp->tcp_swnd) 649721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpInDupAck); 650721fffe3SKacheong Poon } else { 651721fffe3SKacheong Poon /* Acks something not sent */ 652721fffe3SKacheong Poon flags |= TH_ACK_NEEDED; 653721fffe3SKacheong Poon } 654721fffe3SKacheong Poon } 655721fffe3SKacheong Poon if (flags & TH_ACK_NEEDED) { 656721fffe3SKacheong Poon /* 657721fffe3SKacheong Poon * Time to send an ack for some reason. 658721fffe3SKacheong Poon */ 659721fffe3SKacheong Poon tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 660721fffe3SKacheong Poon tcp->tcp_rnxt, TH_ACK); 661721fffe3SKacheong Poon } 662721fffe3SKacheong Poon done: 663721fffe3SKacheong Poon freemsg(mp); 664721fffe3SKacheong Poon } 665