/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent Inc. All rights reserved.
 */

/*
 * This file contains functions related to TCP time wait processing.  Also
 * refer to the time wait handling comments in tcp_impl.h.
 */

#include <sys/types.h>
#include <sys/strsun.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/callo.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>

static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);

/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * Running it every 5 seconds seems to give the best results.
 */
#define	TCP_TIME_WAIT_DELAY	((hrtime_t)5 * NANOSEC)

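/*
 * Each squeue keeps its own TIME_WAIT state in the tcp_squeue_priv_t
 * hanging off the squeue (see squeue_getprivate(sqp, SQPRIVATE_TCP)):
 * the head and tail of the doubly-linked list of detached TIME_WAIT
 * tcp_t's, the tcp_time_wait_lock protecting it, the id of the pending
 * collector timeout, and a small free list of tcp_t's available for
 * reuse.  The routines below all operate on this per-squeue state.
 */
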
/*
 * Remove a connection from the list of detached TIME_WAIT connections.
 * Returns B_FALSE if the connection is no longer on the list because an
 * earlier call to tcp_time_wait_remove() already took it off; otherwise
 * it returns B_TRUE.
 */
boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
{
	boolean_t locked = B_FALSE;

	if (tcp_time_wait == NULL) {
		tcp_time_wait = *((tcp_squeue_priv_t **)
		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
		locked = B_TRUE;
	} else {
		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
	}

	/* 0 means that the tcp_t has not been added to the time wait list. */
	if (tcp->tcp_time_wait_expire == 0) {
		ASSERT(tcp->tcp_time_wait_next == NULL);
		ASSERT(tcp->tcp_time_wait_prev == NULL);
		if (locked)
			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
		return (B_FALSE);
	}
	ASSERT(TCP_IS_DETACHED(tcp));
	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);

	if (tcp == tcp_time_wait->tcp_time_wait_head) {
		ASSERT(tcp->tcp_time_wait_prev == NULL);
		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
		if (tcp_time_wait->tcp_time_wait_head != NULL) {
			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
			    NULL;
		} else {
			tcp_time_wait->tcp_time_wait_tail = NULL;
		}
	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
		ASSERT(tcp->tcp_time_wait_next == NULL);
		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
	} else {
		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
		tcp->tcp_time_wait_prev->tcp_time_wait_next =
		    tcp->tcp_time_wait_next;
		tcp->tcp_time_wait_next->tcp_time_wait_prev =
		    tcp->tcp_time_wait_prev;
	}
	tcp->tcp_time_wait_next = NULL;
	tcp->tcp_time_wait_prev = NULL;
	tcp->tcp_time_wait_expire = 0;

	if (locked)
		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
	return (B_TRUE);
}

/* Constants used for fast checking of a localhost address */
#if defined(_BIG_ENDIAN)
#define	IPv4_LOCALHOST	0x7f000000U
#define	IPv4_LH_MASK	0xffffff00U
#else
#define	IPv4_LOCALHOST	0x0000007fU
#define	IPv4_LH_MASK	0x00ffffffU
#endif

#define	IS_LOCAL_HOST(x)	( \
	((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
	((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
	((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))

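/*
 * Note that the IPv4 check above requires the first three octets to be
 * 127.0.0, i.e. it matches 127.0.0.0/24 rather than all of 127/8; the
 * IPv6 check matches only ::1.
 */
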
/*
 * Add a connection to the list of detached TIME_WAIT connections
 * and set its time to expire.
 */
void
tcp_time_wait_append(tcp_t *tcp)
{
	tcp_stack_t *tcps = tcp->tcp_tcps;
	squeue_t *sqp = tcp->tcp_connp->conn_sqp;
	tcp_squeue_priv_t *tcp_time_wait =
	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
	hrtime_t firetime = 0;

	tcp_timers_stop(tcp);

	/* Freed above */
	ASSERT(tcp->tcp_timer_tid == 0);
	ASSERT(tcp->tcp_ack_tid == 0);

	/* Must have happened at the time of detaching the tcp. */
	ASSERT(tcp->tcp_ptpahn == NULL);
	ASSERT(tcp->tcp_flow_stopped == 0);
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);
	ASSERT(tcp->tcp_listener == NULL);

	tcp->tcp_time_wait_expire = ddi_get_lbolt64();
	if (IS_LOCAL_HOST(tcp)) {
		/*
		 * This is the fastpath for handling localhost connections.
		 * Since we don't have to worry about packets on the localhost
		 * showing up after a long network delay, we want to expire
		 * these quickly so the port range on the localhost doesn't
		 * get starved by short-running, local apps.
		 *
		 * Leave tcp_time_wait_expire at the current time.  This
		 * essentially means the connection is expired now and it will
		 * clean up the next time tcp_time_wait_collector runs.  We set
		 * firetime to use a short delay so that if we have to start a
		 * tcp_time_wait_collector thread below, it runs soon instead
		 * of after a delay of time_wait_interval.  firetime being set
		 * to a non-0 value is also our indicator that we should add
		 * this connection to the head of the time wait list (since we
		 * are already expired) so that it's sure to get cleaned up on
		 * the next run of tcp_time_wait_collector (which expects the
		 * entries to appear in time-order and stops when it hits the
		 * first non-expired entry).
		 */
		firetime = TCP_TIME_WAIT_DELAY;
	} else {
		/*
		 * Since tcp_time_wait_expire is lbolt64, it should not wrap
		 * around in practice.  Hence it cannot be 0.  Note that zero
		 * means that the tcp_t is not in the TIME_WAIT list.
		 */
		tcp->tcp_time_wait_expire += MSEC_TO_TICK(
		    tcps->tcps_time_wait_interval);
	}

	ASSERT(TCP_IS_DETACHED(tcp));
	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	TCP_DBGSTAT(tcps, tcp_time_wait);

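	/*
	 * Insert the connection into the per-squeue list.  A non-zero
	 * firetime (the localhost fastpath above) means the entry is
	 * already expired and goes at the head of the list; everything
	 * else is appended at the tail, keeping the list sorted by
	 * expiry time.
	 */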
	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	if (tcp_time_wait->tcp_time_wait_head == NULL) {
		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
		tcp_time_wait->tcp_time_wait_head = tcp;

		/*
		 * Even if the list was empty before, there may be a timer
		 * running since a tcp_t can be removed from the list
		 * in other places, such as tcp_clean_death().  So check if
		 * a timer is needed.
		 */
		if (tcp_time_wait->tcp_time_wait_tid == 0) {
			if (firetime == 0)
				firetime = (hrtime_t)
				    (tcps->tcps_time_wait_interval + 1) *
				    MICROSEC;

			tcp_time_wait->tcp_time_wait_tid =
			    timeout_generic(CALLOUT_NORMAL,
			    tcp_time_wait_collector, sqp, firetime,
			    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
		}
		tcp_time_wait->tcp_time_wait_tail = tcp;
	} else {
		/*
		 * The list is not empty, so a timer must be running.  If not,
		 * tcp_time_wait_collector() must be running on this
		 * tcp_time_wait list at the same time.
		 */
		ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
		    tcp_time_wait->tcp_time_wait_running);
		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
		    TCPS_TIME_WAIT);

		if (firetime == 0) {
			/* add at end */
			tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
			    tcp;
			tcp->tcp_time_wait_prev =
			    tcp_time_wait->tcp_time_wait_tail;
			tcp_time_wait->tcp_time_wait_tail = tcp;
		} else {
			/* add at head */
			tcp->tcp_time_wait_next =
			    tcp_time_wait->tcp_time_wait_head;
			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
			    tcp;
			tcp_time_wait->tcp_time_wait_head = tcp;
		}
	}
	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}

/*
 * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
 * tcp_t.  Used in tcp_time_wait_collector().
 */
/* ARGSUSED */
static void
tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	ASSERT(tcp != NULL);
	if (tcp->tcp_state == TCPS_CLOSED) {
		return;
	}

	ASSERT((connp->conn_family == AF_INET &&
	    connp->conn_ipversion == IPV4_VERSION) ||
	    (connp->conn_family == AF_INET6 &&
	    (connp->conn_ipversion == IPV4_VERSION ||
	    connp->conn_ipversion == IPV6_VERSION)));
	ASSERT(!tcp->tcp_listener);

	ASSERT(TCP_IS_DETACHED(tcp));

	/*
	 * Because they have no upstream client to rebind or tcp_close()
	 * them later, we axe the connection here and now.
	 */
	tcp_close_detached(tcp);
}

/*
 * Blows away all tcps whose TIME_WAIT has expired.  List traversal
 * is done forwards from the head.  This walks all stack instances since
 * tcp_time_wait remains global across all stacks.
 */
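/*
 * In outline, each run of the collector:
 *
 * 1. Frees any tcp_t's parked on tcp_free_list by a previous run.
 * 2. Walks the list from the head, reaping every entry whose expiry
 *    time has passed.  If the conn_t's only references are TCP and the
 *    classifier hash table (conn_ref == 2) and no clustering disconnect
 *    callback is registered, the conn is reclaimed right here under the
 *    fanout lock; otherwise tcp_timewait_close() is dispatched to the
 *    conn's squeue via tcp_closemp.
 * 3. Re-arms the collector timeout if the list is still non-empty.
 */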
/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
	tcp_t *tcp;
	int64_t now;
	mblk_t *mp;
	conn_t *connp;
	kmutex_t *lock;
	boolean_t removed;
	extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
	    uint8_t *, in_port_t, uint8_t *, in_port_t, void *);

	squeue_t *sqp = (squeue_t *)arg;
	tcp_squeue_priv_t *tcp_time_wait =
	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));

	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	tcp_time_wait->tcp_time_wait_tid = 0;
#ifdef DEBUG
	tcp_time_wait->tcp_time_wait_running = B_TRUE;
#endif

	if (tcp_time_wait->tcp_free_list != NULL &&
	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
		TCP_G_STAT(tcp_freelist_cleanup);
		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
			tcp->tcp_time_wait_next = NULL;
			tcp_time_wait->tcp_free_list_cnt--;
			ASSERT(tcp->tcp_tcps == NULL);
			CONN_DEC_REF(tcp->tcp_connp);
		}
		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
	}

	/*
	 * In order to reap time waits reliably, we should use a
	 * source of time that is not adjustable by the user -- hence
	 * the call to ddi_get_lbolt64().
	 */
	now = ddi_get_lbolt64();
	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
		/*
		 * lbolt64 should not wrap around in practice...  So we can
		 * do a direct comparison.
		 */
		if (now < tcp->tcp_time_wait_expire)
			break;

		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
		ASSERT(removed);

		connp = tcp->tcp_connp;
		ASSERT(connp->conn_fanout != NULL);
		lock = &connp->conn_fanout->connf_lock;
		/*
		 * This is essentially a TW reclaim fast path optimization for
		 * performance where the timewait collector checks under the
		 * fanout lock (so that no one else can get access to the
		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
		 * the classifier hash list.  If ref count is indeed 2, we can
		 * just remove the conn under the fanout lock and avoid
		 * cleaning up the conn under the squeue, provided that
		 * clustering callbacks are not enabled.  If clustering is
		 * enabled, we need to make the clustering callback before
		 * setting the CONDEMNED flag and after dropping all locks and
		 * so we forego this optimization and fall back to the slow
		 * path.  Also please see the comments in tcp_closei_local
		 * regarding the refcnt logic.
		 *
		 * Since we are holding the tcp_time_wait_lock, it's better
		 * not to block on the fanout_lock because other connections
		 * can't add themselves to time_wait list.  So we do a
		 * tryenter instead of mutex_enter.
		 */
		if (mutex_tryenter(lock)) {
			mutex_enter(&connp->conn_lock);
			if ((connp->conn_ref == 2) &&
			    (cl_inet_disconnect == NULL)) {
				ipcl_hash_remove_locked(connp,
				    connp->conn_fanout);
				/*
				 * Set the CONDEMNED flag now itself so that
				 * the refcnt cannot increase due to any
				 * walker.
				 */
				connp->conn_state_flags |= CONN_CONDEMNED;
				mutex_exit(lock);
				mutex_exit(&connp->conn_lock);
				if (tcp_time_wait->tcp_free_list_cnt <
				    tcp_free_list_max_cnt) {
					/* Add to head of tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_cleanup(tcp);
					ASSERT(connp->conn_latch == NULL);
					ASSERT(connp->conn_policy == NULL);
					ASSERT(tcp->tcp_tcps == NULL);
					ASSERT(connp->conn_netstack == NULL);

					mutex_enter(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp->tcp_time_wait_next =
					    tcp_time_wait->tcp_free_list;
					tcp_time_wait->tcp_free_list = tcp;
					tcp_time_wait->tcp_free_list_cnt++;
					continue;
				} else {
					/* Do not add to tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_bind_hash_remove(tcp);
					ixa_cleanup(tcp->tcp_connp->conn_ixa);
					tcp_ipsec_cleanup(tcp);
					CONN_DEC_REF(tcp->tcp_connp);
				}
			} else {
				CONN_INC_REF_LOCKED(connp);
				mutex_exit(lock);
				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
				mutex_exit(&connp->conn_lock);
				/*
				 * We can reuse the closemp here since conn has
				 * detached (otherwise we wouldn't even be in
				 * time_wait list).  tcp_closemp_used can safely
				 * be changed without taking a lock as no other
				 * thread can concurrently access it at this
				 * point in the connection lifecycle.
				 */

				if (tcp->tcp_closemp.b_prev == NULL)
					tcp->tcp_closemp_used = B_TRUE;
				else
					cmn_err(CE_PANIC,
					    "tcp_timewait_collector: "
					    "concurrent use of tcp_closemp: "
					    "connp %p tcp %p\n", (void *)connp,
					    (void *)tcp);

				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
				mp = &tcp->tcp_closemp;
				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
				    tcp_timewait_close, connp, NULL,
				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
			}
		} else {
			mutex_enter(&connp->conn_lock);
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
			mutex_exit(&connp->conn_lock);
			/*
			 * We can reuse the closemp here since conn has
			 * detached (otherwise we wouldn't even be in
			 * time_wait list).  tcp_closemp_used can safely
			 * be changed without taking a lock as no other
			 * thread can concurrently access it at this
			 * point in the connection lifecycle.
			 */

			if (tcp->tcp_closemp.b_prev == NULL)
				tcp->tcp_closemp_used = B_TRUE;
			else
				cmn_err(CE_PANIC, "tcp_timewait_collector: "
				    "concurrent use of tcp_closemp: "
				    "connp %p tcp %p\n", (void *)connp,
				    (void *)tcp);

			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
			mp = &tcp->tcp_closemp;
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
			    tcp_timewait_close, connp, NULL,
			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
		}
		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	}

	if (tcp_time_wait->tcp_free_list != NULL)
		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;

	/*
	 * If the time wait list is not empty and there is no timer running,
	 * restart it.
	 */
	if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
	    tcp_time_wait->tcp_time_wait_tid == 0) {
		hrtime_t firetime;

		/* shouldn't be necessary, but just in case */
		if (tcp->tcp_time_wait_expire < now)
			tcp->tcp_time_wait_expire = now;

		firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
		/* This ensures that we won't wake up too often. */
		firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
		tcp_time_wait->tcp_time_wait_tid =
		    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
		    sqp, firetime, CALLOUT_TCP_RESOLUTION,
		    CALLOUT_FLAG_ROUNDUP);
	}
#ifdef DEBUG
	tcp_time_wait->tcp_time_wait_running = B_FALSE;
#endif
	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}

/*
 * tcp_time_wait_processing() handles processing of incoming packets when
 * the tcp_t is in the TIME_WAIT state.
 *
 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
 * detached state) is never put on the time wait list.
 */
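/*
 * In summary: a RST terminates the tcp_t via tcp_clean_death(); a
 * duplicate FIN restarts the 2 MSL timer and is ACKed (RFC 793, page
 * 73); a SYN that lies beyond the receive window may relaunch the
 * connection after a fresh ISS is arranged for the new incarnation;
 * any other SYN is answered with RST|ACK while the tcp_t is preserved
 * (RFC 1122, 4.2.2.13); anything else that warrants it is simply ACKed.
 */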
void
tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
    uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
{
	int32_t		bytes_acked;
	int32_t		gap;
	int32_t		rgap;
	tcp_opt_t	tcpopt;
	uint_t		flags;
	uint32_t	new_swnd = 0;
	conn_t		*nconnp;
	conn_t		*connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	BUMP_LOCAL(tcp->tcp_ibsegs);
	DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);

	flags = (unsigned int)tcpha->tha_flags & 0xFF;
	new_swnd = ntohs(tcpha->tha_win) <<
	    ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);

	if (tcp->tcp_snd_ts_ok && !(tcpha->tha_flags & TH_RST)) {
		int options;
		if (tcp->tcp_snd_sack_ok)
			tcpopt.tcp = tcp;
		else
			tcpopt.tcp = NULL;
		options = tcp_parse_options(tcpha, &tcpopt);
		if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
			DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
			goto done;
		} else if (!tcp_paws_check(tcp, &tcpopt)) {
			tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
			    TH_ACK);
			goto done;
		}
	}
	gap = seg_seq - tcp->tcp_rnxt;
	rgap = tcp->tcp_rwnd - (gap + seg_len);
	if (gap < 0) {
		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
		    (seg_len > -gap ? -gap : seg_len));
		seg_len += gap;
		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
			if (flags & TH_RST) {
				goto done;
			}
			if ((flags & TH_FIN) && seg_len == -1) {
				/*
				 * When TCP receives a duplicate FIN in
				 * TIME_WAIT state, restart the 2 MSL timer.
				 * See page 73 in RFC 793.  Make sure this TCP
				 * is already on the TIME_WAIT list.  If not,
				 * just restart the timer.
				 */
				if (TCP_IS_DETACHED(tcp)) {
					if (tcp_time_wait_remove(tcp, NULL) ==
					    B_TRUE) {
						tcp_time_wait_append(tcp);
						TCP_DBGSTAT(tcps,
						    tcp_rput_time_wait);
					}
				} else {
					ASSERT(tcp != NULL);
					TCP_TIMER_RESTART(tcp,
					    tcps->tcps_time_wait_interval);
				}
				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_ACK);
				goto done;
			}
			flags |= TH_ACK_NEEDED;
			seg_len = 0;
			goto process_ack;
		}

		/* Fix seg_seq, and chew the gap off the front. */
		seg_seq = tcp->tcp_rnxt;
	}

	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
		/*
		 * Make sure that when we accept the connection, we pick
		 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
		 * old connection.
		 *
		 * The next ISS generated is equal to tcp_iss_incr_extra
		 * + tcp_iss_incr/2 + other components depending on the
		 * value of tcp_strong_iss.  We pre-calculate the new
		 * ISS here and compare with tcp_snxt to determine if
		 * we need to adjust tcp_iss_incr_extra.
		 *
		 * The above calculation is ugly and is a
		 * waste of CPU cycles...
		 */
		uint32_t new_iss = tcps->tcps_iss_incr_extra;
		int32_t adj;
		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;

		switch (tcps->tcps_strong_iss) {
		case 2: {
			/* Add time and MD5 components. */
			uint32_t answer[4];
			struct {
				uint32_t ports;
				in6_addr_t src;
				in6_addr_t dst;
			} arg;
			MD5_CTX context;

			mutex_enter(&tcps->tcps_iss_key_lock);
			context = tcps->tcps_iss_key;
			mutex_exit(&tcps->tcps_iss_key_lock);
			arg.ports = connp->conn_ports;
			/* We use MAPPED addresses in tcp_iss_init */
			arg.src = connp->conn_laddr_v6;
			arg.dst = connp->conn_faddr_v6;
			MD5Update(&context, (uchar_t *)&arg,
			    sizeof (arg));
			MD5Final((uchar_t *)answer, &context);
			answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
			new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
			break;
		}
		case 1:
			/* Add time component and min random (i.e. 1). */
			new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
			break;
		default:
			/* Add only time component. */
			new_iss += (uint32_t)gethrestime_sec() *
			    tcps->tcps_iss_incr;
			break;
		}
		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
			/*
			 * New ISS not guaranteed to be tcp_iss_incr/2
			 * ahead of the current tcp_snxt, so add the
			 * difference to tcp_iss_incr_extra.
			 */
			tcps->tcps_iss_incr_extra += adj;
		}
		/*
		 * If tcp_clean_death() cannot perform the task now,
		 * drop the SYN packet and let the other side re-xmit.
		 * Otherwise pass the SYN packet back in, since the
		 * old tcp state has been cleaned up or freed.
		 */
		if (tcp_clean_death(tcp, 0) == -1)
			goto done;
		nconnp = ipcl_classify(mp, ira, ipst);
		if (nconnp != NULL) {
			TCP_STAT(tcps, tcp_time_wait_syn_success);
			/* Drops ref on nconnp */
			tcp_reinput(nconnp, mp, ira, ipst);
			return;
		}
		goto done;
	}

	/*
	 * rgap is the amount of receive window remaining.  A negative
	 * value means the segment extends that many bytes past the
	 * window, and the excess must be trimmed.
	 */
	if (rgap < 0) {
		TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
		/* Fix seg_len and make sure there is something left. */
		seg_len += rgap;
		if (seg_len <= 0) {
			if (flags & TH_RST) {
				goto done;
			}
			flags |= TH_ACK_NEEDED;
			seg_len = 0;
			goto process_ack;
		}
	}
	/*
	 * Check whether we can update tcp_ts_recent.  This test is from RFC
	 * 7323, section 5.3.
	 */
	if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
	}

	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
		/* Always ack out of order packets */
		flags |= TH_ACK_NEEDED;
		seg_len = 0;
	} else if (seg_len > 0) {
		TCPS_BUMP_MIB(tcps, tcpInClosed);
		TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
	}
	if (flags & TH_RST) {
		(void) tcp_clean_death(tcp, 0);
		goto done;
	}
	if (flags & TH_SYN) {
		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
		    TH_RST|TH_ACK);
		/*
		 * Do not delete the TCP structure if it is in
		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
		 */
		goto done;
	}
process_ack:
	if (flags & TH_ACK) {
		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
		if (bytes_acked <= 0) {
			if (bytes_acked == 0 && seg_len == 0 &&
			    new_swnd == tcp->tcp_swnd)
				TCPS_BUMP_MIB(tcps, tcpInDupAck);
		} else {
			/* Acks something not sent */
			flags |= TH_ACK_NEEDED;
		}
	}
	if (flags & TH_ACK_NEEDED) {
		/*
		 * Time to send an ack for some reason.
		 */
		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
		    tcp->tcp_rnxt, TH_ACK);
	}
done:
	freemsg(mp);
}