/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2016 Joyent, Inc.
 */

/*
 * This file contains functions related to TCP time wait processing.  Also
 * refer to the time wait handling comments in tcp_impl.h.
 */

#include <sys/types.h>
#include <sys/strsun.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/callo.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>

static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);

#define TW_BUCKET(t)                                                    \
        (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)

#define TW_BUCKET_NEXT(b)       (((b) + 1) % TCP_TIME_WAIT_BUCKETS)
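
/*
 * Illustrative sketch of the timing-wheel arithmetic above (no particular
 * values of TCP_TIME_WAIT_DELAY or TCP_TIME_WAIT_BUCKETS, which are defined
 * in tcp_impl.h, are assumed here):
 *
 *      expire = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
 *      bucket = TW_BUCKET(expire);        bucket holding this expiration
 *      next   = TW_BUCKET_NEXT(bucket);   bucket processed one DELAY later
 *
 * Each bucket covers TCP_TIME_WAIT_DELAY milliseconds worth of expirations
 * and the wheel wraps after TCP_TIME_WAIT_BUCKETS buckets, so the bucket
 * count must be large enough that newly appended connections cannot lap
 * entries that are still waiting to expire.
 */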

/*
 * Remove a connection from the list of detached TIME_WAIT connections.
 * Returns B_FALSE if the connection is not on the list (for example because
 * an earlier call to tcp_time_wait_remove() already removed it); otherwise
 * it removes the connection and returns B_TRUE.
 */
boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
        boolean_t locked = B_FALSE;

        if (tsp == NULL) {
                tsp = *((tcp_squeue_priv_t **)
                    squeue_getprivate(tcp->tcp_connp->conn_sqp,
                    SQPRIVATE_TCP));
                mutex_enter(&tsp->tcp_time_wait_lock);
                locked = B_TRUE;
        } else {
                ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
        }

        /* 0 means that the tcp_t has not been added to the time wait list. */
        if (tcp->tcp_time_wait_expire == 0) {
                ASSERT(tcp->tcp_time_wait_next == NULL);
                ASSERT(tcp->tcp_time_wait_prev == NULL);
                if (locked)
                        mutex_exit(&tsp->tcp_time_wait_lock);
                return (B_FALSE);
        }
        ASSERT(TCP_IS_DETACHED(tcp));
        ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
        ASSERT(tsp->tcp_time_wait_cnt > 0);

        if (tcp->tcp_time_wait_next != NULL) {
                tcp->tcp_time_wait_next->tcp_time_wait_prev =
                    tcp->tcp_time_wait_prev;
        }
        if (tcp->tcp_time_wait_prev != NULL) {
                tcp->tcp_time_wait_prev->tcp_time_wait_next =
                    tcp->tcp_time_wait_next;
        } else {
                unsigned int bucket;

                bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
                ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
                tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
        }
        tcp->tcp_time_wait_next = NULL;
        tcp->tcp_time_wait_prev = NULL;
        tcp->tcp_time_wait_expire = 0;
        tsp->tcp_time_wait_cnt--;

        if (locked)
                mutex_exit(&tsp->tcp_time_wait_lock);
        return (B_TRUE);
}

/* Constants used for fast checking of a localhost address */
#if defined(_BIG_ENDIAN)
#define IPv4_LOCALHOST  0x7f000000U
#define IPv4_LH_MASK    0xffffff00U
#else
#define IPv4_LOCALHOST  0x0000007fU
#define IPv4_LH_MASK    0x00ffffffU
#endif

#define IS_LOCAL_HOST(x)        ( \
        ((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
        ((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
        ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
        IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
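
/*
 * IS_LOCAL_HOST() is a quick test that a connection's local address is a
 * loopback address: for IPv4 the mask/value pair above matches addresses of
 * the form 127.0.0.x (conn_laddr_v4 is kept in network byte order, hence the
 * endian-specific constants), and for IPv6 the address must be ::1.
 */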

/*
 * Add a connection to the list of detached TIME_WAIT connections
 * and set its time to expire.
 */
void
tcp_time_wait_append(tcp_t *tcp)
{
        tcp_stack_t *tcps = tcp->tcp_tcps;
        squeue_t *sqp = tcp->tcp_connp->conn_sqp;
        tcp_squeue_priv_t *tsp =
            *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
        int64_t now, schedule;
        unsigned int bucket;

        tcp_timers_stop(tcp);

        /* Freed above */
        ASSERT(tcp->tcp_timer_tid == 0);
        ASSERT(tcp->tcp_ack_tid == 0);

        /* must have happened at the time of detaching the tcp */
        ASSERT(TCP_IS_DETACHED(tcp));
        ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
        ASSERT(tcp->tcp_ptpahn == NULL);
        ASSERT(tcp->tcp_flow_stopped == 0);
        ASSERT(tcp->tcp_time_wait_next == NULL);
        ASSERT(tcp->tcp_time_wait_prev == NULL);
        ASSERT(tcp->tcp_time_wait_expire == 0);
        ASSERT(tcp->tcp_listener == NULL);

        TCP_DBGSTAT(tcps, tcp_time_wait);
        mutex_enter(&tsp->tcp_time_wait_lock);

        /*
         * Immediately expire loopback connections.  Since there is no worry
         * about packets on the local host showing up after a long network
         * delay, this is safe and allows much higher rates of connection
         * churn for applications operating locally.
         *
         * This typically bypasses the tcp_free_list fast path due to squeue
         * re-entry for the loopback close operation.
         */
        if (tcp->tcp_loopback) {
                tcp_time_wait_purge(tcp, tsp);
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }

        /*
         * In order to reap TIME_WAITs reliably, we should use a source of
         * time that is not adjustable by the user.  While it would be more
         * accurate to grab this timestamp before (potentially) sleeping on
         * the tcp_time_wait_lock, doing so complicates bucket addressing
         * later.
         */
        now = ddi_get_lbolt64();

        /*
         * Each squeue uses an arbitrary time offset when scheduling
         * expiration timers.  This prevents the bucketing from forcing
         * tcp_time_wait_collector to run in lockstep across the squeues.
         *
         * This offset is (re)initialized when a new TIME_WAIT connection is
         * added to an squeue which has no connections waiting to expire.
         */
        if (tsp->tcp_time_wait_tid == 0) {
                ASSERT(tsp->tcp_time_wait_cnt == 0);
                tsp->tcp_time_wait_offset =
                    now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
        }
        now -= tsp->tcp_time_wait_offset;

        /*
         * Use the netstack-defined timeout, rounded up to the minimum
         * time_wait_collector interval.
         */
        schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
        tcp->tcp_time_wait_expire = schedule;

        /*
         * Append the connection into the appropriate bucket.
         */
        bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
        tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
        tsp->tcp_time_wait_bucket[bucket] = tcp;
        if (tcp->tcp_time_wait_next != NULL) {
                ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
                tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
        }
        tsp->tcp_time_wait_cnt++;

        /*
         * Round delay up to the nearest bucket boundary.
         */
        schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
        schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);

        /*
         * The newly inserted entry may require a tighter schedule for the
         * expiration timer.
         */
        if (schedule < tsp->tcp_time_wait_schedule) {
                callout_id_t old_tid = tsp->tcp_time_wait_tid;

                tsp->tcp_time_wait_schedule = schedule;
                tsp->tcp_time_wait_tid =
                    timeout_generic(CALLOUT_NORMAL,
                    tcp_time_wait_collector, sqp,
                    TICK_TO_NSEC(schedule - now),
                    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);

                /*
                 * It is possible for the timer to fire before the untimeout
                 * action is able to complete.  In that case, the exclusion
                 * offered by the tcp_time_wait_collector_active flag will
                 * prevent multiple collector threads from processing records
                 * simultaneously from the same squeue.
                 */
                mutex_exit(&tsp->tcp_time_wait_lock);
                (void) untimeout_default(old_tid, 0);
                return;
        }

        /*
         * Start a fresh timer if none exists.
         */
        if (tsp->tcp_time_wait_schedule == 0) {
                ASSERT(tsp->tcp_time_wait_tid == 0);

                tsp->tcp_time_wait_schedule = schedule;
                tsp->tcp_time_wait_tid =
                    timeout_generic(CALLOUT_NORMAL,
                    tcp_time_wait_collector, sqp,
                    TICK_TO_NSEC(schedule - now),
                    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
        }
        mutex_exit(&tsp->tcp_time_wait_lock);
}
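
/*
 * Rough lifecycle of a detached TIME_WAIT connection as handled by this
 * file (the broader discussion lives with the time wait comments in
 * tcp_impl.h):
 *
 *      tcp_time_wait_append()    - bucket the conn and (re)arm the
 *                                  per-squeue expiration timer
 *      tcp_time_wait_collector() - timer callback; walks the expired bucket
 *        tcp_time_wait_remove()  - unlink a conn from its bucket
 *        tcp_time_wait_purge()   - reclaim it, either via the fanout-lock
 *                                  fast path or by queueing
 *                                  tcp_timewait_close() on the squeue
 *
 * Purged tcp_t structures may be parked on the per-squeue tcp_free_list;
 * their final reference is dropped on a later collector pass.
 */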

/*
 * Wrapper to call tcp_close_detached() via squeue to clean up a TIME_WAIT
 * tcp_t.  Used in tcp_time_wait_collector().
 */
/* ARGSUSED */
static void
tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
        conn_t *connp = (conn_t *)arg;
        tcp_t *tcp = connp->conn_tcp;

        ASSERT(tcp != NULL);
        if (tcp->tcp_state == TCPS_CLOSED) {
                return;
        }

        ASSERT((connp->conn_family == AF_INET &&
            connp->conn_ipversion == IPV4_VERSION) ||
            (connp->conn_family == AF_INET6 &&
            (connp->conn_ipversion == IPV4_VERSION ||
            connp->conn_ipversion == IPV6_VERSION)));
        ASSERT(!tcp->tcp_listener);

        ASSERT(TCP_IS_DETACHED(tcp));

        /*
         * Because they have no upstream client to rebind or tcp_close()
         * them later, we axe the connection here and now.
         */
        tcp_close_detached(tcp);
}

static void
tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
        mblk_t *mp;
        conn_t *connp = tcp->tcp_connp;
        kmutex_t *lock;

        ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
        ASSERT(connp->conn_fanout != NULL);

        lock = &connp->conn_fanout->connf_lock;

        /*
         * This is essentially a TIME_WAIT reclaim fast path: the connection
         * is checked under the fanout lock (so that no one else can get
         * access to the conn_t) to verify that the refcnt is 2 (one each for
         * TCP and the classifier hash list).  If that is the case and
         * clustering callbacks are not enabled, the conn can be removed
         * under the fanout lock, avoiding clean-up under the squeue.
         *
         * This optimization is forgone when clustering is enabled, since the
         * clustering callback must be made before setting the CONDEMNED flag
         * and after dropping all locks.
         *
         * See the comments in tcp_closei_local for additional information
         * regarding the refcnt logic.
         */
        if (mutex_tryenter(lock)) {
                mutex_enter(&connp->conn_lock);
                if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
                        ipcl_hash_remove_locked(connp, connp->conn_fanout);
                        /*
                         * Set the CONDEMNED flag now itself so that the
                         * refcnt cannot increase due to any walker.
                         */
                        connp->conn_state_flags |= CONN_CONDEMNED;
                        mutex_exit(&connp->conn_lock);
                        mutex_exit(lock);
                        if (tsp->tcp_free_list_cnt <
                            tcp_free_list_max_cnt) {
                                /*
                                 * Add to head of tcp_free_list
                                 */
                                tcp_cleanup(tcp);
                                ASSERT(connp->conn_latch == NULL);
                                ASSERT(connp->conn_policy == NULL);
                                ASSERT(tcp->tcp_tcps == NULL);
                                ASSERT(connp->conn_netstack == NULL);

                                tcp->tcp_time_wait_next =
                                    tsp->tcp_free_list;
                                tcp->tcp_in_free_list = B_TRUE;
                                tsp->tcp_free_list = tcp;
                                tsp->tcp_free_list_cnt++;
                        } else {
                                /*
                                 * Do not add to tcp_free_list
                                 */
                                tcp_bind_hash_remove(tcp);
                                ixa_cleanup(tcp->tcp_connp->conn_ixa);
                                tcp_ipsec_cleanup(tcp);
                                CONN_DEC_REF(tcp->tcp_connp);
                        }

                        /*
                         * With the fast-path complete, we can bail.
                         */
                        return;
                } else {
                        /*
                         * Fall back to slow path.
                         */
                        CONN_INC_REF_LOCKED(connp);
                        mutex_exit(&connp->conn_lock);
                        mutex_exit(lock);
                }
        } else {
                CONN_INC_REF(connp);
        }

        /*
         * We can reuse the closemp here since conn has detached (otherwise
         * we wouldn't even be on the time_wait list).  It is safe to change
         * tcp_closemp_used without taking a lock as no other thread can
         * concurrently access it at this point in the connection lifecycle.
         */
        if (tcp->tcp_closemp.b_prev == NULL) {
                tcp->tcp_closemp_used = B_TRUE;
        } else {
                cmn_err(CE_PANIC,
                    "tcp_timewait_collector: concurrent use of tcp_closemp: "
                    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
        }

        TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
        mp = &tcp->tcp_closemp;
        mutex_exit(&tsp->tcp_time_wait_lock);
        SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
            SQ_FILL, SQTAG_TCP_TIMEWAIT);
        mutex_enter(&tsp->tcp_time_wait_lock);
}
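
/*
 * Note on tcp_free_list: tcp_time_wait_purge() above parks fully cleaned-up
 * tcp_t structures on the per-squeue tsp->tcp_free_list (bounded by
 * tcp_free_list_max_cnt) so they can be reused cheaply rather than being
 * torn down outright.  Whatever is still on the list when
 * tcp_time_wait_collector() next runs is drained there and the final
 * connection reference is dropped.
 */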

/*
 * Purge any tcp_t instances associated with this squeue which have expired
 * from the TIME_WAIT state.
 */
void
tcp_time_wait_collector(void *arg)
{
        tcp_t *tcp;
        int64_t now, sched_active, sched_cur, sched_new;
        unsigned int idx;

        squeue_t *sqp = (squeue_t *)arg;
        tcp_squeue_priv_t *tsp =
            *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));

        mutex_enter(&tsp->tcp_time_wait_lock);

        /*
         * Because of timer scheduling complexity and the fact that the
         * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
         * possible for multiple tcp_time_wait_collector threads to run
         * against the same squeue.  This flag is used to exclude other
         * collectors from the squeue during execution.
         */
        if (tsp->tcp_time_wait_collector_active) {
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }
        tsp->tcp_time_wait_collector_active = B_TRUE;

        /*
         * After its assignment here, the value of sched_active must not be
         * altered as it is used to validate the state of the
         * tcp_time_wait_collector callout schedule for this squeue.
         *
         * The same does not hold true of sched_cur, which holds the
         * timestamp of the bucket undergoing processing.  While it is
         * initially equal to sched_active, certain conditions below can walk
         * it forward, triggering the retry loop.
         */
        sched_cur = sched_active = tsp->tcp_time_wait_schedule;

        /*
         * Purge the free list if necessary
         */
        if (tsp->tcp_free_list != NULL) {
                TCP_G_STAT(tcp_freelist_cleanup);
                while ((tcp = tsp->tcp_free_list) != NULL) {
                        tsp->tcp_free_list = tcp->tcp_time_wait_next;
                        tcp->tcp_time_wait_next = NULL;
                        tsp->tcp_free_list_cnt--;
                        ASSERT(tcp->tcp_tcps == NULL);
                        CONN_DEC_REF(tcp->tcp_connp);
                }
                ASSERT(tsp->tcp_free_list_cnt == 0);
        }

        /*
         * If there are no connections pending, clear timer-related state to
         * be reinitialized by the next caller.
         */
        if (tsp->tcp_time_wait_cnt == 0) {
                tsp->tcp_time_wait_offset = 0;
                tsp->tcp_time_wait_schedule = 0;
                tsp->tcp_time_wait_tid = 0;
                tsp->tcp_time_wait_collector_active = B_FALSE;
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }

retry:
        /*
         * Grab the bucket which we were scheduled to cleanse.
         */
        idx = TW_BUCKET(sched_cur - 1);
        now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
        tcp = tsp->tcp_time_wait_bucket[idx];

        while (tcp != NULL) {
                /*
                 * Since the bucket count is sized to prevent wrap-around
                 * during typical operation and timers are scheduled to
                 * process buckets with only expired connections, there is
                 * only one reason to encounter a connection expiring in the
                 * future: the tcp_time_wait_collector thread has been so
                 * delayed in its processing that connections have wrapped
                 * around the timing wheel into this bucket.
                 *
                 * In that case, the remaining entries in the bucket can be
                 * ignored since, being appended sequentially, they should
                 * all expire in the future.
                 */
                if (now < tcp->tcp_time_wait_expire) {
                        break;
                }

                /*
                 * Pull the connection out of the bucket.
                 */
                VERIFY(tcp_time_wait_remove(tcp, tsp));

                /*
                 * Purge the connection.
                 *
                 * While tcp_time_wait_lock will be temporarily dropped as
                 * part of the process, there is no risk of the timer being
                 * (re)scheduled while the collector is running since a value
                 * corresponding to the past is left in
                 * tcp_time_wait_schedule.
                 */
                tcp_time_wait_purge(tcp, tsp);

                /*
                 * Because tcp_time_wait_remove clears the tcp_time_wait_next
                 * field, the next item must be grabbed directly from the
                 * bucket itself.
                 */
                tcp = tsp->tcp_time_wait_bucket[idx];
        }

        if (tsp->tcp_time_wait_cnt == 0) {
                /*
                 * There is no need for the collector to schedule a new timer
                 * if no pending items remain.  The timer state can be
                 * cleared only if it was untouched while the collector
                 * dropped its locks during tcp_time_wait_purge.
                 */
                if (tsp->tcp_time_wait_schedule == sched_active) {
                        tsp->tcp_time_wait_offset = 0;
                        tsp->tcp_time_wait_schedule = 0;
                        tsp->tcp_time_wait_tid = 0;
                }
                tsp->tcp_time_wait_collector_active = B_FALSE;
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        } else {
                unsigned int nidx;

                /*
                 * Locate the next bucket containing entries.
                 */
                sched_new = sched_cur + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
                nidx = TW_BUCKET_NEXT(idx);
                while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
                        if (nidx == idx) {
                                break;
                        }
                        nidx = TW_BUCKET_NEXT(nidx);
                        sched_new += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
                }
                ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
        }

        /*
         * It is possible that the system is under such dire load that
         * between the timer scheduling and TIME_WAIT processing delay,
         * execution overran the interval allocated to this bucket.
         */
        now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
        if (sched_new <= now) {
                /*
                 * Attempt to right the situation by immediately performing a
                 * purge on the next bucket.  This loop will continue as
                 * needed until the schedule can be pushed out ahead of the
                 * clock.
                 */
                sched_cur = sched_new;
                DTRACE_PROBE3(tcp__time__wait__overrun,
                    tcp_squeue_priv_t *, tsp, int64_t, sched_new,
                    int64_t, now);
                goto retry;
        }

        /*
         * Another thread may have snuck in to reschedule the timer while
         * locks were dropped during tcp_time_wait_purge.  Defer to the
         * running timer if that is the case.
         */
        if (tsp->tcp_time_wait_schedule != sched_active) {
                tsp->tcp_time_wait_collector_active = B_FALSE;
                mutex_exit(&tsp->tcp_time_wait_lock);
                return;
        }

        /*
         * Schedule the next timer.
         */
        tsp->tcp_time_wait_schedule = sched_new;
        tsp->tcp_time_wait_tid =
            timeout_generic(CALLOUT_NORMAL,
            tcp_time_wait_collector, sqp,
            TICK_TO_NSEC(sched_new - now),
            CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
        tsp->tcp_time_wait_collector_active = B_FALSE;
        mutex_exit(&tsp->tcp_time_wait_lock);
}
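
/*
 * Summary of the cases handled below for an inbound segment arriving on a
 * TIME_WAIT connection (the details are in tcp_time_wait_processing()):
 *
 *      - a segment failing the PAWS timestamp check is ACKed and dropped;
 *      - a duplicate FIN restarts the 2MSL timer and is ACKed (RFC 793,
 *        page 73);
 *      - a SYN sequenced beyond the old receive window allows the connection
 *        pair to be reused: a fresh ISS is chosen safely ahead of the old
 *        send sequence space, the old tcp_t is reaped and the SYN is
 *        reclassified so it can be accepted anew;
 *      - any other SYN is answered with RST|ACK without deleting the
 *        TIME_WAIT state (RFC 1122, 4.2.2.13);
 *      - a RST that survives the sequence checks tears the connection down;
 *      - anything else that warrants it simply elicits an ACK.
 */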

/*
 * tcp_time_wait_processing() handles processing of incoming packets when
 * the tcp_t is in the TIME_WAIT state.
 *
 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
 * detached state) is never put on the time wait list.
 */
void
tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
    uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
{
        int32_t bytes_acked;
        int32_t gap;
        int32_t rgap;
        tcp_opt_t tcpopt;
        uint_t flags;
        uint32_t new_swnd = 0;
        conn_t *nconnp;
        conn_t *connp = tcp->tcp_connp;
        tcp_stack_t *tcps = tcp->tcp_tcps;

        BUMP_LOCAL(tcp->tcp_ibsegs);
        DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);

        flags = (unsigned int)tcpha->tha_flags & 0xFF;
        new_swnd = ntohs(tcpha->tha_win) <<
            ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);

        if (tcp->tcp_snd_ts_ok && !(tcpha->tha_flags & TH_RST)) {
                int options;

                if (tcp->tcp_snd_sack_ok)
                        tcpopt.tcp = tcp;
                else
                        tcpopt.tcp = NULL;
                options = tcp_parse_options(tcpha, &tcpopt);
                if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
                        DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
                        goto done;
                } else if (!tcp_paws_check(tcp, &tcpopt)) {
                        tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
                            TH_ACK);
                        goto done;
                }
        }
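
        /*
         * Segment acceptability arithmetic used below, in RFC 793 terms
         * (SEG.* and RCV.* are the usual RFC names; this is a descriptive
         * sketch of what the code computes):
         *
         *      gap  = SEG.SEQ - RCV.NXT                (tcp_rnxt)
         *      rgap = RCV.WND - (gap + SEG.LEN)        (tcp_rwnd)
         *
         * gap < 0 means the segment starts before the next expected byte,
         * so old or duplicate data is trimmed from the front; rgap < 0
         * means the segment extends -rgap bytes past the right edge of the
         * receive window and is trimmed from the back.
         */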
        gap = seg_seq - tcp->tcp_rnxt;
        rgap = tcp->tcp_rwnd - (gap + seg_len);
        if (gap < 0) {
                TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
                TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
                    (seg_len > -gap ? -gap : seg_len));
                seg_len += gap;
                if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
                        if (flags & TH_RST) {
                                goto done;
                        }
                        if ((flags & TH_FIN) && seg_len == -1) {
                                /*
                                 * When TCP receives a duplicate FIN in
                                 * TIME_WAIT state, restart the 2 MSL timer.
                                 * See page 73 in RFC 793.  Make sure this
                                 * TCP is already on the TIME_WAIT list.  If
                                 * not, just restart the timer.
                                 */
                                if (TCP_IS_DETACHED(tcp)) {
                                        if (tcp_time_wait_remove(tcp, NULL) ==
                                            B_TRUE) {
                                                tcp_time_wait_append(tcp);
                                                TCP_DBGSTAT(tcps,
                                                    tcp_rput_time_wait);
                                        }
                                } else {
                                        ASSERT(tcp != NULL);
                                        TCP_TIMER_RESTART(tcp,
                                            tcps->tcps_time_wait_interval);
                                }
                                tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
                                    tcp->tcp_rnxt, TH_ACK);
                                goto done;
                        }
                        flags |= TH_ACK_NEEDED;
                        seg_len = 0;
                        goto process_ack;
                }

                /* Fix seg_seq, and chew the gap off the front. */
                seg_seq = tcp->tcp_rnxt;
        }

        if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
                /*
                 * Make sure that when we accept the connection, we pick
                 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
                 * old connection.
                 *
                 * The next ISS generated is equal to tcp_iss_incr_extra
                 * + tcp_iss_incr/2 + other components depending on the
                 * value of tcp_strong_iss.  We pre-calculate the new
                 * ISS here and compare with tcp_snxt to determine if
                 * we need to make an adjustment to tcp_iss_incr_extra.
                 *
                 * The above calculation is ugly and is a
                 * waste of CPU cycles...
                 */
                uint32_t new_iss = tcps->tcps_iss_incr_extra;
                int32_t adj;
                ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;

                switch (tcps->tcps_strong_iss) {
                case 2: {
                        /* Add time and MD5 components. */
                        uint32_t answer[4];
                        struct {
                                uint32_t ports;
                                in6_addr_t src;
                                in6_addr_t dst;
                        } arg;
                        MD5_CTX context;

                        mutex_enter(&tcps->tcps_iss_key_lock);
                        context = tcps->tcps_iss_key;
                        mutex_exit(&tcps->tcps_iss_key_lock);
                        arg.ports = connp->conn_ports;
                        /* We use MAPPED addresses in tcp_iss_init */
                        arg.src = connp->conn_laddr_v6;
                        arg.dst = connp->conn_faddr_v6;
                        MD5Update(&context, (uchar_t *)&arg, sizeof (arg));
                        MD5Final((uchar_t *)answer, &context);
                        answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
                        new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
                        break;
                }
                case 1:
                        /* Add time component and min random (i.e. 1). */
                        new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
                        break;
                default:
                        /* Add only time component. */
                        new_iss += (uint32_t)gethrestime_sec() *
                            tcps->tcps_iss_incr;
                        break;
                }
                if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
                        /*
                         * New ISS not guaranteed to be tcp_iss_incr/2
                         * ahead of the current tcp_snxt, so add the
                         * difference to tcp_iss_incr_extra.
                         */
                        tcps->tcps_iss_incr_extra += adj;
                }
                /*
                 * If tcp_clean_death() can not perform the task now,
                 * drop the SYN packet and let the other side re-xmit.
                 * Otherwise pass the SYN packet back in, since the
                 * old tcp state has been cleaned up or freed.
                 */
                if (tcp_clean_death(tcp, 0) == -1)
                        goto done;
                nconnp = ipcl_classify(mp, ira, ipst);
                if (nconnp != NULL) {
                        TCP_STAT(tcps, tcp_time_wait_syn_success);
                        /* Drops ref on nconnp */
                        tcp_reinput(nconnp, mp, ira, ipst);
                        return;
                }
                goto done;
        }

        /*
         * rgap is the amount of receive window remaining past the end of
         * the segment.  A negative value means the segment extends that
         * many bytes beyond the window and must be trimmed.
         */
        if (rgap < 0) {
                TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
                TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
                /* Fix seg_len and make sure there is something left. */
                seg_len += rgap;
                if (seg_len <= 0) {
                        if (flags & TH_RST) {
                                goto done;
                        }
                        flags |= TH_ACK_NEEDED;
                        seg_len = 0;
                        goto process_ack;
                }
        }
        /*
         * Check whether we can update tcp_ts_recent.  This test is from RFC
         * 7323, section 5.3.
         */
        if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
            TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
            SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
                tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
                tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
        }

        if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
                /* Always ack out of order packets */
                flags |= TH_ACK_NEEDED;
                seg_len = 0;
        } else if (seg_len > 0) {
                TCPS_BUMP_MIB(tcps, tcpInClosed);
                TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
                TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
        }
        if (flags & TH_RST) {
                (void) tcp_clean_death(tcp, 0);
                goto done;
        }
        if (flags & TH_SYN) {
                tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
                    TH_RST|TH_ACK);
                /*
                 * Do not delete the TCP structure if it is in
                 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
                 */
                goto done;
        }
process_ack:
        if (flags & TH_ACK) {
                bytes_acked = (int)(seg_ack - tcp->tcp_suna);
                if (bytes_acked <= 0) {
                        if (bytes_acked == 0 && seg_len == 0 &&
                            new_swnd == tcp->tcp_swnd)
                                TCPS_BUMP_MIB(tcps, tcpInDupAck);
                } else {
                        /* Acks something not sent */
                        flags |= TH_ACK_NEEDED;
                }
        }
        if (flags & TH_ACK_NEEDED) {
                /*
                 * Time to send an ack for some reason.
                 */
                tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
                    tcp->tcp_rnxt, TH_ACK);
        }
done:
        freemsg(mp);
}