1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This file contains all TCP input processing functions. */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/suntpi.h> 37 #include <sys/xti_inet.h> 38 #include <sys/squeue_impl.h> 39 #include <sys/squeue.h> 40 #include <sys/tsol/tnet.h> 41 42 #include <inet/common.h> 43 #include <inet/ip.h> 44 #include <inet/tcp.h> 45 #include <inet/tcp_impl.h> 46 #include <inet/tcp_cluster.h> 47 #include <inet/proto_set.h> 48 #include <inet/ipsec_impl.h> 49 50 /* 51 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 52 */ 53 54 #ifdef _BIG_ENDIAN 55 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 56 (TCPOPT_TSTAMP << 8) | 10) 57 #else 58 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 59 (TCPOPT_NOP << 8) | TCPOPT_NOP) 60 #endif 61 62 /* 63 * Flags returned from tcp_parse_options. 64 */ 65 #define TCP_OPT_MSS_PRESENT 1 66 #define TCP_OPT_WSCALE_PRESENT 2 67 #define TCP_OPT_TSTAMP_PRESENT 4 68 #define TCP_OPT_SACK_OK_PRESENT 8 69 #define TCP_OPT_SACK_PRESENT 16 70 71 /* 72 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days 73 */ 74 #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) 75 76 /* 77 * Since tcp_listener is not cleared atomically with tcp_detached 78 * being cleared we need this extra bit to tell a detached connection 79 * apart from one that is in the process of being accepted. 80 */ 81 #define TCP_IS_DETACHED_NONEAGER(tcp) \ 82 (TCP_IS_DETACHED(tcp) && \ 83 (!(tcp)->tcp_hard_binding)) 84 85 /* 86 * Steps to do when a tcp_t moves to TIME-WAIT state. 87 * 88 * This connection is done, we don't need to account for it. Decrement 89 * the listener connection counter if needed. 90 * 91 * Decrement the connection counter of the stack. Note that this counter 92 * is per CPU. So the total number of connections in a stack is the sum of all 93 * of them. Since there is no lock for handling all of them exclusively, the 94 * resulting sum is only an approximation. 95 * 96 * Unconditionally clear the exclusive binding bit so this TIME-WAIT 97 * connection won't interfere with new ones. 98 * 99 * Start the TIME-WAIT timer. If upper layer has not closed the connection, 100 * the timer is handled within the context of this tcp_t. When the timer 101 * fires, tcp_clean_death() is called. 
 * If upper layer closes the connection during this period,
 * tcp_time_wait_append() will be called to add this tcp_t to the global
 * TIME-WAIT list. Note that this means that the actual wait time in
 * TIME-WAIT state will be longer than the tcps_time_wait_interval since
 * the period before upper layer closes the connection is not accounted
 * for when tcp_time_wait_append() is called.
 *
 * If upper layer has closed the connection, call tcp_time_wait_append()
 * directly.
 */
#define	SET_TIME_WAIT(tcps, tcp, connp)					\
{									\
	(tcp)->tcp_state = TCPS_TIME_WAIT;				\
	if ((tcp)->tcp_listen_cnt != NULL)				\
		TCP_DECR_LISTEN_CNT(tcp);				\
	atomic_dec_64(							\
	    (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt); \
	(connp)->conn_exclbind = 0;					\
	if (!TCP_IS_DETACHED(tcp)) {					\
		TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \
	} else {							\
		tcp_time_wait_append(tcp);				\
		TCP_DBGSTAT(tcps, tcp_rput_time_wait);			\
	}								\
}

/*
 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
 * data, TCP will not respond with an ACK. RFC 793 requires that
 * TCP responds with an ACK for such a bogus ACK. By not following
 * the RFC, we prevent TCP from getting into an ACK storm if somehow
 * an attacker successfully spoofs an acceptable segment to our
 * peer; or when our peer is "confused."
 */
static uint32_t tcp_drop_ack_unsent_cnt = 10;

/*
 * The shift factor applied to tcp_mss to decide if the peer sends us a
 * valid initial receive window. By default, if the peer receive window
 * is smaller than 1 MSS (shift factor is 0), it is considered as invalid.
 */
static uint32_t tcp_init_wnd_shft = 0;

/* Process ICMP source quench message or not. */
static boolean_t tcp_icmp_source_quench = B_FALSE;

static boolean_t tcp_outbound_squeue_switch = B_FALSE;

static mblk_t	*tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
		    ip_recv_attr_t *);
static mblk_t	*tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
		    ip_recv_attr_t *);
static boolean_t	tcp_drop_q0(tcp_t *);
static void	tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
static mblk_t	*tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
		    ip_recv_attr_t *);
static void	tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
static int	tcp_parse_options(tcpha_t *, tcp_opt_t *);
static void	tcp_process_options(tcp_t *, tcpha_t *);
static mblk_t	*tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void	tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void	tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void	tcp_set_rto(tcp_t *, time_t);
static void	tcp_setcred_data(mblk_t *, ip_recv_attr_t *);

/*
 * Set the MSS associated with a particular tcp based on its current value,
 * and a new one passed in. Observe minimums and maximums, and reset other
 * state variables that we want to view as multiples of MSS.
 *
 * The value of MSS could be either increased or decreased.
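 *
 * Illustration (not part of the original comment): the cwnd rescaling done
 * below preserves the segment count rather than the byte count. Assuming a
 * purely hypothetical connection with tcp_cwnd = 5840 and an old tcp_mss of
 * 1460 (4 segments), lowering the MSS to 1380 gives
 *
 *	tcp_cwnd = (5840 / 1460) * 1380 = 4 * 1380 = 5520
 *
 * so the connection keeps its 4-segment window across the MSS change.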
174 */ 175 void 176 tcp_mss_set(tcp_t *tcp, uint32_t mss) 177 { 178 uint32_t mss_max; 179 tcp_stack_t *tcps = tcp->tcp_tcps; 180 conn_t *connp = tcp->tcp_connp; 181 182 if (connp->conn_ipversion == IPV4_VERSION) 183 mss_max = tcps->tcps_mss_max_ipv4; 184 else 185 mss_max = tcps->tcps_mss_max_ipv6; 186 187 if (mss < tcps->tcps_mss_min) 188 mss = tcps->tcps_mss_min; 189 if (mss > mss_max) 190 mss = mss_max; 191 /* 192 * Unless naglim has been set by our client to 193 * a non-mss value, force naglim to track mss. 194 * This can help to aggregate small writes. 195 */ 196 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 197 tcp->tcp_naglim = mss; 198 /* 199 * TCP should be able to buffer at least 4 MSS data for obvious 200 * performance reason. 201 */ 202 if ((mss << 2) > connp->conn_sndbuf) 203 connp->conn_sndbuf = mss << 2; 204 205 /* 206 * Set the send lowater to at least twice of MSS. 207 */ 208 if ((mss << 1) > connp->conn_sndlowat) 209 connp->conn_sndlowat = mss << 1; 210 211 /* 212 * Update tcp_cwnd according to the new value of MSS. Keep the 213 * previous ratio to preserve the transmit rate. 214 */ 215 tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; 216 tcp->tcp_cwnd_cnt = 0; 217 218 tcp->tcp_mss = mss; 219 (void) tcp_maxpsz_set(tcp, B_TRUE); 220 } 221 222 /* 223 * Extract option values from a tcp header. We put any found values into the 224 * tcpopt struct and return a bitmask saying which options were found. 225 */ 226 static int 227 tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) 228 { 229 uchar_t *endp; 230 int len; 231 uint32_t mss; 232 uchar_t *up = (uchar_t *)tcpha; 233 int found = 0; 234 int32_t sack_len; 235 tcp_seq sack_begin, sack_end; 236 tcp_t *tcp; 237 238 endp = up + TCP_HDR_LENGTH(tcpha); 239 up += TCP_MIN_HEADER_LENGTH; 240 while (up < endp) { 241 len = endp - up; 242 switch (*up) { 243 case TCPOPT_EOL: 244 break; 245 246 case TCPOPT_NOP: 247 up++; 248 continue; 249 250 case TCPOPT_MAXSEG: 251 if (len < TCPOPT_MAXSEG_LEN || 252 up[1] != TCPOPT_MAXSEG_LEN) 253 break; 254 255 mss = BE16_TO_U16(up+2); 256 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 257 tcpopt->tcp_opt_mss = mss; 258 found |= TCP_OPT_MSS_PRESENT; 259 260 up += TCPOPT_MAXSEG_LEN; 261 continue; 262 263 case TCPOPT_WSCALE: 264 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 265 break; 266 267 if (up[2] > TCP_MAX_WINSHIFT) 268 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 269 else 270 tcpopt->tcp_opt_wscale = up[2]; 271 found |= TCP_OPT_WSCALE_PRESENT; 272 273 up += TCPOPT_WS_LEN; 274 continue; 275 276 case TCPOPT_SACK_PERMITTED: 277 if (len < TCPOPT_SACK_OK_LEN || 278 up[1] != TCPOPT_SACK_OK_LEN) 279 break; 280 found |= TCP_OPT_SACK_OK_PRESENT; 281 up += TCPOPT_SACK_OK_LEN; 282 continue; 283 284 case TCPOPT_SACK: 285 if (len <= 2 || up[1] <= 2 || len < up[1]) 286 break; 287 288 /* If TCP is not interested in SACK blks... */ 289 if ((tcp = tcpopt->tcp) == NULL) { 290 up += up[1]; 291 continue; 292 } 293 sack_len = up[1] - TCPOPT_HEADER_LEN; 294 up += TCPOPT_HEADER_LEN; 295 296 /* 297 * If the list is empty, allocate one and assume 298 * nothing is sack'ed. 299 */ 300 if (tcp->tcp_notsack_list == NULL) { 301 tcp_notsack_update(&(tcp->tcp_notsack_list), 302 tcp->tcp_suna, tcp->tcp_snxt, 303 &(tcp->tcp_num_notsack_blk), 304 &(tcp->tcp_cnt_notsack_list)); 305 306 /* 307 * Make sure tcp_notsack_list is not NULL. 308 * This happens when kmem_alloc(KM_NOSLEEP) 309 * returns NULL. 
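			 *
			 * Illustration (not part of the original comment):
			 * on the wire a SACK option carrying n blocks is
			 *
			 *	kind=5, len=2+8*n, followed by n (begin, end)
			 *	pairs of 32-bit sequence numbers
			 *
			 * so the parsing loop below consumes 8 bytes per
			 * block and only keeps a block satisfying
			 * tcp_suna <= begin < end <= tcp_snxt.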
310 */ 311 if (tcp->tcp_notsack_list == NULL) { 312 up += sack_len; 313 continue; 314 } 315 tcp->tcp_fack = tcp->tcp_suna; 316 } 317 318 while (sack_len > 0) { 319 if (up + 8 > endp) { 320 up = endp; 321 break; 322 } 323 sack_begin = BE32_TO_U32(up); 324 up += 4; 325 sack_end = BE32_TO_U32(up); 326 up += 4; 327 sack_len -= 8; 328 /* 329 * Bounds checking. Make sure the SACK 330 * info is within tcp_suna and tcp_snxt. 331 * If this SACK blk is out of bound, ignore 332 * it but continue to parse the following 333 * blks. 334 */ 335 if (SEQ_LEQ(sack_end, sack_begin) || 336 SEQ_LT(sack_begin, tcp->tcp_suna) || 337 SEQ_GT(sack_end, tcp->tcp_snxt)) { 338 continue; 339 } 340 tcp_notsack_insert(&(tcp->tcp_notsack_list), 341 sack_begin, sack_end, 342 &(tcp->tcp_num_notsack_blk), 343 &(tcp->tcp_cnt_notsack_list)); 344 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 345 tcp->tcp_fack = sack_end; 346 } 347 } 348 found |= TCP_OPT_SACK_PRESENT; 349 continue; 350 351 case TCPOPT_TSTAMP: 352 if (len < TCPOPT_TSTAMP_LEN || 353 up[1] != TCPOPT_TSTAMP_LEN) 354 break; 355 356 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 357 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 358 359 found |= TCP_OPT_TSTAMP_PRESENT; 360 361 up += TCPOPT_TSTAMP_LEN; 362 continue; 363 364 default: 365 if (len <= 1 || len < (int)up[1] || up[1] == 0) 366 break; 367 up += up[1]; 368 continue; 369 } 370 break; 371 } 372 return (found); 373 } 374 375 /* 376 * Process all TCP option in SYN segment. Note that this function should 377 * be called after tcp_set_destination() is called so that the necessary info 378 * from IRE is already set in the tcp structure. 379 * 380 * This function sets up the correct tcp_mss value according to the 381 * MSS option value and our header size. It also sets up the window scale 382 * and timestamp values, and initialize SACK info blocks. But it does not 383 * change receive window size after setting the tcp_mss value. The caller 384 * should do the appropriate change. 385 */ 386 static void 387 tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) 388 { 389 int options; 390 tcp_opt_t tcpopt; 391 uint32_t mss_max; 392 char *tmp_tcph; 393 tcp_stack_t *tcps = tcp->tcp_tcps; 394 conn_t *connp = tcp->tcp_connp; 395 396 tcpopt.tcp = NULL; 397 options = tcp_parse_options(tcpha, &tcpopt); 398 399 /* 400 * Process MSS option. Note that MSS option value does not account 401 * for IP or TCP options. This means that it is equal to MTU - minimum 402 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 403 * IPv6. 404 */ 405 if (!(options & TCP_OPT_MSS_PRESENT)) { 406 if (connp->conn_ipversion == IPV4_VERSION) 407 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; 408 else 409 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; 410 } else { 411 if (connp->conn_ipversion == IPV4_VERSION) 412 mss_max = tcps->tcps_mss_max_ipv4; 413 else 414 mss_max = tcps->tcps_mss_max_ipv6; 415 if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) 416 tcpopt.tcp_opt_mss = tcps->tcps_mss_min; 417 else if (tcpopt.tcp_opt_mss > mss_max) 418 tcpopt.tcp_opt_mss = mss_max; 419 } 420 421 /* Process Window Scale option. */ 422 if (options & TCP_OPT_WSCALE_PRESENT) { 423 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 424 tcp->tcp_snd_ws_ok = B_TRUE; 425 } else { 426 tcp->tcp_snd_ws = B_FALSE; 427 tcp->tcp_snd_ws_ok = B_FALSE; 428 tcp->tcp_rcv_ws = B_FALSE; 429 } 430 431 /* Process Timestamp option. 
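	 *
	 * Illustration (not part of the original comment): when accepted,
	 * the timestamp option is written into the template header below in
	 * its RFC 1323 "nicely aligned" form, TCPOPT_REAL_TS_LEN (12) bytes
	 * in total:
	 *
	 *	NOP(1) NOP(1) kind=8 len=10 TSval(4) TSecr(4)
	 *
	 * which is exactly the layout that TCPOPT_NOP_NOP_TSTAMP matches in
	 * the fast path of tcp_paws_check().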
	 */
	if ((options & TCP_OPT_TSTAMP_PRESENT) &&
	    (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) {
		tmp_tcph = (char *)tcp->tcp_tcpha;

		tcp->tcp_snd_ts_ok = B_TRUE;
		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
		ASSERT(OK_32PTR(tmp_tcph));
		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);

		/* Fill in our template header with basic timestamp option. */
		tmp_tcph += connp->conn_ht_ulp_len;
		tmp_tcph[0] = TCPOPT_NOP;
		tmp_tcph[1] = TCPOPT_NOP;
		tmp_tcph[2] = TCPOPT_TSTAMP;
		tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
		connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN;
		connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4);
	} else {
		tcp->tcp_snd_ts_ok = B_FALSE;
	}

	/*
	 * Process SACK options. If SACK is enabled for this connection,
	 * then allocate the SACK info structure. Note the following ways
	 * in which tcp_snd_sack_ok can be set to true:
	 *
	 * For active connection: in tcp_set_destination() called in
	 * tcp_connect().
	 *
	 * For passive connection: in tcp_set_destination() called in
	 * tcp_input_listener().
	 *
	 * That's the reason why the extra TCP_IS_DETACHED() check is there.
	 * That check makes sure that if we did not send a SACK OK option,
	 * we will not enable SACK for this connection even though the other
	 * side sends us the SACK OK option. For active connection, the SACK
	 * info structure has already been allocated. So we need to free
	 * it if SACK is disabled.
	 */
	if ((options & TCP_OPT_SACK_OK_PRESENT) &&
	    (tcp->tcp_snd_sack_ok ||
	    (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) {
		ASSERT(tcp->tcp_num_sack_blk == 0);
		ASSERT(tcp->tcp_notsack_list == NULL);

		tcp->tcp_snd_sack_ok = B_TRUE;
		if (tcp->tcp_snd_ts_ok) {
			tcp->tcp_max_sack_blk = 3;
		} else {
			tcp->tcp_max_sack_blk = 4;
		}
	} else if (tcp->tcp_snd_sack_ok) {
		/*
		 * Resetting tcp_snd_sack_ok to B_FALSE so that
		 * no SACK info will be used for this
		 * connection. This assumes that SACK usage
		 * permission is negotiated. This may need
		 * to be changed once this is clarified.
		 */
		ASSERT(tcp->tcp_num_sack_blk == 0);
		ASSERT(tcp->tcp_notsack_list == NULL);
		tcp->tcp_snd_sack_ok = B_FALSE;
	}

	/*
	 * Now we know the exact TCP/IP header length, subtract
	 * that from tcp_mss to get our side's MSS.
	 */
	tcp->tcp_mss -= connp->conn_ht_iphc_len;

	/*
	 * Here we assume that the other side's header size will be equal to
	 * our header size. We calculate the real MSS accordingly. Need to
	 * take into account the additional overhead IPsec puts in.
	 *
	 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
	 */
	tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len +
	    tcp->tcp_ipsec_overhead -
	    ((connp->conn_ipversion == IPV4_VERSION ?
	    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);

	/*
	 * Set MSS to the smaller one of both ends of the connection.
	 * We should not have called tcp_mss_set() before, but our
	 * side of the MSS should have been set to a proper value
	 * by tcp_set_destination(). tcp_mss_set() will also set up the
	 * STREAM head parameters properly.
	 *
	 * If we have a larger-than-16-bit window but the other side
	 * didn't want to do window scale, tcp_rwnd_set() will take
	 * care of that.
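	 *
	 * Worked example (illustration only, assuming IPv4, a negotiated
	 * timestamp option and no IPsec overhead): conn_ht_iphc_len is
	 * 20 + 20 + 12 = 52 bytes and the minimum IPv4 + TCP header is 40
	 * bytes, so a peer-advertised MSS of 1460 becomes
	 * 1460 - (52 - 40) = 1448 before being compared with our own
	 * tcp_mss in the tcp_mss_set() call below.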
526 */ 527 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 528 529 /* 530 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been 531 * updated properly. 532 */ 533 TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); 534 } 535 536 /* 537 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 538 * is filled, return as much as we can. The message passed in may be 539 * multi-part, chained using b_cont. "start" is the starting sequence 540 * number for this piece. 541 */ 542 static mblk_t * 543 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 544 { 545 uint32_t end; 546 mblk_t *mp1; 547 mblk_t *mp2; 548 mblk_t *next_mp; 549 uint32_t u1; 550 tcp_stack_t *tcps = tcp->tcp_tcps; 551 552 553 /* Walk through all the new pieces. */ 554 do { 555 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 556 (uintptr_t)INT_MAX); 557 end = start + (int)(mp->b_wptr - mp->b_rptr); 558 next_mp = mp->b_cont; 559 if (start == end) { 560 /* Empty. Blast it. */ 561 freeb(mp); 562 continue; 563 } 564 mp->b_cont = NULL; 565 TCP_REASS_SET_SEQ(mp, start); 566 TCP_REASS_SET_END(mp, end); 567 mp1 = tcp->tcp_reass_tail; 568 if (!mp1) { 569 tcp->tcp_reass_tail = mp; 570 tcp->tcp_reass_head = mp; 571 TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); 572 TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, 573 end - start); 574 continue; 575 } 576 /* New stuff completely beyond tail? */ 577 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 578 /* Link it on end. */ 579 mp1->b_cont = mp; 580 tcp->tcp_reass_tail = mp; 581 TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); 582 TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, 583 end - start); 584 continue; 585 } 586 mp1 = tcp->tcp_reass_head; 587 u1 = TCP_REASS_SEQ(mp1); 588 /* New stuff at the front? */ 589 if (SEQ_LT(start, u1)) { 590 /* Yes... Check for overlap. */ 591 mp->b_cont = mp1; 592 tcp->tcp_reass_head = mp; 593 tcp_reass_elim_overlap(tcp, mp); 594 continue; 595 } 596 /* 597 * The new piece fits somewhere between the head and tail. 598 * We find our slot, where mp1 precedes us and mp2 trails. 599 */ 600 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 601 u1 = TCP_REASS_SEQ(mp2); 602 if (SEQ_LEQ(start, u1)) 603 break; 604 } 605 /* Link ourselves in */ 606 mp->b_cont = mp2; 607 mp1->b_cont = mp; 608 609 /* Trim overlap with following mblk(s) first */ 610 tcp_reass_elim_overlap(tcp, mp); 611 612 /* Trim overlap with preceding mblk */ 613 tcp_reass_elim_overlap(tcp, mp1); 614 615 } while (start = end, mp = next_mp); 616 mp1 = tcp->tcp_reass_head; 617 /* Anything ready to go? 
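	 *
	 * Illustration (not part of the original comment): data is handed up
	 * only while it is contiguous from tcp_rnxt. For example, with
	 * tcp_rnxt = 1000 and queued segments covering [1000, 1460),
	 * [1460, 1920) and [2500, 2960), the loop below strings together the
	 * first two mblks and returns them, leaving [2500, 2960) on the
	 * reassembly queue until the gap is filled.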
*/ 618 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 619 return (NULL); 620 /* Eat what we can off the queue */ 621 for (;;) { 622 mp = mp1->b_cont; 623 end = TCP_REASS_END(mp1); 624 TCP_REASS_SET_SEQ(mp1, 0); 625 TCP_REASS_SET_END(mp1, 0); 626 if (!mp) { 627 tcp->tcp_reass_tail = NULL; 628 break; 629 } 630 if (end != TCP_REASS_SEQ(mp)) { 631 mp1->b_cont = NULL; 632 break; 633 } 634 mp1 = mp; 635 } 636 mp1 = tcp->tcp_reass_head; 637 tcp->tcp_reass_head = mp; 638 return (mp1); 639 } 640 641 /* Eliminate any overlap that mp may have over later mblks */ 642 static void 643 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 644 { 645 uint32_t end; 646 mblk_t *mp1; 647 uint32_t u1; 648 tcp_stack_t *tcps = tcp->tcp_tcps; 649 650 end = TCP_REASS_END(mp); 651 while ((mp1 = mp->b_cont) != NULL) { 652 u1 = TCP_REASS_SEQ(mp1); 653 if (!SEQ_GT(end, u1)) 654 break; 655 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 656 mp->b_wptr -= end - u1; 657 TCP_REASS_SET_END(mp, u1); 658 TCPS_BUMP_MIB(tcps, tcpInDataPartDupSegs); 659 TCPS_UPDATE_MIB(tcps, tcpInDataPartDupBytes, 660 end - u1); 661 break; 662 } 663 mp->b_cont = mp1->b_cont; 664 TCP_REASS_SET_SEQ(mp1, 0); 665 TCP_REASS_SET_END(mp1, 0); 666 freeb(mp1); 667 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); 668 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, end - u1); 669 } 670 if (!mp1) 671 tcp->tcp_reass_tail = mp; 672 } 673 674 /* 675 * This function does PAWS protection check. Returns B_TRUE if the 676 * segment passes the PAWS test, else returns B_FALSE. 677 */ 678 boolean_t 679 tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp) 680 { 681 uint8_t flags; 682 int options; 683 uint8_t *up; 684 conn_t *connp = tcp->tcp_connp; 685 686 flags = (unsigned int)tcpha->tha_flags & 0xFF; 687 /* 688 * If timestamp option is aligned nicely, get values inline, 689 * otherwise call general routine to parse. Only do that 690 * if timestamp is the only option. 691 */ 692 if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + 693 TCPOPT_REAL_TS_LEN && 694 OK_32PTR((up = ((uint8_t *)tcpha) + 695 TCP_MIN_HEADER_LENGTH)) && 696 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 697 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 698 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 699 700 options = TCP_OPT_TSTAMP_PRESENT; 701 } else { 702 if (tcp->tcp_snd_sack_ok) { 703 tcpoptp->tcp = tcp; 704 } else { 705 tcpoptp->tcp = NULL; 706 } 707 options = tcp_parse_options(tcpha, tcpoptp); 708 } 709 710 if (options & TCP_OPT_TSTAMP_PRESENT) { 711 /* 712 * Do PAWS per RFC 1323 section 4.2. Accept RST 713 * regardless of the timestamp, page 18 RFC 1323.bis. 714 */ 715 if ((flags & TH_RST) == 0 && 716 TSTMP_LT(tcpoptp->tcp_opt_ts_val, 717 tcp->tcp_ts_recent)) { 718 if (LBOLT_FASTPATH64 < 719 (tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { 720 /* This segment is not acceptable. */ 721 return (B_FALSE); 722 } else { 723 /* 724 * Connection has been idle for 725 * too long. Reset the timestamp 726 * and assume the segment is valid. 727 */ 728 tcp->tcp_ts_recent = 729 tcpoptp->tcp_opt_ts_val; 730 } 731 } 732 } else { 733 /* 734 * If we don't get a timestamp on every packet, we 735 * figure we can't really trust 'em, so we stop sending 736 * and parsing them. 737 */ 738 tcp->tcp_snd_ts_ok = B_FALSE; 739 740 connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN; 741 connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN; 742 tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4); 743 /* 744 * Adjust the tcp_mss and tcp_cwnd accordingly. 
We avoid 745 * doing a slow start here so as to not to lose on the 746 * transfer rate built up so far. 747 */ 748 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 749 if (tcp->tcp_snd_sack_ok) 750 tcp->tcp_max_sack_blk = 4; 751 } 752 return (B_TRUE); 753 } 754 755 /* 756 * Defense for the SYN attack - 757 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest 758 * one from the list of droppable eagers. This list is a subset of q0. 759 * see comments before the definition of MAKE_DROPPABLE(). 760 * 2. Don't drop a SYN request before its first timeout. This gives every 761 * request at least til the first timeout to complete its 3-way handshake. 762 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many 763 * requests currently on the queue that has timed out. This will be used 764 * as an indicator of whether an attack is under way, so that appropriate 765 * actions can be taken. (It's incremented in tcp_timer() and decremented 766 * either when eager goes into ESTABLISHED, or gets freed up.) 767 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on 768 * # of timeout drops back to <= q0len/32 => SYN alert off 769 */ 770 static boolean_t 771 tcp_drop_q0(tcp_t *tcp) 772 { 773 tcp_t *eager; 774 mblk_t *mp; 775 tcp_stack_t *tcps = tcp->tcp_tcps; 776 777 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); 778 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 779 780 /* Pick oldest eager from the list of droppable eagers */ 781 eager = tcp->tcp_eager_prev_drop_q0; 782 783 /* If list is empty. return B_FALSE */ 784 if (eager == tcp) { 785 return (B_FALSE); 786 } 787 788 /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ 789 if ((mp = allocb(0, BPRI_HI)) == NULL) 790 return (B_FALSE); 791 792 /* 793 * Take this eager out from the list of droppable eagers since we are 794 * going to drop it. 
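	 *
	 * Worked example (hypothetical numbers, not part of the original
	 * comment): with a q0 length of 1024, the "SYN alert" described in
	 * the function header turns on once more than 1024/4 = 256 pending
	 * requests have timed out at least once, and turns off again when
	 * that count falls back to 1024/32 = 32 or fewer.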
795 */ 796 MAKE_UNDROPPABLE(eager); 797 798 if (tcp->tcp_connp->conn_debug) { 799 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 800 "tcp_drop_q0: listen half-open queue (max=%d) overflow" 801 " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, 802 tcp->tcp_conn_req_cnt_q0, 803 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 804 } 805 806 TCPS_BUMP_MIB(tcps, tcpHalfOpenDrop); 807 808 /* Put a reference on the conn as we are enqueueing it in the sqeue */ 809 CONN_INC_REF(eager->tcp_connp); 810 811 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, 812 tcp_clean_death_wrapper, eager->tcp_connp, NULL, 813 SQ_FILL, SQTAG_TCP_DROP_Q0); 814 815 return (B_TRUE); 816 } 817 818 /* 819 * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6 820 */ 821 static mblk_t * 822 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 823 ip_recv_attr_t *ira) 824 { 825 tcp_t *ltcp = lconnp->conn_tcp; 826 tcp_t *tcp = connp->conn_tcp; 827 mblk_t *tpi_mp; 828 ipha_t *ipha; 829 ip6_t *ip6h; 830 sin6_t sin6; 831 uint_t ifindex = ira->ira_ruifindex; 832 tcp_stack_t *tcps = tcp->tcp_tcps; 833 834 if (ira->ira_flags & IRAF_IS_IPV4) { 835 ipha = (ipha_t *)mp->b_rptr; 836 837 connp->conn_ipversion = IPV4_VERSION; 838 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); 839 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); 840 connp->conn_saddr_v6 = connp->conn_laddr_v6; 841 842 sin6 = sin6_null; 843 sin6.sin6_addr = connp->conn_faddr_v6; 844 sin6.sin6_port = connp->conn_fport; 845 sin6.sin6_family = AF_INET6; 846 sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, 847 IPCL_ZONEID(lconnp), tcps->tcps_netstack); 848 849 if (connp->conn_recv_ancillary.crb_recvdstaddr) { 850 sin6_t sin6d; 851 852 sin6d = sin6_null; 853 sin6d.sin6_addr = connp->conn_laddr_v6; 854 sin6d.sin6_port = connp->conn_lport; 855 sin6d.sin6_family = AF_INET; 856 tpi_mp = mi_tpi_extconn_ind(NULL, 857 (char *)&sin6d, sizeof (sin6_t), 858 (char *)&tcp, 859 (t_scalar_t)sizeof (intptr_t), 860 (char *)&sin6d, sizeof (sin6_t), 861 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 862 } else { 863 tpi_mp = mi_tpi_conn_ind(NULL, 864 (char *)&sin6, sizeof (sin6_t), 865 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 866 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 867 } 868 } else { 869 ip6h = (ip6_t *)mp->b_rptr; 870 871 connp->conn_ipversion = IPV6_VERSION; 872 connp->conn_laddr_v6 = ip6h->ip6_dst; 873 connp->conn_faddr_v6 = ip6h->ip6_src; 874 connp->conn_saddr_v6 = connp->conn_laddr_v6; 875 876 sin6 = sin6_null; 877 sin6.sin6_addr = connp->conn_faddr_v6; 878 sin6.sin6_port = connp->conn_fport; 879 sin6.sin6_family = AF_INET6; 880 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 881 sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, 882 IPCL_ZONEID(lconnp), tcps->tcps_netstack); 883 884 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 885 /* Pass up the scope_id of remote addr */ 886 sin6.sin6_scope_id = ifindex; 887 } else { 888 sin6.sin6_scope_id = 0; 889 } 890 if (connp->conn_recv_ancillary.crb_recvdstaddr) { 891 sin6_t sin6d; 892 893 sin6d = sin6_null; 894 sin6.sin6_addr = connp->conn_laddr_v6; 895 sin6d.sin6_port = connp->conn_lport; 896 sin6d.sin6_family = AF_INET6; 897 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) 898 sin6d.sin6_scope_id = ifindex; 899 900 tpi_mp = mi_tpi_extconn_ind(NULL, 901 (char *)&sin6d, sizeof (sin6_t), 902 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 903 (char *)&sin6d, sizeof (sin6_t), 904 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 905 } else { 906 tpi_mp = 
mi_tpi_conn_ind(NULL, 907 (char *)&sin6, sizeof (sin6_t), 908 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 909 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 910 } 911 } 912 913 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 914 return (tpi_mp); 915 } 916 917 /* Handle a SYN on an AF_INET socket */ 918 static mblk_t * 919 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, 920 ip_recv_attr_t *ira) 921 { 922 tcp_t *ltcp = lconnp->conn_tcp; 923 tcp_t *tcp = connp->conn_tcp; 924 sin_t sin; 925 mblk_t *tpi_mp = NULL; 926 tcp_stack_t *tcps = tcp->tcp_tcps; 927 ipha_t *ipha; 928 929 ASSERT(ira->ira_flags & IRAF_IS_IPV4); 930 ipha = (ipha_t *)mp->b_rptr; 931 932 connp->conn_ipversion = IPV4_VERSION; 933 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); 934 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); 935 connp->conn_saddr_v6 = connp->conn_laddr_v6; 936 937 sin = sin_null; 938 sin.sin_addr.s_addr = connp->conn_faddr_v4; 939 sin.sin_port = connp->conn_fport; 940 sin.sin_family = AF_INET; 941 if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { 942 sin_t sind; 943 944 sind = sin_null; 945 sind.sin_addr.s_addr = connp->conn_laddr_v4; 946 sind.sin_port = connp->conn_lport; 947 sind.sin_family = AF_INET; 948 tpi_mp = mi_tpi_extconn_ind(NULL, 949 (char *)&sind, sizeof (sin_t), (char *)&tcp, 950 (t_scalar_t)sizeof (intptr_t), (char *)&sind, 951 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); 952 } else { 953 tpi_mp = mi_tpi_conn_ind(NULL, 954 (char *)&sin, sizeof (sin_t), 955 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 956 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 957 } 958 959 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 960 return (tpi_mp); 961 } 962 963 /* 964 * Called via squeue to get on to eager's perimeter. It sends a 965 * TH_RST if eager is in the fanout table. The listener wants the 966 * eager to disappear either by means of tcp_eager_blowoff() or 967 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be 968 * called (via squeue) if the eager cannot be inserted in the 969 * fanout table in tcp_input_listener(). 970 */ 971 /* ARGSUSED */ 972 void 973 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 974 { 975 conn_t *econnp = (conn_t *)arg; 976 tcp_t *eager = econnp->conn_tcp; 977 tcp_t *listener = eager->tcp_listener; 978 979 /* 980 * We could be called because listener is closing. Since 981 * the eager was using listener's queue's, we avoid 982 * using the listeners queues from now on. 983 */ 984 ASSERT(eager->tcp_detached); 985 econnp->conn_rq = NULL; 986 econnp->conn_wq = NULL; 987 988 /* 989 * An eager's conn_fanout will be NULL if it's a duplicate 990 * for an existing 4-tuples in the conn fanout table. 991 * We don't want to send an RST out in such case. 992 */ 993 if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) { 994 tcp_xmit_ctl("tcp_eager_kill, can't wait", 995 eager, eager->tcp_snxt, 0, TH_RST); 996 } 997 998 /* We are here because listener wants this eager gone */ 999 if (listener != NULL) { 1000 mutex_enter(&listener->tcp_eager_lock); 1001 tcp_eager_unlink(eager); 1002 if (eager->tcp_tconnind_started) { 1003 /* 1004 * The eager has sent a conn_ind up to the 1005 * listener but listener decides to close 1006 * instead. We need to drop the extra ref 1007 * placed on eager in tcp_input_data() before 1008 * sending the conn_ind to listener. 
			 */
			CONN_DEC_REF(econnp);
		}
		mutex_exit(&listener->tcp_eager_lock);
		CONN_DEC_REF(listener->tcp_connp);
	}

	if (eager->tcp_state != TCPS_CLOSED)
		tcp_close_detached(eager);
}

/*
 * Reset any eager connection hanging off this listener marked
 * with 'seqnum' and then reclaim its resources.
 */
boolean_t
tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
{
	tcp_t	*eager;
	mblk_t	*mp;

	eager = listener;
	mutex_enter(&listener->tcp_eager_lock);
	do {
		eager = eager->tcp_eager_next_q;
		if (eager == NULL) {
			mutex_exit(&listener->tcp_eager_lock);
			return (B_FALSE);
		}
	} while (eager->tcp_conn_req_seqnum != seqnum);

	if (eager->tcp_closemp_used) {
		mutex_exit(&listener->tcp_eager_lock);
		return (B_TRUE);
	}
	eager->tcp_closemp_used = B_TRUE;
	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
	CONN_INC_REF(eager->tcp_connp);
	mutex_exit(&listener->tcp_eager_lock);
	mp = &eager->tcp_closemp;
	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
	    eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
	return (B_TRUE);
}

/*
 * Reset any eager connection hanging off this listener
 * and then reclaim its resources.
 */
void
tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
{
	tcp_t	*eager;
	mblk_t	*mp;
	tcp_stack_t	*tcps = listener->tcp_tcps;

	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));

	if (!q0_only) {
		/* First cleanup q */
		TCP_STAT(tcps, tcp_eager_blowoff_q);
		eager = listener->tcp_eager_next_q;
		while (eager != NULL) {
			if (!eager->tcp_closemp_used) {
				eager->tcp_closemp_used = B_TRUE;
				TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
				CONN_INC_REF(eager->tcp_connp);
				mp = &eager->tcp_closemp;
				SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
				    tcp_eager_kill, eager->tcp_connp, NULL,
				    SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
			}
			eager = eager->tcp_eager_next_q;
		}
	}
	/* Then cleanup q0 */
	TCP_STAT(tcps, tcp_eager_blowoff_q0);
	eager = listener->tcp_eager_next_q0;
	while (eager != listener) {
		if (!eager->tcp_closemp_used) {
			eager->tcp_closemp_used = B_TRUE;
			TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
			CONN_INC_REF(eager->tcp_connp);
			mp = &eager->tcp_closemp;
			SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
			    tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL,
			    SQTAG_TCP_EAGER_CLEANUP_Q0);
		}
		eager = eager->tcp_eager_next_q0;
	}
}

/*
 * If we are an eager connection hanging off a listener that hasn't
 * formally accepted the connection yet, get off its list and blow off
 * any data that we have accumulated.
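 *
 * Illustration (not part of the original comment) of the two listener
 * lists this function unlinks the eager from:
 *
 *	q0: connections still in the 3-way handshake; a circular, doubly
 *	linked list threaded through tcp_eager_next_q0/tcp_eager_prev_q0
 *	with the listener itself as the list head, counted by
 *	tcp_conn_req_cnt_q0.
 *
 *	q: connections that are ESTABLISHED and waiting to be accepted;
 *	a singly linked list threaded through tcp_eager_next_q with
 *	tcp_eager_last_q as the tail pointer, counted by
 *	tcp_conn_req_cnt_q.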
1105 */ 1106 void 1107 tcp_eager_unlink(tcp_t *tcp) 1108 { 1109 tcp_t *listener = tcp->tcp_listener; 1110 1111 ASSERT(listener != NULL); 1112 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 1113 if (tcp->tcp_eager_next_q0 != NULL) { 1114 ASSERT(tcp->tcp_eager_prev_q0 != NULL); 1115 1116 /* Remove the eager tcp from q0 */ 1117 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1118 tcp->tcp_eager_prev_q0; 1119 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1120 tcp->tcp_eager_next_q0; 1121 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1122 listener->tcp_conn_req_cnt_q0--; 1123 1124 tcp->tcp_eager_next_q0 = NULL; 1125 tcp->tcp_eager_prev_q0 = NULL; 1126 1127 /* 1128 * Take the eager out, if it is in the list of droppable 1129 * eagers. 1130 */ 1131 MAKE_UNDROPPABLE(tcp); 1132 1133 if (tcp->tcp_syn_rcvd_timeout != 0) { 1134 /* we have timed out before */ 1135 ASSERT(listener->tcp_syn_rcvd_timeout > 0); 1136 listener->tcp_syn_rcvd_timeout--; 1137 } 1138 } else { 1139 tcp_t **tcpp = &listener->tcp_eager_next_q; 1140 tcp_t *prev = NULL; 1141 1142 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 1143 if (tcpp[0] == tcp) { 1144 if (listener->tcp_eager_last_q == tcp) { 1145 /* 1146 * If we are unlinking the last 1147 * element on the list, adjust 1148 * tail pointer. Set tail pointer 1149 * to nil when list is empty. 1150 */ 1151 ASSERT(tcp->tcp_eager_next_q == NULL); 1152 if (listener->tcp_eager_last_q == 1153 listener->tcp_eager_next_q) { 1154 listener->tcp_eager_last_q = 1155 NULL; 1156 } else { 1157 /* 1158 * We won't get here if there 1159 * is only one eager in the 1160 * list. 1161 */ 1162 ASSERT(prev != NULL); 1163 listener->tcp_eager_last_q = 1164 prev; 1165 } 1166 } 1167 tcpp[0] = tcp->tcp_eager_next_q; 1168 tcp->tcp_eager_next_q = NULL; 1169 tcp->tcp_eager_last_q = NULL; 1170 ASSERT(listener->tcp_conn_req_cnt_q > 0); 1171 listener->tcp_conn_req_cnt_q--; 1172 break; 1173 } 1174 prev = tcpp[0]; 1175 } 1176 } 1177 tcp->tcp_listener = NULL; 1178 } 1179 1180 /* BEGIN CSTYLED */ 1181 /* 1182 * 1183 * The sockfs ACCEPT path: 1184 * ======================= 1185 * 1186 * The eager is now established in its own perimeter as soon as SYN is 1187 * received in tcp_input_listener(). When sockfs receives conn_ind, it 1188 * completes the accept processing on the acceptor STREAM. The sending 1189 * of conn_ind part is common for both sockfs listener and a TLI/XTI 1190 * listener but a TLI/XTI listener completes the accept processing 1191 * on the listener perimeter. 1192 * 1193 * Common control flow for 3 way handshake: 1194 * ---------------------------------------- 1195 * 1196 * incoming SYN (listener perimeter) -> tcp_input_listener() 1197 * 1198 * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() 1199 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() 1200 * 1201 * Sockfs ACCEPT Path: 1202 * ------------------- 1203 * 1204 * open acceptor stream (tcp_open allocates tcp_tli_accept() 1205 * as STREAM entry point) 1206 * 1207 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() 1208 * 1209 * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager 1210 * association (we are not behind eager's squeue but sockfs is protecting us 1211 * and no one knows about this stream yet. The STREAMS entry point q->q_info 1212 * is changed to point at tcp_wput(). 1213 * 1214 * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to 1215 * listener (done on listener's perimeter). 
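 *
 * Illustration (not part of the original comment): the pattern used
 * throughout this file to run a step such as tcp_accept_finish() in a
 * connection's own perimeter is, roughly (the tag name here is only a
 * placeholder),
 *
 *	CONN_INC_REF(econnp);
 *	SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_accept_finish,
 *	    econnp, NULL, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH);
 *
 * with the reference dropped once the queued function has run, as
 * described under "Referencing" below.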
1216 * 1217 * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish 1218 * accept. 1219 * 1220 * TLI/XTI client ACCEPT path: 1221 * --------------------------- 1222 * 1223 * soaccept() sends T_CONN_RES on the listener STREAM. 1224 * 1225 * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send 1226 * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). 1227 * 1228 * Locks: 1229 * ====== 1230 * 1231 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and 1232 * and listeners->tcp_eager_next_q. 1233 * 1234 * Referencing: 1235 * ============ 1236 * 1237 * 1) We start out in tcp_input_listener by eager placing a ref on 1238 * listener and listener adding eager to listeners->tcp_eager_next_q0. 1239 * 1240 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before 1241 * doing so we place a ref on the eager. This ref is finally dropped at the 1242 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the 1243 * reference is dropped by the squeue framework. 1244 * 1245 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish 1246 * 1247 * The reference must be released by the same entity that added the reference 1248 * In the above scheme, the eager is the entity that adds and releases the 1249 * references. Note that tcp_accept_finish executes in the squeue of the eager 1250 * (albeit after it is attached to the acceptor stream). Though 1. executes 1251 * in the listener's squeue, the eager is nascent at this point and the 1252 * reference can be considered to have been added on behalf of the eager. 1253 * 1254 * Eager getting a Reset or listener closing: 1255 * ========================================== 1256 * 1257 * Once the listener and eager are linked, the listener never does the unlink. 1258 * If the listener needs to close, tcp_eager_cleanup() is called which queues 1259 * a message on all eager perimeter. The eager then does the unlink, clears 1260 * any pointers to the listener's queue and drops the reference to the 1261 * listener. The listener waits in tcp_close outside the squeue until its 1262 * refcount has dropped to 1. This ensures that the listener has waited for 1263 * all eagers to clear their association with the listener. 1264 * 1265 * Similarly, if eager decides to go away, it can unlink itself and close. 1266 * When the T_CONN_RES comes down, we check if eager has closed. Note that 1267 * the reference to eager is still valid because of the extra ref we put 1268 * in tcp_send_conn_ind. 1269 * 1270 * Listener can always locate the eager under the protection 1271 * of the listener->tcp_eager_lock, and then do a refhold 1272 * on the eager during the accept processing. 1273 * 1274 * The acceptor stream accesses the eager in the accept processing 1275 * based on the ref placed on eager before sending T_conn_ind. 1276 * The only entity that can negate this refhold is a listener close 1277 * which is mutually exclusive with an active acceptor stream. 1278 * 1279 * Eager's reference on the listener 1280 * =================================== 1281 * 1282 * If the accept happens (even on a closed eager) the eager drops its 1283 * reference on the listener at the start of tcp_accept_finish. If the 1284 * eager is killed due to an incoming RST before the T_conn_ind is sent up, 1285 * the reference is dropped in tcp_closei_local. If the listener closes, 1286 * the reference is dropped in tcp_eager_kill. 
In all cases the reference 1287 * is dropped while executing in the eager's context (squeue). 1288 */ 1289 /* END CSTYLED */ 1290 1291 /* Process the SYN packet, mp, directed at the listener 'tcp' */ 1292 1293 /* 1294 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. 1295 * tcp_input_data will not see any packets for listeners since the listener 1296 * has conn_recv set to tcp_input_listener. 1297 */ 1298 /* ARGSUSED */ 1299 static void 1300 tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1301 { 1302 tcpha_t *tcpha; 1303 uint32_t seg_seq; 1304 tcp_t *eager; 1305 int err; 1306 conn_t *econnp = NULL; 1307 squeue_t *new_sqp; 1308 mblk_t *mp1; 1309 uint_t ip_hdr_len; 1310 conn_t *lconnp = (conn_t *)arg; 1311 tcp_t *listener = lconnp->conn_tcp; 1312 tcp_stack_t *tcps = listener->tcp_tcps; 1313 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 1314 uint_t flags; 1315 mblk_t *tpi_mp; 1316 uint_t ifindex = ira->ira_ruifindex; 1317 boolean_t tlc_set = B_FALSE; 1318 1319 ip_hdr_len = ira->ira_ip_hdr_length; 1320 tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; 1321 flags = (unsigned int)tcpha->tha_flags & 0xFF; 1322 1323 DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, lconnp->conn_ixa, 1324 __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, listener, 1325 __dtrace_tcp_tcph_t *, tcpha); 1326 1327 if (!(flags & TH_SYN)) { 1328 if ((flags & TH_RST) || (flags & TH_URG)) { 1329 freemsg(mp); 1330 return; 1331 } 1332 if (flags & TH_ACK) { 1333 /* Note this executes in listener's squeue */ 1334 tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); 1335 return; 1336 } 1337 1338 freemsg(mp); 1339 return; 1340 } 1341 1342 if (listener->tcp_state != TCPS_LISTEN) 1343 goto error2; 1344 1345 ASSERT(IPCL_IS_BOUND(lconnp)); 1346 1347 mutex_enter(&listener->tcp_eager_lock); 1348 1349 /* 1350 * The system is under memory pressure, so we need to do our part 1351 * to relieve the pressure. So we only accept new request if there 1352 * is nothing waiting to be accepted or waiting to complete the 3-way 1353 * handshake. This means that busy listener will not get too many 1354 * new requests which they cannot handle in time while non-busy 1355 * listener is still functioning properly. 1356 */ 1357 if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 || 1358 listener->tcp_conn_req_cnt_q0 > 0)) { 1359 mutex_exit(&listener->tcp_eager_lock); 1360 TCP_STAT(tcps, tcp_listen_mem_drop); 1361 goto error2; 1362 } 1363 1364 if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { 1365 mutex_exit(&listener->tcp_eager_lock); 1366 TCP_STAT(tcps, tcp_listendrop); 1367 TCPS_BUMP_MIB(tcps, tcpListenDrop); 1368 if (lconnp->conn_debug) { 1369 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 1370 "tcp_input_listener: listen backlog (max=%d) " 1371 "overflow (%d pending) on %s", 1372 listener->tcp_conn_req_max, 1373 listener->tcp_conn_req_cnt_q, 1374 tcp_display(listener, NULL, DISP_PORT_ONLY)); 1375 } 1376 goto error2; 1377 } 1378 1379 if (listener->tcp_conn_req_cnt_q0 >= 1380 listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { 1381 /* 1382 * Q0 is full. Drop a pending half-open req from the queue 1383 * to make room for the new SYN req. Also mark the time we 1384 * drop a SYN. 1385 * 1386 * A more aggressive defense against SYN attack will 1387 * be to set the "tcp_syn_defense" flag now. 
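		 *
		 * Worked example (hypothetical numbers, not part of the
		 * original comment): with a listener backlog
		 * (tcp_conn_req_max) of 128 and tcps_conn_req_max_q0 of
		 * 1024, the check above fires once 1152 half-open
		 * connections are pending and tcp_drop_q0() below evicts
		 * one droppable eager to make room for the new SYN.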
1388 */ 1389 TCP_STAT(tcps, tcp_listendropq0); 1390 listener->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 1391 if (!tcp_drop_q0(listener)) { 1392 mutex_exit(&listener->tcp_eager_lock); 1393 TCPS_BUMP_MIB(tcps, tcpListenDropQ0); 1394 if (lconnp->conn_debug) { 1395 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 1396 "tcp_input_listener: listen half-open " 1397 "queue (max=%d) full (%d pending) on %s", 1398 tcps->tcps_conn_req_max_q0, 1399 listener->tcp_conn_req_cnt_q0, 1400 tcp_display(listener, NULL, 1401 DISP_PORT_ONLY)); 1402 } 1403 goto error2; 1404 } 1405 } 1406 1407 /* 1408 * Enforce the limit set on the number of connections per listener. 1409 * Note that tlc_cnt starts with 1. So need to add 1 to tlc_max 1410 * for comparison. 1411 */ 1412 if (listener->tcp_listen_cnt != NULL) { 1413 tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt; 1414 int64_t now; 1415 1416 if (atomic_add_32_nv(&tlc->tlc_cnt, 1) > tlc->tlc_max + 1) { 1417 mutex_exit(&listener->tcp_eager_lock); 1418 now = ddi_get_lbolt64(); 1419 atomic_add_32(&tlc->tlc_cnt, -1); 1420 TCP_STAT(tcps, tcp_listen_cnt_drop); 1421 tlc->tlc_drop++; 1422 if (now - tlc->tlc_report_time > 1423 MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) { 1424 zcmn_err(lconnp->conn_zoneid, CE_WARN, 1425 "Listener (port %d) connection max (%u) " 1426 "reached: %u attempts dropped total\n", 1427 ntohs(listener->tcp_connp->conn_lport), 1428 tlc->tlc_max, tlc->tlc_drop); 1429 tlc->tlc_report_time = now; 1430 } 1431 goto error2; 1432 } 1433 tlc_set = B_TRUE; 1434 } 1435 1436 mutex_exit(&listener->tcp_eager_lock); 1437 1438 /* 1439 * IP sets ira_sqp to either the senders conn_sqp (for loopback) 1440 * or based on the ring (for packets from GLD). Otherwise it is 1441 * set based on lbolt i.e., a somewhat random number. 1442 */ 1443 ASSERT(ira->ira_sqp != NULL); 1444 new_sqp = ira->ira_sqp; 1445 1446 econnp = (conn_t *)tcp_get_conn(arg2, tcps); 1447 if (econnp == NULL) 1448 goto error2; 1449 1450 ASSERT(econnp->conn_netstack == lconnp->conn_netstack); 1451 econnp->conn_sqp = new_sqp; 1452 econnp->conn_initial_sqp = new_sqp; 1453 econnp->conn_ixa->ixa_sqp = new_sqp; 1454 1455 econnp->conn_fport = tcpha->tha_lport; 1456 econnp->conn_lport = tcpha->tha_fport; 1457 1458 err = conn_inherit_parent(lconnp, econnp); 1459 if (err != 0) 1460 goto error3; 1461 1462 /* We already know the laddr of the new connection is ours */ 1463 econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation; 1464 1465 ASSERT(OK_32PTR(mp->b_rptr)); 1466 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || 1467 IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); 1468 1469 if (lconnp->conn_family == AF_INET) { 1470 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); 1471 tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); 1472 } else { 1473 tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); 1474 } 1475 1476 if (tpi_mp == NULL) 1477 goto error3; 1478 1479 eager = econnp->conn_tcp; 1480 eager->tcp_detached = B_TRUE; 1481 SOCK_CONNID_INIT(eager->tcp_connid); 1482 1483 /* 1484 * Initialize the eager's tcp_t and inherit some parameters from 1485 * the listener. 
1486 */ 1487 tcp_init_values(eager, listener); 1488 1489 ASSERT((econnp->conn_ixa->ixa_flags & 1490 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | 1491 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == 1492 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | 1493 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); 1494 1495 if (!tcps->tcps_dev_flow_ctl) 1496 econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; 1497 1498 /* Prepare for diffing against previous packets */ 1499 eager->tcp_recvifindex = 0; 1500 eager->tcp_recvhops = 0xffffffffU; 1501 1502 if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { 1503 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || 1504 IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { 1505 econnp->conn_incoming_ifindex = ifindex; 1506 econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 1507 econnp->conn_ixa->ixa_scopeid = ifindex; 1508 } 1509 } 1510 1511 if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == 1512 (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && 1513 tcps->tcps_rev_src_routes) { 1514 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1515 ip_pkt_t *ipp = &econnp->conn_xmit_ipp; 1516 1517 /* Source routing option copyover (reverse it) */ 1518 err = ip_find_hdr_v4(ipha, ipp, B_TRUE); 1519 if (err != 0) { 1520 freemsg(tpi_mp); 1521 goto error3; 1522 } 1523 ip_pkt_source_route_reverse_v4(ipp); 1524 } 1525 1526 ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); 1527 ASSERT(!eager->tcp_tconnind_started); 1528 /* 1529 * If the SYN came with a credential, it's a loopback packet or a 1530 * labeled packet; attach the credential to the TPI message. 1531 */ 1532 if (ira->ira_cred != NULL) 1533 mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); 1534 1535 eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; 1536 ASSERT(eager->tcp_ordrel_mp == NULL); 1537 1538 /* Inherit the listener's non-STREAMS flag */ 1539 if (IPCL_IS_NONSTR(lconnp)) { 1540 econnp->conn_flags |= IPCL_NONSTR; 1541 /* All non-STREAMS tcp_ts are sockets */ 1542 eager->tcp_issocket = B_TRUE; 1543 } else { 1544 /* 1545 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that 1546 * at close time, we will always have that to send up. 1547 * Otherwise, we need to do special handling in case the 1548 * allocation fails at that time. 1549 */ 1550 if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) 1551 goto error3; 1552 } 1553 /* 1554 * Now that the IP addresses and ports are setup in econnp we 1555 * can do the IPsec policy work. 1556 */ 1557 if (ira->ira_flags & IRAF_IPSEC_SECURE) { 1558 if (lconnp->conn_policy != NULL) { 1559 /* 1560 * Inherit the policy from the listener; use 1561 * actions from ira 1562 */ 1563 if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { 1564 CONN_DEC_REF(econnp); 1565 freemsg(mp); 1566 goto error3; 1567 } 1568 } 1569 } 1570 1571 /* 1572 * tcp_set_destination() may set tcp_rwnd according to the route 1573 * metrics. If it does not, the eager's receive window will be set 1574 * to the listener's receive window later in this function. 
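	 *
	 * Worked example (illustration only): if the route supplies no
	 * rpipe metric, the eager later takes the listener's conn_rcvbuf;
	 * a 131072-byte buffer with a 1460-byte MSS is rounded up by
	 * MSS_ROUNDUP() to 90 * 1460 = 131400 bytes before tcp_rwnd_set()
	 * is called to advertise that window in the SYN-ACK.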
1575 */ 1576 eager->tcp_rwnd = 0; 1577 1578 if (is_system_labeled()) { 1579 ip_xmit_attr_t *ixa = econnp->conn_ixa; 1580 1581 ASSERT(ira->ira_tsl != NULL); 1582 /* Discard any old label */ 1583 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 1584 ASSERT(ixa->ixa_tsl != NULL); 1585 label_rele(ixa->ixa_tsl); 1586 ixa->ixa_free_flags &= ~IXA_FREE_TSL; 1587 ixa->ixa_tsl = NULL; 1588 } 1589 if ((lconnp->conn_mlp_type != mlptSingle || 1590 lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && 1591 ira->ira_tsl != NULL) { 1592 /* 1593 * If this is an MLP connection or a MAC-Exempt 1594 * connection with an unlabeled node, packets are to be 1595 * exchanged using the security label of the received 1596 * SYN packet instead of the server application's label. 1597 * tsol_check_dest called from ip_set_destination 1598 * might later update TSF_UNLABELED by replacing 1599 * ixa_tsl with a new label. 1600 */ 1601 label_hold(ira->ira_tsl); 1602 ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); 1603 DTRACE_PROBE2(mlp_syn_accept, conn_t *, 1604 econnp, ts_label_t *, ixa->ixa_tsl) 1605 } else { 1606 ixa->ixa_tsl = crgetlabel(econnp->conn_cred); 1607 DTRACE_PROBE2(syn_accept, conn_t *, 1608 econnp, ts_label_t *, ixa->ixa_tsl) 1609 } 1610 /* 1611 * conn_connect() called from tcp_set_destination will verify 1612 * the destination is allowed to receive packets at the 1613 * security label of the SYN-ACK we are generating. As part of 1614 * that, tsol_check_dest() may create a new effective label for 1615 * this connection. 1616 * Finally conn_connect() will call conn_update_label. 1617 * All that remains for TCP to do is to call 1618 * conn_build_hdr_template which is done as part of 1619 * tcp_set_destination. 1620 */ 1621 } 1622 1623 /* 1624 * Since we will clear tcp_listener before we clear tcp_detached 1625 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress 1626 * so we can tell a TCP_IS_DETACHED_NONEAGER apart. 1627 */ 1628 eager->tcp_hard_binding = B_TRUE; 1629 1630 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ 1631 TCP_BIND_HASH(econnp->conn_lport)], eager, 0); 1632 1633 CL_INET_CONNECT(econnp, B_FALSE, err); 1634 if (err != 0) { 1635 tcp_bind_hash_remove(eager); 1636 goto error3; 1637 } 1638 1639 SOCK_CONNID_BUMP(eager->tcp_connid); 1640 1641 /* 1642 * Adapt our mss, ttl, ... based on the remote address. 1643 */ 1644 1645 if (tcp_set_destination(eager) != 0) { 1646 TCPS_BUMP_MIB(tcps, tcpAttemptFails); 1647 /* Undo the bind_hash_insert */ 1648 tcp_bind_hash_remove(eager); 1649 goto error3; 1650 } 1651 1652 /* Process all TCP options. */ 1653 tcp_process_options(eager, tcpha); 1654 1655 /* Is the other end ECN capable? */ 1656 if (tcps->tcps_ecn_permitted >= 1 && 1657 (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 1658 eager->tcp_ecn_ok = B_TRUE; 1659 } 1660 1661 /* 1662 * The listener's conn_rcvbuf should be the default window size or a 1663 * window size changed via SO_RCVBUF option. First round up the 1664 * eager's tcp_rwnd to the nearest MSS. Then find out the window 1665 * scale option value if needed. Call tcp_rwnd_set() to finish the 1666 * setting. 1667 * 1668 * Note if there is a rpipe metric associated with the remote host, 1669 * we should not inherit receive window size from listener. 1670 */ 1671 eager->tcp_rwnd = MSS_ROUNDUP( 1672 (eager->tcp_rwnd == 0 ? 
econnp->conn_rcvbuf : 1673 eager->tcp_rwnd), eager->tcp_mss); 1674 if (eager->tcp_snd_ws_ok) 1675 tcp_set_ws_value(eager); 1676 /* 1677 * Note that this is the only place tcp_rwnd_set() is called for 1678 * accepting a connection. We need to call it here instead of 1679 * after the 3-way handshake because we need to tell the other 1680 * side our rwnd in the SYN-ACK segment. 1681 */ 1682 (void) tcp_rwnd_set(eager, eager->tcp_rwnd); 1683 1684 ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && 1685 eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); 1686 1687 ASSERT(econnp->conn_rcvbuf != 0 && 1688 econnp->conn_rcvbuf == eager->tcp_rwnd); 1689 1690 /* Put a ref on the listener for the eager. */ 1691 CONN_INC_REF(lconnp); 1692 mutex_enter(&listener->tcp_eager_lock); 1693 listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1694 eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; 1695 listener->tcp_eager_next_q0 = eager; 1696 eager->tcp_eager_prev_q0 = listener; 1697 1698 /* Set tcp_listener before adding it to tcp_conn_fanout */ 1699 eager->tcp_listener = listener; 1700 eager->tcp_saved_listener = listener; 1701 1702 /* 1703 * Set tcp_listen_cnt so that when the connection is done, the counter 1704 * is decremented. 1705 */ 1706 eager->tcp_listen_cnt = listener->tcp_listen_cnt; 1707 1708 /* 1709 * Tag this detached tcp vector for later retrieval 1710 * by our listener client in tcp_accept(). 1711 */ 1712 eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; 1713 listener->tcp_conn_req_cnt_q0++; 1714 if (++listener->tcp_conn_req_seqnum == -1) { 1715 /* 1716 * -1 is "special" and defined in TPI as something 1717 * that should never be used in T_CONN_IND 1718 */ 1719 ++listener->tcp_conn_req_seqnum; 1720 } 1721 mutex_exit(&listener->tcp_eager_lock); 1722 1723 if (listener->tcp_syn_defense) { 1724 /* Don't drop the SYN that comes from a good IP source */ 1725 ipaddr_t *addr_cache; 1726 1727 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1728 if (addr_cache != NULL && econnp->conn_faddr_v4 == 1729 addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { 1730 eager->tcp_dontdrop = B_TRUE; 1731 } 1732 } 1733 1734 /* 1735 * We need to insert the eager in its own perimeter but as soon 1736 * as we do that, we expose the eager to the classifier and 1737 * should not touch any field outside the eager's perimeter. 1738 * So do all the work necessary before inserting the eager 1739 * in its own perimeter. Be optimistic that conn_connect() 1740 * will succeed but undo everything if it fails. 1741 */ 1742 seg_seq = ntohl(tcpha->tha_seq); 1743 eager->tcp_irs = seg_seq; 1744 eager->tcp_rack = seg_seq; 1745 eager->tcp_rnxt = seg_seq + 1; 1746 eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); 1747 TCPS_BUMP_MIB(tcps, tcpPassiveOpens); 1748 eager->tcp_state = TCPS_SYN_RCVD; 1749 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 1750 econnp->conn_ixa, void, NULL, tcp_t *, eager, void, NULL, 1751 int32_t, TCPS_LISTEN); 1752 1753 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, 1754 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); 1755 if (mp1 == NULL) { 1756 /* 1757 * Increment the ref count as we are going to 1758 * enqueueing an mp in squeue 1759 */ 1760 CONN_INC_REF(econnp); 1761 goto error; 1762 } 1763 1764 /* 1765 * We need to start the rto timer. In normal case, we start 1766 * the timer after sending the packet on the wire (or at 1767 * least believing that packet was sent by waiting for 1768 * conn_ip_output() to return). 
Since this is the first packet 1769 * being sent on the wire for the eager, our initial tcp_rto 1770 * is at least tcp_rexmit_interval_min which is a fairly 1771 * large value to allow the algorithm to adjust slowly to large 1772 * fluctuations of RTT during the first few transmissions. 1773 * 1774 * Starting the timer first and then sending the packet in this 1775 * case shouldn't make much difference since tcp_rexmit_interval_min 1776 * is on the order of several hundred milliseconds and starting the timer 1777 * first and then sending the packet will result in a difference 1778 * of a few microseconds. 1779 * 1780 * Without this optimization, we are forced to hold the fanout 1781 * lock across the ipcl_bind_insert() and sending the packet 1782 * so that we don't race against an incoming packet (maybe RST) 1783 * for this eager. 1784 * 1785 * It is necessary to acquire an extra reference on the eager 1786 * at this point and hold it until after tcp_send_data() to 1787 * ensure against an eager close race. 1788 */ 1789 1790 CONN_INC_REF(econnp); 1791 1792 TCP_TIMER_RESTART(eager, eager->tcp_rto); 1793 1794 /* 1795 * Insert the eager in its own perimeter now. We are ready to deal 1796 * with any packets on eager. 1797 */ 1798 if (ipcl_conn_insert(econnp) != 0) 1799 goto error; 1800 1801 ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); 1802 freemsg(mp); 1803 /* 1804 * Send the SYN-ACK. Use the right squeue so that conn_ixa is 1805 * only used by one thread at a time. 1806 */ 1807 if (econnp->conn_sqp == lconnp->conn_sqp) { 1808 DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, 1809 econnp->conn_ixa, __dtrace_tcp_void_ip_t *, mp1->b_rptr, 1810 tcp_t *, eager, __dtrace_tcp_tcph_t *, 1811 &mp1->b_rptr[econnp->conn_ixa->ixa_ip_hdr_length]); 1812 (void) conn_ip_output(mp1, econnp->conn_ixa); 1813 CONN_DEC_REF(econnp); 1814 } else { 1815 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack, 1816 econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK); 1817 } 1818 return; 1819 error: 1820 freemsg(mp1); 1821 eager->tcp_closemp_used = B_TRUE; 1822 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 1823 mp1 = &eager->tcp_closemp; 1824 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, 1825 econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2); 1826 1827 /* 1828 * If a connection already exists, send the mp to that connection so 1829 * that it can be appropriately dealt with. 1830 */ 1831 ipst = tcps->tcps_netstack->netstack_ip; 1832 1833 if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) { 1834 if (!IPCL_IS_CONNECTED(econnp)) { 1835 /* 1836 * Something bad happened. ipcl_conn_insert() 1837 * failed because a connection already existed 1838 * in connected hash but we can't find it 1839 * anymore (someone blew it away). Just 1840 * free this message and hopefully the remote 1841 * will retransmit at which time the SYN can be 1842 * treated as a new connection or dealt with 1843 * via a TH_RST if a connection already exists.
1844 */ 1845 CONN_DEC_REF(econnp); 1846 freemsg(mp); 1847 } else { 1848 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data, 1849 econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1); 1850 } 1851 } else { 1852 /* Nobody wants this packet */ 1853 freemsg(mp); 1854 } 1855 return; 1856 error3: 1857 CONN_DEC_REF(econnp); 1858 error2: 1859 freemsg(mp); 1860 if (tlc_set) 1861 atomic_add_32(&listener->tcp_listen_cnt->tlc_cnt, -1); 1862 } 1863 1864 /* 1865 * In an ideal case of vertical partition in a NUMA architecture, it is 1866 * beneficial to have the listener and all the incoming connections 1867 * tied to the same squeue. The other constraint is that incoming 1868 * connections should be tied to the squeue attached to the interrupted 1869 * CPU for obvious locality reasons, so this leaves the listener to 1870 * be tied to the same squeue. Our only problem is that when the listener 1871 * is binding, the CPU that will get interrupted by the NIC whose 1872 * IP address the listener is binding to is not even known. So 1873 * the code below allows us to change that binding at the time the 1874 * CPU is interrupted, by virtue of the incoming connection's squeue. 1875 * 1876 * This is useful only in the case of a listener bound to a specific IP 1877 * address. Other kinds of listeners get bound the 1878 * very first time and there is no attempt to rebind them. 1879 */ 1880 void 1881 tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, 1882 ip_recv_attr_t *ira) 1883 { 1884 conn_t *connp = (conn_t *)arg; 1885 squeue_t *sqp = (squeue_t *)arg2; 1886 squeue_t *new_sqp; 1887 uint32_t conn_flags; 1888 1889 /* 1890 * IP sets ira_sqp to either the sender's conn_sqp (for loopback) 1891 * or based on the ring (for packets from GLD). Otherwise it is 1892 * set based on lbolt, i.e., a somewhat random number. 1893 */ 1894 ASSERT(ira->ira_sqp != NULL); 1895 new_sqp = ira->ira_sqp; 1896 1897 if (connp->conn_fanout == NULL) 1898 goto done; 1899 1900 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { 1901 mutex_enter(&connp->conn_fanout->connf_lock); 1902 mutex_enter(&connp->conn_lock); 1903 /* 1904 * No one from read or write side can access us now 1905 * except for already queued packets on this squeue. 1906 * But since we haven't changed the squeue yet, they 1907 * can't execute. If they are processed after we have 1908 * changed the squeue, they are sent back to the 1909 * correct squeue down below. 1910 * But a listener close can race with processing of 1911 * incoming SYN. If incoming SYN processing changes 1912 * the squeue then the listener close which is waiting 1913 * to enter the squeue would operate on the wrong 1914 * squeue. Hence we don't change the squeue here unless 1915 * the refcount is exactly the minimum refcount. The 1916 * minimum refcount of 4 is counted as follows: 1 each for 1917 * TCP and IP, 1 for being in the classifier hash, and 1918 * 1 for the mblk being processed.
1919 */ 1920 1921 if (connp->conn_ref != 4 || 1922 connp->conn_tcp->tcp_state != TCPS_LISTEN) { 1923 mutex_exit(&connp->conn_lock); 1924 mutex_exit(&connp->conn_fanout->connf_lock); 1925 goto done; 1926 } 1927 if (connp->conn_sqp != new_sqp) { 1928 while (connp->conn_sqp != new_sqp) 1929 (void) casptr(&connp->conn_sqp, sqp, new_sqp); 1930 /* No special MT issues for outbound ixa_sqp hint */ 1931 connp->conn_ixa->ixa_sqp = new_sqp; 1932 } 1933 1934 do { 1935 conn_flags = connp->conn_flags; 1936 conn_flags |= IPCL_FULLY_BOUND; 1937 (void) cas32(&connp->conn_flags, connp->conn_flags, 1938 conn_flags); 1939 } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); 1940 1941 mutex_exit(&connp->conn_fanout->connf_lock); 1942 mutex_exit(&connp->conn_lock); 1943 1944 /* 1945 * Assume we have picked a good squeue for the listener. Make 1946 * subsequent SYNs not try to change the squeue. 1947 */ 1948 connp->conn_recv = tcp_input_listener; 1949 } 1950 1951 done: 1952 if (connp->conn_sqp != sqp) { 1953 CONN_INC_REF(connp); 1954 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, 1955 ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); 1956 } else { 1957 tcp_input_listener(connp, mp, sqp, ira); 1958 } 1959 } 1960 1961 /* 1962 * Send up all messages queued on tcp_rcv_list. 1963 */ 1964 uint_t 1965 tcp_rcv_drain(tcp_t *tcp) 1966 { 1967 mblk_t *mp; 1968 uint_t ret = 0; 1969 #ifdef DEBUG 1970 uint_t cnt = 0; 1971 #endif 1972 queue_t *q = tcp->tcp_connp->conn_rq; 1973 1974 /* Can't drain on an eager connection */ 1975 if (tcp->tcp_listener != NULL) 1976 return (ret); 1977 1978 /* Can't be a non-STREAMS connection */ 1979 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 1980 1981 /* No need for the push timer now. */ 1982 if (tcp->tcp_push_tid != 0) { 1983 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 1984 tcp->tcp_push_tid = 0; 1985 } 1986 1987 /* 1988 * Handle two cases here: we are currently fused or we were 1989 * previously fused and have some urgent data to be delivered 1990 * upstream. The latter happens because we either ran out of 1991 * memory or were detached and therefore sending the SIGURG was 1992 * deferred until this point. In either case we pass control 1993 * over to tcp_fuse_rcv_drain() since it may need to complete 1994 * some work. 1995 */ 1996 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { 1997 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : 1998 &tcp->tcp_fused_sigurg_mp)) 1999 return (ret); 2000 } 2001 2002 while ((mp = tcp->tcp_rcv_list) != NULL) { 2003 tcp->tcp_rcv_list = mp->b_next; 2004 mp->b_next = NULL; 2005 #ifdef DEBUG 2006 cnt += msgdsize(mp); 2007 #endif 2008 putnext(q, mp); 2009 } 2010 #ifdef DEBUG 2011 ASSERT(cnt == tcp->tcp_rcv_cnt); 2012 #endif 2013 tcp->tcp_rcv_last_head = NULL; 2014 tcp->tcp_rcv_last_tail = NULL; 2015 tcp->tcp_rcv_cnt = 0; 2016 2017 if (canputnext(q)) 2018 return (tcp_rwnd_reopen(tcp)); 2019 2020 return (ret); 2021 } 2022 2023 /* 2024 * Queue data on tcp_rcv_list which is a b_next chain. 2025 * tcp_rcv_last_head/tail is the last element of this chain. 2026 * Each element of the chain is a b_cont chain. 2027 * 2028 * M_DATA messages are added to the current element. 2029 * Other messages are added as new (b_next) elements. 
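 *
 * For illustration (values made up), after three mblks of the same
 * db_type have been enqueued, followed by one of a different type,
 * the list looks roughly like:
 *
 *	tcp_rcv_list -> [D1] -b_cont-> [D2] -b_cont-> [D3]
 *	                 |
 *	               b_next
 *	                 v
 *	                [P1]			<- tcp_rcv_last_head
 *
 * with tcp_rcv_last_tail pointing at the last mblk of the last
 * element and tcp_rcv_cnt holding the total number of queued bytes.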
2030 */ 2031 void 2032 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) 2033 { 2034 ASSERT(seg_len == msgdsize(mp)); 2035 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); 2036 2037 if (is_system_labeled()) { 2038 ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); 2039 /* 2040 * Provide for protocols above TCP such as RPC. NOPID leaves 2041 * db_cpid unchanged. 2042 * The cred could have already been set. 2043 */ 2044 if (cr != NULL) 2045 mblk_setcred(mp, cr, NOPID); 2046 } 2047 2048 if (tcp->tcp_rcv_list == NULL) { 2049 ASSERT(tcp->tcp_rcv_last_head == NULL); 2050 tcp->tcp_rcv_list = mp; 2051 tcp->tcp_rcv_last_head = mp; 2052 } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { 2053 tcp->tcp_rcv_last_tail->b_cont = mp; 2054 } else { 2055 tcp->tcp_rcv_last_head->b_next = mp; 2056 tcp->tcp_rcv_last_head = mp; 2057 } 2058 2059 while (mp->b_cont) 2060 mp = mp->b_cont; 2061 2062 tcp->tcp_rcv_last_tail = mp; 2063 tcp->tcp_rcv_cnt += seg_len; 2064 tcp->tcp_rwnd -= seg_len; 2065 } 2066 2067 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 2068 mblk_t * 2069 tcp_ack_mp(tcp_t *tcp) 2070 { 2071 uint32_t seq_no; 2072 tcp_stack_t *tcps = tcp->tcp_tcps; 2073 conn_t *connp = tcp->tcp_connp; 2074 2075 /* 2076 * There are a few cases to be considered while setting the sequence no. 2077 * Essentially, we can come here while processing an unacceptable pkt 2078 * in the TCPS_SYN_RCVD state, in which case we set the sequence number 2079 * to snxt (per RFC 793), note the swnd wouldn't have been set yet. 2080 * If we are here for a zero window probe, stick with suna. In all 2081 * other cases, we check if suna + swnd encompasses snxt and set 2082 * the sequence number to snxt, if so. If snxt falls outside the 2083 * window (the receiver probably shrunk its window), we will go with 2084 * suna + swnd, otherwise the sequence no will be unacceptable to the 2085 * receiver. 2086 */ 2087 if (tcp->tcp_zero_win_probe) { 2088 seq_no = tcp->tcp_suna; 2089 } else if (tcp->tcp_state == TCPS_SYN_RCVD) { 2090 ASSERT(tcp->tcp_swnd == 0); 2091 seq_no = tcp->tcp_snxt; 2092 } else { 2093 seq_no = SEQ_GT(tcp->tcp_snxt, 2094 (tcp->tcp_suna + tcp->tcp_swnd)) ? 2095 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; 2096 } 2097 2098 if (tcp->tcp_valid_bits) { 2099 /* 2100 * For the complex case where we have to send some 2101 * controls (FIN or SYN), let tcp_xmit_mp do it. 2102 */ 2103 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, 2104 NULL, B_FALSE)); 2105 } else { 2106 /* Generate a simple ACK */ 2107 int data_length; 2108 uchar_t *rptr; 2109 tcpha_t *tcpha; 2110 mblk_t *mp1; 2111 int32_t total_hdr_len; 2112 int32_t tcp_hdr_len; 2113 int32_t num_sack_blk = 0; 2114 int32_t sack_opt_len; 2115 ip_xmit_attr_t *ixa = connp->conn_ixa; 2116 2117 /* 2118 * Allocate space for TCP + IP headers 2119 * and link-level header 2120 */ 2121 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 2122 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 2123 tcp->tcp_num_sack_blk); 2124 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 2125 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 2126 total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; 2127 tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; 2128 } else { 2129 total_hdr_len = connp->conn_ht_iphc_len; 2130 tcp_hdr_len = connp->conn_ht_ulp_len; 2131 } 2132 mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); 2133 if (!mp1) 2134 return (NULL); 2135 2136 /* Update the latest receive window size in TCP header. 
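 * The advertised value is the current tcp_rwnd scaled down by the
 * receive window shift negotiated at connection setup; for example,
 * with tcp_rwnd at 131072 bytes and tcp_rcv_ws of 3, tha_win carries
 * 16384 and the peer reconstructs 16384 << 3 = 131072.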
*/ 2137 tcp->tcp_tcpha->tha_win = 2138 htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 2139 /* copy in prototype TCP + IP header */ 2140 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 2141 mp1->b_rptr = rptr; 2142 mp1->b_wptr = rptr + total_hdr_len; 2143 bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); 2144 2145 tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; 2146 2147 /* Set the TCP sequence number. */ 2148 tcpha->tha_seq = htonl(seq_no); 2149 2150 /* Set up the TCP flag field. */ 2151 tcpha->tha_flags = (uchar_t)TH_ACK; 2152 if (tcp->tcp_ecn_echo_on) 2153 tcpha->tha_flags |= TH_ECE; 2154 2155 tcp->tcp_rack = tcp->tcp_rnxt; 2156 tcp->tcp_rack_cnt = 0; 2157 2158 /* fill in timestamp option if in use */ 2159 if (tcp->tcp_snd_ts_ok) { 2160 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 2161 2162 U32_TO_BE32(llbolt, 2163 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 2164 U32_TO_BE32(tcp->tcp_ts_recent, 2165 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 2166 } 2167 2168 /* Fill in SACK options */ 2169 if (num_sack_blk > 0) { 2170 uchar_t *wptr = (uchar_t *)tcpha + 2171 connp->conn_ht_ulp_len; 2172 sack_blk_t *tmp; 2173 int32_t i; 2174 2175 wptr[0] = TCPOPT_NOP; 2176 wptr[1] = TCPOPT_NOP; 2177 wptr[2] = TCPOPT_SACK; 2178 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 2179 sizeof (sack_blk_t); 2180 wptr += TCPOPT_REAL_SACK_LEN; 2181 2182 tmp = tcp->tcp_sack_list; 2183 for (i = 0; i < num_sack_blk; i++) { 2184 U32_TO_BE32(tmp[i].begin, wptr); 2185 wptr += sizeof (tcp_seq); 2186 U32_TO_BE32(tmp[i].end, wptr); 2187 wptr += sizeof (tcp_seq); 2188 } 2189 tcpha->tha_offset_and_reserved += 2190 ((num_sack_blk * 2 + 1) << 4); 2191 } 2192 2193 ixa->ixa_pktlen = total_hdr_len; 2194 2195 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2196 ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); 2197 } else { 2198 ip6_t *ip6 = (ip6_t *)rptr; 2199 2200 ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); 2201 } 2202 2203 /* 2204 * Prime pump for checksum calculation in IP. Include the 2205 * adjustment for a source route if any. 2206 */ 2207 data_length = tcp_hdr_len + connp->conn_sum; 2208 data_length = (data_length >> 16) + (data_length & 0xFFFF); 2209 tcpha->tha_sum = htons(data_length); 2210 2211 if (tcp->tcp_ip_forward_progress) { 2212 tcp->tcp_ip_forward_progress = B_FALSE; 2213 connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 2214 } else { 2215 connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 2216 } 2217 return (mp1); 2218 } 2219 } 2220 2221 /* 2222 * Handle M_DATA messages from IP. Its called directly from IP via 2223 * squeue for received IP packets. 2224 * 2225 * The first argument is always the connp/tcp to which the mp belongs. 2226 * There are no exceptions to this rule. The caller has already put 2227 * a reference on this connp/tcp and once tcp_input_data() returns, 2228 * the squeue will do the refrele. 2229 * 2230 * The TH_SYN for the listener directly go to tcp_input_listener via 2231 * squeue. ICMP errors go directly to tcp_icmp_input(). 
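 * A rough sketch of the normal delivery path (illustrative, not an
 * exhaustive call chain): IP classifies the packet to this connp and
 * enters it on the connection's squeue with conn_recv set to
 * tcp_input_data, i.e. something of the form
 *
 *	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_input_data, connp,
 *	    ira, SQ_FILL, ...);
 *
 * The recursive case (sqp == NULL) is tcp_input_data re-feeding itself
 * a carved-up urgent segment further down in this function.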
2232 * 2233 * sqp: NULL = recursive, sqp != NULL means called from squeue 2234 */ 2235 void 2236 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2237 { 2238 int32_t bytes_acked; 2239 int32_t gap; 2240 mblk_t *mp1; 2241 uint_t flags; 2242 uint32_t new_swnd = 0; 2243 uchar_t *iphdr; 2244 uchar_t *rptr; 2245 int32_t rgap; 2246 uint32_t seg_ack; 2247 int seg_len; 2248 uint_t ip_hdr_len; 2249 uint32_t seg_seq; 2250 tcpha_t *tcpha; 2251 int urp; 2252 tcp_opt_t tcpopt; 2253 ip_pkt_t ipp; 2254 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 2255 uint32_t cwnd; 2256 uint32_t add; 2257 int npkt; 2258 int mss; 2259 conn_t *connp = (conn_t *)arg; 2260 squeue_t *sqp = (squeue_t *)arg2; 2261 tcp_t *tcp = connp->conn_tcp; 2262 tcp_stack_t *tcps = tcp->tcp_tcps; 2263 2264 /* 2265 * RST from fused tcp loopback peer should trigger an unfuse. 2266 */ 2267 if (tcp->tcp_fused) { 2268 TCP_STAT(tcps, tcp_fusion_aborted); 2269 tcp_unfuse(tcp); 2270 } 2271 2272 iphdr = mp->b_rptr; 2273 rptr = mp->b_rptr; 2274 ASSERT(OK_32PTR(rptr)); 2275 2276 ip_hdr_len = ira->ira_ip_hdr_length; 2277 if (connp->conn_recv_ancillary.crb_all != 0) { 2278 /* 2279 * Record packet information in the ip_pkt_t 2280 */ 2281 ipp.ipp_fields = 0; 2282 if (ira->ira_flags & IRAF_IS_IPV4) { 2283 (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, 2284 B_FALSE); 2285 } else { 2286 uint8_t nexthdrp; 2287 2288 /* 2289 * IPv6 packets can only be received by applications 2290 * that are prepared to receive IPv6 addresses. 2291 * The IP fanout must ensure this. 2292 */ 2293 ASSERT(connp->conn_family == AF_INET6); 2294 2295 (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, 2296 &nexthdrp); 2297 ASSERT(nexthdrp == IPPROTO_TCP); 2298 2299 /* Could have caused a pullup? */ 2300 iphdr = mp->b_rptr; 2301 rptr = mp->b_rptr; 2302 } 2303 } 2304 ASSERT(DB_TYPE(mp) == M_DATA); 2305 ASSERT(mp->b_next == NULL); 2306 2307 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2308 seg_seq = ntohl(tcpha->tha_seq); 2309 seg_ack = ntohl(tcpha->tha_ack); 2310 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 2311 seg_len = (int)(mp->b_wptr - rptr) - 2312 (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); 2313 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { 2314 do { 2315 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 2316 (uintptr_t)INT_MAX); 2317 seg_len += (int)(mp1->b_wptr - mp1->b_rptr); 2318 } while ((mp1 = mp1->b_cont) != NULL && 2319 mp1->b_datap->db_type == M_DATA); 2320 } 2321 2322 DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa, 2323 __dtrace_tcp_void_ip_t *, iphdr, tcp_t *, tcp, 2324 __dtrace_tcp_tcph_t *, tcpha); 2325 2326 if (tcp->tcp_state == TCPS_TIME_WAIT) { 2327 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 2328 seg_len, tcpha, ira); 2329 return; 2330 } 2331 2332 if (sqp != NULL) { 2333 /* 2334 * This is the correct place to update tcp_last_recv_time. Note 2335 * that it is also updated for tcp structure that belongs to 2336 * global and listener queues which do not really need updating. 2337 * But that should not cause any harm. And it is updated for 2338 * all kinds of incoming segments, not only for data segments. 
2339 */ 2340 tcp->tcp_last_recv_time = LBOLT_FASTPATH; 2341 } 2342 2343 flags = (unsigned int)tcpha->tha_flags & 0xFF; 2344 2345 BUMP_LOCAL(tcp->tcp_ibsegs); 2346 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 2347 2348 if ((flags & TH_URG) && sqp != NULL) { 2349 /* 2350 * TCP can't handle urgent pointers that arrive before 2351 * the connection has been accept()ed since it can't 2352 * buffer OOB data. Discard segment if this happens. 2353 * 2354 * We can't just rely on a non-null tcp_listener to indicate 2355 * that the accept() has completed since unlinking of the 2356 * eager and completion of the accept are not atomic. 2357 * tcp_detached, when it is not set (B_FALSE) indicates 2358 * that the accept() has completed. 2359 * 2360 * Nor can it reassemble urgent pointers, so discard 2361 * if it's not the next segment expected. 2362 * 2363 * Otherwise, collapse chain into one mblk (discard if 2364 * that fails). This makes sure the headers, retransmitted 2365 * data, and new data all are in the same mblk. 2366 */ 2367 ASSERT(mp != NULL); 2368 if (tcp->tcp_detached || !pullupmsg(mp, -1)) { 2369 freemsg(mp); 2370 return; 2371 } 2372 /* Update pointers into message */ 2373 iphdr = rptr = mp->b_rptr; 2374 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2375 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { 2376 /* 2377 * Since we can't handle any data with this urgent 2378 * pointer that is out of sequence, we expunge 2379 * the data. This allows us to still register 2380 * the urgent mark and generate the M_PCSIG, 2381 * which we can do. 2382 */ 2383 mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); 2384 seg_len = 0; 2385 } 2386 } 2387 2388 switch (tcp->tcp_state) { 2389 case TCPS_SYN_SENT: 2390 if (connp->conn_final_sqp == NULL && 2391 tcp_outbound_squeue_switch && sqp != NULL) { 2392 ASSERT(connp->conn_initial_sqp == connp->conn_sqp); 2393 connp->conn_final_sqp = sqp; 2394 if (connp->conn_final_sqp != connp->conn_sqp) { 2395 DTRACE_PROBE1(conn__final__sqp__switch, 2396 conn_t *, connp); 2397 CONN_INC_REF(connp); 2398 SQUEUE_SWITCH(connp, connp->conn_final_sqp); 2399 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 2400 tcp_input_data, connp, ira, ip_squeue_flag, 2401 SQTAG_CONNECT_FINISH); 2402 return; 2403 } 2404 DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); 2405 } 2406 if (flags & TH_ACK) { 2407 /* 2408 * Note that our stack cannot send data before a 2409 * connection is established, therefore the 2410 * following check is valid. Otherwise, it has 2411 * to be changed. 2412 */ 2413 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 2414 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2415 freemsg(mp); 2416 if (flags & TH_RST) 2417 return; 2418 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 2419 tcp, seg_ack, 0, TH_RST); 2420 return; 2421 } 2422 ASSERT(tcp->tcp_suna + 1 == seg_ack); 2423 } 2424 if (flags & TH_RST) { 2425 if (flags & TH_ACK) { 2426 DTRACE_TCP5(connect__refused, mblk_t *, NULL, 2427 ip_xmit_attr_t *, connp->conn_ixa, 2428 void_ip_t *, iphdr, tcp_t *, tcp, 2429 tcph_t *, tcpha); 2430 (void) tcp_clean_death(tcp, ECONNREFUSED); 2431 } 2432 freemsg(mp); 2433 return; 2434 } 2435 if (!(flags & TH_SYN)) { 2436 freemsg(mp); 2437 return; 2438 } 2439 2440 /* Process all TCP options. */ 2441 tcp_process_options(tcp, tcpha); 2442 /* 2443 * The following changes our rwnd to be a multiple of the 2444 * MIN(peer MSS, our MSS) for performance reason. 2445 */ 2446 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, 2447 tcp->tcp_mss)); 2448 2449 /* Is the other end ECN capable? 
*/ 2450 if (tcp->tcp_ecn_ok) { 2451 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 2452 tcp->tcp_ecn_ok = B_FALSE; 2453 } 2454 } 2455 /* 2456 * Clear ECN flags because it may interfere with later 2457 * processing. 2458 */ 2459 flags &= ~(TH_ECE|TH_CWR); 2460 2461 tcp->tcp_irs = seg_seq; 2462 tcp->tcp_rack = seg_seq; 2463 tcp->tcp_rnxt = seg_seq + 1; 2464 tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); 2465 if (!TCP_IS_DETACHED(tcp)) { 2466 /* Allocate room for SACK options if needed. */ 2467 connp->conn_wroff = connp->conn_ht_iphc_len; 2468 if (tcp->tcp_snd_sack_ok) 2469 connp->conn_wroff += TCPOPT_MAX_SACK_LEN; 2470 if (!tcp->tcp_loopback) 2471 connp->conn_wroff += tcps->tcps_wroff_xtra; 2472 2473 (void) proto_set_tx_wroff(connp->conn_rq, connp, 2474 connp->conn_wroff); 2475 } 2476 if (flags & TH_ACK) { 2477 /* 2478 * If we can't get the confirmation upstream, pretend 2479 * we didn't even see this one. 2480 * 2481 * XXX: how can we pretend we didn't see it if we 2482 * have updated rnxt et. al. 2483 * 2484 * For loopback we defer sending up the T_CONN_CON 2485 * until after some checks below. 2486 */ 2487 mp1 = NULL; 2488 /* 2489 * tcp_sendmsg() checks tcp_state without entering 2490 * the squeue so tcp_state should be updated before 2491 * sending up connection confirmation. Probe the 2492 * state change below when we are sure the connection 2493 * confirmation has been sent. 2494 */ 2495 tcp->tcp_state = TCPS_ESTABLISHED; 2496 if (!tcp_conn_con(tcp, iphdr, mp, 2497 tcp->tcp_loopback ? &mp1 : NULL, ira)) { 2498 tcp->tcp_state = TCPS_SYN_SENT; 2499 freemsg(mp); 2500 return; 2501 } 2502 TCPS_CONN_INC(tcps); 2503 /* SYN was acked - making progress */ 2504 tcp->tcp_ip_forward_progress = B_TRUE; 2505 2506 /* One for the SYN */ 2507 tcp->tcp_suna = tcp->tcp_iss + 1; 2508 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 2509 2510 /* 2511 * If SYN was retransmitted, need to reset all 2512 * retransmission info. This is because this 2513 * segment will be treated as a dup ACK. 2514 */ 2515 if (tcp->tcp_rexmit) { 2516 tcp->tcp_rexmit = B_FALSE; 2517 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2518 tcp->tcp_rexmit_max = tcp->tcp_snxt; 2519 tcp->tcp_snd_burst = tcp->tcp_localnet ? 2520 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 2521 tcp->tcp_ms_we_have_waited = 0; 2522 2523 /* 2524 * Set tcp_cwnd back to 1 MSS, per 2525 * recommendation from 2526 * draft-floyd-incr-init-win-01.txt, 2527 * Increasing TCP's Initial Window. 2528 */ 2529 tcp->tcp_cwnd = tcp->tcp_mss; 2530 } 2531 2532 tcp->tcp_swl1 = seg_seq; 2533 tcp->tcp_swl2 = seg_ack; 2534 2535 new_swnd = ntohs(tcpha->tha_win); 2536 tcp->tcp_swnd = new_swnd; 2537 if (new_swnd > tcp->tcp_max_swnd) 2538 tcp->tcp_max_swnd = new_swnd; 2539 2540 /* 2541 * Always send the three-way handshake ack immediately 2542 * in order to make the connection complete as soon as 2543 * possible on the accepting host. 2544 */ 2545 flags |= TH_ACK_NEEDED; 2546 2547 /* 2548 * Trace connect-established here. 2549 */ 2550 DTRACE_TCP5(connect__established, mblk_t *, NULL, 2551 ip_xmit_attr_t *, tcp->tcp_connp->conn_ixa, 2552 void_ip_t *, iphdr, tcp_t *, tcp, tcph_t *, tcpha); 2553 2554 /* Trace change from SYN_SENT -> ESTABLISHED here */ 2555 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 2556 connp->conn_ixa, void, NULL, tcp_t *, tcp, 2557 void, NULL, int32_t, TCPS_SYN_SENT); 2558 2559 /* 2560 * Special case for loopback. At this point we have 2561 * received SYN-ACK from the remote endpoint. 
In 2562 * order to ensure that both endpoints reach the 2563 * fused state prior to any data exchange, the final 2564 * ACK needs to be sent before we indicate T_CONN_CON 2565 * to the module upstream. 2566 */ 2567 if (tcp->tcp_loopback) { 2568 mblk_t *ack_mp; 2569 2570 ASSERT(!tcp->tcp_unfusable); 2571 ASSERT(mp1 != NULL); 2572 /* 2573 * For loopback, we always get a pure SYN-ACK 2574 * and only need to send back the final ACK 2575 * with no data (this is because the other 2576 * tcp is ours and we don't do T/TCP). This 2577 * final ACK triggers the passive side to 2578 * perform fusion in ESTABLISHED state. 2579 */ 2580 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { 2581 if (tcp->tcp_ack_tid != 0) { 2582 (void) TCP_TIMER_CANCEL(tcp, 2583 tcp->tcp_ack_tid); 2584 tcp->tcp_ack_tid = 0; 2585 } 2586 tcp_send_data(tcp, ack_mp); 2587 BUMP_LOCAL(tcp->tcp_obsegs); 2588 TCPS_BUMP_MIB(tcps, tcpOutAck); 2589 2590 if (!IPCL_IS_NONSTR(connp)) { 2591 /* Send up T_CONN_CON */ 2592 if (ira->ira_cred != NULL) { 2593 mblk_setcred(mp1, 2594 ira->ira_cred, 2595 ira->ira_cpid); 2596 } 2597 putnext(connp->conn_rq, mp1); 2598 } else { 2599 (*connp->conn_upcalls-> 2600 su_connected) 2601 (connp->conn_upper_handle, 2602 tcp->tcp_connid, 2603 ira->ira_cred, 2604 ira->ira_cpid); 2605 freemsg(mp1); 2606 } 2607 2608 freemsg(mp); 2609 return; 2610 } 2611 /* 2612 * Forget fusion; we need to handle more 2613 * complex cases below. Send the deferred 2614 * T_CONN_CON message upstream and proceed 2615 * as usual. Mark this tcp as not capable 2616 * of fusion. 2617 */ 2618 TCP_STAT(tcps, tcp_fusion_unfusable); 2619 tcp->tcp_unfusable = B_TRUE; 2620 if (!IPCL_IS_NONSTR(connp)) { 2621 if (ira->ira_cred != NULL) { 2622 mblk_setcred(mp1, ira->ira_cred, 2623 ira->ira_cpid); 2624 } 2625 putnext(connp->conn_rq, mp1); 2626 } else { 2627 (*connp->conn_upcalls->su_connected) 2628 (connp->conn_upper_handle, 2629 tcp->tcp_connid, ira->ira_cred, 2630 ira->ira_cpid); 2631 freemsg(mp1); 2632 } 2633 } 2634 2635 /* 2636 * Check to see if there is data to be sent. If 2637 * yes, set the transmit flag. Then check to see 2638 * if received data processing needs to be done. 2639 * If not, go straight to xmit_check. This short 2640 * cut is OK as we don't support T/TCP. 2641 */ 2642 if (tcp->tcp_unsent) 2643 flags |= TH_XMIT_NEEDED; 2644 2645 if (seg_len == 0 && !(flags & TH_URG)) { 2646 freemsg(mp); 2647 goto xmit_check; 2648 } 2649 2650 flags &= ~TH_SYN; 2651 seg_seq++; 2652 break; 2653 } 2654 tcp->tcp_state = TCPS_SYN_RCVD; 2655 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 2656 connp->conn_ixa, void_ip_t *, NULL, tcp_t *, tcp, 2657 tcph_t *, NULL, int32_t, TCPS_SYN_SENT); 2658 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 2659 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 2660 if (mp1 != NULL) { 2661 tcp_send_data(tcp, mp1); 2662 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2663 } 2664 freemsg(mp); 2665 return; 2666 case TCPS_SYN_RCVD: 2667 if (flags & TH_ACK) { 2668 /* 2669 * In this state, a SYN|ACK packet is either bogus 2670 * because the other side must be ACKing our SYN which 2671 * indicates it has seen the ACK for their SYN and 2672 * shouldn't retransmit it or we're crossing SYNs 2673 * on active open. 2674 */ 2675 if ((flags & TH_SYN) && !tcp->tcp_active_open) { 2676 freemsg(mp); 2677 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", 2678 tcp, seg_ack, 0, TH_RST); 2679 return; 2680 } 2681 /* 2682 * NOTE: RFC 793 pg. 
72 says this should be 2683 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt 2684 * but that would mean we have an ack that ignored 2685 * our SYN. 2686 */ 2687 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || 2688 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2689 freemsg(mp); 2690 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 2691 tcp, seg_ack, 0, TH_RST); 2692 return; 2693 } 2694 /* 2695 * No sane TCP stack will send such a small window 2696 * without receiving any data. Just drop this invalid 2697 * ACK. We also shorten the abort timeout in case 2698 * this is an attack. 2699 */ 2700 if ((ntohs(tcpha->tha_win) << tcp->tcp_snd_ws) < 2701 (tcp->tcp_mss >> tcp_init_wnd_shft)) { 2702 freemsg(mp); 2703 TCP_STAT(tcps, tcp_zwin_ack_syn); 2704 tcp->tcp_second_ctimer_threshold = 2705 tcp_early_abort * SECONDS; 2706 return; 2707 } 2708 } 2709 break; 2710 case TCPS_LISTEN: 2711 /* 2712 * Only a TLI listener can come through this path when a 2713 * acceptor is going back to be a listener and a packet 2714 * for the acceptor hits the classifier. For a socket 2715 * listener, this can never happen because a listener 2716 * can never accept connection on itself and hence a 2717 * socket acceptor can not go back to being a listener. 2718 */ 2719 ASSERT(!TCP_IS_SOCKET(tcp)); 2720 /*FALLTHRU*/ 2721 case TCPS_CLOSED: 2722 case TCPS_BOUND: { 2723 conn_t *new_connp; 2724 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 2725 2726 /* 2727 * Don't accept any input on a closed tcp as this TCP logically 2728 * does not exist on the system. Don't proceed further with 2729 * this TCP. For instance, this packet could trigger another 2730 * close of this tcp which would be disastrous for tcp_refcnt. 2731 * tcp_close_detached / tcp_clean_death / tcp_closei_local must 2732 * be called at most once on a TCP. In this case we need to 2733 * refeed the packet into the classifier and figure out where 2734 * the packet should go. 2735 */ 2736 new_connp = ipcl_classify(mp, ira, ipst); 2737 if (new_connp != NULL) { 2738 /* Drops ref on new_connp */ 2739 tcp_reinput(new_connp, mp, ira, ipst); 2740 return; 2741 } 2742 /* We failed to classify. For now just drop the packet */ 2743 freemsg(mp); 2744 return; 2745 } 2746 case TCPS_IDLE: 2747 /* 2748 * Handle the case where the tcp_clean_death() has happened 2749 * on a connection (application hasn't closed yet) but a packet 2750 * was already queued on squeue before tcp_clean_death() 2751 * was processed. Calling tcp_clean_death() twice on same 2752 * connection can result in weird behaviour. 2753 */ 2754 freemsg(mp); 2755 return; 2756 default: 2757 break; 2758 } 2759 2760 /* 2761 * Already on the correct queue/perimeter. 2762 * If this is a detached connection and not an eager 2763 * connection hanging off a listener then new data 2764 * (past the FIN) will cause a reset. 2765 * We do a special check here where it 2766 * is out of the main line, rather than check 2767 * if we are detached every time we see new 2768 * data down below. 2769 */ 2770 if (TCP_IS_DETACHED_NONEAGER(tcp) && 2771 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { 2772 TCPS_BUMP_MIB(tcps, tcpInClosed); 2773 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 2774 freemsg(mp); 2775 tcp_xmit_ctl("new data when detached", tcp, 2776 tcp->tcp_snxt, 0, TH_RST); 2777 (void) tcp_clean_death(tcp, EPROTO); 2778 return; 2779 } 2780 2781 mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); 2782 urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION; 2783 new_swnd = ntohs(tcpha->tha_win) << 2784 ((tcpha->tha_flags & TH_SYN) ? 
0 : tcp->tcp_snd_ws); 2785 2786 if (tcp->tcp_snd_ts_ok) { 2787 if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { 2788 /* 2789 * This segment is not acceptable. 2790 * Drop it and send back an ACK. 2791 */ 2792 freemsg(mp); 2793 flags |= TH_ACK_NEEDED; 2794 goto ack_check; 2795 } 2796 } else if (tcp->tcp_snd_sack_ok) { 2797 tcpopt.tcp = tcp; 2798 /* 2799 * SACK info is already updated in tcp_parse_options. Ignore 2800 * all other TCP options... 2801 */ 2802 (void) tcp_parse_options(tcpha, &tcpopt); 2803 } 2804 try_again:; 2805 mss = tcp->tcp_mss; 2806 gap = seg_seq - tcp->tcp_rnxt; 2807 rgap = tcp->tcp_rwnd - (gap + seg_len); 2808 /* 2809 * gap is the amount of sequence space between what we expect to see 2810 * and what we got for seg_seq. A positive value for gap means 2811 * something got lost. A negative value means we got some old stuff. 2812 */ 2813 if (gap < 0) { 2814 /* Old stuff present. Is the SYN in there? */ 2815 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 2816 (seg_len != 0)) { 2817 flags &= ~TH_SYN; 2818 seg_seq++; 2819 urp--; 2820 /* Recompute the gaps after noting the SYN. */ 2821 goto try_again; 2822 } 2823 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); 2824 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, 2825 (seg_len > -gap ? -gap : seg_len)); 2826 /* Remove the old stuff from seg_len. */ 2827 seg_len += gap; 2828 /* 2829 * Anything left? 2830 * Make sure to check for an unack'd FIN when the rest of the data 2831 * has been previously ack'd. 2832 */ 2833 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 2834 /* 2835 * Resets are only valid if they lie within our offered 2836 * window. If the RST bit is set, we just ignore this 2837 * segment. 2838 */ 2839 if (flags & TH_RST) { 2840 freemsg(mp); 2841 return; 2842 } 2843 2844 /* 2845 * The arrival of dup data packets indicates that we 2846 * may have postponed an ack for too long, or the other 2847 * side's RTT estimate is out of shape. Start acking 2848 * more often. 2849 */ 2850 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && 2851 tcp->tcp_rack_cnt >= 1 && 2852 tcp->tcp_rack_abs_max > 2) { 2853 tcp->tcp_rack_abs_max--; 2854 } 2855 tcp->tcp_rack_cur_max = 1; 2856 2857 /* 2858 * This segment is "unacceptable". None of its 2859 * sequence space lies within our advertised window. 2860 * 2861 * Adjust seg_len to the original value for tracing. 2862 */ 2863 seg_len -= gap; 2864 if (connp->conn_debug) { 2865 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 2866 "tcp_rput: unacceptable, gap %d, rgap %d, " 2867 "flags 0x%x, seg_seq %u, seg_ack %u, " 2868 "seg_len %d, rnxt %u, snxt %u, %s", 2869 gap, rgap, flags, seg_seq, seg_ack, 2870 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 2871 tcp_display(tcp, NULL, 2872 DISP_ADDR_AND_PORT)); 2873 } 2874 2875 /* 2876 * Arrange to send an ACK in response to the 2877 * unacceptable segment per RFC 793 page 69. There 2878 * is only one small difference between ours and the 2879 * acceptability test in the RFC - we accept an ACK-only 2880 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 2881 * will be generated. 2882 * 2883 * Note that we have to ACK an ACK-only packet at least 2884 * for stacks that send 0-length keep-alives with 2885 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 2886 * section 4.2.3.6. As long as we don't ever generate 2887 * an unacceptable packet in response to an incoming 2888 * packet that is unacceptable, it should not cause 2889 * "ACK wars".
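 *
 * For reference, the acceptability test this code mirrors is, in
 * outline:
 *
 *	seg_len == 0, rwnd == 0: acceptable iff seg_seq == rnxt
 *	seg_len == 0, rwnd  > 0: acceptable iff rnxt <= seg_seq < rnxt+rwnd
 *	seg_len  > 0, rwnd == 0: not acceptable
 *	seg_len  > 0, rwnd  > 0: acceptable iff the segment overlaps
 *				 [rnxt, rnxt+rwnd)
 *
 * with the one relaxation noted above for ACK-only segments at
 * rnxt + rwnd.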
2890 */ 2891 flags |= TH_ACK_NEEDED; 2892 2893 /* 2894 * Continue processing this segment in order to use the 2895 * ACK information it contains, but skip all other 2896 * sequence-number processing. Processing the ACK 2897 * information is necessary in order to 2898 * re-synchronize connections that may have lost 2899 * synchronization. 2900 * 2901 * We clear seg_len and flag fields related to 2902 * sequence number processing as they are not 2903 * to be trusted for an unacceptable segment. 2904 */ 2905 seg_len = 0; 2906 flags &= ~(TH_SYN | TH_FIN | TH_URG); 2907 goto process_ack; 2908 } 2909 2910 /* Fix seg_seq, and chew the gap off the front. */ 2911 seg_seq = tcp->tcp_rnxt; 2912 urp += gap; 2913 do { 2914 mblk_t *mp2; 2915 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 2916 (uintptr_t)UINT_MAX); 2917 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 2918 if (gap > 0) { 2919 mp->b_rptr = mp->b_wptr - gap; 2920 break; 2921 } 2922 mp2 = mp; 2923 mp = mp->b_cont; 2924 freeb(mp2); 2925 } while (gap < 0); 2926 /* 2927 * If the urgent data has already been acknowledged, we 2928 * should ignore TH_URG below 2929 */ 2930 if (urp < 0) 2931 flags &= ~TH_URG; 2932 } 2933 /* 2934 * rgap is the amount of stuff received out of window. A negative 2935 * value is the amount out of window. 2936 */ 2937 if (rgap < 0) { 2938 mblk_t *mp2; 2939 2940 if (tcp->tcp_rwnd == 0) { 2941 TCPS_BUMP_MIB(tcps, tcpInWinProbe); 2942 } else { 2943 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs); 2944 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap); 2945 } 2946 2947 /* 2948 * seg_len does not include the FIN, so if more than 2949 * just the FIN is out of window, we act like we don't 2950 * see it. (If just the FIN is out of window, rgap 2951 * will be zero and we will go ahead and acknowledge 2952 * the FIN.) 2953 */ 2954 flags &= ~TH_FIN; 2955 2956 /* Fix seg_len and make sure there is something left. */ 2957 seg_len += rgap; 2958 if (seg_len <= 0) { 2959 /* 2960 * Resets are only valid if they lie within our offered 2961 * window. If the RST bit is set, we just ignore this 2962 * segment. 2963 */ 2964 if (flags & TH_RST) { 2965 freemsg(mp); 2966 return; 2967 } 2968 2969 /* Per RFC 793, we need to send back an ACK. */ 2970 flags |= TH_ACK_NEEDED; 2971 2972 /* 2973 * Send SIGURG as soon as possible i.e. even 2974 * if the TH_URG was delivered in a window probe 2975 * packet (which will be unacceptable). 2976 * 2977 * We generate a signal if none has been generated 2978 * for this connection or if this is a new urgent 2979 * byte. Also send a zero-length "unmarked" message 2980 * to inform SIOCATMARK that this is not the mark. 2981 * 2982 * tcp_urp_last_valid is cleared when the T_exdata_ind 2983 * is sent up. This plus the check for old data 2984 * (gap >= 0) handles the wraparound of the sequence 2985 * number space without having to always track the 2986 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks 2987 * this max in its rcv_up variable). 2988 * 2989 * This prevents duplicate SIGURGS due to a "late" 2990 * zero-window probe when the T_EXDATA_IND has already 2991 * been sent up. 
2992 */ 2993 if ((flags & TH_URG) && 2994 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, 2995 tcp->tcp_urp_last))) { 2996 if (IPCL_IS_NONSTR(connp)) { 2997 if (!TCP_IS_DETACHED(tcp)) { 2998 (*connp->conn_upcalls-> 2999 su_signal_oob) 3000 (connp->conn_upper_handle, 3001 urp); 3002 } 3003 } else { 3004 mp1 = allocb(0, BPRI_MED); 3005 if (mp1 == NULL) { 3006 freemsg(mp); 3007 return; 3008 } 3009 if (!TCP_IS_DETACHED(tcp) && 3010 !putnextctl1(connp->conn_rq, 3011 M_PCSIG, SIGURG)) { 3012 /* Try again on the rexmit. */ 3013 freemsg(mp1); 3014 freemsg(mp); 3015 return; 3016 } 3017 /* 3018 * If the next byte would be the mark 3019 * then mark with MARKNEXT else mark 3020 * with NOTMARKNEXT. 3021 */ 3022 if (gap == 0 && urp == 0) 3023 mp1->b_flag |= MSGMARKNEXT; 3024 else 3025 mp1->b_flag |= MSGNOTMARKNEXT; 3026 freemsg(tcp->tcp_urp_mark_mp); 3027 tcp->tcp_urp_mark_mp = mp1; 3028 flags |= TH_SEND_URP_MARK; 3029 } 3030 tcp->tcp_urp_last_valid = B_TRUE; 3031 tcp->tcp_urp_last = urp + seg_seq; 3032 } 3033 /* 3034 * If this is a zero window probe, continue to 3035 * process the ACK part. But we need to set seg_len 3036 * to 0 to avoid data processing. Otherwise just 3037 * drop the segment and send back an ACK. 3038 */ 3039 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 3040 flags &= ~(TH_SYN | TH_URG); 3041 seg_len = 0; 3042 goto process_ack; 3043 } else { 3044 freemsg(mp); 3045 goto ack_check; 3046 } 3047 } 3048 /* Pitch out of window stuff off the end. */ 3049 rgap = seg_len; 3050 mp2 = mp; 3051 do { 3052 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 3053 (uintptr_t)INT_MAX); 3054 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 3055 if (rgap < 0) { 3056 mp2->b_wptr += rgap; 3057 if ((mp1 = mp2->b_cont) != NULL) { 3058 mp2->b_cont = NULL; 3059 freemsg(mp1); 3060 } 3061 break; 3062 } 3063 } while ((mp2 = mp2->b_cont) != NULL); 3064 } 3065 ok:; 3066 /* 3067 * TCP should check ECN info for segments inside the window only. 3068 * Therefore the check should be done here. 3069 */ 3070 if (tcp->tcp_ecn_ok) { 3071 if (flags & TH_CWR) { 3072 tcp->tcp_ecn_echo_on = B_FALSE; 3073 } 3074 /* 3075 * Note that both ECN_CE and CWR can be set in the 3076 * same segment. In this case, we once again turn 3077 * on ECN_ECHO. 3078 */ 3079 if (connp->conn_ipversion == IPV4_VERSION) { 3080 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; 3081 3082 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 3083 tcp->tcp_ecn_echo_on = B_TRUE; 3084 } 3085 } else { 3086 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; 3087 3088 if ((vcf & htonl(IPH_ECN_CE << 20)) == 3089 htonl(IPH_ECN_CE << 20)) { 3090 tcp->tcp_ecn_echo_on = B_TRUE; 3091 } 3092 } 3093 } 3094 3095 /* 3096 * Check whether we can update tcp_ts_recent. This test is 3097 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 3098 * Extensions for High Performance: An Update", Internet Draft. 3099 */ 3100 if (tcp->tcp_snd_ts_ok && 3101 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 3102 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 3103 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 3104 tcp->tcp_last_rcv_lbolt = LBOLT_FASTPATH64; 3105 } 3106 3107 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 3108 /* 3109 * FIN in an out of order segment. We record this in 3110 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 3111 * Clear the FIN so that any check on FIN flag will fail. 3112 * Remember that FIN also counts in the sequence number 3113 * space. So we need to ack out of order FIN only segments. 
3114 */ 3115 if (flags & TH_FIN) { 3116 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 3117 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 3118 flags &= ~TH_FIN; 3119 flags |= TH_ACK_NEEDED; 3120 } 3121 if (seg_len > 0) { 3122 /* Fill in the SACK blk list. */ 3123 if (tcp->tcp_snd_sack_ok) { 3124 tcp_sack_insert(tcp->tcp_sack_list, 3125 seg_seq, seg_seq + seg_len, 3126 &(tcp->tcp_num_sack_blk)); 3127 } 3128 3129 /* 3130 * Attempt reassembly and see if we have something 3131 * ready to go. 3132 */ 3133 mp = tcp_reass(tcp, mp, seg_seq); 3134 /* Always ack out of order packets */ 3135 flags |= TH_ACK_NEEDED | TH_PUSH; 3136 if (mp) { 3137 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3138 (uintptr_t)INT_MAX); 3139 seg_len = mp->b_cont ? msgdsize(mp) : 3140 (int)(mp->b_wptr - mp->b_rptr); 3141 seg_seq = tcp->tcp_rnxt; 3142 /* 3143 * A gap is filled and the seq num and len 3144 * of the gap match that of a previously 3145 * received FIN, put the FIN flag back in. 3146 */ 3147 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3148 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3149 flags |= TH_FIN; 3150 tcp->tcp_valid_bits &= 3151 ~TCP_OFO_FIN_VALID; 3152 } 3153 if (tcp->tcp_reass_tid != 0) { 3154 (void) TCP_TIMER_CANCEL(tcp, 3155 tcp->tcp_reass_tid); 3156 /* 3157 * Restart the timer if there is still 3158 * data in the reassembly queue. 3159 */ 3160 if (tcp->tcp_reass_head != NULL) { 3161 tcp->tcp_reass_tid = TCP_TIMER( 3162 tcp, tcp_reass_timer, 3163 tcps->tcps_reass_timeout); 3164 } else { 3165 tcp->tcp_reass_tid = 0; 3166 } 3167 } 3168 } else { 3169 /* 3170 * Keep going even with NULL mp. 3171 * There may be a useful ACK or something else 3172 * we don't want to miss. 3173 * 3174 * But TCP should not perform fast retransmit 3175 * because of the ack number. TCP uses 3176 * seg_len == 0 to determine if it is a pure 3177 * ACK. And this is not a pure ACK. 3178 */ 3179 seg_len = 0; 3180 ofo_seg = B_TRUE; 3181 3182 if (tcps->tcps_reass_timeout != 0 && 3183 tcp->tcp_reass_tid == 0) { 3184 tcp->tcp_reass_tid = TCP_TIMER(tcp, 3185 tcp_reass_timer, 3186 tcps->tcps_reass_timeout); 3187 } 3188 } 3189 } 3190 } else if (seg_len > 0) { 3191 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); 3192 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); 3193 /* 3194 * If an out of order FIN was received before, and the seq 3195 * num and len of the new segment match that of the FIN, 3196 * put the FIN flag back in. 3197 */ 3198 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3199 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3200 flags |= TH_FIN; 3201 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3202 } 3203 } 3204 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3205 if (flags & TH_RST) { 3206 freemsg(mp); 3207 switch (tcp->tcp_state) { 3208 case TCPS_SYN_RCVD: 3209 (void) tcp_clean_death(tcp, ECONNREFUSED); 3210 break; 3211 case TCPS_ESTABLISHED: 3212 case TCPS_FIN_WAIT_1: 3213 case TCPS_FIN_WAIT_2: 3214 case TCPS_CLOSE_WAIT: 3215 (void) tcp_clean_death(tcp, ECONNRESET); 3216 break; 3217 case TCPS_CLOSING: 3218 case TCPS_LAST_ACK: 3219 (void) tcp_clean_death(tcp, 0); 3220 break; 3221 default: 3222 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 3223 (void) tcp_clean_death(tcp, ENXIO); 3224 break; 3225 } 3226 return; 3227 } 3228 if (flags & TH_SYN) { 3229 /* 3230 * See RFC 793, Page 71 3231 * 3232 * The seq number must be in the window as it should 3233 * be "fixed" above. If it is outside window, it should 3234 * be already rejected. 
Note that we allow seg_seq to be 3235 * rnxt + rwnd because we want to accept a 0 window probe. 3236 */ 3237 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3238 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3239 freemsg(mp); 3240 /* 3241 * If the ACK flag is not set, just use our snxt as the 3242 * seq number of the RST segment. 3243 */ 3244 if (!(flags & TH_ACK)) { 3245 seg_ack = tcp->tcp_snxt; 3246 } 3247 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 3248 TH_RST|TH_ACK); 3249 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 3250 (void) tcp_clean_death(tcp, ECONNRESET); 3251 return; 3252 } 3253 /* 3254 * urp could be -1 when the urp field in the packet is 0 3255 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent 3256 * byte was at seg_seq - 1, in which case we ignore the urgent flag. 3257 */ 3258 if (flags & TH_URG && urp >= 0) { 3259 if (!tcp->tcp_urp_last_valid || 3260 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { 3261 /* 3262 * Non-STREAMS sockets handle the urgent data a little 3263 * differently from STREAMS based sockets. There is no 3264 * need to mark any mblks with the MSG{NOT,}MARKNEXT 3265 * flags to keep SIOCATMARK happy. Instead a 3266 * su_signal_oob upcall is made to update the mark. 3267 * Nor does a T_EXDATA_IND mblk need to be 3268 * prepended to the urgent data. The urgent data is 3269 * delivered using the su_recv upcall, where we set 3270 * the MSG_OOB flag to indicate that it is urgent data. 3271 * 3272 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED 3273 * are used by non-STREAMS sockets. 3274 */ 3275 if (IPCL_IS_NONSTR(connp)) { 3276 if (!TCP_IS_DETACHED(tcp)) { 3277 (*connp->conn_upcalls->su_signal_oob) 3278 (connp->conn_upper_handle, urp); 3279 } 3280 } else { 3281 /* 3282 * If we haven't generated the signal yet for 3283 * this urgent pointer value, do it now. Also, 3284 * send up a zero-length M_DATA indicating 3285 * whether or not this is the mark. The latter 3286 * is not needed when a T_EXDATA_IND is sent up. 3287 * However, if there are allocation failures 3288 * this code relies on the sender retransmitting 3289 * and the socket code for determining the mark 3290 * should not block waiting for the peer to 3291 * transmit. Thus, for simplicity we always 3292 * send up the mark indication. 3293 */ 3294 mp1 = allocb(0, BPRI_MED); 3295 if (mp1 == NULL) { 3296 freemsg(mp); 3297 return; 3298 } 3299 if (!TCP_IS_DETACHED(tcp) && 3300 !putnextctl1(connp->conn_rq, M_PCSIG, 3301 SIGURG)) { 3302 /* Try again on the rexmit. */ 3303 freemsg(mp1); 3304 freemsg(mp); 3305 return; 3306 } 3307 /* 3308 * Mark with NOTMARKNEXT for now. 3309 * The code below will change this to MARKNEXT 3310 * if we are at the mark. 3311 * 3312 * If there are allocation failures (e.g. in 3313 * dupmsg below) the next time tcp_input_data 3314 * sees the urgent segment it will send up the 3315 * MSGMARKNEXT message.
3316 */ 3317 mp1->b_flag |= MSGNOTMARKNEXT; 3318 freemsg(tcp->tcp_urp_mark_mp); 3319 tcp->tcp_urp_mark_mp = mp1; 3320 flags |= TH_SEND_URP_MARK; 3321 #ifdef DEBUG 3322 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3323 "tcp_rput: sent M_PCSIG 2 seq %x urp %x " 3324 "last %x, %s", 3325 seg_seq, urp, tcp->tcp_urp_last, 3326 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 3327 #endif /* DEBUG */ 3328 } 3329 tcp->tcp_urp_last_valid = B_TRUE; 3330 tcp->tcp_urp_last = urp + seg_seq; 3331 } else if (tcp->tcp_urp_mark_mp != NULL) { 3332 /* 3333 * An allocation failure prevented the previous 3334 * tcp_input_data from sending up the allocated 3335 * MSG*MARKNEXT message - send it up this time 3336 * around. 3337 */ 3338 flags |= TH_SEND_URP_MARK; 3339 } 3340 3341 /* 3342 * If the urgent byte is in this segment, make sure that it is 3343 * all by itself. This makes it much easier to deal with the 3344 * possibility of an allocation failure on the T_exdata_ind. 3345 * Note that seg_len is the number of bytes in the segment, and 3346 * urp is the offset into the segment of the urgent byte. 3347 * urp < seg_len means that the urgent byte is in this segment. 3348 */ 3349 if (urp < seg_len) { 3350 if (seg_len != 1) { 3351 uint32_t tmp_rnxt; 3352 /* 3353 * Break it up and feed it back in. 3354 * Re-attach the IP header. 3355 */ 3356 mp->b_rptr = iphdr; 3357 if (urp > 0) { 3358 /* 3359 * There is stuff before the urgent 3360 * byte. 3361 */ 3362 mp1 = dupmsg(mp); 3363 if (!mp1) { 3364 /* 3365 * Trim from urgent byte on. 3366 * The rest will come back. 3367 */ 3368 (void) adjmsg(mp, 3369 urp - seg_len); 3370 tcp_input_data(connp, 3371 mp, NULL, ira); 3372 return; 3373 } 3374 (void) adjmsg(mp1, urp - seg_len); 3375 /* Feed this piece back in. */ 3376 tmp_rnxt = tcp->tcp_rnxt; 3377 tcp_input_data(connp, mp1, NULL, ira); 3378 /* 3379 * If the data passed back in was not 3380 * processed (ie: bad ACK) sending 3381 * the remainder back in will cause a 3382 * loop. In this case, drop the 3383 * packet and let the sender try 3384 * sending a good packet. 3385 */ 3386 if (tmp_rnxt == tcp->tcp_rnxt) { 3387 freemsg(mp); 3388 return; 3389 } 3390 } 3391 if (urp != seg_len - 1) { 3392 uint32_t tmp_rnxt; 3393 /* 3394 * There is stuff after the urgent 3395 * byte. 3396 */ 3397 mp1 = dupmsg(mp); 3398 if (!mp1) { 3399 /* 3400 * Trim everything beyond the 3401 * urgent byte. The rest will 3402 * come back. 3403 */ 3404 (void) adjmsg(mp, 3405 urp + 1 - seg_len); 3406 tcp_input_data(connp, 3407 mp, NULL, ira); 3408 return; 3409 } 3410 (void) adjmsg(mp1, urp + 1 - seg_len); 3411 tmp_rnxt = tcp->tcp_rnxt; 3412 tcp_input_data(connp, mp1, NULL, ira); 3413 /* 3414 * If the data passed back in was not 3415 * processed (ie: bad ACK) sending 3416 * the remainder back in will cause a 3417 * loop. In this case, drop the 3418 * packet and let the sender try 3419 * sending a good packet. 3420 */ 3421 if (tmp_rnxt == tcp->tcp_rnxt) { 3422 freemsg(mp); 3423 return; 3424 } 3425 } 3426 tcp_input_data(connp, mp, NULL, ira); 3427 return; 3428 } 3429 /* 3430 * This segment contains only the urgent byte. We 3431 * have to allocate the T_exdata_ind, if we can. 3432 */ 3433 if (IPCL_IS_NONSTR(connp)) { 3434 int error; 3435 3436 (*connp->conn_upcalls->su_recv) 3437 (connp->conn_upper_handle, mp, seg_len, 3438 MSG_OOB, &error, NULL); 3439 /* 3440 * We should never be in middle of a 3441 * fallback, the squeue guarantees that. 
3442 */ 3443 ASSERT(error != EOPNOTSUPP); 3444 mp = NULL; 3445 goto update_ack; 3446 } else if (!tcp->tcp_urp_mp) { 3447 struct T_exdata_ind *tei; 3448 mp1 = allocb(sizeof (struct T_exdata_ind), 3449 BPRI_MED); 3450 if (!mp1) { 3451 /* 3452 * Sigh... It'll be back. 3453 * Generate any MSG*MARK message now. 3454 */ 3455 freemsg(mp); 3456 seg_len = 0; 3457 if (flags & TH_SEND_URP_MARK) { 3458 3459 3460 ASSERT(tcp->tcp_urp_mark_mp); 3461 tcp->tcp_urp_mark_mp->b_flag &= 3462 ~MSGNOTMARKNEXT; 3463 tcp->tcp_urp_mark_mp->b_flag |= 3464 MSGMARKNEXT; 3465 } 3466 goto ack_check; 3467 } 3468 mp1->b_datap->db_type = M_PROTO; 3469 tei = (struct T_exdata_ind *)mp1->b_rptr; 3470 tei->PRIM_type = T_EXDATA_IND; 3471 tei->MORE_flag = 0; 3472 mp1->b_wptr = (uchar_t *)&tei[1]; 3473 tcp->tcp_urp_mp = mp1; 3474 #ifdef DEBUG 3475 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3476 "tcp_rput: allocated exdata_ind %s", 3477 tcp_display(tcp, NULL, 3478 DISP_PORT_ONLY)); 3479 #endif /* DEBUG */ 3480 /* 3481 * There is no need to send a separate MSG*MARK 3482 * message since the T_EXDATA_IND will be sent 3483 * now. 3484 */ 3485 flags &= ~TH_SEND_URP_MARK; 3486 freemsg(tcp->tcp_urp_mark_mp); 3487 tcp->tcp_urp_mark_mp = NULL; 3488 } 3489 /* 3490 * Now we are all set. On the next putnext upstream, 3491 * tcp_urp_mp will be non-NULL and will get prepended 3492 * to what has to be this piece containing the urgent 3493 * byte. If for any reason we abort this segment below, 3494 * if it comes back, we will have this ready, or it 3495 * will get blown off in close. 3496 */ 3497 } else if (urp == seg_len) { 3498 /* 3499 * The urgent byte is the next byte after this sequence 3500 * number. If this endpoint is non-STREAMS, then there 3501 * is nothing to do here since the socket has already 3502 * been notified about the urgent pointer by the 3503 * su_signal_oob call above. 3504 * 3505 * In the case of STREAMS, some more work might be needed. 3506 * If there is data it is marked with MSGMARKNEXT and 3507 * any tcp_urp_mark_mp is discarded since it is not 3508 * needed. Otherwise, if the code above just allocated 3509 * a zero-length tcp_urp_mark_mp message, that message 3510 * is tagged with MSGMARKNEXT. Sending up these 3511 * MSGMARKNEXT messages makes SIOCATMARK work correctly 3512 * even though the T_EXDATA_IND will not be sent up 3513 * until the urgent byte arrives.
3514 */ 3515 if (!IPCL_IS_NONSTR(tcp->tcp_connp)) { 3516 if (seg_len != 0) { 3517 flags |= TH_MARKNEXT_NEEDED; 3518 freemsg(tcp->tcp_urp_mark_mp); 3519 tcp->tcp_urp_mark_mp = NULL; 3520 flags &= ~TH_SEND_URP_MARK; 3521 } else if (tcp->tcp_urp_mark_mp != NULL) { 3522 flags |= TH_SEND_URP_MARK; 3523 tcp->tcp_urp_mark_mp->b_flag &= 3524 ~MSGNOTMARKNEXT; 3525 tcp->tcp_urp_mark_mp->b_flag |= 3526 MSGMARKNEXT; 3527 } 3528 } 3529 #ifdef DEBUG 3530 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3531 "tcp_rput: AT MARK, len %d, flags 0x%x, %s", 3532 seg_len, flags, 3533 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 3534 #endif /* DEBUG */ 3535 } 3536 #ifdef DEBUG 3537 else { 3538 /* Data left until we hit mark */ 3539 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3540 "tcp_rput: URP %d bytes left, %s", 3541 urp - seg_len, tcp_display(tcp, NULL, 3542 DISP_PORT_ONLY)); 3543 } 3544 #endif /* DEBUG */ 3545 } 3546 3547 process_ack: 3548 if (!(flags & TH_ACK)) { 3549 freemsg(mp); 3550 goto xmit_check; 3551 } 3552 } 3553 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3554 3555 if (bytes_acked > 0) 3556 tcp->tcp_ip_forward_progress = B_TRUE; 3557 if (tcp->tcp_state == TCPS_SYN_RCVD) { 3558 /* 3559 * tcp_sendmsg() checks tcp_state without entering 3560 * the squeue so tcp_state should be updated before 3561 * sending up a connection confirmation or a new 3562 * connection indication. 3563 */ 3564 tcp->tcp_state = TCPS_ESTABLISHED; 3565 3566 /* 3567 * We are seeing the final ack in the three-way 3568 * handshake of an actively opened connection, 3569 * so we must send up a T_CONN_CON. 3570 */ 3571 if (tcp->tcp_active_open) { 3572 if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { 3573 freemsg(mp); 3574 tcp->tcp_state = TCPS_SYN_RCVD; 3575 return; 3576 } 3577 /* 3578 * Don't fuse the loopback endpoints for 3579 * simultaneous active opens. 3580 */ 3581 if (tcp->tcp_loopback) { 3582 TCP_STAT(tcps, tcp_fusion_unfusable); 3583 tcp->tcp_unfusable = B_TRUE; 3584 } 3585 /* 3586 * For simultaneous active open, trace receipt of final 3587 * ACK as tcp:::connect-established. 3588 */ 3589 DTRACE_TCP5(connect__established, mblk_t *, NULL, 3590 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, 3591 iphdr, tcp_t *, tcp, tcph_t *, tcpha); 3592 } else if (IPCL_IS_NONSTR(connp)) { 3593 /* 3594 * 3-way handshake has completed, so notify the socket 3595 * of the new connection. 3596 * 3597 * Being here means the eager is fine, but it can 3598 * get a TH_RST at any point between now and when the 3599 * accept completes, and then disappear. We need to 3600 * ensure that the reference to the eager remains valid 3601 * after we leave the eager's perimeter. So we do 3602 * an extra refhold. 3603 */ 3604 CONN_INC_REF(connp); 3605 3606 if (!tcp_newconn_notify(tcp, ira)) { 3607 freemsg(mp); 3608 /* notification did not go up, so drop ref */ 3609 CONN_DEC_REF(connp); 3610 return; 3611 } 3612 /* 3613 * For passive open, trace receipt of final ACK as 3614 * tcp:::accept-established. 3615 */ 3616 DTRACE_TCP5(accept__established, mblk_t *, NULL, 3617 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, 3618 iphdr, tcp_t *, tcp, tcph_t *, tcpha); 3619 } else { 3620 /* 3621 * 3-way handshake complete - this is a STREAMS based 3622 * socket, so pass up the T_CONN_IND.
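 *
 * The T_CONN_IND mblk used below was allocated earlier, when the
 * SYN was processed, and stashed in tcp_eager_conn_ind; all that
 * happens now, on the final ACK, is handing it to the listener's
 * squeue (directly, if the eager shares that squeue) so the
 * pending accept can complete.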
3623 */ 3624 tcp_t *listener = tcp->tcp_listener; 3625 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; 3626 3627 tcp->tcp_tconnind_started = B_TRUE; 3628 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 3629 ASSERT(mp != NULL); 3630 /* 3631 * Being here means the eager is fine, but it can 3632 * still get a TH_RST at any point between now and 3633 * when the accept completes, and then disappear. We 3634 * need to ensure that the reference to the eager is 3635 * valid after we get out of the eager's perimeter. 3636 * So we do an extra refhold. 3637 */ 3638 CONN_INC_REF(connp); 3639 3640 /* 3641 * The listener also exists because of the refhold 3642 * done in tcp_input_listener. It's possible that it 3643 * might have closed. We will check that once we 3644 * get inside the listener's context. 3645 */ 3646 CONN_INC_REF(listener->tcp_connp); 3647 if (listener->tcp_connp->conn_sqp == 3648 connp->conn_sqp) { 3649 /* 3650 * We optimize by not calling an SQUEUE_ENTER 3651 * on the listener since we know that the 3652 * listener and eager squeues are the same. 3653 * We are able to make this check safely only 3654 * because neither the eager nor the listener 3655 * can change its squeue. Only an active connect 3656 * can change its squeue. 3657 */ 3658 tcp_send_conn_ind(listener->tcp_connp, mp, 3659 listener->tcp_connp->conn_sqp); 3660 CONN_DEC_REF(listener->tcp_connp); 3661 } else if (!tcp->tcp_loopback) { 3662 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 3663 mp, tcp_send_conn_ind, 3664 listener->tcp_connp, NULL, SQ_FILL, 3665 SQTAG_TCP_CONN_IND); 3666 } else { 3667 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 3668 mp, tcp_send_conn_ind, 3669 listener->tcp_connp, NULL, SQ_NODRAIN, 3670 SQTAG_TCP_CONN_IND); 3671 } 3672 /* 3673 * For passive open, trace receipt of final ACK as 3674 * tcp:::accept-established. 3675 */ 3676 DTRACE_TCP5(accept__established, mblk_t *, NULL, 3677 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, 3678 iphdr, tcp_t *, tcp, tcph_t *, tcpha); 3679 } 3680 TCPS_CONN_INC(tcps); 3681 3682 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 3683 bytes_acked--; 3684 /* SYN was acked - making progress */ 3685 tcp->tcp_ip_forward_progress = B_TRUE; 3686 3687 /* 3688 * If SYN was retransmitted, need to reset all 3689 * retransmission info as this segment will be 3690 * treated as a dup ACK. 3691 */ 3692 if (tcp->tcp_rexmit) { 3693 tcp->tcp_rexmit = B_FALSE; 3694 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3695 tcp->tcp_rexmit_max = tcp->tcp_snxt; 3696 tcp->tcp_snd_burst = tcp->tcp_localnet ? 3697 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 3698 tcp->tcp_ms_we_have_waited = 0; 3699 tcp->tcp_cwnd = mss; 3700 } 3701 3702 /* 3703 * We set the send window to zero here. 3704 * This is needed if there is data to be 3705 * processed already on the queue. 3706 * Later (at the swnd_update label), the 3707 * "new_swnd > tcp_swnd" condition is satisfied 3708 * and the XMIT_NEEDED flag is set in the current 3709 * (SYN_RCVD) state. This ensures tcp_wput_data() is 3710 * called if there is already data on queue in 3711 * this state.
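 *
 * As a purely illustrative example: if the peer's final ACK
 * advertises a 32768-byte window, zeroing tcp_swnd here makes the
 * "new_swnd > tcp_swnd" test at swnd_update succeed, so
 * TH_XMIT_NEEDED is set and the queued data is sent right away.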
3712 */ 3713 tcp->tcp_swnd = 0; 3714 3715 if (new_swnd > tcp->tcp_max_swnd) 3716 tcp->tcp_max_swnd = new_swnd; 3717 tcp->tcp_swl1 = seg_seq; 3718 tcp->tcp_swl2 = seg_ack; 3719 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 3720 3721 /* Trace change from SYN_RCVD -> ESTABLISHED here */ 3722 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 3723 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 3724 int32_t, TCPS_SYN_RCVD); 3725 3726 /* Fuse when both sides are in ESTABLISHED state */ 3727 if (tcp->tcp_loopback && do_tcp_fusion) 3728 tcp_fuse(tcp, iphdr, tcpha); 3729 3730 } 3731 /* This code follows 4.4BSD-Lite2 mostly. */ 3732 if (bytes_acked < 0) 3733 goto est; 3734 3735 /* 3736 * If TCP is ECN capable and the congestion experience bit is 3737 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 3738 * done once per window (or more loosely, per RTT). 3739 */ 3740 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 3741 tcp->tcp_cwr = B_FALSE; 3742 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 3743 if (!tcp->tcp_cwr) { 3744 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; 3745 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 3746 tcp->tcp_cwnd = npkt * mss; 3747 /* 3748 * If the cwnd is 0, use the timer to clock out 3749 * new segments. This is required by the ECN spec. 3750 */ 3751 if (npkt == 0) { 3752 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3753 /* 3754 * This makes sure that when the ACK comes 3755 * back, we will increase tcp_cwnd by 1 MSS. 3756 */ 3757 tcp->tcp_cwnd_cnt = 0; 3758 } 3759 tcp->tcp_cwr = B_TRUE; 3760 /* 3761 * This marks the end of the current window of in 3762 * flight data. That is why we don't use 3763 * tcp_suna + tcp_swnd. Only data in flight can 3764 * provide ECN info. 3765 */ 3766 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3767 tcp->tcp_ecn_cwr_sent = B_FALSE; 3768 } 3769 } 3770 3771 mp1 = tcp->tcp_xmit_head; 3772 if (bytes_acked == 0) { 3773 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 3774 int dupack_cnt; 3775 3776 TCPS_BUMP_MIB(tcps, tcpInDupAck); 3777 /* 3778 * Fast retransmit. When we have seen exactly three 3779 * identical ACKs while we have unacked data 3780 * outstanding we take it as a hint that our peer 3781 * dropped something. 3782 * 3783 * If TCP is retransmitting, don't do fast retransmit. 3784 */ 3785 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && 3786 ! tcp->tcp_rexmit) { 3787 /* Do Limited Transmit */ 3788 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 3789 tcps->tcps_dupack_fast_retransmit) { 3790 /* 3791 * RFC 3042 3792 * 3793 * What we need to do is temporarily 3794 * increase tcp_cwnd so that new 3795 * data can be sent if it is allowed 3796 * by the receive window (tcp_rwnd). 3797 * tcp_wput_data() will take care of 3798 * the rest. 3799 * 3800 * If the connection is SACK capable, 3801 * only do limited xmit when there 3802 * is SACK info. 3803 * 3804 * Note how tcp_cwnd is incremented. 3805 * The first dup ACK will increase 3806 * it by 1 MSS. The second dup ACK 3807 * will increase it by 2 MSS. This 3808 * means that only 1 new segment will 3809 * be sent for each dup ACK. 
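 *
 * A worked example, assuming an illustrative mss of 1460 bytes:
 * the first dup ACK grows tcp_cwnd by 1460 (mss << 0), the second
 * by a further 2920 (mss << 1). Since tcp_suna has not advanced,
 * this allows roughly one new full-sized segment per dup ACK,
 * which is the RFC 3042 limited transmit behaviour described
 * above.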
3810 */ 3811 if (tcp->tcp_unsent > 0 && 3812 (!tcp->tcp_snd_sack_ok || 3813 (tcp->tcp_snd_sack_ok && 3814 tcp->tcp_notsack_list != NULL))) { 3815 tcp->tcp_cwnd += mss << 3816 (tcp->tcp_dupack_cnt - 1); 3817 flags |= TH_LIMIT_XMIT; 3818 } 3819 } else if (dupack_cnt == 3820 tcps->tcps_dupack_fast_retransmit) { 3821 3822 /* 3823 * If we have reduced tcp_ssthresh 3824 * because of ECN, do not reduce it again 3825 * unless it is already one window of data 3826 * away. After one window of data, tcp_cwr 3827 * should then be cleared. Note that 3828 * for non ECN capable connection, tcp_cwr 3829 * should always be false. 3830 * 3831 * Adjust cwnd since the duplicate 3832 * ack indicates that a packet was 3833 * dropped (due to congestion.) 3834 */ 3835 if (!tcp->tcp_cwr) { 3836 npkt = ((tcp->tcp_snxt - 3837 tcp->tcp_suna) >> 1) / mss; 3838 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 3839 mss; 3840 tcp->tcp_cwnd = (npkt + 3841 tcp->tcp_dupack_cnt) * mss; 3842 } 3843 if (tcp->tcp_ecn_ok) { 3844 tcp->tcp_cwr = B_TRUE; 3845 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3846 tcp->tcp_ecn_cwr_sent = B_FALSE; 3847 } 3848 3849 /* 3850 * We do Hoe's algorithm. Refer to her 3851 * paper "Improving the Start-up Behavior 3852 * of a Congestion Control Scheme for TCP," 3853 * appeared in SIGCOMM'96. 3854 * 3855 * Save highest seq no we have sent so far. 3856 * Be careful about the invisible FIN byte. 3857 */ 3858 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 3859 (tcp->tcp_unsent == 0)) { 3860 tcp->tcp_rexmit_max = tcp->tcp_fss; 3861 } else { 3862 tcp->tcp_rexmit_max = tcp->tcp_snxt; 3863 } 3864 3865 /* 3866 * Do not allow bursty traffic during. 3867 * fast recovery. Refer to Fall and Floyd's 3868 * paper "Simulation-based Comparisons of 3869 * Tahoe, Reno and SACK TCP" (in CCR?) 3870 * This is a best current practise. 3871 */ 3872 tcp->tcp_snd_burst = TCP_CWND_SS; 3873 3874 /* 3875 * For SACK: 3876 * Calculate tcp_pipe, which is the 3877 * estimated number of bytes in 3878 * network. 3879 * 3880 * tcp_fack is the highest sack'ed seq num 3881 * TCP has received. 3882 * 3883 * tcp_pipe is explained in the above quoted 3884 * Fall and Floyd's paper. tcp_fack is 3885 * explained in Mathis and Mahdavi's 3886 * "Forward Acknowledgment: Refining TCP 3887 * Congestion Control" in SIGCOMM '96. 3888 */ 3889 if (tcp->tcp_snd_sack_ok) { 3890 if (tcp->tcp_notsack_list != NULL) { 3891 tcp->tcp_pipe = tcp->tcp_snxt - 3892 tcp->tcp_fack; 3893 tcp->tcp_sack_snxt = seg_ack; 3894 flags |= TH_NEED_SACK_REXMIT; 3895 } else { 3896 /* 3897 * Always initialize tcp_pipe 3898 * even though we don't have 3899 * any SACK info. If later 3900 * we get SACK info and 3901 * tcp_pipe is not initialized, 3902 * funny things will happen. 3903 */ 3904 tcp->tcp_pipe = 3905 tcp->tcp_cwnd_ssthresh; 3906 } 3907 } else { 3908 flags |= TH_REXMIT_NEEDED; 3909 } /* tcp_snd_sack_ok */ 3910 3911 } else { 3912 /* 3913 * Here we perform congestion 3914 * avoidance, but NOT slow start. 3915 * This is known as the Fast 3916 * Recovery Algorithm. 3917 */ 3918 if (tcp->tcp_snd_sack_ok && 3919 tcp->tcp_notsack_list != NULL) { 3920 flags |= TH_NEED_SACK_REXMIT; 3921 tcp->tcp_pipe -= mss; 3922 if (tcp->tcp_pipe < 0) 3923 tcp->tcp_pipe = 0; 3924 } else { 3925 /* 3926 * We know that one more packet has 3927 * left the pipe thus we can update 3928 * cwnd. 
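 * This is the classic Reno inflation step during fast recovery:
 * each additional dup ACK beyond the third is taken as evidence
 * that another segment has left the network, so cwnd may grow by
 * one mss (clamped to tcp_cwnd_max below), possibly allowing one
 * more new segment to be sent.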
3929 */ 3930 cwnd = tcp->tcp_cwnd + mss; 3931 if (cwnd > tcp->tcp_cwnd_max) 3932 cwnd = tcp->tcp_cwnd_max; 3933 tcp->tcp_cwnd = cwnd; 3934 if (tcp->tcp_unsent > 0) 3935 flags |= TH_XMIT_NEEDED; 3936 } 3937 } 3938 } 3939 } else if (tcp->tcp_zero_win_probe) { 3940 /* 3941 * If the window has opened, need to arrange 3942 * to send additional data. 3943 */ 3944 if (new_swnd != 0) { 3945 /* tcp_suna != tcp_snxt */ 3946 /* Packet contains a window update */ 3947 TCPS_BUMP_MIB(tcps, tcpInWinUpdate); 3948 tcp->tcp_zero_win_probe = 0; 3949 tcp->tcp_timer_backoff = 0; 3950 tcp->tcp_ms_we_have_waited = 0; 3951 3952 /* 3953 * Transmit starting with tcp_suna since 3954 * the one byte probe is not ack'ed. 3955 * If TCP has sent more than one identical 3956 * probe, tcp_rexmit will be set. That means 3957 * tcp_ss_rexmit() will send out the one 3958 * byte along with new data. Otherwise, 3959 * fake the retransmission. 3960 */ 3961 flags |= TH_XMIT_NEEDED; 3962 if (!tcp->tcp_rexmit) { 3963 tcp->tcp_rexmit = B_TRUE; 3964 tcp->tcp_dupack_cnt = 0; 3965 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 3966 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 3967 } 3968 } 3969 } 3970 goto swnd_update; 3971 } 3972 3973 /* 3974 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 3975 * If the ACK value acks something that we have not yet sent, it might 3976 * be an old duplicate segment. Send an ACK to re-synchronize the 3977 * other side. 3978 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 3979 * state is handled above, so we can always just drop the segment and 3980 * send an ACK here. 3981 * 3982 * In the case where the peer shrinks the window, we see the new window 3983 * update, but all the data sent previously is queued up by the peer. 3984 * To account for this, in tcp_process_shrunk_swnd(), the sequence 3985 * number, which was already sent, and within window, is recorded. 3986 * tcp_snxt is then updated. 3987 * 3988 * If the window has previously shrunk, and an ACK for data not yet 3989 * sent, according to tcp_snxt is recieved, it may still be valid. If 3990 * the ACK is for data within the window at the time the window was 3991 * shrunk, then the ACK is acceptable. In this case tcp_snxt is set to 3992 * the sequence number ACK'ed. 3993 * 3994 * If the ACK covers all the data sent at the time the window was 3995 * shrunk, we can now set tcp_is_wnd_shrnk to B_FALSE. 3996 * 3997 * Should we send ACKs in response to ACK only segments? 3998 */ 3999 4000 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 4001 if ((tcp->tcp_is_wnd_shrnk) && 4002 (SEQ_LEQ(seg_ack, tcp->tcp_snxt_shrunk))) { 4003 uint32_t data_acked_ahead_snxt; 4004 4005 data_acked_ahead_snxt = seg_ack - tcp->tcp_snxt; 4006 tcp_update_xmit_tail(tcp, seg_ack); 4007 tcp->tcp_unsent -= data_acked_ahead_snxt; 4008 } else { 4009 TCPS_BUMP_MIB(tcps, tcpInAckUnsent); 4010 /* drop the received segment */ 4011 freemsg(mp); 4012 4013 /* 4014 * Send back an ACK. If tcp_drop_ack_unsent_cnt is 4015 * greater than 0, check if the number of such 4016 * bogus ACks is greater than that count. If yes, 4017 * don't send back any ACK. This prevents TCP from 4018 * getting into an ACK storm if somehow an attacker 4019 * successfully spoofs an acceptable segment to our 4020 * peer. If this continues (count > 2 X threshold), 4021 * we should abort this connection. 
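 *
 * As a concrete illustration (assuming tcp_drop_ack_unsent_cnt is
 * left at 10): the first 10 such bogus ACKs are each answered with
 * an ACK, numbers 11 through 20 are silently dropped, and the 21st
 * causes the connection to be aborted with EPROTO.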
4022 */ 4023 if (tcp_drop_ack_unsent_cnt > 0 && 4024 ++tcp->tcp_in_ack_unsent > 4025 tcp_drop_ack_unsent_cnt) { 4026 TCP_STAT(tcps, tcp_in_ack_unsent_drop); 4027 if (tcp->tcp_in_ack_unsent > 2 * 4028 tcp_drop_ack_unsent_cnt) { 4029 (void) tcp_clean_death(tcp, EPROTO); 4030 } 4031 return; 4032 } 4033 mp = tcp_ack_mp(tcp); 4034 if (mp != NULL) { 4035 BUMP_LOCAL(tcp->tcp_obsegs); 4036 TCPS_BUMP_MIB(tcps, tcpOutAck); 4037 tcp_send_data(tcp, mp); 4038 } 4039 return; 4040 } 4041 } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack, 4042 tcp->tcp_snxt_shrunk)) { 4043 tcp->tcp_is_wnd_shrnk = B_FALSE; 4044 } 4045 4046 /* 4047 * TCP got a new ACK; update the notsack'ed list to delete those 4048 * blocks that are covered by this ACK. 4049 */ 4050 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 4051 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 4052 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 4053 } 4054 4055 /* 4056 * If we got an ACK after fast retransmit, check to see 4057 * if it is a partial ACK. If it is not and the congestion 4058 * window was inflated to account for the other side's 4059 * cached packets, retract it. If it is, do Hoe's algorithm. 4060 */ 4061 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { 4062 ASSERT(tcp->tcp_rexmit == B_FALSE); 4063 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 4064 tcp->tcp_dupack_cnt = 0; 4065 /* 4066 * Deflate tcp_cwnd back to tcp_cwnd_ssthresh after 4067 * the fast retransmit phase. 4068 */ 4069 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 4070 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 4071 } 4072 tcp->tcp_rexmit_max = seg_ack; 4073 tcp->tcp_cwnd_cnt = 0; 4074 tcp->tcp_snd_burst = tcp->tcp_localnet ? 4075 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 4076 4077 /* 4078 * Remove all notsack info to avoid confusion with 4079 * the next fast retransmit/recovery phase. 4080 */ 4081 if (tcp->tcp_snd_sack_ok) { 4082 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, 4083 tcp); 4084 } 4085 } else { 4086 if (tcp->tcp_snd_sack_ok && 4087 tcp->tcp_notsack_list != NULL) { 4088 flags |= TH_NEED_SACK_REXMIT; 4089 tcp->tcp_pipe -= mss; 4090 if (tcp->tcp_pipe < 0) 4091 tcp->tcp_pipe = 0; 4092 } else { 4093 /* 4094 * Hoe's algorithm: 4095 * 4096 * Retransmit the unack'ed segment and 4097 * restart fast recovery. Note that we 4098 * need to scale back tcp_cwnd to the 4099 * original value when we started fast 4100 * recovery. This is to prevent overly 4101 * aggressive behaviour in sending new 4102 * segments. 4103 */ 4104 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 4105 tcps->tcps_dupack_fast_retransmit * mss; 4106 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 4107 flags |= TH_REXMIT_NEEDED; 4108 } 4109 } 4110 } else { 4111 tcp->tcp_dupack_cnt = 0; 4112 if (tcp->tcp_rexmit) { 4113 /* 4114 * TCP is retransmitting. If the ACK acks all 4115 * outstanding data, update tcp_rexmit_max and 4116 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 4117 * to the correct value. 4118 * 4119 * Note that SEQ_LEQ() is used. This is to avoid 4120 * unnecessary fast retransmit caused by dup ACKs 4121 * received when TCP does slow start retransmission 4122 * after a time out. During this phase, TCP may 4123 * send out segments which are already received. 4124 * This causes dup ACKs to be sent back.
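 * Handling such an ACK here, as progress within the retransmit
 * range rather than as yet another duplicate, keeps tcp_dupack_cnt
 * at zero and avoids piling a spurious fast retransmit on top of
 * the timeout recovery that is already in progress.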
4125 */ 4126 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 4127 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 4128 tcp->tcp_rexmit_nxt = seg_ack; 4129 } 4130 if (seg_ack != tcp->tcp_rexmit_max) { 4131 flags |= TH_XMIT_NEEDED; 4132 } 4133 } else { 4134 tcp->tcp_rexmit = B_FALSE; 4135 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 4136 tcp->tcp_snd_burst = tcp->tcp_localnet ? 4137 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 4138 } 4139 tcp->tcp_ms_we_have_waited = 0; 4140 } 4141 } 4142 4143 TCPS_BUMP_MIB(tcps, tcpInAckSegs); 4144 TCPS_UPDATE_MIB(tcps, tcpInAckBytes, bytes_acked); 4145 tcp->tcp_suna = seg_ack; 4146 if (tcp->tcp_zero_win_probe != 0) { 4147 tcp->tcp_zero_win_probe = 0; 4148 tcp->tcp_timer_backoff = 0; 4149 } 4150 4151 /* 4152 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 4153 * Note that it cannot be the SYN being ack'ed, since the code flow 4154 * for that case does not reach here. 4155 */ 4156 if (mp1 == NULL) { 4157 goto fin_acked; 4158 } 4159 4160 /* 4161 * Update the congestion window. 4162 * 4163 * If TCP is not ECN capable or TCP is ECN capable but the 4164 * congestion experience bit is not set, increase the tcp_cwnd as 4165 * usual. 4166 */ 4167 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 4168 cwnd = tcp->tcp_cwnd; 4169 add = mss; 4170 4171 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 4172 /* 4173 * This is to prevent an increase of less than 1 MSS of 4174 * tcp_cwnd. With partial increase, tcp_wput_data() 4175 * may send out tinygrams in order to preserve mblk 4176 * boundaries. 4177 * 4178 * By initializing tcp_cwnd_cnt to the new tcp_cwnd and 4179 * decrementing it by 1 MSS for every ACK, tcp_cwnd is 4180 * increased by 1 MSS for every RTT. 4181 */ 4182 if (tcp->tcp_cwnd_cnt <= 0) { 4183 tcp->tcp_cwnd_cnt = cwnd + add; 4184 } else { 4185 tcp->tcp_cwnd_cnt -= add; 4186 add = 0; 4187 } 4188 } 4189 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 4190 } 4191 4192 /* See if the latest urgent data has been acknowledged */ 4193 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 4194 SEQ_GT(seg_ack, tcp->tcp_urg)) 4195 tcp->tcp_valid_bits &= ~TCP_URG_VALID; 4196 4197 /* Can we update the RTT estimates? */ 4198 if (tcp->tcp_snd_ts_ok) { 4199 /* Ignore zero timestamp echo-reply. */ 4200 if (tcpopt.tcp_opt_ts_ecr != 0) { 4201 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - 4202 (int32_t)tcpopt.tcp_opt_ts_ecr); 4203 } 4204 4205 /* If needed, restart the timer. */ 4206 if (tcp->tcp_set_timer == 1) { 4207 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4208 tcp->tcp_set_timer = 0; 4209 } 4210 /* 4211 * Update tcp_csuna in case the other side stops sending 4212 * us timestamps. 4213 */ 4214 tcp->tcp_csuna = tcp->tcp_snxt; 4215 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 4216 /* 4217 * An ACK sequence we haven't seen before, so get the RTT 4218 * and update the RTO. But first check if the timestamp is 4219 * valid to use. 4220 */ 4221 if ((mp1->b_next != NULL) && 4222 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) 4223 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - 4224 (int32_t)(intptr_t)mp1->b_prev); 4225 else 4226 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); 4227 4228 /* Remember the last sequence to be ACKed */ 4229 tcp->tcp_csuna = seg_ack; 4230 if (tcp->tcp_set_timer == 1) { 4231 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4232 tcp->tcp_set_timer = 0; 4233 } 4234 } else { 4235 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); 4236 } 4237 4238 /* Eat acknowledged bytes off the xmit queue.
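 * Walk tcp_xmit_head, freeing every mblk that is now fully
 * acknowledged and trimming b_rptr of the first partially
 * acknowledged one. Note that b_prev (transmit timestamp) and
 * b_next (starting sequence number) of each mblk are reused by the
 * RTT sampling above, so they are cleared or refreshed as the chain
 * is consumed.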
*/ 4239 for (;;) { 4240 mblk_t *mp2; 4241 uchar_t *wptr; 4242 4243 wptr = mp1->b_wptr; 4244 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 4245 bytes_acked -= (int)(wptr - mp1->b_rptr); 4246 if (bytes_acked < 0) { 4247 mp1->b_rptr = wptr + bytes_acked; 4248 /* 4249 * Set a new timestamp if all the bytes timed by the 4250 * old timestamp have been ack'ed. 4251 */ 4252 if (SEQ_GT(seg_ack, 4253 (uint32_t)(uintptr_t)(mp1->b_next))) { 4254 mp1->b_prev = 4255 (mblk_t *)(uintptr_t)LBOLT_FASTPATH; 4256 mp1->b_next = NULL; 4257 } 4258 break; 4259 } 4260 mp1->b_next = NULL; 4261 mp1->b_prev = NULL; 4262 mp2 = mp1; 4263 mp1 = mp1->b_cont; 4264 4265 /* 4266 * This notification is required for some zero-copy 4267 * clients to maintain a copy semantic. After the data 4268 * is ack'ed, client is safe to modify or reuse the buffer. 4269 */ 4270 if (tcp->tcp_snd_zcopy_aware && 4271 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 4272 tcp_zcopy_notify(tcp); 4273 freeb(mp2); 4274 if (bytes_acked == 0) { 4275 if (mp1 == NULL) { 4276 /* Everything is ack'ed, clear the tail. */ 4277 tcp->tcp_xmit_tail = NULL; 4278 /* 4279 * Cancel the timer unless we are still 4280 * waiting for an ACK for the FIN packet. 4281 */ 4282 if (tcp->tcp_timer_tid != 0 && 4283 tcp->tcp_snxt == tcp->tcp_suna) { 4284 (void) TCP_TIMER_CANCEL(tcp, 4285 tcp->tcp_timer_tid); 4286 tcp->tcp_timer_tid = 0; 4287 } 4288 goto pre_swnd_update; 4289 } 4290 if (mp2 != tcp->tcp_xmit_tail) 4291 break; 4292 tcp->tcp_xmit_tail = mp1; 4293 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 4294 (uintptr_t)INT_MAX); 4295 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 4296 mp1->b_rptr); 4297 break; 4298 } 4299 if (mp1 == NULL) { 4300 /* 4301 * More was acked but there is nothing more 4302 * outstanding. This means that the FIN was 4303 * just acked or that we're talking to a clown. 4304 */ 4305 fin_acked: 4306 ASSERT(tcp->tcp_fin_sent); 4307 tcp->tcp_xmit_tail = NULL; 4308 if (tcp->tcp_fin_sent) { 4309 /* FIN was acked - making progress */ 4310 if (!tcp->tcp_fin_acked) 4311 tcp->tcp_ip_forward_progress = B_TRUE; 4312 tcp->tcp_fin_acked = B_TRUE; 4313 if (tcp->tcp_linger_tid != 0 && 4314 TCP_TIMER_CANCEL(tcp, 4315 tcp->tcp_linger_tid) >= 0) { 4316 tcp_stop_lingering(tcp); 4317 freemsg(mp); 4318 mp = NULL; 4319 } 4320 } else { 4321 /* 4322 * We should never get here because 4323 * we have already checked that the 4324 * number of bytes ack'ed should be 4325 * smaller than or equal to what we 4326 * have sent so far (it is the 4327 * acceptability check of the ACK). 4328 * We can only get here if the send 4329 * queue is corrupted. 4330 * 4331 * Terminate the connection and 4332 * panic the system. It is better 4333 * for us to panic instead of 4334 * continuing to avoid other disaster. 4335 */ 4336 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 4337 tcp->tcp_rnxt, TH_RST|TH_ACK); 4338 panic("Memory corruption " 4339 "detected for connection %s.", 4340 tcp_display(tcp, NULL, 4341 DISP_ADDR_AND_PORT)); 4342 /*NOTREACHED*/ 4343 } 4344 goto pre_swnd_update; 4345 } 4346 ASSERT(mp2 != tcp->tcp_xmit_tail); 4347 } 4348 if (tcp->tcp_unsent) { 4349 flags |= TH_XMIT_NEEDED; 4350 } 4351 pre_swnd_update: 4352 tcp->tcp_xmit_head = mp1; 4353 swnd_update: 4354 /* 4355 * The following check is different from most other implementations. 4356 * For bi-directional transfer, when segments are dropped, the 4357 * "normal" check will not accept a window update in those 4358 * retransmitted segemnts. 
Failing to do that, TCP may send out 4359 * segments which are outside receiver's window. As TCP accepts 4360 * the ack in those retransmitted segments, if the window update in 4361 * the same segment is not accepted, TCP will incorrectly calculates 4362 * that it can send more segments. This can create a deadlock 4363 * with the receiver if its window becomes zero. 4364 */ 4365 if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 4366 SEQ_LT(tcp->tcp_swl1, seg_seq) || 4367 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 4368 /* 4369 * The criteria for update is: 4370 * 4371 * 1. the segment acknowledges some data. Or 4372 * 2. the segment is new, i.e. it has a higher seq num. Or 4373 * 3. the segment is not old and the advertised window is 4374 * larger than the previous advertised window. 4375 */ 4376 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 4377 flags |= TH_XMIT_NEEDED; 4378 tcp->tcp_swnd = new_swnd; 4379 if (new_swnd > tcp->tcp_max_swnd) 4380 tcp->tcp_max_swnd = new_swnd; 4381 tcp->tcp_swl1 = seg_seq; 4382 tcp->tcp_swl2 = seg_ack; 4383 } 4384 est: 4385 if (tcp->tcp_state > TCPS_ESTABLISHED) { 4386 4387 switch (tcp->tcp_state) { 4388 case TCPS_FIN_WAIT_1: 4389 if (tcp->tcp_fin_acked) { 4390 tcp->tcp_state = TCPS_FIN_WAIT_2; 4391 DTRACE_TCP6(state__change, void, NULL, 4392 ip_xmit_attr_t *, connp->conn_ixa, 4393 void, NULL, tcp_t *, tcp, void, NULL, 4394 int32_t, TCPS_FIN_WAIT_1); 4395 /* 4396 * We implement the non-standard BSD/SunOS 4397 * FIN_WAIT_2 flushing algorithm. 4398 * If there is no user attached to this 4399 * TCP endpoint, then this TCP struct 4400 * could hang around forever in FIN_WAIT_2 4401 * state if the peer forgets to send us 4402 * a FIN. To prevent this, we wait only 4403 * 2*MSL (a convenient time value) for 4404 * the FIN to arrive. If it doesn't show up, 4405 * we flush the TCP endpoint. This algorithm, 4406 * though a violation of RFC-793, has worked 4407 * for over 10 years in BSD systems. 4408 * Note: SunOS 4.x waits 675 seconds before 4409 * flushing the FIN_WAIT_2 connection. 4410 */ 4411 TCP_TIMER_RESTART(tcp, 4412 tcp->tcp_fin_wait_2_flush_interval); 4413 } 4414 break; 4415 case TCPS_FIN_WAIT_2: 4416 break; /* Shutdown hook? */ 4417 case TCPS_LAST_ACK: 4418 freemsg(mp); 4419 if (tcp->tcp_fin_acked) { 4420 (void) tcp_clean_death(tcp, 0); 4421 return; 4422 } 4423 goto xmit_check; 4424 case TCPS_CLOSING: 4425 if (tcp->tcp_fin_acked) { 4426 SET_TIME_WAIT(tcps, tcp, connp); 4427 DTRACE_TCP6(state__change, void, NULL, 4428 ip_xmit_attr_t *, connp->conn_ixa, void, 4429 NULL, tcp_t *, tcp, void, NULL, int32_t, 4430 TCPS_CLOSING); 4431 } 4432 /*FALLTHRU*/ 4433 case TCPS_CLOSE_WAIT: 4434 freemsg(mp); 4435 goto xmit_check; 4436 default: 4437 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 4438 break; 4439 } 4440 } 4441 if (flags & TH_FIN) { 4442 /* Make sure we ack the fin */ 4443 flags |= TH_ACK_NEEDED; 4444 if (!tcp->tcp_fin_rcvd) { 4445 tcp->tcp_fin_rcvd = B_TRUE; 4446 tcp->tcp_rnxt++; 4447 tcpha = tcp->tcp_tcpha; 4448 tcpha->tha_ack = htonl(tcp->tcp_rnxt); 4449 4450 /* 4451 * Generate the ordrel_ind at the end unless the 4452 * conn is detached or it is a STREAMS based eager. 4453 * In the eager case we defer the notification until 4454 * tcp_accept_finish has run. 
4455 */ 4456 if (!TCP_IS_DETACHED(tcp) && (IPCL_IS_NONSTR(connp) || 4457 (tcp->tcp_listener == NULL && 4458 !tcp->tcp_hard_binding))) 4459 flags |= TH_ORDREL_NEEDED; 4460 switch (tcp->tcp_state) { 4461 case TCPS_SYN_RCVD: 4462 tcp->tcp_state = TCPS_CLOSE_WAIT; 4463 DTRACE_TCP6(state__change, void, NULL, 4464 ip_xmit_attr_t *, connp->conn_ixa, 4465 void, NULL, tcp_t *, tcp, void, NULL, 4466 int32_t, TCPS_SYN_RCVD); 4467 /* Keepalive? */ 4468 break; 4469 case TCPS_ESTABLISHED: 4470 tcp->tcp_state = TCPS_CLOSE_WAIT; 4471 DTRACE_TCP6(state__change, void, NULL, 4472 ip_xmit_attr_t *, connp->conn_ixa, 4473 void, NULL, tcp_t *, tcp, void, NULL, 4474 int32_t, TCPS_ESTABLISHED); 4475 /* Keepalive? */ 4476 break; 4477 case TCPS_FIN_WAIT_1: 4478 if (!tcp->tcp_fin_acked) { 4479 tcp->tcp_state = TCPS_CLOSING; 4480 DTRACE_TCP6(state__change, void, NULL, 4481 ip_xmit_attr_t *, connp->conn_ixa, 4482 void, NULL, tcp_t *, tcp, void, 4483 NULL, int32_t, TCPS_FIN_WAIT_1); 4484 break; 4485 } 4486 /* FALLTHRU */ 4487 case TCPS_FIN_WAIT_2: 4488 SET_TIME_WAIT(tcps, tcp, connp); 4489 DTRACE_TCP6(state__change, void, NULL, 4490 ip_xmit_attr_t *, connp->conn_ixa, void, 4491 NULL, tcp_t *, tcp, void, NULL, int32_t, 4492 TCPS_FIN_WAIT_2); 4493 if (seg_len) { 4494 /* 4495 * implies data piggybacked on FIN. 4496 * break to handle data. 4497 */ 4498 break; 4499 } 4500 freemsg(mp); 4501 goto ack_check; 4502 } 4503 } 4504 } 4505 if (mp == NULL) 4506 goto xmit_check; 4507 if (seg_len == 0) { 4508 freemsg(mp); 4509 goto xmit_check; 4510 } 4511 if (mp->b_rptr == mp->b_wptr) { 4512 /* 4513 * The header has been consumed, so we remove the 4514 * zero-length mblk here. 4515 */ 4516 mp1 = mp; 4517 mp = mp->b_cont; 4518 freeb(mp1); 4519 } 4520 update_ack: 4521 tcpha = tcp->tcp_tcpha; 4522 tcp->tcp_rack_cnt++; 4523 { 4524 uint32_t cur_max; 4525 4526 cur_max = tcp->tcp_rack_cur_max; 4527 if (tcp->tcp_rack_cnt >= cur_max) { 4528 /* 4529 * We have more unacked data than we should - send 4530 * an ACK now. 4531 */ 4532 flags |= TH_ACK_NEEDED; 4533 cur_max++; 4534 if (cur_max > tcp->tcp_rack_abs_max) 4535 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 4536 else 4537 tcp->tcp_rack_cur_max = cur_max; 4538 } else if (TCP_IS_DETACHED(tcp)) { 4539 /* We don't have an ACK timer for detached TCP. */ 4540 flags |= TH_ACK_NEEDED; 4541 } else if (seg_len < mss) { 4542 /* 4543 * If we get a segment that is less than an mss, and we 4544 * already have unacknowledged data, and the amount 4545 * unacknowledged is not a multiple of mss, then we 4546 * better generate an ACK now. Otherwise, this may be 4547 * the tail piece of a transaction, and we would rather 4548 * wait for the response. 4549 */ 4550 uint32_t udif; 4551 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= 4552 (uintptr_t)INT_MAX); 4553 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); 4554 if (udif && (udif % mss)) 4555 flags |= TH_ACK_NEEDED; 4556 else 4557 flags |= TH_ACK_TIMER_NEEDED; 4558 } else { 4559 /* Start delayed ack timer */ 4560 flags |= TH_ACK_TIMER_NEEDED; 4561 } 4562 } 4563 tcp->tcp_rnxt += seg_len; 4564 tcpha->tha_ack = htonl(tcp->tcp_rnxt); 4565 4566 if (mp == NULL) 4567 goto xmit_check; 4568 4569 /* Update SACK list */ 4570 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4571 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 4572 &(tcp->tcp_num_sack_blk)); 4573 } 4574 4575 if (tcp->tcp_urp_mp) { 4576 tcp->tcp_urp_mp->b_cont = mp; 4577 mp = tcp->tcp_urp_mp; 4578 tcp->tcp_urp_mp = NULL; 4579 /* Ready for a new signal. 
*/ 4580 tcp->tcp_urp_last_valid = B_FALSE; 4581 #ifdef DEBUG 4582 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 4583 "tcp_rput: sending exdata_ind %s", 4584 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4585 #endif /* DEBUG */ 4586 } 4587 4588 /* 4589 * Check for ancillary data changes compared to last segment. 4590 */ 4591 if (connp->conn_recv_ancillary.crb_all != 0) { 4592 mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); 4593 if (mp == NULL) 4594 return; 4595 } 4596 4597 if (IPCL_IS_NONSTR(connp)) { 4598 /* 4599 * Non-STREAMS socket 4600 */ 4601 boolean_t push = flags & (TH_PUSH|TH_FIN); 4602 int error; 4603 4604 if ((*connp->conn_upcalls->su_recv)( 4605 connp->conn_upper_handle, 4606 mp, seg_len, 0, &error, &push) <= 0) { 4607 /* 4608 * We should never be in middle of a 4609 * fallback, the squeue guarantees that. 4610 */ 4611 ASSERT(error != EOPNOTSUPP); 4612 if (error == ENOSPC) 4613 tcp->tcp_rwnd -= seg_len; 4614 } else if (push) { 4615 /* PUSH bit set and sockfs is not flow controlled */ 4616 flags |= tcp_rwnd_reopen(tcp); 4617 } 4618 } else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { 4619 /* 4620 * Side queue inbound data until the accept happens. 4621 * tcp_accept/tcp_rput drains this when the accept happens. 4622 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or 4623 * T_EXDATA_IND) it is queued on b_next. 4624 * XXX Make urgent data use this. Requires: 4625 * Removing tcp_listener check for TH_URG 4626 * Making M_PCPROTO and MARK messages skip the eager case 4627 */ 4628 4629 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); 4630 } else { 4631 /* Active STREAMS socket */ 4632 if (mp->b_datap->db_type != M_DATA || 4633 (flags & TH_MARKNEXT_NEEDED)) { 4634 if (tcp->tcp_rcv_list != NULL) { 4635 flags |= tcp_rcv_drain(tcp); 4636 } 4637 ASSERT(tcp->tcp_rcv_list == NULL || 4638 tcp->tcp_fused_sigurg); 4639 4640 if (flags & TH_MARKNEXT_NEEDED) { 4641 #ifdef DEBUG 4642 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 4643 "tcp_rput: sending MSGMARKNEXT %s", 4644 tcp_display(tcp, NULL, 4645 DISP_PORT_ONLY)); 4646 #endif /* DEBUG */ 4647 mp->b_flag |= MSGMARKNEXT; 4648 flags &= ~TH_MARKNEXT_NEEDED; 4649 } 4650 4651 if (is_system_labeled()) 4652 tcp_setcred_data(mp, ira); 4653 4654 putnext(connp->conn_rq, mp); 4655 if (!canputnext(connp->conn_rq)) 4656 tcp->tcp_rwnd -= seg_len; 4657 } else if ((flags & (TH_PUSH|TH_FIN)) || 4658 tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { 4659 if (tcp->tcp_rcv_list != NULL) { 4660 /* 4661 * Enqueue the new segment first and then 4662 * call tcp_rcv_drain() to send all data 4663 * up. The other way to do this is to 4664 * send all queued data up and then call 4665 * putnext() to send the new segment up. 4666 * This way can remove the else part later 4667 * on. 4668 * 4669 * We don't do this to avoid one more call to 4670 * canputnext() as tcp_rcv_drain() needs to 4671 * call canputnext(). 4672 */ 4673 tcp_rcv_enqueue(tcp, mp, seg_len, 4674 ira->ira_cred); 4675 flags |= tcp_rcv_drain(tcp); 4676 } else { 4677 if (is_system_labeled()) 4678 tcp_setcred_data(mp, ira); 4679 4680 putnext(connp->conn_rq, mp); 4681 if (!canputnext(connp->conn_rq)) 4682 tcp->tcp_rwnd -= seg_len; 4683 } 4684 } else { 4685 /* 4686 * Enqueue all packets when processing an mblk 4687 * from the co queue and also enqueue normal packets. 4688 */ 4689 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); 4690 } 4691 /* 4692 * Make sure the timer is running if we have data waiting 4693 * for a push bit. 
This provides resiliency against 4694 * implementations that do not correctly generate push bits. 4695 */ 4696 if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { 4697 /* 4698 * The connection may be closed at this point, so don't 4699 * do anything for a detached tcp. 4700 */ 4701 if (!TCP_IS_DETACHED(tcp)) 4702 tcp->tcp_push_tid = TCP_TIMER(tcp, 4703 tcp_push_timer, 4704 tcps->tcps_push_timer_interval); 4705 } 4706 } 4707 4708 xmit_check: 4709 /* Is there anything left to do? */ 4710 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 4711 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 4712 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| 4713 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 4714 goto done; 4715 4716 /* Any transmit work to do and a non-zero window? */ 4717 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 4718 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 4719 if (flags & TH_REXMIT_NEEDED) { 4720 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 4721 4722 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans); 4723 if (snd_size > mss) 4724 snd_size = mss; 4725 if (snd_size > tcp->tcp_swnd) 4726 snd_size = tcp->tcp_swnd; 4727 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 4728 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 4729 B_TRUE); 4730 4731 if (mp1 != NULL) { 4732 tcp->tcp_xmit_head->b_prev = 4733 (mblk_t *)LBOLT_FASTPATH; 4734 tcp->tcp_csuna = tcp->tcp_snxt; 4735 TCPS_BUMP_MIB(tcps, tcpRetransSegs); 4736 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, 4737 snd_size); 4738 tcp_send_data(tcp, mp1); 4739 } 4740 } 4741 if (flags & TH_NEED_SACK_REXMIT) { 4742 tcp_sack_rexmit(tcp, &flags); 4743 } 4744 /* 4745 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 4746 * out new segment. Note that tcp_rexmit should not be 4747 * set, otherwise TH_LIMIT_XMIT should not be set. 4748 */ 4749 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 4750 if (!tcp->tcp_rexmit) { 4751 tcp_wput_data(tcp, NULL, B_FALSE); 4752 } else { 4753 tcp_ss_rexmit(tcp); 4754 } 4755 } 4756 /* 4757 * Adjust tcp_cwnd back to normal value after sending 4758 * new data segments. 4759 */ 4760 if (flags & TH_LIMIT_XMIT) { 4761 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 4762 /* 4763 * This will restart the timer. Restarting the 4764 * timer is used to avoid a timeout before the 4765 * limited transmitted segment's ACK gets back. 4766 */ 4767 if (tcp->tcp_xmit_head != NULL) 4768 tcp->tcp_xmit_head->b_prev = 4769 (mblk_t *)LBOLT_FASTPATH; 4770 } 4771 4772 /* Anything more to do? */ 4773 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| 4774 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 4775 goto done; 4776 } 4777 ack_check: 4778 if (flags & TH_SEND_URP_MARK) { 4779 ASSERT(tcp->tcp_urp_mark_mp); 4780 ASSERT(!IPCL_IS_NONSTR(connp)); 4781 /* 4782 * Send up any queued data and then send the mark message 4783 */ 4784 if (tcp->tcp_rcv_list != NULL) { 4785 flags |= tcp_rcv_drain(tcp); 4786 4787 } 4788 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 4789 mp1 = tcp->tcp_urp_mark_mp; 4790 tcp->tcp_urp_mark_mp = NULL; 4791 if (is_system_labeled()) 4792 tcp_setcred_data(mp1, ira); 4793 4794 putnext(connp->conn_rq, mp1); 4795 #ifdef DEBUG 4796 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 4797 "tcp_rput: sending zero-length %s %s", 4798 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : 4799 "MSGNOTMARKNEXT"), 4800 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4801 #endif /* DEBUG */ 4802 flags &= ~TH_SEND_URP_MARK; 4803 } 4804 if (flags & TH_ACK_NEEDED) { 4805 /* 4806 * Time to send an ack for some reason. 
4807 */ 4808 mp1 = tcp_ack_mp(tcp); 4809 4810 if (mp1 != NULL) { 4811 tcp_send_data(tcp, mp1); 4812 BUMP_LOCAL(tcp->tcp_obsegs); 4813 TCPS_BUMP_MIB(tcps, tcpOutAck); 4814 } 4815 if (tcp->tcp_ack_tid != 0) { 4816 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 4817 tcp->tcp_ack_tid = 0; 4818 } 4819 } 4820 if (flags & TH_ACK_TIMER_NEEDED) { 4821 /* 4822 * Arrange for deferred ACK or push wait timeout. 4823 * Start timer if it is not already running. 4824 */ 4825 if (tcp->tcp_ack_tid == 0) { 4826 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, 4827 tcp->tcp_localnet ? 4828 tcps->tcps_local_dack_interval : 4829 tcps->tcps_deferred_ack_interval); 4830 } 4831 } 4832 if (flags & TH_ORDREL_NEEDED) { 4833 /* 4834 * Notify upper layer about an orderly release. If this is 4835 * a non-STREAMS socket, then just make an upcall. For STREAMS 4836 * we send up an ordrel_ind, unless this is an eager, in which 4837 * case the ordrel will be sent when tcp_accept_finish runs. 4838 * Note that for non-STREAMS we make an upcall even if it is an 4839 * eager, because we have an upper handle to send it to. 4840 */ 4841 ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL); 4842 ASSERT(!tcp->tcp_detached); 4843 4844 if (IPCL_IS_NONSTR(connp)) { 4845 ASSERT(tcp->tcp_ordrel_mp == NULL); 4846 tcp->tcp_ordrel_done = B_TRUE; 4847 (*connp->conn_upcalls->su_opctl) 4848 (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0); 4849 goto done; 4850 } 4851 4852 if (tcp->tcp_rcv_list != NULL) { 4853 /* 4854 * Push any mblk(s) enqueued from co processing. 4855 */ 4856 flags |= tcp_rcv_drain(tcp); 4857 } 4858 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 4859 4860 mp1 = tcp->tcp_ordrel_mp; 4861 tcp->tcp_ordrel_mp = NULL; 4862 tcp->tcp_ordrel_done = B_TRUE; 4863 putnext(connp->conn_rq, mp1); 4864 } 4865 done: 4866 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 4867 } 4868 4869 /* 4870 * Attach ancillary data to a received TCP segments for the 4871 * ancillary pieces requested by the application that are 4872 * different than they were in the previous data segment. 4873 * 4874 * Save the "current" values once memory allocation is ok so that 4875 * when memory allocation fails we can just wait for the next data segment. 4876 */ 4877 static mblk_t * 4878 tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, 4879 ip_recv_attr_t *ira) 4880 { 4881 struct T_optdata_ind *todi; 4882 int optlen; 4883 uchar_t *optptr; 4884 struct T_opthdr *toh; 4885 crb_t addflag; /* Which pieces to add */ 4886 mblk_t *mp1; 4887 conn_t *connp = tcp->tcp_connp; 4888 4889 optlen = 0; 4890 addflag.crb_all = 0; 4891 /* If app asked for pktinfo and the index has changed ... */ 4892 if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && 4893 ira->ira_ruifindex != tcp->tcp_recvifindex) { 4894 optlen += sizeof (struct T_opthdr) + 4895 sizeof (struct in6_pktinfo); 4896 addflag.crb_ip_recvpktinfo = 1; 4897 } 4898 /* If app asked for hoplimit and it has changed ... */ 4899 if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && 4900 ipp->ipp_hoplimit != tcp->tcp_recvhops) { 4901 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 4902 addflag.crb_ipv6_recvhoplimit = 1; 4903 } 4904 /* If app asked for tclass and it has changed ... */ 4905 if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && 4906 ipp->ipp_tclass != tcp->tcp_recvtclass) { 4907 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 4908 addflag.crb_ipv6_recvtclass = 1; 4909 } 4910 /* 4911 * If app asked for hopbyhop headers and it has changed ... 
4912 * For security labels, note that (1) security labels can't change on 4913 * a connected socket at all, (2) we're connected to at most one peer, 4914 * (3) if anything changes, then it must be some other extra option. 4915 */ 4916 if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && 4917 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, 4918 (ipp->ipp_fields & IPPF_HOPOPTS), 4919 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { 4920 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; 4921 addflag.crb_ipv6_recvhopopts = 1; 4922 if (!ip_allocbuf((void **)&tcp->tcp_hopopts, 4923 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), 4924 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) 4925 return (mp); 4926 } 4927 /* If app asked for dst headers before routing headers ... */ 4928 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && 4929 ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, 4930 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 4931 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { 4932 optlen += sizeof (struct T_opthdr) + 4933 ipp->ipp_rthdrdstoptslen; 4934 addflag.crb_ipv6_recvrthdrdstopts = 1; 4935 if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, 4936 &tcp->tcp_rthdrdstoptslen, 4937 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 4938 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) 4939 return (mp); 4940 } 4941 /* If app asked for routing headers and it has changed ... */ 4942 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && 4943 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, 4944 (ipp->ipp_fields & IPPF_RTHDR), 4945 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { 4946 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; 4947 addflag.crb_ipv6_recvrthdr = 1; 4948 if (!ip_allocbuf((void **)&tcp->tcp_rthdr, 4949 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), 4950 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) 4951 return (mp); 4952 } 4953 /* If app asked for dest headers and it has changed ... */ 4954 if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || 4955 connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && 4956 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, 4957 (ipp->ipp_fields & IPPF_DSTOPTS), 4958 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { 4959 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; 4960 addflag.crb_ipv6_recvdstopts = 1; 4961 if (!ip_allocbuf((void **)&tcp->tcp_dstopts, 4962 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), 4963 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) 4964 return (mp); 4965 } 4966 4967 if (optlen == 0) { 4968 /* Nothing to add */ 4969 return (mp); 4970 } 4971 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); 4972 if (mp1 == NULL) { 4973 /* 4974 * Defer sending ancillary data until the next TCP segment 4975 * arrives. 4976 */ 4977 return (mp); 4978 } 4979 mp1->b_cont = mp; 4980 mp = mp1; 4981 mp->b_wptr += sizeof (*todi) + optlen; 4982 mp->b_datap->db_type = M_PROTO; 4983 todi = (struct T_optdata_ind *)mp->b_rptr; 4984 todi->PRIM_type = T_OPTDATA_IND; 4985 todi->DATA_flag = 1; /* MORE data */ 4986 todi->OPT_length = optlen; 4987 todi->OPT_offset = sizeof (*todi); 4988 optptr = (uchar_t *)&todi[1]; 4989 /* 4990 * If app asked for pktinfo and the index has changed ... 4991 * Note that the local address never changes for the connection. 
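 * The block below builds a T_opthdr followed by a struct
 * in6_pktinfo (the local address plus the receive ifindex) and then
 * records the ifindex as the new "last" value, so the option is
 * regenerated only when the interface actually changes. To the
 * application this typically surfaces as IPV6_PKTINFO ancillary
 * data on a subsequent receive.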
4992 */ 4993 if (addflag.crb_ip_recvpktinfo) { 4994 struct in6_pktinfo *pkti; 4995 uint_t ifindex; 4996 4997 ifindex = ira->ira_ruifindex; 4998 toh = (struct T_opthdr *)optptr; 4999 toh->level = IPPROTO_IPV6; 5000 toh->name = IPV6_PKTINFO; 5001 toh->len = sizeof (*toh) + sizeof (*pkti); 5002 toh->status = 0; 5003 optptr += sizeof (*toh); 5004 pkti = (struct in6_pktinfo *)optptr; 5005 pkti->ipi6_addr = connp->conn_laddr_v6; 5006 pkti->ipi6_ifindex = ifindex; 5007 optptr += sizeof (*pkti); 5008 ASSERT(OK_32PTR(optptr)); 5009 /* Save as "last" value */ 5010 tcp->tcp_recvifindex = ifindex; 5011 } 5012 /* If app asked for hoplimit and it has changed ... */ 5013 if (addflag.crb_ipv6_recvhoplimit) { 5014 toh = (struct T_opthdr *)optptr; 5015 toh->level = IPPROTO_IPV6; 5016 toh->name = IPV6_HOPLIMIT; 5017 toh->len = sizeof (*toh) + sizeof (uint_t); 5018 toh->status = 0; 5019 optptr += sizeof (*toh); 5020 *(uint_t *)optptr = ipp->ipp_hoplimit; 5021 optptr += sizeof (uint_t); 5022 ASSERT(OK_32PTR(optptr)); 5023 /* Save as "last" value */ 5024 tcp->tcp_recvhops = ipp->ipp_hoplimit; 5025 } 5026 /* If app asked for tclass and it has changed ... */ 5027 if (addflag.crb_ipv6_recvtclass) { 5028 toh = (struct T_opthdr *)optptr; 5029 toh->level = IPPROTO_IPV6; 5030 toh->name = IPV6_TCLASS; 5031 toh->len = sizeof (*toh) + sizeof (uint_t); 5032 toh->status = 0; 5033 optptr += sizeof (*toh); 5034 *(uint_t *)optptr = ipp->ipp_tclass; 5035 optptr += sizeof (uint_t); 5036 ASSERT(OK_32PTR(optptr)); 5037 /* Save as "last" value */ 5038 tcp->tcp_recvtclass = ipp->ipp_tclass; 5039 } 5040 if (addflag.crb_ipv6_recvhopopts) { 5041 toh = (struct T_opthdr *)optptr; 5042 toh->level = IPPROTO_IPV6; 5043 toh->name = IPV6_HOPOPTS; 5044 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; 5045 toh->status = 0; 5046 optptr += sizeof (*toh); 5047 bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); 5048 optptr += ipp->ipp_hopoptslen; 5049 ASSERT(OK_32PTR(optptr)); 5050 /* Save as last value */ 5051 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, 5052 (ipp->ipp_fields & IPPF_HOPOPTS), 5053 ipp->ipp_hopopts, ipp->ipp_hopoptslen); 5054 } 5055 if (addflag.crb_ipv6_recvrthdrdstopts) { 5056 toh = (struct T_opthdr *)optptr; 5057 toh->level = IPPROTO_IPV6; 5058 toh->name = IPV6_RTHDRDSTOPTS; 5059 toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; 5060 toh->status = 0; 5061 optptr += sizeof (*toh); 5062 bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); 5063 optptr += ipp->ipp_rthdrdstoptslen; 5064 ASSERT(OK_32PTR(optptr)); 5065 /* Save as last value */ 5066 ip_savebuf((void **)&tcp->tcp_rthdrdstopts, 5067 &tcp->tcp_rthdrdstoptslen, 5068 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 5069 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); 5070 } 5071 if (addflag.crb_ipv6_recvrthdr) { 5072 toh = (struct T_opthdr *)optptr; 5073 toh->level = IPPROTO_IPV6; 5074 toh->name = IPV6_RTHDR; 5075 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; 5076 toh->status = 0; 5077 optptr += sizeof (*toh); 5078 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); 5079 optptr += ipp->ipp_rthdrlen; 5080 ASSERT(OK_32PTR(optptr)); 5081 /* Save as last value */ 5082 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, 5083 (ipp->ipp_fields & IPPF_RTHDR), 5084 ipp->ipp_rthdr, ipp->ipp_rthdrlen); 5085 } 5086 if (addflag.crb_ipv6_recvdstopts) { 5087 toh = (struct T_opthdr *)optptr; 5088 toh->level = IPPROTO_IPV6; 5089 toh->name = IPV6_DSTOPTS; 5090 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; 5091 toh->status = 0; 5092 optptr += sizeof 
(*toh); 5093 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); 5094 optptr += ipp->ipp_dstoptslen; 5095 ASSERT(OK_32PTR(optptr)); 5096 /* Save as last value */ 5097 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, 5098 (ipp->ipp_fields & IPPF_DSTOPTS), 5099 ipp->ipp_dstopts, ipp->ipp_dstoptslen); 5100 } 5101 ASSERT(optptr == mp->b_wptr); 5102 return (mp); 5103 } 5104 5105 /* The minimum of the smoothed mean deviation used in the RTO calculation. */ 5106 #define TCP_SD_MIN 400 5107 5108 /* 5109 * Set RTO for this connection. The formula is from Jacobson and Karels' 5110 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 5111 * are the same as those in Appendix A.2 of that paper. 5112 * 5113 * m = new measurement 5114 * sa = smoothed RTT average (8 * average estimates). 5115 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 5116 */ 5117 static void 5118 tcp_set_rto(tcp_t *tcp, clock_t rtt) 5119 { 5120 long m = TICK_TO_MSEC(rtt); 5121 clock_t sa = tcp->tcp_rtt_sa; 5122 clock_t sv = tcp->tcp_rtt_sd; 5123 clock_t rto; 5124 tcp_stack_t *tcps = tcp->tcp_tcps; 5125 5126 TCPS_BUMP_MIB(tcps, tcpRttUpdate); 5127 tcp->tcp_rtt_update++; 5128 5129 /* A non-zero tcp_rtt_sa means there is an existing estimate to update. */ 5130 if (sa != 0) { 5131 /* 5132 * Update average estimator: 5133 * new rtt = 7/8 old rtt + 1/8 new measurement (= old rtt + 1/8 Error) 5134 */ 5135 5136 /* m is now Error in estimate. */ 5137 m -= sa >> 3; 5138 if ((sa += m) <= 0) { 5139 /* 5140 * Don't allow the smoothed average to be negative. 5141 * We use 0 to denote reinitialization of the 5142 * variables. 5143 */ 5144 sa = 1; 5145 } 5146 5147 /* 5148 * Update deviation estimator: 5149 * new mdev = 3/4 old mdev + 1/4 abs(Error) 5150 */ 5151 if (m < 0) 5152 m = -m; 5153 m -= sv >> 2; 5154 sv += m; 5155 } else { 5156 /* 5157 * This follows BSD's implementation. So the reinitialized 5158 * RTO is 3 * m. We cannot go less than 2 because if the 5159 * link is bandwidth dominated, doubling the window size 5160 * during slow start means doubling the RTT. We want to be 5161 * more conservative when we reinitialize our estimates. 3 5162 * is just a convenient number. 5163 */ 5164 sa = m << 3; 5165 sv = m << 1; 5166 } 5167 if (sv < TCP_SD_MIN) { 5168 /* 5169 * We do not know whether sa captures the delayed ACK 5170 * effect, since in a long train of segments a receiver 5171 * does not delay its ACKs. So set the minimum of sv 5172 * to be TCP_SD_MIN, which defaults to 400 ms, twice 5173 * the BSD delayed ACK timeout (DATO). That means the 5174 * minimum mean deviation is 100 ms. 5175 * 5176 */ 5177 sv = TCP_SD_MIN; 5178 } 5179 tcp->tcp_rtt_sa = sa; 5180 tcp->tcp_rtt_sd = sv; 5181 /* 5182 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) 5183 * 5184 * Add tcp_rexmit_interval_extra in case of an extreme environment 5185 * where the algorithm fails to work. The default value of 5186 * tcp_rexmit_interval_extra should be 0. 5187 * 5188 * As we use a finer grained clock than BSD and update 5189 * the RTO for every ACK, add in another .25 of RTT to the 5190 * deviation of RTO to accommodate burstiness of 1/4 of 5191 * window size. 5192 */ 5193 rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); 5194 5195 TCP_SET_RTO(tcp, rto); 5196 5197 /* Now, we can reset tcp_timer_backoff to use the new RTO... */ 5198 tcp->tcp_timer_backoff = 0; 5199 } 5200 5201 /* 5202 * On a labeled system we have some protocols above TCP, such as RPC, which 5203 * appear to assume that every mblk in a chain has a db_credp.
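 * tcp_setcred_data() below therefore walks the b_cont chain of a
 * received message and stamps ira_cred (with NOPID) on every mblk
 * before it is passed upstream.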
5204 */ 5205 static void 5206 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira) 5207 { 5208 ASSERT(is_system_labeled()); 5209 ASSERT(ira->ira_cred != NULL); 5210 5211 while (mp != NULL) { 5212 mblk_setcred(mp, ira->ira_cred, NOPID); 5213 mp = mp->b_cont; 5214 } 5215 } 5216 5217 uint_t 5218 tcp_rwnd_reopen(tcp_t *tcp) 5219 { 5220 uint_t ret = 0; 5221 uint_t thwin; 5222 conn_t *connp = tcp->tcp_connp; 5223 5224 /* Learn the latest rwnd information that we sent to the other side. */ 5225 thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win)) 5226 << tcp->tcp_rcv_ws; 5227 /* This is peer's calculated send window (our receive window). */ 5228 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 5229 /* 5230 * Increase the receive window to max. But we need to do receiver 5231 * SWS avoidance. This means that we need to check the increase of 5232 * of receive window is at least 1 MSS. 5233 */ 5234 if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) { 5235 /* 5236 * If the window that the other side knows is less than max 5237 * deferred acks segments, send an update immediately. 5238 */ 5239 if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { 5240 TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutWinUpdate); 5241 ret = TH_ACK_NEEDED; 5242 } 5243 tcp->tcp_rwnd = connp->conn_rcvbuf; 5244 } 5245 return (ret); 5246 } 5247 5248 /* 5249 * Handle a packet that has been reclassified by TCP. 5250 * This function drops the ref on connp that the caller had. 5251 */ 5252 void 5253 tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) 5254 { 5255 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 5256 5257 if (connp->conn_incoming_ifindex != 0 && 5258 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 5259 freemsg(mp); 5260 CONN_DEC_REF(connp); 5261 return; 5262 } 5263 5264 if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || 5265 (ira->ira_flags & IRAF_IPSEC_SECURE)) { 5266 ip6_t *ip6h; 5267 ipha_t *ipha; 5268 5269 if (ira->ira_flags & IRAF_IS_IPV4) { 5270 ipha = (ipha_t *)mp->b_rptr; 5271 ip6h = NULL; 5272 } else { 5273 ipha = NULL; 5274 ip6h = (ip6_t *)mp->b_rptr; 5275 } 5276 mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira); 5277 if (mp == NULL) { 5278 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 5279 /* Note that mp is NULL */ 5280 ip_drop_input("ipIfStatsInDiscards", mp, NULL); 5281 CONN_DEC_REF(connp); 5282 return; 5283 } 5284 } 5285 5286 if (IPCL_IS_TCP(connp)) { 5287 /* 5288 * do not drain, certain use cases can blow 5289 * the stack 5290 */ 5291 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 5292 connp->conn_recv, connp, ira, 5293 SQ_NODRAIN, SQTAG_IP_TCP_INPUT); 5294 } else { 5295 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 5296 (connp->conn_recv)(connp, mp, NULL, 5297 ira); 5298 CONN_DEC_REF(connp); 5299 } 5300 5301 } 5302 5303 /* ARGSUSED */ 5304 static void 5305 tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 5306 { 5307 conn_t *connp = (conn_t *)arg; 5308 tcp_t *tcp = connp->conn_tcp; 5309 queue_t *q = connp->conn_rq; 5310 5311 ASSERT(!IPCL_IS_NONSTR(connp)); 5312 mutex_enter(&tcp->tcp_rsrv_mp_lock); 5313 tcp->tcp_rsrv_mp = mp; 5314 mutex_exit(&tcp->tcp_rsrv_mp_lock); 5315 5316 if (TCP_IS_DETACHED(tcp) || q == NULL) { 5317 return; 5318 } 5319 5320 if (tcp->tcp_fused) { 5321 tcp_fuse_backenable(tcp); 5322 return; 5323 } 5324 5325 if (canputnext(q)) { 5326 /* Not flow-controlled, open rwnd */ 5327 tcp->tcp_rwnd = connp->conn_rcvbuf; 5328 5329 /* 5330 * Send back a window update immediately if TCP is above 5331 * ESTABLISHED state and the increase of the rcv window 5332 
* that the other side knows is at least 1 MSS after flow 5333 * control is lifted. 5334 */ 5335 if (tcp->tcp_state >= TCPS_ESTABLISHED && 5336 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 5337 tcp_xmit_ctl(NULL, tcp, 5338 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 5339 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 5340 } 5341 } 5342 } 5343 5344 /* 5345 * The read side service routine is called mostly when we get back-enabled as a 5346 * result of flow control relief. Since we don't actually queue anything in 5347 * TCP, we have no data to send out of here. What we do is clear the receive 5348 * window, and send out a window update. 5349 */ 5350 void 5351 tcp_rsrv(queue_t *q) 5352 { 5353 conn_t *connp = Q_TO_CONN(q); 5354 tcp_t *tcp = connp->conn_tcp; 5355 mblk_t *mp; 5356 5357 /* No code does a putq on the read side */ 5358 ASSERT(q->q_first == NULL); 5359 5360 /* 5361 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already 5362 * been run. So just return. 5363 */ 5364 mutex_enter(&tcp->tcp_rsrv_mp_lock); 5365 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 5366 mutex_exit(&tcp->tcp_rsrv_mp_lock); 5367 return; 5368 } 5369 tcp->tcp_rsrv_mp = NULL; 5370 mutex_exit(&tcp->tcp_rsrv_mp_lock); 5371 5372 CONN_INC_REF(connp); 5373 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp, 5374 NULL, SQ_PROCESS, SQTAG_TCP_RSRV); 5375 } 5376 5377 /* At minimum we need 8 bytes in the TCP header for the lookup */ 5378 #define ICMP_MIN_TCP_HDR 8 5379 5380 /* 5381 * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages 5382 * passed up by IP. The message is always received on the correct tcp_t. 5383 * Assumes that IP has pulled up everything up to and including the ICMP header. 5384 */ 5385 /* ARGSUSED2 */ 5386 void 5387 tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 5388 { 5389 conn_t *connp = (conn_t *)arg1; 5390 icmph_t *icmph; 5391 ipha_t *ipha; 5392 int iph_hdr_length; 5393 tcpha_t *tcpha; 5394 uint32_t seg_seq; 5395 tcp_t *tcp = connp->conn_tcp; 5396 5397 /* Assume IP provides aligned packets */ 5398 ASSERT(OK_32PTR(mp->b_rptr)); 5399 ASSERT((MBLKL(mp) >= sizeof (ipha_t))); 5400 5401 /* 5402 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent 5403 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 5404 */ 5405 if (!(ira->ira_flags & IRAF_IS_IPV4)) { 5406 tcp_icmp_error_ipv6(tcp, mp, ira); 5407 return; 5408 } 5409 5410 /* Skip past the outer IP and ICMP headers */ 5411 iph_hdr_length = ira->ira_ip_hdr_length; 5412 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 5413 /* 5414 * If we don't have the correct outer IP header length 5415 * or if we don't have a complete inner IP header 5416 * drop it. 5417 */ 5418 if (iph_hdr_length < sizeof (ipha_t) || 5419 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { 5420 noticmpv4: 5421 freemsg(mp); 5422 return; 5423 } 5424 ipha = (ipha_t *)&icmph[1]; 5425 5426 /* Skip past the inner IP and find the ULP header */ 5427 iph_hdr_length = IPH_HDR_LENGTH(ipha); 5428 tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length); 5429 /* 5430 * If we don't have the correct inner IP header length or if the ULP 5431 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR 5432 * bytes of TCP header, drop it. 

/*
 * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
 * passed up by IP. The message is always received on the correct tcp_t.
 * Assumes that IP has pulled up everything up to and including the ICMP
 * header.
 */
/* ARGSUSED2 */
void
tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t *connp = (conn_t *)arg1;
	icmph_t *icmph;
	ipha_t *ipha;
	int iph_hdr_length;
	tcpha_t *tcpha;
	uint32_t seg_seq;
	tcp_t *tcp = connp->conn_tcp;

	/* Assume IP provides aligned packets */
	ASSERT(OK_32PTR(mp->b_rptr));
	ASSERT((MBLKL(mp) >= sizeof (ipha_t)));

	/*
	 * Verify the IP version. If this is not an IPv4 packet, it must be
	 * IPv6; ICMPv6 errors are handled in tcp_icmp_error_ipv6.
	 */
	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
		tcp_icmp_error_ipv6(tcp, mp, ira);
		return;
	}

	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	/*
	 * If we don't have the correct outer IP header length
	 * or if we don't have a complete inner IP header,
	 * drop it.
	 */
	if (iph_hdr_length < sizeof (ipha_t) ||
	    (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
noticmpv4:
		freemsg(mp);
		return;
	}
	ipha = (ipha_t *)&icmph[1];

	/* Skip past the inner IP and find the ULP header */
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
	/*
	 * If we don't have the correct inner IP header length or if the ULP
	 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
	 * bytes of TCP header, drop it.
	 */
	if (iph_hdr_length < sizeof (ipha_t) ||
	    ipha->ipha_protocol != IPPROTO_TCP ||
	    (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
		goto noticmpv4;
	}

	seg_seq = ntohl(tcpha->tha_seq);
	switch (icmph->icmph_type) {
	case ICMP_DEST_UNREACHABLE:
		switch (icmph->icmph_code) {
		case ICMP_FRAGMENTATION_NEEDED:
			/*
			 * Update Path MTU, then try to send something out.
			 */
			tcp_update_pmtu(tcp, B_TRUE);
			tcp_rexmit_after_error(tcp);
			break;
		case ICMP_PORT_UNREACHABLE:
		case ICMP_PROTOCOL_UNREACHABLE:
			switch (tcp->tcp_state) {
			case TCPS_SYN_SENT:
			case TCPS_SYN_RCVD:
				/*
				 * ICMP can snipe away incipient
				 * TCP connections as long as the
				 * seq number is the same as the
				 * initial send seq number.
				 */
				if (seg_seq == tcp->tcp_iss) {
					(void) tcp_clean_death(tcp,
					    ECONNREFUSED);
				}
				break;
			}
			break;
		case ICMP_HOST_UNREACHABLE:
		case ICMP_NET_UNREACHABLE:
			/* Record the error in case we finally time out. */
			if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
				tcp->tcp_client_errno = EHOSTUNREACH;
			else
				tcp->tcp_client_errno = ENETUNREACH;
			if (tcp->tcp_state == TCPS_SYN_RCVD) {
				if (tcp->tcp_listener != NULL &&
				    tcp->tcp_listener->tcp_syn_defense) {
					/*
					 * Ditch the half-open connection if we
					 * suspect a SYN attack is under way.
					 */
					(void) tcp_clean_death(tcp,
					    tcp->tcp_client_errno);
				}
			}
			break;
		default:
			break;
		}
		break;
	case ICMP_SOURCE_QUENCH: {
		/*
		 * Use a global boolean to control
		 * whether TCP should respond to ICMP_SOURCE_QUENCH.
		 * The default is false.
		 */
		if (tcp_icmp_source_quench) {
			/*
			 * Reduce the sending rate as if we got a
			 * retransmit timeout.
			 */
			uint32_t npkt;

			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
			    tcp->tcp_mss;
			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
			tcp->tcp_cwnd = tcp->tcp_mss;
			tcp->tcp_cwnd_cnt = 0;
		}
		break;
	}
	}
	freemsg(mp);
}
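
/*
 * Worked example (hypothetical values) of the ICMP_SOURCE_QUENCH
 * reaction above, taken only when tcp_icmp_source_quench is set:
 * with 32768 bytes outstanding (tcp_snxt - tcp_suna) and
 * tcp_mss = 1460,
 *
 *	npkt			= (32768 >> 1) / 1460 = 11
 *	tcp_cwnd_ssthresh	= MAX(11, 2) * 1460 = 16060
 *	tcp_cwnd		= 1460
 *	tcp_cwnd_cnt		= 0
 *
 * i.e. the congestion window collapses to a single segment and must be
 * rebuilt by slow start, the same response as a retransmission timeout.
 */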

/*
 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
 * error messages passed up by IP.
 * Assumes that IP has pulled up all the extension headers as well
 * as the ICMPv6 header.
 */
static void
tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t *icmp6;
	ip6_t *ip6h;
	uint16_t iph_hdr_length = ira->ira_ip_hdr_length;
	tcpha_t *tcpha;
	uint8_t *nexthdrp;
	uint32_t seg_seq;

	/*
	 * Verify that we have a complete IP header.
	 */
	ASSERT((MBLKL(mp) >= sizeof (ip6_t)));

	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
	ip6h = (ip6_t *)&icmp6[1];
	/*
	 * Verify that we have a complete ICMP and inner IP header.
	 */
	if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
noticmpv6:
		freemsg(mp);
		return;
	}

	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
		goto noticmpv6;
	tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
	/*
	 * Validate the inner header. If the ULP is not IPPROTO_TCP or if we
	 * don't have at least ICMP_MIN_TCP_HDR bytes of TCP header, drop the
	 * packet.
	 */
	if ((*nexthdrp != IPPROTO_TCP) ||
	    ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
		goto noticmpv6;
	}

	seg_seq = ntohl(tcpha->tha_seq);
	switch (icmp6->icmp6_type) {
	case ICMP6_PACKET_TOO_BIG:
		/*
		 * Update Path MTU, then try to send something out.
		 */
		tcp_update_pmtu(tcp, B_TRUE);
		tcp_rexmit_after_error(tcp);
		break;
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
			    (seg_seq == tcp->tcp_iss)) {
				(void) tcp_clean_death(tcp, ECONNREFUSED);
			}
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Record the error in case we finally time out. */
			tcp->tcp_client_errno = EHOSTUNREACH;
			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
			    (seg_seq == tcp->tcp_iss)) {
				if (tcp->tcp_listener != NULL &&
				    tcp->tcp_listener->tcp_syn_defense) {
					/*
					 * Ditch the half-open connection if we
					 * suspect a SYN attack is under way.
					 */
					(void) tcp_clean_death(tcp,
					    tcp->tcp_client_errno);
				}
			}
			break;
		default:
			break;
		}
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			if (tcp->tcp_state == TCPS_SYN_SENT ||
			    tcp->tcp_state == TCPS_SYN_RCVD) {
				(void) tcp_clean_death(tcp, ECONNREFUSED);
			}
			break;
		}
		break;

	case ICMP6_TIME_EXCEEDED:
	default:
		break;
	}
	freemsg(mp);
}

/*
 * CALLED OUTSIDE OF SQUEUE! It cannot follow any pointers that tcp might
 * change. But it can refer to fields like tcp_suna and tcp_snxt.
 *
 * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
 * error messages received by IP. The message is always received on the
 * correct tcp_t.
 */
/* ARGSUSED */
boolean_t
tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
    ip_recv_attr_t *ira)
{
	tcpha_t *tcpha = (tcpha_t *)arg2;
	uint32_t seq = ntohl(tcpha->tha_seq);
	tcp_t *tcp = connp->conn_tcp;

	/*
	 * The TCP sequence number contained in the payload of the ICMP error
	 * message should be within the range SND.UNA <= SEG.SEQ < SND.NXT.
	 * Otherwise, the message is either a stale ICMP error, or an attack
	 * from the network. Fail the verification.
	 */
	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
		return (B_FALSE);

	/* For "too big" we also check the ignore flag */
	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(icmph != NULL);
		if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
		    tcp->tcp_tcps->tcps_ignore_path_mtu)
			return (B_FALSE);
	} else {
		ASSERT(icmp6 != NULL);
		if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
		    tcp->tcp_tcps->tcps_ignore_path_mtu)
			return (B_FALSE);
	}
	return (B_TRUE);
}
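
/*
 * Example (hypothetical sequence numbers) of the acceptance window used
 * by tcp_verifyicmp() above: with tcp_suna = 1000 and tcp_snxt = 5000,
 * an ICMP error quoting sequence 3000 is accepted, while 900 (stale) or
 * 5000 (not yet sent) is rejected.  SEQ_LT()/SEQ_GEQ() compare modulo
 * 2^32, so the check also behaves correctly across sequence number
 * wraparound.
 */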