1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2019 Joyent, Inc. 26 * Copyright (c) 2014, 2016 by Delphix. All rights reserved. 27 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 28 * Copyright 2024 Oxide Computer Company 29 */ 30 31 /* This file contains all TCP input processing functions. */ 32 33 #include <sys/types.h> 34 #include <sys/stream.h> 35 #include <sys/strsun.h> 36 #include <sys/strsubr.h> 37 #include <sys/stropts.h> 38 #include <sys/strlog.h> 39 #define _SUN_TPI_VERSION 2 40 #include <sys/tihdr.h> 41 #include <sys/suntpi.h> 42 #include <sys/xti_inet.h> 43 #include <sys/squeue_impl.h> 44 #include <sys/squeue.h> 45 #include <sys/tsol/tnet.h> 46 47 #include <inet/common.h> 48 #include <inet/ip.h> 49 #include <inet/tcp.h> 50 #include <inet/tcp_impl.h> 51 #include <inet/tcp_cluster.h> 52 #include <inet/proto_set.h> 53 #include <inet/ipsec_impl.h> 54 #include <inet/tcp_sig.h> 55 56 /* 57 * RFC7323-recommended phrasing of TSTAMP option, for easier parsing 58 */ 59 60 #ifdef _BIG_ENDIAN 61 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 62 (TCPOPT_TSTAMP << 8) | 10) 63 #else 64 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 65 (TCPOPT_NOP << 8) | TCPOPT_NOP) 66 #endif 67 68 /* 69 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days 70 */ 71 #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) 72 73 /* 74 * Since tcp_listener is not cleared atomically with tcp_detached 75 * being cleared we need this extra bit to tell a detached connection 76 * apart from one that is in the process of being accepted. 77 */ 78 #define TCP_IS_DETACHED_NONEAGER(tcp) \ 79 (TCP_IS_DETACHED(tcp) && \ 80 (!(tcp)->tcp_hard_binding)) 81 82 /* 83 * Steps to do when a tcp_t moves to TIME-WAIT state. 84 * 85 * This connection is done, we don't need to account for it. Decrement 86 * the listener connection counter if needed. 87 * 88 * Decrement the connection counter of the stack. Note that this counter 89 * is per CPU. So the total number of connections in a stack is the sum of all 90 * of them. Since there is no lock for handling all of them exclusively, the 91 * resulting sum is only an approximation. 92 * 93 * Unconditionally clear the exclusive binding bit so this TIME-WAIT 94 * connection won't interfere with new ones. 95 * 96 * Start the TIME-WAIT timer. If upper layer has not closed the connection, 97 * the timer is handled within the context of this tcp_t. When the timer 98 * fires, tcp_clean_death() is called. 
 * If upper layer closes the connection
 * during this period, tcp_time_wait_append() will be called to add this
 * tcp_t to the global TIME-WAIT list. Note that this means that the
 * actual wait time in TIME-WAIT state will be longer than the
 * tcps_time_wait_interval since the period before upper layer closes the
 * connection is not accounted for when tcp_time_wait_append() is called.
 *
 * If upper layer has closed the connection, call tcp_time_wait_append()
 * directly.
 *
 */
#define	SET_TIME_WAIT(tcps, tcp, connp)					\
{									\
	(tcp)->tcp_state = TCPS_TIME_WAIT;				\
	if ((tcp)->tcp_listen_cnt != NULL)				\
		TCP_DECR_LISTEN_CNT(tcp);				\
	atomic_dec_64(							\
	    (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt); \
	(connp)->conn_exclbind = 0;					\
	if (!TCP_IS_DETACHED(tcp)) {					\
		TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \
	} else {							\
		tcp_time_wait_append(tcp);				\
		TCP_DBGSTAT(tcps, tcp_rput_time_wait);			\
	}								\
}

/*
 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
 * data, TCP will not respond with an ACK. RFC 793 requires that
 * TCP respond with an ACK for such a bogus ACK. By not following
 * the RFC, we prevent TCP from getting into an ACK storm if somehow
 * an attacker successfully spoofs an acceptable segment to our
 * peer; or when our peer is "confused."
 */
static uint32_t tcp_drop_ack_unsent_cnt = 10;

/*
 * To protect TCP against an attacker using a small window and requesting
 * a large amount of data (a DoS attack by consuming memory), TCP checks the
 * window advertised in the last ACK of the 3-way handshake. TCP uses
 * the tcp_mss (the size of one packet) value for comparison. The window
 * should be larger than tcp_mss. But while a sane TCP should advertise
 * a receive window larger than or equal to 4*MSS to avoid stop-and-go
 * traffic, not all TCP stacks do that. This is especially true when
 * tcp_mss is a big value.
 *
 * To work around this issue, an additional fixed value for comparison
 * is also used. If the advertised window is smaller than both tcp_mss
 * and tcp_init_wnd_chk, the ACK is considered invalid. So for a large
 * tcp_mss value (say, 8K), a window larger than tcp_init_wnd_chk but
 * smaller than 8K is considered to be OK.
 */
static uint32_t tcp_init_wnd_chk = 4096;

/* Process ICMP source quench message or not.
*/ 155 static boolean_t tcp_icmp_source_quench = B_FALSE; 156 157 static boolean_t tcp_outbound_squeue_switch = B_FALSE; 158 159 static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *, 160 ip_recv_attr_t *); 161 static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *, 162 ip_recv_attr_t *); 163 static boolean_t tcp_drop_q0(tcp_t *); 164 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); 165 static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *, 166 ip_recv_attr_t *); 167 static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *); 168 static boolean_t tcp_process_options(mblk_t *mp, tcp_t *, tcpha_t *, 169 ip_recv_attr_t *, boolean_t); 170 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); 171 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); 172 static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); 173 static void tcp_set_rto(tcp_t *, hrtime_t); 174 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); 175 176 /* 177 * CC wrapper hook functions 178 */ 179 static void 180 cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked, 181 uint16_t type) 182 { 183 uint32_t old_cwnd = tcp->tcp_cwnd; 184 185 tcp->tcp_ccv.bytes_this_ack = bytes_acked; 186 if (tcp->tcp_cwnd <= tcp->tcp_swnd) 187 tcp->tcp_ccv.flags |= CCF_CWND_LIMITED; 188 else 189 tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED; 190 191 if (type == CC_ACK) { 192 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 193 if (tcp->tcp_ccv.flags & CCF_RTO) 194 tcp->tcp_ccv.flags &= ~CCF_RTO; 195 196 tcp->tcp_ccv.t_bytes_acked += 197 min(tcp->tcp_ccv.bytes_this_ack, 198 tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss); 199 if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) { 200 tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd; 201 tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND; 202 } 203 } else { 204 tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND; 205 tcp->tcp_ccv.t_bytes_acked = 0; 206 } 207 } 208 209 if (CC_ALGO(tcp)->ack_received != NULL) { 210 /* 211 * The FreeBSD code where this originated had a comment "Find 212 * a way to live without this" in several places where curack 213 * got set. If they eventually dump curack from the cc 214 * variables, we'll need to adapt our code. 215 */ 216 tcp->tcp_ccv.curack = seg_ack; 217 CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type); 218 } 219 220 DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd, 221 uint32_t, tcp->tcp_cwnd); 222 } 223 224 void 225 cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type) 226 { 227 uint32_t old_cwnd = tcp->tcp_cwnd; 228 uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh; 229 switch (type) { 230 case CC_NDUPACK: 231 if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) { 232 tcp->tcp_rexmit_max = tcp->tcp_snxt; 233 if (tcp->tcp_ecn_ok) { 234 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 235 tcp->tcp_cwr = B_TRUE; 236 tcp->tcp_ecn_cwr_sent = B_FALSE; 237 } 238 } 239 break; 240 case CC_ECN: 241 if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) { 242 tcp->tcp_rexmit_max = tcp->tcp_snxt; 243 if (tcp->tcp_ecn_ok) { 244 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 245 tcp->tcp_cwr = B_TRUE; 246 tcp->tcp_ecn_cwr_sent = B_FALSE; 247 } 248 } 249 break; 250 case CC_RTO: 251 tcp->tcp_ccv.flags |= CCF_RTO; 252 tcp->tcp_dupack_cnt = 0; 253 tcp->tcp_ccv.t_bytes_acked = 0; 254 /* 255 * Give up on fast recovery and congestion recovery if we were 256 * attempting either. 
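 *
 * As an illustration of the RFC 5681 fallback below (used only when the
 * CC algorithm provides no cong_signal hook; numbers are made up): with
 * 100000 bytes outstanding (tcp_snxt - tcp_suna) and an MSS of 1460,
 * ssthresh becomes max(100000 / 2 / 1460, 2) * 1460 = 34 * 1460 = 49640
 * bytes, and cwnd restarts from a single MSS.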
257 */ 258 EXIT_RECOVERY(tcp->tcp_ccv.flags); 259 if (CC_ALGO(tcp)->cong_signal == NULL) { 260 /* 261 * RFC5681 Section 3.1 262 * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4) 263 */ 264 tcp->tcp_cwnd_ssthresh = max( 265 (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss, 266 2) * tcp->tcp_mss; 267 tcp->tcp_cwnd = tcp->tcp_mss; 268 } 269 270 if (tcp->tcp_ecn_ok) { 271 tcp->tcp_cwr = B_TRUE; 272 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 273 tcp->tcp_ecn_cwr_sent = B_FALSE; 274 } 275 break; 276 } 277 278 if (CC_ALGO(tcp)->cong_signal != NULL) { 279 tcp->tcp_ccv.curack = seg_ack; 280 CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type); 281 } 282 283 DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd, 284 uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh, 285 uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type); 286 } 287 288 static void 289 cc_post_recovery(tcp_t *tcp, uint32_t seg_ack) 290 { 291 uint32_t old_cwnd = tcp->tcp_cwnd; 292 293 if (CC_ALGO(tcp)->post_recovery != NULL) { 294 tcp->tcp_ccv.curack = seg_ack; 295 CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv); 296 } 297 tcp->tcp_ccv.t_bytes_acked = 0; 298 299 DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp, 300 uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd); 301 } 302 303 /* 304 * Set the MSS associated with a particular tcp based on its current value, 305 * and a new one passed in. Observe minimums and maximums, and reset other 306 * state variables that we want to view as multiples of MSS. 307 * 308 * The value of MSS could be either increased or descreased. 309 */ 310 void 311 tcp_mss_set(tcp_t *tcp, uint32_t mss) 312 { 313 uint32_t mss_max; 314 tcp_stack_t *tcps = tcp->tcp_tcps; 315 conn_t *connp = tcp->tcp_connp; 316 317 if (connp->conn_ipversion == IPV4_VERSION) 318 mss_max = tcps->tcps_mss_max_ipv4; 319 else 320 mss_max = tcps->tcps_mss_max_ipv6; 321 322 if (mss < tcps->tcps_mss_min) 323 mss = tcps->tcps_mss_min; 324 if (mss > mss_max) 325 mss = mss_max; 326 /* 327 * Unless naglim has been set by our client to 328 * a non-mss value, force naglim to track mss. 329 * This can help to aggregate small writes. 330 */ 331 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 332 tcp->tcp_naglim = mss; 333 /* 334 * TCP should be able to buffer at least 4 MSS data for obvious 335 * performance reason. 336 */ 337 if ((mss << 2) > connp->conn_sndbuf) 338 connp->conn_sndbuf = mss << 2; 339 340 /* 341 * Set the send lowater to at least twice of MSS. 342 */ 343 if ((mss << 1) > connp->conn_sndlowat) 344 connp->conn_sndlowat = mss << 1; 345 346 /* 347 * Update tcp_cwnd according to the new value of MSS. Keep the 348 * previous ratio to preserve the transmit rate. 349 */ 350 tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; 351 tcp->tcp_cwnd_cnt = 0; 352 353 tcp->tcp_mss = mss; 354 (void) tcp_maxpsz_set(tcp, B_TRUE); 355 } 356 357 /* 358 * Extract option values from a tcp header. We put any found values into the 359 * tcpopt struct and return a bitmask saying which options were found. 360 */ 361 int 362 tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) 363 { 364 uchar_t *endp; 365 int len; 366 uint32_t mss; 367 uchar_t *up = (uchar_t *)tcpha; 368 int found = 0; 369 int32_t sack_len; 370 tcp_seq sack_begin, sack_end; 371 tcp_t *tcp; 372 373 endp = up + TCP_HDR_LENGTH(tcpha); 374 up += TCP_MIN_HEADER_LENGTH; 375 /* 376 * If timestamp option is aligned as recommended in RFC 7323 Appendix 377 * A, and is the only option, return quickly. 
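 *
 * For illustration, assuming the standard option values: the recommended
 * layout is <NOP><NOP><TSTAMP><length 10>, i.e. the bytes
 * 0x01 0x01 0x08 0x0a, so a single aligned 32-bit word compare against
 * TCPOPT_NOP_NOP_TSTAMP (defined earlier for both byte orders) replaces
 * a byte-by-byte walk of the option list.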
378 */ 379 if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + 380 TCPOPT_REAL_TS_LEN && 381 OK_32PTR(up) && 382 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 383 tcpopt->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 384 tcpopt->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 385 386 return (TCP_OPT_TSTAMP_PRESENT); 387 } 388 while (up < endp) { 389 len = endp - up; 390 switch (*up) { 391 case TCPOPT_EOL: 392 break; 393 394 case TCPOPT_NOP: 395 up++; 396 continue; 397 398 case TCPOPT_MAXSEG: 399 if (len < TCPOPT_MAXSEG_LEN || 400 up[1] != TCPOPT_MAXSEG_LEN) 401 break; 402 403 mss = BE16_TO_U16(up+2); 404 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 405 tcpopt->tcp_opt_mss = mss; 406 found |= TCP_OPT_MSS_PRESENT; 407 408 up += TCPOPT_MAXSEG_LEN; 409 continue; 410 411 case TCPOPT_WSCALE: 412 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 413 break; 414 415 if (up[2] > TCP_MAX_WINSHIFT) 416 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 417 else 418 tcpopt->tcp_opt_wscale = up[2]; 419 found |= TCP_OPT_WSCALE_PRESENT; 420 421 up += TCPOPT_WS_LEN; 422 continue; 423 424 case TCPOPT_SACK_PERMITTED: 425 if (len < TCPOPT_SACK_OK_LEN || 426 up[1] != TCPOPT_SACK_OK_LEN) 427 break; 428 found |= TCP_OPT_SACK_OK_PRESENT; 429 up += TCPOPT_SACK_OK_LEN; 430 continue; 431 432 case TCPOPT_SACK: 433 if (len <= 2 || up[1] <= 2 || len < up[1]) 434 break; 435 436 /* If TCP is not interested in SACK blks... */ 437 if ((tcp = tcpopt->tcp) == NULL) { 438 up += up[1]; 439 continue; 440 } 441 sack_len = up[1] - TCPOPT_HEADER_LEN; 442 up += TCPOPT_HEADER_LEN; 443 444 /* 445 * If the list is empty, allocate one and assume 446 * nothing is sack'ed. 447 */ 448 if (tcp->tcp_notsack_list == NULL) { 449 tcp_notsack_update(&(tcp->tcp_notsack_list), 450 tcp->tcp_suna, tcp->tcp_snxt, 451 &(tcp->tcp_num_notsack_blk), 452 &(tcp->tcp_cnt_notsack_list)); 453 454 /* 455 * Make sure tcp_notsack_list is not NULL. 456 * This happens when kmem_alloc(KM_NOSLEEP) 457 * returns NULL. 458 */ 459 if (tcp->tcp_notsack_list == NULL) { 460 up += sack_len; 461 continue; 462 } 463 tcp->tcp_fack = tcp->tcp_suna; 464 } 465 466 while (sack_len > 0) { 467 if (up + 8 > endp) { 468 up = endp; 469 break; 470 } 471 sack_begin = BE32_TO_U32(up); 472 up += 4; 473 sack_end = BE32_TO_U32(up); 474 up += 4; 475 sack_len -= 8; 476 /* 477 * Bounds checking. Make sure the SACK 478 * info is within tcp_suna and tcp_snxt. 479 * If this SACK blk is out of bound, ignore 480 * it but continue to parse the following 481 * blks. 
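 *
 * For example (illustrative numbers only): with tcp_suna = 1000 and
 * tcp_snxt = 5000, a SACK block of [2000, 3000) is recorded, while
 * blocks such as [4000, 6000) or [500, 1500) fall outside the
 * suna..snxt range and are skipped.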
482 */ 483 if (SEQ_LEQ(sack_end, sack_begin) || 484 SEQ_LT(sack_begin, tcp->tcp_suna) || 485 SEQ_GT(sack_end, tcp->tcp_snxt)) { 486 continue; 487 } 488 tcp_notsack_insert(&(tcp->tcp_notsack_list), 489 sack_begin, sack_end, 490 &(tcp->tcp_num_notsack_blk), 491 &(tcp->tcp_cnt_notsack_list)); 492 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 493 tcp->tcp_fack = sack_end; 494 } 495 } 496 found |= TCP_OPT_SACK_PRESENT; 497 continue; 498 499 case TCPOPT_TSTAMP: 500 if (len < TCPOPT_TSTAMP_LEN || 501 up[1] != TCPOPT_TSTAMP_LEN) 502 break; 503 504 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 505 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 506 507 found |= TCP_OPT_TSTAMP_PRESENT; 508 509 up += TCPOPT_TSTAMP_LEN; 510 continue; 511 512 case TCPOPT_MD5: 513 if (len < TCPOPT_MD5_LEN || up[1] != TCPOPT_MD5_LEN) 514 break; 515 516 bcopy(up + 2, tcpopt->tcp_opt_sig, 517 sizeof (tcpopt->tcp_opt_sig)); 518 519 found |= TCP_OPT_SIG_PRESENT; 520 up += TCPOPT_MD5_LEN; 521 continue; 522 523 default: 524 if (len <= 1 || len < (int)up[1] || up[1] == 0) 525 break; 526 up += up[1]; 527 continue; 528 } 529 break; 530 } 531 return (found); 532 } 533 534 /* 535 * Process all TCP option in SYN segment. Note that this function should 536 * be called after tcp_set_destination() is called so that the necessary info 537 * from IRE is already set in the tcp structure. 538 * 539 * This function sets up the correct tcp_mss value according to the 540 * MSS option value and our header size. It also sets up the window scale 541 * and timestamp values, and initialize SACK info blocks. But it does not 542 * change receive window size after setting the tcp_mss value. The caller 543 * should do the appropriate change. 544 */ 545 static boolean_t 546 tcp_process_options(mblk_t *mp, tcp_t *tcp, tcpha_t *tcpha, ip_recv_attr_t *ira, 547 boolean_t incoming) 548 { 549 int options; 550 tcp_opt_t tcpopt; 551 uint32_t mss_max; 552 char *tmp_tcph; 553 tcp_stack_t *tcps = tcp->tcp_tcps; 554 conn_t *connp = tcp->tcp_connp; 555 556 tcpopt.tcp = NULL; 557 options = tcp_parse_options(tcpha, &tcpopt); 558 559 if (tcp->tcp_md5sig) { 560 if ((options & TCP_OPT_SIG_PRESENT)) { 561 if (!tcpsig_verify(mp->b_cont, tcp, tcpha, ira, 562 tcpopt.tcp_opt_sig)) { 563 return (B_FALSE); 564 } 565 } else if (incoming) { 566 567 /* 568 * This is a SYN packet for a listener which has the 569 * TCP_MD5SIG option enabled, but the incoming SYN did 570 * not contain a signature. If there is a configured SA 571 * for this connection we must silently drop the 572 * incoming packet. Otherwise we will gracefully 573 * degrade to a connection without the option enabled. 574 */ 575 if (tcpsig_sa_exists(tcp, true, NULL)) { 576 TCP_STAT(tcp->tcp_tcps, tcp_sig_no_option); 577 return (B_FALSE); 578 } 579 TCP_STAT(tcp->tcp_tcps, tcp_sig_degraded); 580 tcp->tcp_md5sig = 0; 581 } else { 582 TCP_STAT(tcp->tcp_tcps, tcp_sig_no_option); 583 return (B_FALSE); 584 } 585 } 586 587 /* 588 * Process MSS option. Note that MSS option value does not account 589 * for IP or TCP options. This means that it is equal to MTU - minimum 590 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 591 * IPv6. 
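 *
 * For example, on a link with a 1500 byte MTU the peer would normally
 * advertise an MSS of 1460 over IPv4 or 1440 over IPv6, regardless of
 * any TCP or IP options it actually intends to send.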
592 */ 593 if (!(options & TCP_OPT_MSS_PRESENT)) { 594 if (connp->conn_ipversion == IPV4_VERSION) 595 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; 596 else 597 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; 598 } else { 599 if (connp->conn_ipversion == IPV4_VERSION) 600 mss_max = tcps->tcps_mss_max_ipv4; 601 else 602 mss_max = tcps->tcps_mss_max_ipv6; 603 if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) 604 tcpopt.tcp_opt_mss = tcps->tcps_mss_min; 605 else if (tcpopt.tcp_opt_mss > mss_max) 606 tcpopt.tcp_opt_mss = mss_max; 607 } 608 609 /* Process Window Scale option. */ 610 if (options & TCP_OPT_WSCALE_PRESENT) { 611 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 612 tcp->tcp_snd_ws_ok = B_TRUE; 613 } else { 614 tcp->tcp_snd_ws = B_FALSE; 615 tcp->tcp_snd_ws_ok = B_FALSE; 616 tcp->tcp_rcv_ws = B_FALSE; 617 } 618 619 /* Process Timestamp option. */ 620 if ((options & TCP_OPT_TSTAMP_PRESENT) && 621 (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { 622 tmp_tcph = (char *)tcp->tcp_tcpha; 623 624 tcp->tcp_snd_ts_ok = B_TRUE; 625 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 626 tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 627 ASSERT(OK_32PTR(tmp_tcph)); 628 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); 629 630 /* Fill in our template header with basic timestamp option. */ 631 tmp_tcph += connp->conn_ht_ulp_len; 632 tmp_tcph[0] = TCPOPT_NOP; 633 tmp_tcph[1] = TCPOPT_NOP; 634 tmp_tcph[2] = TCPOPT_TSTAMP; 635 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 636 connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; 637 connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; 638 tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); 639 } else { 640 tcp->tcp_snd_ts_ok = B_FALSE; 641 } 642 643 /* 644 * Process SACK options. If SACK is enabled for this connection, 645 * then allocate the SACK info structure. Note the following ways 646 * when tcp_snd_sack_ok is set to true. 647 * 648 * For active connection: in tcp_set_destination() called in 649 * tcp_connect(). 650 * 651 * For passive connection: in tcp_set_destination() called in 652 * tcp_input_listener(). 653 * 654 * That's the reason why the extra TCP_IS_DETACHED() check is there. 655 * That check makes sure that if we did not send a SACK OK option, 656 * we will not enable SACK for this connection even though the other 657 * side sends us SACK OK option. For active connection, the SACK 658 * info structure has already been allocated. So we need to free 659 * it if SACK is disabled. 660 */ 661 if ((options & TCP_OPT_SACK_OK_PRESENT) && 662 (tcp->tcp_snd_sack_ok || 663 (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { 664 ASSERT(tcp->tcp_num_sack_blk == 0); 665 ASSERT(tcp->tcp_notsack_list == NULL); 666 667 tcp->tcp_snd_sack_ok = B_TRUE; 668 if (tcp->tcp_snd_ts_ok) { 669 tcp->tcp_max_sack_blk = 3; 670 } else { 671 tcp->tcp_max_sack_blk = 4; 672 } 673 } else if (tcp->tcp_snd_sack_ok) { 674 /* 675 * Resetting tcp_snd_sack_ok to B_FALSE so that 676 * no SACK info will be used for this 677 * connection. This assumes that SACK usage 678 * permission is negotiated. This may need 679 * to be changed once this is clarified. 680 */ 681 ASSERT(tcp->tcp_num_sack_blk == 0); 682 ASSERT(tcp->tcp_notsack_list == NULL); 683 tcp->tcp_snd_sack_ok = B_FALSE; 684 } 685 686 /* 687 * Now we know the exact TCP/IP header length, subtract 688 * that from tcp_mss to get our side's MSS. 689 */ 690 tcp->tcp_mss -= connp->conn_ht_iphc_len; 691 692 /* 693 * Here we assume that the other side's header size will be equal to 694 * our header size. We calculate the real MSS accordingly. 
Need to 695 * take into additional stuffs IPsec puts in. 696 * 697 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 698 */ 699 tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + 700 tcp->tcp_ipsec_overhead - 701 ((connp->conn_ipversion == IPV4_VERSION ? 702 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); 703 704 /* 705 * Set MSS to the smaller one of both ends of the connection. 706 * We should not have called tcp_mss_set() before, but our 707 * side of the MSS should have been set to a proper value 708 * by tcp_set_destination(). tcp_mss_set() will also set up the 709 * STREAM head parameters properly. 710 * 711 * If we have a larger-than-16-bit window but the other side 712 * didn't want to do window scale, tcp_rwnd_set() will take 713 * care of that. 714 */ 715 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 716 717 /* 718 * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been 719 * updated properly. 720 */ 721 TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); 722 723 if (tcp->tcp_cc_algo->conn_init != NULL) 724 tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv); 725 726 return (B_TRUE); 727 } 728 729 /* 730 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 731 * is filled, return as much as we can. The message passed in may be 732 * multi-part, chained using b_cont. "start" is the starting sequence 733 * number for this piece. 734 */ 735 static mblk_t * 736 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 737 { 738 uint32_t end, bytes; 739 mblk_t *mp1; 740 mblk_t *mp2; 741 mblk_t *next_mp; 742 uint32_t u1; 743 tcp_stack_t *tcps = tcp->tcp_tcps; 744 745 746 /* Walk through all the new pieces. */ 747 do { 748 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 749 (uintptr_t)INT_MAX); 750 end = start + (int)(mp->b_wptr - mp->b_rptr); 751 next_mp = mp->b_cont; 752 if (start == end) { 753 /* Empty. Blast it. */ 754 freeb(mp); 755 continue; 756 } 757 bytes = end - start; 758 mp->b_cont = NULL; 759 TCP_REASS_SET_SEQ(mp, start); 760 TCP_REASS_SET_END(mp, end); 761 mp1 = tcp->tcp_reass_tail; 762 if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) { 763 if (mp1 != NULL) { 764 /* 765 * New stuff is beyond the tail; link it on the 766 * end. 767 */ 768 mp1->b_cont = mp; 769 } else { 770 tcp->tcp_reass_head = mp; 771 } 772 tcp->tcp_reass_tail = mp; 773 TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); 774 TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes); 775 tcp->tcp_cs.tcp_in_data_unorder_segs++; 776 tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes; 777 continue; 778 } 779 mp1 = tcp->tcp_reass_head; 780 u1 = TCP_REASS_SEQ(mp1); 781 /* New stuff at the front? */ 782 if (SEQ_LT(start, u1)) { 783 /* Yes... Check for overlap. */ 784 mp->b_cont = mp1; 785 tcp->tcp_reass_head = mp; 786 tcp_reass_elim_overlap(tcp, mp); 787 continue; 788 } 789 /* 790 * The new piece fits somewhere between the head and tail. 791 * We find our slot, where mp1 precedes us and mp2 trails. 792 */ 793 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 794 u1 = TCP_REASS_SEQ(mp2); 795 if (SEQ_LEQ(start, u1)) 796 break; 797 } 798 /* Link ourselves in */ 799 mp->b_cont = mp2; 800 mp1->b_cont = mp; 801 802 /* Trim overlap with following mblk(s) first */ 803 tcp_reass_elim_overlap(tcp, mp); 804 805 /* Trim overlap with preceding mblk */ 806 tcp_reass_elim_overlap(tcp, mp1); 807 808 } while (start = end, mp = next_mp); 809 mp1 = tcp->tcp_reass_head; 810 /* Anything ready to go? 
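 *
 * Illustration (made-up sequence numbers): if tcp_rnxt is 100 and the
 * queue holds segments covering [100, 200) and [200, 300), the whole
 * chain is handed back to the caller; if the head segment instead
 * started at 150, the gap at 100 is still open and NULL is returned.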
*/ 811 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 812 return (NULL); 813 /* Eat what we can off the queue */ 814 for (;;) { 815 mp = mp1->b_cont; 816 end = TCP_REASS_END(mp1); 817 TCP_REASS_SET_SEQ(mp1, 0); 818 TCP_REASS_SET_END(mp1, 0); 819 if (!mp) { 820 tcp->tcp_reass_tail = NULL; 821 break; 822 } 823 if (end != TCP_REASS_SEQ(mp)) { 824 mp1->b_cont = NULL; 825 break; 826 } 827 mp1 = mp; 828 } 829 mp1 = tcp->tcp_reass_head; 830 tcp->tcp_reass_head = mp; 831 return (mp1); 832 } 833 834 /* Eliminate any overlap that mp may have over later mblks */ 835 static void 836 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 837 { 838 uint32_t end; 839 mblk_t *mp1; 840 uint32_t u1; 841 tcp_stack_t *tcps = tcp->tcp_tcps; 842 843 end = TCP_REASS_END(mp); 844 while ((mp1 = mp->b_cont) != NULL) { 845 u1 = TCP_REASS_SEQ(mp1); 846 if (!SEQ_GT(end, u1)) 847 break; 848 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 849 mp->b_wptr -= end - u1; 850 TCP_REASS_SET_END(mp, u1); 851 TCPS_BUMP_MIB(tcps, tcpInDataPartDupSegs); 852 TCPS_UPDATE_MIB(tcps, tcpInDataPartDupBytes, 853 end - u1); 854 break; 855 } 856 mp->b_cont = mp1->b_cont; 857 TCP_REASS_SET_SEQ(mp1, 0); 858 TCP_REASS_SET_END(mp1, 0); 859 freeb(mp1); 860 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); 861 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, end - u1); 862 } 863 if (!mp1) 864 tcp->tcp_reass_tail = mp; 865 } 866 867 /* 868 * This function does PAWS protection check, per RFC 7323 section 5. Requires 869 * that timestamp options are already processed into tcpoptp. Returns B_TRUE if 870 * the segment passes the PAWS test, else returns B_FALSE. 871 */ 872 boolean_t 873 tcp_paws_check(tcp_t *tcp, const tcp_opt_t *tcpoptp) 874 { 875 if (TSTMP_LT(tcpoptp->tcp_opt_ts_val, 876 tcp->tcp_ts_recent)) { 877 if (LBOLT_FASTPATH64 < 878 (tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { 879 /* This segment is not acceptable. */ 880 return (B_FALSE); 881 } else { 882 /* 883 * Connection has been idle for 884 * too long. Reset the timestamp 885 */ 886 tcp->tcp_ts_recent = 887 tcpoptp->tcp_opt_ts_val; 888 } 889 } 890 return (B_TRUE); 891 } 892 893 /* 894 * Defense for the SYN attack - 895 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest 896 * one from the list of droppable eagers. This list is a subset of q0. 897 * see comments before the definition of MAKE_DROPPABLE(). 898 * 2. Don't drop a SYN request before its first timeout. This gives every 899 * request at least til the first timeout to complete its 3-way handshake. 900 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many 901 * requests currently on the queue that has timed out. This will be used 902 * as an indicator of whether an attack is under way, so that appropriate 903 * actions can be taken. (It's incremented in tcp_timer() and decremented 904 * either when eager goes into ESTABLISHED, or gets freed up.) 905 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on 906 * # of timeout drops back to <= q0len/32 => SYN alert off 907 */ 908 static boolean_t 909 tcp_drop_q0(tcp_t *tcp) 910 { 911 tcp_t *eager; 912 mblk_t *mp; 913 tcp_stack_t *tcps = tcp->tcp_tcps; 914 915 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); 916 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 917 918 /* Pick oldest eager from the list of droppable eagers */ 919 eager = tcp->tcp_eager_prev_drop_q0; 920 921 /* If list is empty. 
return B_FALSE */ 922 if (eager == tcp) { 923 return (B_FALSE); 924 } 925 926 /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ 927 if ((mp = allocb(0, BPRI_HI)) == NULL) 928 return (B_FALSE); 929 930 /* 931 * Take this eager out from the list of droppable eagers since we are 932 * going to drop it. 933 */ 934 MAKE_UNDROPPABLE(eager); 935 936 if (tcp->tcp_connp->conn_debug) { 937 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 938 "tcp_drop_q0: listen half-open queue (max=%d) overflow" 939 " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, 940 tcp->tcp_conn_req_cnt_q0, 941 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 942 } 943 944 TCPS_BUMP_MIB(tcps, tcpHalfOpenDrop); 945 946 /* Put a reference on the conn as we are enqueueing it in the sqeue */ 947 CONN_INC_REF(eager->tcp_connp); 948 949 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, 950 tcp_clean_death_wrapper, eager->tcp_connp, NULL, 951 SQ_FILL, SQTAG_TCP_DROP_Q0); 952 953 return (B_TRUE); 954 } 955 956 /* 957 * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6 958 */ 959 static mblk_t * 960 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 961 ip_recv_attr_t *ira) 962 { 963 tcp_t *ltcp = lconnp->conn_tcp; 964 tcp_t *tcp = connp->conn_tcp; 965 mblk_t *tpi_mp; 966 ipha_t *ipha; 967 ip6_t *ip6h; 968 sin6_t sin6; 969 uint_t ifindex = ira->ira_ruifindex; 970 tcp_stack_t *tcps = tcp->tcp_tcps; 971 972 if (ira->ira_flags & IRAF_IS_IPV4) { 973 ipha = (ipha_t *)mp->b_rptr; 974 975 connp->conn_ipversion = IPV4_VERSION; 976 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); 977 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); 978 connp->conn_saddr_v6 = connp->conn_laddr_v6; 979 980 sin6 = sin6_null; 981 sin6.sin6_addr = connp->conn_faddr_v6; 982 sin6.sin6_port = connp->conn_fport; 983 sin6.sin6_family = AF_INET6; 984 sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, 985 IPCL_ZONEID(lconnp), tcps->tcps_netstack); 986 987 if (connp->conn_recv_ancillary.crb_recvdstaddr) { 988 sin6_t sin6d; 989 990 sin6d = sin6_null; 991 sin6d.sin6_addr = connp->conn_laddr_v6; 992 sin6d.sin6_port = connp->conn_lport; 993 sin6d.sin6_family = AF_INET; 994 tpi_mp = mi_tpi_extconn_ind(NULL, 995 (char *)&sin6d, sizeof (sin6_t), 996 (char *)&tcp, 997 (t_scalar_t)sizeof (intptr_t), 998 (char *)&sin6d, sizeof (sin6_t), 999 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 1000 } else { 1001 tpi_mp = mi_tpi_conn_ind(NULL, 1002 (char *)&sin6, sizeof (sin6_t), 1003 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 1004 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 1005 } 1006 } else { 1007 ip6h = (ip6_t *)mp->b_rptr; 1008 1009 connp->conn_ipversion = IPV6_VERSION; 1010 connp->conn_laddr_v6 = ip6h->ip6_dst; 1011 connp->conn_faddr_v6 = ip6h->ip6_src; 1012 connp->conn_saddr_v6 = connp->conn_laddr_v6; 1013 1014 sin6 = sin6_null; 1015 sin6.sin6_addr = connp->conn_faddr_v6; 1016 sin6.sin6_port = connp->conn_fport; 1017 sin6.sin6_family = AF_INET6; 1018 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1019 sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, 1020 IPCL_ZONEID(lconnp), tcps->tcps_netstack); 1021 1022 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 1023 /* Pass up the scope_id of remote addr */ 1024 sin6.sin6_scope_id = ifindex; 1025 } else { 1026 sin6.sin6_scope_id = 0; 1027 } 1028 if (connp->conn_recv_ancillary.crb_recvdstaddr) { 1029 sin6_t sin6d; 1030 1031 sin6d = sin6_null; 1032 sin6.sin6_addr = connp->conn_laddr_v6; 1033 sin6d.sin6_port = connp->conn_lport; 1034 sin6d.sin6_family 
= AF_INET6; 1035 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) 1036 sin6d.sin6_scope_id = ifindex; 1037 1038 tpi_mp = mi_tpi_extconn_ind(NULL, 1039 (char *)&sin6d, sizeof (sin6_t), 1040 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 1041 (char *)&sin6d, sizeof (sin6_t), 1042 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 1043 } else { 1044 tpi_mp = mi_tpi_conn_ind(NULL, 1045 (char *)&sin6, sizeof (sin6_t), 1046 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 1047 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 1048 } 1049 } 1050 1051 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 1052 return (tpi_mp); 1053 } 1054 1055 /* Handle a SYN on an AF_INET socket */ 1056 static mblk_t * 1057 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, 1058 ip_recv_attr_t *ira) 1059 { 1060 tcp_t *ltcp = lconnp->conn_tcp; 1061 tcp_t *tcp = connp->conn_tcp; 1062 sin_t sin; 1063 mblk_t *tpi_mp = NULL; 1064 tcp_stack_t *tcps = tcp->tcp_tcps; 1065 ipha_t *ipha; 1066 1067 ASSERT(ira->ira_flags & IRAF_IS_IPV4); 1068 ipha = (ipha_t *)mp->b_rptr; 1069 1070 connp->conn_ipversion = IPV4_VERSION; 1071 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); 1072 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); 1073 connp->conn_saddr_v6 = connp->conn_laddr_v6; 1074 1075 sin = sin_null; 1076 sin.sin_addr.s_addr = connp->conn_faddr_v4; 1077 sin.sin_port = connp->conn_fport; 1078 sin.sin_family = AF_INET; 1079 if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { 1080 sin_t sind; 1081 1082 sind = sin_null; 1083 sind.sin_addr.s_addr = connp->conn_laddr_v4; 1084 sind.sin_port = connp->conn_lport; 1085 sind.sin_family = AF_INET; 1086 tpi_mp = mi_tpi_extconn_ind(NULL, 1087 (char *)&sind, sizeof (sin_t), (char *)&tcp, 1088 (t_scalar_t)sizeof (intptr_t), (char *)&sind, 1089 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); 1090 } else { 1091 tpi_mp = mi_tpi_conn_ind(NULL, 1092 (char *)&sin, sizeof (sin_t), 1093 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 1094 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 1095 } 1096 1097 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 1098 return (tpi_mp); 1099 } 1100 1101 /* 1102 * Called via squeue to get on to eager's perimeter. It sends a 1103 * TH_RST if eager is in the fanout table. The listener wants the 1104 * eager to disappear either by means of tcp_eager_blowoff() or 1105 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be 1106 * called (via squeue) if the eager cannot be inserted in the 1107 * fanout table in tcp_input_listener(). 1108 */ 1109 /* ARGSUSED */ 1110 void 1111 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1112 { 1113 conn_t *econnp = (conn_t *)arg; 1114 tcp_t *eager = econnp->conn_tcp; 1115 tcp_t *listener = eager->tcp_listener; 1116 1117 /* 1118 * We could be called because listener is closing. Since 1119 * the eager was using listener's queue's, we avoid 1120 * using the listeners queues from now on. 1121 */ 1122 ASSERT(eager->tcp_detached); 1123 econnp->conn_rq = NULL; 1124 econnp->conn_wq = NULL; 1125 1126 /* 1127 * An eager's conn_fanout will be NULL if it's a duplicate 1128 * for an existing 4-tuples in the conn fanout table. 1129 * We don't want to send an RST out in such case. 
1130 */ 1131 if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) { 1132 tcp_xmit_ctl("tcp_eager_kill, can't wait", 1133 eager, eager->tcp_snxt, 0, TH_RST); 1134 } 1135 1136 /* We are here because listener wants this eager gone */ 1137 if (listener != NULL) { 1138 mutex_enter(&listener->tcp_eager_lock); 1139 tcp_eager_unlink(eager); 1140 if (eager->tcp_tconnind_started) { 1141 /* 1142 * The eager has sent a conn_ind up to the 1143 * listener but listener decides to close 1144 * instead. We need to drop the extra ref 1145 * placed on eager in tcp_input_data() before 1146 * sending the conn_ind to listener. 1147 */ 1148 CONN_DEC_REF(econnp); 1149 } 1150 mutex_exit(&listener->tcp_eager_lock); 1151 CONN_DEC_REF(listener->tcp_connp); 1152 } 1153 1154 if (eager->tcp_state != TCPS_CLOSED) 1155 tcp_close_detached(eager); 1156 } 1157 1158 /* 1159 * Reset any eager connection hanging off this listener marked 1160 * with 'seqnum' and then reclaim it's resources. 1161 */ 1162 boolean_t 1163 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) 1164 { 1165 tcp_t *eager; 1166 mblk_t *mp; 1167 1168 eager = listener; 1169 mutex_enter(&listener->tcp_eager_lock); 1170 do { 1171 eager = eager->tcp_eager_next_q; 1172 if (eager == NULL) { 1173 mutex_exit(&listener->tcp_eager_lock); 1174 return (B_FALSE); 1175 } 1176 } while (eager->tcp_conn_req_seqnum != seqnum); 1177 1178 if (eager->tcp_closemp_used) { 1179 mutex_exit(&listener->tcp_eager_lock); 1180 return (B_TRUE); 1181 } 1182 eager->tcp_closemp_used = B_TRUE; 1183 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 1184 CONN_INC_REF(eager->tcp_connp); 1185 mutex_exit(&listener->tcp_eager_lock); 1186 mp = &eager->tcp_closemp; 1187 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, 1188 eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); 1189 return (B_TRUE); 1190 } 1191 1192 /* 1193 * Reset any eager connection hanging off this listener 1194 * and then reclaim it's resources. 1195 */ 1196 void 1197 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 1198 { 1199 tcp_t *eager; 1200 mblk_t *mp; 1201 tcp_stack_t *tcps = listener->tcp_tcps; 1202 1203 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 1204 1205 if (!q0_only) { 1206 /* First cleanup q */ 1207 TCP_STAT(tcps, tcp_eager_blowoff_q); 1208 eager = listener->tcp_eager_next_q; 1209 while (eager != NULL) { 1210 if (!eager->tcp_closemp_used) { 1211 eager->tcp_closemp_used = B_TRUE; 1212 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 1213 CONN_INC_REF(eager->tcp_connp); 1214 mp = &eager->tcp_closemp; 1215 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, 1216 tcp_eager_kill, eager->tcp_connp, NULL, 1217 SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); 1218 } 1219 eager = eager->tcp_eager_next_q; 1220 } 1221 } 1222 /* Then cleanup q0 */ 1223 TCP_STAT(tcps, tcp_eager_blowoff_q0); 1224 eager = listener->tcp_eager_next_q0; 1225 while (eager != listener) { 1226 if (!eager->tcp_closemp_used) { 1227 eager->tcp_closemp_used = B_TRUE; 1228 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 1229 CONN_INC_REF(eager->tcp_connp); 1230 mp = &eager->tcp_closemp; 1231 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, 1232 tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, 1233 SQTAG_TCP_EAGER_CLEANUP_Q0); 1234 } 1235 eager = eager->tcp_eager_next_q0; 1236 } 1237 } 1238 1239 /* 1240 * If we are an eager connection hanging off a listener that hasn't 1241 * formally accepted the connection yet, get off its list and blow off 1242 * any data that we have accumulated. 
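 *
 * (An eager sits on one of two listener lists at this point:
 * tcp_eager_next_q0, the doubly linked circular list of connections
 * still completing the 3-way handshake, or tcp_eager_next_q, the singly
 * linked list of connections waiting to be accepted. The two branches
 * below handle those cases respectively.)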
1243 */ 1244 void 1245 tcp_eager_unlink(tcp_t *tcp) 1246 { 1247 tcp_t *listener = tcp->tcp_listener; 1248 1249 ASSERT(listener != NULL); 1250 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 1251 if (tcp->tcp_eager_next_q0 != NULL) { 1252 ASSERT(tcp->tcp_eager_prev_q0 != NULL); 1253 1254 /* Remove the eager tcp from q0 */ 1255 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 1256 tcp->tcp_eager_prev_q0; 1257 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 1258 tcp->tcp_eager_next_q0; 1259 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1260 listener->tcp_conn_req_cnt_q0--; 1261 1262 tcp->tcp_eager_next_q0 = NULL; 1263 tcp->tcp_eager_prev_q0 = NULL; 1264 1265 /* 1266 * Take the eager out, if it is in the list of droppable 1267 * eagers. 1268 */ 1269 MAKE_UNDROPPABLE(tcp); 1270 1271 if (tcp->tcp_syn_rcvd_timeout != 0) { 1272 /* we have timed out before */ 1273 ASSERT(listener->tcp_syn_rcvd_timeout > 0); 1274 listener->tcp_syn_rcvd_timeout--; 1275 } 1276 } else { 1277 tcp_t **tcpp = &listener->tcp_eager_next_q; 1278 tcp_t *prev = NULL; 1279 1280 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 1281 if (tcpp[0] == tcp) { 1282 if (listener->tcp_eager_last_q == tcp) { 1283 /* 1284 * If we are unlinking the last 1285 * element on the list, adjust 1286 * tail pointer. Set tail pointer 1287 * to nil when list is empty. 1288 */ 1289 ASSERT(tcp->tcp_eager_next_q == NULL); 1290 if (listener->tcp_eager_last_q == 1291 listener->tcp_eager_next_q) { 1292 listener->tcp_eager_last_q = 1293 NULL; 1294 } else { 1295 /* 1296 * We won't get here if there 1297 * is only one eager in the 1298 * list. 1299 */ 1300 ASSERT(prev != NULL); 1301 listener->tcp_eager_last_q = 1302 prev; 1303 } 1304 } 1305 tcpp[0] = tcp->tcp_eager_next_q; 1306 tcp->tcp_eager_next_q = NULL; 1307 tcp->tcp_eager_last_q = NULL; 1308 ASSERT(listener->tcp_conn_req_cnt_q > 0); 1309 listener->tcp_conn_req_cnt_q--; 1310 break; 1311 } 1312 prev = tcpp[0]; 1313 } 1314 } 1315 tcp->tcp_listener = NULL; 1316 } 1317 1318 /* BEGIN CSTYLED */ 1319 /* 1320 * 1321 * The sockfs ACCEPT path: 1322 * ======================= 1323 * 1324 * The eager is now established in its own perimeter as soon as SYN is 1325 * received in tcp_input_listener(). When sockfs receives conn_ind, it 1326 * completes the accept processing on the acceptor STREAM. The sending 1327 * of conn_ind part is common for both sockfs listener and a TLI/XTI 1328 * listener but a TLI/XTI listener completes the accept processing 1329 * on the listener perimeter. 1330 * 1331 * Common control flow for 3 way handshake: 1332 * ---------------------------------------- 1333 * 1334 * incoming SYN (listener perimeter) -> tcp_input_listener() 1335 * 1336 * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() 1337 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() 1338 * 1339 * Sockfs ACCEPT Path: 1340 * ------------------- 1341 * 1342 * open acceptor stream (tcp_open allocates tcp_tli_accept() 1343 * as STREAM entry point) 1344 * 1345 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() 1346 * 1347 * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager 1348 * association (we are not behind eager's squeue but sockfs is protecting us 1349 * and no one knows about this stream yet. The STREAMS entry point q->q_info 1350 * is changed to point at tcp_wput(). 1351 * 1352 * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to 1353 * listener (done on listener's perimeter). 
 *
 * tcp_tli_accept() calls tcp_accept_finish() on the eager's perimeter to
 * finish the accept.
 *
 * TLI/XTI client ACCEPT path:
 * ---------------------------
 *
 * soaccept() sends T_CONN_RES on the listener STREAM.
 *
 * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send
 * an M_SETOPS mblk to the eager's perimeter to finish the accept
 * (tcp_accept_finish()).
 *
 * Locks:
 * ======
 *
 * listener->tcp_eager_lock protects the listener's tcp_eager_next_q0
 * and tcp_eager_next_q lists.
 *
 * Referencing:
 * ============
 *
 * 1) We start out in tcp_input_listener by the eager placing a ref on the
 * listener and the listener adding the eager to listener->tcp_eager_next_q0.
 *
 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to the listener. Before
 * doing so we place a ref on the eager. This ref is finally dropped at the
 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
 * reference is dropped by the squeue framework.
 *
 * 3) The ref on the listener placed in 1) above is dropped in
 * tcp_accept_finish().
 *
 * The reference must be released by the same entity that added the reference.
 * In the above scheme, the eager is the entity that adds and releases the
 * references. Note that tcp_accept_finish executes in the squeue of the eager
 * (albeit after it is attached to the acceptor stream). Though 1) executes
 * in the listener's squeue, the eager is nascent at this point and the
 * reference can be considered to have been added on behalf of the eager.
 *
 * Eager getting a Reset or listener closing:
 * ==========================================
 *
 * Once the listener and eager are linked, the listener never does the unlink.
 * If the listener needs to close, tcp_eager_cleanup() is called which queues
 * a message on every eager's perimeter. The eager then does the unlink, clears
 * any pointers to the listener's queue and drops the reference to the
 * listener. The listener waits in tcp_close outside the squeue until its
 * refcount has dropped to 1. This ensures that the listener has waited for
 * all eagers to clear their association with the listener.
 *
 * Similarly, if the eager decides to go away, it can unlink itself and close.
 * When the T_CONN_RES comes down, we check if the eager has closed. Note that
 * the reference to the eager is still valid because of the extra ref we put
 * in tcp_send_conn_ind.
 *
 * The listener can always locate the eager under the protection
 * of the listener->tcp_eager_lock, and then do a refhold
 * on the eager during the accept processing.
 *
 * The acceptor stream accesses the eager in the accept processing
 * based on the ref placed on the eager before sending T_conn_ind.
 * The only entity that can negate this refhold is a listener close,
 * which is mutually exclusive with an active acceptor stream.
 *
 * Eager's reference on the listener
 * =================================
 *
 * If the accept happens (even on a closed eager) the eager drops its
 * reference on the listener at the start of tcp_accept_finish. If the
 * eager is killed due to an incoming RST before the T_conn_ind is sent up,
 * the reference is dropped in tcp_closei_local. If the listener closes,
 * the reference is dropped in tcp_eager_kill.
In all cases the reference 1425 * is dropped while executing in the eager's context (squeue). 1426 */ 1427 /* END CSTYLED */ 1428 1429 /* Process the SYN packet, mp, directed at the listener 'tcp' */ 1430 1431 /* 1432 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. 1433 * tcp_input_data will not see any packets for listeners since the listener 1434 * has conn_recv set to tcp_input_listener. 1435 */ 1436 /* ARGSUSED */ 1437 static void 1438 tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1439 { 1440 tcpha_t *tcpha; 1441 uint32_t seg_seq; 1442 tcp_t *eager; 1443 int err; 1444 conn_t *econnp = NULL; 1445 squeue_t *new_sqp; 1446 mblk_t *mp1; 1447 uint_t ip_hdr_len; 1448 conn_t *lconnp = (conn_t *)arg; 1449 tcp_t *listener = lconnp->conn_tcp; 1450 tcp_stack_t *tcps = listener->tcp_tcps; 1451 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 1452 uint_t flags; 1453 mblk_t *tpi_mp; 1454 uint_t ifindex = ira->ira_ruifindex; 1455 boolean_t tlc_set = B_FALSE; 1456 1457 ip_hdr_len = ira->ira_ip_hdr_length; 1458 tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; 1459 flags = (unsigned int)tcpha->tha_flags & 0xFF; 1460 1461 DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, lconnp->conn_ixa, 1462 __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, listener, 1463 __dtrace_tcp_tcph_t *, tcpha); 1464 1465 if (!(flags & TH_SYN)) { 1466 if ((flags & TH_RST) || (flags & TH_URG)) { 1467 freemsg(mp); 1468 return; 1469 } 1470 if (flags & TH_ACK) { 1471 /* Note this executes in listener's squeue */ 1472 tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); 1473 return; 1474 } 1475 1476 freemsg(mp); 1477 return; 1478 } 1479 1480 if (listener->tcp_state != TCPS_LISTEN) 1481 goto error2; 1482 1483 ASSERT(IPCL_IS_BOUND(lconnp)); 1484 1485 mutex_enter(&listener->tcp_eager_lock); 1486 1487 /* 1488 * The system is under memory pressure, so we need to do our part 1489 * to relieve the pressure. So we only accept new request if there 1490 * is nothing waiting to be accepted or waiting to complete the 3-way 1491 * handshake. This means that busy listener will not get too many 1492 * new requests which they cannot handle in time while non-busy 1493 * listener is still functioning properly. 1494 */ 1495 if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 || 1496 listener->tcp_conn_req_cnt_q0 > 0)) { 1497 mutex_exit(&listener->tcp_eager_lock); 1498 TCP_STAT(tcps, tcp_listen_mem_drop); 1499 goto error2; 1500 } 1501 1502 if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { 1503 mutex_exit(&listener->tcp_eager_lock); 1504 TCP_STAT(tcps, tcp_listendrop); 1505 TCPS_BUMP_MIB(tcps, tcpListenDrop); 1506 if (lconnp->conn_debug) { 1507 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 1508 "tcp_input_listener: listen backlog (max=%d) " 1509 "overflow (%d pending) on %s", 1510 listener->tcp_conn_req_max, 1511 listener->tcp_conn_req_cnt_q, 1512 tcp_display(listener, NULL, DISP_PORT_ONLY)); 1513 } 1514 goto error2; 1515 } 1516 1517 if (listener->tcp_conn_req_cnt_q0 >= 1518 listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { 1519 /* 1520 * Q0 is full. Drop a pending half-open req from the queue 1521 * to make room for the new SYN req. Also mark the time we 1522 * drop a SYN. 1523 * 1524 * A more aggressive defense against SYN attack will 1525 * be to set the "tcp_syn_defense" flag now. 
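 *
 * (As noted above tcp_drop_q0(), the tcp_syn_defense heuristic is
 * normally driven by the count of timed-out half-open connections:
 * it turns on once that count exceeds a quarter of the q0 length and
 * turns off again when it drops back to a thirty-second of it.)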
1526 */ 1527 TCP_STAT(tcps, tcp_listendropq0); 1528 listener->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 1529 if (!tcp_drop_q0(listener)) { 1530 mutex_exit(&listener->tcp_eager_lock); 1531 TCPS_BUMP_MIB(tcps, tcpListenDropQ0); 1532 if (lconnp->conn_debug) { 1533 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 1534 "tcp_input_listener: listen half-open " 1535 "queue (max=%d) full (%d pending) on %s", 1536 tcps->tcps_conn_req_max_q0, 1537 listener->tcp_conn_req_cnt_q0, 1538 tcp_display(listener, NULL, 1539 DISP_PORT_ONLY)); 1540 } 1541 goto error2; 1542 } 1543 } 1544 1545 /* 1546 * Enforce the limit set on the number of connections per listener. 1547 * Note that tlc_cnt starts with 1. So need to add 1 to tlc_max 1548 * for comparison. 1549 */ 1550 if (listener->tcp_listen_cnt != NULL) { 1551 tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt; 1552 int64_t now; 1553 1554 if (atomic_inc_32_nv(&tlc->tlc_cnt) > tlc->tlc_max + 1) { 1555 mutex_exit(&listener->tcp_eager_lock); 1556 now = ddi_get_lbolt64(); 1557 atomic_dec_32(&tlc->tlc_cnt); 1558 TCP_STAT(tcps, tcp_listen_cnt_drop); 1559 tlc->tlc_drop++; 1560 if (now - tlc->tlc_report_time > 1561 MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) { 1562 zcmn_err(lconnp->conn_zoneid, CE_WARN, 1563 "Listener (port %d) connection max (%u) " 1564 "reached: %u attempts dropped total\n", 1565 ntohs(listener->tcp_connp->conn_lport), 1566 tlc->tlc_max, tlc->tlc_drop); 1567 tlc->tlc_report_time = now; 1568 } 1569 goto error2; 1570 } 1571 tlc_set = B_TRUE; 1572 } 1573 1574 mutex_exit(&listener->tcp_eager_lock); 1575 1576 /* 1577 * IP sets ira_sqp to either the senders conn_sqp (for loopback) 1578 * or based on the ring (for packets from GLD). Otherwise it is 1579 * set based on lbolt i.e., a somewhat random number. 1580 */ 1581 ASSERT(ira->ira_sqp != NULL); 1582 new_sqp = ira->ira_sqp; 1583 1584 econnp = tcp_get_conn(arg2, tcps); 1585 if (econnp == NULL) 1586 goto error2; 1587 1588 ASSERT(econnp->conn_netstack == lconnp->conn_netstack); 1589 econnp->conn_sqp = new_sqp; 1590 econnp->conn_initial_sqp = new_sqp; 1591 econnp->conn_ixa->ixa_sqp = new_sqp; 1592 1593 econnp->conn_fport = tcpha->tha_lport; 1594 econnp->conn_lport = tcpha->tha_fport; 1595 1596 err = conn_inherit_parent(lconnp, econnp); 1597 if (err != 0) 1598 goto error3; 1599 1600 /* We already know the laddr of the new connection is ours */ 1601 econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation; 1602 1603 ASSERT(OK_32PTR(mp->b_rptr)); 1604 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || 1605 IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); 1606 1607 if (lconnp->conn_family == AF_INET) { 1608 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); 1609 tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); 1610 } else { 1611 tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); 1612 } 1613 1614 if (tpi_mp == NULL) 1615 goto error3; 1616 1617 eager = econnp->conn_tcp; 1618 eager->tcp_detached = B_TRUE; 1619 SOCK_CONNID_INIT(eager->tcp_connid); 1620 1621 /* 1622 * Initialize the eager's tcp_t and inherit some parameters from 1623 * the listener. 
1624 */ 1625 tcp_init_values(eager, listener); 1626 1627 ASSERT((econnp->conn_ixa->ixa_flags & 1628 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | 1629 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == 1630 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | 1631 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); 1632 1633 if (!tcps->tcps_dev_flow_ctl) 1634 econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; 1635 1636 /* Prepare for diffing against previous packets */ 1637 eager->tcp_recvifindex = 0; 1638 eager->tcp_recvhops = 0xffffffffU; 1639 1640 if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { 1641 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || 1642 IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { 1643 econnp->conn_incoming_ifindex = ifindex; 1644 econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 1645 econnp->conn_ixa->ixa_scopeid = ifindex; 1646 } 1647 } 1648 1649 if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == 1650 (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && 1651 tcps->tcps_rev_src_routes) { 1652 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1653 ip_pkt_t *ipp = &econnp->conn_xmit_ipp; 1654 1655 /* Source routing option copyover (reverse it) */ 1656 err = ip_find_hdr_v4(ipha, ipp, B_TRUE); 1657 if (err != 0) { 1658 freemsg(tpi_mp); 1659 goto error3; 1660 } 1661 ip_pkt_source_route_reverse_v4(ipp); 1662 } 1663 1664 ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); 1665 ASSERT(!eager->tcp_tconnind_started); 1666 /* 1667 * If the SYN came with a credential, it's a loopback packet or a 1668 * labeled packet; attach the credential to the TPI message. 1669 */ 1670 if (ira->ira_cred != NULL) 1671 mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); 1672 1673 eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; 1674 ASSERT(eager->tcp_ordrel_mp == NULL); 1675 1676 /* Inherit the listener's non-STREAMS flag */ 1677 if (IPCL_IS_NONSTR(lconnp)) { 1678 econnp->conn_flags |= IPCL_NONSTR; 1679 /* All non-STREAMS tcp_ts are sockets */ 1680 eager->tcp_issocket = B_TRUE; 1681 } else { 1682 /* 1683 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that 1684 * at close time, we will always have that to send up. 1685 * Otherwise, we need to do special handling in case the 1686 * allocation fails at that time. 1687 */ 1688 if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) 1689 goto error3; 1690 } 1691 /* 1692 * Now that the IP addresses and ports are setup in econnp we 1693 * can do the IPsec policy work. 1694 */ 1695 if (ira->ira_flags & IRAF_IPSEC_SECURE) { 1696 if (lconnp->conn_policy != NULL) { 1697 /* 1698 * Inherit the policy from the listener; use 1699 * actions from ira 1700 */ 1701 if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { 1702 CONN_DEC_REF(econnp); 1703 freemsg(mp); 1704 goto error3; 1705 } 1706 } 1707 } 1708 1709 /* 1710 * tcp_set_destination() may set tcp_rwnd according to the route 1711 * metrics. If it does not, the eager's receive window will be set 1712 * to the listener's receive window later in this function. 
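 *
 * As an illustration (made-up values, and assuming MSS_ROUNDUP rounds up
 * to the next multiple of the MSS): with no route metric, a listener
 * conn_rcvbuf of 131072 bytes and an eager tcp_mss of 1460, the window
 * computed further below is MSS_ROUNDUP(131072, 1460) = 131400 bytes,
 * from which the window scale value is then derived when window scaling
 * was negotiated.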
1713 */ 1714 eager->tcp_rwnd = 0; 1715 1716 if (is_system_labeled()) { 1717 ip_xmit_attr_t *ixa = econnp->conn_ixa; 1718 1719 ASSERT(ira->ira_tsl != NULL); 1720 /* Discard any old label */ 1721 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 1722 ASSERT(ixa->ixa_tsl != NULL); 1723 label_rele(ixa->ixa_tsl); 1724 ixa->ixa_free_flags &= ~IXA_FREE_TSL; 1725 ixa->ixa_tsl = NULL; 1726 } 1727 if ((lconnp->conn_mlp_type != mlptSingle || 1728 lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && 1729 ira->ira_tsl != NULL) { 1730 /* 1731 * If this is an MLP connection or a MAC-Exempt 1732 * connection with an unlabeled node, packets are to be 1733 * exchanged using the security label of the received 1734 * SYN packet instead of the server application's label. 1735 * tsol_check_dest called from ip_set_destination 1736 * might later update TSF_UNLABELED by replacing 1737 * ixa_tsl with a new label. 1738 */ 1739 label_hold(ira->ira_tsl); 1740 ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); 1741 DTRACE_PROBE2(mlp_syn_accept, conn_t *, 1742 econnp, ts_label_t *, ixa->ixa_tsl) 1743 } else { 1744 ixa->ixa_tsl = crgetlabel(econnp->conn_cred); 1745 DTRACE_PROBE2(syn_accept, conn_t *, 1746 econnp, ts_label_t *, ixa->ixa_tsl) 1747 } 1748 /* 1749 * conn_connect() called from tcp_set_destination will verify 1750 * the destination is allowed to receive packets at the 1751 * security label of the SYN-ACK we are generating. As part of 1752 * that, tsol_check_dest() may create a new effective label for 1753 * this connection. 1754 * Finally conn_connect() will call conn_update_label. 1755 * All that remains for TCP to do is to call 1756 * conn_build_hdr_template which is done as part of 1757 * tcp_set_destination. 1758 */ 1759 } 1760 1761 /* 1762 * Since we will clear tcp_listener before we clear tcp_detached 1763 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress 1764 * so we can tell a TCP_IS_DETACHED_NONEAGER apart. 1765 */ 1766 eager->tcp_hard_binding = B_TRUE; 1767 1768 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ 1769 TCP_BIND_HASH(econnp->conn_lport)], eager, 0); 1770 1771 CL_INET_CONNECT(econnp, B_FALSE, err); 1772 if (err != 0) { 1773 tcp_bind_hash_remove(eager); 1774 goto error3; 1775 } 1776 1777 SOCK_CONNID_BUMP(eager->tcp_connid); 1778 1779 /* 1780 * Adapt our mss, ttl, ... based on the remote address. 1781 */ 1782 1783 if (tcp_set_destination(eager) != 0) { 1784 TCPS_BUMP_MIB(tcps, tcpAttemptFails); 1785 /* Undo the bind_hash_insert */ 1786 tcp_bind_hash_remove(eager); 1787 goto error3; 1788 } 1789 1790 /* Process all TCP options. */ 1791 if (!tcp_process_options(mp, eager, tcpha, ira, B_TRUE)) { 1792 tcp_bind_hash_remove(eager); 1793 goto error3; 1794 } 1795 1796 /* Is the other end ECN capable? */ 1797 if (tcps->tcps_ecn_permitted >= 1 && 1798 (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 1799 eager->tcp_ecn_ok = B_TRUE; 1800 } 1801 1802 /* 1803 * The listener's conn_rcvbuf should be the default window size or a 1804 * window size changed via SO_RCVBUF option. First round up the 1805 * eager's tcp_rwnd to the nearest MSS. Then find out the window 1806 * scale option value if needed. Call tcp_rwnd_set() to finish the 1807 * setting. 1808 * 1809 * Note if there is a rpipe metric associated with the remote host, 1810 * we should not inherit receive window size from listener. 1811 */ 1812 eager->tcp_rwnd = MSS_ROUNDUP( 1813 (eager->tcp_rwnd == 0 ? 
econnp->conn_rcvbuf : 1814 eager->tcp_rwnd), eager->tcp_mss); 1815 if (eager->tcp_snd_ws_ok) 1816 tcp_set_ws_value(eager); 1817 /* 1818 * Note that this is the only place tcp_rwnd_set() is called for 1819 * accepting a connection. We need to call it here instead of 1820 * after the 3-way handshake because we need to tell the other 1821 * side our rwnd in the SYN-ACK segment. 1822 */ 1823 (void) tcp_rwnd_set(eager, eager->tcp_rwnd); 1824 1825 ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && 1826 eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); 1827 1828 ASSERT(econnp->conn_rcvbuf != 0 && 1829 econnp->conn_rcvbuf == eager->tcp_rwnd); 1830 1831 /* Put a ref on the listener for the eager. */ 1832 CONN_INC_REF(lconnp); 1833 mutex_enter(&listener->tcp_eager_lock); 1834 listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1835 eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; 1836 listener->tcp_eager_next_q0 = eager; 1837 eager->tcp_eager_prev_q0 = listener; 1838 1839 /* Set tcp_listener before adding it to tcp_conn_fanout */ 1840 eager->tcp_listener = listener; 1841 eager->tcp_saved_listener = listener; 1842 1843 /* 1844 * Set tcp_listen_cnt so that when the connection is done, the counter 1845 * is decremented. 1846 */ 1847 eager->tcp_listen_cnt = listener->tcp_listen_cnt; 1848 1849 /* 1850 * Tag this detached tcp vector for later retrieval 1851 * by our listener client in tcp_accept(). 1852 */ 1853 eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; 1854 listener->tcp_conn_req_cnt_q0++; 1855 if (++listener->tcp_conn_req_seqnum == -1) { 1856 /* 1857 * -1 is "special" and defined in TPI as something 1858 * that should never be used in T_CONN_IND 1859 */ 1860 ++listener->tcp_conn_req_seqnum; 1861 } 1862 mutex_exit(&listener->tcp_eager_lock); 1863 1864 if (listener->tcp_syn_defense) { 1865 /* Don't drop the SYN that comes from a good IP source */ 1866 ipaddr_t *addr_cache; 1867 1868 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1869 if (addr_cache != NULL && econnp->conn_faddr_v4 == 1870 addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { 1871 eager->tcp_dontdrop = B_TRUE; 1872 } 1873 } 1874 1875 /* 1876 * We need to insert the eager in its own perimeter but as soon 1877 * as we do that, we expose the eager to the classifier and 1878 * should not touch any field outside the eager's perimeter. 1879 * So do all the work necessary before inserting the eager 1880 * in its own perimeter. Be optimistic that conn_connect() 1881 * will succeed but undo everything if it fails. 1882 */ 1883 seg_seq = ntohl(tcpha->tha_seq); 1884 eager->tcp_irs = seg_seq; 1885 eager->tcp_rack = seg_seq; 1886 eager->tcp_rnxt = seg_seq + 1; 1887 eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); 1888 TCPS_BUMP_MIB(tcps, tcpPassiveOpens); 1889 eager->tcp_state = TCPS_SYN_RCVD; 1890 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 1891 econnp->conn_ixa, void, NULL, tcp_t *, eager, void, NULL, 1892 int32_t, TCPS_LISTEN); 1893 1894 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, 1895 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); 1896 if (mp1 == NULL) { 1897 /* 1898 * Increment the ref count as we are going to 1899 * enqueueing an mp in squeue 1900 */ 1901 CONN_INC_REF(econnp); 1902 goto error; 1903 } 1904 1905 /* 1906 * We need to start the rto timer. In normal case, we start 1907 * the timer after sending the packet on the wire (or at 1908 * least believing that packet was sent by waiting for 1909 * conn_ip_output() to return). 
Since this is the first packet
1910 * being sent on the wire for the eager, our initial tcp_rto
1911 * is at least tcp_rexmit_interval_min which is a fairly
1912 * large value to allow the algorithm to adjust slowly to large
1913 * fluctuations of RTT during the first few transmissions.
1914 *
1915 * Starting the timer first and then sending the packet in this
1916 * case shouldn't make much difference since tcp_rexmit_interval_min
1917 * is on the order of several hundred milliseconds and starting the timer
1918 * first and then sending the packet will result in a difference
1919 * of only a few microseconds.
1920 *
1921 * Without this optimization, we are forced to hold the fanout
1922 * lock across the ipcl_bind_insert() and sending the packet
1923 * so that we don't race against an incoming packet (maybe RST)
1924 * for this eager.
1925 *
1926 * It is necessary to acquire an extra reference on the eager
1927 * at this point and hold it until after tcp_send_data() to
1928 * ensure against an eager close race.
1929 */
1930
1931 CONN_INC_REF(econnp);
1932
1933 TCP_TIMER_RESTART(eager, eager->tcp_rto);
1934
1935 /*
1936 * Insert the eager in its own perimeter now. We are ready to deal
1937 * with any packets on eager.
1938 */
1939 if (ipcl_conn_insert(econnp) != 0)
1940 goto error;
1941
1942 ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp);
1943 freemsg(mp);
1944 /*
1945 * Send the SYN-ACK. Use the right squeue so that conn_ixa is
1946 * only used by one thread at a time.
1947 */
1948 if (econnp->conn_sqp == lconnp->conn_sqp) {
1949 DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *,
1950 econnp->conn_ixa, __dtrace_tcp_void_ip_t *, mp1->b_rptr,
1951 tcp_t *, eager, __dtrace_tcp_tcph_t *,
1952 &mp1->b_rptr[econnp->conn_ixa->ixa_ip_hdr_length]);
1953 (void) conn_ip_output(mp1, econnp->conn_ixa);
1954 CONN_DEC_REF(econnp);
1955 } else {
1956 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack,
1957 econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK);
1958 }
1959 return;
1960 error:
1961 freemsg(mp1);
1962 eager->tcp_closemp_used = B_TRUE;
1963 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
1964 mp1 = &eager->tcp_closemp;
1965 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
1966 econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
1967
1968 /*
1969 * If a connection already exists, send the mp to that connection so
1970 * that it can be appropriately dealt with.
1971 */
1972 ipst = tcps->tcps_netstack->netstack_ip;
1973
1974 if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
1975 if (!IPCL_IS_CONNECTED(econnp)) {
1976 /*
1977 * Something bad happened. ipcl_conn_insert()
1978 * failed because a connection already existed
1979 * in the connected hash but we can't find it
1980 * anymore (someone blew it away). Just
1981 * free this message and hopefully the remote
1982 * will retransmit at which time the SYN can be
1983 * treated as a new connection or dealt with
1984 * via a TH_RST if a connection already exists.
1985 */
1986 CONN_DEC_REF(econnp);
1987 freemsg(mp);
1988 } else {
1989 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
1990 econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
1991 }
1992 } else {
1993 /* Nobody wants this packet */
1994 freemsg(mp);
1995 }
1996 return;
1997 error3:
1998 CONN_DEC_REF(econnp);
1999 error2:
2000 freemsg(mp);
2001 if (tlc_set)
2002 atomic_dec_32(&listener->tcp_listen_cnt->tlc_cnt);
2003 }
2004
2005 /*
2006 * In an ideal case of vertical partition in a NUMA architecture, it is
2007 * beneficial to have the listener and all the incoming connections
2008 * tied to the same squeue.
The other constraint is that incoming
2009 * connections should be tied to the squeue attached to the interrupted
2010 * CPU for obvious locality reasons, so this leaves the listener to
2011 * be tied to the same squeue. Our only problem is that when the listener
2012 * is binding, the CPU that will get interrupted by the NIC whose
2013 * IP address the listener is binding to is not even known. So
2014 * the code below allows us to change that binding at the time the
2015 * CPU is interrupted by virtue of the incoming connection's squeue.
2016 *
2017 * This is useful only in the case of a listener bound to a specific IP
2018 * address. Other kinds of listeners get bound the
2019 * very first time and there is no attempt to rebind them.
2020 */
2021 void
2022 tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
2023 ip_recv_attr_t *ira)
2024 {
2025 conn_t *connp = (conn_t *)arg;
2026 squeue_t *sqp = (squeue_t *)arg2;
2027 squeue_t *new_sqp;
2028 uint32_t conn_flags;
2029
2030 /*
2031 * IP sets ira_sqp to either the sender's conn_sqp (for loopback)
2032 * or based on the ring (for packets from GLD). Otherwise it is
2033 * set based on lbolt, i.e., a somewhat random number.
2034 */
2035 ASSERT(ira->ira_sqp != NULL);
2036 new_sqp = ira->ira_sqp;
2037
2038 if (connp->conn_fanout == NULL)
2039 goto done;
2040
2041 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
2042 mutex_enter(&connp->conn_fanout->connf_lock);
2043 mutex_enter(&connp->conn_lock);
2044 /*
2045 * No one from the read or write side can access us now
2046 * except for already queued packets on this squeue.
2047 * But since we haven't changed the squeue yet, they
2048 * can't execute. If they are processed after we have
2049 * changed the squeue, they are sent back to the
2050 * correct squeue down below.
2051 * But a listener close can race with the processing of an
2052 * incoming SYN. If incoming SYN processing changes
2053 * the squeue then the listener close which is waiting
2054 * to enter the squeue would operate on the wrong
2055 * squeue. Hence we don't change the squeue here unless
2056 * the refcount is exactly the minimum refcount. The
2057 * minimum refcount of 4 is counted as - 1 each for
2058 * TCP and IP, 1 for being in the classifier hash, and
2059 * 1 for the mblk being processed.
2060 */
2061
2062 if (connp->conn_ref != 4 ||
2063 connp->conn_tcp->tcp_state != TCPS_LISTEN) {
2064 mutex_exit(&connp->conn_lock);
2065 mutex_exit(&connp->conn_fanout->connf_lock);
2066 goto done;
2067 }
2068 if (connp->conn_sqp != new_sqp) {
2069 while (connp->conn_sqp != new_sqp)
2070 (void) atomic_cas_ptr(&connp->conn_sqp, sqp,
2071 new_sqp);
2072 /* No special MT issues for outbound ixa_sqp hint */
2073 connp->conn_ixa->ixa_sqp = new_sqp;
2074 }
2075
2076 do {
2077 conn_flags = connp->conn_flags;
2078 conn_flags |= IPCL_FULLY_BOUND;
2079 (void) atomic_cas_32(&connp->conn_flags,
2080 connp->conn_flags, conn_flags);
2081 } while (!(connp->conn_flags & IPCL_FULLY_BOUND));
2082
2083 mutex_exit(&connp->conn_fanout->connf_lock);
2084 mutex_exit(&connp->conn_lock);
2085
2086 /*
2087 * Assume we have picked a good squeue for the listener. Make
2088 * subsequent SYNs not try to change the squeue.
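 * Pointing conn_recv at tcp_input_listener below means that later SYNs
 * bypass this unbound variant entirely.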
2089 */ 2090 connp->conn_recv = tcp_input_listener; 2091 } 2092 2093 done: 2094 if (connp->conn_sqp != sqp) { 2095 CONN_INC_REF(connp); 2096 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, 2097 ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); 2098 } else { 2099 tcp_input_listener(connp, mp, sqp, ira); 2100 } 2101 } 2102 2103 /* 2104 * Send up all messages queued on tcp_rcv_list. 2105 */ 2106 uint_t 2107 tcp_rcv_drain(tcp_t *tcp) 2108 { 2109 mblk_t *mp; 2110 uint_t ret = 0; 2111 #ifdef DEBUG 2112 uint_t cnt = 0; 2113 #endif 2114 queue_t *q = tcp->tcp_connp->conn_rq; 2115 2116 /* Can't drain on an eager connection */ 2117 if (tcp->tcp_listener != NULL) 2118 return (ret); 2119 2120 /* Can't be a non-STREAMS connection */ 2121 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 2122 2123 /* No need for the push timer now. */ 2124 if (tcp->tcp_push_tid != 0) { 2125 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 2126 tcp->tcp_push_tid = 0; 2127 } 2128 2129 /* 2130 * Handle two cases here: we are currently fused or we were 2131 * previously fused and have some urgent data to be delivered 2132 * upstream. The latter happens because we either ran out of 2133 * memory or were detached and therefore sending the SIGURG was 2134 * deferred until this point. In either case we pass control 2135 * over to tcp_fuse_rcv_drain() since it may need to complete 2136 * some work. 2137 */ 2138 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { 2139 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : 2140 &tcp->tcp_fused_sigurg_mp)) 2141 return (ret); 2142 } 2143 2144 while ((mp = tcp->tcp_rcv_list) != NULL) { 2145 tcp->tcp_rcv_list = mp->b_next; 2146 mp->b_next = NULL; 2147 #ifdef DEBUG 2148 cnt += msgdsize(mp); 2149 #endif 2150 putnext(q, mp); 2151 } 2152 #ifdef DEBUG 2153 ASSERT(cnt == tcp->tcp_rcv_cnt); 2154 #endif 2155 tcp->tcp_rcv_last_head = NULL; 2156 tcp->tcp_rcv_last_tail = NULL; 2157 tcp->tcp_rcv_cnt = 0; 2158 2159 if (canputnext(q)) 2160 return (tcp_rwnd_reopen(tcp)); 2161 2162 return (ret); 2163 } 2164 2165 /* 2166 * Queue data on tcp_rcv_list which is a b_next chain. 2167 * tcp_rcv_last_head/tail is the last element of this chain. 2168 * Each element of the chain is a b_cont chain. 2169 * 2170 * M_DATA messages are added to the current element. 2171 * Other messages are added as new (b_next) elements. 2172 */ 2173 void 2174 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) 2175 { 2176 ASSERT(seg_len == msgdsize(mp)); 2177 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); 2178 2179 if (is_system_labeled()) { 2180 ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); 2181 /* 2182 * Provide for protocols above TCP such as RPC. NOPID leaves 2183 * db_cpid unchanged. 2184 * The cred could have already been set. 
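 * This is why the ASSERT above accepts either an explicit cr or a cred
 * already attached to the mblk.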
2185 */ 2186 if (cr != NULL) 2187 mblk_setcred(mp, cr, NOPID); 2188 } 2189 2190 if (tcp->tcp_rcv_list == NULL) { 2191 ASSERT(tcp->tcp_rcv_last_head == NULL); 2192 tcp->tcp_rcv_list = mp; 2193 tcp->tcp_rcv_last_head = mp; 2194 } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { 2195 tcp->tcp_rcv_last_tail->b_cont = mp; 2196 } else { 2197 tcp->tcp_rcv_last_head->b_next = mp; 2198 tcp->tcp_rcv_last_head = mp; 2199 } 2200 2201 while (mp->b_cont) 2202 mp = mp->b_cont; 2203 2204 tcp->tcp_rcv_last_tail = mp; 2205 tcp->tcp_rcv_cnt += seg_len; 2206 tcp->tcp_rwnd -= seg_len; 2207 } 2208 2209 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 2210 mblk_t * 2211 tcp_ack_mp(tcp_t *tcp) 2212 { 2213 uint32_t seq_no; 2214 tcp_stack_t *tcps = tcp->tcp_tcps; 2215 conn_t *connp = tcp->tcp_connp; 2216 2217 /* 2218 * There are a few cases to be considered while setting the sequence no. 2219 * Essentially, we can come here while processing an unacceptable pkt 2220 * in the TCPS_SYN_RCVD state, in which case we set the sequence number 2221 * to snxt (per RFC 793), note the swnd wouldn't have been set yet. 2222 * If we are here for a zero window probe, stick with suna. In all 2223 * other cases, we check if suna + swnd encompasses snxt and set 2224 * the sequence number to snxt, if so. If snxt falls outside the 2225 * window (the receiver probably shrunk its window), we will go with 2226 * suna + swnd, otherwise the sequence no will be unacceptable to the 2227 * receiver. 2228 */ 2229 if (tcp->tcp_zero_win_probe) { 2230 seq_no = tcp->tcp_suna; 2231 } else if (tcp->tcp_state == TCPS_SYN_RCVD) { 2232 ASSERT(tcp->tcp_swnd == 0); 2233 seq_no = tcp->tcp_snxt; 2234 } else { 2235 seq_no = SEQ_GT(tcp->tcp_snxt, 2236 (tcp->tcp_suna + tcp->tcp_swnd)) ? 2237 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; 2238 } 2239 2240 if (tcp->tcp_valid_bits || tcp->tcp_md5sig) { 2241 /* 2242 * For the complex cases where we have to send some 2243 * controls (FIN or SYN), or add an MD5 signature 2244 * option, let tcp_xmit_mp do it. 2245 */ 2246 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, 2247 NULL, B_FALSE)); 2248 } else { 2249 /* Generate a simple ACK */ 2250 int data_length; 2251 uchar_t *rptr; 2252 tcpha_t *tcpha; 2253 mblk_t *mp1; 2254 int32_t total_hdr_len; 2255 int32_t tcp_hdr_len; 2256 int32_t num_sack_blk = 0; 2257 int32_t sack_opt_len; 2258 ip_xmit_attr_t *ixa = connp->conn_ixa; 2259 2260 /* 2261 * Allocate space for TCP + IP headers 2262 * and link-level header 2263 */ 2264 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 2265 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 2266 tcp->tcp_num_sack_blk); 2267 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 2268 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 2269 total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; 2270 tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; 2271 } else { 2272 total_hdr_len = connp->conn_ht_iphc_len; 2273 tcp_hdr_len = connp->conn_ht_ulp_len; 2274 } 2275 mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); 2276 if (!mp1) 2277 return (NULL); 2278 2279 /* Update the latest receive window size in TCP header. 
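 * The advertised value is tcp_rwnd scaled down by the receive window
 * shift, tcp_rcv_ws.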
*/ 2280 tcp->tcp_tcpha->tha_win = 2281 htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 2282 /* copy in prototype TCP + IP header */ 2283 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 2284 mp1->b_rptr = rptr; 2285 mp1->b_wptr = rptr + total_hdr_len; 2286 bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); 2287 2288 tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; 2289 2290 /* Set the TCP sequence number. */ 2291 tcpha->tha_seq = htonl(seq_no); 2292 2293 /* Set up the TCP flag field. */ 2294 tcpha->tha_flags = (uchar_t)TH_ACK; 2295 if (tcp->tcp_ecn_echo_on) 2296 tcpha->tha_flags |= TH_ECE; 2297 2298 tcp->tcp_rack = tcp->tcp_rnxt; 2299 tcp->tcp_rack_cnt = 0; 2300 2301 /* fill in timestamp option if in use */ 2302 if (tcp->tcp_snd_ts_ok) { 2303 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 2304 2305 U32_TO_BE32(llbolt, 2306 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 2307 U32_TO_BE32(tcp->tcp_ts_recent, 2308 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 2309 } 2310 2311 /* Fill in SACK options */ 2312 if (num_sack_blk > 0) { 2313 uchar_t *wptr = (uchar_t *)tcpha + 2314 connp->conn_ht_ulp_len; 2315 sack_blk_t *tmp; 2316 int32_t i; 2317 2318 wptr[0] = TCPOPT_NOP; 2319 wptr[1] = TCPOPT_NOP; 2320 wptr[2] = TCPOPT_SACK; 2321 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 2322 sizeof (sack_blk_t); 2323 wptr += TCPOPT_REAL_SACK_LEN; 2324 2325 tmp = tcp->tcp_sack_list; 2326 for (i = 0; i < num_sack_blk; i++) { 2327 U32_TO_BE32(tmp[i].begin, wptr); 2328 wptr += sizeof (tcp_seq); 2329 U32_TO_BE32(tmp[i].end, wptr); 2330 wptr += sizeof (tcp_seq); 2331 } 2332 tcpha->tha_offset_and_reserved += 2333 ((num_sack_blk * 2 + 1) << 4); 2334 } 2335 2336 ixa->ixa_pktlen = total_hdr_len; 2337 2338 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2339 ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); 2340 } else { 2341 ip6_t *ip6 = (ip6_t *)rptr; 2342 2343 ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); 2344 } 2345 2346 /* 2347 * Prime pump for checksum calculation in IP. Include the 2348 * adjustment for a source route if any. 2349 */ 2350 data_length = tcp_hdr_len + connp->conn_sum; 2351 data_length = (data_length >> 16) + (data_length & 0xFFFF); 2352 tcpha->tha_sum = htons(data_length); 2353 2354 if (tcp->tcp_ip_forward_progress) { 2355 tcp->tcp_ip_forward_progress = B_FALSE; 2356 connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 2357 } else { 2358 connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 2359 } 2360 return (mp1); 2361 } 2362 } 2363 2364 /* 2365 * Dummy socket upcalls for if/when the conn_t gets detached from a 2366 * direct-callback sonode via a user-driven close(). Easy to catch with 2367 * DTrace FBT, and should be mostly harmless. 2368 */ 2369 2370 /* ARGSUSED */ 2371 static sock_upper_handle_t 2372 tcp_dummy_newconn(sock_upper_handle_t x, sock_lower_handle_t y, 2373 sock_downcalls_t *z, cred_t *cr, pid_t pid, sock_upcalls_t **ignored) 2374 { 2375 ASSERT(0); /* Panic in debug, otherwise ignore. */ 2376 return (NULL); 2377 } 2378 2379 /* ARGSUSED */ 2380 static void 2381 tcp_dummy_connected(sock_upper_handle_t x, sock_connid_t y, cred_t *cr, 2382 pid_t pid) 2383 { 2384 ASSERT(x == NULL); 2385 /* Normally we'd crhold(cr) and attach it to socket state. */ 2386 /* LINTED */ 2387 } 2388 2389 /* ARGSUSED */ 2390 static int 2391 tcp_dummy_disconnected(sock_upper_handle_t x, sock_connid_t y, int blah) 2392 { 2393 ASSERT(0); /* Panic in debug, otherwise ignore. 
*/ 2394 return (-1); 2395 } 2396 2397 /* ARGSUSED */ 2398 static void 2399 tcp_dummy_opctl(sock_upper_handle_t x, sock_opctl_action_t y, uintptr_t blah) 2400 { 2401 ASSERT(x == NULL); 2402 /* We really want this one to be a harmless NOP for now. */ 2403 /* LINTED */ 2404 } 2405 2406 /* ARGSUSED */ 2407 static ssize_t 2408 tcp_dummy_recv(sock_upper_handle_t x, mblk_t *mp, size_t len, int flags, 2409 int *error, boolean_t *push) 2410 { 2411 ASSERT(x == NULL); 2412 2413 /* 2414 * Consume the message, set ESHUTDOWN, and return an error. 2415 * Nobody's home! 2416 */ 2417 freemsg(mp); 2418 *error = ESHUTDOWN; 2419 return (-1); 2420 } 2421 2422 /* ARGSUSED */ 2423 static void 2424 tcp_dummy_set_proto_props(sock_upper_handle_t x, struct sock_proto_props *y) 2425 { 2426 ASSERT(0); /* Panic in debug, otherwise ignore. */ 2427 } 2428 2429 /* ARGSUSED */ 2430 static void 2431 tcp_dummy_txq_full(sock_upper_handle_t x, boolean_t y) 2432 { 2433 ASSERT(0); /* Panic in debug, otherwise ignore. */ 2434 } 2435 2436 /* ARGSUSED */ 2437 static void 2438 tcp_dummy_signal_oob(sock_upper_handle_t x, ssize_t len) 2439 { 2440 ASSERT(x == NULL); 2441 /* Otherwise, this would signal socket state about OOB data. */ 2442 } 2443 2444 /* ARGSUSED */ 2445 static void 2446 tcp_dummy_set_error(sock_upper_handle_t x, int err) 2447 { 2448 ASSERT(0); /* Panic in debug, otherwise ignore. */ 2449 } 2450 2451 /* ARGSUSED */ 2452 static void 2453 tcp_dummy_onearg(sock_upper_handle_t x) 2454 { 2455 ASSERT(0); /* Panic in debug, otherwise ignore. */ 2456 } 2457 2458 static sock_upcalls_t tcp_dummy_upcalls = { 2459 tcp_dummy_newconn, 2460 tcp_dummy_connected, 2461 tcp_dummy_disconnected, 2462 tcp_dummy_opctl, 2463 tcp_dummy_recv, 2464 tcp_dummy_set_proto_props, 2465 tcp_dummy_txq_full, 2466 tcp_dummy_signal_oob, 2467 tcp_dummy_onearg, 2468 tcp_dummy_set_error, 2469 tcp_dummy_onearg 2470 }; 2471 2472 /* 2473 * Handle M_DATA messages from IP. Its called directly from IP via 2474 * squeue for received IP packets. 2475 * 2476 * The first argument is always the connp/tcp to which the mp belongs. 2477 * There are no exceptions to this rule. The caller has already put 2478 * a reference on this connp/tcp and once tcp_input_data() returns, 2479 * the squeue will do the refrele. 2480 * 2481 * The TH_SYN for the listener directly go to tcp_input_listener via 2482 * squeue. ICMP errors go directly to tcp_icmp_input(). 2483 * 2484 * sqp: NULL = recursive, sqp != NULL means called from squeue 2485 */ 2486 void 2487 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2488 { 2489 int32_t bytes_acked; 2490 int32_t gap; 2491 mblk_t *mp1; 2492 uint_t flags; 2493 uint32_t new_swnd = 0; 2494 uchar_t *iphdr; 2495 uchar_t *rptr; 2496 int32_t rgap; 2497 uint32_t seg_ack; 2498 int seg_len; 2499 uint_t ip_hdr_len; 2500 uint32_t seg_seq; 2501 tcpha_t *tcpha; 2502 int urp; 2503 tcp_opt_t tcpopt; 2504 ip_pkt_t ipp; 2505 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 2506 uint32_t cwnd; 2507 int mss; 2508 conn_t *connp = (conn_t *)arg; 2509 squeue_t *sqp = (squeue_t *)arg2; 2510 tcp_t *tcp = connp->conn_tcp; 2511 tcp_stack_t *tcps = tcp->tcp_tcps; 2512 sock_upcalls_t *sockupcalls; 2513 2514 /* 2515 * RST from fused tcp loopback peer should trigger an unfuse. 
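 * Note that the check below unfuses on any segment that arrives here
 * while fused, not only on a RST.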
2516 */ 2517 if (tcp->tcp_fused) { 2518 TCP_STAT(tcps, tcp_fusion_aborted); 2519 tcp_unfuse(tcp); 2520 } 2521 2522 mss = 0; 2523 iphdr = mp->b_rptr; 2524 rptr = mp->b_rptr; 2525 ASSERT(OK_32PTR(rptr)); 2526 2527 ip_hdr_len = ira->ira_ip_hdr_length; 2528 if (connp->conn_recv_ancillary.crb_all != 0) { 2529 /* 2530 * Record packet information in the ip_pkt_t 2531 */ 2532 ipp.ipp_fields = 0; 2533 if (ira->ira_flags & IRAF_IS_IPV4) { 2534 (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, 2535 B_FALSE); 2536 } else { 2537 uint8_t nexthdrp; 2538 2539 /* 2540 * IPv6 packets can only be received by applications 2541 * that are prepared to receive IPv6 addresses. 2542 * The IP fanout must ensure this. 2543 */ 2544 ASSERT(connp->conn_family == AF_INET6); 2545 2546 (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, 2547 &nexthdrp); 2548 ASSERT(nexthdrp == IPPROTO_TCP); 2549 2550 /* Could have caused a pullup? */ 2551 iphdr = mp->b_rptr; 2552 rptr = mp->b_rptr; 2553 } 2554 } 2555 ASSERT(DB_TYPE(mp) == M_DATA); 2556 ASSERT(mp->b_next == NULL); 2557 2558 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2559 seg_seq = ntohl(tcpha->tha_seq); 2560 seg_ack = ntohl(tcpha->tha_ack); 2561 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 2562 seg_len = (int)(mp->b_wptr - rptr) - 2563 (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); 2564 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { 2565 do { 2566 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 2567 (uintptr_t)INT_MAX); 2568 seg_len += (int)(mp1->b_wptr - mp1->b_rptr); 2569 } while ((mp1 = mp1->b_cont) != NULL && 2570 mp1->b_datap->db_type == M_DATA); 2571 } 2572 2573 DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa, 2574 __dtrace_tcp_void_ip_t *, iphdr, tcp_t *, tcp, 2575 __dtrace_tcp_tcph_t *, tcpha); 2576 2577 if (tcp->tcp_state == TCPS_TIME_WAIT) { 2578 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 2579 seg_len, tcpha, ira); 2580 return; 2581 } 2582 2583 if (sqp != NULL) { 2584 /* 2585 * This is the correct place to update tcp_last_recv_time. Note 2586 * that it is also updated for tcp structure that belongs to 2587 * global and listener queues which do not really need updating. 2588 * But that should not cause any harm. And it is updated for 2589 * all kinds of incoming segments, not only for data segments. 2590 */ 2591 tcp->tcp_last_recv_time = LBOLT_FASTPATH; 2592 } 2593 2594 flags = (unsigned int)tcpha->tha_flags & 0xFF; 2595 2596 TCPS_BUMP_MIB(tcps, tcpHCInSegs); 2597 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 2598 2599 if ((flags & TH_URG) && sqp != NULL) { 2600 /* 2601 * TCP can't handle urgent pointers that arrive before 2602 * the connection has been accept()ed since it can't 2603 * buffer OOB data. Discard segment if this happens. 2604 * 2605 * We can't just rely on a non-null tcp_listener to indicate 2606 * that the accept() has completed since unlinking of the 2607 * eager and completion of the accept are not atomic. 2608 * tcp_detached, when it is not set (B_FALSE) indicates 2609 * that the accept() has completed. 2610 * 2611 * Nor can it reassemble urgent pointers, so discard 2612 * if it's not the next segment expected. 2613 * 2614 * Otherwise, collapse chain into one mblk (discard if 2615 * that fails). This makes sure the headers, retransmitted 2616 * data, and new data all are in the same mblk. 
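 * The pullupmsg(mp, -1) call below performs that collapse; if it fails,
 * the segment is dropped.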
2617 */ 2618 ASSERT(mp != NULL); 2619 if (tcp->tcp_detached || !pullupmsg(mp, -1)) { 2620 freemsg(mp); 2621 return; 2622 } 2623 /* Update pointers into message */ 2624 iphdr = rptr = mp->b_rptr; 2625 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2626 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { 2627 /* 2628 * Since we can't handle any data with this urgent 2629 * pointer that is out of sequence, we expunge 2630 * the data. This allows us to still register 2631 * the urgent mark and generate the M_PCSIG, 2632 * which we can do. 2633 */ 2634 mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); 2635 seg_len = 0; 2636 } 2637 } 2638 2639 sockupcalls = connp->conn_upcalls; 2640 /* A conn_t may have belonged to a now-closed socket. Be careful. */ 2641 if (sockupcalls == NULL) 2642 sockupcalls = &tcp_dummy_upcalls; 2643 2644 switch (tcp->tcp_state) { 2645 case TCPS_SYN_SENT: 2646 if (connp->conn_final_sqp == NULL && 2647 tcp_outbound_squeue_switch && sqp != NULL) { 2648 ASSERT(connp->conn_initial_sqp == connp->conn_sqp); 2649 connp->conn_final_sqp = sqp; 2650 if (connp->conn_final_sqp != connp->conn_sqp) { 2651 DTRACE_PROBE1(conn__final__sqp__switch, 2652 conn_t *, connp); 2653 CONN_INC_REF(connp); 2654 SQUEUE_SWITCH(connp, connp->conn_final_sqp); 2655 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 2656 tcp_input_data, connp, ira, ip_squeue_flag, 2657 SQTAG_CONNECT_FINISH); 2658 return; 2659 } 2660 DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); 2661 } 2662 if (flags & TH_ACK) { 2663 /* 2664 * Note that our stack cannot send data before a 2665 * connection is established, therefore the 2666 * following check is valid. Otherwise, it has 2667 * to be changed. 2668 */ 2669 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 2670 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2671 freemsg(mp); 2672 if (flags & TH_RST) 2673 return; 2674 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 2675 tcp, seg_ack, 0, TH_RST); 2676 return; 2677 } 2678 ASSERT(tcp->tcp_suna + 1 == seg_ack); 2679 } 2680 if (flags & TH_RST) { 2681 if (flags & TH_ACK) { 2682 DTRACE_TCP5(connect__refused, mblk_t *, NULL, 2683 ip_xmit_attr_t *, connp->conn_ixa, 2684 void_ip_t *, iphdr, tcp_t *, tcp, 2685 tcph_t *, tcpha); 2686 (void) tcp_clean_death(tcp, ECONNREFUSED); 2687 } 2688 freemsg(mp); 2689 return; 2690 } 2691 if (!(flags & TH_SYN)) { 2692 freemsg(mp); 2693 return; 2694 } 2695 2696 /* Process all TCP options. */ 2697 if (!tcp_process_options(mp, tcp, tcpha, ira, B_FALSE)) { 2698 freemsg(mp); 2699 return; 2700 } 2701 2702 /* 2703 * The following changes our rwnd to be a multiple of the 2704 * MIN(peer MSS, our MSS) for performance reason. 2705 */ 2706 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, 2707 tcp->tcp_mss)); 2708 2709 /* Is the other end ECN capable? */ 2710 if (tcp->tcp_ecn_ok) { 2711 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 2712 tcp->tcp_ecn_ok = B_FALSE; 2713 } 2714 } 2715 /* 2716 * Clear ECN flags because it may interfere with later 2717 * processing. 2718 */ 2719 flags &= ~(TH_ECE|TH_CWR); 2720 2721 tcp->tcp_irs = seg_seq; 2722 tcp->tcp_rack = seg_seq; 2723 tcp->tcp_rnxt = seg_seq + 1; 2724 tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); 2725 if (!TCP_IS_DETACHED(tcp)) { 2726 /* Allocate room for SACK options if needed. 
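 * The write offset is the header template length, plus the maximum SACK
 * option length when SACK was negotiated, plus the stack's extra wroff
 * for non-loopback connections.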
*/ 2727 connp->conn_wroff = connp->conn_ht_iphc_len; 2728 if (tcp->tcp_snd_sack_ok) 2729 connp->conn_wroff += TCPOPT_MAX_SACK_LEN; 2730 if (!tcp->tcp_loopback) 2731 connp->conn_wroff += tcps->tcps_wroff_xtra; 2732 2733 (void) proto_set_tx_wroff(connp->conn_rq, connp, 2734 connp->conn_wroff); 2735 } 2736 if (flags & TH_ACK) { 2737 /* 2738 * If we can't get the confirmation upstream, pretend 2739 * we didn't even see this one. 2740 * 2741 * XXX: how can we pretend we didn't see it if we 2742 * have updated rnxt et. al. 2743 * 2744 * For loopback we defer sending up the T_CONN_CON 2745 * until after some checks below. 2746 */ 2747 mp1 = NULL; 2748 /* 2749 * tcp_sendmsg() checks tcp_state without entering 2750 * the squeue so tcp_state should be updated before 2751 * sending up connection confirmation. Probe the 2752 * state change below when we are sure the connection 2753 * confirmation has been sent. 2754 */ 2755 tcp->tcp_state = TCPS_ESTABLISHED; 2756 if (!tcp_conn_con(tcp, iphdr, mp, 2757 tcp->tcp_loopback ? &mp1 : NULL, ira)) { 2758 tcp->tcp_state = TCPS_SYN_SENT; 2759 freemsg(mp); 2760 return; 2761 } 2762 TCPS_CONN_INC(tcps); 2763 /* SYN was acked - making progress */ 2764 tcp->tcp_ip_forward_progress = B_TRUE; 2765 2766 /* One for the SYN */ 2767 tcp->tcp_suna = tcp->tcp_iss + 1; 2768 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 2769 2770 /* 2771 * If SYN was retransmitted, need to reset all 2772 * retransmission info. This is because this 2773 * segment will be treated as a dup ACK. 2774 */ 2775 if (tcp->tcp_rexmit) { 2776 tcp->tcp_rexmit = B_FALSE; 2777 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2778 tcp->tcp_rexmit_max = tcp->tcp_snxt; 2779 tcp->tcp_ms_we_have_waited = 0; 2780 2781 /* 2782 * Set tcp_cwnd back to 1 MSS, per 2783 * recommendation from 2784 * draft-floyd-incr-init-win-01.txt, 2785 * Increasing TCP's Initial Window. 2786 */ 2787 DTRACE_PROBE3(cwnd__retransmitted__syn, 2788 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, 2789 uint32_t, tcp->tcp_mss); 2790 tcp->tcp_cwnd = tcp->tcp_mss; 2791 } 2792 2793 tcp->tcp_swl1 = seg_seq; 2794 tcp->tcp_swl2 = seg_ack; 2795 2796 new_swnd = ntohs(tcpha->tha_win); 2797 tcp->tcp_swnd = new_swnd; 2798 if (new_swnd > tcp->tcp_max_swnd) 2799 tcp->tcp_max_swnd = new_swnd; 2800 2801 /* 2802 * Always send the three-way handshake ack immediately 2803 * in order to make the connection complete as soon as 2804 * possible on the accepting host. 2805 */ 2806 flags |= TH_ACK_NEEDED; 2807 2808 /* 2809 * Trace connect-established here. 2810 */ 2811 DTRACE_TCP5(connect__established, mblk_t *, NULL, 2812 ip_xmit_attr_t *, tcp->tcp_connp->conn_ixa, 2813 void_ip_t *, iphdr, tcp_t *, tcp, tcph_t *, tcpha); 2814 2815 /* Trace change from SYN_SENT -> ESTABLISHED here */ 2816 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 2817 connp->conn_ixa, void, NULL, tcp_t *, tcp, 2818 void, NULL, int32_t, TCPS_SYN_SENT); 2819 2820 /* 2821 * Special case for loopback. At this point we have 2822 * received SYN-ACK from the remote endpoint. In 2823 * order to ensure that both endpoints reach the 2824 * fused state prior to any data exchange, the final 2825 * ACK needs to be sent before we indicate T_CONN_CON 2826 * to the module upstream. 2827 */ 2828 if (tcp->tcp_loopback) { 2829 mblk_t *ack_mp; 2830 2831 ASSERT(!tcp->tcp_unfusable); 2832 ASSERT(mp1 != NULL); 2833 /* 2834 * For loopback, we always get a pure SYN-ACK 2835 * and only need to send back the final ACK 2836 * with no data (this is because the other 2837 * tcp is ours and we don't do T/TCP). 
This 2838 * final ACK triggers the passive side to 2839 * perform fusion in ESTABLISHED state. 2840 */ 2841 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { 2842 if (tcp->tcp_ack_tid != 0) { 2843 (void) TCP_TIMER_CANCEL(tcp, 2844 tcp->tcp_ack_tid); 2845 tcp->tcp_ack_tid = 0; 2846 } 2847 tcp_send_data(tcp, ack_mp); 2848 TCPS_BUMP_MIB(tcps, tcpHCOutSegs); 2849 TCPS_BUMP_MIB(tcps, tcpOutAck); 2850 2851 if (!IPCL_IS_NONSTR(connp)) { 2852 /* Send up T_CONN_CON */ 2853 if (ira->ira_cred != NULL) { 2854 mblk_setcred(mp1, 2855 ira->ira_cred, 2856 ira->ira_cpid); 2857 } 2858 putnext(connp->conn_rq, mp1); 2859 } else { 2860 (*sockupcalls->su_connected) 2861 (connp->conn_upper_handle, 2862 tcp->tcp_connid, 2863 ira->ira_cred, 2864 ira->ira_cpid); 2865 freemsg(mp1); 2866 } 2867 2868 freemsg(mp); 2869 return; 2870 } 2871 /* 2872 * Forget fusion; we need to handle more 2873 * complex cases below. Send the deferred 2874 * T_CONN_CON message upstream and proceed 2875 * as usual. Mark this tcp as not capable 2876 * of fusion. 2877 */ 2878 TCP_STAT(tcps, tcp_fusion_unfusable); 2879 tcp->tcp_unfusable = B_TRUE; 2880 if (!IPCL_IS_NONSTR(connp)) { 2881 if (ira->ira_cred != NULL) { 2882 mblk_setcred(mp1, ira->ira_cred, 2883 ira->ira_cpid); 2884 } 2885 putnext(connp->conn_rq, mp1); 2886 } else { 2887 (*sockupcalls->su_connected) 2888 (connp->conn_upper_handle, 2889 tcp->tcp_connid, ira->ira_cred, 2890 ira->ira_cpid); 2891 freemsg(mp1); 2892 } 2893 } 2894 2895 /* 2896 * Check to see if there is data to be sent. If 2897 * yes, set the transmit flag. Then check to see 2898 * if received data processing needs to be done. 2899 * If not, go straight to xmit_check. This short 2900 * cut is OK as we don't support T/TCP. 2901 */ 2902 if (tcp->tcp_unsent) 2903 flags |= TH_XMIT_NEEDED; 2904 2905 if (seg_len == 0 && !(flags & TH_URG)) { 2906 freemsg(mp); 2907 goto xmit_check; 2908 } 2909 2910 flags &= ~TH_SYN; 2911 seg_seq++; 2912 break; 2913 } 2914 tcp->tcp_state = TCPS_SYN_RCVD; 2915 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 2916 connp->conn_ixa, void_ip_t *, NULL, tcp_t *, tcp, 2917 tcph_t *, NULL, int32_t, TCPS_SYN_SENT); 2918 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 2919 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 2920 if (mp1 != NULL) { 2921 tcp_send_data(tcp, mp1); 2922 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2923 } 2924 freemsg(mp); 2925 return; 2926 case TCPS_SYN_RCVD: 2927 if (flags & TH_ACK) { 2928 uint32_t pinit_wnd; 2929 2930 /* 2931 * In this state, a SYN|ACK packet is either bogus 2932 * because the other side must be ACKing our SYN which 2933 * indicates it has seen the ACK for their SYN and 2934 * shouldn't retransmit it or we're crossing SYNs 2935 * on active open. 2936 */ 2937 if ((flags & TH_SYN) && !tcp->tcp_active_open) { 2938 freemsg(mp); 2939 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", 2940 tcp, seg_ack, 0, TH_RST); 2941 return; 2942 } 2943 /* 2944 * NOTE: RFC 793 pg. 72 says this should be 2945 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt 2946 * but that would mean we have an ack that ignored 2947 * our SYN. 2948 */ 2949 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || 2950 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2951 freemsg(mp); 2952 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 2953 tcp, seg_ack, 0, TH_RST); 2954 return; 2955 } 2956 /* 2957 * No sane TCP stack will send such a small window 2958 * without receiving any data. Just drop this invalid 2959 * ACK. We also shorten the abort timeout in case 2960 * this is an attack. 
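 * The ACK is rejected only when the advertised window is smaller than
 * both tcp_mss and tcp_init_wnd_chk.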
2961 */
2962 pinit_wnd = ntohs(tcpha->tha_win) << tcp->tcp_snd_ws;
2963 if (pinit_wnd < tcp->tcp_mss &&
2964 pinit_wnd < tcp_init_wnd_chk) {
2965 freemsg(mp);
2966 TCP_STAT(tcps, tcp_zwin_ack_syn);
2967 tcp->tcp_second_ctimer_threshold =
2968 tcp_early_abort * SECONDS;
2969 return;
2970 }
2971 }
2972 break;
2973 case TCPS_LISTEN:
2974 /*
2975 * Only a TLI listener can come through this path when an
2976 * acceptor is going back to be a listener and a packet
2977 * for the acceptor hits the classifier. For a socket
2978 * listener, this can never happen because a listener
2979 * can never accept a connection on itself and hence a
2980 * socket acceptor cannot go back to being a listener.
2981 */
2982 ASSERT(!TCP_IS_SOCKET(tcp));
2983 /*FALLTHRU*/
2984 case TCPS_CLOSED:
2985 case TCPS_BOUND: {
2986 conn_t *new_connp;
2987 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
2988
2989 /*
2990 * Don't accept any input on a closed tcp as this TCP logically
2991 * does not exist on the system. Don't proceed further with
2992 * this TCP. For instance, this packet could trigger another
2993 * close of this tcp which would be disastrous for tcp_refcnt.
2994 * tcp_close_detached / tcp_clean_death / tcp_closei_local must
2995 * be called at most once on a TCP. In this case we need to
2996 * refeed the packet into the classifier and figure out where
2997 * the packet should go.
2998 */
2999 new_connp = ipcl_classify(mp, ira, ipst);
3000 if (new_connp != NULL) {
3001 /* Drops ref on new_connp */
3002 tcp_reinput(new_connp, mp, ira, ipst);
3003 return;
3004 }
3005 /* We failed to classify. For now just drop the packet */
3006 freemsg(mp);
3007 return;
3008 }
3009 case TCPS_IDLE:
3010 /*
3011 * Handle the case where tcp_clean_death() has happened
3012 * on a connection (the application hasn't closed yet) but a packet
3013 * was already queued on the squeue before tcp_clean_death()
3014 * was processed. Calling tcp_clean_death() twice on the same
3015 * connection can result in weird behaviour.
3016 */
3017 freemsg(mp);
3018 return;
3019 default:
3020 break;
3021 }
3022
3023 /*
3024 * Already on the correct queue/perimeter.
3025 * If this is a detached connection and not an eager
3026 * connection hanging off a listener then new data
3027 * (past the FIN) will cause a reset.
3028 * We do a special check here where it
3029 * is out of the main line, rather than check
3030 * if we are detached every time we see new
3031 * data down below.
3032 */
3033 if (TCP_IS_DETACHED_NONEAGER(tcp) &&
3034 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) {
3035 TCPS_BUMP_MIB(tcps, tcpInClosed);
3036 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
3037 freemsg(mp);
3038 tcp_xmit_ctl("new data when detached", tcp,
3039 tcp->tcp_snxt, 0, TH_RST);
3040 (void) tcp_clean_death(tcp, EPROTO);
3041 return;
3042 }
3043
3044 mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
3045 urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION;
3046 new_swnd = ntohs(tcpha->tha_win) <<
3047 ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
3048
3049 /*
3050 * We are interested in three TCP options: timestamps (if negotiated),
3051 * SACK (if negotiated) and MD5. Skip option parsing if none of these
3052 * is enabled/negotiated.
3053 */
3054 if (tcp->tcp_snd_ts_ok || tcp->tcp_snd_sack_ok || tcp->tcp_md5sig) {
3055 int options;
3056
3057 if (tcp->tcp_snd_sack_ok)
3058 tcpopt.tcp = tcp;
3059 else
3060 tcpopt.tcp = NULL;
3061
3062 options = tcp_parse_options(tcpha, &tcpopt);
3063
3064 if (tcp->tcp_md5sig) {
3065 if ((options & TCP_OPT_SIG_PRESENT) == 0) {
3066 TCP_STAT(tcp->tcp_tcps, tcp_sig_no_option);
3067 freemsg(mp);
3068 return;
3069 }
3070 if (!tcpsig_verify(mp, tcp, tcpha, ira,
3071 tcpopt.tcp_opt_sig)) {
3072 freemsg(mp);
3073 return;
3074 }
3075 }
3076 /*
3077 * RST segments must not be subject to PAWS and are not
3078 * required to have timestamps.
3079 * We do not drop keepalive segments without
3080 * timestamps, to maintain compatibility with legacy TCP stacks.
3081 */
3082 boolean_t keepalive = (seg_len == 0 || seg_len == 1) &&
3083 (seg_seq + 1 == tcp->tcp_rnxt);
3084 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && !keepalive) {
3085 /*
3086 * Per RFC 7323 section 3.2, silently drop non-RST
3087 * segments without expected TSopt. This is a 'SHOULD'
3088 * requirement.
3089 * We accept keepalives without TSopt to maintain
3090 * interoperability with tcp implementations that omit
3091 * the TSopt on these. Keepalive data is discarded, so
3092 * there is no risk of corrupting data by accepting these.
3093 */
3094 if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
3095 /*
3096 * Leave a breadcrumb for people to detect this
3097 * behavior.
3098 */
3099 DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
3100 freemsg(mp);
3101 return;
3102 }
3103
3104 if (!tcp_paws_check(tcp, &tcpopt)) {
3105 /*
3106 * This segment is not acceptable.
3107 * Drop it and send back an ACK.
3108 */
3109 freemsg(mp);
3110 flags |= TH_ACK_NEEDED;
3111 goto ack_check;
3112 }
3113 }
3114 }
3115 try_again:;
3116 mss = tcp->tcp_mss;
3117 gap = seg_seq - tcp->tcp_rnxt;
3118 rgap = tcp->tcp_rwnd - (gap + seg_len);
3119 /*
3120 * gap is the amount of sequence space between what we expect to see
3121 * and what we got for seg_seq. A positive value for gap means
3122 * something got lost. A negative value means we got some old stuff.
3123 */
3124 if (gap < 0) {
3125 /* Old stuff present. Is the SYN in there? */
3126 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) &&
3127 (seg_len != 0)) {
3128 flags &= ~TH_SYN;
3129 seg_seq++;
3130 urp--;
3131 /* Recompute the gaps after noting the SYN. */
3132 goto try_again;
3133 }
3134 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
3135 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
3136 (seg_len > -gap ? -gap : seg_len));
3137 /* Remove the old stuff from seg_len. */
3138 seg_len += gap;
3139 /*
3140 * Anything left?
3141 * Make sure to check for unack'd FIN when rest of data
3142 * has been previously ack'd.
3143 */
3144 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
3145 /*
3146 * Resets are only valid if they lie within our offered
3147 * window. If the RST bit is set, we just ignore this
3148 * segment.
3149 */
3150 if (flags & TH_RST) {
3151 freemsg(mp);
3152 return;
3153 }
3154
3155 /*
3156 * The arrival of duplicate data packets indicates that we
3157 * may have postponed an ack for too long, or the other
3158 * side's RTT estimate is inaccurate. Start acking
3159 * more often.
3160 */
3161 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) &&
3162 tcp->tcp_rack_cnt >= 1 &&
3163 tcp->tcp_rack_abs_max > 2) {
3164 tcp->tcp_rack_abs_max--;
3165 }
3166 tcp->tcp_rack_cur_max = 1;
3167
3168 /*
3169 * This segment is "unacceptable". None of its
3170 * sequence space lies within our advertised window.
3171 * 3172 * Adjust seg_len to the original value for tracing. 3173 */ 3174 seg_len -= gap; 3175 if (connp->conn_debug) { 3176 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3177 "tcp_rput: unacceptable, gap %d, rgap %d, " 3178 "flags 0x%x, seg_seq %u, seg_ack %u, " 3179 "seg_len %d, rnxt %u, snxt %u, %s", 3180 gap, rgap, flags, seg_seq, seg_ack, 3181 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 3182 tcp_display(tcp, NULL, 3183 DISP_ADDR_AND_PORT)); 3184 } 3185 3186 /* 3187 * Arrange to send an ACK in response to the 3188 * unacceptable segment per RFC 793 page 69. There 3189 * is only one small difference between ours and the 3190 * acceptability test in the RFC - we accept ACK-only 3191 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 3192 * will be generated. 3193 * 3194 * Note that we have to ACK an ACK-only packet at least 3195 * for stacks that send 0-length keep-alives with 3196 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 3197 * section 4.2.3.6. As long as we don't ever generate 3198 * an unacceptable packet in response to an incoming 3199 * packet that is unacceptable, it should not cause 3200 * "ACK wars". 3201 */ 3202 flags |= TH_ACK_NEEDED; 3203 3204 /* 3205 * Continue processing this segment in order to use the 3206 * ACK information it contains, but skip all other 3207 * sequence-number processing. Processing the ACK 3208 * information is necessary in order to 3209 * re-synchronize connections that may have lost 3210 * synchronization. 3211 * 3212 * We clear seg_len and flag fields related to 3213 * sequence number processing as they are not 3214 * to be trusted for an unacceptable segment. 3215 */ 3216 seg_len = 0; 3217 flags &= ~(TH_SYN | TH_FIN | TH_URG); 3218 goto process_ack; 3219 } 3220 3221 /* Fix seg_seq, and chew the gap off the front. */ 3222 seg_seq = tcp->tcp_rnxt; 3223 urp += gap; 3224 do { 3225 mblk_t *mp2; 3226 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3227 (uintptr_t)UINT_MAX); 3228 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 3229 if (gap > 0) { 3230 mp->b_rptr = mp->b_wptr - gap; 3231 break; 3232 } 3233 mp2 = mp; 3234 mp = mp->b_cont; 3235 freeb(mp2); 3236 } while (gap < 0); 3237 /* 3238 * If the urgent data has already been acknowledged, we 3239 * should ignore TH_URG below 3240 */ 3241 if (urp < 0) 3242 flags &= ~TH_URG; 3243 } 3244 /* 3245 * rgap is the amount of stuff received out of window. A negative 3246 * value is the amount out of window. 3247 */ 3248 if (rgap < 0) { 3249 mblk_t *mp2; 3250 3251 if (tcp->tcp_rwnd == 0) { 3252 TCPS_BUMP_MIB(tcps, tcpInWinProbe); 3253 tcp->tcp_cs.tcp_in_zwnd_probes++; 3254 } else { 3255 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs); 3256 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap); 3257 } 3258 3259 /* 3260 * seg_len does not include the FIN, so if more than 3261 * just the FIN is out of window, we act like we don't 3262 * see it. (If just the FIN is out of window, rgap 3263 * will be zero and we will go ahead and acknowledge 3264 * the FIN.) 3265 */ 3266 flags &= ~TH_FIN; 3267 3268 /* Fix seg_len and make sure there is something left. */ 3269 seg_len += rgap; 3270 if (seg_len <= 0) { 3271 /* 3272 * Resets are only valid if they lie within our offered 3273 * window. If the RST bit is set, we just ignore this 3274 * segment. 3275 */ 3276 if (flags & TH_RST) { 3277 freemsg(mp); 3278 return; 3279 } 3280 3281 /* Per RFC 793, we need to send back an ACK. */ 3282 flags |= TH_ACK_NEEDED; 3283 3284 /* 3285 * Send SIGURG as soon as possible i.e. 
even 3286 * if the TH_URG was delivered in a window probe 3287 * packet (which will be unacceptable). 3288 * 3289 * We generate a signal if none has been generated 3290 * for this connection or if this is a new urgent 3291 * byte. Also send a zero-length "unmarked" message 3292 * to inform SIOCATMARK that this is not the mark. 3293 * 3294 * tcp_urp_last_valid is cleared when the T_exdata_ind 3295 * is sent up. This plus the check for old data 3296 * (gap >= 0) handles the wraparound of the sequence 3297 * number space without having to always track the 3298 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks 3299 * this max in its rcv_up variable). 3300 * 3301 * This prevents duplicate SIGURGS due to a "late" 3302 * zero-window probe when the T_EXDATA_IND has already 3303 * been sent up. 3304 */ 3305 if ((flags & TH_URG) && 3306 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, 3307 tcp->tcp_urp_last))) { 3308 if (IPCL_IS_NONSTR(connp)) { 3309 if (!TCP_IS_DETACHED(tcp)) { 3310 (*sockupcalls->su_signal_oob) 3311 (connp->conn_upper_handle, 3312 urp); 3313 } 3314 } else { 3315 mp1 = allocb(0, BPRI_MED); 3316 if (mp1 == NULL) { 3317 freemsg(mp); 3318 return; 3319 } 3320 if (!TCP_IS_DETACHED(tcp) && 3321 !putnextctl1(connp->conn_rq, 3322 M_PCSIG, SIGURG)) { 3323 /* Try again on the rexmit. */ 3324 freemsg(mp1); 3325 freemsg(mp); 3326 return; 3327 } 3328 /* 3329 * If the next byte would be the mark 3330 * then mark with MARKNEXT else mark 3331 * with NOTMARKNEXT. 3332 */ 3333 if (gap == 0 && urp == 0) 3334 mp1->b_flag |= MSGMARKNEXT; 3335 else 3336 mp1->b_flag |= MSGNOTMARKNEXT; 3337 freemsg(tcp->tcp_urp_mark_mp); 3338 tcp->tcp_urp_mark_mp = mp1; 3339 flags |= TH_SEND_URP_MARK; 3340 } 3341 tcp->tcp_urp_last_valid = B_TRUE; 3342 tcp->tcp_urp_last = urp + seg_seq; 3343 } 3344 /* 3345 * If this is a zero window probe, continue to 3346 * process the ACK part. But we need to set seg_len 3347 * to 0 to avoid data processing. Otherwise just 3348 * drop the segment and send back an ACK. 3349 */ 3350 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 3351 flags &= ~(TH_SYN | TH_URG); 3352 seg_len = 0; 3353 goto process_ack; 3354 } else { 3355 freemsg(mp); 3356 goto ack_check; 3357 } 3358 } 3359 /* Pitch out of window stuff off the end. */ 3360 rgap = seg_len; 3361 mp2 = mp; 3362 do { 3363 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 3364 (uintptr_t)INT_MAX); 3365 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 3366 if (rgap < 0) { 3367 mp2->b_wptr += rgap; 3368 if ((mp1 = mp2->b_cont) != NULL) { 3369 mp2->b_cont = NULL; 3370 freemsg(mp1); 3371 } 3372 break; 3373 } 3374 } while ((mp2 = mp2->b_cont) != NULL); 3375 } 3376 ok:; 3377 /* 3378 * TCP should check ECN info for segments inside the window only. 3379 * Therefore the check should be done here. 3380 */ 3381 if (tcp->tcp_ecn_ok) { 3382 if (flags & TH_CWR) { 3383 tcp->tcp_ecn_echo_on = B_FALSE; 3384 } 3385 /* 3386 * Note that both ECN_CE and CWR can be set in the 3387 * same segment. In this case, we once again turn 3388 * on ECN_ECHO. 3389 */ 3390 if (connp->conn_ipversion == IPV4_VERSION) { 3391 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; 3392 3393 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 3394 tcp->tcp_ecn_echo_on = B_TRUE; 3395 } 3396 } else { 3397 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; 3398 3399 if ((vcf & htonl(IPH_ECN_CE << 20)) == 3400 htonl(IPH_ECN_CE << 20)) { 3401 tcp->tcp_ecn_echo_on = B_TRUE; 3402 } 3403 } 3404 } 3405 3406 /* 3407 * Check whether we can update tcp_ts_recent. This test is from RFC 3408 * 7323, section 5.3. 
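 * That is, ts_recent is advanced only for non-RST segments whose TSval
 * is at least the currently saved value and whose sequence number does
 * not pass the last ACK we sent (tcp_rack).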
3409 */ 3410 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && 3411 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 3412 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 3413 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 3414 tcp->tcp_last_rcv_lbolt = LBOLT_FASTPATH64; 3415 } 3416 3417 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 3418 /* 3419 * FIN in an out of order segment. We record this in 3420 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 3421 * Clear the FIN so that any check on FIN flag will fail. 3422 * Remember that FIN also counts in the sequence number 3423 * space. So we need to ack out of order FIN only segments. 3424 */ 3425 if (flags & TH_FIN) { 3426 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 3427 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 3428 flags &= ~TH_FIN; 3429 flags |= TH_ACK_NEEDED; 3430 } 3431 if (seg_len > 0) { 3432 /* Fill in the SACK blk list. */ 3433 if (tcp->tcp_snd_sack_ok) { 3434 tcp_sack_insert(tcp->tcp_sack_list, 3435 seg_seq, seg_seq + seg_len, 3436 &(tcp->tcp_num_sack_blk)); 3437 } 3438 3439 /* 3440 * Attempt reassembly and see if we have something 3441 * ready to go. 3442 */ 3443 mp = tcp_reass(tcp, mp, seg_seq); 3444 /* Always ack out of order packets */ 3445 flags |= TH_ACK_NEEDED | TH_PUSH; 3446 if (mp) { 3447 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3448 (uintptr_t)INT_MAX); 3449 seg_len = mp->b_cont ? msgdsize(mp) : 3450 (int)(mp->b_wptr - mp->b_rptr); 3451 seg_seq = tcp->tcp_rnxt; 3452 /* 3453 * A gap is filled and the seq num and len 3454 * of the gap match that of a previously 3455 * received FIN, put the FIN flag back in. 3456 */ 3457 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3458 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3459 flags |= TH_FIN; 3460 tcp->tcp_valid_bits &= 3461 ~TCP_OFO_FIN_VALID; 3462 } 3463 if (tcp->tcp_reass_tid != 0) { 3464 (void) TCP_TIMER_CANCEL(tcp, 3465 tcp->tcp_reass_tid); 3466 /* 3467 * Restart the timer if there is still 3468 * data in the reassembly queue. 3469 */ 3470 if (tcp->tcp_reass_head != NULL) { 3471 tcp->tcp_reass_tid = TCP_TIMER( 3472 tcp, tcp_reass_timer, 3473 tcps->tcps_reass_timeout); 3474 } else { 3475 tcp->tcp_reass_tid = 0; 3476 } 3477 } 3478 } else { 3479 /* 3480 * Keep going even with NULL mp. 3481 * There may be a useful ACK or something else 3482 * we don't want to miss. 3483 * 3484 * But TCP should not perform fast retransmit 3485 * because of the ack number. TCP uses 3486 * seg_len == 0 to determine if it is a pure 3487 * ACK. And this is not a pure ACK. 3488 */ 3489 seg_len = 0; 3490 ofo_seg = B_TRUE; 3491 3492 if (tcps->tcps_reass_timeout != 0 && 3493 tcp->tcp_reass_tid == 0) { 3494 tcp->tcp_reass_tid = TCP_TIMER(tcp, 3495 tcp_reass_timer, 3496 tcps->tcps_reass_timeout); 3497 } 3498 } 3499 } 3500 } else if (seg_len > 0) { 3501 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); 3502 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); 3503 tcp->tcp_cs.tcp_in_data_inorder_segs++; 3504 tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len; 3505 3506 /* 3507 * If an out of order FIN was received before, and the seq 3508 * num and len of the new segment match that of the FIN, 3509 * put the FIN flag back in. 
3510 */ 3511 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3512 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3513 flags |= TH_FIN; 3514 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3515 } 3516 } 3517 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3518 if (flags & TH_RST) { 3519 freemsg(mp); 3520 switch (tcp->tcp_state) { 3521 case TCPS_SYN_RCVD: 3522 (void) tcp_clean_death(tcp, ECONNREFUSED); 3523 break; 3524 case TCPS_ESTABLISHED: 3525 case TCPS_FIN_WAIT_1: 3526 case TCPS_FIN_WAIT_2: 3527 case TCPS_CLOSE_WAIT: 3528 (void) tcp_clean_death(tcp, ECONNRESET); 3529 break; 3530 case TCPS_CLOSING: 3531 case TCPS_LAST_ACK: 3532 (void) tcp_clean_death(tcp, 0); 3533 break; 3534 default: 3535 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 3536 (void) tcp_clean_death(tcp, ENXIO); 3537 break; 3538 } 3539 return; 3540 } 3541 if (flags & TH_SYN) { 3542 /* 3543 * See RFC 793, Page 71 3544 * 3545 * The seq number must be in the window as it should 3546 * be "fixed" above. If it is outside window, it should 3547 * be already rejected. Note that we allow seg_seq to be 3548 * rnxt + rwnd because we want to accept 0 window probe. 3549 */ 3550 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3551 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3552 freemsg(mp); 3553 /* 3554 * If the ACK flag is not set, just use our snxt as the 3555 * seq number of the RST segment. 3556 */ 3557 if (!(flags & TH_ACK)) { 3558 seg_ack = tcp->tcp_snxt; 3559 } 3560 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 3561 TH_RST|TH_ACK); 3562 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 3563 (void) tcp_clean_death(tcp, ECONNRESET); 3564 return; 3565 } 3566 /* 3567 * urp could be -1 when the urp field in the packet is 0 3568 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent 3569 * byte was at seg_seq - 1, in which case we ignore the urgent flag. 3570 */ 3571 if ((flags & TH_URG) && urp >= 0) { 3572 if (!tcp->tcp_urp_last_valid || 3573 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { 3574 /* 3575 * Non-STREAMS sockets handle the urgent data a litte 3576 * differently from STREAMS based sockets. There is no 3577 * need to mark any mblks with the MSG{NOT,}MARKNEXT 3578 * flags to keep SIOCATMARK happy. Instead a 3579 * su_signal_oob upcall is made to update the mark. 3580 * Neither is a T_EXDATA_IND mblk needed to be 3581 * prepended to the urgent data. The urgent data is 3582 * delivered using the su_recv upcall, where we set 3583 * the MSG_OOB flag to indicate that it is urg data. 3584 * 3585 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED 3586 * are used by non-STREAMS sockets. 3587 */ 3588 if (IPCL_IS_NONSTR(connp)) { 3589 if (!TCP_IS_DETACHED(tcp)) { 3590 (*sockupcalls->su_signal_oob) 3591 (connp->conn_upper_handle, urp); 3592 } 3593 } else { 3594 /* 3595 * If we haven't generated the signal yet for 3596 * this urgent pointer value, do it now. Also, 3597 * send up a zero-length M_DATA indicating 3598 * whether or not this is the mark. The latter 3599 * is not needed when a T_EXDATA_IND is sent up. 3600 * However, if there are allocation failures 3601 * this code relies on the sender retransmitting 3602 * and the socket code for determining the mark 3603 * should not block waiting for the peer to 3604 * transmit. Thus, for simplicity we always 3605 * send up the mark indication. 3606 */ 3607 mp1 = allocb(0, BPRI_MED); 3608 if (mp1 == NULL) { 3609 freemsg(mp); 3610 return; 3611 } 3612 if (!TCP_IS_DETACHED(tcp) && 3613 !putnextctl1(connp->conn_rq, M_PCSIG, 3614 SIGURG)) { 3615 /* Try again on the rexmit. 
*/ 3616 freemsg(mp1); 3617 freemsg(mp); 3618 return; 3619 } 3620 /* 3621 * Mark with NOTMARKNEXT for now. 3622 * The code below will change this to MARKNEXT 3623 * if we are at the mark. 3624 * 3625 * If there are allocation failures (e.g. in 3626 * dupmsg below) the next time tcp_input_data 3627 * sees the urgent segment it will send up the 3628 * MSGMARKNEXT message. 3629 */ 3630 mp1->b_flag |= MSGNOTMARKNEXT; 3631 freemsg(tcp->tcp_urp_mark_mp); 3632 tcp->tcp_urp_mark_mp = mp1; 3633 flags |= TH_SEND_URP_MARK; 3634 #ifdef DEBUG 3635 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3636 "tcp_rput: sent M_PCSIG 2 seq %x urp %x " 3637 "last %x, %s", 3638 seg_seq, urp, tcp->tcp_urp_last, 3639 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 3640 #endif /* DEBUG */ 3641 } 3642 tcp->tcp_urp_last_valid = B_TRUE; 3643 tcp->tcp_urp_last = urp + seg_seq; 3644 } else if (tcp->tcp_urp_mark_mp != NULL) { 3645 /* 3646 * An allocation failure prevented the previous 3647 * tcp_input_data from sending up the allocated 3648 * MSG*MARKNEXT message - send it up this time 3649 * around. 3650 */ 3651 flags |= TH_SEND_URP_MARK; 3652 } 3653 3654 /* 3655 * If the urgent byte is in this segment, make sure that it is 3656 * all by itself. This makes it much easier to deal with the 3657 * possibility of an allocation failure on the T_exdata_ind. 3658 * Note that seg_len is the number of bytes in the segment, and 3659 * urp is the offset into the segment of the urgent byte. 3660 * urp < seg_len means that the urgent byte is in this segment. 3661 */ 3662 if (urp < seg_len) { 3663 if (seg_len != 1) { 3664 uint32_t tmp_rnxt; 3665 /* 3666 * Break it up and feed it back in. 3667 * Re-attach the IP header. 3668 */ 3669 mp->b_rptr = iphdr; 3670 if (urp > 0) { 3671 /* 3672 * There is stuff before the urgent 3673 * byte. 3674 */ 3675 mp1 = dupmsg(mp); 3676 if (!mp1) { 3677 /* 3678 * Trim from urgent byte on. 3679 * The rest will come back. 3680 */ 3681 (void) adjmsg(mp, 3682 urp - seg_len); 3683 tcp_input_data(connp, 3684 mp, NULL, ira); 3685 return; 3686 } 3687 (void) adjmsg(mp1, urp - seg_len); 3688 /* Feed this piece back in. */ 3689 tmp_rnxt = tcp->tcp_rnxt; 3690 tcp_input_data(connp, mp1, NULL, ira); 3691 /* 3692 * If the data passed back in was not 3693 * processed (ie: bad ACK) sending 3694 * the remainder back in will cause a 3695 * loop. In this case, drop the 3696 * packet and let the sender try 3697 * sending a good packet. 3698 */ 3699 if (tmp_rnxt == tcp->tcp_rnxt) { 3700 freemsg(mp); 3701 return; 3702 } 3703 } 3704 if (urp != seg_len - 1) { 3705 uint32_t tmp_rnxt; 3706 /* 3707 * There is stuff after the urgent 3708 * byte. 3709 */ 3710 mp1 = dupmsg(mp); 3711 if (!mp1) { 3712 /* 3713 * Trim everything beyond the 3714 * urgent byte. The rest will 3715 * come back. 3716 */ 3717 (void) adjmsg(mp, 3718 urp + 1 - seg_len); 3719 tcp_input_data(connp, 3720 mp, NULL, ira); 3721 return; 3722 } 3723 (void) adjmsg(mp1, urp + 1 - seg_len); 3724 tmp_rnxt = tcp->tcp_rnxt; 3725 tcp_input_data(connp, mp1, NULL, ira); 3726 /* 3727 * If the data passed back in was not 3728 * processed (ie: bad ACK) sending 3729 * the remainder back in will cause a 3730 * loop. In this case, drop the 3731 * packet and let the sender try 3732 * sending a good packet. 3733 */ 3734 if (tmp_rnxt == tcp->tcp_rnxt) { 3735 freemsg(mp); 3736 return; 3737 } 3738 } 3739 tcp_input_data(connp, mp, NULL, ira); 3740 return; 3741 } 3742 /* 3743 * This segment contains only the urgent byte. We 3744 * have to allocate the T_exdata_ind, if we can. 
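 * (By the time we reach this point, the splitting above guarantees
 * that seg_len == 1 and urp == 0, so this mblk is the urgent byte
 * all by itself.)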
3745 */ 3746 if (IPCL_IS_NONSTR(connp)) { 3747 int error; 3748 3749 (*sockupcalls->su_recv) 3750 (connp->conn_upper_handle, mp, seg_len, 3751 MSG_OOB, &error, NULL); 3752 /* 3753 * We should never be in middle of a 3754 * fallback, the squeue guarantees that. 3755 */ 3756 ASSERT(error != EOPNOTSUPP); 3757 mp = NULL; 3758 goto update_ack; 3759 } else if (!tcp->tcp_urp_mp) { 3760 struct T_exdata_ind *tei; 3761 mp1 = allocb(sizeof (struct T_exdata_ind), 3762 BPRI_MED); 3763 if (!mp1) { 3764 /* 3765 * Sigh... It'll be back. 3766 * Generate any MSG*MARK message now. 3767 */ 3768 freemsg(mp); 3769 seg_len = 0; 3770 if (flags & TH_SEND_URP_MARK) { 3771 3772 3773 ASSERT(tcp->tcp_urp_mark_mp); 3774 tcp->tcp_urp_mark_mp->b_flag &= 3775 ~MSGNOTMARKNEXT; 3776 tcp->tcp_urp_mark_mp->b_flag |= 3777 MSGMARKNEXT; 3778 } 3779 goto ack_check; 3780 } 3781 mp1->b_datap->db_type = M_PROTO; 3782 tei = (struct T_exdata_ind *)mp1->b_rptr; 3783 tei->PRIM_type = T_EXDATA_IND; 3784 tei->MORE_flag = 0; 3785 mp1->b_wptr = (uchar_t *)&tei[1]; 3786 tcp->tcp_urp_mp = mp1; 3787 #ifdef DEBUG 3788 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3789 "tcp_rput: allocated exdata_ind %s", 3790 tcp_display(tcp, NULL, 3791 DISP_PORT_ONLY)); 3792 #endif /* DEBUG */ 3793 /* 3794 * There is no need to send a separate MSG*MARK 3795 * message since the T_EXDATA_IND will be sent 3796 * now. 3797 */ 3798 flags &= ~TH_SEND_URP_MARK; 3799 freemsg(tcp->tcp_urp_mark_mp); 3800 tcp->tcp_urp_mark_mp = NULL; 3801 } 3802 /* 3803 * Now we are all set. On the next putnext upstream, 3804 * tcp_urp_mp will be non-NULL and will get prepended 3805 * to what has to be this piece containing the urgent 3806 * byte. If for any reason we abort this segment below, 3807 * if it comes back, we will have this ready, or it 3808 * will get blown off in close. 3809 */ 3810 } else if (urp == seg_len) { 3811 /* 3812 * The urgent byte is the next byte after this sequence 3813 * number. If this endpoint is non-STREAMS, then there 3814 * is nothing to do here since the socket has already 3815 * been notified about the urg pointer by the 3816 * su_signal_oob call above. 3817 * 3818 * In case of STREAMS, some more work might be needed. 3819 * If there is data it is marked with MSGMARKNEXT, 3820 * and any tcp_urp_mark_mp is discarded since it is not 3821 * needed. Otherwise, if the code above just allocated 3822 * a zero-length tcp_urp_mark_mp message, that message 3823 * is tagged with MSGMARKNEXT. Sending up these 3824 * MSGMARKNEXT messages makes SIOCATMARK work correctly 3825 * even though the T_EXDATA_IND will not be sent up 3826 * until the urgent byte arrives.
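 * For example (hypothetical numbers): with seg_seq 5000, seg_len 10
 * and urp 10, this segment covers sequence 5000..5009 and the urgent
 * byte is the very next byte at 5010, so everything in this segment
 * is data before the mark.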
3827 */ 3828 if (!IPCL_IS_NONSTR(tcp->tcp_connp)) { 3829 if (seg_len != 0) { 3830 flags |= TH_MARKNEXT_NEEDED; 3831 freemsg(tcp->tcp_urp_mark_mp); 3832 tcp->tcp_urp_mark_mp = NULL; 3833 flags &= ~TH_SEND_URP_MARK; 3834 } else if (tcp->tcp_urp_mark_mp != NULL) { 3835 flags |= TH_SEND_URP_MARK; 3836 tcp->tcp_urp_mark_mp->b_flag &= 3837 ~MSGNOTMARKNEXT; 3838 tcp->tcp_urp_mark_mp->b_flag |= 3839 MSGMARKNEXT; 3840 } 3841 } 3842 #ifdef DEBUG 3843 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3844 "tcp_rput: AT MARK, len %d, flags 0x%x, %s", 3845 seg_len, flags, 3846 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 3847 #endif /* DEBUG */ 3848 } 3849 #ifdef DEBUG 3850 else { 3851 /* Data left until we hit mark */ 3852 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 3853 "tcp_rput: URP %d bytes left, %s", 3854 urp - seg_len, tcp_display(tcp, NULL, 3855 DISP_PORT_ONLY)); 3856 } 3857 #endif /* DEBUG */ 3858 } 3859 3860 process_ack: 3861 if (!(flags & TH_ACK)) { 3862 freemsg(mp); 3863 goto xmit_check; 3864 } 3865 } 3866 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3867 3868 if (bytes_acked > 0) 3869 tcp->tcp_ip_forward_progress = B_TRUE; 3870 if (tcp->tcp_state == TCPS_SYN_RCVD) { 3871 /* 3872 * tcp_sendmsg() checks tcp_state without entering 3873 * the squeue so tcp_state should be updated before 3874 * sending up a connection confirmation or a new 3875 * connection indication. 3876 */ 3877 tcp->tcp_state = TCPS_ESTABLISHED; 3878 3879 /* 3880 * We are seeing the final ACK in the three-way 3881 * handshake of an actively opened connection, 3882 * so we must send up a T_CONN_CON. 3883 */ 3884 if (tcp->tcp_active_open) { 3885 if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { 3886 freemsg(mp); 3887 tcp->tcp_state = TCPS_SYN_RCVD; 3888 return; 3889 } 3890 /* 3891 * Don't fuse the loopback endpoints for 3892 * simultaneous active opens. 3893 */ 3894 if (tcp->tcp_loopback) { 3895 TCP_STAT(tcps, tcp_fusion_unfusable); 3896 tcp->tcp_unfusable = B_TRUE; 3897 } 3898 /* 3899 * For simultaneous active open, trace receipt of final 3900 * ACK as tcp:::connect-established. 3901 */ 3902 DTRACE_TCP5(connect__established, mblk_t *, NULL, 3903 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, 3904 iphdr, tcp_t *, tcp, tcph_t *, tcpha); 3905 } else if (IPCL_IS_NONSTR(connp)) { 3906 /* 3907 * 3-way handshake has completed, so notify socket 3908 * of the new connection. 3909 * 3910 * Being here means the eager is fine, but it can 3911 * still get a TH_RST at any point between now and 3912 * when the accept completes, and then disappear. We need to 3913 * ensure that the reference to the eager is valid after 3914 * we get out of the eager's perimeter. So we do 3915 * an extra refhold. 3916 */ 3917 CONN_INC_REF(connp); 3918 3919 if (!tcp_newconn_notify(tcp, ira)) { 3920 /* 3921 * The state-change probe for SYN_RCVD -> 3922 * ESTABLISHED has not fired yet. We reset 3923 * the state to SYN_RCVD so that future 3924 * state-change probes report correct state 3925 * transitions. 3926 */ 3927 tcp->tcp_state = TCPS_SYN_RCVD; 3928 freemsg(mp); 3929 /* notification did not go up, so drop ref */ 3930 CONN_DEC_REF(connp); 3931 /* ... and close the eager */ 3932 ASSERT(TCP_IS_DETACHED(tcp)); 3933 (void) tcp_close_detached(tcp); 3934 return; 3935 } 3936 /* 3937 * tcp_newconn_notify() changes conn_upcalls and 3938 * connp->conn_upper_handle. Fix things now, in case 3939 * there's data attached to this ack. 3940 */ 3941 if (connp->conn_upcalls != NULL) 3942 sockupcalls = connp->conn_upcalls; 3943 /* 3944 * For passive open, trace receipt of final ACK as 3945 * tcp:::accept-established.
3946 */ 3947 DTRACE_TCP5(accept__established, mblk_t *, NULL, 3948 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, 3949 iphdr, tcp_t *, tcp, tcph_t *, tcpha); 3950 } else { 3951 /* 3952 * 3-way handshake complete - this is a STREAMS based 3953 * socket, so pass up the T_CONN_IND. 3954 */ 3955 tcp_t *listener = tcp->tcp_listener; 3956 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; 3957 3958 tcp->tcp_tconnind_started = B_TRUE; 3959 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 3960 ASSERT(mp != NULL); 3961 /* 3962 * Being here means the eager is fine, but it can 3963 * still get a TH_RST at any point between now and 3964 * when the accept completes, and then disappear. We need to 3965 * ensure that the reference to the eager is valid after 3966 * we get out of the eager's perimeter. So we do 3967 * an extra refhold. 3968 */ 3969 CONN_INC_REF(connp); 3970 3971 /* 3972 * The listener also exists because of the refhold 3973 * done in tcp_input_listener. It's possible that it 3974 * might have closed. We will check that once we 3975 * get inside the listener's context. 3976 */ 3977 CONN_INC_REF(listener->tcp_connp); 3978 if (listener->tcp_connp->conn_sqp == 3979 connp->conn_sqp) { 3980 /* 3981 * We optimize by not calling an SQUEUE_ENTER 3982 * on the listener since we know that the 3983 * listener and eager squeues are the same. 3984 * We are able to make this check safely only 3985 * because neither the eager nor the listener 3986 * can change its squeue. Only an active connect 3987 * can change its squeue. 3988 */ 3989 tcp_send_conn_ind(listener->tcp_connp, mp, 3990 listener->tcp_connp->conn_sqp); 3991 CONN_DEC_REF(listener->tcp_connp); 3992 } else if (!tcp->tcp_loopback) { 3993 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 3994 mp, tcp_send_conn_ind, 3995 listener->tcp_connp, NULL, SQ_FILL, 3996 SQTAG_TCP_CONN_IND); 3997 } else { 3998 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 3999 mp, tcp_send_conn_ind, 4000 listener->tcp_connp, NULL, SQ_NODRAIN, 4001 SQTAG_TCP_CONN_IND); 4002 } 4003 /* 4004 * For passive open, trace receipt of final ACK as 4005 * tcp:::accept-established. 4006 */ 4007 DTRACE_TCP5(accept__established, mblk_t *, NULL, 4008 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, 4009 iphdr, tcp_t *, tcp, tcph_t *, tcpha); 4010 } 4011 TCPS_CONN_INC(tcps); 4012 4013 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 4014 bytes_acked--; 4015 /* SYN was acked - making progress */ 4016 tcp->tcp_ip_forward_progress = B_TRUE; 4017 4018 /* 4019 * If SYN was retransmitted, need to reset all 4020 * retransmission info as this segment will be 4021 * treated as a dup ACK. 4022 */ 4023 if (tcp->tcp_rexmit) { 4024 tcp->tcp_rexmit = B_FALSE; 4025 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 4026 tcp->tcp_rexmit_max = tcp->tcp_snxt; 4027 tcp->tcp_ms_we_have_waited = 0; 4028 DTRACE_PROBE3(cwnd__retransmitted__syn, 4029 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, 4030 uint32_t, tcp->tcp_mss); 4031 tcp->tcp_cwnd = mss; 4032 } 4033 4034 /* 4035 * We set the send window to zero here. 4036 * This is needed if there is data to be 4037 * processed already on the queue. 4038 * Later (at the swnd_update label), the 4039 * "new_swnd > tcp_swnd" condition is satisfied and 4040 * the XMIT_NEEDED flag is set in the current 4041 * (SYN_RCVD) state. This ensures tcp_wput_data() is 4042 * called if there is already data on queue in 4043 * this state.
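 * (When control reaches swnd_update below, any non-zero window
 * advertised in this ACK is greater than this temporary value of
 * zero, so the real send window is installed and, if unsent data is
 * queued, XMIT_NEEDED is set.)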
4044 */ 4045 tcp->tcp_swnd = 0; 4046 4047 if (new_swnd > tcp->tcp_max_swnd) 4048 tcp->tcp_max_swnd = new_swnd; 4049 tcp->tcp_swl1 = seg_seq; 4050 tcp->tcp_swl2 = seg_ack; 4051 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 4052 4053 /* Trace change from SYN_RCVD -> ESTABLISHED here */ 4054 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, 4055 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, 4056 int32_t, TCPS_SYN_RCVD); 4057 4058 /* Fuse when both sides are in ESTABLISHED state */ 4059 if (tcp->tcp_loopback && do_tcp_fusion) 4060 tcp_fuse(tcp, iphdr, tcpha); 4061 4062 } 4063 /* This code follows 4.4BSD-Lite2 mostly. */ 4064 if (bytes_acked < 0) 4065 goto est; 4066 4067 /* 4068 * If TCP is ECN capable and the congestion experience bit is 4069 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 4070 * done once per window (or more loosely, per RTT). 4071 */ 4072 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 4073 tcp->tcp_cwr = B_FALSE; 4074 if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) { 4075 cc_cong_signal(tcp, seg_ack, CC_ECN); 4076 /* 4077 * If the cwnd is 0, use the timer to clock out 4078 * new segments. This is required by the ECN spec. 4079 */ 4080 if (tcp->tcp_cwnd == 0) 4081 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4082 tcp->tcp_cwr = B_TRUE; 4083 /* 4084 * This marks the end of the current window of in 4085 * flight data. That is why we don't use 4086 * tcp_suna + tcp_swnd. Only data in flight can 4087 * provide ECN info. 4088 */ 4089 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 4090 } 4091 4092 mp1 = tcp->tcp_xmit_head; 4093 if (bytes_acked == 0) { 4094 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 4095 int dupack_cnt; 4096 4097 TCPS_BUMP_MIB(tcps, tcpInDupAck); 4098 /* 4099 * Fast retransmit. When we have seen exactly three 4100 * identical ACKs while we have unacked data 4101 * outstanding we take it as a hint that our peer 4102 * dropped something. 4103 * 4104 * If TCP is retransmitting, don't do fast retransmit. 4105 */ 4106 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && 4107 ! tcp->tcp_rexmit) { 4108 /* Do Limited Transmit */ 4109 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 4110 tcps->tcps_dupack_fast_retransmit) { 4111 cc_ack_received(tcp, seg_ack, 4112 bytes_acked, CC_DUPACK); 4113 /* 4114 * RFC 3042 4115 * 4116 * What we need to do is temporarily 4117 * increase tcp_cwnd so that new 4118 * data can be sent if it is allowed 4119 * by the receive window (tcp_rwnd). 4120 * tcp_wput_data() will take care of 4121 * the rest. 4122 * 4123 * If the connection is SACK capable, 4124 * only do limited xmit when there 4125 * is SACK info. 4126 * 4127 * Note how tcp_cwnd is incremented. 4128 * The first dup ACK will increase 4129 * it by 1 MSS. The second dup ACK 4130 * will increase it by 2 MSS. This 4131 * means that only 1 new segment will 4132 * be sent for each dup ACK. 4133 */ 4134 if (tcp->tcp_unsent > 0 && 4135 (!tcp->tcp_snd_sack_ok || 4136 (tcp->tcp_snd_sack_ok && 4137 tcp->tcp_notsack_list != NULL))) { 4138 tcp->tcp_cwnd += mss << 4139 (tcp->tcp_dupack_cnt - 1); 4140 flags |= TH_LIMIT_XMIT; 4141 } 4142 } else if (dupack_cnt == 4143 tcps->tcps_dupack_fast_retransmit) { 4144 4145 /* 4146 * If we have reduced tcp_ssthresh 4147 * because of ECN, do not reduce it again 4148 * unless it is already one window of data 4149 * away. After one window of data, tcp_cwr 4150 * should then be cleared. Note that 4151 * for non ECN capable connection, tcp_cwr 4152 * should always be false. 
4153 * 4154 * Adjust cwnd since the duplicate 4155 * ack indicates that a packet was 4156 * dropped (due to congestion.) 4157 */ 4158 if (!tcp->tcp_cwr) { 4159 cc_cong_signal(tcp, seg_ack, 4160 CC_NDUPACK); 4161 cc_ack_received(tcp, seg_ack, 4162 bytes_acked, CC_DUPACK); 4163 } 4164 if (tcp->tcp_ecn_ok) { 4165 tcp->tcp_cwr = B_TRUE; 4166 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 4167 tcp->tcp_ecn_cwr_sent = B_FALSE; 4168 } 4169 4170 /* 4171 * We do Hoe's algorithm. Refer to her 4172 * paper "Improving the Start-up Behavior 4173 * of a Congestion Control Scheme for TCP," 4174 * appeared in SIGCOMM'96. 4175 * 4176 * Save highest seq no we have sent so far. 4177 * Be careful about the invisible FIN byte. 4178 */ 4179 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 4180 (tcp->tcp_unsent == 0)) { 4181 tcp->tcp_rexmit_max = tcp->tcp_fss; 4182 } else { 4183 tcp->tcp_rexmit_max = tcp->tcp_snxt; 4184 } 4185 4186 /* 4187 * For SACK: 4188 * Calculate tcp_pipe, which is the 4189 * estimated number of bytes in 4190 * network. 4191 * 4192 * tcp_fack is the highest sack'ed seq num 4193 * TCP has received. 4194 * 4195 * tcp_pipe is explained in the above quoted 4196 * Fall and Floyd's paper. tcp_fack is 4197 * explained in Mathis and Mahdavi's 4198 * "Forward Acknowledgment: Refining TCP 4199 * Congestion Control" in SIGCOMM '96. 4200 */ 4201 if (tcp->tcp_snd_sack_ok) { 4202 if (tcp->tcp_notsack_list != NULL) { 4203 tcp->tcp_pipe = tcp->tcp_snxt - 4204 tcp->tcp_fack; 4205 tcp->tcp_sack_snxt = seg_ack; 4206 flags |= TH_NEED_SACK_REXMIT; 4207 } else { 4208 /* 4209 * Always initialize tcp_pipe 4210 * even though we don't have 4211 * any SACK info. If later 4212 * we get SACK info and 4213 * tcp_pipe is not initialized, 4214 * funny things will happen. 4215 */ 4216 tcp->tcp_pipe = 4217 tcp->tcp_cwnd_ssthresh; 4218 } 4219 } else { 4220 flags |= TH_REXMIT_NEEDED; 4221 } /* tcp_snd_sack_ok */ 4222 4223 } else { 4224 cc_ack_received(tcp, seg_ack, 4225 bytes_acked, CC_DUPACK); 4226 /* 4227 * Here we perform congestion 4228 * avoidance, but NOT slow start. 4229 * This is known as the Fast 4230 * Recovery Algorithm. 4231 */ 4232 if (tcp->tcp_snd_sack_ok && 4233 tcp->tcp_notsack_list != NULL) { 4234 flags |= TH_NEED_SACK_REXMIT; 4235 tcp->tcp_pipe -= mss; 4236 if (tcp->tcp_pipe < 0) 4237 tcp->tcp_pipe = 0; 4238 } else { 4239 /* 4240 * We know that one more packet has 4241 * left the pipe thus we can update 4242 * cwnd. 4243 */ 4244 cwnd = tcp->tcp_cwnd + mss; 4245 if (cwnd > tcp->tcp_cwnd_max) 4246 cwnd = tcp->tcp_cwnd_max; 4247 DTRACE_PROBE3(cwnd__fast__recovery, 4248 tcp_t *, tcp, 4249 uint32_t, tcp->tcp_cwnd, 4250 uint32_t, cwnd); 4251 tcp->tcp_cwnd = cwnd; 4252 if (tcp->tcp_unsent > 0) 4253 flags |= TH_XMIT_NEEDED; 4254 } 4255 } 4256 } 4257 } else if (tcp->tcp_zero_win_probe) { 4258 /* 4259 * If the window has opened, need to arrange 4260 * to send additional data. 4261 */ 4262 if (new_swnd != 0) { 4263 /* tcp_suna != tcp_snxt */ 4264 /* Packet contains a window update */ 4265 TCPS_BUMP_MIB(tcps, tcpInWinUpdate); 4266 tcp->tcp_zero_win_probe = 0; 4267 tcp->tcp_timer_backoff = 0; 4268 tcp->tcp_ms_we_have_waited = 0; 4269 4270 /* 4271 * Transmit starting with tcp_suna since 4272 * the one byte probe is not ack'ed. 4273 * If TCP has sent more than one identical 4274 * probe, tcp_rexmit will be set. That means 4275 * tcp_ss_rexmit() will send out the one 4276 * byte along with new data. Otherwise, 4277 * fake the retransmission. 
4278 */ 4279 flags |= TH_XMIT_NEEDED; 4280 if (!tcp->tcp_rexmit) { 4281 tcp->tcp_rexmit = B_TRUE; 4282 tcp->tcp_dupack_cnt = 0; 4283 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 4284 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 4285 } 4286 } 4287 } 4288 goto swnd_update; 4289 } 4290 4291 /* 4292 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 4293 * If the ACK value acks something that we have not yet sent, it might 4294 * be an old duplicate segment. Send an ACK to re-synchronize the 4295 * other side. 4296 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 4297 * state is handled above, so we can always just drop the segment and 4298 * send an ACK here. 4299 * 4300 * In the case where the peer shrinks the window, we see the new window 4301 * update, but all the data sent previously is queued up by the peer. 4302 * To account for this, in tcp_process_shrunk_swnd(), the sequence 4303 * number, which was already sent, and within window, is recorded. 4304 * tcp_snxt is then updated. 4305 * 4306 * If the window has previously shrunk, and an ACK for data not yet 4307 * sent, according to tcp_snxt, is received, it may still be valid. If 4308 * the ACK is for data within the window at the time the window was 4309 * shrunk, then the ACK is acceptable. In this case tcp_snxt is set to 4310 * the sequence number ACK'ed. 4311 * 4312 * If the ACK covers all the data sent at the time the window was 4313 * shrunk, we can now set tcp_is_wnd_shrnk to B_FALSE. 4314 * 4315 * Should we send ACKs in response to ACK only segments? 4316 */ 4317 4318 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 4319 if ((tcp->tcp_is_wnd_shrnk) && 4320 (SEQ_LEQ(seg_ack, tcp->tcp_snxt_shrunk))) { 4321 uint32_t data_acked_ahead_snxt; 4322 4323 data_acked_ahead_snxt = seg_ack - tcp->tcp_snxt; 4324 tcp_update_xmit_tail(tcp, seg_ack); 4325 tcp->tcp_unsent -= data_acked_ahead_snxt; 4326 } else { 4327 TCPS_BUMP_MIB(tcps, tcpInAckUnsent); 4328 /* drop the received segment */ 4329 freemsg(mp); 4330 4331 /* 4332 * Send back an ACK. If tcp_drop_ack_unsent_cnt is 4333 * greater than 0, check if the number of such 4334 * bogus ACKs is greater than that count. If yes, 4335 * don't send back any ACK. This prevents TCP from 4336 * getting into an ACK storm if somehow an attacker 4337 * successfully spoofs an acceptable segment to our 4338 * peer. If this continues (count > 2 X threshold), 4339 * we should abort this connection. 4340 */ 4341 if (tcp_drop_ack_unsent_cnt > 0 && 4342 ++tcp->tcp_in_ack_unsent > 4343 tcp_drop_ack_unsent_cnt) { 4344 TCP_STAT(tcps, tcp_in_ack_unsent_drop); 4345 if (tcp->tcp_in_ack_unsent > 2 * 4346 tcp_drop_ack_unsent_cnt) { 4347 (void) tcp_clean_death(tcp, EPROTO); 4348 } 4349 return; 4350 } 4351 mp = tcp_ack_mp(tcp); 4352 if (mp != NULL) { 4353 TCPS_BUMP_MIB(tcps, tcpHCOutSegs); 4354 TCPS_BUMP_MIB(tcps, tcpOutAck); 4355 tcp_send_data(tcp, mp); 4356 } 4357 return; 4358 } 4359 } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack, 4360 tcp->tcp_snxt_shrunk)) { 4361 tcp->tcp_is_wnd_shrnk = B_FALSE; 4362 } 4363 4364 /* 4365 * TCP gets a new ACK, update the notsack'ed list to delete those 4366 * blocks that are covered by this ACK. 4367 */ 4368 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 4369 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 4370 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 4371 } 4372 4373 /* 4374 * If we got an ACK after fast retransmit, check to see 4375 * if it is a partial ACK.
If it is not and the congestion 4376 * window was inflated to account for the other side's 4377 * cached packets, retract it. If it is, do Hoe's algorithm. 4378 */ 4379 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { 4380 ASSERT(tcp->tcp_rexmit == B_FALSE); 4381 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 4382 tcp->tcp_dupack_cnt = 0; 4383 4384 cc_post_recovery(tcp, seg_ack); 4385 4386 tcp->tcp_rexmit_max = seg_ack; 4387 4388 /* 4389 * Remove all notsack info to avoid confusion with 4390 * the next fast retransmit/recovery phase. 4391 */ 4392 if (tcp->tcp_snd_sack_ok) { 4393 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, 4394 tcp); 4395 } 4396 } else { 4397 if (tcp->tcp_snd_sack_ok && 4398 tcp->tcp_notsack_list != NULL) { 4399 flags |= TH_NEED_SACK_REXMIT; 4400 tcp->tcp_pipe -= mss; 4401 if (tcp->tcp_pipe < 0) 4402 tcp->tcp_pipe = 0; 4403 } else { 4404 /* 4405 * Hoe's algorithm: 4406 * 4407 * Retransmit the unack'ed segment and 4408 * restart fast recovery. Note that we 4409 * need to scale back tcp_cwnd to the 4410 * original value when we started fast 4411 * recovery. This is to prevent overly 4412 * aggressive behaviour in sending new 4413 * segments. 4414 */ 4415 cwnd = tcp->tcp_cwnd_ssthresh + 4416 tcps->tcps_dupack_fast_retransmit * mss; 4417 DTRACE_PROBE3(cwnd__fast__retransmit__part__ack, 4418 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, 4419 uint32_t, cwnd); 4420 tcp->tcp_cwnd = cwnd; 4421 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 4422 flags |= TH_REXMIT_NEEDED; 4423 } 4424 } 4425 } else { 4426 tcp->tcp_dupack_cnt = 0; 4427 if (tcp->tcp_rexmit) { 4428 /* 4429 * TCP is retransmitting. If the ACK acks all 4430 * outstanding data, update tcp_rexmit_max and 4431 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 4432 * to the correct value. 4433 * 4434 * Note that SEQ_LEQ() is used. This is to avoid 4435 * unnecessary fast retransmit caused by dup ACKs 4436 * received when TCP does slow start retransmission 4437 * after a time out. During this phase, TCP may 4438 * send out segments which are already received. 4439 * This causes dup ACKs to be sent back. 4440 */ 4441 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 4442 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 4443 tcp->tcp_rexmit_nxt = seg_ack; 4444 } 4445 if (seg_ack != tcp->tcp_rexmit_max) { 4446 flags |= TH_XMIT_NEEDED; 4447 } 4448 } else { 4449 tcp->tcp_rexmit = B_FALSE; 4450 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 4451 } 4452 tcp->tcp_ms_we_have_waited = 0; 4453 } 4454 } 4455 4456 TCPS_BUMP_MIB(tcps, tcpInAckSegs); 4457 TCPS_UPDATE_MIB(tcps, tcpInAckBytes, bytes_acked); 4458 tcp->tcp_suna = seg_ack; 4459 if (tcp->tcp_zero_win_probe != 0) { 4460 tcp->tcp_zero_win_probe = 0; 4461 tcp->tcp_timer_backoff = 0; 4462 } 4463 4464 /* 4465 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 4466 * Note that it cannot be the SYN being ack'ed. The code flow 4467 * will not reach here. 4468 */ 4469 if (mp1 == NULL) { 4470 goto fin_acked; 4471 } 4472 4473 /* 4474 * Update the congestion window. 4475 * 4476 * If TCP is not ECN capable or TCP is ECN capable but the 4477 * congestion experience bit is not set, increase the tcp_cwnd as 4478 * usual.
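 * How much it grows is up to the congestion control module; for a
 * newreno-style algorithm this is roughly one MSS per RTT in
 * congestion avoidance and one MSS per ACK in slow start.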
4479 */ 4480 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 4481 if (IN_RECOVERY(tcp->tcp_ccv.flags)) { 4482 EXIT_RECOVERY(tcp->tcp_ccv.flags); 4483 } 4484 cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK); 4485 } 4486 4487 /* See if the latest urgent data has been acknowledged */ 4488 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 4489 SEQ_GT(seg_ack, tcp->tcp_urg)) 4490 tcp->tcp_valid_bits &= ~TCP_URG_VALID; 4491 4492 /* 4493 * Update the RTT estimates. Note that we don't use the TCP 4494 * timestamp option to calculate RTT even if one is present. This is 4495 * because the timestamp option's resolution (CPU tick) is 4496 * too coarse to measure modern datacenter networks' microsecond 4497 * latencies. The timestamp field's resolution is limited by its 4498 * 4-byte width (see RFC1323), and since we always store a 4499 * high-resolution nanosecond precision timestamp along with the data, 4500 * there is no point in ever using the timestamp option. 4501 */ 4502 if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 4503 /* 4504 * An ACK sequence we haven't seen before, so get the RTT 4505 * and update the RTO. But first check if the timestamp is 4506 * valid to use. 4507 */ 4508 if ((mp1->b_next != NULL) && 4509 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { 4510 tcp_set_rto(tcp, gethrtime() - 4511 (hrtime_t)(intptr_t)mp1->b_prev); 4512 } else { 4513 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); 4514 } 4515 4516 /* Remember the last sequence to be ACKed */ 4517 tcp->tcp_csuna = seg_ack; 4518 if (tcp->tcp_set_timer == 1) { 4519 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4520 tcp->tcp_set_timer = 0; 4521 } 4522 } else { 4523 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); 4524 } 4525 4526 /* Eat acknowledged bytes off the xmit queue. */ 4527 for (;;) { 4528 mblk_t *mp2; 4529 uchar_t *wptr; 4530 4531 wptr = mp1->b_wptr; 4532 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 4533 bytes_acked -= (int)(wptr - mp1->b_rptr); 4534 if (bytes_acked < 0) { 4535 mp1->b_rptr = wptr + bytes_acked; 4536 /* 4537 * Set a new timestamp if all the bytes timed by the 4538 * old timestamp have been ack'ed. 4539 */ 4540 if (SEQ_GT(seg_ack, 4541 (uint32_t)(uintptr_t)(mp1->b_next))) { 4542 mp1->b_prev = 4543 (mblk_t *)(intptr_t)gethrtime(); 4544 mp1->b_next = NULL; 4545 } 4546 break; 4547 } 4548 mp1->b_next = NULL; 4549 mp1->b_prev = NULL; 4550 mp2 = mp1; 4551 mp1 = mp1->b_cont; 4552 4553 /* 4554 * This notification is required for some zero-copy 4555 * clients to maintain a copy semantic. After the data 4556 * is ack'ed, client is safe to modify or reuse the buffer. 4557 */ 4558 if (tcp->tcp_snd_zcopy_aware && 4559 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 4560 tcp_zcopy_notify(tcp); 4561 freeb(mp2); 4562 if (bytes_acked == 0) { 4563 if (mp1 == NULL) { 4564 /* Everything is ack'ed, clear the tail. */ 4565 tcp->tcp_xmit_tail = NULL; 4566 /* 4567 * Cancel the timer unless we are still 4568 * waiting for an ACK for the FIN packet. 4569 */ 4570 if (tcp->tcp_timer_tid != 0 && 4571 tcp->tcp_snxt == tcp->tcp_suna) { 4572 (void) TCP_TIMER_CANCEL(tcp, 4573 tcp->tcp_timer_tid); 4574 tcp->tcp_timer_tid = 0; 4575 } 4576 goto pre_swnd_update; 4577 } 4578 if (mp2 != tcp->tcp_xmit_tail) 4579 break; 4580 tcp->tcp_xmit_tail = mp1; 4581 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 4582 (uintptr_t)INT_MAX); 4583 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 4584 mp1->b_rptr); 4585 break; 4586 } 4587 if (mp1 == NULL) { 4588 /* 4589 * More was acked but there is nothing more 4590 * outstanding.
This means that the FIN was 4591 * just acked or that we're talking to a clown. 4592 */ 4593 fin_acked: 4594 ASSERT(tcp->tcp_fin_sent); 4595 tcp->tcp_xmit_tail = NULL; 4596 if (tcp->tcp_fin_sent) { 4597 /* FIN was acked - making progress */ 4598 if (!tcp->tcp_fin_acked) 4599 tcp->tcp_ip_forward_progress = B_TRUE; 4600 tcp->tcp_fin_acked = B_TRUE; 4601 if (tcp->tcp_linger_tid != 0 && 4602 TCP_TIMER_CANCEL(tcp, 4603 tcp->tcp_linger_tid) >= 0) { 4604 tcp_stop_lingering(tcp); 4605 freemsg(mp); 4606 mp = NULL; 4607 } 4608 } else { 4609 /* 4610 * We should never get here because 4611 * we have already checked that the 4612 * number of bytes ack'ed should be 4613 * smaller than or equal to what we 4614 * have sent so far (it is the 4615 * acceptability check of the ACK). 4616 * We can only get here if the send 4617 * queue is corrupted. 4618 * 4619 * Terminate the connection and 4620 * panic the system. It is better 4621 * for us to panic than to continue 4622 * and risk further damage. 4623 */ 4624 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 4625 tcp->tcp_rnxt, TH_RST|TH_ACK); 4626 panic("Memory corruption " 4627 "detected for connection %s.", 4628 tcp_display(tcp, NULL, 4629 DISP_ADDR_AND_PORT)); 4630 /*NOTREACHED*/ 4631 } 4632 goto pre_swnd_update; 4633 } 4634 ASSERT(mp2 != tcp->tcp_xmit_tail); 4635 } 4636 if (tcp->tcp_unsent) { 4637 flags |= TH_XMIT_NEEDED; 4638 } 4639 pre_swnd_update: 4640 tcp->tcp_xmit_head = mp1; 4641 swnd_update: 4642 /* 4643 * The following check is different from most other implementations. 4644 * For bi-directional transfer, when segments are dropped, the 4645 * "normal" check will not accept a window update in those 4646 * retransmitted segments. Failing to do that, TCP may send out 4647 * segments which are outside the receiver's window. As TCP accepts 4648 * the ack in those retransmitted segments, if the window update in 4649 * the same segment is not accepted, TCP will incorrectly calculate 4650 * that it can send more segments. This can create a deadlock 4651 * with the receiver if its window becomes zero. 4652 */ 4653 if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 4654 SEQ_LT(tcp->tcp_swl1, seg_seq) || 4655 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 4656 /* 4657 * The criteria for an update are: 4658 * 4659 * 1. the segment acknowledges some data. Or 4660 * 2. the segment is new, i.e. it has a higher seq num. Or 4661 * 3. the segment is not old and the advertised window is 4662 * larger than the previous advertised window. 4663 */ 4664 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 4665 flags |= TH_XMIT_NEEDED; 4666 tcp->tcp_swnd = new_swnd; 4667 if (new_swnd > tcp->tcp_max_swnd) 4668 tcp->tcp_max_swnd = new_swnd; 4669 tcp->tcp_swl1 = seg_seq; 4670 tcp->tcp_swl2 = seg_ack; 4671 } 4672 est: 4673 if (tcp->tcp_state > TCPS_ESTABLISHED) { 4674 4675 switch (tcp->tcp_state) { 4676 case TCPS_FIN_WAIT_1: 4677 if (tcp->tcp_fin_acked) { 4678 tcp->tcp_state = TCPS_FIN_WAIT_2; 4679 DTRACE_TCP6(state__change, void, NULL, 4680 ip_xmit_attr_t *, connp->conn_ixa, 4681 void, NULL, tcp_t *, tcp, void, NULL, 4682 int32_t, TCPS_FIN_WAIT_1); 4683 /* 4684 * We implement the non-standard BSD/SunOS 4685 * FIN_WAIT_2 flushing algorithm. 4686 * If there is no user attached to this 4687 * TCP endpoint, then this TCP struct 4688 * could hang around forever in FIN_WAIT_2 4689 * state if the peer forgets to send us 4690 * a FIN. To prevent this, we wait only 4691 * 2*MSL (a convenient time value) for 4692 * the FIN to arrive. If it doesn't show up, 4693 * we flush the TCP endpoint.
This algorithm, 4694 * though a violation of RFC-793, has worked 4695 * for over 10 years in BSD systems. 4696 * Note: SunOS 4.x waits 675 seconds before 4697 * flushing the FIN_WAIT_2 connection. 4698 */ 4699 TCP_TIMER_RESTART(tcp, 4700 tcp->tcp_fin_wait_2_flush_interval); 4701 } 4702 break; 4703 case TCPS_FIN_WAIT_2: 4704 break; /* Shutdown hook? */ 4705 case TCPS_LAST_ACK: 4706 freemsg(mp); 4707 if (tcp->tcp_fin_acked) { 4708 (void) tcp_clean_death(tcp, 0); 4709 return; 4710 } 4711 goto xmit_check; 4712 case TCPS_CLOSING: 4713 if (tcp->tcp_fin_acked) { 4714 SET_TIME_WAIT(tcps, tcp, connp); 4715 DTRACE_TCP6(state__change, void, NULL, 4716 ip_xmit_attr_t *, connp->conn_ixa, void, 4717 NULL, tcp_t *, tcp, void, NULL, int32_t, 4718 TCPS_CLOSING); 4719 } 4720 /*FALLTHRU*/ 4721 case TCPS_CLOSE_WAIT: 4722 freemsg(mp); 4723 goto xmit_check; 4724 default: 4725 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 4726 break; 4727 } 4728 } 4729 if (flags & TH_FIN) { 4730 /* Make sure we ack the fin */ 4731 flags |= TH_ACK_NEEDED; 4732 if (!tcp->tcp_fin_rcvd) { 4733 tcp->tcp_fin_rcvd = B_TRUE; 4734 tcp->tcp_rnxt++; 4735 tcpha = tcp->tcp_tcpha; 4736 tcpha->tha_ack = htonl(tcp->tcp_rnxt); 4737 4738 /* 4739 * Generate the ordrel_ind at the end unless the 4740 * conn is detached or it is a STREAMS based eager. 4741 * In the eager case we defer the notification until 4742 * tcp_accept_finish has run. 4743 */ 4744 if (!TCP_IS_DETACHED(tcp) && (IPCL_IS_NONSTR(connp) || 4745 (tcp->tcp_listener == NULL && 4746 !tcp->tcp_hard_binding))) 4747 flags |= TH_ORDREL_NEEDED; 4748 switch (tcp->tcp_state) { 4749 case TCPS_SYN_RCVD: 4750 tcp->tcp_state = TCPS_CLOSE_WAIT; 4751 DTRACE_TCP6(state__change, void, NULL, 4752 ip_xmit_attr_t *, connp->conn_ixa, 4753 void, NULL, tcp_t *, tcp, void, NULL, 4754 int32_t, TCPS_SYN_RCVD); 4755 /* Keepalive? */ 4756 break; 4757 case TCPS_ESTABLISHED: 4758 tcp->tcp_state = TCPS_CLOSE_WAIT; 4759 DTRACE_TCP6(state__change, void, NULL, 4760 ip_xmit_attr_t *, connp->conn_ixa, 4761 void, NULL, tcp_t *, tcp, void, NULL, 4762 int32_t, TCPS_ESTABLISHED); 4763 /* Keepalive? */ 4764 break; 4765 case TCPS_FIN_WAIT_1: 4766 if (!tcp->tcp_fin_acked) { 4767 tcp->tcp_state = TCPS_CLOSING; 4768 DTRACE_TCP6(state__change, void, NULL, 4769 ip_xmit_attr_t *, connp->conn_ixa, 4770 void, NULL, tcp_t *, tcp, void, 4771 NULL, int32_t, TCPS_FIN_WAIT_1); 4772 break; 4773 } 4774 /* FALLTHRU */ 4775 case TCPS_FIN_WAIT_2: 4776 SET_TIME_WAIT(tcps, tcp, connp); 4777 DTRACE_TCP6(state__change, void, NULL, 4778 ip_xmit_attr_t *, connp->conn_ixa, void, 4779 NULL, tcp_t *, tcp, void, NULL, int32_t, 4780 TCPS_FIN_WAIT_2); 4781 if (seg_len) { 4782 /* 4783 * implies data piggybacked on FIN. 4784 * break to handle data. 4785 */ 4786 break; 4787 } 4788 freemsg(mp); 4789 goto ack_check; 4790 } 4791 } 4792 } 4793 if (mp == NULL) 4794 goto xmit_check; 4795 if (seg_len == 0) { 4796 freemsg(mp); 4797 goto xmit_check; 4798 } 4799 if (mp->b_rptr == mp->b_wptr) { 4800 /* 4801 * The header has been consumed, so we remove the 4802 * zero-length mblk here. 4803 */ 4804 mp1 = mp; 4805 mp = mp->b_cont; 4806 freeb(mp1); 4807 } 4808 update_ack: 4809 tcpha = tcp->tcp_tcpha; 4810 tcp->tcp_rack_cnt++; 4811 { 4812 uint32_t cur_max; 4813 4814 cur_max = tcp->tcp_rack_cur_max; 4815 if (tcp->tcp_rack_cnt >= cur_max) { 4816 /* 4817 * We have more unacked data than we should - send 4818 * an ACK now. 
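 * (tcp_rack_cur_max is also nudged up by one segment, capped at
 * tcp_rack_abs_max, which is what the arithmetic below implements.)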
4819 */ 4820 flags |= TH_ACK_NEEDED; 4821 cur_max++; 4822 if (cur_max > tcp->tcp_rack_abs_max) 4823 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 4824 else 4825 tcp->tcp_rack_cur_max = cur_max; 4826 } else if (tcp->tcp_quickack) { 4827 /* The executable asked that we ack each packet */ 4828 flags |= TH_ACK_NEEDED; 4829 } else if (TCP_IS_DETACHED(tcp)) { 4830 /* We don't have an ACK timer for detached TCP. */ 4831 flags |= TH_ACK_NEEDED; 4832 } else if (seg_len < mss) { 4833 /* 4834 * If we get a segment that is less than an mss, and we 4835 * already have unacknowledged data, and the amount 4836 * unacknowledged is not a multiple of mss, then we 4837 * better generate an ACK now. Otherwise, this may be 4838 * the tail piece of a transaction, and we would rather 4839 * wait for the response. 4840 */ 4841 uint32_t udif; 4842 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= 4843 (uintptr_t)INT_MAX); 4844 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); 4845 if (udif && (udif % mss)) 4846 flags |= TH_ACK_NEEDED; 4847 else 4848 flags |= TH_ACK_TIMER_NEEDED; 4849 } else { 4850 /* Start delayed ack timer */ 4851 flags |= TH_ACK_TIMER_NEEDED; 4852 } 4853 } 4854 tcp->tcp_rnxt += seg_len; 4855 tcpha->tha_ack = htonl(tcp->tcp_rnxt); 4856 4857 if (mp == NULL) 4858 goto xmit_check; 4859 4860 /* Update SACK list */ 4861 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4862 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 4863 &(tcp->tcp_num_sack_blk)); 4864 } 4865 4866 if (tcp->tcp_urp_mp) { 4867 tcp->tcp_urp_mp->b_cont = mp; 4868 mp = tcp->tcp_urp_mp; 4869 tcp->tcp_urp_mp = NULL; 4870 /* Ready for a new signal. */ 4871 tcp->tcp_urp_last_valid = B_FALSE; 4872 #ifdef DEBUG 4873 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 4874 "tcp_rput: sending exdata_ind %s", 4875 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4876 #endif /* DEBUG */ 4877 } 4878 4879 /* 4880 * Check for ancillary data changes compared to last segment. 4881 */ 4882 if (connp->conn_recv_ancillary.crb_all != 0) { 4883 mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); 4884 if (mp == NULL) 4885 return; 4886 } 4887 4888 if (IPCL_IS_NONSTR(connp)) { 4889 /* 4890 * Non-STREAMS socket 4891 */ 4892 boolean_t push = flags & (TH_PUSH|TH_FIN); 4893 int error; 4894 4895 if ((*sockupcalls->su_recv)(connp->conn_upper_handle, 4896 mp, seg_len, 0, &error, &push) <= 0) { 4897 /* 4898 * We should never be in middle of a 4899 * fallback, the squeue guarantees that. 4900 */ 4901 ASSERT(error != EOPNOTSUPP); 4902 if (error == ENOSPC) 4903 tcp->tcp_rwnd -= seg_len; 4904 } else if (push) { 4905 /* PUSH bit set and sockfs is not flow controlled */ 4906 flags |= tcp_rwnd_reopen(tcp); 4907 } 4908 } else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { 4909 /* 4910 * Side queue inbound data until the accept happens. 4911 * tcp_accept/tcp_rput drains this when the accept happens. 4912 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or 4913 * T_EXDATA_IND) it is queued on b_next. 4914 * XXX Make urgent data use this. 
Requires: 4915 * Removing tcp_listener check for TH_URG 4916 * Making M_PCPROTO and MARK messages skip the eager case 4917 */ 4918 4919 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); 4920 } else { 4921 /* Active STREAMS socket */ 4922 if (mp->b_datap->db_type != M_DATA || 4923 (flags & TH_MARKNEXT_NEEDED)) { 4924 if (tcp->tcp_rcv_list != NULL) { 4925 flags |= tcp_rcv_drain(tcp); 4926 } 4927 ASSERT(tcp->tcp_rcv_list == NULL || 4928 tcp->tcp_fused_sigurg); 4929 4930 if (flags & TH_MARKNEXT_NEEDED) { 4931 #ifdef DEBUG 4932 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 4933 "tcp_rput: sending MSGMARKNEXT %s", 4934 tcp_display(tcp, NULL, 4935 DISP_PORT_ONLY)); 4936 #endif /* DEBUG */ 4937 mp->b_flag |= MSGMARKNEXT; 4938 flags &= ~TH_MARKNEXT_NEEDED; 4939 } 4940 4941 if (is_system_labeled()) 4942 tcp_setcred_data(mp, ira); 4943 4944 putnext(connp->conn_rq, mp); 4945 if (!canputnext(connp->conn_rq)) 4946 tcp->tcp_rwnd -= seg_len; 4947 } else if ((flags & (TH_PUSH|TH_FIN)) || 4948 tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { 4949 if (tcp->tcp_rcv_list != NULL) { 4950 /* 4951 * Enqueue the new segment first and then 4952 * call tcp_rcv_drain() to send all data 4953 * up. The other way to do this is to 4954 * send all queued data up and then call 4955 * putnext() to send the new segment up. 4956 * This way can remove the else part later 4957 * on. 4958 * 4959 * We don't do this to avoid one more call to 4960 * canputnext() as tcp_rcv_drain() needs to 4961 * call canputnext(). 4962 */ 4963 tcp_rcv_enqueue(tcp, mp, seg_len, 4964 ira->ira_cred); 4965 flags |= tcp_rcv_drain(tcp); 4966 } else { 4967 if (is_system_labeled()) 4968 tcp_setcred_data(mp, ira); 4969 4970 putnext(connp->conn_rq, mp); 4971 if (!canputnext(connp->conn_rq)) 4972 tcp->tcp_rwnd -= seg_len; 4973 } 4974 } else { 4975 /* 4976 * Enqueue all packets when processing an mblk 4977 * from the co queue and also enqueue normal packets. 4978 */ 4979 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); 4980 } 4981 /* 4982 * Make sure the timer is running if we have data waiting 4983 * for a push bit. This provides resiliency against 4984 * implementations that do not correctly generate push bits. 4985 */ 4986 if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { 4987 /* 4988 * The connection may be closed at this point, so don't 4989 * do anything for a detached tcp. 4990 */ 4991 if (!TCP_IS_DETACHED(tcp)) 4992 tcp->tcp_push_tid = TCP_TIMER(tcp, 4993 tcp_push_timer, 4994 tcps->tcps_push_timer_interval); 4995 } 4996 } 4997 4998 xmit_check: 4999 /* Is there anything left to do? */ 5000 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 5001 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 5002 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| 5003 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 5004 goto done; 5005 5006 /* Any transmit work to do and a non-zero window? 
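 * (With a zero send window we skip all transmission here and fall
 * through to the ACK/ordrel handling; probing of a zero window is
 * left to the timer path rather than to this routine.)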
*/ 5007 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 5008 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 5009 if (flags & TH_REXMIT_NEEDED) { 5010 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 5011 5012 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans); 5013 if (snd_size > mss) 5014 snd_size = mss; 5015 if (snd_size > tcp->tcp_swnd) 5016 snd_size = tcp->tcp_swnd; 5017 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 5018 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 5019 B_TRUE); 5020 5021 if (mp1 != NULL) { 5022 tcp->tcp_xmit_head->b_prev = 5023 (mblk_t *)(intptr_t)gethrtime(); 5024 tcp->tcp_csuna = tcp->tcp_snxt; 5025 TCPS_BUMP_MIB(tcps, tcpRetransSegs); 5026 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, 5027 snd_size); 5028 tcp->tcp_cs.tcp_out_retrans_segs++; 5029 tcp->tcp_cs.tcp_out_retrans_bytes += snd_size; 5030 tcp_send_data(tcp, mp1); 5031 } 5032 } 5033 if (flags & TH_NEED_SACK_REXMIT) { 5034 tcp_sack_rexmit(tcp, &flags); 5035 } 5036 /* 5037 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 5038 * out new segment. Note that tcp_rexmit should not be 5039 * set, otherwise TH_LIMIT_XMIT should not be set. 5040 */ 5041 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 5042 if (!tcp->tcp_rexmit) { 5043 tcp_wput_data(tcp, NULL, B_FALSE); 5044 } else { 5045 tcp_ss_rexmit(tcp); 5046 } 5047 } 5048 /* 5049 * Adjust tcp_cwnd back to normal value after sending 5050 * new data segments. 5051 */ 5052 if (flags & TH_LIMIT_XMIT) { 5053 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 5054 /* 5055 * This will restart the timer. Restarting the 5056 * timer is used to avoid a timeout before the 5057 * limited transmitted segment's ACK gets back. 5058 */ 5059 if (tcp->tcp_xmit_head != NULL) { 5060 tcp->tcp_xmit_head->b_prev = 5061 (mblk_t *)(intptr_t)gethrtime(); 5062 } 5063 } 5064 5065 /* Anything more to do? */ 5066 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| 5067 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 5068 goto done; 5069 } 5070 ack_check: 5071 if (flags & TH_SEND_URP_MARK) { 5072 ASSERT(tcp->tcp_urp_mark_mp); 5073 ASSERT(!IPCL_IS_NONSTR(connp)); 5074 /* 5075 * Send up any queued data and then send the mark message 5076 */ 5077 if (tcp->tcp_rcv_list != NULL) { 5078 flags |= tcp_rcv_drain(tcp); 5079 5080 } 5081 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 5082 mp1 = tcp->tcp_urp_mark_mp; 5083 tcp->tcp_urp_mark_mp = NULL; 5084 if (is_system_labeled()) 5085 tcp_setcred_data(mp1, ira); 5086 5087 putnext(connp->conn_rq, mp1); 5088 #ifdef DEBUG 5089 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 5090 "tcp_rput: sending zero-length %s %s", 5091 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : 5092 "MSGNOTMARKNEXT"), 5093 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 5094 #endif /* DEBUG */ 5095 flags &= ~TH_SEND_URP_MARK; 5096 } 5097 if (flags & TH_ACK_NEEDED) { 5098 /* 5099 * Time to send an ack for some reason. 5100 */ 5101 mp1 = tcp_ack_mp(tcp); 5102 5103 if (mp1 != NULL) { 5104 tcp_send_data(tcp, mp1); 5105 TCPS_BUMP_MIB(tcps, tcpHCOutSegs); 5106 TCPS_BUMP_MIB(tcps, tcpOutAck); 5107 } 5108 if (tcp->tcp_ack_tid != 0) { 5109 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 5110 tcp->tcp_ack_tid = 0; 5111 } 5112 } 5113 if (flags & TH_ACK_TIMER_NEEDED) { 5114 /* 5115 * Arrange for deferred ACK or push wait timeout. 5116 * Start timer if it is not already running. 5117 */ 5118 if (tcp->tcp_ack_tid == 0) { 5119 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, 5120 tcp->tcp_localnet ? 
5121 tcps->tcps_local_dack_interval : 5122 tcps->tcps_deferred_ack_interval); 5123 } 5124 } 5125 if (flags & TH_ORDREL_NEEDED) { 5126 /* 5127 * Notify upper layer about an orderly release. If this is 5128 * a non-STREAMS socket, then just make an upcall. For STREAMS 5129 * we send up an ordrel_ind, unless this is an eager, in which 5130 * case the ordrel will be sent when tcp_accept_finish runs. 5131 * Note that for non-STREAMS we make an upcall even if it is an 5132 * eager, because we have an upper handle to send it to. 5133 */ 5134 ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL); 5135 ASSERT(!tcp->tcp_detached); 5136 5137 if (IPCL_IS_NONSTR(connp)) { 5138 ASSERT(tcp->tcp_ordrel_mp == NULL); 5139 tcp->tcp_ordrel_done = B_TRUE; 5140 (*sockupcalls->su_opctl)(connp->conn_upper_handle, 5141 SOCK_OPCTL_SHUT_RECV, 0); 5142 goto done; 5143 } 5144 5145 if (tcp->tcp_rcv_list != NULL) { 5146 /* 5147 * Push any mblk(s) enqueued from co processing. 5148 */ 5149 flags |= tcp_rcv_drain(tcp); 5150 } 5151 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 5152 5153 mp1 = tcp->tcp_ordrel_mp; 5154 tcp->tcp_ordrel_mp = NULL; 5155 tcp->tcp_ordrel_done = B_TRUE; 5156 putnext(connp->conn_rq, mp1); 5157 } 5158 done: 5159 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 5160 } 5161 5162 /* 5163 * Attach ancillary data to a received TCP segments for the 5164 * ancillary pieces requested by the application that are 5165 * different than they were in the previous data segment. 5166 * 5167 * Save the "current" values once memory allocation is ok so that 5168 * when memory allocation fails we can just wait for the next data segment. 5169 */ 5170 static mblk_t * 5171 tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, 5172 ip_recv_attr_t *ira) 5173 { 5174 struct T_optdata_ind *todi; 5175 int optlen; 5176 uchar_t *optptr; 5177 struct T_opthdr *toh; 5178 crb_t addflag; /* Which pieces to add */ 5179 mblk_t *mp1; 5180 conn_t *connp = tcp->tcp_connp; 5181 5182 optlen = 0; 5183 addflag.crb_all = 0; 5184 5185 /* If app asked for TOS and it has changed ... */ 5186 if (connp->conn_recv_ancillary.crb_recvtos && 5187 ipp->ipp_type_of_service != tcp->tcp_recvtos && 5188 (ira->ira_flags & IRAF_IS_IPV4)) { 5189 optlen += sizeof (struct T_opthdr) + 5190 P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); 5191 addflag.crb_recvtos = 1; 5192 } 5193 /* If app asked for pktinfo and the index has changed ... */ 5194 if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && 5195 ira->ira_ruifindex != tcp->tcp_recvifindex) { 5196 optlen += sizeof (struct T_opthdr) + 5197 sizeof (struct in6_pktinfo); 5198 addflag.crb_ip_recvpktinfo = 1; 5199 } 5200 /* If app asked for hoplimit and it has changed ... */ 5201 if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && 5202 ipp->ipp_hoplimit != tcp->tcp_recvhops) { 5203 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 5204 addflag.crb_ipv6_recvhoplimit = 1; 5205 } 5206 /* If app asked for tclass and it has changed ... */ 5207 if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && 5208 ipp->ipp_tclass != tcp->tcp_recvtclass) { 5209 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 5210 addflag.crb_ipv6_recvtclass = 1; 5211 } 5212 5213 /* 5214 * If app asked for hop-by-hop headers and it has changed ... 5215 * For security labels, note that (1) security labels can't change on 5216 * a connected socket at all, (2) we're connected to at most one peer, 5217 * (3) if anything changes, then it must be some other extra option. 
5218 */ 5219 if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && 5220 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, 5221 (ipp->ipp_fields & IPPF_HOPOPTS), 5222 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { 5223 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; 5224 addflag.crb_ipv6_recvhopopts = 1; 5225 if (!ip_allocbuf((void **)&tcp->tcp_hopopts, 5226 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), 5227 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) 5228 return (mp); 5229 } 5230 /* If app asked for dst headers before routing headers ... */ 5231 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && 5232 ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, 5233 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 5234 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { 5235 optlen += sizeof (struct T_opthdr) + 5236 ipp->ipp_rthdrdstoptslen; 5237 addflag.crb_ipv6_recvrthdrdstopts = 1; 5238 if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, 5239 &tcp->tcp_rthdrdstoptslen, 5240 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 5241 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) 5242 return (mp); 5243 } 5244 /* If app asked for routing headers and it has changed ... */ 5245 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && 5246 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, 5247 (ipp->ipp_fields & IPPF_RTHDR), 5248 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { 5249 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; 5250 addflag.crb_ipv6_recvrthdr = 1; 5251 if (!ip_allocbuf((void **)&tcp->tcp_rthdr, 5252 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), 5253 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) 5254 return (mp); 5255 } 5256 /* If app asked for dest headers and it has changed ... */ 5257 if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || 5258 connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && 5259 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, 5260 (ipp->ipp_fields & IPPF_DSTOPTS), 5261 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { 5262 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; 5263 addflag.crb_ipv6_recvdstopts = 1; 5264 if (!ip_allocbuf((void **)&tcp->tcp_dstopts, 5265 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), 5266 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) 5267 return (mp); 5268 } 5269 5270 if (optlen == 0) { 5271 /* Nothing to add */ 5272 return (mp); 5273 } 5274 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); 5275 if (mp1 == NULL) { 5276 /* 5277 * Defer sending ancillary data until the next TCP segment 5278 * arrives. 5279 */ 5280 return (mp); 5281 } 5282 mp1->b_cont = mp; 5283 mp = mp1; 5284 mp->b_wptr += sizeof (*todi) + optlen; 5285 mp->b_datap->db_type = M_PROTO; 5286 todi = (struct T_optdata_ind *)mp->b_rptr; 5287 todi->PRIM_type = T_OPTDATA_IND; 5288 todi->DATA_flag = 1; /* MORE data */ 5289 todi->OPT_length = optlen; 5290 todi->OPT_offset = sizeof (*todi); 5291 optptr = (uchar_t *)&todi[1]; 5292 5293 /* If app asked for TOS and it has changed ... */ 5294 if (addflag.crb_recvtos) { 5295 toh = (struct T_opthdr *)optptr; 5296 toh->level = IPPROTO_IP; 5297 toh->name = IP_RECVTOS; 5298 toh->len = sizeof (*toh) + 5299 P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); 5300 toh->status = 0; 5301 optptr += sizeof (*toh); 5302 *(uint8_t *)optptr = ipp->ipp_type_of_service; 5303 optptr = (uchar_t *)toh + toh->len; 5304 ASSERT(__TPI_TOPT_ISALIGNED(optptr)); 5305 /* Save as "last" value */ 5306 tcp->tcp_recvtos = ipp->ipp_type_of_service; 5307 } 5308 5309 /* 5310 * If app asked for pktinfo and the index has changed ... 
5311 * Note that the local address never changes for the connection. 5312 */ 5313 if (addflag.crb_ip_recvpktinfo) { 5314 struct in6_pktinfo *pkti; 5315 uint_t ifindex; 5316 5317 ifindex = ira->ira_ruifindex; 5318 toh = (struct T_opthdr *)optptr; 5319 toh->level = IPPROTO_IPV6; 5320 toh->name = IPV6_PKTINFO; 5321 toh->len = sizeof (*toh) + sizeof (*pkti); 5322 toh->status = 0; 5323 optptr += sizeof (*toh); 5324 pkti = (struct in6_pktinfo *)optptr; 5325 pkti->ipi6_addr = connp->conn_laddr_v6; 5326 pkti->ipi6_ifindex = ifindex; 5327 optptr += sizeof (*pkti); 5328 ASSERT(OK_32PTR(optptr)); 5329 /* Save as "last" value */ 5330 tcp->tcp_recvifindex = ifindex; 5331 } 5332 /* If app asked for hoplimit and it has changed ... */ 5333 if (addflag.crb_ipv6_recvhoplimit) { 5334 toh = (struct T_opthdr *)optptr; 5335 toh->level = IPPROTO_IPV6; 5336 toh->name = IPV6_HOPLIMIT; 5337 toh->len = sizeof (*toh) + sizeof (uint_t); 5338 toh->status = 0; 5339 optptr += sizeof (*toh); 5340 *(uint_t *)optptr = ipp->ipp_hoplimit; 5341 optptr += sizeof (uint_t); 5342 ASSERT(OK_32PTR(optptr)); 5343 /* Save as "last" value */ 5344 tcp->tcp_recvhops = ipp->ipp_hoplimit; 5345 } 5346 /* If app asked for tclass and it has changed ... */ 5347 if (addflag.crb_ipv6_recvtclass) { 5348 toh = (struct T_opthdr *)optptr; 5349 toh->level = IPPROTO_IPV6; 5350 toh->name = IPV6_TCLASS; 5351 toh->len = sizeof (*toh) + sizeof (uint_t); 5352 toh->status = 0; 5353 optptr += sizeof (*toh); 5354 *(uint_t *)optptr = ipp->ipp_tclass; 5355 optptr += sizeof (uint_t); 5356 ASSERT(OK_32PTR(optptr)); 5357 /* Save as "last" value */ 5358 tcp->tcp_recvtclass = ipp->ipp_tclass; 5359 } 5360 if (addflag.crb_ipv6_recvhopopts) { 5361 toh = (struct T_opthdr *)optptr; 5362 toh->level = IPPROTO_IPV6; 5363 toh->name = IPV6_HOPOPTS; 5364 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; 5365 toh->status = 0; 5366 optptr += sizeof (*toh); 5367 bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); 5368 optptr += ipp->ipp_hopoptslen; 5369 ASSERT(OK_32PTR(optptr)); 5370 /* Save as last value */ 5371 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, 5372 (ipp->ipp_fields & IPPF_HOPOPTS), 5373 ipp->ipp_hopopts, ipp->ipp_hopoptslen); 5374 } 5375 if (addflag.crb_ipv6_recvrthdrdstopts) { 5376 toh = (struct T_opthdr *)optptr; 5377 toh->level = IPPROTO_IPV6; 5378 toh->name = IPV6_RTHDRDSTOPTS; 5379 toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; 5380 toh->status = 0; 5381 optptr += sizeof (*toh); 5382 bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); 5383 optptr += ipp->ipp_rthdrdstoptslen; 5384 ASSERT(OK_32PTR(optptr)); 5385 /* Save as last value */ 5386 ip_savebuf((void **)&tcp->tcp_rthdrdstopts, 5387 &tcp->tcp_rthdrdstoptslen, 5388 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 5389 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); 5390 } 5391 if (addflag.crb_ipv6_recvrthdr) { 5392 toh = (struct T_opthdr *)optptr; 5393 toh->level = IPPROTO_IPV6; 5394 toh->name = IPV6_RTHDR; 5395 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; 5396 toh->status = 0; 5397 optptr += sizeof (*toh); 5398 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); 5399 optptr += ipp->ipp_rthdrlen; 5400 ASSERT(OK_32PTR(optptr)); 5401 /* Save as last value */ 5402 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, 5403 (ipp->ipp_fields & IPPF_RTHDR), 5404 ipp->ipp_rthdr, ipp->ipp_rthdrlen); 5405 } 5406 if (addflag.crb_ipv6_recvdstopts) { 5407 toh = (struct T_opthdr *)optptr; 5408 toh->level = IPPROTO_IPV6; 5409 toh->name = IPV6_DSTOPTS; 5410 toh->len = sizeof (*toh) + 
/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
#define TCP_SD_MIN  400000000

/*
 * Set RTO for this connection based on a new round-trip time measurement.
 * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
 * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
 * of that paper.
 *
 * m = new measurement
 * sa = smoothed RTT average (8 * average estimates).
 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
 */
static void
tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
{
    hrtime_t m = rtt;
    hrtime_t sa = tcp->tcp_rtt_sa;
    hrtime_t sv = tcp->tcp_rtt_sd;
    tcp_stack_t *tcps = tcp->tcp_tcps;

    TCPS_BUMP_MIB(tcps, tcpRttUpdate);
    tcp->tcp_rtt_update++;
    tcp->tcp_rtt_sum += m;
    tcp->tcp_rtt_cnt++;

    /*
     * A non-zero tcp_rtt_sa means we already have an estimate to update;
     * zero means the estimators are being (re)initialized below.
     */
    if (sa != 0) {
        /*
         * Update average estimator (see section 2.3 of RFC6298):
         *      SRTT = 7/8 SRTT + 1/8 rtt
         *
         * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
         *      tcp_rtt_sa = 7 * SRTT + rtt
         *      tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
         *      tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
         *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
         *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
         *      tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
         *
         * (rtt - tcp_rtt_sa / 8) is simply the difference
         * between the new rtt measurement and the existing smoothed
         * RTT average. This is referred to as "Error" in subsequent
         * calculations.
         */

        /* m is now Error. */
        m -= sa >> 3;
        if ((sa += m) <= 0) {
            /*
             * Don't allow the smoothed average to be negative.
             * We use 0 to denote reinitialization of the
             * variables.
             */
            sa = 1;
        }

        /*
         * Update deviation estimator:
         *      mdev = 3/4 mdev + 1/4 abs(Error)
         *
         * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
         *      tcp_rtt_sd = 3 * mdev + abs(Error)
         *      tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
         *      tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
         *      tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
         */
        if (m < 0)
            m = -m;
        m -= sv >> 2;
        sv += m;
    } else {
        /*
         * This follows BSD's implementation. So the reinitialized
         * RTO is 3 * m. We cannot go less than 2 because if the
         * link is bandwidth dominated, doubling the window size
         * during slow start means doubling the RTT. We want to be
         * more conservative when we reinitialize our estimates. 3
         * is just a convenient number.
         */
        sa = m << 3;
        sv = m << 1;
    }
    if (sv < TCP_SD_MIN) {
        /*
         * Since a receiver doesn't delay its ACKs during a long run of
         * segments, sa may not have captured the effect of delayed ACK
         * timeouts on the RTT. To make sure we always account for the
         * possible delay (and avoid the unnecessary retransmission),
         * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
         * 200ms on older SunOS/BSD systems and modern Windows systems
         * (as of 2019). This means that the minimum possible mean
         * deviation is 100 ms.
         */
        sv = TCP_SD_MIN;
    }
    tcp->tcp_rtt_sa = sa;
    tcp->tcp_rtt_sd = sv;

    tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);

    /* Now, we can reset tcp_timer_backoff to use the new RTO... */
    tcp->tcp_timer_backoff = 0;
}
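
/*
 * EXAMPLE (illustrative only, guarded out of the build): a minimal
 * user-space model of the scaled estimators above, assuming the same
 * fixed-point conventions (sa holds 8 * SRTT, sd holds 4 * mdev, both in
 * nanoseconds). The real path finishes in tcp_calculate_rto(), which also
 * applies the stack's minimum/maximum RTO clamps and the TCP_SD_MIN floor;
 * those are omitted here. The names update_rtt and compute_rto are
 * invented for this sketch.
 */
#if 0
#include <stdio.h>
#include <inttypes.h>

/* Scaled estimators: rtt_sa is 8 * SRTT, rtt_sd is 4 * mdev (nanoseconds). */
static int64_t rtt_sa;
static int64_t rtt_sd;

static void
update_rtt(int64_t m)
{
    if (rtt_sa != 0) {
        m -= rtt_sa >> 3;               /* m becomes Error */
        if ((rtt_sa += m) <= 0)
            rtt_sa = 1;
        if (m < 0)
            m = -m;
        m -= rtt_sd >> 2;
        rtt_sd += m;
    } else {
        /* First sample: SRTT = m, mdev = m / 2, so the initial RTO is 3m. */
        rtt_sa = m << 3;
        rtt_sd = m << 1;
    }
}

/* RFC 6298 style RTO = SRTT + 4 * RTTVAR = sa/8 + sd (both already scaled). */
static int64_t
compute_rto(void)
{
    return ((rtt_sa >> 3) + rtt_sd);
}

int
main(void)
{
    /* Three measurements: 100 ms, 120 ms, 80 ms. First one yields 300 ms. */
    int64_t samples[3] = { 100000000LL, 120000000LL, 80000000LL };
    unsigned int i;

    for (i = 0; i < 3; i++) {
        update_rtt(samples[i]);
        (void) printf("rto = %" PRId64 " ns\n", compute_rto());
    }
    return (0);
}
#endif
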
/*
 * On a labeled system we have some protocols above TCP, such as RPC, which
 * appear to assume that every mblk in a chain has a db_credp.
 */
static void
tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
{
    ASSERT(is_system_labeled());
    ASSERT(ira->ira_cred != NULL);

    while (mp != NULL) {
        mblk_setcred(mp, ira->ira_cred, NOPID);
        mp = mp->b_cont;
    }
}

uint_t
tcp_rwnd_reopen(tcp_t *tcp)
{
    uint_t ret = 0;
    uint_t thwin;
    conn_t *connp = tcp->tcp_connp;

    /* Learn the latest rwnd information that we sent to the other side. */
    thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win))
        << tcp->tcp_rcv_ws;
    /* This is peer's calculated send window (our receive window). */
    thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
    /*
     * Increase the receive window to max. But we need to do receiver
     * SWS avoidance. This means that we need to check that the increase
     * of the receive window is at least 1 MSS.
     */
    if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) {
        /*
         * If the window that the other side knows is less than max
         * deferred acks segments, send an update immediately.
         */
        if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
            TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutWinUpdate);
            ret = TH_ACK_NEEDED;
        }
        tcp->tcp_rwnd = connp->conn_rcvbuf;
    }
    return (ret);
}
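
/*
 * EXAMPLE (illustrative only, guarded out of the build): the arithmetic
 * behind the SWS check above, as a standalone helper with invented names.
 * The window the peer still believes it has is the last advertised value
 * (after scaling) minus the data received since the last window we told it
 * about; re-opening is worth advertising only when it grows that window by
 * at least one MSS. The kernel additionally forces the immediate update
 * only when the known window has fallen below the deferred-ACK threshold,
 * which this sketch does not model.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

/*
 *   adv_win    - window field from the last segment we sent (pre-scaling)
 *   rcv_ws     - our receive window scale
 *   rnxt, rack - next expected sequence number and last window-update point
 *   rcvbuf     - full receive buffer we would like to re-advertise
 *   mss        - maximum segment size
 */
static bool
worth_reopening_rwnd(uint32_t adv_win, uint32_t rcv_ws, uint32_t rnxt,
    uint32_t rack, uint32_t rcvbuf, uint32_t mss)
{
    uint32_t known = (adv_win << rcv_ws) - (rnxt - rack);

    /* Receiver SWS avoidance: only re-open if the peer gains a full MSS. */
    return (rcvbuf - known >= mss);
}
#endif
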
/*
 * Handle a packet that has been reclassified by TCP.
 * This function drops the ref on connp that the caller had.
 */
void
tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
    ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;

    if (connp->conn_incoming_ifindex != 0 &&
        connp->conn_incoming_ifindex != ira->ira_ruifindex) {
        freemsg(mp);
        CONN_DEC_REF(connp);
        return;
    }
    if (connp->conn_min_ttl != 0 && connp->conn_min_ttl > ira->ira_ttl) {
        BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
        ip_drop_input("ipIfStatsInDiscards", mp, NULL);
        freemsg(mp);
        CONN_DEC_REF(connp);
        return;
    }
    if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
        (ira->ira_flags & IRAF_IPSEC_SECURE)) {
        ip6_t *ip6h;
        ipha_t *ipha;

        if (ira->ira_flags & IRAF_IS_IPV4) {
            ipha = (ipha_t *)mp->b_rptr;
            ip6h = NULL;
        } else {
            ipha = NULL;
            ip6h = (ip6_t *)mp->b_rptr;
        }
        mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira);
        if (mp == NULL) {
            BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
            /* Note that mp is NULL */
            ip_drop_input("ipIfStatsInDiscards", mp, NULL);
            CONN_DEC_REF(connp);
            return;
        }
    }

    if (IPCL_IS_TCP(connp)) {
        /*
         * Do not drain; certain use cases can blow the stack.
         */
        SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
            connp->conn_recv, connp, ira,
            SQ_NODRAIN, SQTAG_IP_TCP_INPUT);
    } else {
        /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
        (connp->conn_recv)(connp, mp, NULL, ira);
        CONN_DEC_REF(connp);
    }
}

/* ARGSUSED */
static void
tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
    conn_t *connp = (conn_t *)arg;
    tcp_t *tcp = connp->conn_tcp;
    queue_t *q = connp->conn_rq;

    ASSERT(!IPCL_IS_NONSTR(connp));
    mutex_enter(&tcp->tcp_rsrv_mp_lock);
    tcp->tcp_rsrv_mp = mp;
    mutex_exit(&tcp->tcp_rsrv_mp_lock);

    if (TCP_IS_DETACHED(tcp) || q == NULL) {
        return;
    }

    if (tcp->tcp_fused) {
        tcp_fuse_backenable(tcp);
        return;
    }

    if (canputnext(q)) {
        /* Not flow-controlled, open rwnd */
        tcp->tcp_rwnd = connp->conn_rcvbuf;

        /*
         * Send back a window update immediately if TCP is above
         * ESTABLISHED state and the increase of the rcv window
         * that the other side knows is at least 1 MSS after flow
         * control is lifted.
         */
        if (tcp->tcp_state >= TCPS_ESTABLISHED &&
            tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
            tcp_xmit_ctl(NULL, tcp,
                (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
                tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
        }
    }
}

/*
 * The read side service routine is called mostly when we get back-enabled as a
 * result of flow control relief. Since we don't actually queue anything in
 * TCP, we have no data to send out of here. What we do is re-open the receive
 * window, and send out a window update.
 */
int
tcp_rsrv(queue_t *q)
{
    conn_t *connp = Q_TO_CONN(q);
    tcp_t *tcp = connp->conn_tcp;
    mblk_t *mp;

    /* No code does a putq on the read side */
    ASSERT(q->q_first == NULL);

    /*
     * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already
     * been run. So just return.
     */
    mutex_enter(&tcp->tcp_rsrv_mp_lock);
    if ((mp = tcp->tcp_rsrv_mp) == NULL) {
        mutex_exit(&tcp->tcp_rsrv_mp_lock);
        return (0);
    }
    tcp->tcp_rsrv_mp = NULL;
    mutex_exit(&tcp->tcp_rsrv_mp_lock);

    CONN_INC_REF(connp);
    SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp,
        NULL, SQ_PROCESS, SQTAG_TCP_RSRV);
    return (0);
}

/* At minimum we need 8 bytes in the TCP header for the lookup */
#define ICMP_MIN_TCP_HDR    8
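
/*
 * EXAMPLE (illustrative only, guarded out of the build): tcp_icmp_input()
 * below digs the offending TCP segment out of the ICMP error payload:
 * outer IP header, 8-byte ICMP header, the quoted packet's IP header, then
 * at least ICMP_MIN_TCP_HDR bytes of its TCP header (enough for the ports
 * and the sequence number). This user-level sketch walks the same layout
 * over a flat buffer, using simplified structs invented here rather than
 * the kernel's icmph_t/ipha_t/tcpha_t.
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Simplified fixed-size views; real headers may carry IP options. */
struct ip_hdr   { uint8_t vhl, tos; uint16_t len, id, off;
                  uint8_t ttl, proto; uint16_t cksum; uint32_t src, dst; };
struct icmp_hdr { uint8_t type, code; uint16_t cksum; uint32_t unused; };
struct tcp_hdr8 { uint16_t sport, dport; uint32_t seq; };

#define MIN_TCP_BYTES   8   /* mirrors ICMP_MIN_TCP_HDR */

/*
 * Extract the inner TCP sequence number from an ICMPv4 error message held
 * in buf (starting at the outer IP header). Returns 0 on success.
 */
static int
inner_tcp_seq(const uint8_t *buf, size_t len, uint32_t *seqp)
{
    size_t outer_iplen, inner_iplen, off;
    const struct ip_hdr *outer, *inner;
    struct tcp_hdr8 tcph;

    if (len < sizeof (struct ip_hdr))
        return (-1);
    outer = (const struct ip_hdr *)buf;
    outer_iplen = (outer->vhl & 0x0f) * 4;

    /* Skip outer IP header and the 8-byte ICMP header. */
    off = outer_iplen + sizeof (struct icmp_hdr);
    if (off + sizeof (struct ip_hdr) > len)
        return (-1);
    inner = (const struct ip_hdr *)(buf + off);
    inner_iplen = (inner->vhl & 0x0f) * 4;
    if (inner->proto != IPPROTO_TCP)
        return (-1);

    /* Need at least ports plus sequence number of the quoted segment. */
    off += inner_iplen;
    if (off + MIN_TCP_BYTES > len)
        return (-1);
    (void) memcpy(&tcph, buf + off, sizeof (tcph));
    *seqp = ntohl(tcph.seq);
    return (0);
}
#endif
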
/*
 * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
 * passed up by IP. The message is always received on the correct tcp_t.
 * Assumes that IP has pulled up everything up to and including the ICMP
 * header.
 */
/* ARGSUSED2 */
void
tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
    conn_t *connp = (conn_t *)arg1;
    icmph_t *icmph;
    ipha_t *ipha;
    int iph_hdr_length;
    tcpha_t *tcpha;
    uint32_t seg_seq;
    tcp_t *tcp = connp->conn_tcp;

    /* Assume IP provides aligned packets */
    ASSERT(OK_32PTR(mp->b_rptr));
    ASSERT((MBLKL(mp) >= sizeof (ipha_t)));

    /*
     * It's possible we have a closed, but not yet destroyed, TCP
     * connection. Several fields (e.g. conn_ixa->ixa_ire) are invalid
     * in the closed state, so don't take any chances and drop the packet.
     */
    if (tcp->tcp_state == TCPS_CLOSED) {
        freemsg(mp);
        return;
    }

    /*
     * Verify IP version. Anything other than an IPv4 or IPv6 packet is
     * sent upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
     */
    if (!(ira->ira_flags & IRAF_IS_IPV4)) {
        tcp_icmp_error_ipv6(tcp, mp, ira);
        return;
    }

    /* Skip past the outer IP and ICMP headers */
    iph_hdr_length = ira->ira_ip_hdr_length;
    icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
    /*
     * If we don't have the correct outer IP header length
     * or if we don't have a complete inner IP header
     * drop it.
     */
    if (iph_hdr_length < sizeof (ipha_t) ||
        (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
noticmpv4:
        freemsg(mp);
        return;
    }
    ipha = (ipha_t *)&icmph[1];

    /* Skip past the inner IP and find the ULP header */
    iph_hdr_length = IPH_HDR_LENGTH(ipha);
    tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
    /*
     * If we don't have the correct inner IP header length or if the ULP
     * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
     * bytes of TCP header, drop it.
     */
    if (iph_hdr_length < sizeof (ipha_t) ||
        ipha->ipha_protocol != IPPROTO_TCP ||
        (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
        goto noticmpv4;
    }

    seg_seq = ntohl(tcpha->tha_seq);
    switch (icmph->icmph_type) {
    case ICMP_DEST_UNREACHABLE:
        switch (icmph->icmph_code) {
        case ICMP_FRAGMENTATION_NEEDED:
            /*
             * Update Path MTU, then try to send something out.
             */
            tcp_update_pmtu(tcp, B_TRUE);
            tcp_rexmit_after_error(tcp);
            break;
        case ICMP_PORT_UNREACHABLE:
        case ICMP_PROTOCOL_UNREACHABLE:
            switch (tcp->tcp_state) {
            case TCPS_SYN_SENT:
            case TCPS_SYN_RCVD:
                /*
                 * ICMP can snipe away incipient
                 * TCP connections as long as
                 * seq number is same as initial
                 * send seq number.
                 */
                if (seg_seq == tcp->tcp_iss) {
                    (void) tcp_clean_death(tcp,
                        ECONNREFUSED);
                }
                break;
            }
            break;
        case ICMP_HOST_UNREACHABLE:
        case ICMP_NET_UNREACHABLE:
            /* Record the error in case we finally time out. */
            if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
                tcp->tcp_client_errno = EHOSTUNREACH;
            else
                tcp->tcp_client_errno = ENETUNREACH;
            if (tcp->tcp_state == TCPS_SYN_RCVD) {
                if (tcp->tcp_listener != NULL &&
                    tcp->tcp_listener->tcp_syn_defense) {
                    /*
                     * Ditch the half-open connection if we
                     * suspect a SYN attack is under way.
                     */
                    (void) tcp_clean_death(tcp,
                        tcp->tcp_client_errno);
                }
            }
            break;
        default:
            break;
        }
        break;
    case ICMP_SOURCE_QUENCH: {
        /*
         * A global boolean, tcp_icmp_source_quench, controls whether
         * TCP should respond to ICMP_SOURCE_QUENCH. The default is
         * false.
         */
        if (tcp_icmp_source_quench) {
            /*
             * Reduce the sending rate as if we got a
             * retransmit timeout
             */
            uint32_t npkt;

            npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
                tcp->tcp_mss;
            tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;

            DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
                uint32_t, tcp->tcp_cwnd,
                uint32_t, tcp->tcp_mss);
            tcp->tcp_cwnd = tcp->tcp_mss;
            tcp->tcp_cwnd_cnt = 0;
        }
        break;
    }
    }
    freemsg(mp);
}
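
/*
 * EXAMPLE (illustrative only, guarded out of the build): the source-quench
 * branch above reacts like a retransmit timeout: ssthresh becomes half of
 * the data in flight, rounded down to whole segments and clamped to at
 * least two, and cwnd collapses to a single MSS so the sender re-enters
 * slow start. A standalone sketch of that arithmetic with invented names
 * and a worked example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct cc_state {
    uint32_t cwnd;      /* congestion window, bytes */
    uint32_t ssthresh;  /* slow start threshold, bytes */
};

/*
 * Halve the outstanding data (snxt - suna), convert to whole segments,
 * clamp to at least 2 segments, then drop cwnd to a single MSS.
 */
static void
source_quench(struct cc_state *cc, uint32_t snxt, uint32_t suna, uint32_t mss)
{
    uint32_t npkt = ((snxt - suna) >> 1) / mss;

    cc->ssthresh = (npkt > 2 ? npkt : 2) * mss;
    cc->cwnd = mss;
}

int
main(void)
{
    struct cc_state cc = { 0, 0 };

    /* 64 KB outstanding, 1460-byte MSS: npkt = 22, ssthresh = 32120. */
    source_quench(&cc, 100000, 100000 - 65536, 1460);
    (void) printf("cwnd=%u ssthresh=%u\n", cc.cwnd, cc.ssthresh);
    return (0);
}
#endif
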
/*
 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
 * error messages passed up by IP.
 * Assumes that IP has pulled up all the extension headers as well
 * as the ICMPv6 header.
 */
static void
tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
{
    icmp6_t *icmp6;
    ip6_t *ip6h;
    uint16_t iph_hdr_length = ira->ira_ip_hdr_length;
    tcpha_t *tcpha;
    uint8_t *nexthdrp;
    uint32_t seg_seq;

    /*
     * Verify that we have a complete IP header.
     */
    ASSERT((MBLKL(mp) >= sizeof (ip6_t)));

    icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
    ip6h = (ip6_t *)&icmp6[1];
    /*
     * Verify if we have a complete ICMP and inner IP header.
     */
    if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
noticmpv6:
        freemsg(mp);
        return;
    }

    if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
        goto noticmpv6;
    tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
    /*
     * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
     * have at least ICMP_MIN_TCP_HDR bytes of TCP header, drop the
     * packet.
     */
    if ((*nexthdrp != IPPROTO_TCP) ||
        ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
        goto noticmpv6;
    }

    seg_seq = ntohl(tcpha->tha_seq);
    switch (icmp6->icmp6_type) {
    case ICMP6_PACKET_TOO_BIG:
        /*
         * Update Path MTU, then try to send something out.
         */
        tcp_update_pmtu(tcp, B_TRUE);
        tcp_rexmit_after_error(tcp);
        break;
    case ICMP6_DST_UNREACH:
        switch (icmp6->icmp6_code) {
        case ICMP6_DST_UNREACH_NOPORT:
            if (((tcp->tcp_state == TCPS_SYN_SENT) ||
                (tcp->tcp_state == TCPS_SYN_RCVD)) &&
                (seg_seq == tcp->tcp_iss)) {
                (void) tcp_clean_death(tcp, ECONNREFUSED);
            }
            break;
        case ICMP6_DST_UNREACH_ADMIN:
        case ICMP6_DST_UNREACH_NOROUTE:
        case ICMP6_DST_UNREACH_BEYONDSCOPE:
        case ICMP6_DST_UNREACH_ADDR:
            /* Record the error in case we finally time out. */
            tcp->tcp_client_errno = EHOSTUNREACH;
            if (((tcp->tcp_state == TCPS_SYN_SENT) ||
                (tcp->tcp_state == TCPS_SYN_RCVD)) &&
                (seg_seq == tcp->tcp_iss)) {
                if (tcp->tcp_listener != NULL &&
                    tcp->tcp_listener->tcp_syn_defense) {
                    /*
                     * Ditch the half-open connection if we
                     * suspect a SYN attack is under way.
                     */
                    (void) tcp_clean_death(tcp,
                        tcp->tcp_client_errno);
                }
            }
            break;
        default:
            break;
        }
        break;
    case ICMP6_PARAM_PROB:
        /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
        if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
            (uchar_t *)ip6h + icmp6->icmp6_pptr ==
            (uchar_t *)nexthdrp) {
            if (tcp->tcp_state == TCPS_SYN_SENT ||
                tcp->tcp_state == TCPS_SYN_RCVD) {
                (void) tcp_clean_death(tcp, ECONNREFUSED);
            }
            break;
        }
        break;

    case ICMP6_TIME_EXCEEDED:
    default:
        break;
    }
    freemsg(mp);
}

/*
 * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might
 * change. But it can refer to fields like tcp_suna and tcp_snxt.
 *
 * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
 * error messages received by IP. The message is always received on the
 * correct tcp_t.
 */
/* ARGSUSED */
boolean_t
tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
    ip_recv_attr_t *ira)
{
    tcpha_t *tcpha = (tcpha_t *)arg2;
    uint32_t seq = ntohl(tcpha->tha_seq);
    tcp_t *tcp = connp->conn_tcp;

    /*
     * TCP sequence number contained in payload of the ICMP error message
     * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise,
     * the message is either a stale ICMP error, or an attack from the
     * network. Fail the verification.
     */
    if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
        return (B_FALSE);

    /* For "too big" we also check the ignore flag */
    if (ira->ira_flags & IRAF_IS_IPV4) {
        ASSERT(icmph != NULL);
        if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
            icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
            tcp->tcp_tcps->tcps_ignore_path_mtu)
            return (B_FALSE);
    } else {
        ASSERT(icmp6 != NULL);
        if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
            tcp->tcp_tcps->tcps_ignore_path_mtu)
            return (B_FALSE);
    }
    return (B_TRUE);
}
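
/*
 * EXAMPLE (illustrative only, guarded out of the build): tcp_verifyicmp()
 * accepts an ICMP error only when the quoted sequence number lies in
 * [SND.UNA, SND.NXT). The macros below mirror the wraparound-safe
 * SEQ_LT/SEQ_GEQ comparisons (signed 32-bit difference); the helper name
 * is invented for this sketch.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

/* Wraparound-safe sequence comparisons, in the style of SEQ_LT/SEQ_GEQ. */
#define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)
#define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)

/*
 * Accept an ICMP error only if the sequence number it quotes refers to
 * data we have sent but not yet seen acknowledged: SND.UNA <= seq < SND.NXT.
 * The signed difference keeps the ordering correct even when the sequence
 * space has wrapped past 2^32.
 */
static bool
icmp_seq_ok(uint32_t seq, uint32_t snd_una, uint32_t snd_nxt)
{
    if (SEQ_LT(seq, snd_una) || SEQ_GEQ(seq, snd_nxt))
        return (false);
    return (true);
}
#endif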