/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* This file contains all TCP output processing functions. */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/timod.h>
#include <sys/pattr.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/sockio.h>
#include <sys/tsol/tnet.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/snmpcom.h>
#include <inet/proto_set.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_ndp.h>

static mblk_t	*tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
static void	tcp_wput_cmdblk(queue_t *, mblk_t *);
static void	tcp_wput_flush(tcp_t *, mblk_t *);
static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int	tcp_xmit_end(tcp_t *);
static int	tcp_send(tcp_t *, const int, const int, const int,
		    const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
static void	tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
		    int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
static boolean_t	tcp_send_rst_chk(tcp_stack_t *);
static void	tcp_process_shrunk_swnd(tcp_t *, uint32_t);
static void	tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);

/*
 * Functions called directly via squeue having a prototype of edesc_t.
 */
static void	tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
static void	tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
static void	tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);

/*
 * This controls how tiny a write must be before we try to copy it
 * into the mblk on the tail of the transmit queue.  Not much
 * speedup is observed for values larger than sixteen.  Zero will
 * disable the optimisation.
 */
static int tcp_tx_pull_len = 16;

void
tcp_wput(queue_t *q, mblk_t *mp)
{
	conn_t	*connp = Q_TO_CONN(q);
	tcp_t	*tcp;
	void	(*output_proc)();
	t_scalar_t type;
	uchar_t	*rptr;
	struct iocblk	*iocp;
	size_t	size;
	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;

	ASSERT(connp->conn_ref >= 2);

	switch (DB_TYPE(mp)) {
	case M_DATA:
		tcp = connp->conn_tcp;
		ASSERT(tcp != NULL);

		size = msgdsize(mp);

		mutex_enter(&tcp->tcp_non_sq_lock);
		tcp->tcp_squeue_bytes += size;
		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
			tcp_setqfull(tcp);
		}
		mutex_exit(&tcp->tcp_non_sq_lock);

		CONN_INC_REF(connp);
		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
		    NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
		return;

	case M_CMD:
		tcp_wput_cmdblk(q, mp);
		return;

	case M_PROTO:
	case M_PCPROTO:
		/*
		 * if it is a snmp message, don't get behind the squeue
		 */
		tcp = connp->conn_tcp;
		rptr = mp->b_rptr;
		if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
			type = ((union T_primitives *)rptr)->type;
		} else {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_wput_proto, dropping one...");
			}
			freemsg(mp);
			return;
		}
		if (type == T_SVR4_OPTMGMT_REQ) {
			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cred_t *cr = msg_getcred(mp, NULL);

			ASSERT(cr != NULL);
			if (cr == NULL) {
				tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
				return;
			}
			if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get,
			    cr)) {
				/*
				 * This was a SNMP request
				 */
				return;
			} else {
				output_proc = tcp_wput_proto;
			}
		} else {
			output_proc = tcp_wput_proto;
		}
		break;
	case M_IOCTL:
		/*
		 * Most ioctls can be processed right away without going via
		 * squeues - process them right here. Those that do require
		 * squeue (currently _SIOCSOCKFALLBACK)
		 * are processed by tcp_wput_ioctl().
		 */
		iocp = (struct iocblk *)mp->b_rptr;
		tcp = connp->conn_tcp;

		switch (iocp->ioc_cmd) {
		case TCP_IOC_ABORT_CONN:
			tcp_ioctl_abort_conn(q, mp);
			return;
		case TI_GETPEERNAME:
		case TI_GETMYNAME:
			mi_copyin(q, mp, NULL,
			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
			return;
		case ND_SET:
			/* nd_getset does the necessary checks */
		case ND_GET:
			if (nd_getset(q, tcps->tcps_g_nd, mp)) {
				qreply(q, mp);
				return;
			}
			CONN_INC_IOCTLREF(connp);
			ip_wput_nondata(q, mp);
			CONN_DEC_IOCTLREF(connp);
			return;

		default:
			output_proc = tcp_wput_ioctl;
			break;
		}
		break;
	default:
		output_proc = tcp_wput_nondata;
		break;
	}

	CONN_INC_REF(connp);
	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
	    NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
}

/*
 * The TCP normal data output path.
 * NOTE: the logic of the fast path is duplicated from this function.
 */
void
tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
{
	int		len;
	mblk_t		*local_time;
	mblk_t		*mp1;
	uint32_t	snxt;
	int		tail_unsent;
	int		tcpstate;
	int		usable = 0;
	mblk_t		*xmit_tail;
	int32_t		mss;
	int32_t		num_sack_blk = 0;
	int32_t		total_hdr_len;
	int32_t		tcp_hdr_len;
	int		rc;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;
	clock_t		now = LBOLT_FASTPATH;

	tcpstate = tcp->tcp_state;
	if (mp == NULL) {
		/*
		 * tcp_wput_data() with NULL mp should only be called when
		 * there is unsent data.
		 */
		ASSERT(tcp->tcp_unsent > 0);
		/* Really tacky... but we need this for detached closes. */
		len = tcp->tcp_unsent;
		goto data_null;
	}

	ASSERT(mp->b_datap->db_type == M_DATA);
	/*
	 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
	 * or before a connection attempt has begun.
	 */
	if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
	    (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
#ifdef DEBUG
			cmn_err(CE_WARN,
			    "tcp_wput_data: data after ordrel, %s",
			    tcp_display(tcp, NULL,
			    DISP_ADDR_AND_PORT));
#else
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_TRACE|SL_ERROR,
				    "tcp_wput_data: data after ordrel, %s\n",
				    tcp_display(tcp, NULL,
				    DISP_ADDR_AND_PORT));
			}
#endif /* DEBUG */
		}
		if (tcp->tcp_snd_zcopy_aware &&
		    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
			tcp_zcopy_notify(tcp);
		freemsg(mp);
		mutex_enter(&tcp->tcp_non_sq_lock);
		if (tcp->tcp_flow_stopped &&
		    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
			tcp_clrqfull(tcp);
		}
		mutex_exit(&tcp->tcp_non_sq_lock);
		return;
	}

	/* Strip empties */
	for (;;) {
		ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
		    (uintptr_t)INT_MAX);
		len = (int)(mp->b_wptr - mp->b_rptr);
		if (len > 0)
			break;
		mp1 = mp;
		mp = mp->b_cont;
		freeb(mp1);
		if (mp == NULL) {
			return;
		}
	}

	/* If we are the first on the list ... */
	if (tcp->tcp_xmit_head == NULL) {
		tcp->tcp_xmit_head = mp;
		tcp->tcp_xmit_tail = mp;
		tcp->tcp_xmit_tail_unsent = len;
	} else {
		/* If tiny tx and room in txq tail, pullup to save mblks. */
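		/*
		 * The copy is only attempted when the tail dblk is
		 * exclusively owned (db_ref == 1) and has at least len
		 * bytes of room left between b_wptr and db_lim; otherwise
		 * the new mblk is simply chained on below.
		 */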
		struct datab *dp;

		mp1 = tcp->tcp_xmit_last;
		if (len < tcp_tx_pull_len &&
		    (dp = mp1->b_datap)->db_ref == 1 &&
		    dp->db_lim - mp1->b_wptr >= len) {
			ASSERT(len > 0);
			ASSERT(!mp1->b_cont);
			if (len == 1) {
				*mp1->b_wptr++ = *mp->b_rptr;
			} else {
				bcopy(mp->b_rptr, mp1->b_wptr, len);
				mp1->b_wptr += len;
			}
			if (mp1 == tcp->tcp_xmit_tail)
				tcp->tcp_xmit_tail_unsent += len;
			mp1->b_cont = mp->b_cont;
			if (tcp->tcp_snd_zcopy_aware &&
			    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
				mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
			freeb(mp);
			mp = mp1;
		} else {
			tcp->tcp_xmit_last->b_cont = mp;
		}
		len += tcp->tcp_unsent;
	}

	/* Tack on however many more positive length mblks we have */
	if ((mp1 = mp->b_cont) != NULL) {
		do {
			int tlen;
			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
			    (uintptr_t)INT_MAX);
			tlen = (int)(mp1->b_wptr - mp1->b_rptr);
			if (tlen <= 0) {
				mp->b_cont = mp1->b_cont;
				freeb(mp1);
			} else {
				len += tlen;
				mp = mp1;
			}
		} while ((mp1 = mp->b_cont) != NULL);
	}
	tcp->tcp_xmit_last = mp;
	tcp->tcp_unsent = len;

	if (urgent)
		usable = 1;

data_null:
	snxt = tcp->tcp_snxt;
	xmit_tail = tcp->tcp_xmit_tail;
	tail_unsent = tcp->tcp_xmit_tail_unsent;

	/*
	 * Note that tcp_mss has been adjusted to take into account the
	 * timestamp option if applicable.  Because SACK options do not
	 * appear in every TCP segment and they are of variable lengths,
	 * they cannot be included in tcp_mss.  Thus we need to calculate
	 * the actual segment length when we need to send a segment which
	 * includes SACK options.
	 */
	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
		int32_t	opt_len;

		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
		    tcp->tcp_num_sack_blk);
		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
		    2 + TCPOPT_HEADER_LEN;
		mss = tcp->tcp_mss - opt_len;
		total_hdr_len = connp->conn_ht_iphc_len + opt_len;
		tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
	} else {
		mss = tcp->tcp_mss;
		total_hdr_len = connp->conn_ht_iphc_len;
		tcp_hdr_len = connp->conn_ht_ulp_len;
	}

	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
	}
	if (tcpstate == TCPS_SYN_RCVD) {
		/*
		 * The three-way connection establishment handshake is not
		 * complete yet. We want to queue the data for transmission
		 * after entering ESTABLISHED state (RFC793). A jump to
		 * "done" label effectively leaves data on the queue.
		 */
		goto done;
	} else {
		int usable_r;

		/*
		 * In the special case when cwnd is zero, which can only
		 * happen if the connection is ECN capable, return now.
		 * New segments are sent using tcp_timer().  The timer
		 * is set in tcp_input_data().
		 */
		if (tcp->tcp_cwnd == 0) {
			/*
			 * Note that tcp_cwnd is 0 before 3-way handshake is
			 * finished.
			 */
			ASSERT(tcp->tcp_ecn_ok ||
			    tcp->tcp_state < TCPS_ESTABLISHED);
			return;
		}

		/* NOTE: trouble if xmitting while SYN not acked? */
		usable_r = snxt - tcp->tcp_suna;
		usable_r = tcp->tcp_swnd - usable_r;

		/*
		 * Check if the receiver has shrunk the window.  If
		 * tcp_wput_data() with NULL mp is called, tcp_fin_sent
		 * cannot be set as there is unsent data, so FIN cannot
		 * be sent out. Otherwise, we need to take into account
		 * of FIN as it consumes an "invisible" sequence number.
		 */
		ASSERT(tcp->tcp_fin_sent == 0);
		if (usable_r < 0) {
			/*
			 * The receiver has shrunk the window and we have sent
			 * -usable_r data beyond the window, re-adjust.
			 *
			 * If TCP window scaling is enabled, there can be
			 * round down error as the advertised receive window
			 * is actually right shifted n bits.  This means that
			 * the lower n bits info is wiped out.  It will look
			 * like the window is shrunk.  Do a check here to
			 * see if the shrunk amount is actually within the
			 * error in window calculation.  If it is, just
			 * return.  Note that this check is inside the
			 * shrunk window check.  This makes sure that even
			 * though tcp_process_shrunk_swnd() is not called,
			 * we will stop further processing.
			 */
			if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
				tcp_process_shrunk_swnd(tcp, -usable_r);
			}
			return;
		}

		/* usable = MIN(swnd, cwnd) - unacked_bytes */
		if (tcp->tcp_swnd > tcp->tcp_cwnd)
			usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;

		/* usable = MIN(usable, unsent) */
		if (usable_r > len)
			usable_r = len;

		/* usable = MAX(usable, {1 for urgent, 0 for data}) */
		if (usable_r > 0) {
			usable = usable_r;
		} else {
			/* Bypass all other unnecessary processing. */
			goto done;
		}
	}

	local_time = (mblk_t *)now;

	/*
	 * "Our" Nagle Algorithm.  This is not the same as in the old
	 * BSD.  This is more in line with the true intent of Nagle.
	 *
	 * The conditions are:
	 * 1. The amount of unsent data (or amount of data which can be
	 *    sent, whichever is smaller) is less than Nagle limit.
	 * 2. The last sent size is also less than Nagle limit.
	 * 3. There is unack'ed data.
	 * 4. Urgent pointer is not set.  Send urgent data ignoring the
	 *    Nagle algorithm.  This reduces the probability that urgent
	 *    bytes get "merged" together.
	 * 5. The app has not closed the connection.  This eliminates the
	 *    wait time of the receiving side waiting for the last piece of
	 *    (small) data.
	 *
	 * If all are satisfied, exit without sending anything.  Note
	 * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
	 * the smaller of 1 MSS and global tcp_naglim_def (default to be
	 * 4095).
	 */
	if (usable < (int)tcp->tcp_naglim &&
	    tcp->tcp_naglim > tcp->tcp_last_sent_len &&
	    snxt != tcp->tcp_suna &&
	    !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
	    !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
		goto done;
	}

	/*
	 * If tcp_zero_win_probe is not set and the tcp->tcp_cork option
	 * is set, then we have to force TCP not to send partial segment
	 * (smaller than MSS bytes). We are calculating the usable now
	 * based on full mss and will save the rest of remaining data for
	 * later. When tcp_zero_win_probe is set, TCP needs to send out
	 * something to do zero window probe.
	 */
	if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) {
		if (usable < mss)
			goto done;
		usable = (usable / mss) * mss;
	}

	/* Update the latest receive window size in TCP header. */
	tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);

	/* Send the packet. */
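	/*
	 * tcp_send() consumes as much of the usable window as it can and
	 * updates usable, snxt, tail_unsent and xmit_tail in place; it
	 * returns -1 only on allocation failure.
	 */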
	rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len,
	    num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
	    local_time);

	/* Pretend that all we were trying to send really got sent */
	if (rc < 0 && tail_unsent < 0) {
		do {
			xmit_tail = xmit_tail->b_cont;
			xmit_tail->b_prev = local_time;
			ASSERT((uintptr_t)(xmit_tail->b_wptr -
			    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
			tail_unsent += (int)(xmit_tail->b_wptr -
			    xmit_tail->b_rptr);
		} while (tail_unsent < 0);
	}
done:;
	tcp->tcp_xmit_tail = xmit_tail;
	tcp->tcp_xmit_tail_unsent = tail_unsent;
	len = tcp->tcp_snxt - snxt;
	if (len) {
		/*
		 * If new data was sent, need to update the notsack
		 * list, which is, after all, data blocks that have
		 * not been sack'ed by the receiver.  New data is
		 * not sack'ed.
		 */
		if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
			/* len is a negative value. */
			tcp->tcp_pipe -= len;
			tcp_notsack_update(&(tcp->tcp_notsack_list),
			    tcp->tcp_snxt, snxt,
			    &(tcp->tcp_num_notsack_blk),
			    &(tcp->tcp_cnt_notsack_list));
		}
		tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
		tcp->tcp_rack = tcp->tcp_rnxt;
		tcp->tcp_rack_cnt = 0;
		if ((snxt + len) == tcp->tcp_suna) {
			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
		}
	} else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
		/*
		 * Didn't send anything. Make sure the timer is running
		 * so that we will probe a zero window.
		 */
		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
	}
	/* Note that len is the amount we just sent but with a negative sign */
	tcp->tcp_unsent += len;
	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped) {
		if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
			tcp_clrqfull(tcp);
		}
	} else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) {
		if (!(tcp->tcp_detached))
			tcp_setqfull(tcp);
	}
	mutex_exit(&tcp->tcp_non_sq_lock);
}

/*
 * Initial STREAMS write side put() procedure for sockets. It tries to
 * handle the T_CAPABILITY_REQ which sockfs sends down while setting
 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages
 * are handled by tcp_wput() as usual.
 *
 * All further messages will also be handled by tcp_wput() because we cannot
 * be sure that the above short cut is safe later.
 */
void
tcp_wput_sock(queue_t *wq, mblk_t *mp)
{
	conn_t	*connp = Q_TO_CONN(wq);
	tcp_t	*tcp = connp->conn_tcp;
	struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr;

	ASSERT(wq->q_qinfo == &tcp_sock_winit);
	wq->q_qinfo = &tcp_winit;

	ASSERT(IPCL_IS_TCP(connp));
	ASSERT(TCP_IS_SOCKET(tcp));

	if (DB_TYPE(mp) == M_PCPROTO &&
	    MBLKL(mp) == sizeof (struct T_capability_req) &&
	    car->PRIM_type == T_CAPABILITY_REQ) {
		tcp_capability_req(tcp, mp);
		return;
	}

	tcp_wput(wq, mp);
}

/* ARGSUSED */
void
tcp_wput_fallback(queue_t *wq, mblk_t *mp)
{
#ifdef DEBUG
	cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n");
#endif
	freemsg(mp);
}

/*
 * Called by tcp_wput() to handle misc non M_DATA messages.
 */
/* ARGSUSED */
static void
tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	ASSERT(DB_TYPE(mp) != M_IOCTL);
	/*
	 * TCP is D_MP and qprocsoff() is done towards the end of tcp_close.
	 * Once the close starts, streamhead and sockfs will not let any data
	 * packets come down (close ensures that there are no threads using the
	 * queue and no new threads will come down) but since qprocsoff()
	 * hasn't happened yet, a M_FLUSH or some non data message might
	 * get reflected back (in response to our own FLUSHRW) and get
	 * processed after tcp_close() is done. The conn would still be valid
	 * because a ref would have been added but we need to check the state
	 * before actually processing the packet.
	 */
	if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) {
		freemsg(mp);
		return;
	}

	switch (DB_TYPE(mp)) {
	case M_IOCDATA:
		tcp_wput_iocdata(tcp, mp);
		break;
	case M_FLUSH:
		tcp_wput_flush(tcp, mp);
		break;
	default:
		ip_wput_nondata(connp->conn_wq, mp);
		break;
	}
}

/* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
static void
tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
{
	uchar_t	fval = *mp->b_rptr;
	mblk_t	*tail;
	conn_t	*connp = tcp->tcp_connp;
	queue_t	*q = connp->conn_wq;

	/* TODO: How should flush interact with urgent data? */
	if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL &&
	    !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
		/*
		 * Flush only data that has not yet been put on the wire.  If
		 * we flush data that we have already transmitted, life, as we
		 * know it, may come to an end.
		 */
		tail = tcp->tcp_xmit_tail;
		tail->b_wptr -= tcp->tcp_xmit_tail_unsent;
		tcp->tcp_xmit_tail_unsent = 0;
		tcp->tcp_unsent = 0;
		if (tail->b_wptr != tail->b_rptr)
			tail = tail->b_cont;
		if (tail) {
			mblk_t **excess = &tcp->tcp_xmit_head;
			for (;;) {
				mblk_t *mp1 = *excess;
				if (mp1 == tail)
					break;
				tcp->tcp_xmit_tail = mp1;
				tcp->tcp_xmit_last = mp1;
				excess = &mp1->b_cont;
			}
			*excess = NULL;
			tcp_close_mpp(&tail);
			if (tcp->tcp_snd_zcopy_aware)
				tcp_zcopy_notify(tcp);
		}
		/*
		 * We have no unsent data, so unsent must be less than
		 * conn_sndlowat, so re-enable flow.
		 */
		mutex_enter(&tcp->tcp_non_sq_lock);
		if (tcp->tcp_flow_stopped) {
			tcp_clrqfull(tcp);
		}
		mutex_exit(&tcp->tcp_non_sq_lock);
	}
	/*
	 * TODO: you can't just flush these, you have to increase rwnd for one
	 * thing.  For another, how should urgent data interact?
	 */
	if (fval & FLUSHR) {
		*mp->b_rptr = fval & ~FLUSHW;
		/* XXX */
		qreply(q, mp);
		return;
	}
	freemsg(mp);
}

/*
 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
 * messages.
 */
static void
tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
{
	mblk_t	*mp1;
	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
	STRUCT_HANDLE(strbuf, sb);
	uint_t	addrlen;
	conn_t	*connp = tcp->tcp_connp;
	queue_t	*q = connp->conn_wq;

	/* Make sure it is one of ours. */
	switch (iocp->ioc_cmd) {
	case TI_GETMYNAME:
	case TI_GETPEERNAME:
		break;
	default:
		/*
		 * If the conn is closing, then error the ioctl here. Otherwise
		 * use the CONN_IOCTLREF_* macros to hold off tcp_close until
		 * we're done here.
		 */
		mutex_enter(&connp->conn_lock);
		if (connp->conn_state_flags & CONN_CLOSING) {
			mutex_exit(&connp->conn_lock);
			iocp->ioc_error = EINVAL;
			mp->b_datap->db_type = M_IOCNAK;
			iocp->ioc_count = 0;
			qreply(q, mp);
			return;
		}

		CONN_INC_IOCTLREF_LOCKED(connp);
		ip_wput_nondata(q, mp);
		CONN_DEC_IOCTLREF(connp);
		return;
	}
	switch (mi_copy_state(q, mp, &mp1)) {
	case -1:
		return;
	case MI_COPY_CASE(MI_COPY_IN, 1):
		break;
	case MI_COPY_CASE(MI_COPY_OUT, 1):
		/* Copy out the strbuf. */
		mi_copyout(q, mp);
		return;
	case MI_COPY_CASE(MI_COPY_OUT, 2):
		/* All done. */
		mi_copy_done(q, mp, 0);
		return;
	default:
		mi_copy_done(q, mp, EPROTO);
		return;
	}
	/* Check alignment of the strbuf */
	if (!OK_32PTR(mp1->b_rptr)) {
		mi_copy_done(q, mp, EINVAL);
		return;
	}

	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);

	if (connp->conn_family == AF_INET)
		addrlen = sizeof (sin_t);
	else
		addrlen = sizeof (sin6_t);

	if (STRUCT_FGET(sb, maxlen) < addrlen) {
		mi_copy_done(q, mp, EINVAL);
		return;
	}

	switch (iocp->ioc_cmd) {
	case TI_GETMYNAME:
		break;
	case TI_GETPEERNAME:
		if (tcp->tcp_state < TCPS_SYN_RCVD) {
			mi_copy_done(q, mp, ENOTCONN);
			return;
		}
		break;
	}
	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
	if (!mp1)
		return;

	STRUCT_FSET(sb, len, addrlen);
	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
	case TI_GETMYNAME:
		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
		    &addrlen);
		break;
	case TI_GETPEERNAME:
		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
		    &addrlen);
		break;
	}
	mp1->b_wptr += addrlen;
	/* Copy out the address */
	mi_copyout(q, mp);
}

/*
 * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
 * messages.
 */
/* ARGSUSED */
static void
tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;
	queue_t	*q = connp->conn_wq;
	struct iocblk	*iocp;

	ASSERT(DB_TYPE(mp) == M_IOCTL);
	/*
	 * Try and ASSERT the minimum possible references on the
	 * conn early enough. Since we are executing on write side,
	 * the connection is obviously not detached and that means
	 * there is a ref each for TCP and IP. Since we are behind
	 * the squeue, the minimum references needed are 3. If the
	 * conn is in classifier hash list, there should be an
	 * extra ref for that (we check both the possibilities).
	 */
	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));

	iocp = (struct iocblk *)mp->b_rptr;
	switch (iocp->ioc_cmd) {
	case _SIOCSOCKFALLBACK:
		/*
		 * Either sockmod is about to be popped and the socket
		 * would now be treated as a plain stream, or a module
		 * is about to be pushed so we could no longer use read-
		 * side synchronous streams for fused loopback tcp.
		 * Drain any queued data and disable direct sockfs
		 * interface from now on.
		 */
		if (!tcp->tcp_issocket) {
			DB_TYPE(mp) = M_IOCNAK;
			iocp->ioc_error = EINVAL;
		} else {
			tcp_use_pure_tpi(tcp);
			DB_TYPE(mp) = M_IOCACK;
			iocp->ioc_error = 0;
		}
		iocp->ioc_count = 0;
		iocp->ioc_rval = 0;
		qreply(q, mp);
		return;
	}

	/*
	 * If the conn is closing, then error the ioctl here. Otherwise bump
	 * the conn_ioctlref to hold off tcp_close until we're done here.
	 */
	mutex_enter(&(connp)->conn_lock);
	if ((connp)->conn_state_flags & CONN_CLOSING) {
		mutex_exit(&(connp)->conn_lock);
		iocp->ioc_error = EINVAL;
		mp->b_datap->db_type = M_IOCNAK;
		iocp->ioc_count = 0;
		qreply(q, mp);
		return;
	}

	CONN_INC_IOCTLREF_LOCKED(connp);
	ip_wput_nondata(q, mp);
	CONN_DEC_IOCTLREF(connp);
}

/*
 * This routine is called by tcp_wput() to handle all TPI requests.
 */
/* ARGSUSED */
static void
tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;
	union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
	uchar_t	*rptr;
	t_scalar_t type;
	cred_t	*cr;

	/*
	 * Try and ASSERT the minimum possible references on the
	 * conn early enough. Since we are executing on write side,
	 * the connection is obviously not detached and that means
	 * there is a ref each for TCP and IP. Since we are behind
	 * the squeue, the minimum references needed are 3. If the
	 * conn is in classifier hash list, there should be an
	 * extra ref for that (we check both the possibilities).
	 */
	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));

	rptr = mp->b_rptr;
	ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
	if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
		type = ((union T_primitives *)rptr)->type;
		if (type == T_EXDATA_REQ) {
			tcp_output_urgent(connp, mp, arg2, NULL);
		} else if (type != T_DATA_REQ) {
			goto non_urgent_data;
		} else {
			/* TODO: options, flags, ... from user */
			/* Set length to zero for reclamation below */
			tcp_wput_data(tcp, mp->b_cont, B_TRUE);
			freeb(mp);
		}
		return;
	} else {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_wput_proto, dropping one...");
		}
		freemsg(mp);
		return;
	}

non_urgent_data:

	switch ((int)tprim->type) {
	case T_SSL_PROXY_BIND_REQ:	/* an SSL proxy endpoint bind request */
		/*
		 * save the kssl_ent_t from the next block, and convert this
		 * back to a normal bind_req.
		 */
		if (mp->b_cont != NULL) {
			ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t));

			if (tcp->tcp_kssl_ent != NULL) {
				kssl_release_ent(tcp->tcp_kssl_ent, NULL,
				    KSSL_NO_PROXY);
				tcp->tcp_kssl_ent = NULL;
			}
			bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent,
			    sizeof (kssl_ent_t));
			kssl_hold_ent(tcp->tcp_kssl_ent);
			freemsg(mp->b_cont);
			mp->b_cont = NULL;
		}
		tprim->type = T_BIND_REQ;

	/* FALLTHROUGH */
	case O_T_BIND_REQ:	/* bind request */
	case T_BIND_REQ:	/* new semantics bind request */
		tcp_tpi_bind(tcp, mp);
		break;
	case T_UNBIND_REQ:	/* unbind request */
		tcp_tpi_unbind(tcp, mp);
		break;
	case O_T_CONN_RES:	/* old connection response XXX */
	case T_CONN_RES:	/* connection response */
		tcp_tli_accept(tcp, mp);
		break;
	case T_CONN_REQ:	/* connection request */
		tcp_tpi_connect(tcp, mp);
		break;
	case T_DISCON_REQ:	/* disconnect request */
		tcp_disconnect(tcp, mp);
		break;
	case T_CAPABILITY_REQ:
		tcp_capability_req(tcp, mp);	/* capability request */
		break;
	case T_INFO_REQ:	/* information request */
		tcp_info_req(tcp, mp);
		break;
	case T_SVR4_OPTMGMT_REQ:	/* manage options req */
	case T_OPTMGMT_REQ:
		/*
		 * Note: no support for snmpcom_req() through new
		 * T_OPTMGMT_REQ. See comments in ip.c
		 */

		/*
		 * All Solaris components should pass a db_credp
		 * for this TPI message, hence we ASSERT.
		 * But in case there is some other M_PROTO that looks
		 * like a TPI message sent by some other kernel
		 * component, we check and return an error.
		 */
		cr = msg_getcred(mp, NULL);
		ASSERT(cr != NULL);
		if (cr == NULL) {
			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
			return;
		}
		/*
		 * If EINPROGRESS is returned, the request has been queued
		 * for subsequent processing by ip_restart_optmgmt(), which
		 * will do the CONN_DEC_REF().
		 */
		if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) {
			svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
		} else {
			tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
		}
		break;

	case T_UNITDATA_REQ:	/* unitdata request */
		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
		break;
	case T_ORDREL_REQ:	/* orderly release req */
		freemsg(mp);

		if (tcp->tcp_fused)
			tcp_unfuse(tcp);

		if (tcp_xmit_end(tcp) != 0) {
			/*
			 * We were crossing FINs and got a reset from
			 * the other side. Just ignore it.
			 */
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_wput_proto, T_ORDREL_REQ out of "
				    "state %s",
				    tcp_display(tcp, NULL,
				    DISP_ADDR_AND_PORT));
			}
		}
		break;
	case T_ADDR_REQ:
		tcp_addr_req(tcp, mp);
		break;
	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_wput_proto, bogus TPI msg, type %d",
			    tprim->type);
		}
		/*
		 * We used to M_ERROR. Sending TNOTSUPPORT gives the user
		 * a chance to recover.
		 */
		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
		break;
	}
}

/*
 * Handle special out-of-band ioctl requests (see PSARC/2008/265).
 */
static void
tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
{
	void	*data;
	mblk_t	*datamp = mp->b_cont;
	conn_t	*connp = Q_TO_CONN(q);
	tcp_t	*tcp = connp->conn_tcp;
	cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;

	if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
		cmdp->cb_error = EPROTO;
		qreply(q, mp);
		return;
	}

	data = datamp->b_rptr;

	switch (cmdp->cb_cmd) {
	case TI_GETPEERNAME:
		if (tcp->tcp_state < TCPS_SYN_RCVD)
			cmdp->cb_error = ENOTCONN;
		else
			cmdp->cb_error = conn_getpeername(connp, data,
			    &cmdp->cb_len);
		break;
	case TI_GETMYNAME:
		cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
		break;
	default:
		cmdp->cb_error = EINVAL;
		break;
	}

	qreply(q, mp);
}

/*
 * The TCP fast path write put procedure.
 * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
 */
/* ARGSUSED */
void
tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	int		len;
	int		hdrlen;
	int		plen;
	mblk_t		*mp1;
	uchar_t		*rptr;
	uint32_t	snxt;
	tcpha_t		*tcpha;
	struct datab	*db;
	uint32_t	suna;
	uint32_t	mss;
	ipaddr_t	*dst;
	ipaddr_t	*src;
	uint32_t	sum;
	int		usable;
	conn_t		*connp = (conn_t *)arg;
	tcp_t		*tcp = connp->conn_tcp;
	uint32_t	msize;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	ip_xmit_attr_t	*ixa;
	clock_t		now;

	/*
	 * Try and ASSERT the minimum possible references on the
	 * conn early enough. Since we are executing on write side,
	 * the connection is obviously not detached and that means
	 * there is a ref each for TCP and IP. Since we are behind
	 * the squeue, the minimum references needed are 3. If the
	 * conn is in classifier hash list, there should be an
	 * extra ref for that (we check both the possibilities).
	 */
	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));

	ASSERT(DB_TYPE(mp) == M_DATA);
	msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

	mutex_enter(&tcp->tcp_non_sq_lock);
	tcp->tcp_squeue_bytes -= msize;
	mutex_exit(&tcp->tcp_non_sq_lock);

	/* Bypass tcp protocol for fused tcp loopback */
	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
		return;

	mss = tcp->tcp_mss;
	/*
	 * If ZEROCOPY has been turned off, try not to send any zero-copy
	 * message down. Do backoff, now.
	 */
	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on)
		mp = tcp_zcopy_backoff(tcp, mp, B_FALSE);


	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
	len = (int)(mp->b_wptr - mp->b_rptr);

	/*
	 * Criteria for fast path:
	 *
	 * 1. no unsent data
	 * 2. single mblk in request
	 * 3. connection established
	 * 4. data in mblk
	 * 5. len <= mss
	 * 6. no tcp_valid bits
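	 * 7. tcp_cork is not set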
	 */
	if ((tcp->tcp_unsent != 0) ||
	    (tcp->tcp_cork) ||
	    (mp->b_cont != NULL) ||
	    (tcp->tcp_state != TCPS_ESTABLISHED) ||
	    (len == 0) ||
	    (len > mss) ||
	    (tcp->tcp_valid_bits != 0)) {
		tcp_wput_data(tcp, mp, B_FALSE);
		return;
	}

	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
	ASSERT(tcp->tcp_fin_sent == 0);

	/* queue new packet onto retransmission queue */
	if (tcp->tcp_xmit_head == NULL) {
		tcp->tcp_xmit_head = mp;
	} else {
		tcp->tcp_xmit_last->b_cont = mp;
	}
	tcp->tcp_xmit_last = mp;
	tcp->tcp_xmit_tail = mp;

	/* find out how much we can send */
	/* BEGIN CSTYLED */
	/*
	 *    un-acked           usable
	 *  |--------------|-----------------|
	 *  tcp_suna       tcp_snxt          tcp_suna+tcp_swnd
	 */
	/* END CSTYLED */

	/* start sending from tcp_snxt */
	snxt = tcp->tcp_snxt;

	/*
	 * Check to see if this connection has been idled for some
	 * time and no ACK is expected.  If it is, we need to slow
	 * start again to get back the connection's "self-clock" as
	 * described in VJ's paper.
	 *
	 * Reinitialize tcp_cwnd after idle.
	 */
	now = LBOLT_FASTPATH;
	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
	}

	usable = tcp->tcp_swnd;		/* tcp window size */
	if (usable > tcp->tcp_cwnd)
		usable = tcp->tcp_cwnd;	/* congestion window smaller */
	usable -= snxt;		/* subtract stuff already sent */
	suna = tcp->tcp_suna;
	usable += suna;
	/* usable can be < 0 if the congestion window is smaller */
	if (len > usable) {
		/* Can't send complete M_DATA in one shot */
		goto slow;
	}

	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
		tcp_clrqfull(tcp);
	}
	mutex_exit(&tcp->tcp_non_sq_lock);

	/*
	 * determine if anything to send (Nagle).
	 *
	 *   1. len < tcp_mss (i.e. small)
	 *   2. unacknowledged data present
	 *   3. len < nagle limit
	 *   4. last packet sent < nagle limit (previous packet sent)
	 */
	if ((len < mss) && (snxt != suna) &&
	    (len < (int)tcp->tcp_naglim) &&
	    (tcp->tcp_last_sent_len < tcp->tcp_naglim)) {
		/*
		 * This was the first unsent packet and normally
		 * mss < xmit_hiwater so there is no need to worry
		 * about flow control. The next packet will go
		 * through the flow control check in tcp_wput_data().
		 */
		/* leftover work from above */
		tcp->tcp_unsent = len;
		tcp->tcp_xmit_tail_unsent = len;

		return;
	}

	/*
	 * len <= tcp->tcp_mss && len == unsent so no sender silly window.  Can
	 * send now.
	 */

	if (snxt == suna) {
		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
	}

	/* we have always sent something */
	tcp->tcp_rack_cnt = 0;

	tcp->tcp_snxt = snxt + len;
	tcp->tcp_rack = tcp->tcp_rnxt;

	if ((mp1 = dupb(mp)) == 0)
		goto no_memory;
	mp->b_prev = (mblk_t *)(uintptr_t)now;
	mp->b_next = (mblk_t *)(uintptr_t)snxt;

	/* adjust tcp header information */
	tcpha = tcp->tcp_tcpha;
	tcpha->tha_flags = (TH_ACK|TH_PUSH);

	sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
	sum = (sum >> 16) + (sum & 0xFFFF);
	tcpha->tha_sum = htons(sum);

	tcpha->tha_seq = htonl(snxt);

	TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
	TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
	BUMP_LOCAL(tcp->tcp_obsegs);

	/* Update the latest receive window size in TCP header. */
	tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);

	tcp->tcp_last_sent_len = (ushort_t)len;

	plen = len + connp->conn_ht_iphc_len;

	ixa = connp->conn_ixa;
	ixa->ixa_pktlen = plen;

	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		tcp->tcp_ipha->ipha_length = htons(plen);
	} else {
		tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
	}

	/* see if we need to allocate a mblk for the headers */
	hdrlen = connp->conn_ht_iphc_len;
	rptr = mp1->b_rptr - hdrlen;
	db = mp1->b_datap;
	if ((db->db_ref != 2) || rptr < db->db_base ||
	    (!OK_32PTR(rptr))) {
		/* NOTE: we assume allocb returns an OK_32PTR */
		mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
		if (!mp) {
			freemsg(mp1);
			goto no_memory;
		}
		mp->b_cont = mp1;
		mp1 = mp;
		/* Leave room for Link Level header */
		rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
		mp1->b_wptr = &rptr[hdrlen];
	}
	mp1->b_rptr = rptr;

	/* Fill in the timestamp option. */
	if (tcp->tcp_snd_ts_ok) {
		uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;

		U32_TO_BE32(llbolt,
		    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
		U32_TO_BE32(tcp->tcp_ts_recent,
		    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
	} else {
		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
	}

	/* copy header into outgoing packet */
	dst = (ipaddr_t *)rptr;
	src = (ipaddr_t *)connp->conn_ht_iphc;
	dst[0] = src[0];
	dst[1] = src[1];
	dst[2] = src[2];
	dst[3] = src[3];
	dst[4] = src[4];
	dst[5] = src[5];
	dst[6] = src[6];
	dst[7] = src[7];
	dst[8] = src[8];
	dst[9] = src[9];
	if (hdrlen -= 40) {
		hdrlen >>= 2;
		dst += 10;
		src += 10;
		do {
			*dst++ = *src++;
		} while (--hdrlen);
	}

	/*
	 * Set the ECN info in the TCP header.  Note that this
	 * is not the template header.
	 */
	if (tcp->tcp_ecn_ok) {
		TCP_SET_ECT(tcp, rptr);

		tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
		if (tcp->tcp_ecn_echo_on)
			tcpha->tha_flags |= TH_ECE;
		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
			tcpha->tha_flags |= TH_CWR;
			tcp->tcp_ecn_cwr_sent = B_TRUE;
		}
	}

	if (tcp->tcp_ip_forward_progress) {
		tcp->tcp_ip_forward_progress = B_FALSE;
		connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
	} else {
		connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
	}
	tcp_send_data(tcp, mp1);
	return;

	/*
	 * If we ran out of memory, we pretend to have sent the packet
	 * and that it was lost on the wire.
	 */
no_memory:
	return;

slow:
	/* leftover work from above */
	tcp->tcp_unsent = len;
	tcp->tcp_xmit_tail_unsent = len;
	tcp_wput_data(tcp, NULL, B_FALSE);
}

/* ARGSUSED2 */
void
tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	int		len;
	uint32_t	msize;
	conn_t		*connp = (conn_t *)arg;
	tcp_t		*tcp = connp->conn_tcp;

	msize = msgdsize(mp);

	len = msize - 1;
	if (len < 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Try to force urgent data out on the wire. Even if we have unsent
	 * data this will at least send the urgent flag.
	 * XXX does not handle more flag correctly.
	 */
	len += tcp->tcp_unsent;
	len += tcp->tcp_snxt;
	tcp->tcp_urg = len;
	tcp->tcp_valid_bits |= TCP_URG_VALID;

	/* Bypass tcp protocol for fused tcp loopback */
	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
		return;

	/* Strip off the T_EXDATA_REQ if the data is from TPI */
	if (DB_TYPE(mp) != M_DATA) {
		mblk_t *mp1 = mp;
		ASSERT(!IPCL_IS_NONSTR(connp));
		mp = mp->b_cont;
		freeb(mp1);
	}
	tcp_wput_data(tcp, mp, B_TRUE);
}

/*
 * Called by streams close routine via squeues when our client blows off her
 * descriptor, we take this to mean: "close the stream state NOW, close the tcp
 * connection politely." When SO_LINGER is set (with a non-zero linger time and
 * it is not a nonblocking socket) then this routine sleeps until the FIN is
 * acked.
 *
 * NOTE: tcp_close potentially returns error when lingering.
 * However, the stream head currently does not pass these errors
 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
 * errors to the application (from tsleep()) and not errors
 * like ECONNRESET caused by receiving a reset packet.
 */

/* ARGSUSED */
void
tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	char	*msg;
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;
	clock_t	delta = 0;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));

	mutex_enter(&tcp->tcp_eager_lock);
	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
		/* Cleanup for listener */
		tcp_eager_cleanup(tcp, 0);
		tcp->tcp_wait_for_eagers = 1;
	}
	mutex_exit(&tcp->tcp_eager_lock);

	tcp->tcp_lso = B_FALSE;

	msg = NULL;
	switch (tcp->tcp_state) {
	case TCPS_CLOSED:
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		break;
	case TCPS_SYN_SENT:
		msg = "tcp_close, during connect";
		break;
	case TCPS_SYN_RCVD:
		/*
		 * Close during the connect 3-way handshake
		 * but here there may or may not be pending data
		 * already on queue. Process almost same as in
		 * the ESTABLISHED state.
		 */
		/* FALLTHRU */
	default:
		if (tcp->tcp_fused)
			tcp_unfuse(tcp);

		/*
		 * If SO_LINGER has set a zero linger time, abort the
		 * connection with a reset.
		 */
		if (connp->conn_linger && connp->conn_lingertime == 0) {
			msg = "tcp_close, zero lingertime";
			break;
		}

		/*
		 * Abort connection if there is unread data queued.
		 */
		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
			msg = "tcp_close, unread data";
			break;
		}
		/*
		 * We have done a qwait() above which could have possibly
		 * drained more messages in turn causing transition to a
		 * different state. Check whether we have to do the rest
		 * of the processing or not.
		 */
		if (tcp->tcp_state <= TCPS_LISTEN)
			break;

		/*
		 * Transmit the FIN before detaching the tcp_t.
		 * After tcp_detach returns this queue/perimeter
		 * no longer owns the tcp_t thus others can modify it.
		 */
		(void) tcp_xmit_end(tcp);

		/*
		 * If lingering on close then wait until the fin is acked,
		 * the SO_LINGER time passes, or a reset is sent/received.
		 */
		if (connp->conn_linger && connp->conn_lingertime > 0 &&
		    !(tcp->tcp_fin_acked) &&
		    tcp->tcp_state >= TCPS_ESTABLISHED) {
			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
				tcp->tcp_client_errno = EWOULDBLOCK;
			} else if (tcp->tcp_client_errno == 0) {

				ASSERT(tcp->tcp_linger_tid == 0);

				tcp->tcp_linger_tid = TCP_TIMER(tcp,
				    tcp_close_linger_timeout,
				    connp->conn_lingertime * hz);

				/* tcp_close_linger_timeout will finish close */
				if (tcp->tcp_linger_tid == 0)
					tcp->tcp_client_errno = ENOSR;
				else
					return;
			}

			/*
			 * Check if we need to detach or just close
			 * the instance.
			 */
			if (tcp->tcp_state <= TCPS_LISTEN)
				break;
		}

		/*
		 * Make sure that no other thread will access the conn_rq of
		 * this instance (through lookups etc.) as conn_rq will go
		 * away shortly.
		 */
		tcp_acceptor_hash_remove(tcp);

		mutex_enter(&tcp->tcp_non_sq_lock);
		if (tcp->tcp_flow_stopped) {
			tcp_clrqfull(tcp);
		}
		mutex_exit(&tcp->tcp_non_sq_lock);

		if (tcp->tcp_timer_tid != 0) {
			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
			tcp->tcp_timer_tid = 0;
		}
		/*
		 * Need to cancel those timers which will not be used when
		 * TCP is detached. This has to be done before the conn_wq
		 * is set to NULL.
		 */
		tcp_timers_stop(tcp);

		tcp->tcp_detached = B_TRUE;
		if (tcp->tcp_state == TCPS_TIME_WAIT) {
			tcp_time_wait_append(tcp);
			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
			ASSERT(connp->conn_ref >= 3);
			goto finish;
		}

		/*
		 * If delta is zero the timer event wasn't executed and was
		 * successfully canceled. In this case we need to restart it
		 * with the minimal delta possible.
		 */
		if (delta >= 0)
			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
			    delta ? delta : 1);

		ASSERT(connp->conn_ref >= 3);
		goto finish;
	}

	/* Detach did not complete. Still need to remove q from stream. */
	if (msg) {
		if (tcp->tcp_state == TCPS_ESTABLISHED ||
		    tcp->tcp_state == TCPS_CLOSE_WAIT)
			TCPS_BUMP_MIB(tcps, tcpEstabResets);
		if (tcp->tcp_state == TCPS_SYN_SENT ||
		    tcp->tcp_state == TCPS_SYN_RCVD)
			TCPS_BUMP_MIB(tcps, tcpAttemptFails);
		tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST);
	}

	tcp_closei_local(tcp);
	CONN_DEC_REF(connp);
	ASSERT(connp->conn_ref >= 2);

finish:
	mutex_enter(&tcp->tcp_closelock);
	/*
	 * Don't change the queues in the case of a listener that has
	 * eagers in its q or q0. It could surprise the eagers.
	 * Instead wait for the eagers outside the squeue.
	 */
	if (!tcp->tcp_wait_for_eagers) {
		tcp->tcp_detached = B_TRUE;
		connp->conn_rq = NULL;
		connp->conn_wq = NULL;
	}

	/* Signal tcp_close() to finish closing. */
	tcp->tcp_closed = 1;
	cv_signal(&tcp->tcp_closecv);
	mutex_exit(&tcp->tcp_closelock);
}

/* ARGSUSED */
void
tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	freemsg(mp);

	if (tcp->tcp_fused)
		tcp_unfuse(tcp);

	if (tcp_xmit_end(tcp) != 0) {
		/*
		 * We were crossing FINs and got a reset from
		 * the other side. Just ignore it.
		 */
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_shutdown_output() out of state %s",
			    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
		}
	}
}

#pragma inline(tcp_send_data)

void
tcp_send_data(tcp_t *tcp, mblk_t *mp)
{
	conn_t	*connp = tcp->tcp_connp;

	/*
	 * Check here to avoid sending zero-copy message down to IP when
	 * ZEROCOPY capability has been turned off. We only need to deal with
	 * the race condition between sockfs and the notification here.
	 * Since we have tried to backoff the tcp_xmit_head when turning
	 * zero-copy off and new messages in tcp_output(), we simply drop
	 * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean
	 * is not true.
	 */
	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on &&
	    !tcp->tcp_xmit_zc_clean) {
		ip_drop_output("TCP ZC was disabled but not clean", mp, NULL);
		freemsg(mp);
		return;
	}

	ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp);
	(void) conn_ip_output(mp, connp->conn_ixa);
}

/* ARGSUSED2 */
void
tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*econnp = (conn_t *)arg;
	tcp_t	*tcp = econnp->conn_tcp;

	/* Guard against a RST having blown it away while on the squeue */
	if (tcp->tcp_state == TCPS_CLOSED) {
		freemsg(mp);
		return;
	}

	(void) conn_ip_output(mp, econnp->conn_ixa);
}

/*
 * tcp_send() is called by tcp_wput_data() and returns one of the following:
 *
 * -1 = failed allocation.
 *  0 = success; burst count reached, or usable send window is too small,
 *      and we'd rather wait until later before sending again.
 */
static int
tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
    const int tcp_hdr_len, const int num_sack_blk, int *usable,
    uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
{
	int		num_burst_seg = tcp->tcp_snd_burst;
	int		num_lso_seg = 1;
	uint_t		lso_usable;
	boolean_t	do_lso_send = B_FALSE;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;
	ip_xmit_attr_t	*ixa = connp->conn_ixa;

	/*
	 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
	 * the underlying connection is LSO capable. Will check whether having
	 * enough available data to initiate LSO transmission in the for(){}
	 * loops.
	 */
	if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
		do_lso_send = B_TRUE;

	for (;;) {
		struct datab	*db;
		tcpha_t		*tcpha;
		uint32_t	sum;
		mblk_t		*mp, *mp1;
		uchar_t		*rptr;
		int		len;

		/*
		 * Burst count reached, return successfully.
		 */
		if (num_burst_seg == 0)
			break;

		/*
		 * Calculate the maximum payload length we can send at one
		 * time.
		 */
		if (do_lso_send) {
			/*
			 * Check whether we are able to do LSO for the
			 * current available data.
			 */
			if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
				lso_usable = MIN(tcp->tcp_lso_max, *usable);
				lso_usable = MIN(lso_usable,
				    num_burst_seg * mss);

				num_lso_seg = lso_usable / mss;
				if (lso_usable % mss) {
					num_lso_seg++;
					tcp->tcp_last_sent_len = (ushort_t)
					    (lso_usable % mss);
				} else {
					tcp->tcp_last_sent_len = (ushort_t)mss;
				}
			} else {
				do_lso_send = B_FALSE;
				num_lso_seg = 1;
				lso_usable = mss;
			}
		}

		ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
#ifdef DEBUG
		DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t,
		    do_lso_send);
#endif
		/*
		 * Adjust num_burst_seg here.
		 */
		num_burst_seg -= num_lso_seg;

		len = mss;
		if (len > *usable) {
			ASSERT(do_lso_send == B_FALSE);

			len = *usable;
			if (len <= 0) {
				/* Terminate the loop */
				break;	/* success; too small */
			}
			/*
			 * Sender silly-window avoidance.
			 * Ignore this if we are going to send a
			 * zero window probe out.
			 *
			 * TODO: force data into microscopic window?
			 *	==> (!pushed || (unsent > usable))
			 */
			if (len < (tcp->tcp_max_swnd >> 1) &&
			    (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
			    !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
			    len == 1) && (! tcp->tcp_zero_win_probe)) {
				/*
				 * If the retransmit timer is not running
				 * we start it so that we will retransmit
				 * in the case when the receiver has
				 * decremented the window.
				 */
				if (*snxt == tcp->tcp_snxt &&
				    *snxt == tcp->tcp_suna) {
					/*
					 * We are not supposed to send
					 * anything.  So let's wait a little
					 * bit longer before breaking SWS
					 * avoidance.
					 *
					 * What should the value be?
					 * Suggestion: MAX(init rexmit time,
					 * tcp->tcp_rto)
					 */
					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
				}
				break;	/* success; too small */
			}
		}

		tcpha = tcp->tcp_tcpha;

		/*
		 * The reason to adjust len here is that we need to set flags
		 * and calculate checksum.
		 */
		if (do_lso_send)
			len = lso_usable;

		*usable -= len; /* Approximate - can be adjusted later */
		if (*usable > 0)
			tcpha->tha_flags = TH_ACK;
		else
			tcpha->tha_flags = (TH_ACK | TH_PUSH);

		/*
		 * Prime pump for IP's checksumming on our behalf.
		 * Include the adjustment for a source route if any.
		 * In case of LSO, the partial pseudo-header checksum should
		 * exclude the TCP length, so zero tha_sum before IP
		 * calculates the pseudo-header checksum for partial checksum
		 * offload.
		 */
		if (do_lso_send) {
			sum = 0;
		} else {
			sum = len + tcp_hdr_len + connp->conn_sum;
			sum = (sum >> 16) + (sum & 0xFFFF);
		}
		tcpha->tha_sum = htons(sum);
		tcpha->tha_seq = htonl(*snxt);

		/*
		 * Branch off to tcp_xmit_mp() if any of the VALID bits is
		 * set. For the case when TCP_FSS_VALID is the only valid
		 * bit (normal active close), branch off only when we think
		 * that the FIN flag needs to be set. Note for this case,
		 * that (snxt + len) may not reflect the actual seg_len,
		 * as len may be further reduced in tcp_xmit_mp(). If len
		 * gets modified, we will end up here again.
		 */
		if (tcp->tcp_valid_bits != 0 &&
		    (tcp->tcp_valid_bits != TCP_FSS_VALID ||
		    ((*snxt + len) == tcp->tcp_fss))) {
			uchar_t		*prev_rptr;
			uint32_t	prev_snxt = tcp->tcp_snxt;

			if (*tail_unsent == 0) {
				ASSERT((*xmit_tail)->b_cont != NULL);
				*xmit_tail = (*xmit_tail)->b_cont;
				prev_rptr = (*xmit_tail)->b_rptr;
				*tail_unsent = (int)((*xmit_tail)->b_wptr -
				    (*xmit_tail)->b_rptr);
			} else {
				prev_rptr = (*xmit_tail)->b_rptr;
				(*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr -
				    *tail_unsent;
			}
			mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL,
			    *snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
			/* Restore tcp_snxt so we get amount sent right. */
			tcp->tcp_snxt = prev_snxt;
			if (prev_rptr == (*xmit_tail)->b_rptr) {
				/*
				 * If the previous timestamp is still in use,
				 * don't stomp on it.
				 */
				if ((*xmit_tail)->b_next == NULL) {
					(*xmit_tail)->b_prev = local_time;
					(*xmit_tail)->b_next =
					    (mblk_t *)(uintptr_t)(*snxt);
				}
			} else
				(*xmit_tail)->b_rptr = prev_rptr;

			if (mp == NULL) {
				return (-1);
			}
			mp1 = mp->b_cont;

			if (len <= mss) /* LSO is unusable (!do_lso_send) */
				tcp->tcp_last_sent_len = (ushort_t)len;
			while (mp1->b_cont) {
				*xmit_tail = (*xmit_tail)->b_cont;
				(*xmit_tail)->b_prev = local_time;
				(*xmit_tail)->b_next =
				    (mblk_t *)(uintptr_t)(*snxt);
				mp1 = mp1->b_cont;
			}
			*snxt += len;
			*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
			BUMP_LOCAL(tcp->tcp_obsegs);
			TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
			TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
			tcp_send_data(tcp, mp);
			continue;
		}

		*snxt += len;	/* Adjust later if we don't send all of len */
		TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
		TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);

		if (*tail_unsent) {
			/* Are the bytes above us in flight? */
			rptr = (*xmit_tail)->b_wptr - *tail_unsent;
			if (rptr != (*xmit_tail)->b_rptr) {
				*tail_unsent -= len;
				if (len <= mss) /* LSO is unusable */
					tcp->tcp_last_sent_len = (ushort_t)len;
				len += total_hdr_len;
				ixa->ixa_pktlen = len;

				if (ixa->ixa_flags & IXAF_IS_IPV4) {
					tcp->tcp_ipha->ipha_length = htons(len);
				} else {
					tcp->tcp_ip6h->ip6_plen =
					    htons(len - IPV6_HDR_LEN);
				}

				mp = dupb(*xmit_tail);
				if (mp == NULL) {
					return (-1);	/* out_of_mem */
				}
				mp->b_rptr = rptr;
				/*
				 * If the old timestamp is no longer in use,
				 * sample a new timestamp now.
				 */
				if ((*xmit_tail)->b_next == NULL) {
					(*xmit_tail)->b_prev = local_time;
					(*xmit_tail)->b_next =
					    (mblk_t *)(uintptr_t)(*snxt-len);
				}
				goto must_alloc;
			}
		} else {
			*xmit_tail = (*xmit_tail)->b_cont;
			ASSERT((uintptr_t)((*xmit_tail)->b_wptr -
			    (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX);
			*tail_unsent = (int)((*xmit_tail)->b_wptr -
			    (*xmit_tail)->b_rptr);
		}

		(*xmit_tail)->b_prev = local_time;
		(*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len);

		*tail_unsent -= len;
		if (len <= mss) /* LSO is unusable (!do_lso_send) */
			tcp->tcp_last_sent_len = (ushort_t)len;

		len += total_hdr_len;
		ixa->ixa_pktlen = len;

		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			tcp->tcp_ipha->ipha_length = htons(len);
		} else {
			tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
		}

		mp = dupb(*xmit_tail);
		if (mp == NULL) {
			return (-1);	/* out_of_mem */
		}

		len = total_hdr_len;
		/*
		 * There are four reasons to allocate a new hdr mblk:
		 * 1) The bytes above us are in use by another packet
		 * 2) We don't have good alignment
		 * 3) The mblk is being shared
		 * 4) We don't have enough room for a header
		 */
		rptr = mp->b_rptr - len;
		if (!OK_32PTR(rptr) ||
		    ((db = mp->b_datap), db->db_ref != 2) ||
		    rptr < db->db_base) {
			/* NOTE: we assume allocb returns an OK_32PTR */

		must_alloc:;
			mp1 = allocb(connp->conn_ht_iphc_allocated +
			    tcps->tcps_wroff_xtra, BPRI_MED);
			if (mp1 == NULL) {
				freemsg(mp);
				return (-1);	/* out_of_mem */
			}
			mp1->b_cont = mp;
			mp = mp1;
			/* Leave room for Link Level header */
			len = total_hdr_len;
			rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
&mp->b_rptr[tcps->tcps_wroff_xtra]; 2044 mp->b_wptr = &rptr[len]; 2045 } 2046 2047 /* 2048 * Fill in the header using the template header, and add 2049 * options such as time-stamp, ECN and/or SACK, as needed. 2050 */ 2051 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); 2052 2053 mp->b_rptr = rptr; 2054 2055 if (*tail_unsent) { 2056 int spill = *tail_unsent; 2057 2058 mp1 = mp->b_cont; 2059 if (mp1 == NULL) 2060 mp1 = mp; 2061 2062 /* 2063 * If we're a little short, tack on more mblks until 2064 * there is no more spillover. 2065 */ 2066 while (spill < 0) { 2067 mblk_t *nmp; 2068 int nmpsz; 2069 2070 nmp = (*xmit_tail)->b_cont; 2071 nmpsz = MBLKL(nmp); 2072 2073 /* 2074 * Excess data in mblk; can we split it? 2075 * If LSO is enabled for the connection, 2076 * keep on splitting as this is a transient 2077 * send path. 2078 */ 2079 if (!do_lso_send && (spill + nmpsz > 0)) { 2080 /* 2081 * Don't split if stream head was 2082 * told to break up larger writes 2083 * into smaller ones. 2084 */ 2085 if (tcp->tcp_maxpsz_multiplier > 0) 2086 break; 2087 2088 /* 2089 * Next mblk is less than SMSS/2 2090 * rounded up to nearest 64-byte; 2091 * let it get sent as part of the 2092 * next segment. 2093 */ 2094 if (tcp->tcp_localnet && 2095 !tcp->tcp_cork && 2096 (nmpsz < roundup((mss >> 1), 64))) 2097 break; 2098 } 2099 2100 *xmit_tail = nmp; 2101 ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); 2102 /* Stash for rtt use later */ 2103 (*xmit_tail)->b_prev = local_time; 2104 (*xmit_tail)->b_next = 2105 (mblk_t *)(uintptr_t)(*snxt - len); 2106 mp1->b_cont = dupb(*xmit_tail); 2107 mp1 = mp1->b_cont; 2108 2109 spill += nmpsz; 2110 if (mp1 == NULL) { 2111 *tail_unsent = spill; 2112 freemsg(mp); 2113 return (-1); /* out_of_mem */ 2114 } 2115 } 2116 2117 /* Trim back any surplus on the last mblk */ 2118 if (spill >= 0) { 2119 mp1->b_wptr -= spill; 2120 *tail_unsent = spill; 2121 } else { 2122 /* 2123 * We did not send everything we could in 2124 * order to remain within the b_cont limit. 2125 */ 2126 *usable -= spill; 2127 *snxt += spill; 2128 tcp->tcp_last_sent_len += spill; 2129 TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill); 2130 /* 2131 * Adjust the checksum 2132 */ 2133 tcpha = (tcpha_t *)(rptr + 2134 ixa->ixa_ip_hdr_length); 2135 sum += spill; 2136 sum = (sum >> 16) + (sum & 0xFFFF); 2137 tcpha->tha_sum = htons(sum); 2138 if (connp->conn_ipversion == IPV4_VERSION) { 2139 sum = ntohs( 2140 ((ipha_t *)rptr)->ipha_length) + 2141 spill; 2142 ((ipha_t *)rptr)->ipha_length = 2143 htons(sum); 2144 } else { 2145 sum = ntohs( 2146 ((ip6_t *)rptr)->ip6_plen) + 2147 spill; 2148 ((ip6_t *)rptr)->ip6_plen = 2149 htons(sum); 2150 } 2151 ixa->ixa_pktlen += spill; 2152 *tail_unsent = 0; 2153 } 2154 } 2155 if (tcp->tcp_ip_forward_progress) { 2156 tcp->tcp_ip_forward_progress = B_FALSE; 2157 ixa->ixa_flags |= IXAF_REACH_CONF; 2158 } else { 2159 ixa->ixa_flags &= ~IXAF_REACH_CONF; 2160 } 2161 2162 if (do_lso_send) { 2163 /* Append LSO information to the mp. */ 2164 lso_info_set(mp, mss, HW_LSO); 2165 ixa->ixa_fragsize = IP_MAXPACKET; 2166 ixa->ixa_extra_ident = num_lso_seg - 1; 2167 2168 DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, 2169 boolean_t, B_TRUE); 2170 2171 tcp_send_data(tcp, mp); 2172 2173 /* 2174 * Restore values of ixa_fragsize and ixa_extra_ident. 
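 *
 * For the LSO send above, ixa_fragsize was raised to IP_MAXPACKET so
 * that IP would pass the oversized buffer down unfragmented, and
 * ixa_extra_ident reserved (num_lso_seg - 1) additional IPv4 ident
 * values for the segments the hardware will cut.  As a rough worked
 * example (illustrative numbers only): with mss = 1460 and 5840
 * bytes of payload, num_lso_seg = 4, so ixa_extra_ident = 3 and the
 * NIC emits four wire segments, each carrying its own ident.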
2175 */ 2176 ixa->ixa_fragsize = ixa->ixa_pmtu; 2177 ixa->ixa_extra_ident = 0; 2178 tcp->tcp_obsegs += num_lso_seg; 2179 TCP_STAT(tcps, tcp_lso_times); 2180 TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); 2181 } else { 2182 /* 2183 * Make sure to clean up LSO information. Wherever a 2184 * new mp uses the prepended header room after dupb(), 2185 * lso_info_cleanup() should be called. 2186 */ 2187 lso_info_cleanup(mp); 2188 tcp_send_data(tcp, mp); 2189 BUMP_LOCAL(tcp->tcp_obsegs); 2190 } 2191 } 2192 2193 return (0); 2194 } 2195 2196 /* 2197 * Initiate closedown sequence on an active connection. (May be called as 2198 * writer.) Return value zero for OK return, non-zero for error return. 2199 */ 2200 static int 2201 tcp_xmit_end(tcp_t *tcp) 2202 { 2203 mblk_t *mp; 2204 tcp_stack_t *tcps = tcp->tcp_tcps; 2205 iulp_t uinfo; 2206 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 2207 conn_t *connp = tcp->tcp_connp; 2208 2209 if (tcp->tcp_state < TCPS_SYN_RCVD || 2210 tcp->tcp_state > TCPS_CLOSE_WAIT) { 2211 /* 2212 * Invalid state, only states TCPS_SYN_RCVD, 2213 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 2214 */ 2215 return (-1); 2216 } 2217 2218 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 2219 tcp->tcp_valid_bits |= TCP_FSS_VALID; 2220 /* 2221 * If there is nothing more unsent, send the FIN now. 2222 * Otherwise, it will go out with the last segment. 2223 */ 2224 if (tcp->tcp_unsent == 0) { 2225 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 2226 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 2227 2228 if (mp) { 2229 tcp_send_data(tcp, mp); 2230 } else { 2231 /* 2232 * Couldn't allocate msg. Pretend we got it out. 2233 * Wait for rexmit timeout. 2234 */ 2235 tcp->tcp_snxt = tcp->tcp_fss + 1; 2236 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2237 } 2238 2239 /* 2240 * If needed, update tcp_rexmit_snxt as tcp_snxt is 2241 * changed. 2242 */ 2243 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 2244 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2245 } 2246 } else { 2247 /* 2248 * If tcp->tcp_cork is set, then the data will not get sent, 2249 * so we have to check that and unset it first. 2250 */ 2251 if (tcp->tcp_cork) 2252 tcp->tcp_cork = B_FALSE; 2253 tcp_wput_data(tcp, NULL, B_FALSE); 2254 } 2255 2256 /* 2257 * If TCP does not get enough samples of RTT or tcp_rtt_updates 2258 * is 0, don't update the cache. 2259 */ 2260 if (tcps->tcps_rtt_updates == 0 || 2261 tcp->tcp_rtt_update < tcps->tcps_rtt_updates) 2262 return (0); 2263 2264 /* 2265 * We do not have a good algorithm to update ssthresh at this time. 2266 * So don't do any update. 2267 */ 2268 bzero(&uinfo, sizeof (uinfo)); 2269 uinfo.iulp_rtt = tcp->tcp_rtt_sa; 2270 uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; 2271 2272 /* 2273 * Note that uinfo is kept for conn_faddr in the DCE. Could update even 2274 * if source routed but we don't. 
2275 */ 2276 if (connp->conn_ipversion == IPV4_VERSION) { 2277 if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { 2278 return (0); 2279 } 2280 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); 2281 } else { 2282 uint_t ifindex; 2283 2284 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 2285 &tcp->tcp_ip6h->ip6_dst))) { 2286 return (0); 2287 } 2288 ifindex = 0; 2289 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { 2290 ip_xmit_attr_t *ixa = connp->conn_ixa; 2291 2292 /* 2293 * If we are going to create a DCE we'd better have 2294 * an ifindex 2295 */ 2296 if (ixa->ixa_nce != NULL) { 2297 ifindex = ixa->ixa_nce->nce_common->ncec_ill-> 2298 ill_phyint->phyint_ifindex; 2299 } else { 2300 return (0); 2301 } 2302 } 2303 2304 (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, 2305 ipst); 2306 } 2307 return (0); 2308 } 2309 2310 /* 2311 * Send out a control packet on the tcp connection specified. This routine 2312 * is typically called where we need a simple ACK or RST generated. 2313 */ 2314 void 2315 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) 2316 { 2317 uchar_t *rptr; 2318 tcpha_t *tcpha; 2319 ipha_t *ipha = NULL; 2320 ip6_t *ip6h = NULL; 2321 uint32_t sum; 2322 int total_hdr_len; 2323 int ip_hdr_len; 2324 mblk_t *mp; 2325 tcp_stack_t *tcps = tcp->tcp_tcps; 2326 conn_t *connp = tcp->tcp_connp; 2327 ip_xmit_attr_t *ixa = connp->conn_ixa; 2328 2329 /* 2330 * Save sum for use in source route later. 2331 */ 2332 sum = connp->conn_ht_ulp_len + connp->conn_sum; 2333 total_hdr_len = connp->conn_ht_iphc_len; 2334 ip_hdr_len = ixa->ixa_ip_hdr_length; 2335 2336 /* If a text string is passed in with the request, pass it to strlog. */ 2337 if (str != NULL && connp->conn_debug) { 2338 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 2339 "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", 2340 str, seq, ack, ctl); 2341 } 2342 mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, 2343 BPRI_MED); 2344 if (mp == NULL) { 2345 return; 2346 } 2347 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; 2348 mp->b_rptr = rptr; 2349 mp->b_wptr = &rptr[total_hdr_len]; 2350 bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); 2351 2352 ixa->ixa_pktlen = total_hdr_len; 2353 2354 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2355 ipha = (ipha_t *)rptr; 2356 ipha->ipha_length = htons(total_hdr_len); 2357 } else { 2358 ip6h = (ip6_t *)rptr; 2359 ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); 2360 } 2361 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2362 tcpha->tha_flags = (uint8_t)ctl; 2363 if (ctl & TH_RST) { 2364 TCPS_BUMP_MIB(tcps, tcpOutRsts); 2365 TCPS_BUMP_MIB(tcps, tcpOutControl); 2366 /* 2367 * Don't send TSopt w/ TH_RST packets per RFC 1323. 
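 *
 * A quick sanity check on the adjustment below: TCPOPT_REAL_TS_LEN is
 * the 12-byte timestamp option (two NOPs plus the 10-byte option
 * proper), i.e. three 32-bit words.  Dropping it shrinks the TCP
 * header by 3 words, which is why the data offset nibble is
 * decremented by (3 << 4) and 12 bytes are backed out of both the
 * IP length and the pseudo-header sum.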
2368 */ 2369 if (tcp->tcp_snd_ts_ok && 2370 tcp->tcp_state > TCPS_SYN_SENT) { 2371 mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; 2372 *(mp->b_wptr) = TCPOPT_EOL; 2373 2374 ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; 2375 2376 if (connp->conn_ipversion == IPV4_VERSION) { 2377 ipha->ipha_length = htons(total_hdr_len - 2378 TCPOPT_REAL_TS_LEN); 2379 } else { 2380 ip6h->ip6_plen = htons(total_hdr_len - 2381 IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); 2382 } 2383 tcpha->tha_offset_and_reserved -= (3 << 4); 2384 sum -= TCPOPT_REAL_TS_LEN; 2385 } 2386 } 2387 if (ctl & TH_ACK) { 2388 if (tcp->tcp_snd_ts_ok) { 2389 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 2390 2391 U32_TO_BE32(llbolt, 2392 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 2393 U32_TO_BE32(tcp->tcp_ts_recent, 2394 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 2395 } 2396 2397 /* Update the latest receive window size in TCP header. */ 2398 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 2399 /* Track what we sent to the peer */ 2400 tcp->tcp_tcpha->tha_win = tcpha->tha_win; 2401 tcp->tcp_rack = ack; 2402 tcp->tcp_rack_cnt = 0; 2403 TCPS_BUMP_MIB(tcps, tcpOutAck); 2404 } 2405 BUMP_LOCAL(tcp->tcp_obsegs); 2406 tcpha->tha_seq = htonl(seq); 2407 tcpha->tha_ack = htonl(ack); 2408 /* 2409 * Include the adjustment for a source route if any. 2410 */ 2411 sum = (sum >> 16) + (sum & 0xFFFF); 2412 tcpha->tha_sum = htons(sum); 2413 tcp_send_data(tcp, mp); 2414 } 2415 2416 /* 2417 * Generate a reset based on an inbound packet, connp is set by caller 2418 * when RST is in response to an unexpected inbound packet for which 2419 * there is active tcp state in the system. 2420 * 2421 * IPSEC NOTE : Try to send the reply with the same protection as it came 2422 * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. 2423 * That way the packet will go out at the same level of protection as it 2424 * came in with. 2425 */ 2426 static void 2427 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, 2428 ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) 2429 { 2430 ipha_t *ipha = NULL; 2431 ip6_t *ip6h = NULL; 2432 ushort_t len; 2433 tcpha_t *tcpha; 2434 int i; 2435 ipaddr_t v4addr; 2436 in6_addr_t v6addr; 2437 netstack_t *ns = ipst->ips_netstack; 2438 tcp_stack_t *tcps = ns->netstack_tcp; 2439 ip_xmit_attr_t ixas, *ixa; 2440 uint_t ip_hdr_len = ira->ira_ip_hdr_length; 2441 boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ 2442 ushort_t port; 2443 2444 if (!tcp_send_rst_chk(tcps)) { 2445 TCP_STAT(tcps, tcp_rst_unsent); 2446 freemsg(mp); 2447 return; 2448 } 2449 2450 /* 2451 * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other 2452 * options from the listener. In that case the caller must ensure that 2453 * we are running on the listener = connp squeue. 2454 * 2455 * We get a safe copy of conn_ixa so we don't need to restore anything 2456 * we or ip_output_simple might change in the ixa. 2457 */ 2458 if (connp != NULL) { 2459 ASSERT(connp->conn_on_sqp); 2460 2461 ixa = conn_get_ixa_exclusive(connp); 2462 if (ixa == NULL) { 2463 TCP_STAT(tcps, tcp_rst_unsent); 2464 freemsg(mp); 2465 return; 2466 } 2467 need_refrele = B_TRUE; 2468 } else { 2469 bzero(&ixas, sizeof (ixas)); 2470 ixa = &ixas; 2471 /* 2472 * IXAF_VERIFY_SOURCE is overkill since we know the 2473 * packet was for us. 
2474 */ 2475 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; 2476 ixa->ixa_protocol = IPPROTO_TCP; 2477 ixa->ixa_zoneid = ira->ira_zoneid; 2478 ixa->ixa_ifindex = 0; 2479 ixa->ixa_ipst = ipst; 2480 ixa->ixa_cred = kcred; 2481 ixa->ixa_cpid = NOPID; 2482 } 2483 2484 if (str && tcps->tcps_dbg) { 2485 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 2486 "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 2487 "flags 0x%x", 2488 str, seq, ack, ctl); 2489 } 2490 if (mp->b_datap->db_ref != 1) { 2491 mblk_t *mp1 = copyb(mp); 2492 freemsg(mp); 2493 mp = mp1; 2494 if (mp == NULL) 2495 goto done; 2496 } else if (mp->b_cont) { 2497 freemsg(mp->b_cont); 2498 mp->b_cont = NULL; 2499 DB_CKSUMFLAGS(mp) = 0; 2500 } 2501 /* 2502 * We skip reversing source route here. 2503 * (for now we replace all IP options with EOL) 2504 */ 2505 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 2506 ipha = (ipha_t *)mp->b_rptr; 2507 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 2508 mp->b_rptr[i] = IPOPT_EOL; 2509 /* 2510 * Make sure that src address isn't flagrantly invalid. 2511 * Not all broadcast address checking for the src address 2512 * is possible, since we don't know the netmask of the src 2513 * addr. No check for destination address is done, since 2514 * IP will not pass up a packet with a broadcast dest 2515 * address to TCP. Similar checks are done below for IPv6. 2516 */ 2517 if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || 2518 CLASSD(ipha->ipha_src)) { 2519 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 2520 ip_drop_input("ipIfStatsInDiscards", mp, NULL); 2521 freemsg(mp); 2522 goto done; 2523 } 2524 } else { 2525 ip6h = (ip6_t *)mp->b_rptr; 2526 2527 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || 2528 IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { 2529 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); 2530 ip_drop_input("ipIfStatsInDiscards", mp, NULL); 2531 freemsg(mp); 2532 goto done; 2533 } 2534 2535 /* Remove any extension headers assuming partial overlay */ 2536 if (ip_hdr_len > IPV6_HDR_LEN) { 2537 uint8_t *to; 2538 2539 to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; 2540 ovbcopy(ip6h, to, IPV6_HDR_LEN); 2541 mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; 2542 ip_hdr_len = IPV6_HDR_LEN; 2543 ip6h = (ip6_t *)mp->b_rptr; 2544 ip6h->ip6_nxt = IPPROTO_TCP; 2545 } 2546 } 2547 tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; 2548 if (tcpha->tha_flags & TH_RST) { 2549 freemsg(mp); 2550 goto done; 2551 } 2552 tcpha->tha_offset_and_reserved = (5 << 4); 2553 len = ip_hdr_len + sizeof (tcpha_t); 2554 mp->b_wptr = &mp->b_rptr[len]; 2555 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 2556 ipha->ipha_length = htons(len); 2557 /* Swap addresses */ 2558 v4addr = ipha->ipha_src; 2559 ipha->ipha_src = ipha->ipha_dst; 2560 ipha->ipha_dst = v4addr; 2561 ipha->ipha_ident = 0; 2562 ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; 2563 ixa->ixa_flags |= IXAF_IS_IPV4; 2564 ixa->ixa_ip_hdr_length = ip_hdr_len; 2565 } else { 2566 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2567 /* Swap addresses */ 2568 v6addr = ip6h->ip6_src; 2569 ip6h->ip6_src = ip6h->ip6_dst; 2570 ip6h->ip6_dst = v6addr; 2571 ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; 2572 ixa->ixa_flags &= ~IXAF_IS_IPV4; 2573 2574 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { 2575 ixa->ixa_flags |= IXAF_SCOPEID_SET; 2576 ixa->ixa_scopeid = ira->ira_ruifindex; 2577 } 2578 ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; 2579 } 2580 ixa->ixa_pktlen = len; 2581 2582 /* Swap the ports */ 2583 port = tcpha->tha_fport; 2584 tcpha->tha_fport = 
tcpha->tha_lport; 2585 tcpha->tha_lport = port; 2586 2587 tcpha->tha_ack = htonl(ack); 2588 tcpha->tha_seq = htonl(seq); 2589 tcpha->tha_win = 0; 2590 tcpha->tha_sum = htons(sizeof (tcpha_t)); 2591 tcpha->tha_flags = (uint8_t)ctl; 2592 if (ctl & TH_RST) { 2593 TCPS_BUMP_MIB(tcps, tcpOutRsts); 2594 TCPS_BUMP_MIB(tcps, tcpOutControl); 2595 } 2596 2597 /* Discard any old label */ 2598 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 2599 ASSERT(ixa->ixa_tsl != NULL); 2600 label_rele(ixa->ixa_tsl); 2601 ixa->ixa_free_flags &= ~IXA_FREE_TSL; 2602 } 2603 ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ 2604 2605 if (ira->ira_flags & IRAF_IPSEC_SECURE) { 2606 /* 2607 * Apply IPsec based on how IPsec was applied to 2608 * the packet that caused the RST. 2609 */ 2610 if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { 2611 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 2612 /* Note: mp already consumed and ip_drop_packet done */ 2613 goto done; 2614 } 2615 } else { 2616 /* 2617 * This is in clear. The RST message we are building 2618 * here should go out in clear, independent of our policy. 2619 */ 2620 ixa->ixa_flags |= IXAF_NO_IPSEC; 2621 } 2622 2623 /* 2624 * NOTE: one might consider tracing a TCP packet here, but 2625 * this function has no active TCP state and no tcp structure 2626 * that has a trace buffer. If we traced here, we would have 2627 * to keep a local trace buffer in tcp_record_trace(). 2628 */ 2629 2630 (void) ip_output_simple(mp, ixa); 2631 done: 2632 ixa_cleanup(ixa); 2633 if (need_refrele) { 2634 ASSERT(ixa != &ixas); 2635 ixa_refrele(ixa); 2636 } 2637 } 2638 2639 /* 2640 * Generate a "no listener here" RST in response to an "unknown" segment. 2641 * connp is set by caller when RST is in response to an unexpected 2642 * inbound packet for which there is active tcp state in the system. 2643 * Note that we are reusing the incoming mp to construct the outgoing RST. 2644 */ 2645 void 2646 tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, 2647 conn_t *connp) 2648 { 2649 uchar_t *rptr; 2650 uint32_t seg_len; 2651 tcpha_t *tcpha; 2652 uint32_t seg_seq; 2653 uint32_t seg_ack; 2654 uint_t flags; 2655 ipha_t *ipha; 2656 ip6_t *ip6h; 2657 boolean_t policy_present; 2658 netstack_t *ns = ipst->ips_netstack; 2659 tcp_stack_t *tcps = ns->netstack_tcp; 2660 ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; 2661 uint_t ip_hdr_len = ira->ira_ip_hdr_length; 2662 2663 TCP_STAT(tcps, tcp_no_listener); 2664 2665 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 2666 policy_present = ipss->ipsec_inbound_v4_policy_present; 2667 ipha = (ipha_t *)mp->b_rptr; 2668 ip6h = NULL; 2669 } else { 2670 policy_present = ipss->ipsec_inbound_v6_policy_present; 2671 ipha = NULL; 2672 ip6h = (ip6_t *)mp->b_rptr; 2673 } 2674 2675 if (policy_present) { 2676 /* 2677 * The conn_t parameter is NULL because we already know 2678 * nobody's home. 
2679 */ 2680 mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, 2681 ira, ns); 2682 if (mp == NULL) 2683 return; 2684 } 2685 if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { 2686 DTRACE_PROBE2( 2687 tx__ip__log__error__nolistener__tcp, 2688 char *, "Could not reply with RST to mp(1)", 2689 mblk_t *, mp); 2690 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); 2691 freemsg(mp); 2692 return; 2693 } 2694 2695 rptr = mp->b_rptr; 2696 2697 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 2698 seg_seq = ntohl(tcpha->tha_seq); 2699 seg_ack = ntohl(tcpha->tha_ack); 2700 flags = tcpha->tha_flags; 2701 2702 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); 2703 if (flags & TH_RST) { 2704 freemsg(mp); 2705 } else if (flags & TH_ACK) { 2706 tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, 2707 ira, ipst, connp); 2708 } else { 2709 if (flags & TH_SYN) { 2710 seg_len++; 2711 } else { 2712 /* 2713 * Here we violate the RFC. Note that a normal 2714 * TCP will never send a segment without the ACK 2715 * flag, except for RST or SYN segment. This 2716 * segment is neither. Just drop it on the 2717 * floor. 2718 */ 2719 freemsg(mp); 2720 TCP_STAT(tcps, tcp_rst_unsent); 2721 return; 2722 } 2723 2724 tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, 2725 seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); 2726 } 2727 } 2728 2729 /* 2730 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 2731 * ip and tcp header ready to pass down to IP. If the mp passed in is 2732 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 2733 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 2734 * otherwise it will dup partial mblks.) 2735 * Otherwise, an appropriate ACK packet will be generated. This 2736 * routine is not usually called to send new data for the first time. It 2737 * is mostly called out of the timer for retransmits, and to generate ACKs. 2738 * 2739 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 2740 * be adjusted by *offset. And after dupb(), the offset and the ending mblk 2741 * of the original mblk chain will be returned in *offset and *end_mp. 2742 */ 2743 mblk_t * 2744 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 2745 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 2746 boolean_t rexmit) 2747 { 2748 int data_length; 2749 int32_t off = 0; 2750 uint_t flags; 2751 mblk_t *mp1; 2752 mblk_t *mp2; 2753 uchar_t *rptr; 2754 tcpha_t *tcpha; 2755 int32_t num_sack_blk = 0; 2756 int32_t sack_opt_len = 0; 2757 tcp_stack_t *tcps = tcp->tcp_tcps; 2758 conn_t *connp = tcp->tcp_connp; 2759 ip_xmit_attr_t *ixa = connp->conn_ixa; 2760 2761 /* Allocate for our maximum TCP header + link-level */ 2762 mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, 2763 BPRI_MED); 2764 if (!mp1) 2765 return (NULL); 2766 data_length = 0; 2767 2768 /* 2769 * Note that tcp_mss has been adjusted to take into account the 2770 * timestamp option if applicable. Because SACK options do not 2771 * appear in every TCP segments and they are of variable lengths, 2772 * they cannot be included in tcp_mss. Thus we need to calculate 2773 * the actual segment length when we need to send a segment which 2774 * includes SACK options. 
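 *
 * For example, each sack_blk_t holds a begin/end pair of 32-bit
 * sequence numbers (8 bytes), so with three SACK blocks the option
 * below costs 2 NOPs + 2 bytes of option header + 3 * 8 = 28 bytes,
 * and max_to_send is trimmed by that amount when data plus options
 * would otherwise exceed tcp_mss.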
2775 */ 2776 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 2777 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 2778 tcp->tcp_num_sack_blk); 2779 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 2780 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 2781 if (max_to_send + sack_opt_len > tcp->tcp_mss) 2782 max_to_send -= sack_opt_len; 2783 } 2784 2785 if (offset != NULL) { 2786 off = *offset; 2787 /* We use offset as an indicator that end_mp is not NULL. */ 2788 *end_mp = NULL; 2789 } 2790 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 2791 /* This could be faster with cooperation from downstream */ 2792 if (mp2 != mp1 && !sendall && 2793 data_length + (int)(mp->b_wptr - mp->b_rptr) > 2794 max_to_send) 2795 /* 2796 * Don't send the next mblk since the whole mblk 2797 * does not fit. 2798 */ 2799 break; 2800 mp2->b_cont = dupb(mp); 2801 mp2 = mp2->b_cont; 2802 if (!mp2) { 2803 freemsg(mp1); 2804 return (NULL); 2805 } 2806 mp2->b_rptr += off; 2807 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 2808 (uintptr_t)INT_MAX); 2809 2810 data_length += (int)(mp2->b_wptr - mp2->b_rptr); 2811 if (data_length > max_to_send) { 2812 mp2->b_wptr -= data_length - max_to_send; 2813 data_length = max_to_send; 2814 off = mp2->b_wptr - mp->b_rptr; 2815 break; 2816 } else { 2817 off = 0; 2818 } 2819 } 2820 if (offset != NULL) { 2821 *offset = off; 2822 *end_mp = mp; 2823 } 2824 if (seg_len != NULL) { 2825 *seg_len = data_length; 2826 } 2827 2828 /* Update the latest receive window size in TCP header. */ 2829 tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 2830 2831 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 2832 mp1->b_rptr = rptr; 2833 mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; 2834 bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); 2835 tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; 2836 tcpha->tha_seq = htonl(seq); 2837 2838 /* 2839 * Using tcp_unsent to determine whether the PUSH bit should be set 2840 * assumes that this function was called from tcp_wput_data(). Thus, 2841 * when called to retransmit data, the setting of the PUSH bit may 2842 * appear somewhat random in that it might get set when it should 2843 * not. This should not pose any performance issues. 2844 */ 2845 if (data_length != 0 && (tcp->tcp_unsent == 0 || 2846 tcp->tcp_unsent == data_length)) { 2847 flags = TH_ACK | TH_PUSH; 2848 } else { 2849 flags = TH_ACK; 2850 } 2851 2852 if (tcp->tcp_ecn_ok) { 2853 if (tcp->tcp_ecn_echo_on) 2854 flags |= TH_ECE; 2855 2856 /* 2857 * Only set the ECT bit and ECN_CWR if a segment contains new 2858 * data. There is no TCP flow control for non-data segments, 2859 * and only data segments are transmitted reliably. 2860 */ 2861 if (data_length > 0 && !rexmit) { 2862 TCP_SET_ECT(tcp, rptr); 2863 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 2864 flags |= TH_CWR; 2865 tcp->tcp_ecn_cwr_sent = B_TRUE; 2866 } 2867 } 2868 } 2869 2870 if (tcp->tcp_valid_bits) { 2871 uint32_t u1; 2872 2873 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 2874 seq == tcp->tcp_iss) { 2875 uchar_t *wptr; 2876 2877 /* 2878 * If TCP_ISS_VALID and the seq number is tcp_iss, 2879 * TCP can only be in SYN-SENT, SYN-RCVD or 2880 * FIN-WAIT-1 state. It can be FIN-WAIT-1 if 2881 * our SYN is not ack'ed but the app closes this 2882 * TCP connection. 2883 */ 2884 ASSERT(tcp->tcp_state == TCPS_SYN_SENT || 2885 tcp->tcp_state == TCPS_SYN_RCVD || 2886 tcp->tcp_state == TCPS_FIN_WAIT_1); 2887 2888 /* 2889 * Tack on the MSS option. It is always needed 2890 * for both active and passive open.
2891 * 2892 * MSS option value should be interface MTU - MIN 2893 * TCP/IP header according to RFC 793 as it means 2894 * the maximum segment size TCP can receive. But 2895 * to get around some broken middle boxes/end hosts 2896 * out there, we allow the option value to be the 2897 * same as the MSS option size on the peer side. 2898 * In this way, the other side will not send 2899 * anything larger than they can receive. 2900 * 2901 * Note that for SYN_SENT state, the ndd param 2902 * tcp_use_smss_as_mss_opt has no effect as we 2903 * don't know the peer's MSS option value. So 2904 * the only case we need to take care of is in 2905 * SYN_RCVD state, which is done later. 2906 */ 2907 wptr = mp1->b_wptr; 2908 wptr[0] = TCPOPT_MAXSEG; 2909 wptr[1] = TCPOPT_MAXSEG_LEN; 2910 wptr += 2; 2911 u1 = tcp->tcp_initial_pmtu - 2912 (connp->conn_ipversion == IPV4_VERSION ? 2913 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - 2914 TCP_MIN_HEADER_LENGTH; 2915 U16_TO_BE16(u1, wptr); 2916 mp1->b_wptr = wptr + 2; 2917 /* Update the offset to cover the additional word */ 2918 tcpha->tha_offset_and_reserved += (1 << 4); 2919 2920 /* 2921 * Note that the following way of filling in 2922 * TCP options are not optimal. Some NOPs can 2923 * be saved. But there is no need at this time 2924 * to optimize it. When it is needed, we will 2925 * do it. 2926 */ 2927 switch (tcp->tcp_state) { 2928 case TCPS_SYN_SENT: 2929 flags = TH_SYN; 2930 2931 if (tcp->tcp_snd_ts_ok) { 2932 uint32_t llbolt = 2933 (uint32_t)LBOLT_FASTPATH; 2934 2935 wptr = mp1->b_wptr; 2936 wptr[0] = TCPOPT_NOP; 2937 wptr[1] = TCPOPT_NOP; 2938 wptr[2] = TCPOPT_TSTAMP; 2939 wptr[3] = TCPOPT_TSTAMP_LEN; 2940 wptr += 4; 2941 U32_TO_BE32(llbolt, wptr); 2942 wptr += 4; 2943 ASSERT(tcp->tcp_ts_recent == 0); 2944 U32_TO_BE32(0L, wptr); 2945 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 2946 tcpha->tha_offset_and_reserved += 2947 (3 << 4); 2948 } 2949 2950 /* 2951 * Set up all the bits to tell other side 2952 * we are ECN capable. 2953 */ 2954 if (tcp->tcp_ecn_ok) { 2955 flags |= (TH_ECE | TH_CWR); 2956 } 2957 break; 2958 case TCPS_SYN_RCVD: 2959 flags |= TH_SYN; 2960 2961 /* 2962 * Reset the MSS option value to be SMSS 2963 * We should probably add back the bytes 2964 * for timestamp option and IPsec. We 2965 * don't do that as this is a workaround 2966 * for broken middle boxes/end hosts, it 2967 * is better for us to be more cautious. 2968 * They may not take these things into 2969 * account in their SMSS calculation. Thus 2970 * the peer's calculated SMSS may be smaller 2971 * than what it can be. This should be OK. 2972 */ 2973 if (tcps->tcps_use_smss_as_mss_opt) { 2974 u1 = tcp->tcp_mss; 2975 U16_TO_BE16(u1, wptr); 2976 } 2977 2978 /* 2979 * If the other side is ECN capable, reply 2980 * that we are also ECN capable. 2981 */ 2982 if (tcp->tcp_ecn_ok) 2983 flags |= TH_ECE; 2984 break; 2985 default: 2986 /* 2987 * The above ASSERT() makes sure that this 2988 * must be FIN-WAIT-1 state. Our SYN has 2989 * not been ack'ed so retransmit it. 
2990 */ 2991 flags |= TH_SYN; 2992 break; 2993 } 2994 2995 if (tcp->tcp_snd_ws_ok) { 2996 wptr = mp1->b_wptr; 2997 wptr[0] = TCPOPT_NOP; 2998 wptr[1] = TCPOPT_WSCALE; 2999 wptr[2] = TCPOPT_WS_LEN; 3000 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 3001 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 3002 tcpha->tha_offset_and_reserved += (1 << 4); 3003 } 3004 3005 if (tcp->tcp_snd_sack_ok) { 3006 wptr = mp1->b_wptr; 3007 wptr[0] = TCPOPT_NOP; 3008 wptr[1] = TCPOPT_NOP; 3009 wptr[2] = TCPOPT_SACK_PERMITTED; 3010 wptr[3] = TCPOPT_SACK_OK_LEN; 3011 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 3012 tcpha->tha_offset_and_reserved += (1 << 4); 3013 } 3014 3015 /* allocb() of adequate mblk assures space */ 3016 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 3017 (uintptr_t)INT_MAX); 3018 u1 = (int)(mp1->b_wptr - mp1->b_rptr); 3019 /* 3020 * Get IP set to checksum on our behalf 3021 * Include the adjustment for a source route if any. 3022 */ 3023 u1 += connp->conn_sum; 3024 u1 = (u1 >> 16) + (u1 & 0xFFFF); 3025 tcpha->tha_sum = htons(u1); 3026 TCPS_BUMP_MIB(tcps, tcpOutControl); 3027 } 3028 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 3029 (seq + data_length) == tcp->tcp_fss) { 3030 if (!tcp->tcp_fin_acked) { 3031 flags |= TH_FIN; 3032 TCPS_BUMP_MIB(tcps, tcpOutControl); 3033 } 3034 if (!tcp->tcp_fin_sent) { 3035 tcp->tcp_fin_sent = B_TRUE; 3036 switch (tcp->tcp_state) { 3037 case TCPS_SYN_RCVD: 3038 case TCPS_ESTABLISHED: 3039 tcp->tcp_state = TCPS_FIN_WAIT_1; 3040 break; 3041 case TCPS_CLOSE_WAIT: 3042 tcp->tcp_state = TCPS_LAST_ACK; 3043 break; 3044 } 3045 if (tcp->tcp_suna == tcp->tcp_snxt) 3046 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3047 tcp->tcp_snxt = tcp->tcp_fss + 1; 3048 } 3049 } 3050 /* 3051 * Note the trick here. u1 is unsigned. When tcp_urg 3052 * is smaller than seq, u1 will become a very huge value. 3053 * So the comparison will fail. Also note that tcp_urp 3054 * should be positive, see RFC 793 page 17. 
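 *
 * A concrete illustration: if tcp_urg is, say, 10 bytes ahead of seq,
 * then u1 is a small positive value and the urgent pointer is sent in
 * tha_urp.  If tcp_urg has already fallen behind seq, the unsigned
 * subtraction wraps around to a value near 2^32, the "< 64K" test
 * below fails, and no URG flag is set for this segment.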
3055 */ 3056 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; 3057 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && 3058 u1 < (uint32_t)(64 * 1024)) { 3059 flags |= TH_URG; 3060 TCPS_BUMP_MIB(tcps, tcpOutUrg); 3061 tcpha->tha_urp = htons(u1); 3062 } 3063 } 3064 tcpha->tha_flags = (uchar_t)flags; 3065 tcp->tcp_rack = tcp->tcp_rnxt; 3066 tcp->tcp_rack_cnt = 0; 3067 3068 if (tcp->tcp_snd_ts_ok) { 3069 if (tcp->tcp_state != TCPS_SYN_SENT) { 3070 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 3071 3072 U32_TO_BE32(llbolt, 3073 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 3074 U32_TO_BE32(tcp->tcp_ts_recent, 3075 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 3076 } 3077 } 3078 3079 if (num_sack_blk > 0) { 3080 uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; 3081 sack_blk_t *tmp; 3082 int32_t i; 3083 3084 wptr[0] = TCPOPT_NOP; 3085 wptr[1] = TCPOPT_NOP; 3086 wptr[2] = TCPOPT_SACK; 3087 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 3088 sizeof (sack_blk_t); 3089 wptr += TCPOPT_REAL_SACK_LEN; 3090 3091 tmp = tcp->tcp_sack_list; 3092 for (i = 0; i < num_sack_blk; i++) { 3093 U32_TO_BE32(tmp[i].begin, wptr); 3094 wptr += sizeof (tcp_seq); 3095 U32_TO_BE32(tmp[i].end, wptr); 3096 wptr += sizeof (tcp_seq); 3097 } 3098 tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); 3099 } 3100 ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 3101 data_length += (int)(mp1->b_wptr - rptr); 3102 3103 ixa->ixa_pktlen = data_length; 3104 3105 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3106 ((ipha_t *)rptr)->ipha_length = htons(data_length); 3107 } else { 3108 ip6_t *ip6 = (ip6_t *)rptr; 3109 3110 ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); 3111 } 3112 3113 /* 3114 * Prime pump for IP 3115 * Include the adjustment for a source route if any. 3116 */ 3117 data_length -= ixa->ixa_ip_hdr_length; 3118 data_length += connp->conn_sum; 3119 data_length = (data_length >> 16) + (data_length & 0xFFFF); 3120 tcpha->tha_sum = htons(data_length); 3121 if (tcp->tcp_ip_forward_progress) { 3122 tcp->tcp_ip_forward_progress = B_FALSE; 3123 connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 3124 } else { 3125 connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 3126 } 3127 return (mp1); 3128 } 3129 3130 /* 3131 * If this routine returns B_TRUE, TCP can generate a RST in response 3132 * to a segment. If it returns B_FALSE, TCP should not respond. 3133 */ 3134 static boolean_t 3135 tcp_send_rst_chk(tcp_stack_t *tcps) 3136 { 3137 int64_t now; 3138 3139 /* 3140 * TCP needs to protect itself from generating too many RSTs. 3141 * This can be a DoS attack by sending us random segments 3142 * soliciting RSTs. 3143 * 3144 * What we do here is to have a limit of tcp_rst_sent_rate RSTs 3145 * in each 1 second interval. In this way, TCP still generate 3146 * RSTs in normal cases but when under attack, the impact is 3147 * limited. 3148 */ 3149 if (tcps->tcps_rst_sent_rate_enabled != 0) { 3150 now = ddi_get_lbolt64(); 3151 if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) > 3152 1*SECONDS) { 3153 tcps->tcps_last_rst_intrvl = now; 3154 tcps->tcps_rst_cnt = 1; 3155 } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) { 3156 return (B_FALSE); 3157 } 3158 } 3159 return (B_TRUE); 3160 } 3161 3162 /* 3163 * This function handles all retransmissions if SACK is enabled for this 3164 * connection. First it calculates how many segments can be retransmitted 3165 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 3166 * segments. 
A segment is eligible if sack_cnt for that segment is greater 3167 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 3168 * all eligible segments, it checks to see if TCP can send some new segments 3169 * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). 3170 * 3171 * Parameters: 3172 * tcp_t *tcp: the tcp structure of the connection. 3173 * uint_t *flags: in return, appropriate value will be set for 3174 * tcp_input_data(). 3175 */ 3176 void 3177 tcp_sack_rexmit(tcp_t *tcp, uint_t *flags) 3178 { 3179 notsack_blk_t *notsack_blk; 3180 int32_t usable_swnd; 3181 int32_t mss; 3182 uint32_t seg_len; 3183 mblk_t *xmit_mp; 3184 tcp_stack_t *tcps = tcp->tcp_tcps; 3185 3186 ASSERT(tcp->tcp_sack_info != NULL); 3187 ASSERT(tcp->tcp_notsack_list != NULL); 3188 ASSERT(tcp->tcp_rexmit == B_FALSE); 3189 3190 /* Defensive coding in case there is a bug... */ 3191 if (tcp->tcp_notsack_list == NULL) { 3192 return; 3193 } 3194 notsack_blk = tcp->tcp_notsack_list; 3195 mss = tcp->tcp_mss; 3196 3197 /* 3198 * Limit the num of outstanding data in the network to be 3199 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 3200 */ 3201 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 3202 3203 /* At least retransmit 1 MSS of data. */ 3204 if (usable_swnd <= 0) { 3205 usable_swnd = mss; 3206 } 3207 3208 /* Make sure no new RTT samples will be taken. */ 3209 tcp->tcp_csuna = tcp->tcp_snxt; 3210 3211 notsack_blk = tcp->tcp_notsack_list; 3212 while (usable_swnd > 0) { 3213 mblk_t *snxt_mp, *tmp_mp; 3214 tcp_seq begin = tcp->tcp_sack_snxt; 3215 tcp_seq end; 3216 int32_t off; 3217 3218 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 3219 if (SEQ_GT(notsack_blk->end, begin) && 3220 (notsack_blk->sack_cnt >= 3221 tcps->tcps_dupack_fast_retransmit)) { 3222 end = notsack_blk->end; 3223 if (SEQ_LT(begin, notsack_blk->begin)) { 3224 begin = notsack_blk->begin; 3225 } 3226 break; 3227 } 3228 } 3229 /* 3230 * All holes are filled. Manipulate tcp_cwnd to send more 3231 * if we can. Note that after the SACK recovery, tcp_cwnd is 3232 * set to tcp_cwnd_ssthresh. 3233 */ 3234 if (notsack_blk == NULL) { 3235 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 3236 if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { 3237 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 3238 ASSERT(tcp->tcp_cwnd > 0); 3239 return; 3240 } else { 3241 usable_swnd = usable_swnd / mss; 3242 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 3243 MAX(usable_swnd * mss, mss); 3244 *flags |= TH_XMIT_NEEDED; 3245 return; 3246 } 3247 } 3248 3249 /* 3250 * Note that we may send more than usable_swnd allows here 3251 * because of round off, but no more than 1 MSS of data. 3252 */ 3253 seg_len = end - begin; 3254 if (seg_len > mss) 3255 seg_len = mss; 3256 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 3257 ASSERT(snxt_mp != NULL); 3258 /* This should not happen. Defensive coding again... */ 3259 if (snxt_mp == NULL) { 3260 return; 3261 } 3262 3263 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 3264 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 3265 if (xmit_mp == NULL) 3266 return; 3267 3268 usable_swnd -= seg_len; 3269 tcp->tcp_pipe += seg_len; 3270 tcp->tcp_sack_snxt = begin + seg_len; 3271 3272 tcp_send_data(tcp, xmit_mp); 3273 3274 /* 3275 * Update the send timestamp to avoid false retransmission. 
3276 */ 3277 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); 3278 3279 TCPS_BUMP_MIB(tcps, tcpRetransSegs); 3280 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len); 3281 TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs); 3282 /* 3283 * Update tcp_rexmit_max to extend this SACK recovery phase. 3284 * This happens when new data sent during fast recovery is 3285 * also lost. If TCP retransmits that new data, it needs 3286 * to extend the SACK recovery phase to avoid starting another 3287 * fast retransmit/recovery unnecessarily. 3288 */ 3289 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 3290 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 3291 } 3292 } 3293 } 3294 3295 /* 3296 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout 3297 * or ICMP errors. 3298 * 3299 * To limit the number of duplicate segments, we limit the number of segments 3300 * to be sent at one time to tcp_snd_burst, the burst variable. 3301 */ 3302 void 3303 tcp_ss_rexmit(tcp_t *tcp) 3304 { 3305 uint32_t snxt; 3306 uint32_t smax; 3307 int32_t win; 3308 int32_t mss; 3309 int32_t off; 3310 int32_t burst = tcp->tcp_snd_burst; 3311 mblk_t *snxt_mp; 3312 tcp_stack_t *tcps = tcp->tcp_tcps; 3313 3314 /* 3315 * Note that tcp_rexmit can be set even though TCP has retransmitted 3316 * all unack'ed segments. 3317 */ 3318 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 3319 smax = tcp->tcp_rexmit_max; 3320 snxt = tcp->tcp_rexmit_nxt; 3321 if (SEQ_LT(snxt, tcp->tcp_suna)) { 3322 snxt = tcp->tcp_suna; 3323 } 3324 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 3325 win -= snxt - tcp->tcp_suna; 3326 mss = tcp->tcp_mss; 3327 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 3328 3329 while (SEQ_LT(snxt, smax) && (win > 0) && 3330 (burst > 0) && (snxt_mp != NULL)) { 3331 mblk_t *xmit_mp; 3332 mblk_t *old_snxt_mp = snxt_mp; 3333 uint32_t cnt = mss; 3334 3335 if (win < cnt) { 3336 cnt = win; 3337 } 3338 if (SEQ_GT(snxt + cnt, smax)) { 3339 cnt = smax - snxt; 3340 } 3341 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 3342 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 3343 if (xmit_mp == NULL) 3344 return; 3345 3346 tcp_send_data(tcp, xmit_mp); 3347 3348 snxt += cnt; 3349 win -= cnt; 3350 /* 3351 * Update the send timestamp to avoid false 3352 * retransmission. 3353 */ 3354 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); 3355 TCPS_BUMP_MIB(tcps, tcpRetransSegs); 3356 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt); 3357 3358 tcp->tcp_rexmit_nxt = snxt; 3359 burst--; 3360 } 3361 /* 3362 * If we have transmitted all we have at the time 3363 * we started the retransmission, we can leave 3364 * the rest of the job to tcp_wput_data(). But we 3365 * need to check the send window first. If the 3366 * win is not 0, go on with tcp_wput_data(). 3367 */ 3368 if (SEQ_LT(snxt, smax) || win == 0) { 3369 return; 3370 } 3371 } 3372 /* Only call tcp_wput_data() if there is data to be sent. */ 3373 if (tcp->tcp_unsent) { 3374 tcp_wput_data(tcp, NULL, B_FALSE); 3375 } 3376 } 3377 3378 /* 3379 * Do slow start retransmission after ICMP errors or PMTU changes. 3380 */ 3381 void 3382 tcp_rexmit_after_error(tcp_t *tcp) 3383 { 3384 /* 3385 * If all sent data has been acknowledged or there is no data left 3386 * to send, just return.
3387 */ 3388 if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) || 3389 (tcp->tcp_xmit_head == NULL)) 3390 return; 3391 3392 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0)) 3393 tcp->tcp_rexmit_max = tcp->tcp_fss; 3394 else 3395 tcp->tcp_rexmit_max = tcp->tcp_snxt; 3396 3397 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 3398 tcp->tcp_rexmit = B_TRUE; 3399 tcp->tcp_dupack_cnt = 0; 3400 tcp->tcp_snd_burst = TCP_CWND_SS; 3401 tcp_ss_rexmit(tcp); 3402 } 3403 3404 /* 3405 * tcp_get_seg_mp() is called to get the pointer to a segment in the 3406 * send queue which starts at the given sequence number. If the given 3407 * sequence number is equal to the last valid sequence number (tcp_snxt), 3408 * the returned mblk is the last valid mblk, and off is set to the length 3409 * of that mblk. 3410 * 3411 * The given seq. no must lie in [tcp_suna, tcp_snxt], else NULL is returned. 3412 * 3413 * Parameters: 3414 * tcp_t *tcp: the tcp instance pointer. 3415 * uint32_t seq: the starting seq. no of the requested segment. 3416 * int32_t *off: after the execution, *off will be the offset to 3417 * the returned mblk which points to the requested seq no. 3418 * It is the caller's responsibility to send in a non-null off. 3419 * 3420 * Return: 3421 * An mblk_t pointer pointing to the requested segment in the send queue. 3422 */ 3423 static mblk_t * 3424 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 3425 { 3426 int32_t cnt; 3427 mblk_t *mp; 3428 3429 /* Defensive coding. Make sure we don't send incorrect data. */ 3430 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt)) 3431 return (NULL); 3432 3433 cnt = seq - tcp->tcp_suna; 3434 mp = tcp->tcp_xmit_head; 3435 while (cnt > 0 && mp != NULL) { 3436 cnt -= mp->b_wptr - mp->b_rptr; 3437 if (cnt <= 0) { 3438 cnt += mp->b_wptr - mp->b_rptr; 3439 break; 3440 } 3441 mp = mp->b_cont; 3442 } 3443 ASSERT(mp != NULL); 3444 *off = cnt; 3445 return (mp); 3446 } 3447 3448 /* 3449 * This routine adjusts next-to-send sequence number variables, in the 3450 * case where the receiver has shrunk its window. 3451 */ 3452 void 3453 tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt) 3454 { 3455 mblk_t *xmit_tail; 3456 int32_t offset; 3457 3458 tcp->tcp_snxt = snxt; 3459 3460 /* Get the mblk, and the offset in it, as per the shrunk window */ 3461 xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset); 3462 ASSERT(xmit_tail != NULL); 3463 tcp->tcp_xmit_tail = xmit_tail; 3464 tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - 3465 xmit_tail->b_rptr - offset; 3466 } 3467 3468 /* 3469 * This handles the case when the receiver has shrunk its window. Per 3470 * RFC 1122, if the receiver shrinks the window, i.e. moves the right edge 3471 * to the left, we should not send new data, but should retransmit normally 3472 * the old unacked data between suna and suna + swnd. We might have sent 3473 * data that is now outside the new window; pretend that we didn't send it.
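 *
 * A worked example: with suna = 1000 and snxt = 1500 (500 bytes in
 * flight), suppose the receiver's advertised window now ends at 1300.
 * Then shrunk_count = 200, snxt is pulled back to 1300 via
 * tcp_update_xmit_tail(), and tcp_unsent grows by 200; the 200 bytes
 * already transmitted beyond 1300 are simply treated as never sent
 * and will be sent again when the window opens.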
3474 */ 3475 static void 3476 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) 3477 { 3478 uint32_t snxt = tcp->tcp_snxt; 3479 3480 ASSERT(shrunk_count > 0); 3481 3482 if (!tcp->tcp_is_wnd_shrnk) { 3483 tcp->tcp_snxt_shrunk = snxt; 3484 tcp->tcp_is_wnd_shrnk = B_TRUE; 3485 } else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) { 3486 tcp->tcp_snxt_shrunk = snxt; 3487 } 3488 3489 /* Pretend we didn't send the data outside the window */ 3490 snxt -= shrunk_count; 3491 3492 /* Reset all the values per the now shrunk window */ 3493 tcp_update_xmit_tail(tcp, snxt); 3494 tcp->tcp_unsent += shrunk_count; 3495 3496 /* 3497 * If the SACK option is set, delete the entire list of 3498 * notsack'ed blocks. 3499 */ 3500 if (tcp->tcp_sack_info != NULL) { 3501 if (tcp->tcp_notsack_list != NULL) 3502 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 3503 } 3504 3505 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0) 3506 /* 3507 * Make sure the timer is running so that we will probe a zero 3508 * window. 3509 */ 3510 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3511 } 3512 3513 /* 3514 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header 3515 * with the template header, as well as other options such as time-stamp, 3516 * ECN and/or SACK. 3517 */ 3518 static void 3519 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) 3520 { 3521 tcpha_t *tcp_tmpl, *tcpha; 3522 uint32_t *dst, *src; 3523 int hdrlen; 3524 conn_t *connp = tcp->tcp_connp; 3525 3526 ASSERT(OK_32PTR(rptr)); 3527 3528 /* Template header */ 3529 tcp_tmpl = tcp->tcp_tcpha; 3530 3531 /* Header of outgoing packet */ 3532 tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); 3533 3534 /* dst and src are opaque 32-bit fields, used for copying */ 3535 dst = (uint32_t *)rptr; 3536 src = (uint32_t *)connp->conn_ht_iphc; 3537 hdrlen = connp->conn_ht_iphc_len; 3538 3539 /* Fill time-stamp option if needed */ 3540 if (tcp->tcp_snd_ts_ok) { 3541 U32_TO_BE32((uint32_t)now, 3542 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); 3543 U32_TO_BE32(tcp->tcp_ts_recent, 3544 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); 3545 } else { 3546 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); 3547 } 3548 3549 /* 3550 * Copy the template header; is this really more efficient than 3551 * calling bcopy()? For simple IPv4/TCP, it may be the case, 3552 * but perhaps not for other scenarios. 3553 */ 3554 dst[0] = src[0]; 3555 dst[1] = src[1]; 3556 dst[2] = src[2]; 3557 dst[3] = src[3]; 3558 dst[4] = src[4]; 3559 dst[5] = src[5]; 3560 dst[6] = src[6]; 3561 dst[7] = src[7]; 3562 dst[8] = src[8]; 3563 dst[9] = src[9]; 3564 if (hdrlen -= 40) { 3565 hdrlen >>= 2; 3566 dst += 10; 3567 src += 10; 3568 do { 3569 *dst++ = *src++; 3570 } while (--hdrlen); 3571 } 3572 3573 /* 3574 * Set the ECN info in the TCP header if it is not a zero 3575 * window probe. Zero window probe is only sent in 3576 * tcp_wput_data() and tcp_timer(). 
3577 */ 3578 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 3579 TCP_SET_ECT(tcp, rptr); 3580 3581 if (tcp->tcp_ecn_echo_on) 3582 tcpha->tha_flags |= TH_ECE; 3583 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 3584 tcpha->tha_flags |= TH_CWR; 3585 tcp->tcp_ecn_cwr_sent = B_TRUE; 3586 } 3587 } 3588 3589 /* Fill in SACK options */ 3590 if (num_sack_blk > 0) { 3591 uchar_t *wptr = rptr + connp->conn_ht_iphc_len; 3592 sack_blk_t *tmp; 3593 int32_t i; 3594 3595 wptr[0] = TCPOPT_NOP; 3596 wptr[1] = TCPOPT_NOP; 3597 wptr[2] = TCPOPT_SACK; 3598 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 3599 sizeof (sack_blk_t); 3600 wptr += TCPOPT_REAL_SACK_LEN; 3601 3602 tmp = tcp->tcp_sack_list; 3603 for (i = 0; i < num_sack_blk; i++) { 3604 U32_TO_BE32(tmp[i].begin, wptr); 3605 wptr += sizeof (tcp_seq); 3606 U32_TO_BE32(tmp[i].end, wptr); 3607 wptr += sizeof (tcp_seq); 3608 } 3609 tcpha->tha_offset_and_reserved += 3610 ((num_sack_blk * 2 + 1) << 4); 3611 } 3612 } 3613
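/*
 * For illustration only: the unrolled copy in tcp_fill_header() above
 * moves the first 40 bytes of the template (exactly the simple
 * IPv4 + TCP header case) as ten aligned 32-bit words and then copies
 * any remaining header bytes a word at a time.  A minimal standalone
 * sketch of the same idea (hypothetical names, not part of this file)
 * might look like:
 *
 *	static void
 *	copy_tmpl_hdr(uint32_t *dst, const uint32_t *src, int hdrlen)
 *	{
 *		int i;
 *
 *		ASSERT(OK_32PTR(dst) && OK_32PTR(src));
 *		ASSERT((hdrlen & 3) == 0 && hdrlen >= 40);
 *		for (i = 0; i < 10; i++)
 *			dst[i] = src[i];
 *		for (; i < (hdrlen >> 2); i++)
 *			dst[i] = src[i];
 *	}
 *
 * Whether this actually beats a plain bcopy() is platform dependent,
 * as the comment in tcp_fill_header() already notes.
 */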