/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 *
 * tcp.c, Code implementing the TCP protocol.
 */

#include <sys/types.h>
#include <socket_impl.h>
#include <socket_inet.h>
#include <sys/sysmacros.h>
#include <sys/promif.h>
#include <sys/socket.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <net/if_types.h>
#include <sys/salib.h>

#include "ipv4.h"
#include "ipv4_impl.h"
#include "mac.h"
#include "mac_impl.h"
#include "v4_sum_impl.h"
#include <sys/bootdebug.h>
#include "tcp_inet.h"
#include "tcp_sack.h"
#include <inet/common.h>
#include <inet/mib2.h>

/*
 * We need to redefine BUMP_MIB/UPDATE_MIB to not have DTrace probes.
 */
#undef BUMP_MIB
#define	BUMP_MIB(x)	(x)++

#undef UPDATE_MIB
#define	UPDATE_MIB(x, y)	x += y

/*
 * MIB-2 stuff for SNMP
 */
mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */

/* The TCP mib does not include the following errors. */
static uint_t tcp_cksum_errors;
static uint_t tcp_drops;

/* Macros for timestamp comparisons */
#define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
#define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)

/*
 * Parameters for TCP Initial Send Sequence number (ISS) generation.
 * The ISS is calculated by adding three components: a time component
 * which grows by 1 every 4096 nanoseconds (versus every 4 microseconds
 * suggested by RFC 793, page 27);
 * a per-connection component which grows by 125000 for every new connection;
 * and an "extra" component that grows by a random amount centered
 * approximately on 64000.  This causes the ISS generator to cycle every
 * 4.89 hours if no TCP connections are made, and faster if connections are
 * made.
 */
#define	ISS_INCR	250000
#define	ISS_NSEC_SHT	0

static uint32_t tcp_iss_incr_extra;	/* Incremented for each connection */

#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	49152

/*
 * PAWS needs a timer for 24 days.  This is the number of ms in 24 days.
 */
#define	PAWS_TIMEOUT	((uint32_t)(24*24*60*60*1000))
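
/*
 * A worked example of the timestamp macros above: TSTMP_GEQ/TSTMP_LT use
 * signed serial-number arithmetic so comparisons stay correct across
 * 32-bit wraparound.  With a = 0x00000001 and b = 0xffffffff,
 * (int32_t)(a - b) is 2, so TSTMP_GEQ(a, b) correctly treats a as the
 * later timestamp.  Note also that PAWS_TIMEOUT, 24*24*60*60*1000 =
 * 2,073,600,000 ms, still fits in a uint32_t.
 */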

/*
 * TCP options struct returned from tcp_parse_options.
 */
typedef struct tcp_opt_s {
	uint32_t	tcp_opt_mss;
	uint32_t	tcp_opt_wscale;
	uint32_t	tcp_opt_ts_val;
	uint32_t	tcp_opt_ts_ecr;
	tcp_t		*tcp;
} tcp_opt_t;

/*
 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
 */

#ifdef _BIG_ENDIAN
#define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	(TCPOPT_TSTAMP << 8) | 10)
#else
#define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
	(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif

/*
 * Flags returned from tcp_parse_options.
 */
#define	TCP_OPT_MSS_PRESENT	1
#define	TCP_OPT_WSCALE_PRESENT	2
#define	TCP_OPT_TSTAMP_PRESENT	4
#define	TCP_OPT_SACK_OK_PRESENT	8
#define	TCP_OPT_SACK_PRESENT	16

/* TCP option length */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2

/* TCP cwnd burst factor. */
#define	TCP_CWND_INFINITE	65535
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Named Dispatch Parameter Management Structure */
typedef struct tcpparam_s {
	uint32_t	tcp_param_min;
	uint32_t	tcp_param_max;
	uint32_t	tcp_param_val;
	char		*tcp_param_name;
} tcpparam_t;

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (struct ip) + \
	sizeof (tcph_t)))

/* Max of the above */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)

/* Round up the value to the nearest mss. */
#define	MSS_ROUNDUP(value, mss)	((((value) - 1) / (mss) + 1) * (mss))
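
/*
 * For example, MSS_ROUNDUP(49152, 1460) computes
 * ((49152 - 1) / 1460 + 1) * 1460 = (33 + 1) * 1460 = 49640,
 * the smallest multiple of a 1460-byte MSS that covers 49152 bytes.
 */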

#define	MS	1L
#define	SECONDS	(1000 * MS)
#define	MINUTES	(60 * SECONDS)
#define	HOURS	(60 * MINUTES)
#define	DAYS	(24 * HOURS)

/* All NDD params in the core TCP became static variables. */
static int	tcp_time_wait_interval = 1 * MINUTES;
static int	tcp_conn_req_max_q = 128;
static int	tcp_conn_req_max_q0 = 1024;
static int	tcp_conn_req_min = 1;
static int	tcp_conn_grace_period = 0 * SECONDS;
static int	tcp_cwnd_max_ = 1024 * 1024;
static int	tcp_smallest_nonpriv_port = 1024;
static int	tcp_ip_abort_cinterval = 3 * MINUTES;
static int	tcp_ip_abort_linterval = 3 * MINUTES;
static int	tcp_ip_abort_interval = 8 * MINUTES;
static int	tcp_ip_notify_cinterval = 10 * SECONDS;
static int	tcp_ip_notify_interval = 10 * SECONDS;
static int	tcp_ipv4_ttl = 64;
static int	tcp_mss_def_ipv4 = 536;
static int	tcp_mss_max_ipv4 = TCP_MSS_MAX_IPV4;
static int	tcp_mss_min = 108;
static int	tcp_naglim_def = (4*1024)-1;
static int	tcp_rexmit_interval_initial = 3 * SECONDS;
static int	tcp_rexmit_interval_max = 60 * SECONDS;
static int	tcp_rexmit_interval_min = 400 * MS;
static int	tcp_dupack_fast_retransmit = 3;
static int	tcp_smallest_anon_port = 32 * 1024;
static int	tcp_largest_anon_port = TCP_MAX_PORT;
static int	tcp_xmit_lowat = TCP_XMIT_LOWATER;
static int	tcp_recv_hiwat_minmss = 4;
static int	tcp_fin_wait_2_flush_interval = 1 * MINUTES;
static int	tcp_max_buf = 1024 * 1024;
static int	tcp_wscale_always = 1;
static int	tcp_tstamp_always = 1;
static int	tcp_tstamp_if_wscale = 1;
static int	tcp_rexmit_interval_extra = 0;
static int	tcp_slow_start_after_idle = 2;
static int	tcp_slow_start_initial = 2;
static int	tcp_sack_permitted = 2;
static int	tcp_ecn_permitted = 2;

/* Extra room to fit in headers. */
static uint_t	tcp_wroff_xtra;

/* Hint for next port to try. */
static in_port_t	tcp_next_port_to_try = 32*1024;

/*
 * Figure out the value of the window scale option.  Note that the rwnd is
 * ASSUMED to be rounded up to the nearest MSS before the calculation.
 * We cannot find the scale value and then do a round up of tcp_rwnd
 * because the scale value may not be correct after that.
 */
#define	SET_WS_VALUE(tcp) \
{ \
	int i; \
	uint32_t rwnd = (tcp)->tcp_rwnd; \
	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; \
	    i++, rwnd >>= 1) \
		; \
	(tcp)->tcp_rcv_ws = i; \
}
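
/*
 * For example, a 49,640-byte rwnd already fits in the 16-bit window
 * field (TCP_MAXWIN is 65535), so SET_WS_VALUE leaves tcp_rcv_ws at 0.
 * A 1 MB (1,048,576-byte) rwnd needs five right shifts to drop to
 * 32,768 <= 65535, so tcp_rcv_ws becomes 5.
 */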

/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 */
#define	SET_ECT(tcp, iph) \
	if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
		/* We need to clear the code point first. */ \
		((struct ip *)(iph))->ip_tos &= 0xFC; \
		((struct ip *)(iph))->ip_tos |= IPH_ECN_ECT0; \
	}

/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/*
 * TCP reassembly macros.  We hide starting and ending sequence numbers in
 * b_next and b_prev of messages on the reassembly queue.  The messages are
 * chained using b_cont.  These macros are used in tcp_reass() so we don't
 * have to see the ugly casts and assignments.
 * Note: use uintptr_t to suppress the gcc warning.
 */
#define	TCP_REASS_SEQ(mp)	((uint32_t)(uintptr_t)((mp)->b_next))
#define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
					(mblk_t *)((uintptr_t)(u)))
#define	TCP_REASS_END(mp)	((uint32_t)(uintptr_t)((mp)->b_prev))
#define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
					(mblk_t *)((uintptr_t)(u)))

#define	TCP_TIMER_RESTART(tcp, intvl) \
	(tcp)->tcp_rto_timeout = prom_gettime() + intvl; \
	(tcp)->tcp_timer_running = B_TRUE;
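
/*
 * Note that TCP_TIMER_RESTART expands to two statements, so in a
 * construct like
 *
 *	if (need_restart)
 *		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 *
 * only the deadline assignment would be guarded and tcp_timer_running
 * would be set unconditionally.  It is therefore only safe as a
 * standalone statement, as in tcp_connect() below.
 */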

static int	tcp_accept_comm(tcp_t *, tcp_t *, mblk_t *, uint_t);
static mblk_t	*tcp_ack_mp(tcp_t *);
static in_port_t	tcp_bindi(in_port_t, in_addr_t *, boolean_t, boolean_t);
static uint16_t	tcp_cksum(uint16_t *, uint32_t);
static void	tcp_clean_death(int, tcp_t *, int err);
static tcp_t	*tcp_conn_request(tcp_t *, mblk_t *mp, uint_t, uint_t);
static char	*tcp_display(tcp_t *, char *, char);
static int	tcp_drain_input(tcp_t *, int, int);
static void	tcp_drain_needed(int, tcp_t *);
static boolean_t	tcp_drop_q0(tcp_t *);
static mblk_t	*tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
static int	tcp_header_len(struct inetgram *);
static in_port_t	tcp_report_ports(uint16_t *, enum Ports);
static int	tcp_input(int);
static void	tcp_iss_init(tcp_t *);
static tcp_t	*tcp_lookup_ipv4(struct ip *, tcpha_t *, int, int *);
static tcp_t	*tcp_lookup_listener_ipv4(in_addr_t, in_port_t, int *);
static int	tcp_conn_check(tcp_t *);
static int	tcp_close(int);
static void	tcp_close_detached(tcp_t *);
static void	tcp_eager_cleanup(tcp_t *, boolean_t, int);
static void	tcp_eager_unlink(tcp_t *);
static void	tcp_free(tcp_t *);
static int	tcp_header_init_ipv4(tcp_t *);
static void	tcp_mss_set(tcp_t *, uint32_t);
static int	tcp_parse_options(tcph_t *, tcp_opt_t *);
static boolean_t	tcp_paws_check(tcp_t *, tcph_t *, tcp_opt_t *);
static void	tcp_process_options(tcp_t *, tcph_t *);
static int	tcp_random(void);
static void	tcp_random_init(void);
static mblk_t	*tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void	tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void	tcp_rcv_drain(int sock_id, tcp_t *);
static void	tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t);
static void	tcp_rput_data(tcp_t *, mblk_t *, int);
static int	tcp_rwnd_set(tcp_t *, uint32_t);
static int32_t	tcp_sack_rxmit(tcp_t *, int);
static void	tcp_set_cksum(mblk_t *);
static void	tcp_set_rto(tcp_t *, int32_t);
static void	tcp_ss_rexmit(tcp_t *, int);
static int	tcp_state_wait(int, tcp_t *, int);
static void	tcp_timer(tcp_t *, int);
static void	tcp_time_wait_append(tcp_t *);
static void	tcp_time_wait_collector(void);
static void	tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t,
		    uint32_t, int, tcph_t *, int sock_id);
static void	tcp_time_wait_remove(tcp_t *);
static in_port_t	tcp_update_next_port(in_port_t);
static int	tcp_verify_cksum(mblk_t *);
static void	tcp_wput_data(tcp_t *, mblk_t *, int);
static void	tcp_xmit_ctl(char *, tcp_t *, mblk_t *, uint32_t, uint32_t,
		    int, uint_t, int);
static void	tcp_xmit_early_reset(char *, int, mblk_t *, uint32_t, uint32_t,
		    int, uint_t);
static int	tcp_xmit_end(tcp_t *, int);
static void	tcp_xmit_listeners_reset(int, mblk_t *, uint_t);
static mblk_t	*tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *,
		    mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t);
static int	tcp_init_values(tcp_t *, struct inetboot_socket *);

#if DEBUG > 1
#define	TCP_DUMP_PACKET(str, mp) \
{ \
	int len = (mp)->b_wptr - (mp)->b_rptr; \
\
	printf("%s: dump TCP(%d): \n", (str), len); \
	hexdump((char *)(mp)->b_rptr, len); \
}
#else
#define	TCP_DUMP_PACKET(str, mp)
#endif

#ifdef DEBUG
#define	DEBUG_1(str, arg)		printf(str, (arg))
#define	DEBUG_2(str, arg1, arg2)	printf(str, (arg1), (arg2))
#define	DEBUG_3(str, arg1, arg2, arg3)	printf(str, (arg1), (arg2), (arg3))
#else
#define	DEBUG_1(str, arg)
#define	DEBUG_2(str, arg1, arg2)
#define	DEBUG_3(str, arg1, arg2, arg3)
#endif

/* Whether it is the first time TCP is used. */
static boolean_t tcp_initialized = B_FALSE;

/* TCP time wait list. */
static tcp_t *tcp_time_wait_head;
static tcp_t *tcp_time_wait_tail;
static uint32_t tcp_cum_timewait;
/* When the tcp_time_wait_collector is run. */
static uint32_t tcp_time_wait_runtime;

#define	TCP_RUN_TIME_WAIT_COLLECTOR() \
	if (prom_gettime() > tcp_time_wait_runtime) \
		tcp_time_wait_collector();

/*
 * Accept will return with an error if there is no connection coming in
 * after this (in ms).
 */
static int tcp_accept_timeout = 60000;

/*
 * Initialize the TCP-specific parts of a socket.
 */
void
tcp_socket_init(struct inetboot_socket *isp)
{
	/* Do some initializations. */
	if (!tcp_initialized) {
		tcp_random_init();
		/* Extra head room for the MAC layer address. */
		if ((tcp_wroff_xtra = mac_get_hdr_len()) & 0x3) {
			tcp_wroff_xtra = (tcp_wroff_xtra & ~0x3) + 0x4;
		}
		/* Schedule the first time wait cleanup time */
		tcp_time_wait_runtime = prom_gettime() + tcp_time_wait_interval;
		tcp_initialized = B_TRUE;
	}
	TCP_RUN_TIME_WAIT_COLLECTOR();

	isp->proto = IPPROTO_TCP;
	isp->input[TRANSPORT_LVL] = tcp_input;
	/* Socket layer should call tcp_send() directly. */
	isp->output[TRANSPORT_LVL] = NULL;
	isp->close[TRANSPORT_LVL] = tcp_close;
	isp->headerlen[TRANSPORT_LVL] = tcp_header_len;
	isp->ports = tcp_report_ports;
	if ((isp->pcb = bkmem_alloc(sizeof (tcp_t))) == NULL) {
		errno = ENOBUFS;
		return;
	}
	if ((errno = tcp_init_values((tcp_t *)isp->pcb, isp)) != 0) {
		bkmem_free(isp->pcb, sizeof (tcp_t));
		return;
	}
	/*
	 * This is set last because this field is used to determine if
	 * a socket is in use or not.
	 */
	isp->type = INETBOOT_STREAM;
}
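
/*
 * For example, with a 14-byte Ethernet MAC header, 14 & 0x3 is nonzero,
 * so tcp_wroff_xtra is rounded up to (14 & ~0x3) + 4 = 16 bytes, keeping
 * the headers that follow 4-byte aligned.
 */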

/*
 * Return the size of a TCP header including TCP option.
 */
static int
tcp_header_len(struct inetgram *igm)
{
	mblk_t	*pkt;
	int	ipvers;

	/* Just returns the standard TCP header without options */
	if (igm == NULL)
		return (sizeof (tcph_t));

	if ((pkt = igm->igm_mp) == NULL)
		return (0);

	ipvers = ((struct ip *)pkt->b_rptr)->ip_v;
	if (ipvers == IPV4_VERSION) {
		return (TCP_HDR_LENGTH((tcph_t *)(pkt->b_rptr +
		    IPH_HDR_LENGTH(pkt->b_rptr))));
	} else {
		dprintf("tcp_header_len: non-IPv4 packet.\n");
		return (0);
	}
}

/*
 * Return the requested port number in network order.
 */
static in_port_t
tcp_report_ports(uint16_t *tcphp, enum Ports request)
{
	if (request == SOURCE)
		return (*(uint16_t *)(((tcph_t *)tcphp)->th_lport));
	return (*(uint16_t *)(((tcph_t *)tcphp)->th_fport));
}

/*
 * Because inetboot is not interrupt driven, TCP can only poll.  This
 * means that there can be packets stuck in the NIC buffer waiting to
 * be processed.  Thus we need to drain them before, for example, sending
 * anything because an ACK may actually be stuck there.
 *
 * The timeout argument determines how long we should wait for draining.
 */
static int
tcp_drain_input(tcp_t *tcp, int sock_id, int timeout)
{
	struct inetgram	*in_gram;
	struct inetgram	*old_in_gram;
	int		old_timeout;
	mblk_t		*mp;
	int		i;

	dprintf("tcp_drain_input(%d): %s\n", sock_id,
	    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));

	/*
	 * Since the driver uses the in_timeout value in the socket
	 * structure to determine the timeout value, we need to save
	 * the original one so that we can restore that after draining.
	 */
	old_timeout = sockets[sock_id].in_timeout;
	sockets[sock_id].in_timeout = timeout;

	/*
	 * We do this because the input queue may have some user
	 * data already.
	 */
	old_in_gram = sockets[sock_id].inq;
	sockets[sock_id].inq = NULL;

	/* Go out and check the wire */
	for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
		if (sockets[sock_id].input[i] != NULL) {
			if (sockets[sock_id].input[i](sock_id) < 0) {
				sockets[sock_id].in_timeout = old_timeout;
				if (sockets[sock_id].inq != NULL)
					nuke_grams(&sockets[sock_id].inq);
				sockets[sock_id].inq = old_in_gram;
				return (-1);
			}
		}
	}
#if DEBUG
	printf("tcp_drain_input: done with checking packets\n");
#endif
	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#if DEBUG
			printf("tcp_drain_input: unexpected packet "
			    "level %d frame found\n", in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);
		sockets[sock_id].in_timeout = old_timeout;

		/*
		 * The other side may have closed this connection or
		 * sent us a RST.  But we need to continue to process other
		 * packets in the socket's queue because they may belong
		 * to other TCP connections.
		 */
		if (sockets[sock_id].pcb == NULL)
			tcp = NULL;
	}

	if (tcp == NULL || sockets[sock_id].pcb == NULL) {
		if (sockets[sock_id].so_error != 0)
			return (-1);
		else
			return (0);
	}
#if DEBUG
	printf("tcp_drain_input: done with processing packets\n");
#endif
	sockets[sock_id].in_timeout = old_timeout;
	sockets[sock_id].inq = old_in_gram;

	/*
	 * Data may have been received so indicate it is available
	 */
	tcp_drain_needed(sock_id, tcp);
	return (0);
}

/*
 * The receive entry point for upper layer to call to get data.  Note
 * that this follows the current architecture that lower layer receive
 * routines have been called already.  Thus if the inq of socket is
 * not NULL, the packets must be for us.
 */
static int
tcp_input(int sock_id)
{
	struct inetgram	*in_gram;
	mblk_t		*mp;
	tcp_t		*tcp;

	TCP_RUN_TIME_WAIT_COLLECTOR();

	if ((tcp = sockets[sock_id].pcb) == NULL)
		return (-1);

	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#ifdef DEBUG
			printf("tcp_input: unexpected packet "
			    "level %d frame found\n", in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);
		/* The TCP may be gone because it gets a RST. */
		if (sockets[sock_id].pcb == NULL)
			return (-1);
	}

	/* Flush the receive list. */
	if (tcp->tcp_rcv_list != NULL) {
		tcp_rcv_drain(sock_id, tcp);
	} else {
		/* The other side has closed the connection, report this up. */
		if (tcp->tcp_state == TCPS_CLOSE_WAIT) {
			sockets[sock_id].so_state |= SS_CANTRCVMORE;
			return (0);
		}
	}
	return (0);
}

/*
 * The send entry point for upper layer to call to send data.  In order
 * to minimize changes to the core TCP code, we need to put the
 * data into mblks.
 */
int
tcp_send(int sock_id, tcp_t *tcp, const void *msg, int len)
{
	mblk_t	*mp;
	mblk_t	*head = NULL;
	mblk_t	*tail;
	int	mss = tcp->tcp_mss;
	int	cnt = 0;
	int	win_size;
	char	*buf = (char *)msg;

	TCP_RUN_TIME_WAIT_COLLECTOR();

	/* We don't want to append 0 size mblk. */
	if (len == 0)
		return (0);
	while (len > 0) {
		if (len < mss) {
			mss = len;
		}
		/*
		 * If we cannot allocate more buffer, stop here and
		 * the number of bytes buffered will be returned.
		 *
		 * Note that we follow the core TCP optimization that
		 * each mblk contains only MSS bytes data.
		 */
		if ((mp = allocb(mss + tcp->tcp_ip_hdr_len +
		    TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0)) == NULL) {
			break;
		}
		mp->b_rptr += tcp->tcp_hdr_len + tcp_wroff_xtra;
		bcopy(buf, mp->b_rptr, mss);
		mp->b_wptr = mp->b_rptr + mss;
		buf += mss;
		cnt += mss;
		len -= mss;

		if (head == NULL) {
			head = mp;
			tail = mp;
		} else {
			tail->b_cont = mp;
			tail = mp;
		}
	}

	/*
	 * Since inetboot is not interrupt driven, there may be
	 * some ACKs in the MAC's buffer.  Drain them first,
	 * otherwise, we may not be able to send.
	 *
	 * We expect an ACK in two cases:
	 *
	 * 1) We have un-ACK'ed data.
	 *
	 * 2) All ACKs have been received and the sender's window has been
	 * closed.  We need an ACK back to open the window so that we can
	 * send.  In this case, call tcp_drain_input() if the window size is
	 * less than 2 * MSS.
	 */

	/* window size = MIN(swnd, cwnd) - unacked bytes */
	win_size = (tcp->tcp_swnd > tcp->tcp_cwnd) ? tcp->tcp_cwnd :
	    tcp->tcp_swnd;
	win_size -= tcp->tcp_snxt;
	win_size += tcp->tcp_suna;
	if (win_size < (2 * tcp->tcp_mss))
		if (tcp_drain_input(tcp, sock_id, 5) < 0)
			return (-1);

	tcp_wput_data(tcp, head, sock_id);
	/*
	 * errno should be reset here as it may be
	 * set to ETIMEDOUT.  This may be set by
	 * the MAC driver in case it has timed out
	 * waiting for ARP reply.  Any segment which
	 * was not transmitted because of ARP timeout
	 * will be retransmitted by TCP.
	 */
	if (errno == ETIMEDOUT)
		errno = 0;
	return (cnt);
}
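
/*
 * For example, with tcp_swnd = 8760, tcp_cwnd = 2920, tcp_snxt = 3001
 * and tcp_suna = 1541, the usable window computed above is
 * MIN(8760, 2920) - (3001 - 1541) = 2920 - 1460 = 1460 bytes, which is
 * below 2 * MSS for a 1460-byte MSS, so tcp_send() drains pending ACKs
 * before transmitting.
 */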
687 */ 688 if (errno == ETIMEDOUT) 689 errno = 0; 690 return (cnt); 691 } 692 693 /* Free up all TCP related stuff */ 694 static void 695 tcp_free(tcp_t *tcp) 696 { 697 if (tcp->tcp_iphc != NULL) { 698 bkmem_free((caddr_t)tcp->tcp_iphc, tcp->tcp_iphc_len); 699 tcp->tcp_iphc = NULL; 700 } 701 if (tcp->tcp_xmit_head != NULL) { 702 freemsg(tcp->tcp_xmit_head); 703 tcp->tcp_xmit_head = NULL; 704 } 705 if (tcp->tcp_rcv_list != NULL) { 706 freemsg(tcp->tcp_rcv_list); 707 tcp->tcp_rcv_list = NULL; 708 } 709 if (tcp->tcp_reass_head != NULL) { 710 freemsg(tcp->tcp_reass_head); 711 tcp->tcp_reass_head = NULL; 712 } 713 if (tcp->tcp_sack_info != NULL) { 714 bkmem_free((caddr_t)tcp->tcp_sack_info, 715 sizeof (tcp_sack_info_t)); 716 tcp->tcp_sack_info = NULL; 717 } 718 } 719 720 static void 721 tcp_close_detached(tcp_t *tcp) 722 { 723 if (tcp->tcp_listener != NULL) 724 tcp_eager_unlink(tcp); 725 tcp_free(tcp); 726 bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 727 } 728 729 /* 730 * If we are an eager connection hanging off a listener that hasn't 731 * formally accepted the connection yet, get off its list and blow off 732 * any data that we have accumulated. 733 */ 734 static void 735 tcp_eager_unlink(tcp_t *tcp) 736 { 737 tcp_t *listener = tcp->tcp_listener; 738 739 assert(listener != NULL); 740 if (tcp->tcp_eager_next_q0 != NULL) { 741 assert(tcp->tcp_eager_prev_q0 != NULL); 742 743 /* Remove the eager tcp from q0 */ 744 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 745 tcp->tcp_eager_prev_q0; 746 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 747 tcp->tcp_eager_next_q0; 748 listener->tcp_conn_req_cnt_q0--; 749 } else { 750 tcp_t **tcpp = &listener->tcp_eager_next_q; 751 tcp_t *prev = NULL; 752 753 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 754 if (tcpp[0] == tcp) { 755 if (listener->tcp_eager_last_q == tcp) { 756 /* 757 * If we are unlinking the last 758 * element on the list, adjust 759 * tail pointer. Set tail pointer 760 * to nil when list is empty. 761 */ 762 assert(tcp->tcp_eager_next_q == NULL); 763 if (listener->tcp_eager_last_q == 764 listener->tcp_eager_next_q) { 765 listener->tcp_eager_last_q = 766 NULL; 767 } else { 768 /* 769 * We won't get here if there 770 * is only one eager in the 771 * list. 772 */ 773 assert(prev != NULL); 774 listener->tcp_eager_last_q = 775 prev; 776 } 777 } 778 tcpp[0] = tcp->tcp_eager_next_q; 779 tcp->tcp_eager_next_q = NULL; 780 tcp->tcp_eager_last_q = NULL; 781 listener->tcp_conn_req_cnt_q--; 782 break; 783 } 784 prev = tcpp[0]; 785 } 786 } 787 tcp->tcp_listener = NULL; 788 } 789 790 /* 791 * Reset any eager connection hanging off this listener 792 * and then reclaim it's resources. 793 */ 794 static void 795 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only, int sock_id) 796 { 797 tcp_t *eager; 798 799 if (!q0_only) { 800 /* First cleanup q */ 801 while ((eager = listener->tcp_eager_next_q) != NULL) { 802 assert(listener->tcp_eager_last_q != NULL); 803 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 804 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, 805 sock_id); 806 tcp_close_detached(eager); 807 } 808 assert(listener->tcp_eager_last_q == NULL); 809 } 810 /* Then cleanup q0 */ 811 while ((eager = listener->tcp_eager_next_q0) != listener) { 812 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 813 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, sock_id); 814 tcp_close_detached(eager); 815 } 816 } 817 818 /* 819 * To handle the shutdown request. 

/*
 * To handle the shutdown request.  Called from shutdown().
 */
int
tcp_shutdown(int sock_id)
{
	tcp_t	*tcp;

	DEBUG_1("tcp_shutdown: sock_id %x\n", sock_id);

	if ((tcp = sockets[sock_id].pcb) == NULL) {
		return (-1);
	}

	/*
	 * Since inetboot is not interrupt driven, there may be
	 * some ACKs in the MAC's buffer.  Drain them first,
	 * otherwise, we may not be able to send.
	 */
	if (tcp_drain_input(tcp, sock_id, 5) < 0) {
		/*
		 * If we return now without freeing TCP, there will be
		 * a memory leak.
		 */
		if (sockets[sock_id].pcb != NULL)
			tcp_clean_death(sock_id, tcp, 0);
		return (-1);
	}

	DEBUG_1("tcp_shutdown: tcp_state %x\n", tcp->tcp_state);
	switch (tcp->tcp_state) {

	case TCPS_SYN_RCVD:
		/*
		 * Shutdown during the connect 3-way handshake
		 */
	case TCPS_ESTABLISHED:
		/*
		 * Transmit the FIN,
		 * wait for the FIN to be ACKed,
		 * then remain in FIN_WAIT_2.
		 */
		dprintf("tcp_shutdown: sending fin\n");
		if (tcp_xmit_end(tcp, sock_id) == 0 &&
		    tcp_state_wait(sock_id, tcp, TCPS_FIN_WAIT_2) < 0) {
			/* During the wait, TCP may be gone... */
			if (sockets[sock_id].pcb == NULL)
				return (-1);
		}
		dprintf("tcp_shutdown: done\n");
		break;

	default:
		break;

	}
	return (0);
}

/* To handle closing of the socket */
static int
tcp_close(int sock_id)
{
	char	*msg;
	tcp_t	*tcp;
	int	error = 0;

	if ((tcp = sockets[sock_id].pcb) == NULL) {
		return (-1);
	}

	TCP_RUN_TIME_WAIT_COLLECTOR();

	/*
	 * Since inetboot is not interrupt driven, there may be
	 * some ACKs in the MAC's buffer.  Drain them first,
	 * otherwise, we may not be able to send.
	 */
	if (tcp_drain_input(tcp, sock_id, 5) < 0) {
		/*
		 * If we return now without freeing TCP, there will be
		 * a memory leak.
		 */
		if (sockets[sock_id].pcb != NULL)
			tcp_clean_death(sock_id, tcp, 0);
		return (-1);
	}

	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
		/* Cleanup for listener */
		tcp_eager_cleanup(tcp, 0, sock_id);
	}

	msg = NULL;
	switch (tcp->tcp_state) {
	case TCPS_CLOSED:
	case TCPS_IDLE:
	case TCPS_BOUND:
	case TCPS_LISTEN:
		break;
	case TCPS_SYN_SENT:
		msg = "tcp_close, during connect";
		break;
	case TCPS_SYN_RCVD:
		/*
		 * Close during the connect 3-way handshake,
		 * but here there may or may not be pending data
		 * already on queue.  Process almost the same as in
		 * the ESTABLISHED state.
		 */
		/* FALLTHRU */
	default:
		/*
		 * If SO_LINGER has set a zero linger time, abort the
		 * connection with a reset.
		 */
		if (tcp->tcp_linger && tcp->tcp_lingertime == 0) {
			msg = "tcp_close, zero lingertime";
			break;
		}

		/*
		 * Abort connection if there is unread data queued.
		 */
		if (tcp->tcp_rcv_list != NULL ||
		    tcp->tcp_reass_head != NULL) {
			msg = "tcp_close, unread data";
			break;
		}
		if (tcp->tcp_state <= TCPS_LISTEN)
			break;

		/*
		 * Transmit the FIN before detaching the tcp_t.
		 * After tcp_detach returns this queue/perimeter
		 * no longer owns the tcp_t thus others can modify it.
		 * The TCP could be closed in tcp_state_wait called by
		 * tcp_wput_data called by tcp_xmit_end.
		 */
		(void) tcp_xmit_end(tcp, sock_id);
		if (sockets[sock_id].pcb == NULL)
			return (0);

		/*
		 * If lingering on close then wait until the fin is acked,
		 * the SO_LINGER time passes, or a reset is sent/received.
		 */
		if (tcp->tcp_linger && tcp->tcp_lingertime > 0 &&
		    !(tcp->tcp_fin_acked) &&
		    tcp->tcp_state >= TCPS_ESTABLISHED) {
			uint32_t stoptime;	/* in ms */

			tcp->tcp_client_errno = 0;
			stoptime = prom_gettime() +
			    (tcp->tcp_lingertime * 1000);
			while (!(tcp->tcp_fin_acked) &&
			    tcp->tcp_state >= TCPS_ESTABLISHED &&
			    tcp->tcp_client_errno == 0 &&
			    ((int32_t)(stoptime - prom_gettime()) > 0)) {
				if (tcp_drain_input(tcp, sock_id, 5) < 0) {
					if (sockets[sock_id].pcb != NULL) {
						tcp_clean_death(sock_id,
						    tcp, 0);
					}
					return (-1);
				}
			}
			tcp->tcp_client_errno = 0;
		}
		if (tcp_state_wait(sock_id, tcp, TCPS_TIME_WAIT) < 0) {
			/* During the wait, TCP may be gone... */
			if (sockets[sock_id].pcb == NULL)
				return (0);
			msg = "tcp_close, couldn't detach";
		} else {
			return (0);
		}
		break;
	}

	/* Something went wrong...  Send a RST and report the error */
	if (msg != NULL) {
		if (tcp->tcp_state == TCPS_ESTABLISHED ||
		    tcp->tcp_state == TCPS_CLOSE_WAIT)
			BUMP_MIB(tcp_mib.tcpEstabResets);
		if (tcp->tcp_state == TCPS_SYN_SENT ||
		    tcp->tcp_state == TCPS_SYN_RCVD)
			BUMP_MIB(tcp_mib.tcpAttemptFails);
		tcp_xmit_ctl(msg, tcp, NULL, tcp->tcp_snxt, 0, TH_RST, 0,
		    sock_id);
	}

	tcp_free(tcp);
	bkmem_free((caddr_t)tcp, sizeof (tcp_t));
	sockets[sock_id].pcb = NULL;
	return (error);
}

/* To make an endpoint a listener. */
int
tcp_listen(int sock_id, int backlog)
{
	tcp_t	*tcp;

	if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) {
		errno = EINVAL;
		return (-1);
	}
	/* We allow calling listen() multiple times to change the backlog. */
	if (tcp->tcp_state > TCPS_LISTEN || tcp->tcp_state < TCPS_BOUND) {
		errno = EOPNOTSUPP;
		return (-1);
	}
	/* The following initialization should only be done once. */
	if (tcp->tcp_state != TCPS_LISTEN) {
		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
		tcp->tcp_eager_next_q = NULL;
		tcp->tcp_state = TCPS_LISTEN;
		tcp->tcp_second_ctimer_threshold = tcp_ip_abort_linterval;
	}
	if ((tcp->tcp_conn_req_max = backlog) > tcp_conn_req_max_q) {
		tcp->tcp_conn_req_max = tcp_conn_req_max_q;
	}
	if (tcp->tcp_conn_req_max < tcp_conn_req_min) {
		tcp->tcp_conn_req_max = tcp_conn_req_min;
	}
	return (0);
}
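
/*
 * For example, tcp_listen(sd, 5) leaves tcp_conn_req_max at 5, while
 * tcp_listen(sd, 1024) clamps it to tcp_conn_req_max_q (128) and
 * tcp_listen(sd, 0) raises it to tcp_conn_req_min (1).
 */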

/* To accept connections. */
int
tcp_accept(int sock_id, struct sockaddr *addr, socklen_t *addr_len)
{
	tcp_t	*listener;
	tcp_t	*eager;
	int	sd, new_sock_id;
	struct sockaddr_in	*new_addr = (struct sockaddr_in *)addr;
	int	timeout;

	/* Sanity check. */
	if ((listener = (tcp_t *)(sockets[sock_id].pcb)) == NULL ||
	    new_addr == NULL || addr_len == NULL ||
	    *addr_len < sizeof (struct sockaddr_in) ||
	    listener->tcp_state != TCPS_LISTEN) {
		errno = EINVAL;
		return (-1);
	}

	if (sockets[sock_id].in_timeout > tcp_accept_timeout)
		timeout = prom_gettime() + sockets[sock_id].in_timeout;
	else
		timeout = prom_gettime() + tcp_accept_timeout;
	while (listener->tcp_eager_next_q == NULL &&
	    timeout > prom_gettime()) {
#if DEBUG
		printf("tcp_accept: Waiting in tcp_accept()\n");
#endif
		if (tcp_drain_input(listener, sock_id, 5) < 0) {
			return (-1);
		}
	}
	/* If there is an eager, don't timeout... */
	if (timeout <= prom_gettime() && listener->tcp_eager_next_q == NULL) {
#if DEBUG
		printf("tcp_accept: timeout\n");
#endif
		errno = ETIMEDOUT;
		return (-1);
	}
#if DEBUG
	printf("tcp_accept: got a connection\n");
#endif

	/* Now create the socket for this new TCP. */
	if ((sd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
		return (-1);
	}
	if ((new_sock_id = so_check_fd(sd, &errno)) == -1)
		/* This should not happen! */
		prom_panic("so_check_fd() fails in tcp_accept()");
	/* Free the TCP PCB in the original socket. */
	bkmem_free((caddr_t)(sockets[new_sock_id].pcb), sizeof (tcp_t));
	/* Dequeue the eager and attach it to the socket. */
	eager = listener->tcp_eager_next_q;
	listener->tcp_eager_next_q = eager->tcp_eager_next_q;
	if (listener->tcp_eager_last_q == eager)
		listener->tcp_eager_last_q = NULL;
	eager->tcp_eager_next_q = NULL;
	sockets[new_sock_id].pcb = eager;
	listener->tcp_conn_req_cnt_q--;

	/* Copy in the address info. */
	bcopy(&eager->tcp_remote, &new_addr->sin_addr.s_addr,
	    sizeof (in_addr_t));
	bcopy(&eager->tcp_fport, &new_addr->sin_port, sizeof (in_port_t));
	new_addr->sin_family = AF_INET;

#ifdef DEBUG
	printf("tcp_accept(), new sock_id: %d\n", sd);
#endif
	return (sd);
}

/* Update the next anonymous port to use. */
static in_port_t
tcp_update_next_port(in_port_t port)
{
	/* Don't allow the port to fall out of the anonymous port range. */
	if (port < tcp_smallest_anon_port || port > tcp_largest_anon_port)
		port = (in_port_t)tcp_smallest_anon_port;

	if (port < tcp_smallest_nonpriv_port)
		port = (in_port_t)tcp_smallest_nonpriv_port;
	return (port);
}
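
/*
 * For example, with the default tcp_smallest_anon_port (32768) and
 * tcp_largest_anon_port (65535), tcp_update_next_port(32769) returns
 * 32769 unchanged, while a port that has wrapped past 65535 to 0 is
 * pulled back to 32768.
 */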

/* To check whether a bind to a port is allowed. */
static in_port_t
tcp_bindi(in_port_t port, in_addr_t *addr, boolean_t reuseaddr,
    boolean_t bind_to_req_port_only)
{
	int	i, count;
	tcp_t	*tcp;

	count = tcp_largest_anon_port - tcp_smallest_anon_port;
try_again:
	for (i = 0; i < MAXSOCKET; i++) {
		if (sockets[i].type != INETBOOT_STREAM ||
		    ((tcp = (tcp_t *)sockets[i].pcb) == NULL) ||
		    ntohs(tcp->tcp_lport) != port) {
			continue;
		}
		/*
		 * Both TCPs have the same port.  If SO_REUSEADDR is
		 * set and the bound TCP has a state greater than
		 * TCPS_LISTEN, it is fine.
		 */
		if (reuseaddr && tcp->tcp_state > TCPS_LISTEN) {
			continue;
		}
		if (tcp->tcp_bound_source != INADDR_ANY &&
		    *addr != INADDR_ANY &&
		    tcp->tcp_bound_source != *addr) {
			continue;
		}
		if (bind_to_req_port_only) {
			return (0);
		}
		if (--count > 0) {
			port = tcp_update_next_port(++port);
			goto try_again;
		} else {
			return (0);
		}
	}
	return (port);
}

/* To handle the bind request. */
int
tcp_bind(int sock_id)
{
	tcp_t		*tcp;
	in_port_t	requested_port, allocated_port;
	boolean_t	bind_to_req_port_only;
	boolean_t	reuseaddr;

	if ((tcp = (tcp_t *)sockets[sock_id].pcb) == NULL) {
		errno = EINVAL;
		return (-1);
	}

	if (tcp->tcp_state >= TCPS_BOUND) {
		/* We don't allow multiple bind(). */
		errno = EPROTO;
		return (-1);
	}

	requested_port = ntohs(sockets[sock_id].bind.sin_port);

	/* The bound source can be INADDR_ANY. */
	tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr;

	tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source;

	/* Verify the port is available. */
	if (requested_port == 0)
		bind_to_req_port_only = B_FALSE;
	else /* T_BIND_REQ and requested_port != 0 */
		bind_to_req_port_only = B_TRUE;

	if (requested_port == 0) {
		requested_port = tcp_update_next_port(++tcp_next_port_to_try);
	}
	reuseaddr = sockets[sock_id].so_opt & SO_REUSEADDR;
	allocated_port = tcp_bindi(requested_port, &(tcp->tcp_bound_source),
	    reuseaddr, bind_to_req_port_only);

	if (allocated_port == 0) {
		errno = EADDRINUSE;
		return (-1);
	}
	tcp->tcp_lport = htons(allocated_port);
	*(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
	sockets[sock_id].bind.sin_port = tcp->tcp_lport;
	tcp->tcp_state = TCPS_BOUND;
	return (0);
}

/*
 * Check for duplicate TCP connections.
 */
static int
tcp_conn_check(tcp_t *tcp)
{
	int	i;
	tcp_t	*tmp_tcp;

	for (i = 0; i < MAXSOCKET; i++) {
		if (sockets[i].type != INETBOOT_STREAM)
			continue;
		/* Socket may not be closed but the TCP can be gone. */
		if ((tmp_tcp = (tcp_t *)sockets[i].pcb) == NULL)
			continue;
		/* We only care about TCP in states later than SYN_SENT. */
		if (tmp_tcp->tcp_state < TCPS_SYN_SENT)
			continue;
		if (tmp_tcp->tcp_lport != tcp->tcp_lport ||
		    tmp_tcp->tcp_fport != tcp->tcp_fport ||
		    tmp_tcp->tcp_bound_source != tcp->tcp_bound_source ||
		    tmp_tcp->tcp_remote != tcp->tcp_remote) {
			continue;
		} else {
			return (-1);
		}
	}
	return (0);
}

/* To handle a connect request. */
int
tcp_connect(int sock_id)
{
	tcp_t		*tcp;
	in_addr_t	dstaddr;
	in_port_t	dstport;
	tcph_t		*tcph;
	int		mss;
	mblk_t		*syn_mp;

	if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) {
		errno = EINVAL;
		return (-1);
	}

	TCP_RUN_TIME_WAIT_COLLECTOR();

	dstaddr = sockets[sock_id].remote.sin_addr.s_addr;
	dstport = sockets[sock_id].remote.sin_port;

	/*
	 * Check for attempt to connect to INADDR_ANY or a non-unicast
	 * address.  We don't have enough info to check for broadcast addr,
	 * except for the all 1 broadcast.
1282 */ 1283 if (dstaddr == INADDR_ANY || IN_CLASSD(ntohl(dstaddr)) || 1284 dstaddr == INADDR_BROADCAST) { 1285 /* 1286 * SunOS 4.x and 4.3 BSD allow an application 1287 * to connect a TCP socket to INADDR_ANY. 1288 * When they do this, the kernel picks the 1289 * address of one interface and uses it 1290 * instead. The kernel usually ends up 1291 * picking the address of the loopback 1292 * interface. This is an undocumented feature. 1293 * However, we provide the same thing here 1294 * in order to have source and binary 1295 * compatibility with SunOS 4.x. 1296 * Update the T_CONN_REQ (sin/sin6) since it is used to 1297 * generate the T_CONN_CON. 1298 * 1299 * Fail this for inetboot TCP. 1300 */ 1301 errno = EINVAL; 1302 return (-1); 1303 } 1304 1305 /* It is not bound to any address yet... */ 1306 if (tcp->tcp_bound_source == INADDR_ANY) { 1307 ipv4_getipaddr(&(sockets[sock_id].bind.sin_addr)); 1308 /* We don't have an address! */ 1309 if (ntohl(sockets[sock_id].bind.sin_addr.s_addr) == 1310 INADDR_ANY) { 1311 errno = EPROTO; 1312 return (-1); 1313 } 1314 tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1315 tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1316 } 1317 1318 /* 1319 * Don't let an endpoint connect to itself. 1320 */ 1321 if (dstaddr == tcp->tcp_ipha->ip_src.s_addr && 1322 dstport == tcp->tcp_lport) { 1323 errno = EINVAL; 1324 return (-1); 1325 } 1326 1327 tcp->tcp_ipha->ip_dst.s_addr = dstaddr; 1328 tcp->tcp_remote = dstaddr; 1329 tcph = tcp->tcp_tcph; 1330 *(uint16_t *)tcph->th_fport = dstport; 1331 tcp->tcp_fport = dstport; 1332 1333 /* 1334 * Don't allow this connection to completely duplicate 1335 * an existing connection. 1336 */ 1337 if (tcp_conn_check(tcp) < 0) { 1338 errno = EADDRINUSE; 1339 return (-1); 1340 } 1341 1342 /* 1343 * Just make sure our rwnd is at 1344 * least tcp_recv_hiwat_mss * MSS 1345 * large, and round up to the nearest 1346 * MSS. 1347 * 1348 * We do the round up here because 1349 * we need to get the interface 1350 * MTU first before we can do the 1351 * round up. 1352 */ 1353 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 1354 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 1355 tcp_recv_hiwat_minmss * mss); 1356 tcp->tcp_rwnd_max = tcp->tcp_rwnd; 1357 SET_WS_VALUE(tcp); 1358 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 1359 tcp->tcp_tcph->th_win); 1360 if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 1361 tcp->tcp_snd_ws_ok = B_TRUE; 1362 1363 /* 1364 * Set tcp_snd_ts_ok to true 1365 * so that tcp_xmit_mp will 1366 * include the timestamp 1367 * option in the SYN segment. 1368 */ 1369 if (tcp_tstamp_always || 1370 (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 1371 tcp->tcp_snd_ts_ok = B_TRUE; 1372 } 1373 1374 if (tcp_sack_permitted == 2 || 1375 tcp->tcp_snd_sack_ok) { 1376 assert(tcp->tcp_sack_info == NULL); 1377 if ((tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 1378 sizeof (tcp_sack_info_t))) == NULL) { 1379 tcp->tcp_snd_sack_ok = B_FALSE; 1380 } else { 1381 tcp->tcp_snd_sack_ok = B_TRUE; 1382 } 1383 } 1384 /* 1385 * Should we use ECN? Note that the current 1386 * default value (SunOS 5.9) of tcp_ecn_permitted 1387 * is 2. The reason for doing this is that there 1388 * are equipments out there that will drop ECN 1389 * enabled IP packets. Setting it to 1 avoids 1390 * compatibility problems. 

	/*
	 * Set tcp_snd_ts_ok to true so that tcp_xmit_mp will include
	 * the timestamp option in the SYN segment.
	 */
	if (tcp_tstamp_always ||
	    (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) {
		tcp->tcp_snd_ts_ok = B_TRUE;
	}

	if (tcp_sack_permitted == 2 ||
	    tcp->tcp_snd_sack_ok) {
		assert(tcp->tcp_sack_info == NULL);
		if ((tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc(
		    sizeof (tcp_sack_info_t))) == NULL) {
			tcp->tcp_snd_sack_ok = B_FALSE;
		} else {
			tcp->tcp_snd_sack_ok = B_TRUE;
		}
	}
	/*
	 * Should we use ECN?  Note that the current default value
	 * (SunOS 5.9) of tcp_ecn_permitted is 2.  The reason for doing
	 * this is that there is equipment out there that will drop
	 * ECN enabled IP packets.  Setting it to 1 avoids compatibility
	 * problems.
	 */
	if (tcp_ecn_permitted == 2)
		tcp->tcp_ecn_ok = B_TRUE;

	tcp_iss_init(tcp);
	TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
	tcp->tcp_active_open = B_TRUE;

	tcp->tcp_state = TCPS_SYN_SENT;
	syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, tcp->tcp_iss, B_FALSE,
	    NULL, B_FALSE);
	if (syn_mp != NULL) {
		int ret;

		/* Dump the packet when debugging. */
		TCP_DUMP_PACKET("tcp_connect", syn_mp);
		/* Send out the SYN packet. */
		ret = ipv4_tcp_output(sock_id, syn_mp);
		freeb(syn_mp);
		/*
		 * errno ETIMEDOUT is set by the mac driver
		 * in case it is not able to receive ARP reply.
		 * TCP will retransmit this segment so we can
		 * ignore the ARP timeout.
		 */
		if ((ret < 0) && (errno != ETIMEDOUT)) {
			return (-1);
		}
		/* tcp_state_wait() will finish the 3 way handshake. */
		return (tcp_state_wait(sock_id, tcp, TCPS_ESTABLISHED));
	} else {
		errno = ENOBUFS;
		return (-1);
	}
}

/*
 * Common accept code.  Called by tcp_conn_request.
 * cr_pkt is the SYN packet.
 */
static int
tcp_accept_comm(tcp_t *listener, tcp_t *acceptor, mblk_t *cr_pkt,
    uint_t ip_hdr_len)
{
	tcph_t	*tcph;

#ifdef DEBUG
	printf("tcp_accept_comm #######################\n");
#endif

	/*
	 * When we get here, we know that the acceptor header template
	 * has already been initialized.
	 * However, it may not match the listener if the listener
	 * includes options...
	 * It may also not match the listener if the listener is v6
	 * and the acceptor is v4.
	 */
	acceptor->tcp_lport = listener->tcp_lport;

	if (listener->tcp_ipversion == acceptor->tcp_ipversion) {
		if (acceptor->tcp_iphc_len != listener->tcp_iphc_len) {
			/*
			 * Listener had options of some sort; acceptor
			 * inherits.  Free up the acceptor template and
			 * allocate one of the right size.
			 */
			bkmem_free(acceptor->tcp_iphc, acceptor->tcp_iphc_len);
			acceptor->tcp_iphc = bkmem_zalloc(
			    listener->tcp_iphc_len);
			if (acceptor->tcp_iphc == NULL) {
				acceptor->tcp_iphc_len = 0;
				return (ENOMEM);
			}
			acceptor->tcp_iphc_len = listener->tcp_iphc_len;
		}
		acceptor->tcp_hdr_len = listener->tcp_hdr_len;
		acceptor->tcp_ip_hdr_len = listener->tcp_ip_hdr_len;
		acceptor->tcp_tcp_hdr_len = listener->tcp_tcp_hdr_len;

		/*
		 * Copy the IP+TCP header template from listener to acceptor
		 */
		bcopy(listener->tcp_iphc, acceptor->tcp_iphc,
		    listener->tcp_hdr_len);
		acceptor->tcp_ipha = (struct ip *)acceptor->tcp_iphc;
		acceptor->tcp_tcph = (tcph_t *)(acceptor->tcp_iphc +
		    acceptor->tcp_ip_hdr_len);
	} else {
		prom_panic("tcp_accept_comm: version not equal");
	}

	/* Copy our new dest and fport from the connection request packet */
	if (acceptor->tcp_ipversion == IPV4_VERSION) {
		struct ip *ipha;

		ipha = (struct ip *)cr_pkt->b_rptr;
		acceptor->tcp_ipha->ip_dst = ipha->ip_src;
		acceptor->tcp_remote = ipha->ip_src.s_addr;
		acceptor->tcp_ipha->ip_src = ipha->ip_dst;
		acceptor->tcp_bound_source = ipha->ip_dst.s_addr;
		tcph = (tcph_t *)&cr_pkt->b_rptr[ip_hdr_len];
	} else {
		prom_panic("tcp_accept_comm: not IPv4");
	}
	bcopy(tcph->th_lport, acceptor->tcp_tcph->th_fport,
	    sizeof (in_port_t));
	bcopy(acceptor->tcp_tcph->th_fport, &acceptor->tcp_fport,
	    sizeof (in_port_t));
	/*
	 * For an all-port proxy listener, the local port is determined by
	 * the port number field in the SYN packet.
	 */
	if (listener->tcp_lport == 0) {
		acceptor->tcp_lport = *(in_port_t *)tcph->th_fport;
		bcopy(tcph->th_fport, acceptor->tcp_tcph->th_lport,
		    sizeof (in_port_t));
	}
	/* Inherit various TCP parameters from the listener */
	acceptor->tcp_naglim = listener->tcp_naglim;
	acceptor->tcp_first_timer_threshold =
	    listener->tcp_first_timer_threshold;
	acceptor->tcp_second_timer_threshold =
	    listener->tcp_second_timer_threshold;

	acceptor->tcp_first_ctimer_threshold =
	    listener->tcp_first_ctimer_threshold;
	acceptor->tcp_second_ctimer_threshold =
	    listener->tcp_second_ctimer_threshold;

	acceptor->tcp_xmit_hiwater = listener->tcp_xmit_hiwater;

	acceptor->tcp_state = TCPS_LISTEN;
	tcp_iss_init(acceptor);

	/* Process all TCP options. */
	tcp_process_options(acceptor, tcph);

	/* Is the other end ECN capable? */
	if (tcp_ecn_permitted >= 1 &&
	    (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
		acceptor->tcp_ecn_ok = B_TRUE;
	}

	/*
	 * listener->tcp_rq->q_hiwat should be the default window size or a
	 * window size changed via SO_RCVBUF option.  First round up the
	 * acceptor's tcp_rwnd to the nearest MSS.  Then find out the window
	 * scale option value if needed.  Call tcp_rwnd_set() to finish the
	 * setting.
	 *
	 * Note if there is a rpipe metric associated with the remote host,
	 * we should not inherit receive window size from listener.
	 */
	acceptor->tcp_rwnd = MSS_ROUNDUP(
	    (acceptor->tcp_rwnd == 0 ? listener->tcp_rwnd_max :
	    acceptor->tcp_rwnd), acceptor->tcp_mss);
	if (acceptor->tcp_snd_ws_ok)
		SET_WS_VALUE(acceptor);
	/*
	 * Note that this is the only place tcp_rwnd_set() is called for
	 * accepting a connection.  We need to call it here instead of
	 * after the 3-way handshake because we need to tell the other
	 * side our rwnd in the SYN-ACK segment.
	 */
	(void) tcp_rwnd_set(acceptor, acceptor->tcp_rwnd);

	return (0);
}

/*
 * Defense for the SYN attack -
 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest
 *    one that doesn't have the dontdrop bit set.
 * 2. Don't drop a SYN request before its first timeout.  This gives every
 *    request at least until the first timeout to complete its 3-way
 *    handshake.
 * 3. The current threshold is: # of timeouts > q0len/4 => SYN alert on;
 *    # of timeouts drops back to <= q0len/32 => SYN alert off.
 */
static boolean_t
tcp_drop_q0(tcp_t *tcp)
{
	tcp_t	*eager;

	assert(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
	/*
	 * New one is added after next_q0 so prev_q0 points to the oldest.
	 * Also do not drop any established connections that are deferred
	 * on q0 due to q being full.
	 */

	eager = tcp->tcp_eager_prev_q0;
	while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) {
		/* XXX should move the eager to the head */
		eager = eager->tcp_eager_prev_q0;
		if (eager == tcp) {
			eager = tcp->tcp_eager_prev_q0;
			break;
		}
	}
	dprintf("tcp_drop_q0: listen half-open queue (max=%d) overflow"
	    " (%d pending) on %s, drop one", tcp_conn_req_max_q0,
	    tcp->tcp_conn_req_cnt_q0,
	    tcp_display(tcp, NULL, DISP_PORT_ONLY));

	BUMP_MIB(tcp_mib.tcpHalfOpenDrop);
	bkmem_free((caddr_t)eager, sizeof (tcp_t));
	return (B_TRUE);
}
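
/*
 * The q0 list referenced above is circular and doubly linked through
 * tcp_eager_next_q0/tcp_eager_prev_q0, with the listener itself as the
 * anchor.  A new eager is inserted right after the listener (see
 * tcp_conn_request() below), so in the next_q0 direction the list reads
 *
 *	listener -> newest eager -> ... -> oldest eager -> listener
 *
 * and the listener's prev_q0 therefore points at the oldest eager.
 */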
1656 */ 1657 if ((eager = (tcp_t *)bkmem_alloc(sizeof (tcp_t))) == NULL) { 1658 freemsg(mp); 1659 errno = ENOBUFS; 1660 return (NULL); 1661 } 1662 if ((errno = tcp_init_values(eager, NULL)) != 0) { 1663 freemsg(mp); 1664 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1665 return (NULL); 1666 } 1667 1668 /* 1669 * Eager connection inherits address form from its listener, 1670 * but its packet form comes from the version of the received 1671 * SYN segment. 1672 */ 1673 eager->tcp_family = tcp->tcp_family; 1674 1675 err = tcp_accept_comm(tcp, eager, mp, ip_hdr_len); 1676 if (err) { 1677 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1678 return (NULL); 1679 } 1680 1681 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1682 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1683 tcp->tcp_eager_next_q0 = eager; 1684 eager->tcp_eager_prev_q0 = tcp; 1685 1686 /* Set tcp_listener before adding it to tcp_conn_fanout */ 1687 eager->tcp_listener = tcp; 1688 tcp->tcp_conn_req_cnt_q0++; 1689 1690 return (eager); 1691 } 1692 1693 /* 1694 * To get around the non-interrupt problem of inetboot. 1695 * Keep on processing packets until a certain state is reached or the 1696 * TCP is destroyed because of getting a RST packet. 1697 */ 1698 static int 1699 tcp_state_wait(int sock_id, tcp_t *tcp, int state) 1700 { 1701 int i; 1702 struct inetgram *in_gram; 1703 mblk_t *mp; 1704 int timeout; 1705 boolean_t changed = B_FALSE; 1706 1707 /* 1708 * We need to make sure that the MAC does not wait longer 1709 * than RTO for any packet so that TCP can do retransmission. 1710 * But if the MAC timeout is less than tcp_rto, we are fine 1711 * and do not need to change it. 1712 */ 1713 timeout = sockets[sock_id].in_timeout; 1714 if (timeout > tcp->tcp_rto) { 1715 sockets[sock_id].in_timeout = tcp->tcp_rto; 1716 changed = B_TRUE; 1717 } 1718 retry: 1719 if (sockets[sock_id].inq == NULL) { 1720 /* Go out and check the wire */ 1721 for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) { 1722 if (sockets[sock_id].input[i] != NULL) { 1723 if (sockets[sock_id].input[i](sock_id) < 0) { 1724 if (changed) { 1725 sockets[sock_id].in_timeout = 1726 timeout; 1727 } 1728 return (-1); 1729 } 1730 } 1731 } 1732 } 1733 1734 while ((in_gram = sockets[sock_id].inq) != NULL) { 1735 if (tcp->tcp_state == state) 1736 break; 1737 1738 /* Remove unknown inetgrams from the head of inq. */ 1739 if (in_gram->igm_level != TRANSPORT_LVL) { 1740 #ifdef DEBUG 1741 printf("tcp_state_wait for state %d: unexpected " 1742 "packet level %d frame found\n", state, 1743 in_gram->igm_level); 1744 #endif 1745 del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 1746 continue; 1747 } 1748 mp = in_gram->igm_mp; 1749 del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 1750 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 1751 tcp_rput_data(tcp, mp, sock_id); 1752 1753 /* 1754 * The other side may have closed this connection or 1755 * RST us. But we need to continue to process other 1756 * packets in the socket's queue because they may be 1757 * belong to another TCP connections. 1758 */ 1759 if (sockets[sock_id].pcb == NULL) { 1760 tcp = NULL; 1761 } 1762 } 1763 1764 /* If the other side has closed the connection, just return. 
	if (tcp == NULL || sockets[sock_id].pcb == NULL) {
#ifdef DEBUG
		printf("tcp_state_wait other side dead: state %d "
		    "error %d\n", state, sockets[sock_id].so_error);
#endif
		if (sockets[sock_id].so_error != 0)
			return (-1);
		else
			return (0);
	}
	/*
	 * TCPS_ALL_ACKED is not a valid TCP state, it is just used as an
	 * indicator to tcp_state_wait to mean that it is being called
	 * to wait till we have received acks for all the new segments sent.
	 */
	if ((state == TCPS_ALL_ACKED) && (tcp->tcp_suna == tcp->tcp_snxt)) {
		goto done;
	}
	if (tcp->tcp_state != state) {
		if (prom_gettime() > tcp->tcp_rto_timeout)
			tcp_timer(tcp, sock_id);
		goto retry;
	}
done:
	if (changed)
		sockets[sock_id].in_timeout = timeout;

	tcp_drain_needed(sock_id, tcp);
	return (0);
}

/* Verify the checksum of a segment. */
static int
tcp_verify_cksum(mblk_t *mp)
{
	struct ip	*iph;
	tcpha_t		*tcph;
	int		len;
	uint16_t	old_sum;

	iph = (struct ip *)mp->b_rptr;
	tcph = (tcpha_t *)(iph + 1);
	len = ntohs(iph->ip_len);

	/*
	 * Calculate the TCP checksum.  Need to include the pseudo header,
	 * which is similar to the real IP header starting at the TTL field.
	 */
	iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH);
	old_sum = tcph->tha_sum;
	tcph->tha_sum = 0;
	iph->ip_ttl = 0;
	if (old_sum == tcp_cksum((uint16_t *)&(iph->ip_ttl),
	    len - IP_SIMPLE_HDR_LENGTH + 12)) {
		return (0);
	} else {
		tcp_cksum_errors++;
		return (-1);
	}
}
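
/*
 * The 12 bytes added above cover the pseudo header overlaid on the real
 * IPv4 header: ip_ttl (zeroed to stand in for the pseudo header's zero
 * byte), ip_p, ip_sum (overwritten with the TCP segment length) and the
 * 4-byte source and destination addresses, followed by the TCP header
 * and payload.
 */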
*/
1884 static tcp_t *
1885 tcp_lookup_eager_ipv4(tcp_t *listener, struct ip *iph, tcpha_t *tcph)
1886 {
1887 tcp_t *tcp;
1888
1889 #ifdef DEBUG
1890 printf("tcp_lookup_eager_ipv4 ###############\n");
1891 #endif
1892 for (tcp = listener->tcp_eager_next_q; tcp != NULL;
1893 tcp = tcp->tcp_eager_next_q) {
1894 if (tcph->tha_lport == tcp->tcp_fport &&
1895 tcph->tha_fport == tcp->tcp_lport &&
1896 iph->ip_src.s_addr == tcp->tcp_remote &&
1897 iph->ip_dst.s_addr == tcp->tcp_bound_source) {
1898 return (tcp);
1899 }
1900 }
1901
1902 for (tcp = listener->tcp_eager_next_q0; tcp != listener;
1903 tcp = tcp->tcp_eager_next_q0) {
1904 if (tcph->tha_lport == tcp->tcp_fport &&
1905 tcph->tha_fport == tcp->tcp_lport &&
1906 iph->ip_src.s_addr == tcp->tcp_remote &&
1907 iph->ip_dst.s_addr == tcp->tcp_bound_source) {
1908 return (tcp);
1909 }
1910 }
1911 #ifdef DEBUG
1912 printf("No eager found\n");
1913 #endif
1914 return (NULL);
1915 }
1916
1917 /* To destroy a TCP control block. */
1918 static void
1919 tcp_clean_death(int sock_id, tcp_t *tcp, int err)
1920 {
1921 tcp_free(tcp);
1922 if (tcp->tcp_state == TCPS_TIME_WAIT)
1923 tcp_time_wait_remove(tcp);
1924
1925 if (sock_id >= 0) {
1926 sockets[sock_id].pcb = NULL;
1927 if (err != 0)
1928 sockets[sock_id].so_error = err;
1929 }
1930 bkmem_free((caddr_t)tcp, sizeof (tcp_t));
1931 }
1932
1933 /*
1934 * tcp_rwnd_set() is called to adjust the receive window to a desired value.
1935 * We do not allow the receive window to shrink. After setting rwnd,
1936 * set the flow control hiwat of the stream.
1937 *
1938 * This function is called in 2 cases:
1939 *
1940 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a
1941 * connection (passive open) and in tcp_rput_data() for active connect.
1942 * This is called after tcp_mss_set() when the desired MSS value is known.
1943 * This makes sure that our window size is a multiple of the other side's
1944 * MSS.
1945 * 2) Handling SO_RCVBUF option.
1946 *
1947 * It is ASSUMED that the requested size is a multiple of the current MSS.
1948 *
1949 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
1950 * user requests it.
1951 */
1952 static int
1953 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
1954 {
1955 uint32_t mss = tcp->tcp_mss;
1956 uint32_t old_max_rwnd;
1957 uint32_t max_transmittable_rwnd;
1958
1959 if (tcp->tcp_rwnd_max != 0)
1960 old_max_rwnd = tcp->tcp_rwnd_max;
1961 else
1962 old_max_rwnd = tcp->tcp_rwnd;
1963
1964 /*
1965 * Insist on a receive window that is at least
1966 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
1967 * funny TCP interactions of Nagle algorithm, SWS avoidance
1968 * and delayed acknowledgement.
1969 */
1970 rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss);
1971
1972 /*
1973 * If window size info has already been exchanged, TCP should not
1974 * shrink the window. Shrinking window is doable if done carefully.
1975 * We may add that support later. But so far there is not a real
1976 * need to do that.
1977 */
1978 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) {
1979 /* MSS may have changed, do a round up again. */
1980 rwnd = MSS_ROUNDUP(old_max_rwnd, mss);
1981 }
1982
1983 /*
1984 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
1985 * can be applied even before the window scale option is decided.
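 *
 * (Editor's illustration with hypothetical numbers, not from the
 * original source: with mss = 1460 and tcp_rcv_ws = 2, the code below
 * computes
 *
 *	max_transmittable_rwnd = TCP_MAXWIN << 2 = 65535 << 2 = 262140
 *	rwnd = 262140 - (262140 % 1460) = 261340
 *
 * so the advertised window stays an exact multiple of the MSS and
 * never exceeds what the 16-bit window field can carry once scaled.)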
1986 */ 1987 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 1988 if (rwnd > max_transmittable_rwnd) { 1989 rwnd = max_transmittable_rwnd - 1990 (max_transmittable_rwnd % mss); 1991 if (rwnd < mss) 1992 rwnd = max_transmittable_rwnd; 1993 /* 1994 * If we're over the limit we may have to back down tcp_rwnd. 1995 * The increment below won't work for us. So we set all three 1996 * here and the increment below will have no effect. 1997 */ 1998 tcp->tcp_rwnd = old_max_rwnd = rwnd; 1999 } 2000 2001 /* 2002 * Increment the current rwnd by the amount the maximum grew (we 2003 * can not overwrite it since we might be in the middle of a 2004 * connection.) 2005 */ 2006 tcp->tcp_rwnd += rwnd - old_max_rwnd; 2007 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 2008 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 2009 tcp->tcp_cwnd_max = rwnd; 2010 tcp->tcp_rwnd_max = rwnd; 2011 2012 return (rwnd); 2013 } 2014 2015 /* 2016 * Extract option values from a tcp header. We put any found values into the 2017 * tcpopt struct and return a bitmask saying which options were found. 2018 */ 2019 static int 2020 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 2021 { 2022 uchar_t *endp; 2023 int len; 2024 uint32_t mss; 2025 uchar_t *up = (uchar_t *)tcph; 2026 int found = 0; 2027 int32_t sack_len; 2028 tcp_seq sack_begin, sack_end; 2029 tcp_t *tcp; 2030 2031 endp = up + TCP_HDR_LENGTH(tcph); 2032 up += TCP_MIN_HEADER_LENGTH; 2033 while (up < endp) { 2034 len = endp - up; 2035 switch (*up) { 2036 case TCPOPT_EOL: 2037 break; 2038 2039 case TCPOPT_NOP: 2040 up++; 2041 continue; 2042 2043 case TCPOPT_MAXSEG: 2044 if (len < TCPOPT_MAXSEG_LEN || 2045 up[1] != TCPOPT_MAXSEG_LEN) 2046 break; 2047 2048 mss = BE16_TO_U16(up+2); 2049 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 2050 tcpopt->tcp_opt_mss = mss; 2051 found |= TCP_OPT_MSS_PRESENT; 2052 2053 up += TCPOPT_MAXSEG_LEN; 2054 continue; 2055 2056 case TCPOPT_WSCALE: 2057 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 2058 break; 2059 2060 if (up[2] > TCP_MAX_WINSHIFT) 2061 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 2062 else 2063 tcpopt->tcp_opt_wscale = up[2]; 2064 found |= TCP_OPT_WSCALE_PRESENT; 2065 2066 up += TCPOPT_WS_LEN; 2067 continue; 2068 2069 case TCPOPT_SACK_PERMITTED: 2070 if (len < TCPOPT_SACK_OK_LEN || 2071 up[1] != TCPOPT_SACK_OK_LEN) 2072 break; 2073 found |= TCP_OPT_SACK_OK_PRESENT; 2074 up += TCPOPT_SACK_OK_LEN; 2075 continue; 2076 2077 case TCPOPT_SACK: 2078 if (len <= 2 || up[1] <= 2 || len < up[1]) 2079 break; 2080 2081 /* If TCP is not interested in SACK blks... */ 2082 if ((tcp = tcpopt->tcp) == NULL) { 2083 up += up[1]; 2084 continue; 2085 } 2086 sack_len = up[1] - TCPOPT_HEADER_LEN; 2087 up += TCPOPT_HEADER_LEN; 2088 2089 /* 2090 * If the list is empty, allocate one and assume 2091 * nothing is sack'ed. 2092 */ 2093 assert(tcp->tcp_sack_info != NULL); 2094 if (tcp->tcp_notsack_list == NULL) { 2095 tcp_notsack_update(&(tcp->tcp_notsack_list), 2096 tcp->tcp_suna, tcp->tcp_snxt, 2097 &(tcp->tcp_num_notsack_blk), 2098 &(tcp->tcp_cnt_notsack_list)); 2099 2100 /* 2101 * Make sure tcp_notsack_list is not NULL. 2102 * This happens when kmem_alloc(KM_NOSLEEP) 2103 * returns NULL. 
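 *
 * (Editor's note: on that allocation failure the code below simply
 * skips over this SACK option and keeps parsing, trading lost SACK
 * information for forward progress instead of dropping the segment.)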
2104 */
2105 if (tcp->tcp_notsack_list == NULL) {
2106 up += sack_len;
2107 continue;
2108 }
2109 tcp->tcp_fack = tcp->tcp_suna;
2110 }
2111
2112 while (sack_len > 0) {
2113 if (up + 8 > endp) {
2114 up = endp;
2115 break;
2116 }
2117 sack_begin = BE32_TO_U32(up);
2118 up += 4;
2119 sack_end = BE32_TO_U32(up);
2120 up += 4;
2121 sack_len -= 8;
2122 /*
2123 * Bounds checking. Make sure the SACK
2124 * info is within tcp_suna and tcp_snxt.
2125 * If this SACK blk is out of bound, ignore
2126 * it but continue to parse the following
2127 * blks.
2128 */
2129 if (SEQ_LEQ(sack_end, sack_begin) ||
2130 SEQ_LT(sack_begin, tcp->tcp_suna) ||
2131 SEQ_GT(sack_end, tcp->tcp_snxt)) {
2132 continue;
2133 }
2134 tcp_notsack_insert(&(tcp->tcp_notsack_list),
2135 sack_begin, sack_end,
2136 &(tcp->tcp_num_notsack_blk),
2137 &(tcp->tcp_cnt_notsack_list));
2138 if (SEQ_GT(sack_end, tcp->tcp_fack)) {
2139 tcp->tcp_fack = sack_end;
2140 }
2141 }
2142 found |= TCP_OPT_SACK_PRESENT;
2143 continue;
2144
2145 case TCPOPT_TSTAMP:
2146 if (len < TCPOPT_TSTAMP_LEN ||
2147 up[1] != TCPOPT_TSTAMP_LEN)
2148 break;
2149
2150 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
2151 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
2152
2153 found |= TCP_OPT_TSTAMP_PRESENT;
2154
2155 up += TCPOPT_TSTAMP_LEN;
2156 continue;
2157
2158 default:
2159 if (len <= 1 || len < (int)up[1] || up[1] == 0)
2160 break;
2161 up += up[1];
2162 continue;
2163 }
2164 break;
2165 }
2166 return (found);
2167 }
2168
2169 /*
2170 * Set the mss associated with a particular tcp based on its current value,
2171 * and a new one passed in. Observe minimums and maximums, and reset
2172 * other state variables that we want to view as multiples of mss.
2173 *
2174 * This function is called in various places, mainly because:
2175 * 1) Various things, such as tcp_mss and tcp_cwnd, need to be adjusted
2176 * when the other side's SYN/SYN-ACK packet arrives.
2177 * 2) PMTUd may get us a new MSS.
2178 * 3) If the other side stops sending us timestamp options, we need to
2179 * increase the MSS size to use the extra bytes available.
2180 */
2181 static void
2182 tcp_mss_set(tcp_t *tcp, uint32_t mss)
2183 {
2184 uint32_t mss_max;
2185
2186 mss_max = tcp_mss_max_ipv4;
2187
2188 if (mss < tcp_mss_min)
2189 mss = tcp_mss_min;
2190 if (mss > mss_max)
2191 mss = mss_max;
2192 /*
2193 * Unless naglim has been set by our client to
2194 * a non-mss value, force naglim to track mss.
2195 * This can help to aggregate small writes.
2196 */
2197 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
2198 tcp->tcp_naglim = mss;
2199 /*
2200 * TCP should be able to buffer at least 4 MSS worth of data for
2201 * obvious performance reasons.
2202 */
2203 if ((mss << 2) > tcp->tcp_xmit_hiwater)
2204 tcp->tcp_xmit_hiwater = mss << 2;
2205 tcp->tcp_mss = mss;
2206 /*
2207 * Initialize cwnd according to draft-floyd-incr-init-win-01.txt.
2208 * Previously, we used tcp_slow_start_initial to control the size
2209 * of the initial cwnd. Now, when tcp_slow_start_initial * mss
2210 * is smaller than the cwnd calculated from the formula suggested in
2211 * the draft, we use tcp_slow_start_initial * mss as the cwnd.
2212 * Otherwise, use the cwnd from the draft's formula. The default
2213 * of tcp_slow_start_initial is 2.
2214 */
2215 tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss,
2216 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss)));
2217 tcp->tcp_cwnd_cnt = 0;
2218 }
2219
2220 /*
2221 * Process all TCP options in a SYN segment.
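 *
 * (Editor's sketch of the wire format assumed here, per RFC 793: each
 * non-NOP option is a (kind, length, data) triple, so an MSS option
 * arrives as
 *
 *	kind = 2 (TCPOPT_MAXSEG), len = 4, then the 16-bit MSS in
 *	network byte order
 *
 * which tcp_parse_options() above extracts with BE16_TO_U16(up + 2).)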
2222 *
2223 * This function sets up the correct tcp_mss value according to the
2224 * MSS option value and our header size. It also sets up the window scale
2225 * and timestamp values, and initializes SACK info blocks. But it does not
2226 * change the receive window size after setting the tcp_mss value. The
2227 * caller should make the appropriate change.
2228 */
2229 void
2230 tcp_process_options(tcp_t *tcp, tcph_t *tcph)
2231 {
2232 int options;
2233 tcp_opt_t tcpopt;
2234 uint32_t mss_max;
2235 char *tmp_tcph;
2236
2237 tcpopt.tcp = NULL;
2238 options = tcp_parse_options(tcph, &tcpopt);
2239
2240 /*
2241 * Process MSS option. Note that MSS option value does not account
2242 * for IP or TCP options. This means that it is equal to MTU - minimum
2243 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for
2244 * IPv6.
2245 */
2246 if (!(options & TCP_OPT_MSS_PRESENT)) {
2247 tcpopt.tcp_opt_mss = tcp_mss_def_ipv4;
2248 } else {
2249 if (tcp->tcp_ipversion == IPV4_VERSION)
2250 mss_max = tcp_mss_max_ipv4;
2251 if (tcpopt.tcp_opt_mss < tcp_mss_min)
2252 tcpopt.tcp_opt_mss = tcp_mss_min;
2253 else if (tcpopt.tcp_opt_mss > mss_max)
2254 tcpopt.tcp_opt_mss = mss_max;
2255 }
2256
2257 /* Process Window Scale option. */
2258 if (options & TCP_OPT_WSCALE_PRESENT) {
2259 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale;
2260 tcp->tcp_snd_ws_ok = B_TRUE;
2261 } else {
2262 tcp->tcp_snd_ws = B_FALSE;
2263 tcp->tcp_snd_ws_ok = B_FALSE;
2264 tcp->tcp_rcv_ws = B_FALSE;
2265 }
2266
2267 /* Process Timestamp option. */
2268 if ((options & TCP_OPT_TSTAMP_PRESENT) &&
2269 (tcp->tcp_snd_ts_ok || !tcp->tcp_active_open)) {
2270 tmp_tcph = (char *)tcp->tcp_tcph;
2271
2272 tcp->tcp_snd_ts_ok = B_TRUE;
2273 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
2274 tcp->tcp_last_rcv_lbolt = prom_gettime();
2275 assert(OK_32PTR(tmp_tcph));
2276 assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
2277
2278 /* Fill in our template header with basic timestamp option. */
2279 tmp_tcph += tcp->tcp_tcp_hdr_len;
2280 tmp_tcph[0] = TCPOPT_NOP;
2281 tmp_tcph[1] = TCPOPT_NOP;
2282 tmp_tcph[2] = TCPOPT_TSTAMP;
2283 tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
2284 tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN;
2285 tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN;
2286 tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4);
2287 } else {
2288 tcp->tcp_snd_ts_ok = B_FALSE;
2289 }
2290
2291 /*
2292 * Process SACK options. If SACK is enabled for this connection,
2293 * then allocate the SACK info structure.
2294 */
2295 if ((options & TCP_OPT_SACK_OK_PRESENT) &&
2296 (tcp->tcp_snd_sack_ok ||
2297 (tcp_sack_permitted != 0 && !tcp->tcp_active_open))) {
2298 /* This should be true only in the passive case. */
2299 if (tcp->tcp_sack_info == NULL) {
2300 tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc(
2301 sizeof (tcp_sack_info_t));
2302 }
2303 if (tcp->tcp_sack_info == NULL) {
2304 tcp->tcp_snd_sack_ok = B_FALSE;
2305 } else {
2306 tcp->tcp_snd_sack_ok = B_TRUE;
2307 if (tcp->tcp_snd_ts_ok) {
2308 tcp->tcp_max_sack_blk = 3;
2309 } else {
2310 tcp->tcp_max_sack_blk = 4;
2311 }
2312 }
2313 } else {
2314 /*
2315 * Resetting tcp_snd_sack_ok to B_FALSE so that
2316 * no SACK info will be used for this
2317 * connection. This assumes that SACK usage
2318 * permission is negotiated. This may need
2319 * to be changed once this is clarified.
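 *
 * (Editor's note on the 3-vs-4 limit chosen above, derived from the
 * 40-byte TCP option space: a SACK option carrying n blocks occupies
 * 2 + 8n bytes. Without timestamps, 2 + 8*4 = 34 <= 40, so four
 * blocks fit; with the 12-byte NOP-NOP-timestamp prefix only 28 bytes
 * remain, and 2 + 8*3 = 26 is the most that still fits.)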
2320 */
2321 if (tcp->tcp_sack_info != NULL) {
2322 bkmem_free((caddr_t)tcp->tcp_sack_info,
2323 sizeof (tcp_sack_info_t));
2324 tcp->tcp_sack_info = NULL;
2325 }
2326 tcp->tcp_snd_sack_ok = B_FALSE;
2327 }
2328
2329 /*
2330 * Now we know the exact TCP/IP header length, subtract
2331 * that from tcp_mss to get our side's MSS.
2332 */
2333 tcp->tcp_mss -= tcp->tcp_hdr_len;
2334 /*
2335 * Here we assume that the other side's header size will be equal to
2336 * our header size. We calculate the real MSS accordingly. Need to
2337 * also take into account the additional things IPsec puts in.
2338 *
2339 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
2340 */
2341 tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len -
2342 (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH);
2343
2344 /*
2345 * Set MSS to the smaller one of both ends of the connection.
2346 * We should not have called tcp_mss_set() before, but our
2347 * side of the MSS should have been set to a proper value
2348 * by tcp_adapt_ire(). tcp_mss_set() will also set up the
2349 * STREAM head parameters properly.
2350 *
2351 * If we have a larger-than-16-bit window but the other side
2352 * didn't want to do window scale, tcp_rwnd_set() will take
2353 * care of that.
2354 */
2355 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
2356 }
2357
2358 /*
2359 * This function does PAWS protection check. Returns B_TRUE if the
2360 * segment passes the PAWS test, else returns B_FALSE.
2361 */
2362 boolean_t
2363 tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
2364 {
2365 uint8_t flags;
2366 int options;
2367 uint8_t *up;
2368
2369 flags = (unsigned int)tcph->th_flags[0] & 0xFF;
2370 /*
2371 * If timestamp option is aligned nicely, get values inline,
2372 * otherwise call general routine to parse. Only do that
2373 * if timestamp is the only option.
2374 */
2375 if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH +
2376 TCPOPT_REAL_TS_LEN &&
2377 OK_32PTR((up = ((uint8_t *)tcph) +
2378 TCP_MIN_HEADER_LENGTH)) &&
2379 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
2380 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
2381 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));
2382
2383 options = TCP_OPT_TSTAMP_PRESENT;
2384 } else {
2385 if (tcp->tcp_snd_sack_ok) {
2386 tcpoptp->tcp = tcp;
2387 } else {
2388 tcpoptp->tcp = NULL;
2389 }
2390 options = tcp_parse_options(tcph, tcpoptp);
2391 }
2392
2393 if (options & TCP_OPT_TSTAMP_PRESENT) {
2394 /*
2395 * Do PAWS per RFC 1323 section 4.2. Accept RST
2396 * regardless of the timestamp, page 18 RFC 1323.bis.
2397 */
2398 if ((flags & TH_RST) == 0 &&
2399 TSTMP_LT(tcpoptp->tcp_opt_ts_val,
2400 tcp->tcp_ts_recent)) {
2401 if (TSTMP_LT(prom_gettime(),
2402 tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
2403 /* This segment is not acceptable. */
2404 return (B_FALSE);
2405 } else {
2406 /*
2407 * Connection has been idle for
2408 * too long. Reset the timestamp
2409 * and assume the segment is valid.
2410 */
2411 tcp->tcp_ts_recent =
2412 tcpoptp->tcp_opt_ts_val;
2413 }
2414 }
2415 } else {
2416 /*
2417 * If we don't get a timestamp on every packet, we
2418 * figure we can't really trust 'em, so we stop sending
2419 * and parsing them.
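 *
 * (Editor's note on the arithmetic below: dropping the timestamp
 * option frees TCPOPT_REAL_TS_LEN = 12 header bytes, i.e. three
 * 32-bit words, which is why th_offset_and_rsrvd is decreased by
 * (3 << 4) and the usable MSS grows by the same 12 bytes.)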
2420 */ 2421 tcp->tcp_snd_ts_ok = B_FALSE; 2422 2423 tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2424 tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2425 tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 2426 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 2427 if (tcp->tcp_snd_sack_ok) { 2428 assert(tcp->tcp_sack_info != NULL); 2429 tcp->tcp_max_sack_blk = 4; 2430 } 2431 } 2432 return (B_TRUE); 2433 } 2434 2435 /* 2436 * tcp_get_seg_mp() is called to get the pointer to a segment in the 2437 * send queue which starts at the given seq. no. 2438 * 2439 * Parameters: 2440 * tcp_t *tcp: the tcp instance pointer. 2441 * uint32_t seq: the starting seq. no of the requested segment. 2442 * int32_t *off: after the execution, *off will be the offset to 2443 * the returned mblk which points to the requested seq no. 2444 * 2445 * Return: 2446 * A mblk_t pointer pointing to the requested segment in send queue. 2447 */ 2448 static mblk_t * 2449 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 2450 { 2451 int32_t cnt; 2452 mblk_t *mp; 2453 2454 /* Defensive coding. Make sure we don't send incorrect data. */ 2455 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt) || 2456 off == NULL) { 2457 return (NULL); 2458 } 2459 cnt = seq - tcp->tcp_suna; 2460 mp = tcp->tcp_xmit_head; 2461 while (cnt > 0 && mp) { 2462 cnt -= mp->b_wptr - mp->b_rptr; 2463 if (cnt < 0) { 2464 cnt += mp->b_wptr - mp->b_rptr; 2465 break; 2466 } 2467 mp = mp->b_cont; 2468 } 2469 assert(mp != NULL); 2470 *off = cnt; 2471 return (mp); 2472 } 2473 2474 /* 2475 * This function handles all retransmissions if SACK is enabled for this 2476 * connection. First it calculates how many segments can be retransmitted 2477 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 2478 * segments. A segment is eligible if sack_cnt for that segment is greater 2479 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 2480 * all eligible segments, it checks to see if TCP can send some new segments 2481 * (fast recovery). If it can, it returns 1. Otherwise it returns 0. 2482 * 2483 * Parameters: 2484 * tcp_t *tcp: the tcp structure of the connection. 2485 * 2486 * Return: 2487 * 1 if the pipe is not full (new data can be sent), 0 otherwise 2488 */ 2489 static int32_t 2490 tcp_sack_rxmit(tcp_t *tcp, int sock_id) 2491 { 2492 notsack_blk_t *notsack_blk; 2493 int32_t usable_swnd; 2494 int32_t mss; 2495 uint32_t seg_len; 2496 mblk_t *xmit_mp; 2497 2498 assert(tcp->tcp_sack_info != NULL); 2499 assert(tcp->tcp_notsack_list != NULL); 2500 assert(tcp->tcp_rexmit == B_FALSE); 2501 2502 /* Defensive coding in case there is a bug... */ 2503 if (tcp->tcp_notsack_list == NULL) { 2504 return (0); 2505 } 2506 notsack_blk = tcp->tcp_notsack_list; 2507 mss = tcp->tcp_mss; 2508 2509 /* 2510 * Limit the num of outstanding data in the network to be 2511 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 2512 */ 2513 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 2514 2515 /* At least retransmit 1 MSS of data. */ 2516 if (usable_swnd <= 0) { 2517 usable_swnd = mss; 2518 } 2519 2520 /* Make sure no new RTT samples will be taken. 
*/
2521 tcp->tcp_csuna = tcp->tcp_snxt;
2522
2523 notsack_blk = tcp->tcp_notsack_list;
2524 while (usable_swnd > 0) {
2525 mblk_t *snxt_mp, *tmp_mp;
2526 tcp_seq begin = tcp->tcp_sack_snxt;
2527 tcp_seq end;
2528 int32_t off;
2529
2530 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
2531 if (SEQ_GT(notsack_blk->end, begin) &&
2532 (notsack_blk->sack_cnt >=
2533 tcp_dupack_fast_retransmit)) {
2534 end = notsack_blk->end;
2535 if (SEQ_LT(begin, notsack_blk->begin)) {
2536 begin = notsack_blk->begin;
2537 }
2538 break;
2539 }
2540 }
2541 /*
2542 * All holes are filled. Manipulate tcp_cwnd to send more
2543 * if we can. Note that after the SACK recovery, tcp_cwnd is
2544 * set to tcp_cwnd_ssthresh.
2545 */
2546 if (notsack_blk == NULL) {
2547 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
2548 if (usable_swnd <= 0) {
2549 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
2550 assert(tcp->tcp_cwnd > 0);
2551 return (0);
2552 } else {
2553 usable_swnd = usable_swnd / mss;
2554 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
2555 MAX(usable_swnd * mss, mss);
2556 return (1);
2557 }
2558 }
2559
2560 /*
2561 * Note that we may send more than usable_swnd allows here
2562 * because of rounding, but no more than 1 MSS of data.
2563 */
2564 seg_len = end - begin;
2565 if (seg_len > mss)
2566 seg_len = mss;
2567 snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
2568 assert(snxt_mp != NULL);
2569 /* This should not happen. Defensive coding again... */
2570 if (snxt_mp == NULL) {
2571 return (0);
2572 }
2573
2574 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
2575 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
2576
2577 if (xmit_mp == NULL)
2578 return (0);
2579
2580 usable_swnd -= seg_len;
2581 tcp->tcp_pipe += seg_len;
2582 tcp->tcp_sack_snxt = begin + seg_len;
2583 TCP_DUMP_PACKET("tcp_sack_rxmit", xmit_mp);
2584 (void) ipv4_tcp_output(sock_id, xmit_mp);
2585 freeb(xmit_mp);
2586
2587 /*
2588 * Update the send timestamp to avoid false retransmission.
2589 * Note. use uintptr_t to suppress the gcc warning.
2590 */
2591 snxt_mp->b_prev = (mblk_t *)(uintptr_t)prom_gettime();
2592
2593 BUMP_MIB(tcp_mib.tcpRetransSegs);
2594 UPDATE_MIB(tcp_mib.tcpRetransBytes, seg_len);
2595 BUMP_MIB(tcp_mib.tcpOutSackRetransSegs);
2596 /*
2597 * Update tcp_rexmit_max to extend this SACK recovery phase.
2598 * This happens when new data sent during fast recovery is
2599 * also lost. If TCP retransmits that new data, it needs
2600 * to extend the SACK recovery phase to avoid starting another
2601 * fast retransmit/recovery unnecessarily.
2602 */
2603 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
2604 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
2605 }
2606 }
2607 return (0);
2608 }
2609
2610 static void
2611 tcp_rput_data(tcp_t *tcp, mblk_t *mp, int sock_id)
2612 {
2613 uchar_t *rptr;
2614 struct ip *iph;
2615 tcp_t *tcp1;
2616 tcpha_t *tcph;
2617 uint32_t seg_ack;
2618 int seg_len;
2619 uint_t ip_hdr_len;
2620 uint32_t seg_seq;
2621 mblk_t *mp1;
2622 uint_t flags;
2623 uint32_t new_swnd = 0;
2624 int mss;
2625 boolean_t ofo_seg = B_FALSE; /* Out of order segment */
2626 int32_t gap;
2627 int32_t rgap;
2628 tcp_opt_t tcpopt;
2629 int32_t bytes_acked;
2630 int npkt;
2631 uint32_t cwnd;
2632 uint32_t add;
2633
2634 #ifdef DEBUG
2635 printf("tcp_rput_data sock %d mp %x mp_datap %x #################\n",
2636 sock_id, mp, mp->b_datap);
2637 #endif
2638
2639 /* Dump the packet when debugging.
*/
2640 TCP_DUMP_PACKET("tcp_rput_data", mp);
2641
2642 assert(OK_32PTR(mp->b_rptr));
2643
2644 rptr = mp->b_rptr;
2645 iph = (struct ip *)rptr;
2646 ip_hdr_len = IPH_HDR_LENGTH(rptr);
2647 if (ip_hdr_len != IP_SIMPLE_HDR_LENGTH) {
2648 #ifdef DEBUG
2649 printf("Not simple IP header\n");
2650 #endif
2651 /* We cannot handle IP options yet... */
2652 tcp_drops++;
2653 freeb(mp);
2654 return;
2655 }
2656 /* The TCP header must be aligned. */
2657 tcph = (tcpha_t *)&rptr[ip_hdr_len];
2658 seg_seq = ntohl(tcph->tha_seq);
2659 seg_ack = ntohl(tcph->tha_ack);
2660 assert((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
2661 seg_len = (int)(mp->b_wptr - rptr) -
2662 (ip_hdr_len + TCP_HDR_LENGTH(((tcph_t *)tcph)));
2663 /* In inetboot, b_cont should always be NULL. */
2664 assert(mp->b_cont == NULL);
2665
2666 /* Verify the checksum. */
2667 if (tcp_verify_cksum(mp) < 0) {
2668 #ifdef DEBUG
2669 printf("tcp_rput_data: wrong cksum\n");
2670 #endif
2671 freemsg(mp);
2672 return;
2673 }
2674
2675 /*
2676 * This segment is not for us; try to find its
2677 * intended receiver.
2678 */
2679 if (tcp == NULL ||
2680 tcph->tha_lport != tcp->tcp_fport ||
2681 tcph->tha_fport != tcp->tcp_lport ||
2682 iph->ip_src.s_addr != tcp->tcp_remote ||
2683 iph->ip_dst.s_addr != tcp->tcp_bound_source) {
2684 #ifdef DEBUG
2685 printf("tcp_rput_data: not for us, state %d\n",
2686 tcp->tcp_state);
2687 #endif
2688 /*
2689 * First try to find an established connection. If none
2690 * is found, look for a listener.
2691 *
2692 * If a listener is found, we need to check to see if the
2693 * incoming segment is for one of its eagers. If it is,
2694 * give it to the eager. If not, listener should take care
2695 * of it.
2696 */
2697 if ((tcp1 = tcp_lookup_ipv4(iph, tcph, TCPS_SYN_SENT,
2698 &sock_id)) != NULL ||
2699 (tcp1 = tcp_lookup_listener_ipv4(iph->ip_dst.s_addr,
2700 tcph->tha_fport, &sock_id)) != NULL) {
2701 if (tcp1->tcp_state == TCPS_LISTEN) {
2702 if ((tcp = tcp_lookup_eager_ipv4(tcp1,
2703 iph, tcph)) == NULL) {
2704 /* No eager... sent to listener */
2705 #ifdef DEBUG
2706 printf("found the listener: %s\n",
2707 tcp_display(tcp1, NULL,
2708 DISP_ADDR_AND_PORT));
2709 #endif
2710 tcp = tcp1;
2711 }
2712 #ifdef DEBUG
2713 else {
2714 printf("found the eager: %s\n",
2715 tcp_display(tcp, NULL,
2716 DISP_ADDR_AND_PORT));
2717 }
2718 #endif
2719 } else {
2720 /* Non listener found... */
2721 #ifdef DEBUG
2722 printf("found the connection: %s\n",
2723 tcp_display(tcp1, NULL,
2724 DISP_ADDR_AND_PORT));
2725 #endif
2726 tcp = tcp1;
2727 }
2728 } else {
2729 /*
2730 * No connection for this segment...
2731 * Send a RST to the other side.
2732 */
2733 tcp_xmit_listeners_reset(sock_id, mp, ip_hdr_len);
2734 return;
2735 }
2736 }
2737
2738 flags = tcph->tha_flags & 0xFF;
2739 BUMP_MIB(tcp_mib.tcpInSegs);
2740 if (tcp->tcp_state == TCPS_TIME_WAIT) {
2741 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2742 seg_len, (tcph_t *)tcph, sock_id);
2743 return;
2744 }
2745 /*
2746 * From this point we can assume that the tcp is not compressed,
2747 * since we would have branched off to tcp_time_wait_processing()
2748 * in such a case.
2749 */
2750 assert(tcp != NULL && tcp->tcp_state != TCPS_TIME_WAIT);
2751
2752 /*
2753 * After this point, we know we have the correct TCP, so update
2754 * the receive time.
2755 */
2756 tcp->tcp_last_recv_time = prom_gettime();
2757
2758 /* In inetboot, we do not handle urgent pointer...
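 *
 * (Editor's note: rather than tracking urgent-data state, inetboot
 * drops any segment carrying TH_URG and counts it in tcp_drops; the
 * peer will retransmit, and boot-time clients are not expected to
 * send urgent data.)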
*/
2759 if (flags & TH_URG) {
2760 freemsg(mp);
2761 DEBUG_1("tcp_rput_data(%d): received segment with urgent "
2762 "pointer\n", sock_id);
2763 tcp_drops++;
2764 return;
2765 }
2766
2767 switch (tcp->tcp_state) {
2768 case TCPS_LISTEN:
2769 if ((flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) {
2770 if (flags & TH_RST) {
2771 freemsg(mp);
2772 return;
2773 }
2774 if (flags & TH_ACK) {
2775 tcp_xmit_early_reset("TCPS_LISTEN-TH_ACK",
2776 sock_id, mp, seg_ack, 0, TH_RST,
2777 ip_hdr_len);
2778 return;
2779 }
2780 if (!(flags & TH_SYN)) {
2781 freemsg(mp);
2782 return;
2783 }
2784 printf("tcp_rput_data: %d\n", __LINE__);
2785 prom_panic("inetboot");
2786 }
2787 if (tcp->tcp_conn_req_max > 0) {
2788 tcp = tcp_conn_request(tcp, mp, sock_id, ip_hdr_len);
2789 if (tcp == NULL) {
2790 freemsg(mp);
2791 return;
2792 }
2793 #ifdef DEBUG
2794 printf("tcp_rput_data: new tcp created\n");
2795 #endif
2796 }
2797 tcp->tcp_irs = seg_seq;
2798 tcp->tcp_rack = seg_seq;
2799 tcp->tcp_rnxt = seg_seq + 1;
2800 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
2801 BUMP_MIB(tcp_mib.tcpPassiveOpens);
2802 goto syn_rcvd;
2803 case TCPS_SYN_SENT:
2804 if (flags & TH_ACK) {
2805 /*
2806 * Note that our stack cannot send data before a
2807 * connection is established, therefore the
2808 * following check is valid. Otherwise, it has
2809 * to be changed.
2810 */
2811 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
2812 SEQ_GT(seg_ack, tcp->tcp_snxt)) {
2813 if (flags & TH_RST) {
2814 freemsg(mp);
2815 return;
2816 }
2817 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
2818 tcp, mp, seg_ack, 0, TH_RST,
2819 ip_hdr_len, sock_id);
2820 return;
2821 }
2822 assert(tcp->tcp_suna + 1 == seg_ack);
2823 }
2824 if (flags & TH_RST) {
2825 freemsg(mp);
2826 if (flags & TH_ACK) {
2827 tcp_clean_death(sock_id, tcp, ECONNREFUSED);
2828 }
2829 return;
2830 }
2831 if (!(flags & TH_SYN)) {
2832 freemsg(mp);
2833 return;
2834 }
2835
2836 /* Process all TCP options. */
2837 tcp_process_options(tcp, (tcph_t *)tcph);
2838 /*
2839 * The following changes our rwnd to be a multiple of the
2840 * MIN(peer MSS, our MSS) for performance reasons.
2841 */
2842 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rwnd,
2843 tcp->tcp_mss));
2844
2845 /* Is the other end ECN capable? */
2846 if (tcp->tcp_ecn_ok) {
2847 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) {
2848 tcp->tcp_ecn_ok = B_FALSE;
2849 }
2850 }
2851 /*
2852 * Clear ECN flags because they may interfere with later
2853 * processing.
2854 */
2855 flags &= ~(TH_ECE|TH_CWR);
2856
2857 tcp->tcp_irs = seg_seq;
2858 tcp->tcp_rack = seg_seq;
2859 tcp->tcp_rnxt = seg_seq + 1;
2860 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
2861
2862 if (flags & TH_ACK) {
2863 /* One for the SYN */
2864 tcp->tcp_suna = tcp->tcp_iss + 1;
2865 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2866 tcp->tcp_state = TCPS_ESTABLISHED;
2867
2868 /*
2869 * If SYN was retransmitted, need to reset all
2870 * retransmission info. This is because this
2871 * segment will be treated as a dup ACK.
2872 */
2873 if (tcp->tcp_rexmit) {
2874 tcp->tcp_rexmit = B_FALSE;
2875 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2876 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2877 tcp->tcp_snd_burst = TCP_CWND_NORMAL;
2878
2879 /*
2880 * Set tcp_cwnd back to 1 MSS, per
2881 * recommendation from
2882 * draft-floyd-incr-init-win-01.txt,
2883 * Increasing TCP's Initial Window.
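 *
 * (Editor's worked example of the initial-window formula used in
 * tcp_mss_set(), with a hypothetical mss of 1460 and the default
 * tcp_slow_start_initial of 2:
 *
 *	MIN(2 * 1460, MIN(4 * 1460, MAX(2 * 1460, 4380 / 1460 * 1460)))
 *	    = MIN(2920, MIN(5840, 4380)) = 2920
 *
 * i.e. two segments; the reset below to a single MSS is the penalty
 * for having had to retransmit the SYN.)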
2884 */
2885 tcp->tcp_cwnd = tcp->tcp_mss;
2886 }
2887
2888 tcp->tcp_swl1 = seg_seq;
2889 tcp->tcp_swl2 = seg_ack;
2890
2891 new_swnd = BE16_TO_U16(((tcph_t *)tcph)->th_win);
2892 tcp->tcp_swnd = new_swnd;
2893 if (new_swnd > tcp->tcp_max_swnd)
2894 tcp->tcp_max_swnd = new_swnd;
2895
2896 /*
2897 * Always send the three-way handshake ack immediately
2898 * in order to make the connection complete as soon as
2899 * possible on the accepting host.
2900 */
2901 flags |= TH_ACK_NEEDED;
2902 /*
2903 * Check to see if there is data to be sent. If
2904 * yes, set the transmit flag. Then check to see
2905 * if received data processing needs to be done.
2906 * If not, go straight to xmit_check. This short
2907 * cut is OK as we don't support T/TCP.
2908 */
2909 if (tcp->tcp_unsent)
2910 flags |= TH_XMIT_NEEDED;
2911
2912 if (seg_len == 0) {
2913 freemsg(mp);
2914 goto xmit_check;
2915 }
2916
2917 flags &= ~TH_SYN;
2918 seg_seq++;
2919 break;
2920 }
2921 syn_rcvd:
2922 tcp->tcp_state = TCPS_SYN_RCVD;
2923 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
2924 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
2925 if (mp1 != NULL) {
2926 TCP_DUMP_PACKET("tcp_rput_data replying SYN", mp1);
2927 (void) ipv4_tcp_output(sock_id, mp1);
2928 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
2929 freeb(mp1);
2930 /*
2931 * Let's wait till our SYN has been ACKED since we
2932 * don't have a timer.
2933 */
2934 if (tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED) < 0) {
2935 freemsg(mp);
2936 return;
2937 }
2938 }
2939 freemsg(mp);
2940 return;
2941 default:
2942 break;
2943 }
2944 mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH((tcph_t *)tcph);
2945 new_swnd = ntohs(tcph->tha_win) <<
2946 ((flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
2947 mss = tcp->tcp_mss;
2948
2949 if (tcp->tcp_snd_ts_ok) {
2950 if (!tcp_paws_check(tcp, (tcph_t *)tcph, &tcpopt)) {
2951 /*
2952 * This segment is not acceptable.
2953 * Drop it and send back an ACK.
2954 */
2955 freemsg(mp);
2956 flags |= TH_ACK_NEEDED;
2957 goto ack_check;
2958 }
2959 } else if (tcp->tcp_snd_sack_ok) {
2960 assert(tcp->tcp_sack_info != NULL);
2961 tcpopt.tcp = tcp;
2962 /*
2963 * SACK info is already updated by tcp_parse_options. Ignore
2964 * all other TCP options...
2965 */
2966 (void) tcp_parse_options((tcph_t *)tcph, &tcpopt);
2967 }
2968 try_again:;
2969 gap = seg_seq - tcp->tcp_rnxt;
2970 rgap = tcp->tcp_rwnd - (gap + seg_len);
2971 /*
2972 * gap is the amount of sequence space between what we expect to see
2973 * and what we got for seg_seq. A positive value for gap means
2974 * something got lost. A negative value means we got some old stuff.
2975 */
2976 if (gap < 0) {
2977 /* Old stuff present. Is the SYN in there? */
2978 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) &&
2979 (seg_len != 0)) {
2980 flags &= ~TH_SYN;
2981 seg_seq++;
2982 /* Recompute the gaps after noting the SYN. */
2983 goto try_again;
2984 }
2985 BUMP_MIB(tcp_mib.tcpInDataDupSegs);
2986 UPDATE_MIB(tcp_mib.tcpInDataDupBytes,
2987 (seg_len > -gap ? -gap : seg_len));
2988 /* Remove the old stuff from seg_len. */
2989 seg_len += gap;
2990 /*
2991 * Anything left?
2992 * Make sure to check for unack'd FIN when rest of data
2993 * has been previously ack'd.
2994 */
2995 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
2996 /*
2997 * Resets are only valid if they lie within our offered
2998 * window. If the RST bit is set, we just ignore this
2999 * segment.
3000 */
3001 if (flags & TH_RST) {
3002 freemsg(mp);
3003 return;
3004 }
3005
3006 /*
3007 * This segment is "unacceptable".
None of its
3008 * sequence space lies within our advertised window.
3009 *
3010 * Adjust seg_len to the original value for tracing.
3011 */
3012 seg_len -= gap;
3013 #ifdef DEBUG
3014 printf("tcp_rput: unacceptable, gap %d, rgap "
3015 "%d, flags 0x%x, seg_seq %u, seg_ack %u, "
3016 "seg_len %d, rnxt %u, snxt %u, %s",
3017 gap, rgap, flags, seg_seq, seg_ack,
3018 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt,
3019 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
3020 #endif
3021
3022 /*
3023 * Arrange to send an ACK in response to the
3024 * unacceptable segment per RFC 793 page 69. There
3025 * is only one small difference between ours and the
3026 * acceptability test in the RFC - we accept an ACK-only
3027 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK
3028 * will be generated.
3029 *
3030 * Note that we have to ACK an ACK-only packet at least
3031 * for stacks that send 0-length keep-alives with
3032 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122,
3033 * section 4.2.3.6. As long as we don't ever generate
3034 * an unacceptable packet in response to an incoming
3035 * packet that is unacceptable, it should not cause
3036 * "ACK wars".
3037 */
3038 flags |= TH_ACK_NEEDED;
3039
3040 /*
3041 * Continue processing this segment in order to use the
3042 * ACK information it contains, but skip all other
3043 * sequence-number processing. Processing the ACK
3044 * information is necessary in order to
3045 * re-synchronize connections that may have lost
3046 * synchronization.
3047 *
3048 * We clear seg_len and flag fields related to
3049 * sequence number processing as they are not
3050 * to be trusted for an unacceptable segment.
3051 */
3052 seg_len = 0;
3053 flags &= ~(TH_SYN | TH_FIN | TH_URG);
3054 goto process_ack;
3055 }
3056
3057 /* Fix seg_seq, and chew the gap off the front. */
3058 seg_seq = tcp->tcp_rnxt;
3059 do {
3060 mblk_t *mp2;
3061 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
3062 (uintptr_t)UINT_MAX);
3063 gap += (uint_t)(mp->b_wptr - mp->b_rptr);
3064 if (gap > 0) {
3065 mp->b_rptr = mp->b_wptr - gap;
3066 break;
3067 }
3068 mp2 = mp;
3069 mp = mp->b_cont;
3070 freeb(mp2);
3071 } while (gap < 0);
3072 }
3073 /*
3074 * rgap is the amount of room left in our receive window after this
3075 * segment. A negative value is the number of bytes out of window.
3076 */
3077 if (rgap < 0) {
3078 mblk_t *mp2;
3079
3080 if (tcp->tcp_rwnd == 0)
3081 BUMP_MIB(tcp_mib.tcpInWinProbe);
3082 else {
3083 BUMP_MIB(tcp_mib.tcpInDataPastWinSegs);
3084 UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap);
3085 }
3086
3087 /*
3088 * seg_len does not include the FIN, so if more than
3089 * just the FIN is out of window, we act like we don't
3090 * see it. (If just the FIN is out of window, rgap
3091 * will be zero and we will go ahead and acknowledge
3092 * the FIN.)
3093 */
3094 flags &= ~TH_FIN;
3095
3096 /* Fix seg_len and make sure there is something left. */
3097 seg_len += rgap;
3098 if (seg_len <= 0) {
3099 /*
3100 * Resets are only valid if they lie within our offered
3101 * window. If the RST bit is set, we just ignore this
3102 * segment.
3103 */
3104 if (flags & TH_RST) {
3105 freemsg(mp);
3106 return;
3107 }
3108
3109 /* Per RFC 793, we need to send back an ACK. */
3110 flags |= TH_ACK_NEEDED;
3111
3112 /*
3113 * If this is a zero window probe, continue to
3114 * process the ACK part. But we need to set seg_len
3115 * to 0 to avoid data processing. Otherwise just
3116 * drop the segment and send back an ACK.
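 *
 * (Editor's worked example with hypothetical numbers: with
 * tcp_rnxt = 1000, tcp_rwnd = 0 and a one-byte zero-window probe at
 * seg_seq = 1000, gap = 0 and rgap = 0 - (0 + 1) = -1, so seg_len is
 * trimmed to 0 below and only the ACK information of the probe is
 * processed.)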
3117 */
3118 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) {
3119 flags &= ~(TH_SYN | TH_URG);
3120 seg_len = 0;
3121 /* Let's see if we can update our rwnd */
3122 tcp_rcv_drain(sock_id, tcp);
3123 goto process_ack;
3124 } else {
3125 freemsg(mp);
3126 goto ack_check;
3127 }
3128 }
3129 /* Pitch out of window stuff off the end. */
3130 rgap = seg_len;
3131 mp2 = mp;
3132 do {
3133 assert((uintptr_t)(mp2->b_wptr -
3134 mp2->b_rptr) <= (uintptr_t)INT_MAX);
3135 rgap -= (int)(mp2->b_wptr - mp2->b_rptr);
3136 if (rgap < 0) {
3137 mp2->b_wptr += rgap;
3138 if ((mp1 = mp2->b_cont) != NULL) {
3139 mp2->b_cont = NULL;
3140 freemsg(mp1);
3141 }
3142 break;
3143 }
3144 } while ((mp2 = mp2->b_cont) != NULL);
3145 }
3146 ok:;
3147 /*
3148 * TCP should check ECN info for segments inside the window only.
3149 * Therefore the check should be done here.
3150 */
3151 if (tcp->tcp_ecn_ok) {
3152 uchar_t tos = ((struct ip *)rptr)->ip_tos;
3153
3154 if (flags & TH_CWR) {
3155 tcp->tcp_ecn_echo_on = B_FALSE;
3156 }
3157 /*
3158 * Note that both ECN_CE and CWR can be set in the
3159 * same segment. In this case, we once again turn
3160 * on ECN_ECHO.
3161 */
3162 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) {
3163 tcp->tcp_ecn_echo_on = B_TRUE;
3164 }
3165 }
3166
3167 /*
3168 * Check whether we can update tcp_ts_recent. This test is
3169 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP
3170 * Extensions for High Performance: An Update", Internet Draft.
3171 */
3172 if (tcp->tcp_snd_ts_ok &&
3173 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
3174 SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
3175 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
3176 tcp->tcp_last_rcv_lbolt = prom_gettime();
3177 }
3178
3179 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) {
3180 /*
3181 * FIN in an out of order segment. We record this in
3182 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq.
3183 * Clear the FIN so that any check on FIN flag will fail.
3184 * Remember that FIN also counts in the sequence number
3185 * space. So we need to ack out-of-order FIN-only segments.
3186 */
3187 if (flags & TH_FIN) {
3188 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID;
3189 tcp->tcp_ofo_fin_seq = seg_seq + seg_len;
3190 flags &= ~TH_FIN;
3191 flags |= TH_ACK_NEEDED;
3192 }
3193 if (seg_len > 0) {
3194 /* Fill in the SACK blk list. */
3195 if (tcp->tcp_snd_sack_ok) {
3196 assert(tcp->tcp_sack_info != NULL);
3197 tcp_sack_insert(tcp->tcp_sack_list,
3198 seg_seq, seg_seq + seg_len,
3199 &(tcp->tcp_num_sack_blk));
3200 }
3201
3202 /*
3203 * Attempt reassembly and see if we have something
3204 * ready to go.
3205 */
3206 mp = tcp_reass(tcp, mp, seg_seq);
3207 /* Always ack out of order packets */
3208 flags |= TH_ACK_NEEDED | TH_PUSH;
3209 if (mp != NULL) {
3210 assert((uintptr_t)(mp->b_wptr -
3211 mp->b_rptr) <= (uintptr_t)INT_MAX);
3212 seg_len = mp->b_cont ? msgdsize(mp) :
3213 (int)(mp->b_wptr - mp->b_rptr);
3214 seg_seq = tcp->tcp_rnxt;
3215 /*
3216 * If a gap is filled and the seq num and len
3217 * of the gap match those of a previously
3218 * received FIN, put the FIN flag back in.
3219 */
3220 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3221 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3222 flags |= TH_FIN;
3223 tcp->tcp_valid_bits &=
3224 ~TCP_OFO_FIN_VALID;
3225 }
3226 } else {
3227 /*
3228 * Keep going even with NULL mp.
3229 * There may be a useful ACK or something else
3230 * we don't want to miss.
3231 *
3232 * But TCP should not perform fast retransmit
3233 * because of the ack number.
TCP uses 3234 * seg_len == 0 to determine if it is a pure 3235 * ACK. And this is not a pure ACK. 3236 */ 3237 seg_len = 0; 3238 ofo_seg = B_TRUE; 3239 } 3240 } 3241 } else if (seg_len > 0) { 3242 BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 3243 UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 3244 /* 3245 * If an out of order FIN was received before, and the seq 3246 * num and len of the new segment match that of the FIN, 3247 * put the FIN flag back in. 3248 */ 3249 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3250 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3251 flags |= TH_FIN; 3252 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3253 } 3254 } 3255 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3256 if (flags & TH_RST) { 3257 freemsg(mp); 3258 switch (tcp->tcp_state) { 3259 case TCPS_SYN_RCVD: 3260 (void) tcp_clean_death(sock_id, tcp, ECONNREFUSED); 3261 break; 3262 case TCPS_ESTABLISHED: 3263 case TCPS_FIN_WAIT_1: 3264 case TCPS_FIN_WAIT_2: 3265 case TCPS_CLOSE_WAIT: 3266 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3267 break; 3268 case TCPS_CLOSING: 3269 case TCPS_LAST_ACK: 3270 (void) tcp_clean_death(sock_id, tcp, 0); 3271 break; 3272 default: 3273 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3274 (void) tcp_clean_death(sock_id, tcp, ENXIO); 3275 break; 3276 } 3277 return; 3278 } 3279 if (flags & TH_SYN) { 3280 /* 3281 * See RFC 793, Page 71 3282 * 3283 * The seq number must be in the window as it should 3284 * be "fixed" above. If it is outside window, it should 3285 * be already rejected. Note that we allow seg_seq to be 3286 * rnxt + rwnd because we want to accept 0 window probe. 3287 */ 3288 assert(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3289 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3290 freemsg(mp); 3291 /* 3292 * If the ACK flag is not set, just use our snxt as the 3293 * seq number of the RST segment. 3294 */ 3295 if (!(flags & TH_ACK)) { 3296 seg_ack = tcp->tcp_snxt; 3297 } 3298 tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, 3299 seg_seq + 1, TH_RST|TH_ACK, 0, sock_id); 3300 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3301 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3302 return; 3303 } 3304 3305 process_ack: 3306 if (!(flags & TH_ACK)) { 3307 #ifdef DEBUG 3308 printf("No ack in segment, dropped it, seq:%x\n", seg_seq); 3309 #endif 3310 freemsg(mp); 3311 goto xmit_check; 3312 } 3313 } 3314 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3315 3316 if (tcp->tcp_state == TCPS_SYN_RCVD) { 3317 tcp_t *listener = tcp->tcp_listener; 3318 #ifdef DEBUG 3319 printf("Done with eager 3-way handshake\n"); 3320 #endif 3321 /* 3322 * NOTE: RFC 793 pg. 72 says this should be 'bytes_acked < 0' 3323 * but that would mean we have an ack that ignored our SYN. 
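 *
 * (Editor's note: for an eager still in SYN_RCVD, tcp_suna == iss and
 * the SYN itself consumes one sequence number, so a valid ACK must
 * carry seg_ack == iss + 1, i.e. bytes_acked == 1; anything smaller
 * ignored our SYN, and anything above snxt acks data we never sent.)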
3324 */
3325 if (bytes_acked < 1 || SEQ_GT(seg_ack, tcp->tcp_snxt)) {
3326 freemsg(mp);
3327 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack",
3328 tcp, NULL, seg_ack, 0, TH_RST, 0, sock_id);
3329 return;
3330 }
3331
3332 /*
3333 * If the conn_req_q is full, defer processing
3334 * until space is available after accept()
3335 * processing.
3336 */
3337 if (listener->tcp_conn_req_cnt_q <
3338 listener->tcp_conn_req_max) {
3339 tcp_t *tail;
3340
3341 listener->tcp_conn_req_cnt_q0--;
3342 listener->tcp_conn_req_cnt_q++;
3343
3344 /* Move from SYN_RCVD to ESTABLISHED list */
3345 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
3346 tcp->tcp_eager_prev_q0;
3347 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
3348 tcp->tcp_eager_next_q0;
3349 tcp->tcp_eager_prev_q0 = NULL;
3350 tcp->tcp_eager_next_q0 = NULL;
3351
3352 /*
3353 * Insert at end of the queue because sockfs
3354 * sends down T_CONN_RES in chronological
3355 * order. Leaving the older conn indications
3356 * at front of the queue helps reduce search
3357 * time.
3358 */
3359 tail = listener->tcp_eager_last_q;
3360 if (tail != NULL) {
3361 tail->tcp_eager_next_q = tcp;
3362 } else {
3363 listener->tcp_eager_next_q = tcp;
3364 }
3365 listener->tcp_eager_last_q = tcp;
3366 tcp->tcp_eager_next_q = NULL;
3367 } else {
3368 /*
3369 * Defer the connection on q0 and set the deferred
3370 * connection bit to true.
3371 */
3372 tcp->tcp_conn_def_q0 = B_TRUE;
3373
3374 /* take tcp out of q0 ... */
3375 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
3376 tcp->tcp_eager_next_q0;
3377 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
3378 tcp->tcp_eager_prev_q0;
3379
3380 /* ... and place it at the end of q0 */
3381 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
3382 tcp->tcp_eager_next_q0 = listener;
3383 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
3384 listener->tcp_eager_prev_q0 = tcp;
3385 }
3386
3387 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3388 bytes_acked--;
3389
3390 /*
3391 * If SYN was retransmitted, need to reset all
3392 * retransmission info as this segment will be
3393 * treated as a dup ACK.
3394 */
3395 if (tcp->tcp_rexmit) {
3396 tcp->tcp_rexmit = B_FALSE;
3397 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3398 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3399 tcp->tcp_snd_burst = TCP_CWND_NORMAL;
3400 tcp->tcp_ms_we_have_waited = 0;
3401 tcp->tcp_cwnd = mss;
3402 }
3403
3404 /*
3405 * We set the send window to zero here.
3406 * This is needed if there is data to be
3407 * processed already on the queue.
3408 * Later (at the swnd_update label), the
3409 * "new_swnd > tcp_swnd" condition is satisfied
3410 * and the XMIT_NEEDED flag is set in the current
3411 * (SYN_RCVD) state. This ensures tcp_wput_data() is
3412 * called if there is already data on the queue in
3413 * this state.
3414 */
3415 tcp->tcp_swnd = 0;
3416
3417 if (new_swnd > tcp->tcp_max_swnd)
3418 tcp->tcp_max_swnd = new_swnd;
3419 tcp->tcp_swl1 = seg_seq;
3420 tcp->tcp_swl2 = seg_ack;
3421 tcp->tcp_state = TCPS_ESTABLISHED;
3422 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
3423 }
3424 /* This code follows 4.4BSD-Lite2 mostly. */
3425 if (bytes_acked < 0)
3426 goto est;
3427
3428 /*
3429 * If TCP is ECN capable and the congestion experience bit is
3430 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
3431 * done once per window (or more loosely, per RTT).
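 *
 * (Editor's worked example with hypothetical numbers: cwnd = 8760,
 * swnd = 65535 and mss = 1460 give npkt = (8760 >> 1) / 1460 = 3, so
 * tcp_cwnd_ssthresh becomes MAX(3, 2) * 1460 = 4380 and tcp_cwnd is
 * halved to 3 * 1460 = 4380 by the code below.)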
3432 */ 3433 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 3434 tcp->tcp_cwr = B_FALSE; 3435 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 3436 if (!tcp->tcp_cwr) { 3437 npkt = (MIN(tcp->tcp_cwnd, tcp->tcp_swnd) >> 1) / mss; 3438 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 3439 tcp->tcp_cwnd = npkt * mss; 3440 /* 3441 * If the cwnd is 0, use the timer to clock out 3442 * new segments. This is required by the ECN spec. 3443 */ 3444 if (npkt == 0) { 3445 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3446 /* 3447 * This makes sure that when the ACK comes 3448 * back, we will increase tcp_cwnd by 1 MSS. 3449 */ 3450 tcp->tcp_cwnd_cnt = 0; 3451 } 3452 tcp->tcp_cwr = B_TRUE; 3453 /* 3454 * This marks the end of the current window of in 3455 * flight data. That is why we don't use 3456 * tcp_suna + tcp_swnd. Only data in flight can 3457 * provide ECN info. 3458 */ 3459 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3460 tcp->tcp_ecn_cwr_sent = B_FALSE; 3461 } 3462 } 3463 3464 mp1 = tcp->tcp_xmit_head; 3465 if (bytes_acked == 0) { 3466 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 3467 int dupack_cnt; 3468 3469 BUMP_MIB(tcp_mib.tcpInDupAck); 3470 /* 3471 * Fast retransmit. When we have seen exactly three 3472 * identical ACKs while we have unacked data 3473 * outstanding we take it as a hint that our peer 3474 * dropped something. 3475 * 3476 * If TCP is retransmitting, don't do fast retransmit. 3477 */ 3478 if (mp1 != NULL && tcp->tcp_suna != tcp->tcp_snxt && 3479 ! tcp->tcp_rexmit) { 3480 /* Do Limited Transmit */ 3481 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 3482 tcp_dupack_fast_retransmit) { 3483 /* 3484 * RFC 3042 3485 * 3486 * What we need to do is temporarily 3487 * increase tcp_cwnd so that new 3488 * data can be sent if it is allowed 3489 * by the receive window (tcp_rwnd). 3490 * tcp_wput_data() will take care of 3491 * the rest. 3492 * 3493 * If the connection is SACK capable, 3494 * only do limited xmit when there 3495 * is SACK info. 3496 * 3497 * Note how tcp_cwnd is incremented. 3498 * The first dup ACK will increase 3499 * it by 1 MSS. The second dup ACK 3500 * will increase it by 2 MSS. This 3501 * means that only 1 new segment will 3502 * be sent for each dup ACK. 3503 */ 3504 if (tcp->tcp_unsent > 0 && 3505 (!tcp->tcp_snd_sack_ok || 3506 (tcp->tcp_snd_sack_ok && 3507 tcp->tcp_notsack_list != NULL))) { 3508 tcp->tcp_cwnd += mss << 3509 (tcp->tcp_dupack_cnt - 1); 3510 flags |= TH_LIMIT_XMIT; 3511 } 3512 } else if (dupack_cnt == 3513 tcp_dupack_fast_retransmit) { 3514 3515 BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3516 /* 3517 * If we have reduced tcp_ssthresh 3518 * because of ECN, do not reduce it again 3519 * unless it is already one window of data 3520 * away. After one window of data, tcp_cwr 3521 * should then be cleared. Note that 3522 * for non ECN capable connection, tcp_cwr 3523 * should always be false. 3524 * 3525 * Adjust cwnd since the duplicate 3526 * ack indicates that a packet was 3527 * dropped (due to congestion.) 3528 */ 3529 if (!tcp->tcp_cwr) { 3530 npkt = (MIN(tcp->tcp_cwnd, 3531 tcp->tcp_swnd) >> 1) / mss; 3532 if (npkt < 2) 3533 npkt = 2; 3534 tcp->tcp_cwnd_ssthresh = npkt * mss; 3535 tcp->tcp_cwnd = (npkt + 3536 tcp->tcp_dupack_cnt) * mss; 3537 } 3538 if (tcp->tcp_ecn_ok) { 3539 tcp->tcp_cwr = B_TRUE; 3540 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3541 tcp->tcp_ecn_cwr_sent = B_FALSE; 3542 } 3543 3544 /* 3545 * We do Hoe's algorithm. 
Refer to her
3546 * paper "Improving the Start-up Behavior
3547 * of a Congestion Control Scheme for TCP,"
3548 * which appeared in SIGCOMM '96.
3549 *
3550 * Save highest seq no we have sent so far.
3551 * Be careful about the invisible FIN byte.
3552 */
3553 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3554 (tcp->tcp_unsent == 0)) {
3555 tcp->tcp_rexmit_max = tcp->tcp_fss;
3556 } else {
3557 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3558 }
3559
3560 /*
3561 * Do not allow bursty traffic during
3562 * fast recovery. Refer to Fall and Floyd's
3563 * paper "Simulation-based Comparisons of
3564 * Tahoe, Reno and SACK TCP" (in CCR ??).
3565 * This is a best current practice.
3566 */
3567 tcp->tcp_snd_burst = TCP_CWND_SS;
3568
3569 /*
3570 * For SACK:
3571 * Calculate tcp_pipe, which is the
3572 * estimated number of bytes in the
3573 * network.
3574 *
3575 * tcp_fack is the highest sack'ed seq num
3576 * TCP has received.
3577 *
3578 * tcp_pipe is explained in the above quoted
3579 * Fall and Floyd's paper. tcp_fack is
3580 * explained in Mathis and Mahdavi's
3581 * "Forward Acknowledgment: Refining TCP
3582 * Congestion Control" in SIGCOMM '96.
3583 */
3584 if (tcp->tcp_snd_sack_ok) {
3585 assert(tcp->tcp_sack_info != NULL);
3586 if (tcp->tcp_notsack_list != NULL) {
3587 tcp->tcp_pipe = tcp->tcp_snxt -
3588 tcp->tcp_fack;
3589 tcp->tcp_sack_snxt = seg_ack;
3590 flags |= TH_NEED_SACK_REXMIT;
3591 } else {
3592 /*
3593 * Always initialize tcp_pipe
3594 * even though we don't have
3595 * any SACK info. If later
3596 * we get SACK info and
3597 * tcp_pipe is not initialized,
3598 * funny things will happen.
3599 */
3600 tcp->tcp_pipe =
3601 tcp->tcp_cwnd_ssthresh;
3602 }
3603 } else {
3604 flags |= TH_REXMIT_NEEDED;
3605 } /* tcp_snd_sack_ok */
3606
3607 } else {
3608 /*
3609 * Here we perform congestion
3610 * avoidance, but NOT slow start.
3611 * This is known as the Fast
3612 * Recovery Algorithm.
3613 */
3614 if (tcp->tcp_snd_sack_ok &&
3615 tcp->tcp_notsack_list != NULL) {
3616 flags |= TH_NEED_SACK_REXMIT;
3617 tcp->tcp_pipe -= mss;
3618 if (tcp->tcp_pipe < 0)
3619 tcp->tcp_pipe = 0;
3620 } else {
3621 /*
3622 * We know that one more packet has
3623 * left the pipe thus we can update
3624 * cwnd.
3625 */
3626 cwnd = tcp->tcp_cwnd + mss;
3627 if (cwnd > tcp->tcp_cwnd_max)
3628 cwnd = tcp->tcp_cwnd_max;
3629 tcp->tcp_cwnd = cwnd;
3630 flags |= TH_XMIT_NEEDED;
3631 }
3632 }
3633 }
3634 } else if (tcp->tcp_zero_win_probe) {
3635 /*
3636 * If the window has opened, need to arrange
3637 * to send additional data.
3638 */
3639 if (new_swnd != 0) {
3640 /* tcp_suna != tcp_snxt */
3641 /* Packet contains a window update */
3642 BUMP_MIB(tcp_mib.tcpInWinUpdate);
3643 tcp->tcp_zero_win_probe = 0;
3644 tcp->tcp_timer_backoff = 0;
3645 tcp->tcp_ms_we_have_waited = 0;
3646
3647 /*
3648 * Transmit starting with tcp_suna since
3649 * the one byte probe is not ack'ed.
3650 * If TCP has sent more than one identical
3651 * probe, tcp_rexmit will be set. That means
3652 * tcp_ss_rexmit() will send out the one
3653 * byte along with new data. Otherwise,
3654 * fake the retransmission.
3655 */
3656 flags |= TH_XMIT_NEEDED;
3657 if (!tcp->tcp_rexmit) {
3658 tcp->tcp_rexmit = B_TRUE;
3659 tcp->tcp_dupack_cnt = 0;
3660 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3661 tcp->tcp_rexmit_max = tcp->tcp_suna + 1;
3662 }
3663 }
3664 }
3665 goto swnd_update;
3666 }
3667
3668 /*
3669 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73.
3670 * If the ACK value acks something that we have not yet sent, it might
3671 * be an old duplicate segment. Send an ACK to re-synchronize the
3672 * other side.
3673 * Note: reset in response to unacceptable ACK in SYN_RECEIVE
3674 * state is handled above, so we can always just drop the segment and
3675 * send an ACK here.
3676 *
3677 * Should we send ACKs in response to ACK only segments?
3678 */
3679 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) {
3680 BUMP_MIB(tcp_mib.tcpInAckUnsent);
3681 /* drop the received segment */
3682 freemsg(mp);
3683
3684 /* Send back an ACK. */
3685 mp = tcp_ack_mp(tcp);
3686
3687 if (mp == NULL) {
3688 return;
3689 }
3690 BUMP_MIB(tcp_mib.tcpOutAck);
3691 (void) ipv4_tcp_output(sock_id, mp);
3692 freeb(mp);
3693 return;
3694 }
3695
3696 /*
3697 * TCP gets a new ACK, update the notsack'ed list to delete those
3698 * blocks that are covered by this ACK.
3699 */
3700 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
3701 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
3702 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
3703 }
3704
3705 /*
3706 * If we got an ACK after fast retransmit, check to see
3707 * if it is a partial ACK. If it is not and the congestion
3708 * window was inflated to account for the other side's
3709 * cached packets, retract it. If it is, do Hoe's algorithm.
3710 */
3711 if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) {
3712 assert(tcp->tcp_rexmit == B_FALSE);
3713 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
3714 tcp->tcp_dupack_cnt = 0;
3715 /*
3716 * Restore the orig tcp_cwnd_ssthresh after
3717 * fast retransmit phase.
3718 */
3719 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
3720 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
3721 }
3722 tcp->tcp_rexmit_max = seg_ack;
3723 tcp->tcp_cwnd_cnt = 0;
3724 tcp->tcp_snd_burst = TCP_CWND_NORMAL;
3725
3726 /*
3727 * Remove all notsack info to avoid confusion with
3728 * the next fast retransmit/recovery phase.
3729 */
3730 if (tcp->tcp_snd_sack_ok &&
3731 tcp->tcp_notsack_list != NULL) {
3732 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
3733 }
3734 } else {
3735 if (tcp->tcp_snd_sack_ok &&
3736 tcp->tcp_notsack_list != NULL) {
3737 flags |= TH_NEED_SACK_REXMIT;
3738 tcp->tcp_pipe -= mss;
3739 if (tcp->tcp_pipe < 0)
3740 tcp->tcp_pipe = 0;
3741 } else {
3742 /*
3743 * Hoe's algorithm:
3744 *
3745 * Retransmit the unack'ed segment and
3746 * restart fast recovery. Note that we
3747 * need to scale back tcp_cwnd to the
3748 * original value when we started fast
3749 * recovery. This is to prevent overly
3750 * aggressive behaviour in sending new
3751 * segments.
3752 */
3753 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
3754 tcp_dupack_fast_retransmit * mss;
3755 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
3756 BUMP_MIB(tcp_mib.tcpOutFastRetrans);
3757 flags |= TH_REXMIT_NEEDED;
3758 }
3759 }
3760 } else {
3761 tcp->tcp_dupack_cnt = 0;
3762 if (tcp->tcp_rexmit) {
3763 /*
3764 * TCP is retransmitting. If the ACK acks all
3765 * outstanding data, update tcp_rexmit_max and
3766 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
3767 * to the correct value.
3768 *
3769 * Note that SEQ_LEQ() is used. This is to avoid
3770 * unnecessary fast retransmit caused by dup ACKs
3771 * received when TCP does slow start retransmission
3772 * after a time out. During this phase, TCP may
3773 * send out segments which are already received.
3774 * This causes dup ACKs to be sent back.
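 *
 * (Editor's worked example with hypothetical sequence numbers: after
 * a timeout TCP re-sends 2000..2999 even though 2000..2499 had
 * already arrived; the receiver's duplicate ACK for 2500 satisfies
 * SEQ_LEQ(2500, tcp_rexmit_max) below and merely advances
 * tcp_rexmit_nxt instead of triggering another fast retransmit.)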
3775 */
3776 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) {
3777 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) {
3778 tcp->tcp_rexmit_nxt = seg_ack;
3779 }
3780 if (seg_ack != tcp->tcp_rexmit_max) {
3781 flags |= TH_XMIT_NEEDED;
3782 }
3783 } else {
3784 tcp->tcp_rexmit = B_FALSE;
3785 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3786 tcp->tcp_snd_burst = TCP_CWND_NORMAL;
3787 }
3788 tcp->tcp_ms_we_have_waited = 0;
3789 }
3790 }
3791
3792 BUMP_MIB(tcp_mib.tcpInAckSegs);
3793 UPDATE_MIB(tcp_mib.tcpInAckBytes, bytes_acked);
3794 tcp->tcp_suna = seg_ack;
3795 if (tcp->tcp_zero_win_probe != 0) {
3796 tcp->tcp_zero_win_probe = 0;
3797 tcp->tcp_timer_backoff = 0;
3798 }
3799
3800 /*
3801 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
3802 * Note that it cannot be the SYN being ack'ed. The code flow
3803 * will not reach here.
3804 */
3805 if (mp1 == NULL) {
3806 goto fin_acked;
3807 }
3808
3809 /*
3810 * Update the congestion window.
3811 *
3812 * If TCP is not ECN capable or TCP is ECN capable but the
3813 * congestion experience bit is not set, increase the tcp_cwnd as
3814 * usual.
3815 */
3816 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
3817 cwnd = tcp->tcp_cwnd;
3818 add = mss;
3819
3820 if (cwnd >= tcp->tcp_cwnd_ssthresh) {
3821 /*
3822 * This is to prevent an increase of less than 1 MSS of
3823 * tcp_cwnd. With partial increase, tcp_wput_data()
3824 * may send out tinygrams in order to preserve mblk
3825 * boundaries.
3826 *
3827 * By initializing tcp_cwnd_cnt to new tcp_cwnd and
3828 * decrementing it by 1 MSS for every ACK, tcp_cwnd is
3829 * increased by 1 MSS per RTT.
3830 */
3831 if (tcp->tcp_cwnd_cnt <= 0) {
3832 tcp->tcp_cwnd_cnt = cwnd + add;
3833 } else {
3834 tcp->tcp_cwnd_cnt -= add;
3835 add = 0;
3836 }
3837 }
3838 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
3839 }
3840
3841 /* Can we update the RTT estimates? */
3842 if (tcp->tcp_snd_ts_ok) {
3843 /* Ignore zero timestamp echo-replies. */
3844 if (tcpopt.tcp_opt_ts_ecr != 0) {
3845 tcp_set_rto(tcp, (int32_t)(prom_gettime() -
3846 tcpopt.tcp_opt_ts_ecr));
3847 }
3848
3849 /* If needed, restart the timer. */
3850 if (tcp->tcp_set_timer == 1) {
3851 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3852 tcp->tcp_set_timer = 0;
3853 }
3854 /*
3855 * Update tcp_csuna in case the other side stops sending
3856 * us timestamps.
3857 */
3858 tcp->tcp_csuna = tcp->tcp_snxt;
3859 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
3860 /*
3861 * An ACK sequence we haven't seen before, so get the RTT
3862 * and update the RTO.
3863 * Note. use uintptr_t to suppress the gcc warning.
3864 */
3865 tcp_set_rto(tcp, (int32_t)(prom_gettime() -
3866 (uint32_t)(uintptr_t)mp1->b_prev));
3867
3868 /* Remember the last sequence to be ACKed */
3869 tcp->tcp_csuna = seg_ack;
3870 if (tcp->tcp_set_timer == 1) {
3871 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3872 tcp->tcp_set_timer = 0;
3873 }
3874 } else {
3875 BUMP_MIB(tcp_mib.tcpRttNoUpdate);
3876 }
3877
3878 /* Eat acknowledged bytes off the xmit queue. */
3879 for (;;) {
3880 mblk_t *mp2;
3881 uchar_t *wptr;
3882
3883 wptr = mp1->b_wptr;
3884 assert((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
3885 bytes_acked -= (int)(wptr - mp1->b_rptr);
3886 if (bytes_acked < 0) {
3887 mp1->b_rptr = wptr + bytes_acked;
3888 break;
3889 }
3890 mp1->b_prev = NULL;
3891 mp2 = mp1;
3892 mp1 = mp1->b_cont;
3893 freeb(mp2);
3894 if (bytes_acked == 0) {
3895 if (mp1 == NULL) {
3896 /* Everything is ack'ed, clear the tail.
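 *
 * (Editor's note: reaching here means bytes_acked hit zero exactly on
 * an mblk boundary with nothing left on the send queue, so the tail
 * pointer must be cleared before the send-window update below.)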
3841	/* Can we update the RTT estimates? */
3842	if (tcp->tcp_snd_ts_ok) {
3843		/* Ignore zero timestamp echo-reply. */
3844		if (tcpopt.tcp_opt_ts_ecr != 0) {
3845			tcp_set_rto(tcp, (int32_t)(prom_gettime() -
3846			    tcpopt.tcp_opt_ts_ecr));
3847		}
3848
3849		/* If needed, restart the timer. */
3850		if (tcp->tcp_set_timer == 1) {
3851			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3852			tcp->tcp_set_timer = 0;
3853		}
3854		/*
3855		 * Update tcp_csuna in case the other side stops sending
3856		 * us timestamps.
3857		 */
3858		tcp->tcp_csuna = tcp->tcp_snxt;
3859	} else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
3860		/*
3861		 * An ACK sequence we haven't seen before, so get the RTT
3862		 * and update the RTO.
3863		 * Note. use uintptr_t to suppress the gcc warning.
3864		 */
3865		tcp_set_rto(tcp, (int32_t)(prom_gettime() -
3866		    (uint32_t)(uintptr_t)mp1->b_prev));
3867
3868		/* Remember the last sequence to be ACKed */
3869		tcp->tcp_csuna = seg_ack;
3870		if (tcp->tcp_set_timer == 1) {
3871			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3872			tcp->tcp_set_timer = 0;
3873		}
3874	} else {
3875		BUMP_MIB(tcp_mib.tcpRttNoUpdate);
3876	}
3877
3878	/* Eat acknowledged bytes off the xmit queue. */
3879	for (;;) {
3880		mblk_t *mp2;
3881		uchar_t *wptr;
3882
3883		wptr = mp1->b_wptr;
3884		assert((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
3885		bytes_acked -= (int)(wptr - mp1->b_rptr);
3886		if (bytes_acked < 0) {
3887			mp1->b_rptr = wptr + bytes_acked;
3888			break;
3889		}
3890		mp1->b_prev = NULL;
3891		mp2 = mp1;
3892		mp1 = mp1->b_cont;
3893		freeb(mp2);
3894		if (bytes_acked == 0) {
3895			if (mp1 == NULL) {
3896				/* Everything is ack'ed, clear the tail. */
3897				tcp->tcp_xmit_tail = NULL;
3898				goto pre_swnd_update;
3899			}
3900			if (mp2 != tcp->tcp_xmit_tail)
3901				break;
3902			tcp->tcp_xmit_tail = mp1;
3903			assert((uintptr_t)(mp1->b_wptr -
3904			    mp1->b_rptr) <= (uintptr_t)INT_MAX);
3905			tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
3906			    mp1->b_rptr);
3907			break;
3908		}
3909		if (mp1 == NULL) {
3910			/*
3911			 * More was acked but there is nothing more
3912			 * outstanding.  This means that the FIN was
3913			 * just acked or that we're talking to a clown.
3914			 */
3915 fin_acked:
3916			assert(tcp->tcp_fin_sent);
3917			tcp->tcp_xmit_tail = NULL;
3918			if (tcp->tcp_fin_sent) {
3919				tcp->tcp_fin_acked = B_TRUE;
3920			} else {
3921				/*
3922				 * We should never get here because
3923				 * we have already checked that the
3924				 * number of bytes ack'ed is
3925				 * smaller than or equal to what we
3926				 * have sent so far (it is the
3927				 * acceptability check of the ACK).
3928				 * We can only get here if the send
3929				 * queue is corrupted.
3930				 *
3931				 * Terminate the connection and
3932				 * panic the system.  It is better
3933				 * for us to panic instead of
3934				 * continuing and risking further
3935				 * disaster.
3936				 */
3937				tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt,
3938				    tcp->tcp_rnxt, TH_RST|TH_ACK, 0, sock_id);
3939				printf("Memory corruption "
3940				    "detected for connection %s.\n",
3941				    tcp_display(tcp, NULL,
3942				    DISP_ADDR_AND_PORT));
3943				/* We should never get here... */
3944				prom_panic("tcp_rput_data");
3945			}
3946			goto pre_swnd_update;
3947		}
3948		assert(mp2 != tcp->tcp_xmit_tail);
3949	}
3950	if (tcp->tcp_unsent) {
3951		flags |= TH_XMIT_NEEDED;
3952	}
3953 pre_swnd_update:
3954	tcp->tcp_xmit_head = mp1;
3955 swnd_update:
3956	/*
3957	 * The following check is different from most other implementations.
3958	 * For bi-directional transfer, when segments are dropped, the
3959	 * "normal" check will not accept a window update carried in those
3960	 * retransmitted segments.  Failing to do that, TCP may send out
3961	 * segments which are outside the receiver's window.  Since TCP
3962	 * accepts the ack in those retransmitted segments, if the window
3963	 * update in the same segment is not also accepted, TCP will
3964	 * incorrectly calculate that it can send more segments.  This can
3965	 * create a deadlock with the receiver if its window becomes zero.
3966	 */
3967	if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
3968	    SEQ_LT(tcp->tcp_swl1, seg_seq) ||
3969	    (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
3970		/*
3971		 * The criteria for an update are:
3972		 *
3973		 * 1. the segment acknowledges some data.  Or
3974		 * 2. the segment is new, i.e. it has a higher seq num. Or
3975		 * 3. the segment is not old and the advertised window is
3976		 * larger than the previous advertised window.
3977		 */
3978		if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
3979			flags |= TH_XMIT_NEEDED;
3980		tcp->tcp_swnd = new_swnd;
3981		if (new_swnd > tcp->tcp_max_swnd)
3982			tcp->tcp_max_swnd = new_swnd;
3983		tcp->tcp_swl1 = seg_seq;
3984		tcp->tcp_swl2 = seg_ack;
3985	}
3986 est:
3987	if (tcp->tcp_state > TCPS_ESTABLISHED) {
3988		switch (tcp->tcp_state) {
3989		case TCPS_FIN_WAIT_1:
3990			if (tcp->tcp_fin_acked) {
3991				tcp->tcp_state = TCPS_FIN_WAIT_2;
3992				/*
3993				 * We implement the non-standard BSD/SunOS
3994				 * FIN_WAIT_2 flushing algorithm.
3995				 * If there is no user attached to this
3996				 * TCP endpoint, then this TCP struct
3997				 * could hang around forever in FIN_WAIT_2
3998				 * state if the peer forgets to send us
3999				 * a FIN.  To prevent this, we wait only
4000				 * 2*MSL (a convenient time value) for
				 * the FIN to arrive.  If it doesn't show up,
4001				 * we flush the TCP endpoint.  This algorithm,
4002				 * though a violation of RFC-793, has worked
4003				 * for over 10 years in BSD systems.
4004				 * Note: SunOS 4.x waits 675 seconds before
4005				 * flushing the FIN_WAIT_2 connection.
4006				 */
4007				TCP_TIMER_RESTART(tcp,
4008				    tcp_fin_wait_2_flush_interval);
4009			}
4010			break;
4011		case TCPS_FIN_WAIT_2:
4012			break;	/* Shutdown hook? */
4013		case TCPS_LAST_ACK:
4014			freemsg(mp);
4015			if (tcp->tcp_fin_acked) {
4016				(void) tcp_clean_death(sock_id, tcp, 0);
4017				return;
4018			}
4019			goto xmit_check;
4020		case TCPS_CLOSING:
4021			if (tcp->tcp_fin_acked) {
4022				tcp->tcp_state = TCPS_TIME_WAIT;
4023				tcp_time_wait_append(tcp);
4024				TCP_TIMER_RESTART(tcp, tcp_time_wait_interval);
4025			}
4026			/*FALLTHRU*/
4027		case TCPS_CLOSE_WAIT:
4028			freemsg(mp);
4029			goto xmit_check;
4030		default:
4031			assert(tcp->tcp_state != TCPS_TIME_WAIT);
4032			break;
4033		}
4034	}
4035	if (flags & TH_FIN) {
4036		/* Make sure we ack the fin */
4037		flags |= TH_ACK_NEEDED;
4038		if (!tcp->tcp_fin_rcvd) {
4039			tcp->tcp_fin_rcvd = B_TRUE;
4040			tcp->tcp_rnxt++;
4041			U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
4042
4043			switch (tcp->tcp_state) {
4044			case TCPS_SYN_RCVD:
4045			case TCPS_ESTABLISHED:
4046				tcp->tcp_state = TCPS_CLOSE_WAIT;
4047				/* Keepalive? */
4048				break;
4049			case TCPS_FIN_WAIT_1:
4050				if (!tcp->tcp_fin_acked) {
4051					tcp->tcp_state = TCPS_CLOSING;
4052					break;
4053				}
4054				/* FALLTHRU */
4055			case TCPS_FIN_WAIT_2:
4056				tcp->tcp_state = TCPS_TIME_WAIT;
4057				tcp_time_wait_append(tcp);
4058				TCP_TIMER_RESTART(tcp, tcp_time_wait_interval);
4059				if (seg_len) {
4060					/*
4061					 * implies data piggybacked on FIN.
4062					 * break to handle data.
4063					 */
4064					break;
4065				}
4066				freemsg(mp);
4067				goto ack_check;
4068			}
4069		}
4070	}
4071	if (mp == NULL)
4072		goto xmit_check;
4073	if (seg_len == 0) {
4074		freemsg(mp);
4075		goto xmit_check;
4076	}
4077	if (mp->b_rptr == mp->b_wptr) {
4078		/*
4079		 * The header has been consumed, so we remove the
4080		 * zero-length mblk here.
4081		 */
4082		mp1 = mp;
4083		mp = mp->b_cont;
4084		freeb(mp1);
4085	}
4086	/*
4087	 * ACK every other segment, unless the input queue is empty,
4088	 * as we don't have a timer available.
4089	 */
4090	if (++tcp->tcp_rack_cnt == 2 || sockets[sock_id].inq == NULL) {
4091		flags |= TH_ACK_NEEDED;
4092		tcp->tcp_rack_cnt = 0;
4093	}
4094	tcp->tcp_rnxt += seg_len;
4095	U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
4096
4097	/* Update SACK list */
4098	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
4099		tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
4100		    &(tcp->tcp_num_sack_blk));
4101	}
4102
4103	if (tcp->tcp_listener) {
4104		/*
4105		 * Side queue inbound data until the accept happens.
4106		 * tcp_accept/tcp_rput drains this when the accept happens.
4107		 */
4108		tcp_rcv_enqueue(tcp, mp, seg_len);
4109	} else {
4110		/* Just queue the data until the app calls read. */
4111		tcp_rcv_enqueue(tcp, mp, seg_len);
4112		/*
4113		 * Make sure the timer is running if we have data waiting
4114		 * for a push bit.  This provides resiliency against
4115		 * implementations that do not correctly generate push bits.
4116		 */
4117		if (tcp->tcp_rcv_list != NULL)
4118			flags |= TH_TIMER_NEEDED;
4119	}
4120
4121 xmit_check:
4122	/* Is there anything left to do? */
4123	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED|
4124	    TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_TIMER_NEEDED)) == 0)
4125		return;
4126
4127	/* Any transmit work to do and a non-zero window? */
4128	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4129	    TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4130		if (flags & TH_REXMIT_NEEDED) {
4131			uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4132
4133			if (snd_size > mss)
4134				snd_size = mss;
4135			if (snd_size > tcp->tcp_swnd)
4136				snd_size = tcp->tcp_swnd;
4137			mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4138			    NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4139			    B_TRUE);
4140
4141			if (mp1 != NULL) {
4142				/* use uintptr_t to suppress the gcc warning */
4143				tcp->tcp_xmit_head->b_prev =
4144				    (mblk_t *)(uintptr_t)prom_gettime();
4145				tcp->tcp_csuna = tcp->tcp_snxt;
4146				BUMP_MIB(tcp_mib.tcpRetransSegs);
4147				UPDATE_MIB(tcp_mib.tcpRetransBytes, snd_size);
4148				(void) ipv4_tcp_output(sock_id, mp1);
4149				freeb(mp1);
4150			}
4151		}
4152		if (flags & TH_NEED_SACK_REXMIT) {
4153			if (tcp_sack_rxmit(tcp, sock_id) != 0) {
4154				flags |= TH_XMIT_NEEDED;
4155			}
4156		}
4157		/*
4158		 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send out
4159		 * a new segment.  Note that tcp_rexmit should not be set;
4160		 * otherwise TH_LIMIT_XMIT would not have been set either.
4161		 */
4162		if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4163			if (!tcp->tcp_rexmit) {
4164				tcp_wput_data(tcp, NULL, sock_id);
4165			} else {
4166				tcp_ss_rexmit(tcp, sock_id);
4167			}
4168			/*
4169			 * The TCP could be closed in tcp_state_wait via
4170			 * tcp_wput_data (tcp_ss_rexmit could call
4171			 * tcp_wput_data as well).
4172			 */
4173			if (sockets[sock_id].pcb == NULL)
4174				return;
4175		}
4176		/*
4177		 * Adjust tcp_cwnd back to its normal value after sending
4178		 * new data segments.
4179		 */
4180		if (flags & TH_LIMIT_XMIT) {
4181			tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4182		}
4183
4184		/* Anything more to do? */
4185		if ((flags & (TH_ACK_NEEDED|TH_TIMER_NEEDED)) == 0)
4186			return;
4187	}
4188 ack_check:
4189	if (flags & TH_ACK_NEEDED) {
4190		/*
4191		 * Time to send an ack for some reason.
4192		 */
4193		if ((mp1 = tcp_ack_mp(tcp)) != NULL) {
4194			TCP_DUMP_PACKET("tcp_rput_data: ack mp", mp1);
4195			(void) ipv4_tcp_output(sock_id, mp1);
4196			BUMP_MIB(tcp_mib.tcpOutAck);
4197			freeb(mp1);
4198		}
4199	}
4200 }
4201
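/*
 * All of the SEQ_LT/SEQ_GT/SEQ_LEQ/SEQ_GEQ comparisons used above rely
 * on 32-bit modular arithmetic, so they stay correct when sequence
 * numbers wrap around zero.  A minimal sketch of the idea (the macro
 * and function names here are made up for illustration and compiled
 * out):
 */
#ifdef	notdef
#define	EX_SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)

static void
seq_wrap_example(void)
{
	uint32_t a = 0xfffffff0;	/* just before the wrap point */
	uint32_t b = 0x00000010;	/* just after the wrap point */

	/*
	 * a - b is -0x20 as a signed 32-bit value, so a is "before" b
	 * even though a > b as plain unsigned numbers.
	 */
	assert(EX_SEQ_LT(a, b));
	assert(!EX_SEQ_LT(b, a));
}
#endif	/* notdef */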
4202 /*
4203  * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
4204  * retransmission after a timeout.
4205  *
4206  * To limit the number of duplicate segments, we limit the number
4207  * of segments sent at one time to tcp_snd_burst, the burst variable.
4208  */
4209 static void
4210 tcp_ss_rexmit(tcp_t *tcp, int sock_id)
4211 {
4212	uint32_t snxt;
4213	uint32_t smax;
4214	int32_t win;
4215	int32_t mss;
4216	int32_t off;
4217	int32_t burst = tcp->tcp_snd_burst;
4218	mblk_t *snxt_mp;
4219
4220	/*
4221	 * Note that tcp_rexmit can be set even though TCP has retransmitted
4222	 * all unack'ed segments.
4223	 */
4224	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
4225		smax = tcp->tcp_rexmit_max;
4226		snxt = tcp->tcp_rexmit_nxt;
4227		if (SEQ_LT(snxt, tcp->tcp_suna)) {
4228			snxt = tcp->tcp_suna;
4229		}
4230		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
4231		win -= snxt - tcp->tcp_suna;
4232		mss = tcp->tcp_mss;
4233		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
4234
4235		while (SEQ_LT(snxt, smax) && (win > 0) &&
4236		    (burst > 0) && (snxt_mp != NULL)) {
4237			mblk_t *xmit_mp;
4238			mblk_t *old_snxt_mp = snxt_mp;
4239			uint32_t cnt = mss;
4240
4241			if (win < cnt) {
4242				cnt = win;
4243			}
4244			if (SEQ_GT(snxt + cnt, smax)) {
4245				cnt = smax - snxt;
4246			}
4247			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
4248			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
4249
4250			if (xmit_mp == NULL)
4251				return;
4252
4253			(void) ipv4_tcp_output(sock_id, xmit_mp);
4254			freeb(xmit_mp);
4255
4256			snxt += cnt;
4257			win -= cnt;
4258			/*
4259			 * Update the send timestamp to avoid false
4260			 * retransmission.
4261			 * Note. use uintptr_t to suppress the gcc warning.
4262			 */
4263			old_snxt_mp->b_prev =
4264			    (mblk_t *)(uintptr_t)prom_gettime();
4265			BUMP_MIB(tcp_mib.tcpRetransSegs);
4266			UPDATE_MIB(tcp_mib.tcpRetransBytes, cnt);
4267
4268			tcp->tcp_rexmit_nxt = snxt;
4269			burst--;
4270		}
4271		/*
4272		 * If we have transmitted all we had at the time
4273		 * we started the retransmission, we can leave
4274		 * the rest of the job to tcp_wput_data().  But we
4275		 * need to check the send window first.  If the
4276		 * win is not 0, go on with tcp_wput_data().
4277		 */
4278		if (SEQ_LT(snxt, smax) || win == 0) {
4279			return;
4280		}
4281	}
4282	/* Only call tcp_wput_data() if there is data to be sent. */
4283	if (tcp->tcp_unsent) {
4284		tcp_wput_data(tcp, NULL, sock_id);
4285	}
4286 }
4287
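/*
 * A minimal sketch of the budget tcp_ss_rexmit() works from (the
 * helper is hypothetical and compiled out): the retransmit window is
 * the smaller of cwnd and swnd minus what is already in flight, and
 * at most "burst" segments are sent from it in one call.  The real
 * loop above will also send a trailing partial segment when win is
 * not a multiple of mss.
 */
#ifdef	notdef
static int32_t
ss_rexmit_budget(uint32_t cwnd, uint32_t swnd, uint32_t snxt,
    uint32_t suna, uint32_t mss, int32_t burst)
{
	int32_t win = MIN(cwnd, swnd);
	int32_t segs;

	win -= (int32_t)(snxt - suna);	/* subtract bytes in flight */
	if (win <= 0)
		return (0);
	segs = win / mss;		/* full segments that fit */
	if (segs > burst)
		segs = burst;		/* TCP_CWND_SS after a timeout */
	return (segs);
}
#endif	/* notdef */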
4288 /*
4289  * tcp_timer is the timer service routine.  It handles all timer events for
4290  * a tcp instance except keepalives.  It figures out from the state of the
4291  * tcp instance what kind of action needs to be done at the time it is called.
4292  */
4293 static void
4294 tcp_timer(tcp_t *tcp, int sock_id)
4295 {
4296	mblk_t *mp;
4297	uint32_t first_threshold;
4298	uint32_t second_threshold;
4299	uint32_t ms;
4300	uint32_t mss;
4301
4302	first_threshold = tcp->tcp_first_timer_threshold;
4303	second_threshold = tcp->tcp_second_timer_threshold;
4304	switch (tcp->tcp_state) {
4305	case TCPS_IDLE:
4306	case TCPS_BOUND:
4307	case TCPS_LISTEN:
4308		return;
4309	case TCPS_SYN_RCVD:
4310	case TCPS_SYN_SENT:
4311		first_threshold = tcp->tcp_first_ctimer_threshold;
4312		second_threshold = tcp->tcp_second_ctimer_threshold;
4313		break;
4314	case TCPS_ESTABLISHED:
4315	case TCPS_FIN_WAIT_1:
4316	case TCPS_CLOSING:
4317	case TCPS_CLOSE_WAIT:
4318	case TCPS_LAST_ACK:
4319		/* If we have data to rexmit */
4320		if (tcp->tcp_suna != tcp->tcp_snxt) {
4321			int32_t time_to_wait;
4322
4323			BUMP_MIB(tcp_mib.tcpTimRetrans);
4324			if (tcp->tcp_xmit_head == NULL)
4325				break;
4326			/* use uintptr_t to suppress the gcc warning */
4327			time_to_wait = (int32_t)(prom_gettime() -
4328			    (uint32_t)(uintptr_t)tcp->tcp_xmit_head->b_prev);
4329			time_to_wait = tcp->tcp_rto - time_to_wait;
4330			if (time_to_wait > 0) {
4331				/*
4332				 * Timer fired too early, so restart it.
4333				 */
4334				TCP_TIMER_RESTART(tcp, time_to_wait);
4335				return;
4336			}
4337			/*
4338			 * When we probe zero windows, we force the swnd open.
4339			 * If our peer acks with a closed window swnd will be
4340			 * set to zero by tcp_rput().  As long as we are
4341			 * receiving acks, tcp_rput() will reset
4342			 * 'tcp_ms_we_have_waited' so as not to trip the
4343			 * first and second interval actions.  NOTE: the timer
4344			 * interval is allowed to continue its exponential
4345			 * backoff.
4346			 */
4347			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
4348				DEBUG_1("tcp_timer (%d): zero win", sock_id);
4349				break;
4350			} else {
4351				/*
4352				 * After retransmission, we need to do
4353				 * slow start.  Set the ssthresh to one
4354				 * half of current effective window and
4355				 * cwnd to one MSS.  Also reset
4356				 * tcp_cwnd_cnt.
4357				 *
4358				 * Note that if tcp_ssthresh is reduced because
4359				 * of ECN, do not reduce it again unless it is
4360				 * already one window of data away (tcp_cwr
4361				 * should then be cleared) or this is a
4362				 * timeout for a retransmitted segment.
4363				 */
4364				uint32_t npkt;

4365				if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
4366					npkt = (MIN((tcp->tcp_timer_backoff ?
4367					    tcp->tcp_cwnd_ssthresh :
4368					    tcp->tcp_cwnd),
4369					    tcp->tcp_swnd) >> 1) /
4370					    tcp->tcp_mss;
4371					if (npkt < 2)
4372						npkt = 2;
4373					tcp->tcp_cwnd_ssthresh = npkt *
4374					    tcp->tcp_mss;
4375				}
4376				tcp->tcp_cwnd = tcp->tcp_mss;
4377				tcp->tcp_cwnd_cnt = 0;
4378				if (tcp->tcp_ecn_ok) {
4379					tcp->tcp_cwr = B_TRUE;
4380					tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4381					tcp->tcp_ecn_cwr_sent = B_FALSE;
4382				}
4383			}
4384			break;
4385		}
4386		/*
4387		 * We have something to send yet we cannot send.  The
4388		 * reason can be:
4389		 *
4390		 * 1. Zero send window: we need to do zero window probe.
4391		 * 2. Zero cwnd: because of ECN, we need to "clock out"
4392		 * segments.
4393		 * 3. SWS avoidance: receiver may have shrunk window,
4394		 * reset our knowledge.
4395		 *
4396		 * Note that condition 2 can happen with either 1 or
4397		 * 3.  But 1 and 3 are exclusive.
4398		 */
4399		if (tcp->tcp_unsent != 0) {
4400			if (tcp->tcp_cwnd == 0) {
4401				/*
4402				 * Set tcp_cwnd to 1 MSS so that a
4403				 * new segment can be sent out.  We
4404				 * are "clocking out" new data when
4405				 * the network is really congested.
4406				 */
4407				assert(tcp->tcp_ecn_ok);
4408				tcp->tcp_cwnd = tcp->tcp_mss;
4409			}
4410			if (tcp->tcp_swnd == 0) {
4411				/* Extend window for zero window probe */
4412				tcp->tcp_swnd++;
4413				tcp->tcp_zero_win_probe = B_TRUE;
4414				BUMP_MIB(tcp_mib.tcpOutWinProbe);
4415			} else {
4416				/*
4417				 * Handle timeout from sender SWS avoidance.
4418				 * Reset our knowledge of the max send window
4419				 * since the receiver might have reduced its
4420				 * receive buffer.  Avoid setting tcp_max_swnd
4421				 * to one since that will essentially disable
4422				 * the SWS checks.
4423				 *
4424				 * Note that since we don't have an SWS
4425				 * state variable, this code will also be
4426				 * executed if the timeout was set for ECN
4427				 * but not for SWS.  That is fine, as
4428				 * tcp_max_swnd is updated constantly and
4429				 * the update here will not affect
4430				 * anything.
4431				 */
4432				tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
4433			}
4434			tcp_wput_data(tcp, NULL, sock_id);
4435			return;
4436		}
4437		/* Is there a FIN that needs to be retransmitted? */
4438		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4439		    !tcp->tcp_fin_acked)
4440			break;
4441		/* Nothing to do, return without restarting timer. */
4442		return;
4443	case TCPS_FIN_WAIT_2:
4444		/*
4445		 * User closed the TCP endpoint and peer ACK'ed our FIN.
4446		 * We waited some time for the peer's FIN, but it hasn't
4447		 * arrived.  We flush the connection now to avoid the
4448		 * case where the peer has rebooted.
4450		 */
4451		/* FALLTHRU */
4452	case TCPS_TIME_WAIT:
4453		(void) tcp_clean_death(sock_id, tcp, 0);
4454		return;
4455	default:
4456		DEBUG_3("tcp_timer (%d): strange state (%d) %s", sock_id,
4457		    tcp->tcp_state, tcp_display(tcp, NULL,
4458		    DISP_PORT_ONLY));
4459		return;
4460	}
4461	if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
4462		/*
4463		 * For zero window probe, we need to send indefinitely,
4464		 * unless we have not heard from the other side for some
4465		 * time...
4466		 */
4467		if ((tcp->tcp_zero_win_probe == 0) ||
4468		    ((prom_gettime() - tcp->tcp_last_recv_time) >
4469		    second_threshold)) {
4470			BUMP_MIB(tcp_mib.tcpTimRetransDrop);
4471			/*
4472			 * If TCP is in SYN_RCVD state, send back a
4473			 * RST|ACK as BSD does.  Note that tcp_zero_win_probe
4474			 * should be zero in TCPS_SYN_RCVD state.
4475			 */
4476			if (tcp->tcp_state == TCPS_SYN_RCVD) {
4477				tcp_xmit_ctl("tcp_timer: RST sent on timeout "
4478				    "in SYN_RCVD",
4479				    tcp, NULL, tcp->tcp_snxt,
4480				    tcp->tcp_rnxt, TH_RST | TH_ACK, 0, sock_id);
4481			}
4482			(void) tcp_clean_death(sock_id, tcp,
4483			    tcp->tcp_client_errno ?
4484			    tcp->tcp_client_errno : ETIMEDOUT);
4485			return;
4486		} else {
4487			/*
4488			 * Set tcp_ms_we_have_waited to second_threshold
4489			 * so that in the next timeout, we will do the above
4490			 * check (lbolt - tcp_last_recv_time).  This is
4491			 * also to avoid overflow.
4492			 *
4493			 * We don't need to decrement tcp_timer_backoff
4494			 * to avoid overflow because it will be decremented
4495			 * later if the new timeout value is greater than
4496			 * tcp_rexmit_interval_max.  In the case when
4497			 * tcp_rexmit_interval_max is greater than
4498			 * second_threshold, it means that we will wait
4499			 * longer than second_threshold to send the next
4500			 * window probe.
4501			 */
4502			tcp->tcp_ms_we_have_waited = second_threshold;
4503		}
4504	} else if (ms > first_threshold && tcp->tcp_rtt_sa != 0) {
4505		/*
4506		 * We have been retransmitting for too long...  The RTT
4507		 * we calculated is probably incorrect.  Reinitialize it.
4508		 * Need to compensate for 0 tcp_rtt_sa.  Reset
4509		 * tcp_rtt_update so that we won't accidentally cache a
4510		 * bad value.  But only do this if this is not a zero
4511		 * window probe.
4512		 */
4513		if (tcp->tcp_zero_win_probe == 0) {
4514			tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
4515			    (tcp->tcp_rtt_sa >> 5);
4516			tcp->tcp_rtt_sa = 0;
4517			tcp->tcp_rtt_update = 0;
4518		}
4519	}
4520	tcp->tcp_timer_backoff++;
4521	if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
4522	    tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
4523	    tcp_rexmit_interval_min) {
4524		/*
4525		 * This means the original RTO is tcp_rexmit_interval_min.
4526		 * So we will use tcp_rexmit_interval_min as the RTO value
4527		 * and do the backoff.
4528		 */
4529		ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff;
4530	} else {
4531		ms <<= tcp->tcp_timer_backoff;
4532	}
4533	if (ms > tcp_rexmit_interval_max) {
4534		ms = tcp_rexmit_interval_max;
4535		/*
4536		 * ms is at max, decrement tcp_timer_backoff to avoid
4537		 * overflow.
4538		 */
4539		tcp->tcp_timer_backoff--;
4540	}
4541	tcp->tcp_ms_we_have_waited += ms;
4542	if (tcp->tcp_zero_win_probe == 0) {
4543		tcp->tcp_rto = ms;
4544	}
4545	TCP_TIMER_RESTART(tcp, ms);
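	/*
	 * Worked example of the backoff above, assuming tcp_rtt_sa = 800
	 * (a smoothed RTT of 100 ms scaled by 8), tcp_rtt_sd = 50,
	 * tcp_rexmit_interval_extra = 0, tcp_rexmit_interval_min = 400
	 * and tcp_rexmit_interval_max = 60000, all in milliseconds:
	 * the base value (800 >> 3) + 50 + 0 + (800 >> 5) = 175 is below
	 * the 400 ms floor, so successive timeouts use 400 << backoff,
	 * i.e. 800, 1600, 3200, ... capped at 60000, at which point
	 * tcp_timer_backoff is pulled back to keep the shift in range.
	 */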
4550 */ 4551 tcp->tcp_set_timer = 1; 4552 mss = tcp->tcp_snxt - tcp->tcp_suna; 4553 if (mss > tcp->tcp_mss) 4554 mss = tcp->tcp_mss; 4555 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 4556 mss = tcp->tcp_swnd; 4557 4558 if ((mp = tcp->tcp_xmit_head) != NULL) { 4559 /* use uintptr_t to suppress the gcc warning */ 4560 mp->b_prev = (mblk_t *)(uintptr_t)prom_gettime(); 4561 } 4562 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 4563 B_TRUE); 4564 if (mp == NULL) 4565 return; 4566 tcp->tcp_csuna = tcp->tcp_snxt; 4567 BUMP_MIB(tcp_mib.tcpRetransSegs); 4568 UPDATE_MIB(tcp_mib.tcpRetransBytes, mss); 4569 /* Dump the packet when debugging. */ 4570 TCP_DUMP_PACKET("tcp_timer", mp); 4571 4572 (void) ipv4_tcp_output(sock_id, mp); 4573 freeb(mp); 4574 4575 /* 4576 * When slow start after retransmission begins, start with 4577 * this seq no. tcp_rexmit_max marks the end of special slow 4578 * start phase. tcp_snd_burst controls how many segments 4579 * can be sent because of an ack. 4580 */ 4581 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 4582 tcp->tcp_snd_burst = TCP_CWND_SS; 4583 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 4584 (tcp->tcp_unsent == 0)) { 4585 tcp->tcp_rexmit_max = tcp->tcp_fss; 4586 } else { 4587 tcp->tcp_rexmit_max = tcp->tcp_snxt; 4588 } 4589 tcp->tcp_rexmit = B_TRUE; 4590 tcp->tcp_dupack_cnt = 0; 4591 4592 /* 4593 * Remove all rexmit SACK blk to start from fresh. 4594 */ 4595 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 4596 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4597 tcp->tcp_num_notsack_blk = 0; 4598 tcp->tcp_cnt_notsack_list = 0; 4599 } 4600 } 4601 4602 /* 4603 * The TCP normal data output path. 4604 * NOTE: the logic of the fast path is duplicated from this function. 4605 */ 4606 static void 4607 tcp_wput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 4608 { 4609 int len; 4610 mblk_t *local_time; 4611 mblk_t *mp1; 4612 uchar_t *rptr; 4613 uint32_t snxt; 4614 int tail_unsent; 4615 int tcpstate; 4616 int usable = 0; 4617 mblk_t *xmit_tail; 4618 int32_t num_burst_seg; 4619 int32_t mss; 4620 int32_t num_sack_blk = 0; 4621 int32_t tcp_hdr_len; 4622 ipaddr_t *dst; 4623 ipaddr_t *src; 4624 4625 #ifdef DEBUG 4626 printf("tcp_wput_data(%d) ##############################\n", sock_id); 4627 #endif 4628 tcpstate = tcp->tcp_state; 4629 if (mp == NULL) { 4630 /* Really tacky... but we need this for detached closes. */ 4631 len = tcp->tcp_unsent; 4632 goto data_null; 4633 } 4634 4635 /* 4636 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 4637 * or before a connection attempt has begun. 4638 * 4639 * The following should not happen in inetboot.... 4640 */ 4641 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 4642 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4643 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4644 printf("tcp_wput_data: data after ordrel, %s\n", 4645 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 4646 } 4647 freemsg(mp); 4648 return; 4649 } 4650 4651 /* Strip empties */ 4652 for (;;) { 4653 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 4654 (uintptr_t)INT_MAX); 4655 len = (int)(mp->b_wptr - mp->b_rptr); 4656 if (len > 0) 4657 break; 4658 mp1 = mp; 4659 mp = mp->b_cont; 4660 freeb(mp1); 4661 if (mp == NULL) { 4662 return; 4663 } 4664 } 4665 4666 /* If we are the first on the list ... 
4602 /*
4603  * The TCP normal data output path.
4604  * NOTE: the logic of the fast path is duplicated from this function.
4605  */
4606 static void
4607 tcp_wput_data(tcp_t *tcp, mblk_t *mp, int sock_id)
4608 {
4609	int len;
4610	mblk_t *local_time;
4611	mblk_t *mp1;
4612	uchar_t *rptr;
4613	uint32_t snxt;
4614	int tail_unsent;
4615	int tcpstate;
4616	int usable = 0;
4617	mblk_t *xmit_tail;
4618	int32_t num_burst_seg;
4619	int32_t mss;
4620	int32_t num_sack_blk = 0;
4621	int32_t tcp_hdr_len;
4622	ipaddr_t *dst;
4623	ipaddr_t *src;
4624
4625 #ifdef DEBUG
4626	printf("tcp_wput_data(%d) ##############################\n", sock_id);
4627 #endif
4628	tcpstate = tcp->tcp_state;
4629	if (mp == NULL) {
4630		/* Really tacky... but we need this for detached closes. */
4631		len = tcp->tcp_unsent;
4632		goto data_null;
4633	}
4634
4635	/*
4636	 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
4637	 * or before a connection attempt has begun.
4638	 *
4639	 * The following should not happen in inetboot....
4640	 */
4641	if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
4642	    (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
4643		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
4644			printf("tcp_wput_data: data after ordrel, %s\n",
4645			    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
4646		}
4647		freemsg(mp);
4648		return;
4649	}
4650
4651	/* Strip empties */
4652	for (;;) {
4653		assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
4654		    (uintptr_t)INT_MAX);
4655		len = (int)(mp->b_wptr - mp->b_rptr);
4656		if (len > 0)
4657			break;
4658		mp1 = mp;
4659		mp = mp->b_cont;
4660		freeb(mp1);
4661		if (mp == NULL) {
4662			return;
4663		}
4664	}
4665
4666	/* If we are the first on the list ... */
4667	if (tcp->tcp_xmit_head == NULL) {
4668		tcp->tcp_xmit_head = mp;
4669		tcp->tcp_xmit_tail = mp;
4670		tcp->tcp_xmit_tail_unsent = len;
4671	} else {
4672		tcp->tcp_xmit_last->b_cont = mp;
4673		len += tcp->tcp_unsent;
4674	}
4675
4676	/* Tack on however many more positive length mblks we have */
4677	if ((mp1 = mp->b_cont) != NULL) {
4678		do {
4679			int tlen;
4680			assert((uintptr_t)(mp1->b_wptr -
4681			    mp1->b_rptr) <= (uintptr_t)INT_MAX);
4682			tlen = (int)(mp1->b_wptr - mp1->b_rptr);
4683			if (tlen <= 0) {
4684				mp->b_cont = mp1->b_cont;
4685				freeb(mp1);
4686			} else {
4687				len += tlen;
4688				mp = mp1;
4689			}
4690		} while ((mp1 = mp->b_cont) != NULL);
4691	}
4692	tcp->tcp_xmit_last = mp;
4693	tcp->tcp_unsent = len;
4694
4695 data_null:
4696	snxt = tcp->tcp_snxt;
4697	xmit_tail = tcp->tcp_xmit_tail;
4698	tail_unsent = tcp->tcp_xmit_tail_unsent;
4699
4700	/*
4701	 * Note that tcp_mss has been adjusted to take into account the
4702	 * timestamp option if applicable.  Because SACK options do not
4703	 * appear in every TCP segment and are of variable length,
4704	 * they cannot be included in tcp_mss.  Thus we need to calculate
4705	 * the actual segment length when we need to send a segment which
4706	 * includes SACK options.
4707	 */
4708	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
4709		int32_t opt_len;

4710		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
4711		    tcp->tcp_num_sack_blk);
4712		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
4713		    2 + TCPOPT_HEADER_LEN;
4714		mss = tcp->tcp_mss - opt_len;
4715		tcp_hdr_len = tcp->tcp_hdr_len + opt_len;
4716	} else {
4717		mss = tcp->tcp_mss;
4718		tcp_hdr_len = tcp->tcp_hdr_len;
4719	}
4720
4722	if ((tcp->tcp_suna == snxt) &&
4723	    (prom_gettime() - tcp->tcp_last_recv_time) >= tcp->tcp_rto) {
4724		tcp->tcp_cwnd = MIN(tcp_slow_start_after_idle * mss,
4725		    MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss)));
4726	}
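	/*
	 * Worked example of the restart window above, assuming
	 * tcp_slow_start_after_idle = 4 (a common default) and
	 * mss = 1460: 4380 / 1460 * 1460 = 4380,
	 * MAX(2 * 1460, 4380) = 4380, and both MIN(4 * 1460, 4380)
	 * and MIN(4 * 1460, 4380) again yield 4380, so the connection
	 * restarts with a cwnd of three segments, in line with the
	 * RFC 3390 initial window.
	 */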
4727	if (tcpstate == TCPS_SYN_RCVD) {
4728		/*
4729		 * The three-way connection establishment handshake is not
4730		 * complete yet.  We want to queue the data for transmission
4731		 * after entering ESTABLISHED state (RFC793).  Setting usable
4732		 * to zero causes a jump to the "done" label, effectively
4733		 * leaving the data on the queue.
4734		 */

4736		usable = 0;
4737	} else {
4738		int usable_r = tcp->tcp_swnd;

4740		/*
4741		 * In the special case when cwnd is zero, which can only
4742		 * happen if the connection is ECN capable, return now.
4743		 * New segments are sent using tcp_timer().  The timer
4744		 * is set in tcp_rput_data().
4745		 */
4746		if (tcp->tcp_cwnd == 0) {
4747			/*
4748			 * Note that tcp_cwnd is 0 before 3-way handshake is
4749			 * finished.
4750			 */
4751			assert(tcp->tcp_ecn_ok ||
4752			    tcp->tcp_state < TCPS_ESTABLISHED);
4753			return;
4754		}

4756		/* usable = MIN(swnd, cwnd) - unacked_bytes */
4757		if (usable_r > tcp->tcp_cwnd)
4758			usable_r = tcp->tcp_cwnd;

4760		/* NOTE: trouble if xmitting while SYN not acked? */
4761		usable_r -= snxt;
4762		usable_r += tcp->tcp_suna;

4764		/* usable = MIN(usable, unsent) */
4765		if (usable_r > len)
4766			usable_r = len;

4768		/* usable = MAX(usable, {1 for urgent, 0 for data}) */
4769		if (usable_r != 0)
4770			usable = usable_r;
4771	}

4773	/* use uintptr_t to suppress the gcc warning */
4774	local_time = (mblk_t *)(uintptr_t)prom_gettime();

4776	/*
4777	 * "Our" Nagle Algorithm.  This is not the same as in the old
4778	 * BSD.  This is more in line with the true intent of Nagle.
4779	 *
4780	 * The conditions are:
4781	 * 1. The amount of unsent data (or amount of data which can be
4782	 *    sent, whichever is smaller) is less than the Nagle limit.
4783	 * 2. The last sent size is also less than the Nagle limit.
4784	 * 3. There is unack'ed data.
4785	 * 4. Urgent pointer is not set.  Send urgent data ignoring the
4786	 *    Nagle algorithm.  This reduces the probability that urgent
4787	 *    bytes get "merged" together.
4788	 * 5. The app has not closed the connection.  This eliminates the
4789	 *    wait time of the receiving side waiting for the last piece of
4790	 *    (small) data.
4791	 *
4792	 * If all are satisfied, exit without sending anything.  Note
4793	 * that the Nagle limit can be smaller than 1 MSS.  The Nagle
4794	 * limit is the smaller of 1 MSS and the global tcp_naglim_def
4795	 * (which defaults to 4095).
4796	 */
4797	if (usable < (int)tcp->tcp_naglim &&
4798	    tcp->tcp_naglim > tcp->tcp_last_sent_len &&
4799	    snxt != tcp->tcp_suna &&
4800	    !(tcp->tcp_valid_bits & TCP_URG_VALID))
4801		goto done;

4803	num_burst_seg = tcp->tcp_snd_burst;
4804	for (;;) {
4805		tcph_t *tcph;
4806		mblk_t *new_mp;

4808		if (num_burst_seg-- == 0)
4809			goto done;

4811		len = mss;
4812		if (len > usable) {
4813			len = usable;
4814			if (len <= 0) {
4815				/* Terminate the loop */
4816				goto done;
4817			}
4818			/*
4819			 * Sender silly-window avoidance.
4820			 * Ignore this if we are going to send a
4821			 * zero window probe out.
4822			 *
4823			 * TODO: force data into microscopic window ??
4824			 *	==> (!pushed || (unsent > usable))
4825			 */
4826			if (len < (tcp->tcp_max_swnd >> 1) &&
4827			    (tcp->tcp_unsent - (snxt - tcp->tcp_snxt)) > len &&
4828			    !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4829			    len == 1) && (! tcp->tcp_zero_win_probe)) {
4830				/*
4831				 * If the retransmit timer is not running
4832				 * we start it so that we will retransmit
4833				 * in the case when the receiver has
4834				 * decremented the window.
4835				 */
4836				if (snxt == tcp->tcp_snxt &&
4837				    snxt == tcp->tcp_suna) {
4838					/*
4839					 * We are not supposed to send
4840					 * anything.  So let's wait a little
4841					 * bit longer before breaking SWS
4842					 * avoidance.
4843					 *
4844					 * What should the value be?
4845					 * Suggestion: MAX(init rexmit time,
4846					 * tcp->tcp_rto)
4847					 */
4848					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4849				}
4850				goto done;
4851			}
4852		}

4854		tcph = tcp->tcp_tcph;

4856		usable -= len;	/* Approximate - can be adjusted later */
4857		if (usable > 0)
4858			tcph->th_flags[0] = TH_ACK;
4859		else
4860			tcph->th_flags[0] = (TH_ACK | TH_PUSH);

4862		U32_TO_ABE32(snxt, tcph->th_seq);

4864		if (tcp->tcp_valid_bits) {
4865			uchar_t *prev_rptr = xmit_tail->b_rptr;
4866			uint32_t prev_snxt = tcp->tcp_snxt;

4868			if (tail_unsent == 0) {
4869				assert(xmit_tail->b_cont != NULL);
4870				xmit_tail = xmit_tail->b_cont;
4871				prev_rptr = xmit_tail->b_rptr;
4872				tail_unsent = (int)(xmit_tail->b_wptr -
4873				    xmit_tail->b_rptr);
4874			} else {
4875				xmit_tail->b_rptr = xmit_tail->b_wptr -
4876				    tail_unsent;
4877			}
4878			mp = tcp_xmit_mp(tcp, xmit_tail, len, NULL, NULL,
4879			    snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
4880			/* Restore tcp_snxt so we get amount sent right. */
4881			tcp->tcp_snxt = prev_snxt;
4882			if (prev_rptr == xmit_tail->b_rptr)
4883				xmit_tail->b_prev = local_time;
4884			else
4885				xmit_tail->b_rptr = prev_rptr;

4887			if (mp == NULL)
4888				break;

4890			mp1 = mp->b_cont;

4892			snxt += len;
4893			tcp->tcp_last_sent_len = (ushort_t)len;
4894			while (mp1->b_cont) {
4895				xmit_tail = xmit_tail->b_cont;
4896				xmit_tail->b_prev = local_time;
4897				mp1 = mp1->b_cont;
4898			}
4899			tail_unsent = xmit_tail->b_wptr - mp1->b_wptr;
4900			BUMP_MIB(tcp_mib.tcpOutDataSegs);
4901			UPDATE_MIB(tcp_mib.tcpOutDataBytes, len);
4902			/* Dump the packet when debugging. */
4903			TCP_DUMP_PACKET("tcp_wput_data (valid bits)", mp);
4904			(void) ipv4_tcp_output(sock_id, mp);
4905			freeb(mp);
4906			continue;
4907		}

4909		snxt += len;	/* Adjust later if we don't send all of len */
4910		BUMP_MIB(tcp_mib.tcpOutDataSegs);
4911		UPDATE_MIB(tcp_mib.tcpOutDataBytes, len);

4913		if (tail_unsent) {
4914			/* Are the bytes above us in flight? */
4915			rptr = xmit_tail->b_wptr - tail_unsent;
4916			if (rptr != xmit_tail->b_rptr) {
4917				tail_unsent -= len;
4918				len += tcp_hdr_len;
4919				tcp->tcp_ipha->ip_len = htons(len);
4920				mp = dupb(xmit_tail);
4921				if (!mp)
4922					break;
4923				mp->b_rptr = rptr;
4924				goto must_alloc;
4925			}
4926		} else {
4927			xmit_tail = xmit_tail->b_cont;
4928			assert((uintptr_t)(xmit_tail->b_wptr -
4929			    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
4930			tail_unsent = (int)(xmit_tail->b_wptr -
4931			    xmit_tail->b_rptr);
4932		}

4934		tail_unsent -= len;
4935		tcp->tcp_last_sent_len = (ushort_t)len;

4937		len += tcp_hdr_len;
4938		if (tcp->tcp_ipversion == IPV4_VERSION)
4939			tcp->tcp_ipha->ip_len = htons(len);

4941		xmit_tail->b_prev = local_time;

4943		mp = dupb(xmit_tail);
4944		if (mp == NULL)
4945			goto out_of_mem;

4947		len = tcp_hdr_len;
4948		/*
4949		 * There are four reasons to allocate a new hdr mblk:
4950		 * 1) The bytes above us are in use by another packet
4951		 * 2) We don't have good alignment
4952		 * 3) The mblk is being shared
4953		 * 4) We don't have enough room for a header
4954		 */
4955		rptr = mp->b_rptr - len;
4956		if (!OK_32PTR(rptr) ||
4957		    rptr < mp->b_datap) {
4958			/* NOTE: we assume allocb returns an OK_32PTR */

4960 must_alloc:;
4961			mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH +
4962			    tcp_wroff_xtra, 0);
4963			if (mp1 == NULL) {
4964				freemsg(mp);
4965				goto out_of_mem;
4966			}
4967			mp1->b_cont = mp;
4968			mp = mp1;
4969			/* Leave room for Link Level header */
4970			len = tcp_hdr_len;
4971			rptr = &mp->b_rptr[tcp_wroff_xtra];
4972			mp->b_wptr = &rptr[len];
4973		}

4975		if (tcp->tcp_snd_ts_ok) {
4976			/* use uintptr_t to suppress the gcc warning */
4977			U32_TO_BE32((uint32_t)(uintptr_t)local_time,
4978			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
4979			U32_TO_BE32(tcp->tcp_ts_recent,
4980			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
4981		} else {
4982			assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
4983		}

4985		mp->b_rptr = rptr;

4987		/* Copy the template header. */
4988		dst = (ipaddr_t *)rptr;
4989		src = (ipaddr_t *)tcp->tcp_iphc;
4990		dst[0] = src[0];
4991		dst[1] = src[1];
4992		dst[2] = src[2];
4993		dst[3] = src[3];
4994		dst[4] = src[4];
4995		dst[5] = src[5];
4996		dst[6] = src[6];
4997		dst[7] = src[7];
4998		dst[8] = src[8];
4999		dst[9] = src[9];
5000		len = tcp->tcp_hdr_len;
5001		if (len -= 40) {
5002			len >>= 2;
5003			dst += 10;
5004			src += 10;
5005			do {
5006				*dst++ = *src++;
5007			} while (--len);
5008		}

5010		/*
5011		 * Set tcph to point to the header of the outgoing packet,
5012		 * not to the template header.
5013		 */
5014		tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len);

5016		/*
5017		 * Set the ECN info in the TCP header if it is not a zero
5018		 * window probe.  A zero window probe is only sent in
5019		 * tcp_wput_data() and tcp_timer().
5020		 */
5021		if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) {
5022			SET_ECT(tcp, rptr);

5024			if (tcp->tcp_ecn_echo_on)
5025				tcph->th_flags[0] |= TH_ECE;
5026			if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
5027				tcph->th_flags[0] |= TH_CWR;
5028				tcp->tcp_ecn_cwr_sent = B_TRUE;
5029			}
5030		}

5032		/* Fill in SACK options */
5033		if (num_sack_blk > 0) {
5034			uchar_t *wptr = rptr + tcp->tcp_hdr_len;
5035			sack_blk_t *tmp;
5036			int32_t i;

5038			wptr[0] = TCPOPT_NOP;
5039			wptr[1] = TCPOPT_NOP;
5040			wptr[2] = TCPOPT_SACK;
5041			wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
5042			    sizeof (sack_blk_t);
5043			wptr += TCPOPT_REAL_SACK_LEN;

5045			tmp = tcp->tcp_sack_list;
5046			for (i = 0; i < num_sack_blk; i++) {
5047				U32_TO_BE32(tmp[i].begin, wptr);
5048				wptr += sizeof (tcp_seq);
5049				U32_TO_BE32(tmp[i].end, wptr);
5050				wptr += sizeof (tcp_seq);
5051			}
5052			tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1)
5053			    << 4);
5054		}

5056		if (tail_unsent) {
5057			mp1 = mp->b_cont;
5058			if (mp1 == NULL)
5059				mp1 = mp;
5060			/*
5061			 * If we're a little short, tack on more mblks
5062			 * as long as we don't need to split an mblk.
5063			 */
5064			while (tail_unsent < 0 &&
5065			    tail_unsent + (int)(xmit_tail->b_cont->b_wptr -
5066			    xmit_tail->b_cont->b_rptr) <= 0) {
5067				xmit_tail = xmit_tail->b_cont;
5068				/* Stash for rtt use later */
5069				xmit_tail->b_prev = local_time;
5070				mp1->b_cont = dupb(xmit_tail);
5071				mp1 = mp1->b_cont;
5072				assert((uintptr_t)(xmit_tail->b_wptr -
5073				    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
5074				tail_unsent += (int)(xmit_tail->b_wptr -
5075				    xmit_tail->b_rptr);
5076				if (mp1 == NULL) {
5077					freemsg(mp);
5078					goto out_of_mem;
5079				}
5080			}
5081			/* Trim back any surplus on the last mblk */
5082			if (tail_unsent > 0)
5083				mp1->b_wptr -= tail_unsent;
5084			if (tail_unsent < 0) {
5085				uint32_t ip_len;

5087				/*
5088				 * We did not send everything we could in
5089				 * order to preserve mblk boundaries.
5090				 */
5091				usable -= tail_unsent;
5092				snxt += tail_unsent;
5093				tcp->tcp_last_sent_len += tail_unsent;
5094				UPDATE_MIB(tcp_mib.tcpOutDataBytes,
5095				    tail_unsent);
5096				/* Adjust the IP length field. */
5097				ip_len = ntohs(((struct ip *)rptr)->ip_len) +
5098				    tail_unsent;
5099				((struct ip *)rptr)->ip_len = htons(ip_len);
5100				tail_unsent = 0;
5101			}
5102		}

5104		if (mp == NULL)
5105			goto out_of_mem;

5107		/*
5108		 * Performance hit!  We need to pullup the whole message
5109		 * in order to do the checksum and for the MAC output routine.
5110 */ 5111 if (mp->b_cont != NULL) { 5112 int mp_size; 5113 #ifdef DEBUG 5114 printf("Multiple mblk %d\n", msgdsize(mp)); 5115 #endif 5116 new_mp = allocb(msgdsize(mp) + tcp_wroff_xtra, 0); 5117 new_mp->b_rptr += tcp_wroff_xtra; 5118 new_mp->b_wptr = new_mp->b_rptr; 5119 while (mp != NULL) { 5120 mp_size = mp->b_wptr - mp->b_rptr; 5121 bcopy(mp->b_rptr, new_mp->b_wptr, mp_size); 5122 new_mp->b_wptr += mp_size; 5123 mp = mp->b_cont; 5124 } 5125 freemsg(mp); 5126 mp = new_mp; 5127 } 5128 tcp_set_cksum(mp); 5129 ((struct ip *)mp->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5130 TCP_DUMP_PACKET("tcp_wput_data", mp); 5131 (void) ipv4_tcp_output(sock_id, mp); 5132 freemsg(mp); 5133 } 5134 out_of_mem:; 5135 /* Pretend that all we were trying to send really got sent */ 5136 if (tail_unsent < 0) { 5137 do { 5138 xmit_tail = xmit_tail->b_cont; 5139 xmit_tail->b_prev = local_time; 5140 assert((uintptr_t)(xmit_tail->b_wptr - 5141 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5142 tail_unsent += (int)(xmit_tail->b_wptr - 5143 xmit_tail->b_rptr); 5144 } while (tail_unsent < 0); 5145 } 5146 done:; 5147 tcp->tcp_xmit_tail = xmit_tail; 5148 tcp->tcp_xmit_tail_unsent = tail_unsent; 5149 len = tcp->tcp_snxt - snxt; 5150 if (len) { 5151 /* 5152 * If new data was sent, need to update the notsack 5153 * list, which is, afterall, data blocks that have 5154 * not been sack'ed by the receiver. New data is 5155 * not sack'ed. 5156 */ 5157 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 5158 /* len is a negative value. */ 5159 tcp->tcp_pipe -= len; 5160 tcp_notsack_update(&(tcp->tcp_notsack_list), 5161 tcp->tcp_snxt, snxt, 5162 &(tcp->tcp_num_notsack_blk), 5163 &(tcp->tcp_cnt_notsack_list)); 5164 } 5165 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 5166 tcp->tcp_rack = tcp->tcp_rnxt; 5167 tcp->tcp_rack_cnt = 0; 5168 if ((snxt + len) == tcp->tcp_suna) { 5169 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5170 } 5171 /* 5172 * Note that len is the amount we just sent but with a negative 5173 * sign. We update tcp_unsent here since we may come back to 5174 * tcp_wput_data from tcp_state_wait. 5175 */ 5176 len += tcp->tcp_unsent; 5177 tcp->tcp_unsent = len; 5178 5179 /* 5180 * Let's wait till all the segments have been acked, since we 5181 * don't have a timer. 5182 */ 5183 (void) tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED); 5184 return; 5185 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 5186 /* 5187 * Didn't send anything. Make sure the timer is running 5188 * so that we will probe a zero window. 5189 */ 5190 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5191 } 5192 5193 /* Note that len is the amount we just sent but with a negative sign */ 5194 len += tcp->tcp_unsent; 5195 tcp->tcp_unsent = len; 5196 5197 } 5198 5199 static void 5200 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 5201 uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph, 5202 int sock_id) 5203 { 5204 int32_t bytes_acked; 5205 int32_t gap; 5206 int32_t rgap; 5207 tcp_opt_t tcpopt; 5208 uint_t flags; 5209 uint32_t new_swnd = 0; 5210 5211 #ifdef DEBUG 5212 printf("Time wait processing called ###############3\n"); 5213 #endif 5214 5215 /* Just make sure we send the right sock_id to tcp_clean_death */ 5216 if ((sockets[sock_id].pcb == NULL) || (sockets[sock_id].pcb != tcp)) 5217 sock_id = -1; 5218 5219 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 5220 new_swnd = BE16_TO_U16(tcph->th_win) << 5221 ((tcph->th_flags[0] & TH_SYN) ? 
5198
5199 static void
5200 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
5201	uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph,
5202	int sock_id)
5203 {
5204	int32_t bytes_acked;
5205	int32_t gap;
5206	int32_t rgap;
5207	tcp_opt_t tcpopt;
5208	uint_t flags;
5209	uint32_t new_swnd = 0;

5211 #ifdef DEBUG
5212	printf("Time wait processing called ###############\n");
5213 #endif

5215	/* Just make sure we send the right sock_id to tcp_clean_death */
5216	if ((sockets[sock_id].pcb == NULL) || (sockets[sock_id].pcb != tcp))
5217		sock_id = -1;

5219	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
5220	new_swnd = BE16_TO_U16(tcph->th_win) <<
5221	    ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws);
5222	if (tcp->tcp_snd_ts_ok) {
5223		if (!tcp_paws_check(tcp, tcph, &tcpopt)) {
5224			freemsg(mp);
5225			tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt,
5226			    tcp->tcp_rnxt, TH_ACK, 0, -1);
5227			return;
5228		}
5229	}
5230	gap = seg_seq - tcp->tcp_rnxt;
5231	rgap = tcp->tcp_rwnd - (gap + seg_len);
5232	if (gap < 0) {
5233		BUMP_MIB(tcp_mib.tcpInDataDupSegs);
5234		UPDATE_MIB(tcp_mib.tcpInDataDupBytes,
5235		    (seg_len > -gap ? -gap : seg_len));
5236		seg_len += gap;
5237		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
5238			if (flags & TH_RST) {
5239				freemsg(mp);
5240				return;
5241			}
5242			if ((flags & TH_FIN) && seg_len == -1) {
5243				/*
5244				 * When TCP receives a duplicate FIN in
5245				 * TIME_WAIT state, restart the 2 MSL timer.
5246				 * See page 73 in RFC 793. Make sure this TCP
5247				 * is already on the TIME_WAIT list. If not,
5248				 * just restart the timer.
5249				 */
5250				tcp_time_wait_remove(tcp);
5251				tcp_time_wait_append(tcp);
5252				TCP_TIMER_RESTART(tcp, tcp_time_wait_interval);
5253				tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt,
5254				    tcp->tcp_rnxt, TH_ACK, 0, -1);
5255				freemsg(mp);
5256				return;
5257			}
5258			flags |= TH_ACK_NEEDED;
5259			seg_len = 0;
5260			goto process_ack;
5261		}

5263		/* Fix seg_seq, and chew the gap off the front. */
5264		seg_seq = tcp->tcp_rnxt;
5265	}

5267	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
5268		/*
5269		 * Make sure that when we accept the connection, we pick
5270		 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the
5271		 * old connection.
5272		 *
5273		 * The next ISS generated is equal to tcp_iss_incr_extra
5274		 * + ISS_INCR/2 + other components depending on the
5275		 * value of tcp_strong_iss.  We pre-calculate the new
5276		 * ISS here and compare with tcp_snxt to determine if
5277		 * we need to make an adjustment to tcp_iss_incr_extra.
5278		 *
5279		 * Note that since we are now in the global queue
5280		 * perimeter and need to do a lateral_put() to the
5281		 * listener queue, there can be other connection requests/
5282		 * attempts while the lateral_put() is going on.  That
5283		 * means what we calculate here may not be correct.  This
5284		 * is extremely difficult to solve unless TCP and IP
5285		 * modules are merged and there is no perimeter, but just
5286		 * locks.  The above calculation is ugly and is a
5287		 * waste of CPU cycles...
5288		 */
5289		uint32_t new_iss = tcp_iss_incr_extra;
5290		int32_t adj;

5292		/* Add time component and min random (i.e. 1). */
5293		new_iss += (prom_gettime() >> ISS_NSEC_SHT) + 1;
5294		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
5295			/*
5296			 * New ISS not guaranteed to be ISS_INCR/2
5297			 * ahead of the current tcp_snxt, so add the
5298			 * difference to tcp_iss_incr_extra.
5299			 */
5300			tcp_iss_incr_extra += adj;
5301		}
5302		tcp_clean_death(sock_id, tcp, 0);

5304		/*
5305		 * This is a passive open.  Right now we do not
5306		 * do anything...
5307		 */
5308		freemsg(mp);
5309		return;
5310	}

5312	/*
5313	 * rgap is the amount of stuff received out of window.  A negative
5314	 * value means the segment extends past the right edge of the window.
5315	 */
5316	if (rgap < 0) {
5317		BUMP_MIB(tcp_mib.tcpInDataPastWinSegs);
5318		UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap);
5319		/* Fix seg_len and make sure there is something left. */
5320		seg_len += rgap;
5321		if (seg_len <= 0) {
5322			if (flags & TH_RST) {
5323				freemsg(mp);
5324				return;
5325			}
5326			flags |= TH_ACK_NEEDED;
5327			seg_len = 0;
5328			goto process_ack;
5329		}
5330	}
5331	/*
5332	 * Check whether we can update tcp_ts_recent.  This test is
5333	 * NOT the one in RFC 1323 3.4.  It is from Braden, 1993, "TCP
5334	 * Extensions for High Performance: An Update", Internet Draft.
5335	 */
5336	if (tcp->tcp_snd_ts_ok &&
5337	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
5338	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
5339		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
5340		tcp->tcp_last_rcv_lbolt = prom_gettime();
5341	}

5343	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
5344		/* Always ack out of order packets */
5345		flags |= TH_ACK_NEEDED;
5346		seg_len = 0;
5347	} else if (seg_len > 0) {
5348		BUMP_MIB(tcp_mib.tcpInDataInorderSegs);
5349		UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len);
5350	}
5351	if (flags & TH_RST) {
5352		freemsg(mp);
5353		(void) tcp_clean_death(sock_id, tcp, 0);
5354		return;
5355	}
5356	if (flags & TH_SYN) {
5357		freemsg(mp);
5358		tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, seg_seq + 1,
5359		    TH_RST|TH_ACK, 0, -1);
5360		/*
5361		 * Do not delete the TCP structure if it is in
5362		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
5363		 */
5364		return;
5365	}
5366 process_ack:
5367	if (flags & TH_ACK) {
5368		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
5369		if (bytes_acked <= 0) {
5370			if (bytes_acked == 0 && seg_len == 0 &&
5371			    new_swnd == tcp->tcp_swnd)
5372				BUMP_MIB(tcp_mib.tcpInDupAck);
5373		} else {
5374			/* Acks something not sent */
5375			flags |= TH_ACK_NEEDED;
5376		}
5377	}
5378	freemsg(mp);
5379	if (flags & TH_ACK_NEEDED) {
5380		/*
5381		 * Time to send an ack for some reason.
5382		 */
5383		tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt,
5384		    tcp->tcp_rnxt, TH_ACK, 0, -1);
5385	}
5386 }
5387
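/*
 * The gap/rgap arithmetic above implements the RFC 793 acceptability
 * test: gap is how far the segment starts past (or before) rnxt, and
 * rgap is how much of it still fits in the receive window.  A small
 * sketch (hypothetical helper, compiled out):
 */
#ifdef	notdef
static boolean_t
seg_fits_window(uint32_t rnxt, uint32_t rwnd, uint32_t seg_seq,
    int seg_len)
{
	int32_t gap = seg_seq - rnxt;	/* bytes past the next expected */
	int32_t rgap = rwnd - (gap + seg_len);

	/*
	 * gap < 0: part of the segment is old and must be trimmed from
	 * the front; rgap < 0: part lies beyond the window and must be
	 * trimmed from the back.  Both non-negative means the segment
	 * fits as-is.
	 */
	return ((gap >= 0 && rgap >= 0) ? B_TRUE : B_FALSE);
}
#endif	/* notdef */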
5388 static int
5389 tcp_init_values(tcp_t *tcp, struct inetboot_socket *isp)
5390 {
5391	int err = 0;

5393	tcp->tcp_family = AF_INET;
5394	tcp->tcp_ipversion = IPV4_VERSION;

5396	/*
5397	 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
5398	 * will be close to tcp_rexmit_interval_initial.  By doing this, we
5399	 * allow the algorithm to adjust slowly to large fluctuations of RTT
5400	 * during the first few transmissions of a connection, as seen on
5401	 * slow links.
5402	 */
5403	tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2;
5404	tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1;
5405	tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
5406	    tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
5407	    tcp_conn_grace_period;
5408	if (tcp->tcp_rto < tcp_rexmit_interval_min)
5409		tcp->tcp_rto = tcp_rexmit_interval_min;
5410	tcp->tcp_timer_backoff = 0;
5411	tcp->tcp_ms_we_have_waited = 0;
5412	tcp->tcp_last_recv_time = prom_gettime();
5413	tcp->tcp_cwnd_max = tcp_cwnd_max_;
5414	tcp->tcp_snd_burst = TCP_CWND_INFINITE;
5415	tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
5416	/* For Ethernet, the mtu returned is actually 1550... */
5417	if (mac_get_type() == IFT_ETHER) {
5418		tcp->tcp_if_mtu = mac_get_mtu() - 50;
5419	} else {
5420		tcp->tcp_if_mtu = mac_get_mtu();
5421	}
5422	tcp->tcp_mss = tcp->tcp_if_mtu;

5424	tcp->tcp_first_timer_threshold = tcp_ip_notify_interval;
5425	tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval;
5426	tcp->tcp_second_timer_threshold = tcp_ip_abort_interval;
5427	/*
5428	 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
5429	 * passive open.
5430	 */
5431	tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval;

5433	tcp->tcp_naglim = tcp_naglim_def;

5435	/* NOTE: ISS is now set in tcp_adapt_ire(). */

5437	/* Initialize the header template */
5438	if (tcp->tcp_ipversion == IPV4_VERSION) {
5439		err = tcp_header_init_ipv4(tcp);
5440	}
5441	if (err)
5442		return (err);

5444	/*
5445	 * Init the window scale to the max so tcp_rwnd_set() won't pare
5446	 * down tcp_rwnd. tcp_adapt_ire() will set the right value later.
5447	 */
5448	tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
5449	tcp->tcp_xmit_lowater = tcp_xmit_lowat;
5450	if (isp != NULL) {
5451		tcp->tcp_xmit_hiwater = isp->so_sndbuf;
5452		tcp->tcp_rwnd = isp->so_rcvbuf;
5453		tcp->tcp_rwnd_max = isp->so_rcvbuf;
5454	}
5455	tcp->tcp_state = TCPS_IDLE;
5456	return (0);
5457 }

5459 /*
5460  * Initialize the IPv4 header.  Loses any record of any IP options.
5461  */
5462 static int
5463 tcp_header_init_ipv4(tcp_t *tcp)
5464 {
5465	tcph_t *tcph;

5467	/*
5468	 * This is a simple initialization.  If there's
5469	 * already a template, it should never be too small,
5470	 * so reuse it.  Otherwise, allocate space for the new one.
5471	 */
5472	if (tcp->tcp_iphc != NULL) {
5473		assert(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
5474		bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
5475	} else {
5476		tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
5477		tcp->tcp_iphc = bkmem_zalloc(tcp->tcp_iphc_len);
5478		if (tcp->tcp_iphc == NULL) {
5479			tcp->tcp_iphc_len = 0;
5480			return (ENOMEM);
5481		}
5482	}
5483	tcp->tcp_ipha = (struct ip *)tcp->tcp_iphc;
5484	tcp->tcp_ipversion = IPV4_VERSION;

5486	/*
5487	 * Note that the template does not include TCP options yet.  It
5488	 * will after the connection is established.
5489	 */
5490	tcp->tcp_hdr_len = sizeof (struct ip) + sizeof (tcph_t);
5491	tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
5492	tcp->tcp_ip_hdr_len = sizeof (struct ip);
5493	tcp->tcp_ipha->ip_v = IP_VERSION;
5494	/* We don't support IP options... */
5495	tcp->tcp_ipha->ip_hl = IP_SIMPLE_HDR_LENGTH_IN_WORDS;
5496	tcp->tcp_ipha->ip_p = IPPROTO_TCP;
5497	/* We are not supposed to do PMTU discovery... */
5498	tcp->tcp_ipha->ip_sum = 0;

5500	tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (struct ip));
5501	tcp->tcp_tcph = tcph;
5502	tcph->th_offset_and_rsrvd[0] = (5 << 4);
5503	return (0);
5504 }
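/*
 * The template built above is a bare 20-byte IP header followed by a
 * bare 20-byte TCP header; th_offset_and_rsrvd[0] = (5 << 4) encodes
 * a data offset of five 32-bit words in the high nibble.  A small
 * sketch of how the offset byte relates to the TCP header length
 * (hypothetical helper, compiled out):
 */
#ifdef	notdef
static uint8_t
tcp_offset_byte(int tcp_hdr_bytes)
{
	/* 20 bytes -> 5 words -> 0x50; each option word adds 1 << 4 */
	return ((uint8_t)((tcp_hdr_bytes >> 2) << 4));
}
#endif	/* notdef */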
5505
5506 /*
5507  * Send out a control packet on the tcp connection specified.  This routine
5508  * is typically called where we need a simple ACK or RST generated.
5509  *
5510  * This function is called with or without an mp.
5511  */
5512 static void
5513 tcp_xmit_ctl(char *str, tcp_t *tcp, mblk_t *mp, uint32_t seq,
5514	uint32_t ack, int ctl, uint_t ip_hdr_len, int sock_id)
5515 {
5516	uchar_t *rptr;
5517	tcph_t *tcph;
5518	struct ip *iph = NULL;
5519	int tcp_hdr_len;
5520	int tcp_ip_hdr_len;

5522	tcp_hdr_len = tcp->tcp_hdr_len;
5523	tcp_ip_hdr_len = tcp->tcp_ip_hdr_len;

5525	if (mp) {
5526		assert(ip_hdr_len != 0);
5527		rptr = mp->b_rptr;
5528		tcph = (tcph_t *)(rptr + ip_hdr_len);
5529		/* Don't reply to a RST segment. */
5530		if (tcph->th_flags[0] & TH_RST) {
5531			freeb(mp);
5532			return;
5533		}
5534		freemsg(mp);
5535		rptr = NULL;
5536	} else {
5537		assert(ip_hdr_len == 0);
5538	}
5539	/* If a text string is passed in with the request, print it out. */
5540	if (str != NULL) {
5541		dprintf("tcp_xmit_ctl(%d): '%s', seq 0x%x, ack 0x%x, "
5542		    "ctl 0x%x\n", sock_id, str, seq, ack, ctl);
5543	}
5544	mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0);
5545	if (mp == NULL) {
5546		dprintf("tcp_xmit_ctl(%d): Cannot allocate memory\n", sock_id);
5547		return;
5548	}
5549	rptr = &mp->b_rptr[tcp_wroff_xtra];
5550	mp->b_rptr = rptr;
5551	mp->b_wptr = &rptr[tcp_hdr_len];
5552	bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len);

5554	iph = (struct ip *)rptr;
5555	iph->ip_len = htons(tcp_hdr_len);

5557	tcph = (tcph_t *)&rptr[tcp_ip_hdr_len];
5558	tcph->th_flags[0] = (uint8_t)ctl;
5559	if (ctl & TH_RST) {
5560		BUMP_MIB(tcp_mib.tcpOutRsts);
5561		BUMP_MIB(tcp_mib.tcpOutControl);
5562		/*
5563		 * Don't send TSopt w/ TH_RST packets per RFC 1323.
5564		 */
5565		if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) {
5566			mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN];
5567			*(mp->b_wptr) = TCPOPT_EOL;
5568			iph->ip_len = htons(tcp_hdr_len -
5569			    TCPOPT_REAL_TS_LEN);
5570			tcph->th_offset_and_rsrvd[0] -= (3 << 4);
5571		}
5572	}
5573	if (ctl & TH_ACK) {
5574		uint32_t now = prom_gettime();

5576		if (tcp->tcp_snd_ts_ok) {
5577			U32_TO_BE32(now,
5578			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
5579			U32_TO_BE32(tcp->tcp_ts_recent,
5580			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
5581		}
5582		tcp->tcp_rack = ack;
5583		tcp->tcp_rack_cnt = 0;
5584		BUMP_MIB(tcp_mib.tcpOutAck);
5585	}
5586	BUMP_MIB(tcp_mib.tcpOutSegs);
5587	U32_TO_BE32(seq, tcph->th_seq);
5588	U32_TO_BE32(ack, tcph->th_ack);

5590	tcp_set_cksum(mp);
5591	iph->ip_ttl = (uint8_t)tcp_ipv4_ttl;
5592	TCP_DUMP_PACKET("tcp_xmit_ctl", mp);
5593	(void) ipv4_tcp_output(sock_id, mp);
5594	freeb(mp);
5595 }

5597 /* Generate an ACK-only (no data) segment for a TCP endpoint */
5598 static mblk_t *
5599 tcp_ack_mp(tcp_t *tcp)
5600 {
5601	if (tcp->tcp_valid_bits) {
5602		/*
5603		 * For the complex case where we have to send some
5604		 * controls (FIN or SYN), let tcp_xmit_mp do it.
5605		 * When sending an ACK-only segment (no data)
5606		 * into a zero window, always set the seq number to
5607		 * suna, since snxt will be extended past the window.
5608		 * If we used snxt, the receiver might consider the ACK
5609		 * unacceptable.
5610		 */
5611		return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
5612		    (tcp->tcp_zero_win_probe) ?
5613		    tcp->tcp_suna :
5614		    tcp->tcp_snxt, B_FALSE, NULL, B_FALSE));
5615	} else {
5616		/* Generate a simple ACK */
5617		uchar_t *rptr;
5618		tcph_t *tcph;
5619		mblk_t *mp1;
5620		int32_t tcp_hdr_len;
5621		int32_t num_sack_blk = 0;
5622		int32_t sack_opt_len;

5624		/*
5625		 * Allocate space for TCP + IP headers
5626		 * and link-level header
5627		 */
5628		if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
5629			num_sack_blk = MIN(tcp->tcp_max_sack_blk,
5630			    tcp->tcp_num_sack_blk);
5631			sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
5632			    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
5633			tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len;
5634		} else {
5635			tcp_hdr_len = tcp->tcp_hdr_len;
5636		}
5637		mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, 0);
5638		if (mp1 == NULL)
5639			return (NULL);

5641		/* copy in prototype TCP + IP header */
5642		rptr = mp1->b_rptr + tcp_wroff_xtra;
5643		mp1->b_rptr = rptr;
5644		mp1->b_wptr = rptr + tcp_hdr_len;
5645		bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);

5647		tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];

5649		/*
5650		 * Set the TCP sequence number.
5651		 * When sending an ACK-only segment (no data)
5652		 * into a zero window, always set the seq number to
5653		 * suna, since snxt will be extended past the window.
5654		 * If we used snxt, the receiver might consider the ACK
5655		 * unacceptable.
5656		 */
5657		U32_TO_ABE32((tcp->tcp_zero_win_probe) ?
5658		    tcp->tcp_suna : tcp->tcp_snxt, tcph->th_seq);

5660		/* Set up the TCP flag field. */
5661		tcph->th_flags[0] = (uchar_t)TH_ACK;
5662		if (tcp->tcp_ecn_echo_on)
5663			tcph->th_flags[0] |= TH_ECE;

5665		tcp->tcp_rack = tcp->tcp_rnxt;
5666		tcp->tcp_rack_cnt = 0;

5668		/* fill in timestamp option if in use */
5669		if (tcp->tcp_snd_ts_ok) {
5670			uint32_t llbolt = (uint32_t)prom_gettime();

5672			U32_TO_BE32(llbolt,
5673			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
5674			U32_TO_BE32(tcp->tcp_ts_recent,
5675			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
5676		}

5678		/* Fill in SACK options */
5679		if (num_sack_blk > 0) {
5680			uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len;
5681			sack_blk_t *tmp;
5682			int32_t i;

5684			wptr[0] = TCPOPT_NOP;
5685			wptr[1] = TCPOPT_NOP;
5686			wptr[2] = TCPOPT_SACK;
5687			wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
5688			    sizeof (sack_blk_t);
5689			wptr += TCPOPT_REAL_SACK_LEN;

5691			tmp = tcp->tcp_sack_list;
5692			for (i = 0; i < num_sack_blk; i++) {
5693				U32_TO_BE32(tmp[i].begin, wptr);
5694				wptr += sizeof (tcp_seq);
5695				U32_TO_BE32(tmp[i].end, wptr);
5696				wptr += sizeof (tcp_seq);
5697			}
5698			tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1)
5699			    << 4);
5700		}

5702		((struct ip *)rptr)->ip_len = htons(tcp_hdr_len);
5703		tcp_set_cksum(mp1);
5704		((struct ip *)rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl;
5705		return (mp1);
5706	}
5707 }
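/*
 * Both tcp_ack_mp() above and tcp_xmit_mp() below size the SACK
 * option the same way: two NOPs for alignment, a two-byte kind/length
 * header, and eight bytes per block.  With three SACK blocks, for
 * example, the option costs 2 * 1 + 2 + 3 * 8 = 28 bytes, which is
 * why a segment carrying SACK options must shrink its payload below
 * tcp_mss.  A sketch of the computation (hypothetical helper,
 * compiled out):
 */
#ifdef	notdef
static int32_t
sack_opt_len_example(int32_t num_sack_blk)
{
	return (num_sack_blk * (int32_t)sizeof (sack_blk_t) +
	    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN);
}
#endif	/* notdef */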
5708
5709 /*
5710  * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
5711  * ip and tcp header ready to pass down to IP.  If the mp passed in is
5712  * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
5713  * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
5714  * otherwise it will dup partial mblks.)
5715  * Otherwise, an appropriate ACK packet will be generated.  This
5716  * routine is not usually called to send new data for the first time.  It
5717  * is mostly called out of the timer for retransmits, and to generate ACKs.
5718  *
5719  * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
5720  * be adjusted by *offset.  And after dupb(), the offset and the ending mblk
5721  * of the original mblk chain will be returned in *offset and *end_mp.
5722  */
5723 static mblk_t *
5724 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
5725	mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len,
5726	boolean_t rexmit)
5727 {
5728	int data_length;
5729	int32_t off = 0;
5730	uint_t flags;
5731	mblk_t *mp1;
5732	mblk_t *mp2;
5733	mblk_t *new_mp;
5734	uchar_t *rptr;
5735	tcph_t *tcph;
5736	int32_t num_sack_blk = 0;
5737	int32_t sack_opt_len = 0;

5739	/* Allocate for our maximum TCP header + link-level */
5740	mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH +
5741	    tcp_wroff_xtra, 0);
5742	if (mp1 == NULL)
5743		return (NULL);
5744	data_length = 0;

5746	/*
5747	 * Note that tcp_mss has been adjusted to take into account the
5748	 * timestamp option if applicable.  Because SACK options do not
5749	 * appear in every TCP segment and are of variable length,
5750	 * they cannot be included in tcp_mss.  Thus we need to calculate
5751	 * the actual segment length when we need to send a segment which
5752	 * includes SACK options.
5753	 */
5754	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
5755		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
5756		    tcp->tcp_num_sack_blk);
5757		sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
5758		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
5759		if (max_to_send + sack_opt_len > tcp->tcp_mss)
5760			max_to_send -= sack_opt_len;
5761	}

5763	if (offset != NULL) {
5764		off = *offset;
5765		/* We use offset as an indicator that end_mp is not NULL. */
5766		*end_mp = NULL;
5767	}
5768	for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
5769		/* This could be faster with cooperation from downstream */
5770		if (mp2 != mp1 && !sendall &&
5771		    data_length + (int)(mp->b_wptr - mp->b_rptr) >
5772		    max_to_send)
5773			/*
5774			 * Don't send the next mblk since the whole mblk
5775			 * does not fit.
5776			 */
5777			break;
5778		mp2->b_cont = dupb(mp);
5779		mp2 = mp2->b_cont;
5780		if (mp2 == NULL) {
5781			freemsg(mp1);
5782			return (NULL);
5783		}
5784		mp2->b_rptr += off;
5785		assert((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
5786		    (uintptr_t)INT_MAX);

5788		data_length += (int)(mp2->b_wptr - mp2->b_rptr);
5789		if (data_length > max_to_send) {
5790			mp2->b_wptr -= data_length - max_to_send;
5791			data_length = max_to_send;
5792			off = mp2->b_wptr - mp->b_rptr;
5793			break;
5794		} else {
5795			off = 0;
5796		}
5797	}
5798	if (offset != NULL) {
5799		*offset = off;
5800		*end_mp = mp;
5801	}
5802	if (seg_len != NULL) {
5803		*seg_len = data_length;
5804	}

5806	rptr = mp1->b_rptr + tcp_wroff_xtra;
5807	mp1->b_rptr = rptr;
5808	mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len;
5809	bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);
5810	tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];
5811	U32_TO_ABE32(seq, tcph->th_seq);

5813	/*
5814	 * Using tcp_unsent to determine whether the PUSH bit should be set
5815	 * assumes that this function was called from tcp_wput_data.  Thus,
5816	 * when called to retransmit data, the setting of the PUSH bit may
5817	 * appear somewhat random in that it might get set when it should
5818	 * not.  This should not pose any performance issues.
5819	 */
5820	if (data_length != 0 && (tcp->tcp_unsent == 0 ||
5821	    tcp->tcp_unsent == data_length)) {
5822		flags = TH_ACK | TH_PUSH;
5823	} else {
5824		flags = TH_ACK;
5825	}

5827	if (tcp->tcp_ecn_ok) {
5828		if (tcp->tcp_ecn_echo_on)
5829			flags |= TH_ECE;

5831		/*
5832		 * Only set ECT bit and ECN_CWR if a segment contains new data.
5833		 * There is no TCP flow control for non-data segments, and
5834		 * only data segments are transmitted reliably.
5835		 */
5836		if (data_length > 0 && !rexmit) {
5837			SET_ECT(tcp, rptr);
5838			if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
5839				flags |= TH_CWR;
5840				tcp->tcp_ecn_cwr_sent = B_TRUE;
5841			}
5842		}
5843	}

5845	if (tcp->tcp_valid_bits) {
5846		uint32_t u1;

5848		if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
5849		    seq == tcp->tcp_iss) {
5850			uchar_t *wptr;

5852			/*
5853			 * Tack on the MSS option.  It is always needed
5854			 * for both active and passive open.
5855			 */
5856			wptr = mp1->b_wptr;
5857			wptr[0] = TCPOPT_MAXSEG;
5858			wptr[1] = TCPOPT_MAXSEG_LEN;
5859			wptr += 2;
5860			/*
5861			 * MSS option value should be interface MTU - MIN
5862			 * TCP/IP header.
5863 */ 5864 u1 = tcp->tcp_if_mtu - IP_SIMPLE_HDR_LENGTH - 5865 TCP_MIN_HEADER_LENGTH; 5866 U16_TO_BE16(u1, wptr); 5867 mp1->b_wptr = wptr + 2; 5868 /* Update the offset to cover the additional word */ 5869 tcph->th_offset_and_rsrvd[0] += (1 << 4); 5870 5871 /* 5872 * Note that the following way of filling in 5873 * TCP options is not optimal. Some NOPs can 5874 * be saved. But there is no need at this time 5875 * to optimize it. When it is needed, we will 5876 * do it. 5877 */ 5878 switch (tcp->tcp_state) { 5879 case TCPS_SYN_SENT: 5880 flags = TH_SYN; 5881 5882 if (tcp->tcp_snd_ws_ok) { 5883 wptr = mp1->b_wptr; 5884 wptr[0] = TCPOPT_NOP; 5885 wptr[1] = TCPOPT_WSCALE; 5886 wptr[2] = TCPOPT_WS_LEN; 5887 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5888 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5889 tcph->th_offset_and_rsrvd[0] += 5890 (1 << 4); 5891 } 5892 5893 if (tcp->tcp_snd_ts_ok) { 5894 uint32_t llbolt; 5895 5896 llbolt = prom_gettime(); 5897 wptr = mp1->b_wptr; 5898 wptr[0] = TCPOPT_NOP; 5899 wptr[1] = TCPOPT_NOP; 5900 wptr[2] = TCPOPT_TSTAMP; 5901 wptr[3] = TCPOPT_TSTAMP_LEN; 5902 wptr += 4; 5903 U32_TO_BE32(llbolt, wptr); 5904 wptr += 4; 5905 assert(tcp->tcp_ts_recent == 0); 5906 U32_TO_BE32(0L, wptr); 5907 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 5908 tcph->th_offset_and_rsrvd[0] += 5909 (3 << 4); 5910 } 5911 5912 if (tcp->tcp_snd_sack_ok) { 5913 wptr = mp1->b_wptr; 5914 wptr[0] = TCPOPT_NOP; 5915 wptr[1] = TCPOPT_NOP; 5916 wptr[2] = TCPOPT_SACK_PERMITTED; 5917 wptr[3] = TCPOPT_SACK_OK_LEN; 5918 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5919 tcph->th_offset_and_rsrvd[0] += 5920 (1 << 4); 5921 } 5922 5923 /* 5924 * Set up all the bits to tell the other side 5925 * we are ECN capable. 5926 */ 5927 if (tcp->tcp_ecn_ok) { 5928 flags |= (TH_ECE | TH_CWR); 5929 } 5930 break; 5931 case TCPS_SYN_RCVD: 5932 flags |= TH_SYN; 5933 5934 if (tcp->tcp_snd_ws_ok) { 5935 wptr = mp1->b_wptr; 5936 wptr[0] = TCPOPT_NOP; 5937 wptr[1] = TCPOPT_WSCALE; 5938 wptr[2] = TCPOPT_WS_LEN; 5939 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5940 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5941 tcph->th_offset_and_rsrvd[0] += (1 << 4); 5942 } 5943 5944 if (tcp->tcp_snd_sack_ok) { 5945 wptr = mp1->b_wptr; 5946 wptr[0] = TCPOPT_NOP; 5947 wptr[1] = TCPOPT_NOP; 5948 wptr[2] = TCPOPT_SACK_PERMITTED; 5949 wptr[3] = TCPOPT_SACK_OK_LEN; 5950 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5951 tcph->th_offset_and_rsrvd[0] += 5952 (1 << 4); 5953 } 5954 5955 /* 5956 * If the other side is ECN capable, reply 5957 * that we are also ECN capable.
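 * (Per RFC 3168, an ECN-setup SYN sets both ECE and CWR, as in the
 * TCPS_SYN_SENT case above, while an ECN-setup SYN-ACK sets only ECE,
 * which is what happens below.)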
5958 */ 5959 if (tcp->tcp_ecn_ok) { 5960 flags |= TH_ECE; 5961 } 5962 break; 5963 default: 5964 break; 5965 } 5966 /* allocb() of adequate mblk assures space */ 5967 assert((uintptr_t)(mp1->b_wptr - 5968 mp1->b_rptr) <= (uintptr_t)INT_MAX); 5969 if (flags & TH_SYN) 5970 BUMP_MIB(tcp_mib.tcpOutControl); 5971 } 5972 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 5973 (seq + data_length) == tcp->tcp_fss) { 5974 if (!tcp->tcp_fin_acked) { 5975 flags |= TH_FIN; 5976 BUMP_MIB(tcp_mib.tcpOutControl); 5977 } 5978 if (!tcp->tcp_fin_sent) { 5979 tcp->tcp_fin_sent = B_TRUE; 5980 switch (tcp->tcp_state) { 5981 case TCPS_SYN_RCVD: 5982 case TCPS_ESTABLISHED: 5983 tcp->tcp_state = TCPS_FIN_WAIT_1; 5984 break; 5985 case TCPS_CLOSE_WAIT: 5986 tcp->tcp_state = TCPS_LAST_ACK; 5987 break; 5988 } 5989 if (tcp->tcp_suna == tcp->tcp_snxt) { 5990 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5991 } 5992 tcp->tcp_snxt = tcp->tcp_fss + 1; 5993 } 5994 } 5995 } 5996 tcph->th_flags[0] = (uchar_t)flags; 5997 tcp->tcp_rack = tcp->tcp_rnxt; 5998 tcp->tcp_rack_cnt = 0; 5999 6000 if (tcp->tcp_snd_ts_ok) { 6001 if (tcp->tcp_state != TCPS_SYN_SENT) { 6002 uint32_t llbolt = prom_gettime(); 6003 6004 U32_TO_BE32(llbolt, 6005 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 6006 U32_TO_BE32(tcp->tcp_ts_recent, 6007 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 6008 } 6009 } 6010 6011 if (num_sack_blk > 0) { 6012 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 6013 sack_blk_t *tmp; 6014 int32_t i; 6015 6016 wptr[0] = TCPOPT_NOP; 6017 wptr[1] = TCPOPT_NOP; 6018 wptr[2] = TCPOPT_SACK; 6019 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 6020 sizeof (sack_blk_t); 6021 wptr += TCPOPT_REAL_SACK_LEN; 6022 6023 tmp = tcp->tcp_sack_list; 6024 for (i = 0; i < num_sack_blk; i++) { 6025 U32_TO_BE32(tmp[i].begin, wptr); 6026 wptr += sizeof (tcp_seq); 6027 U32_TO_BE32(tmp[i].end, wptr); 6028 wptr += sizeof (tcp_seq); 6029 } 6030 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 6031 } 6032 assert((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 6033 data_length += (int)(mp1->b_wptr - rptr); 6034 if (tcp->tcp_ipversion == IPV4_VERSION) 6035 ((struct ip *)rptr)->ip_len = htons(data_length); 6036 6037 /* 6038 * Performance hit! We need to pullup the whole message 6039 * in order to do checksum and for the MAC output routine. 6040 */ 6041 if (mp1->b_cont != NULL) { 6042 int mp_size; 6043 #ifdef DEBUG 6044 printf("Multiple mblk %d\n", msgdsize(mp1)); 6045 #endif 6046 mp2 = mp1; 6047 new_mp = allocb(msgdsize(mp1) + tcp_wroff_xtra, 0); 6048 new_mp->b_rptr += tcp_wroff_xtra; 6049 new_mp->b_wptr = new_mp->b_rptr; 6050 while (mp1 != NULL) { 6051 mp_size = mp1->b_wptr - mp1->b_rptr; 6052 bcopy(mp1->b_rptr, new_mp->b_wptr, mp_size); 6053 new_mp->b_wptr += mp_size; 6054 mp1 = mp1->b_cont; 6055 } 6056 freemsg(mp2); 6057 mp1 = new_mp; 6058 } 6059 tcp_set_cksum(mp1); 6060 /* Fill in the TTL field as it is 0 in the header template. 
*/ 6061 ((struct ip *)mp1->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6062 6063 return (mp1); 6064 } 6065 6066 /* 6067 * Generate a "no listener here" reset in response to the 6068 * connection request contained within 'mp' 6069 */ 6070 static void 6071 tcp_xmit_listeners_reset(int sock_id, mblk_t *mp, uint_t ip_hdr_len) 6072 { 6073 uchar_t *rptr; 6074 uint32_t seg_len; 6075 tcph_t *tcph; 6076 uint32_t seg_seq; 6077 uint32_t seg_ack; 6078 uint_t flags; 6079 6080 rptr = mp->b_rptr; 6081 6082 tcph = (tcph_t *)&rptr[ip_hdr_len]; 6083 seg_seq = BE32_TO_U32(tcph->th_seq); 6084 seg_ack = BE32_TO_U32(tcph->th_ack); 6085 flags = tcph->th_flags[0]; 6086 6087 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 6088 if (flags & TH_RST) { 6089 freeb(mp); 6090 } else if (flags & TH_ACK) { 6091 tcp_xmit_early_reset("no tcp, reset", 6092 sock_id, mp, seg_ack, 0, TH_RST, ip_hdr_len); 6093 } else { 6094 if (flags & TH_SYN) 6095 seg_len++; 6096 tcp_xmit_early_reset("no tcp, reset/ack", sock_id, 6097 mp, 0, seg_seq + seg_len, 6098 TH_RST | TH_ACK, ip_hdr_len); 6099 } 6100 } 6101 6102 /* Non-overlapping byte exchanger */ 6103 static void 6104 tcp_xchg(uchar_t *a, uchar_t *b, int len) 6105 { 6106 uchar_t uch; 6107 6108 while (len-- > 0) { 6109 uch = a[len]; 6110 a[len] = b[len]; 6111 b[len] = uch; 6112 } 6113 } 6114 6115 /* 6116 * Generate a reset based on an inbound packet for which there is no active 6117 * tcp state that we can find. 6118 */ 6119 static void 6120 tcp_xmit_early_reset(char *str, int sock_id, mblk_t *mp, uint32_t seq, 6121 uint32_t ack, int ctl, uint_t ip_hdr_len) 6122 { 6123 struct ip *iph = NULL; 6124 ushort_t len; 6125 tcph_t *tcph; 6126 int i; 6127 ipaddr_t addr; 6128 mblk_t *new_mp; 6129 6130 if (str != NULL) { 6131 dprintf("tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 6132 "flags 0x%x\n", str, seq, ack, ctl); 6133 } 6134 6135 /* 6136 * We skip reversing source route here. 6137 * (for now we replace all IP options with EOL) 6138 */ 6139 iph = (struct ip *)mp->b_rptr; 6140 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 6141 mp->b_rptr[i] = IPOPT_EOL; 6142 /* 6143 * Make sure that the src address is not the limited broadcast 6144 * address. Complete broadcast address checking for the 6145 * src address is not possible, since we don't know the 6146 * netmask of the src addr. 6147 * No check for the destination address is done, since 6148 * IP will not pass up a packet with a broadcast dest address 6149 * to TCP. 6150 */ 6151 if (iph->ip_src.s_addr == INADDR_ANY || 6152 iph->ip_src.s_addr == INADDR_BROADCAST) { 6153 freemsg(mp); 6154 return; 6155 } 6156 6157 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6158 if (tcph->th_flags[0] & TH_RST) { 6159 freemsg(mp); 6160 return; 6161 } 6162 /* 6163 * Now copy the original header to a new buffer. The reason 6164 * for doing this is that we need to put extra room before 6165 * the header for the MAC layer address. The original mblk 6166 * does not have this extra headroom.
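 * (Sketch of the new buffer after the copy below:
 *   | tcp_wroff_xtra headroom for the MAC header | IP header | TCP header |
 * b_rptr is advanced past the reserved headroom so the link layer can
 * prepend its header without another allocation.)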
6167 */ 6168 len = ip_hdr_len + sizeof (tcph_t); 6169 if ((new_mp = allocb(len + tcp_wroff_xtra, 0)) == NULL) { 6170 freemsg(mp); 6171 return; 6172 } 6173 new_mp->b_rptr += tcp_wroff_xtra; 6174 bcopy(mp->b_rptr, new_mp->b_rptr, len); 6175 new_mp->b_wptr = new_mp->b_rptr + len; 6176 freemsg(mp); 6177 mp = new_mp; 6178 iph = (struct ip *)mp->b_rptr; 6179 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6180 6181 tcph->th_offset_and_rsrvd[0] = (5 << 4); 6182 tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 6183 U32_TO_BE32(ack, tcph->th_ack); 6184 U32_TO_BE32(seq, tcph->th_seq); 6185 U16_TO_BE16(0, tcph->th_win); 6186 bzero(tcph->th_sum, sizeof (int16_t)); 6187 tcph->th_flags[0] = (uint8_t)ctl; 6188 if (ctl & TH_RST) { 6189 BUMP_MIB(tcp_mib.tcpOutRsts); 6190 BUMP_MIB(tcp_mib.tcpOutControl); 6191 } 6192 6193 iph->ip_len = htons(len); 6194 /* Swap addresses */ 6195 addr = iph->ip_src.s_addr; 6196 iph->ip_src = iph->ip_dst; 6197 iph->ip_dst.s_addr = addr; 6198 iph->ip_id = 0; 6199 iph->ip_ttl = 0; 6200 tcp_set_cksum(mp); 6201 iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6202 6203 /* Dump the packet when debugging. */ 6204 TCP_DUMP_PACKET("tcp_xmit_early_reset", mp); 6205 (void) ipv4_tcp_output(sock_id, mp); 6206 freemsg(mp); 6207 } 6208 6209 static void 6210 tcp_set_cksum(mblk_t *mp) 6211 { 6212 struct ip *iph; 6213 tcpha_t *tcph; 6214 int len; 6215 6216 iph = (struct ip *)mp->b_rptr; 6217 tcph = (tcpha_t *)(iph + 1); 6218 len = ntohs(iph->ip_len); 6219 /* 6220 * Calculate the TCP checksum. We need to include the pseudo header, 6221 * which is similar to the real IP header starting at the TTL field. 6222 */ 6223 iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 6224 tcph->tha_sum = 0; 6225 tcph->tha_sum = tcp_cksum((uint16_t *)&(iph->ip_ttl), 6226 len - IP_SIMPLE_HDR_LENGTH + 12); 6227 iph->ip_sum = 0; 6228 } 6229 6230 static uint16_t 6231 tcp_cksum(uint16_t *buf, uint32_t len) 6232 { 6233 /* 6234 * Compute the Internet Checksum for "len" bytes 6235 * beginning at location "buf". 6236 */ 6237 int32_t sum = 0; 6238 6239 while (len > 1) { 6240 /* This is the inner loop */ 6241 sum += *buf++; 6242 len -= 2; 6243 } 6244 6245 /* Add left-over byte, if any */ 6246 if (len > 0) 6247 sum += *(unsigned char *)buf * 256; 6248 6249 /* Fold 32-bit sum to 16 bits */ 6250 while (sum >> 16) 6251 sum = (sum & 0xffff) + (sum >> 16); 6252 6253 return ((uint16_t)~sum); 6254 } 6255 6256 /* 6257 * Type three generator adapted from the random() function in 4.4 BSD: 6258 */ 6259 6260 /* 6261 * Copyright (c) 1983, 1993 6262 * The Regents of the University of California. All rights reserved. 6263 * 6264 * Redistribution and use in source and binary forms, with or without 6265 * modification, are permitted provided that the following conditions 6266 * are met: 6267 * 1. Redistributions of source code must retain the above copyright 6268 * notice, this list of conditions and the following disclaimer. 6269 * 2. Redistributions in binary form must reproduce the above copyright 6270 * notice, this list of conditions and the following disclaimer in the 6271 * documentation and/or other materials provided with the distribution. 6272 * 3. All advertising materials mentioning features or use of this software 6273 * must display the following acknowledgement: 6274 * This product includes software developed by the University of 6275 * California, Berkeley and its contributors. 6276 * 4.
Neither the name of the University nor the names of its contributors 6277 * may be used to endorse or promote products derived from this software 6278 * without specific prior written permission. 6279 * 6280 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 6281 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 6282 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 6283 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 6284 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 6285 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 6286 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 6287 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 6288 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 6289 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 6290 * SUCH DAMAGE. 6291 */ 6292 6293 /* Type 3 -- x**31 + x**3 + 1 */ 6294 #define DEG_3 31 6295 #define SEP_3 3 6296 6297 6298 /* Protected by tcp_random_lock */ 6299 static int tcp_randtbl[DEG_3 + 1]; 6300 6301 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 6302 static int *tcp_random_rptr = &tcp_randtbl[1]; 6303 6304 static int *tcp_random_state = &tcp_randtbl[1]; 6305 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 6306 6307 static void 6308 tcp_random_init(void) 6309 { 6310 int i; 6311 uint32_t hrt; 6312 uint32_t wallclock; 6313 uint32_t result; 6314 6315 /* 6316 * 6317 * XXX We don't have high resolution time in standalone... The 6318 * following is just an approximation of the scheme described below. 6319 * 6320 * Use high-res timer and current time for seed. Gethrtime() returns 6321 * a longlong, which may contain resolution down to nanoseconds. 6322 * The current time will either be a 32-bit or a 64-bit quantity. 6323 * XOR the two together in a 64-bit result variable. 6324 * Convert the result to a 32-bit value by multiplying the high-order 6325 * 32-bits by the low-order 32-bits. 6326 * 6327 * XXX We don't have gethrtime() in prom and the wallclock.... 6328 */ 6329 6330 hrt = prom_gettime(); 6331 wallclock = (uint32_t)time(NULL); 6332 result = wallclock ^ hrt; 6333 tcp_random_state[0] = result; 6334 6335 for (i = 1; i < DEG_3; i++) 6336 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 6337 + 12345; 6338 tcp_random_fptr = &tcp_random_state[SEP_3]; 6339 tcp_random_rptr = &tcp_random_state[0]; 6340 for (i = 0; i < 10 * DEG_3; i++) 6341 (void) tcp_random(); 6342 } 6343 6344 /* 6345 * tcp_random: Return a random number in the range [1, 128K]. 6346 * This range is selected to be approximately centered on TCP_ISS / 2, 6347 * and easy to compute. We get this value by generating a 32-bit random 6348 * number, selecting out the high-order 17 bits, and then adding one so 6349 * that we never return zero. 6350 */ 6351 static int 6352 tcp_random(void) 6353 { 6354 int i; 6355 6356 *tcp_random_fptr += *tcp_random_rptr; 6357 6358 /* 6359 * The high-order bits are more random than the low-order bits, 6360 * so we select out the high-order 17 bits and add one so that 6361 * we never return zero.
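 * (Worked out: ((x >> 15) & 0x1ffff) keeps the top 17 bits of the
 * 32-bit word, a value in [0, 131071]; adding one yields [1, 131072],
 * i.e. [1, 128K].)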
6362 */ 6363 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 6364 if (++tcp_random_fptr >= tcp_random_end_ptr) { 6365 tcp_random_fptr = tcp_random_state; 6366 ++tcp_random_rptr; 6367 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 6368 tcp_random_rptr = tcp_random_state; 6369 6370 return (i); 6371 } 6372 6373 /* 6374 * Generate ISS, taking into account NDD changes may happen halfway through. 6375 * (If the iss is not zero, set it.) 6376 */ 6377 static void 6378 tcp_iss_init(tcp_t *tcp) 6379 { 6380 tcp_iss_incr_extra += (ISS_INCR >> 1); 6381 tcp->tcp_iss = tcp_iss_incr_extra; 6382 tcp->tcp_iss += (prom_gettime() >> ISS_NSEC_SHT) + tcp_random(); 6383 tcp->tcp_valid_bits = TCP_ISS_VALID; 6384 tcp->tcp_fss = tcp->tcp_iss - 1; 6385 tcp->tcp_suna = tcp->tcp_iss; 6386 tcp->tcp_snxt = tcp->tcp_iss + 1; 6387 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 6388 tcp->tcp_csuna = tcp->tcp_snxt; 6389 } 6390 6391 /* 6392 * Diagnostic routine used to return a string associated with the tcp state. 6393 * Note that if the caller does not supply a buffer, it will use an internal 6394 * static string. This means that if multiple threads call this function at 6395 * the same time, output can be corrupted... Note also that this function 6396 * does not check the size of the supplied buffer. The caller has to make 6397 * sure that it is big enough. 6398 */ 6399 static char * 6400 tcp_display(tcp_t *tcp, char *sup_buf, char format) 6401 { 6402 char buf1[30]; 6403 static char priv_buf[INET_ADDRSTRLEN * 2 + 80]; 6404 char *buf; 6405 char *cp; 6406 char local_addrbuf[INET_ADDRSTRLEN]; 6407 char remote_addrbuf[INET_ADDRSTRLEN]; 6408 struct in_addr addr; 6409 6410 if (sup_buf != NULL) 6411 buf = sup_buf; 6412 else 6413 buf = priv_buf; 6414 6415 if (tcp == NULL) 6416 return ("NULL_TCP"); 6417 switch (tcp->tcp_state) { 6418 case TCPS_CLOSED: 6419 cp = "TCP_CLOSED"; 6420 break; 6421 case TCPS_IDLE: 6422 cp = "TCP_IDLE"; 6423 break; 6424 case TCPS_BOUND: 6425 cp = "TCP_BOUND"; 6426 break; 6427 case TCPS_LISTEN: 6428 cp = "TCP_LISTEN"; 6429 break; 6430 case TCPS_SYN_SENT: 6431 cp = "TCP_SYN_SENT"; 6432 break; 6433 case TCPS_SYN_RCVD: 6434 cp = "TCP_SYN_RCVD"; 6435 break; 6436 case TCPS_ESTABLISHED: 6437 cp = "TCP_ESTABLISHED"; 6438 break; 6439 case TCPS_CLOSE_WAIT: 6440 cp = "TCP_CLOSE_WAIT"; 6441 break; 6442 case TCPS_FIN_WAIT_1: 6443 cp = "TCP_FIN_WAIT_1"; 6444 break; 6445 case TCPS_CLOSING: 6446 cp = "TCP_CLOSING"; 6447 break; 6448 case TCPS_LAST_ACK: 6449 cp = "TCP_LAST_ACK"; 6450 break; 6451 case TCPS_FIN_WAIT_2: 6452 cp = "TCP_FIN_WAIT_2"; 6453 break; 6454 case TCPS_TIME_WAIT: 6455 cp = "TCP_TIME_WAIT"; 6456 break; 6457 default: 6458 (void) sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 6459 cp = buf1; 6460 break; 6461 } 6462 switch (format) { 6463 case DISP_ADDR_AND_PORT: 6464 /* 6465 * Note that we use the remote address in the tcp_b 6466 * structure. This means that it will print out 6467 * the real destination address, not the next hop's 6468 * address if source routing is used. 
6469 */ 6470 addr.s_addr = tcp->tcp_bound_source; 6471 bcopy(inet_ntoa(addr), local_addrbuf, sizeof (local_addrbuf)); 6472 addr.s_addr = tcp->tcp_remote; 6473 bcopy(inet_ntoa(addr), remote_addrbuf, sizeof (remote_addrbuf)); 6474 (void) snprintf(buf, sizeof (priv_buf), "[%s.%u, %s.%u] %s", 6475 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 6476 ntohs(tcp->tcp_fport), cp); 6477 break; 6478 case DISP_PORT_ONLY: 6479 default: 6480 (void) snprintf(buf, sizeof (priv_buf), "[%u, %u] %s", 6481 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 6482 break; 6483 } 6484 6485 return (buf); 6486 } 6487 6488 /* 6489 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 6490 * is filled, return as much as we can. The message passed in may be 6491 * multi-part, chained using b_cont. "start" is the starting sequence 6492 * number for this piece. 6493 */ 6494 static mblk_t * 6495 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 6496 { 6497 uint32_t end; 6498 mblk_t *mp1; 6499 mblk_t *mp2; 6500 mblk_t *next_mp; 6501 uint32_t u1; 6502 6503 /* Walk through all the new pieces. */ 6504 do { 6505 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 6506 (uintptr_t)INT_MAX); 6507 end = start + (int)(mp->b_wptr - mp->b_rptr); 6508 next_mp = mp->b_cont; 6509 if (start == end) { 6510 /* Empty. Blast it. */ 6511 freeb(mp); 6512 continue; 6513 } 6514 mp->b_cont = NULL; 6515 TCP_REASS_SET_SEQ(mp, start); 6516 TCP_REASS_SET_END(mp, end); 6517 mp1 = tcp->tcp_reass_tail; 6518 if (!mp1) { 6519 tcp->tcp_reass_tail = mp; 6520 tcp->tcp_reass_head = mp; 6521 BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6522 UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6523 continue; 6524 } 6525 /* New stuff completely beyond tail? */ 6526 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 6527 /* Link it on end. */ 6528 mp1->b_cont = mp; 6529 tcp->tcp_reass_tail = mp; 6530 BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6531 UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6532 continue; 6533 } 6534 mp1 = tcp->tcp_reass_head; 6535 u1 = TCP_REASS_SEQ(mp1); 6536 /* New stuff at the front? */ 6537 if (SEQ_LT(start, u1)) { 6538 /* Yes... Check for overlap. */ 6539 mp->b_cont = mp1; 6540 tcp->tcp_reass_head = mp; 6541 tcp_reass_elim_overlap(tcp, mp); 6542 continue; 6543 } 6544 /* 6545 * The new piece fits somewhere between the head and tail. 6546 * We find our slot, where mp1 precedes us and mp2 trails. 6547 */ 6548 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 6549 u1 = TCP_REASS_SEQ(mp2); 6550 if (SEQ_LEQ(start, u1)) 6551 break; 6552 } 6553 /* Link ourselves in */ 6554 mp->b_cont = mp2; 6555 mp1->b_cont = mp; 6556 6557 /* Trim overlap with following mblk(s) first */ 6558 tcp_reass_elim_overlap(tcp, mp); 6559 6560 /* Trim overlap with preceding mblk */ 6561 tcp_reass_elim_overlap(tcp, mp1); 6562 6563 } while (start = end, mp = next_mp); 6564 mp1 = tcp->tcp_reass_head; 6565 /* Anything ready to go? 
*/ 6566 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 6567 return (NULL); 6568 /* Eat what we can off the queue */ 6569 for (;;) { 6570 mp = mp1->b_cont; 6571 end = TCP_REASS_END(mp1); 6572 TCP_REASS_SET_SEQ(mp1, 0); 6573 TCP_REASS_SET_END(mp1, 0); 6574 if (!mp) { 6575 tcp->tcp_reass_tail = NULL; 6576 break; 6577 } 6578 if (end != TCP_REASS_SEQ(mp)) { 6579 mp1->b_cont = NULL; 6580 break; 6581 } 6582 mp1 = mp; 6583 } 6584 mp1 = tcp->tcp_reass_head; 6585 tcp->tcp_reass_head = mp; 6586 return (mp1); 6587 } 6588 6589 /* Eliminate any overlap that mp may have over later mblks */ 6590 static void 6591 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 6592 { 6593 uint32_t end; 6594 mblk_t *mp1; 6595 uint32_t u1; 6596 6597 end = TCP_REASS_END(mp); 6598 while ((mp1 = mp->b_cont) != NULL) { 6599 u1 = TCP_REASS_SEQ(mp1); 6600 if (!SEQ_GT(end, u1)) 6601 break; 6602 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 6603 mp->b_wptr -= end - u1; 6604 TCP_REASS_SET_END(mp, u1); 6605 BUMP_MIB(tcp_mib.tcpInDataPartDupSegs); 6606 UPDATE_MIB(tcp_mib.tcpInDataPartDupBytes, end - u1); 6607 break; 6608 } 6609 mp->b_cont = mp1->b_cont; 6610 freeb(mp1); 6611 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 6612 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, end - u1); 6613 } 6614 if (!mp1) 6615 tcp->tcp_reass_tail = mp; 6616 } 6617 6618 /* 6619 * Remove a connection from the list of detached TIME_WAIT connections. 6620 */ 6621 static void 6622 tcp_time_wait_remove(tcp_t *tcp) 6623 { 6624 if (tcp->tcp_time_wait_expire == 0) { 6625 assert(tcp->tcp_time_wait_next == NULL); 6626 assert(tcp->tcp_time_wait_prev == NULL); 6627 return; 6628 } 6629 assert(tcp->tcp_state == TCPS_TIME_WAIT); 6630 if (tcp == tcp_time_wait_head) { 6631 assert(tcp->tcp_time_wait_prev == NULL); 6632 tcp_time_wait_head = tcp->tcp_time_wait_next; 6633 if (tcp_time_wait_head != NULL) { 6634 tcp_time_wait_head->tcp_time_wait_prev = NULL; 6635 } else { 6636 tcp_time_wait_tail = NULL; 6637 } 6638 } else if (tcp == tcp_time_wait_tail) { 6639 assert(tcp != tcp_time_wait_head); 6640 assert(tcp->tcp_time_wait_next == NULL); 6641 tcp_time_wait_tail = tcp->tcp_time_wait_prev; 6642 assert(tcp_time_wait_tail != NULL); 6643 tcp_time_wait_tail->tcp_time_wait_next = NULL; 6644 } else { 6645 assert(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 6646 assert(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 6647 tcp->tcp_time_wait_prev->tcp_time_wait_next = 6648 tcp->tcp_time_wait_next; 6649 tcp->tcp_time_wait_next->tcp_time_wait_prev = 6650 tcp->tcp_time_wait_prev; 6651 } 6652 tcp->tcp_time_wait_next = NULL; 6653 tcp->tcp_time_wait_prev = NULL; 6654 tcp->tcp_time_wait_expire = 0; 6655 } 6656 6657 /* 6658 * Add a connection to the list of detached TIME_WAIT connections 6659 * and set its time to expire ... 6660 */ 6661 static void 6662 tcp_time_wait_append(tcp_t *tcp) 6663 { 6664 tcp->tcp_time_wait_expire = prom_gettime() + tcp_time_wait_interval; 6665 if (tcp->tcp_time_wait_expire == 0) 6666 tcp->tcp_time_wait_expire = 1; 6667 6668 if (tcp_time_wait_head == NULL) { 6669 assert(tcp_time_wait_tail == NULL); 6670 tcp_time_wait_head = tcp; 6671 } else { 6672 assert(tcp_time_wait_tail != NULL); 6673 assert(tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT); 6674 tcp_time_wait_tail->tcp_time_wait_next = tcp; 6675 tcp->tcp_time_wait_prev = tcp_time_wait_tail; 6676 } 6677 tcp_time_wait_tail = tcp; 6678 6679 /* for ndd stats about compression */ 6680 tcp_cum_timewait++; 6681 } 6682 6683 /* 6684 * Periodic qtimeout routine run on the default queue. 6685 * Performs 2 functions. 6686 * 1. 
Does TIME_WAIT compression on all recently added tcps. List 6687 * traversal is done backwards from the tail. 6688 * 2. Blows away all tcps whose TIME_WAIT has expired. List traversal 6689 * is done forwards from the head. 6690 */ 6691 void 6692 tcp_time_wait_collector(void) 6693 { 6694 tcp_t *tcp; 6695 uint32_t now; 6696 6697 /* 6698 * In order to reap time waits reliably, we should use a 6699 * source of time that is not adjustable by the user. 6700 */ 6701 now = prom_gettime(); 6702 while ((tcp = tcp_time_wait_head) != NULL) { 6703 /* 6704 * Compare times using modular arithmetic, since 6705 * lbolt can wrap around. 6706 */ 6707 if ((int32_t)(now - tcp->tcp_time_wait_expire) < 0) { 6708 break; 6709 } 6710 /* 6711 * Note that the err must be 0 as there is no socket 6712 * associated with this TCP... 6713 */ 6714 (void) tcp_clean_death(-1, tcp, 0); 6715 } 6716 /* Schedule next run time. */ 6717 tcp_time_wait_runtime = prom_gettime() + 10000; 6718 } 6719 6720 void 6721 tcp_time_wait_report(void) 6722 { 6723 tcp_t *tcp; 6724 6725 printf("Current time %u\n", prom_gettime()); 6726 for (tcp = tcp_time_wait_head; tcp != NULL; 6727 tcp = tcp->tcp_time_wait_next) { 6728 printf("%s expires at %u\n", tcp_display(tcp, NULL, 6729 DISP_ADDR_AND_PORT), tcp->tcp_time_wait_expire); 6730 } 6731 } 6732 6733 /* 6734 * Send up all messages queued on tcp_rcv_list. 6735 * Have to set tcp_co_norm since we use putnext. 6736 */ 6737 static void 6738 tcp_rcv_drain(int sock_id, tcp_t *tcp) 6739 { 6740 mblk_t *mp; 6741 struct inetgram *in_gram; 6742 mblk_t *in_mp; 6743 int len; 6744 6745 /* Don't drain if the app has not finished reading all the data. */ 6746 if (sockets[sock_id].so_rcvbuf <= 0) 6747 return; 6748 6749 /* We might have come here just to update the rwnd. */ 6750 if (tcp->tcp_rcv_list == NULL) 6751 goto win_update; 6752 6753 if ((in_gram = (struct inetgram *)bkmem_zalloc( 6754 sizeof (struct inetgram))) == NULL) { 6755 return; 6756 } 6757 if ((in_mp = allocb(tcp->tcp_rcv_cnt, 0)) == NULL) { 6758 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 6759 return; 6760 } 6761 in_gram->igm_level = APP_LVL; 6762 in_gram->igm_mp = in_mp; 6763 in_gram->igm_id = 0; 6764 6765 while ((mp = tcp->tcp_rcv_list) != NULL) { 6766 tcp->tcp_rcv_list = mp->b_cont; 6767 len = mp->b_wptr - mp->b_rptr; 6768 bcopy(mp->b_rptr, in_mp->b_wptr, len); 6769 in_mp->b_wptr += len; 6770 freeb(mp); 6771 } 6772 6773 tcp->tcp_rcv_last_tail = NULL; 6774 tcp->tcp_rcv_cnt = 0; 6775 add_grams(&sockets[sock_id].inq, in_gram); 6776 6777 /* This means that so_rcvbuf can be less than 0. */ 6778 sockets[sock_id].so_rcvbuf -= in_mp->b_wptr - in_mp->b_rptr; 6779 win_update: 6780 /* 6781 * Increase the receive window to the max. But we need to do receiver 6782 * SWS avoidance. This means that we need to check that the increase 6783 * of the receive window is at least 1 MSS. 6784 */ 6785 if (sockets[sock_id].so_rcvbuf > 0 && 6786 (tcp->tcp_rwnd_max - tcp->tcp_rwnd >= tcp->tcp_mss)) { 6787 tcp->tcp_rwnd = tcp->tcp_rwnd_max; 6788 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 6789 tcp->tcp_tcph->th_win); 6790 } 6791 } 6792 6793 /* 6794 * Wrapper for recvfrom to call. 6795 */ 6796 void 6797 tcp_rcv_drain_sock(int sock_id) 6798 { 6799 tcp_t *tcp; 6800 if ((tcp = sockets[sock_id].pcb) == NULL) 6801 return; 6802 tcp_rcv_drain(sock_id, tcp); 6803 } 6804 6805 /* 6806 * If the inq == NULL and the tcp_rcv_list != NULL, we have data that 6807 * recvfrom could read.
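 * (This can happen when segments were queued on tcp_rcv_list by
 * tcp_rcv_enqueue() before the application posted a read.)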
Place a magic message in the inq to let recvfrom 6808 * know that it needs to call tcp_rcv_drain_sock to pull up the data. 6809 */ 6810 static void 6811 tcp_drain_needed(int sock_id, tcp_t *tcp) 6812 { 6813 struct inetgram *in_gram; 6814 #ifdef DEBUG 6815 printf("tcp_drain_needed: inq %x, tcp_rcv_list %x\n", 6816 sockets[sock_id].inq, tcp->tcp_rcv_list); 6817 #endif 6818 if ((sockets[sock_id].inq != NULL) || 6819 (tcp->tcp_rcv_list == NULL)) 6820 return; 6821 6822 if ((in_gram = (struct inetgram *)bkmem_zalloc( 6823 sizeof (struct inetgram))) == NULL) 6824 return; 6825 6826 in_gram->igm_level = APP_LVL; 6827 in_gram->igm_mp = NULL; 6828 in_gram->igm_id = TCP_CALLB_MAGIC_ID; 6829 6830 add_grams(&sockets[sock_id].inq, in_gram); 6831 } 6832 6833 /* 6834 * Queue data on tcp_rcv_list which is a b_next chain. 6835 * Each element of the chain is a b_cont chain. 6836 * 6837 * M_DATA messages are added to the current element. 6838 * Other messages are added as new (b_next) elements. 6839 */ 6840 static void 6841 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) 6842 { 6843 assert(seg_len == msgdsize(mp)); 6844 if (tcp->tcp_rcv_list == NULL) { 6845 tcp->tcp_rcv_list = mp; 6846 } else { 6847 tcp->tcp_rcv_last_tail->b_cont = mp; 6848 } 6849 while (mp->b_cont) 6850 mp = mp->b_cont; 6851 tcp->tcp_rcv_last_tail = mp; 6852 tcp->tcp_rcv_cnt += seg_len; 6853 tcp->tcp_rwnd -= seg_len; 6854 #ifdef DEBUG 6855 printf("tcp_rcv_enqueue rwnd %d\n", tcp->tcp_rwnd); 6856 #endif 6857 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 6858 } 6859 6860 /* The minimum of the smoothed mean deviation used in the RTO calculation. */ 6861 #define TCP_SD_MIN 400 6862 6863 /* 6864 * Set the RTO for this connection. The formula is from Jacobson and Karels' 6865 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 6866 * are the same as those in Appendix A.2 of that paper. 6867 * 6868 * m = new measurement 6869 * sa = smoothed RTT average (8 * average estimates). 6870 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 6871 */ 6872 static void 6873 tcp_set_rto(tcp_t *tcp, int32_t rtt) 6874 { 6875 int32_t m = rtt; 6876 uint32_t sa = tcp->tcp_rtt_sa; 6877 uint32_t sv = tcp->tcp_rtt_sd; 6878 uint32_t rto; 6879 6880 BUMP_MIB(tcp_mib.tcpRttUpdate); 6881 tcp->tcp_rtt_update++; 6882 6883 /* A nonzero tcp_rtt_sa means we have prior estimates to update. */ 6884 if (sa != 0) { 6885 /* 6886 * Update average estimator: 6887 * new rtt = old rtt + 1/8 Error 6888 */ 6889 6890 /* m is now Error in estimate. */ 6891 m -= sa >> 3; 6892 if ((int32_t)(sa += m) <= 0) { 6893 /* 6894 * Don't allow the smoothed average to be negative. 6895 * We use 0 to denote reinitialization of the 6896 * variables. 6897 */ 6898 sa = 1; 6899 } 6900 6901 /* 6902 * Update deviation estimator: 6903 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 6904 */ 6905 if (m < 0) 6906 m = -m; 6907 m -= sv >> 2; 6908 sv += m; 6909 } else { 6910 /* 6911 * This follows BSD's implementation. So the reinitialized 6912 * RTO is 3 * m. We cannot go less than 2 because if the 6913 * link is bandwidth dominated, doubling the window size 6914 * during slow start means doubling the RTT. We want to be 6915 * more conservative when we reinitialize our estimates. 3 6916 * is just a convenient number. 6917 */ 6918 sa = m << 3; 6919 sv = m << 1; 6920 } 6921 if (sv < TCP_SD_MIN) { 6922 /* 6923 * We do not know whether sa captures the delayed-ACK 6924 * effect, since in a long train of segments a receiver 6925 * does not delay its ACKs.
So set the minimum of sv 6926 * to TCP_SD_MIN, which defaults to 400 ms, twice 6927 * the BSD DATO. That means the minimum of the mean 6928 * deviation is 100 ms. 6929 * 6930 */ 6931 sv = TCP_SD_MIN; 6932 } 6933 tcp->tcp_rtt_sa = sa; 6934 tcp->tcp_rtt_sd = sv; 6935 /* 6936 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) 6937 * 6938 * Add tcp_rexmit_interval_extra in case of an extreme environment 6939 * where the algorithm fails to work. The default value of 6940 * tcp_rexmit_interval_extra should be 0. 6941 * 6942 * As we use a finer-grained clock than BSD and update the 6943 * RTO for every ACK, add in another .25 of RTT to the 6944 * deviation of the RTO to accommodate burstiness of 1/4 of 6945 * the window size. 6946 */ 6947 rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5); 6948 6949 if (rto > tcp_rexmit_interval_max) { 6950 tcp->tcp_rto = tcp_rexmit_interval_max; 6951 } else if (rto < tcp_rexmit_interval_min) { 6952 tcp->tcp_rto = tcp_rexmit_interval_min; 6953 } else { 6954 tcp->tcp_rto = rto; 6955 } 6956 6957 /* Now, we can reset tcp_timer_backoff to use the new RTO... */ 6958 tcp->tcp_timer_backoff = 0; 6959 } 6960 6961 /* 6962 * Initiate the closedown sequence on an active connection. 6963 * Return value zero for OK return, non-zero for error return. 6964 */ 6965 static int 6966 tcp_xmit_end(tcp_t *tcp, int sock_id) 6967 { 6968 mblk_t *mp; 6969 6970 if (tcp->tcp_state < TCPS_SYN_RCVD || 6971 tcp->tcp_state > TCPS_CLOSE_WAIT) { 6972 /* 6973 * Invalid state; only the states TCPS_SYN_RCVD, 6974 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid. 6975 */ 6976 return (-1); 6977 } 6978 6979 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 6980 tcp->tcp_valid_bits |= TCP_FSS_VALID; 6981 /* 6982 * If there is nothing more unsent, send the FIN now. 6983 * Otherwise, it will go out with the last segment. 6984 */ 6985 if (tcp->tcp_unsent == 0) { 6986 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 6987 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 6988 6989 if (mp != NULL) { 6990 /* Dump the packet when debugging. */ 6991 TCP_DUMP_PACKET("tcp_xmit_end", mp); 6992 (void) ipv4_tcp_output(sock_id, mp); 6993 freeb(mp); 6994 } else { 6995 /* 6996 * Couldn't allocate msg. Pretend we got it out. 6997 * Wait for the rexmit timeout. 6998 */ 6999 tcp->tcp_snxt = tcp->tcp_fss + 1; 7000 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 7001 } 7002 7003 /* 7004 * If needed, update tcp_rexmit_nxt as tcp_snxt is 7005 * changed.
7006 */ 7007 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 7008 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 7009 } 7010 } else { 7011 tcp_wput_data(tcp, NULL, B_FALSE); 7012 } 7013 7014 return (0); 7015 } 7016 7017 int 7018 tcp_opt_set(tcp_t *tcp, int level, int option, const void *optval, 7019 socklen_t optlen) 7020 { 7021 switch (level) { 7022 case SOL_SOCKET: { 7023 switch (option) { 7024 case SO_RCVBUF: 7025 if (optlen == sizeof (int)) { 7026 int val = *(int *)optval; 7027 7028 if (val > tcp_max_buf) { 7029 errno = ENOBUFS; 7030 break; 7031 } 7032 /* Silently ignore zero */ 7033 if (val != 0) { 7034 val = MSS_ROUNDUP(val, tcp->tcp_mss); 7035 (void) tcp_rwnd_set(tcp, val); 7036 } 7037 } else { 7038 errno = EINVAL; 7039 } 7040 break; 7041 case SO_SNDBUF: 7042 if (optlen == sizeof (int)) { 7043 tcp->tcp_xmit_hiwater = *(int *)optval; 7044 if (tcp->tcp_xmit_hiwater > tcp_max_buf) 7045 tcp->tcp_xmit_hiwater = tcp_max_buf; 7046 } else { 7047 errno = EINVAL; 7048 } 7049 break; 7050 case SO_LINGER: 7051 if (optlen == sizeof (struct linger)) { 7052 struct linger *lgr = (struct linger *)optval; 7053 7054 if (lgr->l_onoff) { 7055 tcp->tcp_linger = 1; 7056 tcp->tcp_lingertime = lgr->l_linger; 7057 } else { 7058 tcp->tcp_linger = 0; 7059 tcp->tcp_lingertime = 0; 7060 } 7061 } else { 7062 errno = EINVAL; 7063 } 7064 break; 7065 default: 7066 errno = ENOPROTOOPT; 7067 break; 7068 } 7069 break; 7070 } /* case SOL_SOCKET */ 7071 case IPPROTO_TCP: { 7072 switch (option) { 7073 default: 7074 errno = ENOPROTOOPT; 7075 break; 7076 } 7077 break; 7078 } /* case IPPROTO_TCP */ 7079 case IPPROTO_IP: { 7080 switch (option) { 7081 default: 7082 errno = ENOPROTOOPT; 7083 break; 7084 } 7085 break; 7086 } /* case IPPROTO_IP */ 7087 default: 7088 errno = ENOPROTOOPT; 7089 break; 7090 } /* switch (level) */ 7091 7092 if (errno != 0) 7093 return (-1); 7094 else 7095 return (0); 7096 } 7097
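/*
 * Illustrative sketch (not part of the original source) of how a caller
 * might use tcp_opt_set() on an open socket; the pcb lookup mirrors
 * tcp_rcv_drain_sock() above, and the sock_id is assumed valid.
 *
 *	tcp_t *tcp = sockets[sock_id].pcb;
 *	int rcvbuf = 32 * 1024;
 *
 *	if (tcp != NULL &&
 *	    tcp_opt_set(tcp, SOL_SOCKET, SO_RCVBUF, &rcvbuf,
 *	    sizeof (rcvbuf)) < 0)
 *		printf("SO_RCVBUF rejected, errno %d\n", errno);
 *
 * On success a nonzero value is rounded up to a multiple of the
 * connection's MSS by MSS_ROUNDUP() before tcp_rwnd_set() is called, so
 * the window actually advertised may be slightly larger than requested.
 */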